diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 7c759a1adc950..690ab1d5af575 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -31,3 +31,6 @@ d8f0e6caa91e230a486c948ab643174e40bdf215 # Remove line-endings added by r320089. NFC. 100a0eedc00b2bf48bcdc6c209c000745a4a0e48 + +# Cleanup __config indention. NFC. +2b772b930e097ed6f06d698a51e291c7fd318baa diff --git a/clang-tools-extra/CMakeLists.txt b/clang-tools-extra/CMakeLists.txt index 57bb970575608..2e73b6ba81d2e 100644 --- a/clang-tools-extra/CMakeLists.txt +++ b/clang-tools-extra/CMakeLists.txt @@ -1,5 +1,8 @@ include(CMakeDependentOption) +option(CLANG_TIDY_ENABLE_STATIC_ANALYZER + "Include static analyzer checks in clang-tidy" ON) + add_subdirectory(clang-apply-replacements) add_subdirectory(clang-reorder-fields) add_subdirectory(modularize) diff --git a/clang-tools-extra/clang-tidy/CMakeLists.txt b/clang-tools-extra/clang-tidy/CMakeLists.txt index 02573534ccaef..ca7a5afed6b0b 100644 --- a/clang-tools-extra/clang-tidy/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/CMakeLists.txt @@ -3,6 +3,11 @@ set(LLVM_LINK_COMPONENTS Support ) +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/clang-tidy-config.h.cmake + ${CMAKE_CURRENT_BINARY_DIR}/clang-tidy-config.h) +include_directories(BEFORE ${CMAKE_CURRENT_BINARY_DIR}) + add_clang_library(clangTidy ClangTidy.cpp ClangTidyCheck.cpp @@ -34,7 +39,7 @@ clang_target_link_libraries(clangTidy clangToolingCore ) -if(CLANG_ENABLE_STATIC_ANALYZER) +if(CLANG_TIDY_ENABLE_STATIC_ANALYZER) clang_target_link_libraries(clangTidy PRIVATE clangStaticAnalyzerCore @@ -46,6 +51,7 @@ endif() # If you add a check, also add it to ClangTidyForceLinker.h in this directory. add_subdirectory(android) add_subdirectory(abseil) +add_subdirectory(altera) add_subdirectory(boost) add_subdirectory(bugprone) add_subdirectory(cert) @@ -59,7 +65,7 @@ add_subdirectory(llvm) add_subdirectory(llvmlibc) add_subdirectory(misc) add_subdirectory(modernize) -if(CLANG_ENABLE_STATIC_ANALYZER) +if(CLANG_TIDY_ENABLE_STATIC_ANALYZER) add_subdirectory(mpi) endif() add_subdirectory(objc) @@ -71,6 +77,7 @@ add_subdirectory(zircon) set(ALL_CLANG_TIDY_CHECKS clangTidyAndroidModule clangTidyAbseilModule + clangTidyAlteraModule clangTidyBoostModule clangTidyBugproneModule clangTidyCERTModule @@ -91,7 +98,7 @@ set(ALL_CLANG_TIDY_CHECKS clangTidyReadabilityModule clangTidyZirconModule ) -if(CLANG_ENABLE_STATIC_ANALYZER) +if(CLANG_TIDY_ENABLE_STATIC_ANALYZER) list(APPEND ALL_CLANG_TIDY_CHECKS clangTidyMPIModule) endif() set(ALL_CLANG_TIDY_CHECKS ${ALL_CLANG_TIDY_CHECKS} PARENT_SCOPE) diff --git a/clang-tools-extra/clang-tidy/ClangTidy.cpp b/clang-tools-extra/clang-tidy/ClangTidy.cpp index 90b39347bc9ac..1f94ab4977c23 100644 --- a/clang-tools-extra/clang-tidy/ClangTidy.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidy.cpp @@ -20,11 +20,11 @@ #include "ClangTidyModuleRegistry.h" #include "ClangTidyProfiling.h" #include "ExpandModularHeadersPPCallbacks.h" +#include "clang-tidy-config.h" #include "clang/AST/ASTConsumer.h" #include "clang/AST/ASTContext.h" #include "clang/AST/Decl.h" #include "clang/ASTMatchers/ASTMatchFinder.h" -#include "clang/Config/config.h" #include "clang/Format/Format.h" #include "clang/Frontend/ASTConsumers.h" #include "clang/Frontend/CompilerInstance.h" @@ -47,10 +47,10 @@ #include #include -#if CLANG_ENABLE_STATIC_ANALYZER +#if CLANG_TIDY_ENABLE_STATIC_ANALYZER #include "clang/Analysis/PathDiagnostic.h" #include "clang/StaticAnalyzer/Frontend/AnalysisConsumer.h" -#endif // 
CLANG_ENABLE_STATIC_ANALYZER +#endif // CLANG_TIDY_ENABLE_STATIC_ANALYZER using namespace clang::ast_matchers; using namespace clang::driver; @@ -63,7 +63,7 @@ namespace clang { namespace tidy { namespace { -#if CLANG_ENABLE_STATIC_ANALYZER +#if CLANG_TIDY_ENABLE_STATIC_ANALYZER static const char *AnalyzerCheckNamePrefix = "clang-analyzer-"; class AnalyzerDiagnosticConsumer : public ento::PathDiagnosticConsumer { @@ -95,7 +95,7 @@ class AnalyzerDiagnosticConsumer : public ento::PathDiagnosticConsumer { private: ClangTidyContext &Context; }; -#endif // CLANG_ENABLE_STATIC_ANALYZER +#endif // CLANG_TIDY_ENABLE_STATIC_ANALYZER class ErrorReporter { public: @@ -324,7 +324,7 @@ ClangTidyASTConsumerFactory::ClangTidyASTConsumerFactory( } } -#if CLANG_ENABLE_STATIC_ANALYZER +#if CLANG_TIDY_ENABLE_STATIC_ANALYZER static void setStaticAnalyzerCheckerOpts(const ClangTidyOptions &Opts, AnalyzerOptionsRef AnalyzerOptions) { StringRef AnalyzerPrefix(AnalyzerCheckNamePrefix); @@ -369,7 +369,7 @@ static CheckersList getAnalyzerCheckersAndPackages(ClangTidyContext &Context, } return List; } -#endif // CLANG_ENABLE_STATIC_ANALYZER +#endif // CLANG_TIDY_ENABLE_STATIC_ANALYZER std::unique_ptr<ASTConsumer> ClangTidyASTConsumerFactory::CreateASTConsumer( @@ -424,7 +424,7 @@ ClangTidyASTConsumerFactory::CreateASTConsumer( if (!Checks.empty()) Consumers.push_back(Finder->newASTConsumer()); -#if CLANG_ENABLE_STATIC_ANALYZER +#if CLANG_TIDY_ENABLE_STATIC_ANALYZER AnalyzerOptionsRef AnalyzerOptions = Compiler.getAnalyzerOpts(); AnalyzerOptions->CheckersAndPackages = getAnalyzerCheckersAndPackages( Context, Context.canEnableAnalyzerAlphaCheckers()); @@ -440,7 +440,7 @@ ClangTidyASTConsumerFactory::CreateASTConsumer( new AnalyzerDiagnosticConsumer(Context)); Consumers.push_back(std::move(AnalysisConsumer)); } -#endif // CLANG_ENABLE_STATIC_ANALYZER +#endif // CLANG_TIDY_ENABLE_STATIC_ANALYZER return std::make_unique<ClangTidyASTConsumer>( std::move(Consumers), std::move(Profiling), std::move(Finder), std::move(Checks)); @@ -453,11 +453,11 @@ std::vector<std::string> ClangTidyASTConsumerFactory::getCheckNames() { CheckNames.emplace_back(CheckFactory.getKey()); } -#if CLANG_ENABLE_STATIC_ANALYZER +#if CLANG_TIDY_ENABLE_STATIC_ANALYZER for (const auto &AnalyzerCheck : getAnalyzerCheckersAndPackages( Context, Context.canEnableAnalyzerAlphaCheckers())) CheckNames.push_back(AnalyzerCheckNamePrefix + AnalyzerCheck.first); -#endif // CLANG_ENABLE_STATIC_ANALYZER +#endif // CLANG_TIDY_ENABLE_STATIC_ANALYZER llvm::sort(CheckNames); return CheckNames; diff --git a/clang-tools-extra/clang-tidy/ClangTidyForceLinker.h b/clang-tools-extra/clang-tidy/ClangTidyForceLinker.h index 1d6bd2a4fd621..3a5330c85c3b0 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyForceLinker.h +++ b/clang-tools-extra/clang-tidy/ClangTidyForceLinker.h @@ -9,7 +9,7 @@ #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CLANGTIDYFORCELINKER_H #define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CLANGTIDYFORCELINKER_H -#include "clang/Config/config.h" +#include "clang-tidy-config.h" #include "llvm/Support/Compiler.h" namespace clang { @@ -20,6 +20,11 @@ extern volatile int AbseilModuleAnchorSource; static int LLVM_ATTRIBUTE_UNUSED AbseilModuleAnchorDestination = AbseilModuleAnchorSource; +// This anchor is used to force the linker to link the AlteraModule. +extern volatile int AlteraModuleAnchorSource; +static int LLVM_ATTRIBUTE_UNUSED AlteraModuleAnchorDestination = + AlteraModuleAnchorSource; + // This anchor is used to force the linker to link the AndroidModule. 
extern volatile int AndroidModuleAnchorSource; static int LLVM_ATTRIBUTE_UNUSED AndroidModuleAnchorDestination = @@ -90,7 +95,7 @@ extern volatile int ModernizeModuleAnchorSource; static int LLVM_ATTRIBUTE_UNUSED ModernizeModuleAnchorDestination = ModernizeModuleAnchorSource; -#if CLANG_ENABLE_STATIC_ANALYZER && \ +#if CLANG_TIDY_ENABLE_STATIC_ANALYZER && \ !defined(CLANG_TIDY_DISABLE_STATIC_ANALYZER_CHECKS) // This anchor is used to force the linker to link the MPIModule. extern volatile int MPIModuleAnchorSource; diff --git a/clang-tools-extra/clang-tidy/altera/AlteraTidyModule.cpp b/clang-tools-extra/clang-tidy/altera/AlteraTidyModule.cpp new file mode 100644 index 0000000000000..d91f67ac14856 --- /dev/null +++ b/clang-tools-extra/clang-tidy/altera/AlteraTidyModule.cpp @@ -0,0 +1,39 @@ +//===--- AlteraTidyModule.cpp - clang-tidy --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../ClangTidy.h" +#include "../ClangTidyModule.h" +#include "../ClangTidyModuleRegistry.h" +#include "StructPackAlignCheck.h" + +using namespace clang::ast_matchers; + +namespace clang { +namespace tidy { +namespace altera { + +class AlteraModule : public ClangTidyModule { +public: + void addCheckFactories(ClangTidyCheckFactories &CheckFactories) override { + CheckFactories.registerCheck<StructPackAlignCheck>( + "altera-struct-pack-align"); + } +}; + +} // namespace altera + +// Register the AlteraTidyModule using this statically initialized variable. +static ClangTidyModuleRegistry::Add<AlteraModule> + X("altera-module", "Adds Altera FPGA OpenCL lint checks."); + +// This anchor is used to force the linker to link in the generated object file +// and thus register the AlteraModule. +volatile int AlteraModuleAnchorSource = 0; + +} // namespace tidy +} // namespace clang diff --git a/clang-tools-extra/clang-tidy/altera/CMakeLists.txt b/clang-tools-extra/clang-tidy/altera/CMakeLists.txt new file mode 100644 index 0000000000000..ed28d9f4892d2 --- /dev/null +++ b/clang-tools-extra/clang-tidy/altera/CMakeLists.txt @@ -0,0 +1,22 @@ +set(LLVM_LINK_COMPONENTS + FrontendOpenMP + support + ) + +add_clang_library(clangTidyAlteraModule + AlteraTidyModule.cpp + StructPackAlignCheck.cpp + + LINK_LIBS + clangTidy + clangTidyUtils + ) + +clang_target_link_libraries(clangTidyAlteraModule + PRIVATE + clangAnalysis + clangAST + clangASTMatchers + clangBasic + clangLex + ) diff --git a/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.cpp b/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.cpp new file mode 100644 index 0000000000000..9f28a22a9d03e --- /dev/null +++ b/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.cpp @@ -0,0 +1,144 @@ +//===--- StructPackAlignCheck.cpp - clang-tidy ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "StructPackAlignCheck.h" +#include "clang/AST/ASTContext.h" +#include "clang/AST/RecordLayout.h" +#include "clang/ASTMatchers/ASTMatchFinder.h" +#include <math.h> +#include <sstream> + +using namespace clang::ast_matchers; + +namespace clang { +namespace tidy { +namespace altera { + +void StructPackAlignCheck::registerMatchers(MatchFinder *Finder) { + Finder->addMatcher(recordDecl(isStruct(), isDefinition(), + unless(isExpansionInSystemHeader())) + .bind("struct"), + this); +} + +CharUnits +StructPackAlignCheck::computeRecommendedAlignment(CharUnits MinByteSize) { + CharUnits NewAlign = CharUnits::fromQuantity(1); + if (!MinByteSize.isPowerOfTwo()) { + int MSB = (int)MinByteSize.getQuantity(); + for (; MSB > 0; MSB /= 2) { + NewAlign = NewAlign.alignTo( + CharUnits::fromQuantity(((int)NewAlign.getQuantity()) * 2)); + // Abort if the computed alignment meets the maximum configured alignment. + if (NewAlign.getQuantity() >= MaxConfiguredAlignment) + break; + } + } else { + NewAlign = MinByteSize; + } + return NewAlign; +} + +void StructPackAlignCheck::check(const MatchFinder::MatchResult &Result) { + const auto *Struct = Result.Nodes.getNodeAs<RecordDecl>("struct"); + + // Do not trigger on templated struct declarations because the packing and + // alignment requirements are unknown. + if (Struct->isTemplated()) + return; + + // Get sizing info for the struct. + llvm::SmallVector<std::pair<unsigned int, unsigned int>, 10> FieldSizes; + unsigned int TotalBitSize = 0; + for (const FieldDecl *StructField : Struct->fields()) { + // For each StructField, record how big it is (in bits). + // Would be good to use a pair of to advise a better + // packing order. + unsigned int StructFieldWidth = + (unsigned int)Result.Context + ->getTypeInfo(StructField->getType().getTypePtr()) + .Width; + FieldSizes.emplace_back(StructFieldWidth, StructField->getFieldIndex()); + // FIXME: Recommend a reorganization of the struct (sort by StructField + // size, largest to smallest). + TotalBitSize += StructFieldWidth; + } + + uint64_t CharSize = Result.Context->getCharWidth(); + CharUnits CurrSize = Result.Context->getASTRecordLayout(Struct).getSize(); + CharUnits MinByteSize = + CharUnits::fromQuantity(ceil((float)TotalBitSize / CharSize)); + CharUnits MaxAlign = CharUnits::fromQuantity( + ceil((float)Struct->getMaxAlignment() / CharSize)); + CharUnits CurrAlign = + Result.Context->getASTRecordLayout(Struct).getAlignment(); + CharUnits NewAlign = computeRecommendedAlignment(MinByteSize); + + bool IsPacked = Struct->hasAttr<PackedAttr>(); + bool NeedsPacking = (MinByteSize < CurrSize) && (MaxAlign != NewAlign) && + (CurrSize != NewAlign); + bool NeedsAlignment = CurrAlign.getQuantity() != NewAlign.getQuantity(); + + if (!NeedsAlignment && !NeedsPacking) + return; + + // If it's using much more space than it needs, suggest packing. + // (Do not suggest packing if it is currently explicitly aligned to what the + // minimum byte size would suggest as the new alignment.) 
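+ // Illustrative walk-through (hypothetical values, not part of the original + // patch): for struct S { char A; int B; char C; }; with a 4-byte int, + // CurrSize is 12 bytes while MinByteSize is 6, so packing is suggested, and + // computeRecommendedAlignment(6) yields 8, the next power of two. 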
+ if (NeedsPacking && !IsPacked) { + diag(Struct->getLocation(), + "accessing fields in struct %0 is inefficient due to padding; only " + "needs %1 bytes but is using %2 bytes") + << Struct << (int)MinByteSize.getQuantity() + << (int)CurrSize.getQuantity() + << FixItHint::CreateInsertion(Struct->getEndLoc().getLocWithOffset(1), + " __attribute__((packed))"); + diag(Struct->getLocation(), + "use \"__attribute__((packed))\" to reduce the amount of padding " + "applied to struct %0", + DiagnosticIDs::Note) + << Struct; + } + + FixItHint FixIt; + AlignedAttr *Attribute = Struct->getAttr<AlignedAttr>(); + std::string NewAlignQuantity = std::to_string((int)NewAlign.getQuantity()); + if (Attribute) { + std::ostringstream FixItString; + FixItString << "aligned(" << NewAlignQuantity << ")"; + FixIt = + FixItHint::CreateReplacement(Attribute->getRange(), FixItString.str()); + } else { + std::ostringstream FixItString; + FixItString << " __attribute__((aligned(" << NewAlignQuantity << ")))"; + FixIt = FixItHint::CreateInsertion(Struct->getEndLoc().getLocWithOffset(1), + FixItString.str()); + } + + // And suggest the minimum power-of-two alignment for the struct as a whole + // (with and without packing). + if (NeedsAlignment) { + diag(Struct->getLocation(), + "accessing fields in struct %0 is inefficient due to poor alignment; " + "currently aligned to %1 bytes, but recommended alignment is %2 bytes") + << Struct << (int)CurrAlign.getQuantity() << NewAlignQuantity << FixIt; + + diag(Struct->getLocation(), + "use \"__attribute__((aligned(%0)))\" to align struct %1 to %0 bytes", + DiagnosticIDs::Note) + << NewAlignQuantity << Struct; + } +} + +void StructPackAlignCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { + Options.store(Opts, "MaxConfiguredAlignment", MaxConfiguredAlignment); +} + +} // namespace altera +} // namespace tidy +} // namespace clang diff --git a/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.h b/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.h new file mode 100644 index 0000000000000..510e03030590c --- /dev/null +++ b/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.h @@ -0,0 +1,41 @@ +//===--- StructPackAlignCheck.h - clang-tidy --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_STRUCTPACKALIGNCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_STRUCTPACKALIGNCHECK_H + +#include "../ClangTidyCheck.h" + +namespace clang { +namespace tidy { +namespace altera { + +/// Finds structs that are inefficiently packed or aligned, and recommends +/// packing and/or aligning of said structs as needed. 
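+/// +/// For example (hypothetical layout, for illustration only): a struct whose +/// fields occupy 6 bytes but whose padded size is 12 bytes is flagged, with +/// fix-its suggesting __attribute__((packed)) and +/// __attribute__((aligned(8))), 8 being the smallest power of two not less +/// than 6. 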
+/// +/// For the user-facing documentation see: +/// http://clang.llvm.org/extra/clang-tidy/checks/altera-struct-pack-align.html +class StructPackAlignCheck : public ClangTidyCheck { +public: + StructPackAlignCheck(StringRef Name, ClangTidyContext *Context) + : ClangTidyCheck(Name, Context), + MaxConfiguredAlignment(Options.get("MaxConfiguredAlignment", 128)) {} + void registerMatchers(ast_matchers::MatchFinder *Finder) override; + void check(const ast_matchers::MatchFinder::MatchResult &Result) override; + void storeOptions(ClangTidyOptions::OptionMap &Opts) override; + +private: + const unsigned MaxConfiguredAlignment; + CharUnits computeRecommendedAlignment(CharUnits MinByteSize); +}; + +} // namespace altera +} // namespace tidy +} // namespace clang + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_STRUCTPACKALIGNCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.cpp index 2a6a0ae53a4f3..6208cb5cfc9dc 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.cpp @@ -77,9 +77,9 @@ void MisplacedPointerArithmeticInAllocCheck::check( CallName = "operator new[]"; } else { const auto *CtrE = New->getConstructExpr(); - if (!CtrE->getArg(CtrE->getNumArgs() - 1) - ->getType() - ->isIntegralOrEnumerationType()) + if (!CtrE || !CtrE->getArg(CtrE->getNumArgs() - 1) + ->getType() + ->isIntegralOrEnumerationType()) return; CallName = "operator new"; } diff --git a/clang-tools-extra/clang-tidy/clang-tidy-config.h.cmake b/clang-tools-extra/clang-tidy/clang-tidy-config.h.cmake new file mode 100644 index 0000000000000..f4d1a4b38004b --- /dev/null +++ b/clang-tools-extra/clang-tidy/clang-tidy-config.h.cmake @@ -0,0 +1,10 @@ +/* This generated file is for internal use. Do not include it from headers. 
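Note that #cmakedefine01 expands CLANG_TIDY_ENABLE_STATIC_ANALYZER to a literal 0 or 1, which is why the sources test it with #if rather than #ifdef. 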
*/ +#ifdef CLANG_TIDY_CONFIG_H +#error clang-tidy-config.h can only be included once +#else +#define CLANG_TIDY_CONFIG_H + +#cmakedefine01 CLANG_TIDY_ENABLE_STATIC_ANALYZER + +#endif diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt b/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt index a9f5b3e0c15bc..39c2c552eb73e 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt @@ -13,7 +13,6 @@ add_clang_library(clangTidyCppCoreGuidelinesModule NarrowingConversionsCheck.cpp NoMallocCheck.cpp OwningMemoryCheck.cpp - PreferMemberInitializerCheck.cpp ProBoundsArrayToPointerDecayCheck.cpp ProBoundsConstantArrayIndexCheck.cpp ProBoundsPointerArithmeticCheck.cpp diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp index bf613109f0ebd..4cb5022888d3d 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp @@ -22,7 +22,6 @@ #include "NarrowingConversionsCheck.h" #include "NoMallocCheck.h" #include "OwningMemoryCheck.h" -#include "PreferMemberInitializerCheck.h" #include "ProBoundsArrayToPointerDecayCheck.h" #include "ProBoundsConstantArrayIndexCheck.h" #include "ProBoundsPointerArithmeticCheck.h" @@ -67,8 +66,6 @@ class CppCoreGuidelinesModule : public ClangTidyModule { "cppcoreguidelines-non-private-member-variables-in-classes"); CheckFactories.registerCheck<OwningMemoryCheck>( "cppcoreguidelines-owning-memory"); - CheckFactories.registerCheck<PreferMemberInitializerCheck>( - "cppcoreguidelines-prefer-member-initializer"); CheckFactories.registerCheck<ProBoundsArrayToPointerDecayCheck>( "cppcoreguidelines-pro-bounds-array-to-pointer-decay"); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp deleted file mode 100644 index 97ae586f9fdb6..0000000000000 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp +++ /dev/null @@ -1,233 +0,0 @@ -//===--- PreferMemberInitializerCheck.cpp - clang-tidy -------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "PreferMemberInitializerCheck.h" -#include "clang/AST/ASTContext.h" -#include "clang/ASTMatchers/ASTMatchFinder.h" -#include "clang/Lex/Lexer.h" - -using namespace clang::ast_matchers; - -namespace clang { -namespace tidy { -namespace cppcoreguidelines { - -static bool isControlStatement(const Stmt *S) { - return isa(S) || isa(S) || isa(S) || - isa(S) || isa(S) || isa(S) || - isa(S) || isa(S) || isa(S); -} - -static bool isNoReturnCallStatement(const Stmt *S) { - const auto *Call = dyn_cast<CallExpr>(S); - if (!Call) - return false; - - const FunctionDecl *Func = Call->getDirectCallee(); - if (!Func) - return false; - - return Func->isNoReturn(); -} - -static bool isLiteral(const Expr *E) { - return isa(E) || isa(E) || - isa(E) || isa(E) || - isa(E) || isa(E); -} - -static bool isUnaryExprOfLiteral(const Expr *E) { - if (const auto *UnOp = dyn_cast<UnaryOperator>(E)) - return isLiteral(UnOp->getSubExpr()); - return false; -} - -static bool shouldBeDefaultMemberInitializer(const Expr *Value) { - if (isLiteral(Value) || isUnaryExprOfLiteral(Value)) - return true; - - if (const auto *DRE = dyn_cast<DeclRefExpr>(Value)) - return isa<EnumConstantDecl>(DRE->getDecl()); - - return false; -} - -static const std::pair<const FieldDecl *, const Expr *> -isAssignmentToMemberOf(const RecordDecl *Rec, const Stmt *S) { - if (const auto *BO = dyn_cast<BinaryOperator>(S)) { - if (BO->getOpcode() != BO_Assign) - return std::make_pair(nullptr, nullptr); - - const auto *ME = dyn_cast<MemberExpr>(BO->getLHS()->IgnoreParenImpCasts()); - if (!ME) - return std::make_pair(nullptr, nullptr); - - const auto *Field = dyn_cast<FieldDecl>(ME->getMemberDecl()); - if (!Field) - return std::make_pair(nullptr, nullptr); - - if (isa<CXXThisExpr>(ME->getBase())) - return std::make_pair(Field, BO->getRHS()->IgnoreParenImpCasts()); - } else if (const auto *COCE = dyn_cast<CXXOperatorCallExpr>(S)) { - if (COCE->getOperator() != OO_Equal) - return std::make_pair(nullptr, nullptr); - - const auto *ME = - dyn_cast<MemberExpr>(COCE->getArg(0)->IgnoreParenImpCasts()); - if (!ME) - return std::make_pair(nullptr, nullptr); - - const auto *Field = dyn_cast<FieldDecl>(ME->getMemberDecl()); - if (!Field) - return std::make_pair(nullptr, nullptr); - - if (isa<CXXThisExpr>(ME->getBase())) - return std::make_pair(Field, COCE->getArg(1)->IgnoreParenImpCasts()); - } - - return std::make_pair(nullptr, nullptr); -} - -PreferMemberInitializerCheck::PreferMemberInitializerCheck( - StringRef Name, ClangTidyContext *Context) - : ClangTidyCheck(Name, Context), - IsUseDefaultMemberInitEnabled( - Context->isCheckEnabled("modernize-use-default-member-init")), - UseAssignment(OptionsView("modernize-use-default-member-init", - Context->getOptions().CheckOptions) - .get("UseAssignment", false)) {} - -void PreferMemberInitializerCheck::storeOptions( - ClangTidyOptions::OptionMap &Opts) { - Options.store(Opts, "UseAssignment", UseAssignment); -} - -void PreferMemberInitializerCheck::registerMatchers(MatchFinder *Finder) { - Finder->addMatcher( - cxxConstructorDecl(hasBody(compoundStmt()), unless(isInstantiated())) - .bind("ctor"), - this); -} - -void PreferMemberInitializerCheck::check( - const MatchFinder::MatchResult &Result) { - const auto *Ctor = Result.Nodes.getNodeAs<CXXConstructorDecl>("ctor"); - const auto *Body = cast<CompoundStmt>(Ctor->getBody()); - - const CXXRecordDecl *Class = Ctor->getParent(); - SourceLocation InsertPos; - bool FirstToCtorInits = true; - - for (const auto *S : Body->body()) { - if (isControlStatement(S)) - return; - - if (isNoReturnCallStatement(S)) - return; - - const FieldDecl *Field; - const 
Expr *InitValue; - std::tie(Field, InitValue) = isAssignmentToMemberOf(Class, S); - if (Field) { - if (IsUseDefaultMemberInitEnabled && getLangOpts().CPlusPlus11 && - Ctor->isDefaultConstructor() && - (getLangOpts().CPlusPlus20 || !Field->isBitField()) && - (!isa<RecordDecl>(Class->getDeclContext()) || - !cast<RecordDecl>(Class->getDeclContext())->isUnion()) && - shouldBeDefaultMemberInitializer(InitValue)) { - auto Diag = - diag(S->getBeginLoc(), "%0 should be initialized in an in-class" - " default member initializer") - << Field; - - SourceLocation FieldEnd = - Lexer::getLocForEndOfToken(Field->getSourceRange().getEnd(), 0, - *Result.SourceManager, getLangOpts()); - Diag << FixItHint::CreateInsertion(FieldEnd, - UseAssignment ? " = " : "{") - << FixItHint::CreateInsertionFromRange( - FieldEnd, - CharSourceRange(InitValue->getSourceRange(), true)) - << FixItHint::CreateInsertion(FieldEnd, UseAssignment ? "" : "}"); - - SourceLocation SemiColonEnd = - Lexer::findNextToken(S->getEndLoc(), *Result.SourceManager, - getLangOpts()) - ->getEndLoc(); - CharSourceRange StmtRange = - CharSourceRange::getCharRange(S->getBeginLoc(), SemiColonEnd); - - Diag << FixItHint::CreateRemoval(StmtRange); - } else { - auto Diag = - diag(S->getBeginLoc(), "%0 should be initialized in a member" - " initializer of the constructor") - << Field; - - bool AddComma = false; - if (!Ctor->getNumCtorInitializers() && FirstToCtorInits) { - SourceLocation BodyPos = Ctor->getBody()->getBeginLoc(); - SourceLocation NextPos = Ctor->getBeginLoc(); - do { - InsertPos = NextPos; - NextPos = Lexer::findNextToken(NextPos, *Result.SourceManager, - getLangOpts()) - ->getLocation(); - } while (NextPos != BodyPos); - InsertPos = Lexer::getLocForEndOfToken( - InsertPos, 0, *Result.SourceManager, getLangOpts()); - - Diag << FixItHint::CreateInsertion(InsertPos, " : "); - } else { - bool Found = false; - for (const auto *Init : Ctor->inits()) { - if (Result.SourceManager->isBeforeInTranslationUnit( - Field->getLocation(), Init->getMember()->getLocation())) { - InsertPos = Init->getSourceLocation(); - Found = true; - break; - } - } - - if (!Found) { - if (Ctor->getNumCtorInitializers()) { - InsertPos = Lexer::getLocForEndOfToken( - (*Ctor->init_rbegin())->getSourceRange().getEnd(), 0, - *Result.SourceManager, getLangOpts()); - } - Diag << FixItHint::CreateInsertion(InsertPos, ", "); - } else { - AddComma = true; - } - } - Diag << FixItHint::CreateInsertion(InsertPos, Field->getName()) - << FixItHint::CreateInsertion(InsertPos, "(") - << FixItHint::CreateInsertionFromRange( - InsertPos, - CharSourceRange(InitValue->getSourceRange(), true)) - << FixItHint::CreateInsertion(InsertPos, ")"); - if (AddComma) - Diag << FixItHint::CreateInsertion(InsertPos, ", "); - - SourceLocation SemiColonEnd = - Lexer::findNextToken(S->getEndLoc(), *Result.SourceManager, - getLangOpts()) - ->getEndLoc(); - CharSourceRange StmtRange = - CharSourceRange::getCharRange(S->getBeginLoc(), SemiColonEnd); - - Diag << FixItHint::CreateRemoval(StmtRange); - FirstToCtorInits = false; - } - } - } -} - -} // namespace cppcoreguidelines -} // namespace tidy -} // namespace clang diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.h deleted file mode 100644 index dbef7c98d8e35..0000000000000 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.h +++ /dev/null @@ -1,41 +0,0 @@ -//===--- PreferMemberInitializerCheck.h - clang-tidy ------------*- C++ 
-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PREFERMEMBERINITIALIZERCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PREFERMEMBERINITIALIZERCHECK_H - -#include "../ClangTidyCheck.h" - -namespace clang { -namespace tidy { -namespace cppcoreguidelines { - -/// Finds member initializations in the constructor body which can be placed -/// into the initialization list instead. -/// -/// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines-prefer-member-initializer.html -class PreferMemberInitializerCheck : public ClangTidyCheck { -public: - PreferMemberInitializerCheck(StringRef Name, ClangTidyContext *Context); - bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { - return LangOpts.CPlusPlus; - } - void storeOptions(ClangTidyOptions::OptionMap &Opts) override; - void registerMatchers(ast_matchers::MatchFinder *Finder) override; - void check(const ast_matchers::MatchFinder::MatchResult &Result) override; - - const bool IsUseDefaultMemberInitEnabled; - const bool UseAssignment; -}; - -} // namespace cppcoreguidelines -} // namespace tidy -} // namespace clang - -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PREFERMEMBERINITIALIZERCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.cpp index ea4bf91b0d438..7d5ae89551731 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.cpp @@ -36,12 +36,12 @@ void UseEqualsDeleteCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher( cxxMethodDecl( PrivateSpecialFn, - unless(anyOf(hasBody(stmt()), isDefaulted(), isDeleted(), + unless(anyOf(hasAnyBody(stmt()), isDefaulted(), isDeleted(), ast_matchers::isTemplateInstantiation(), // Ensure that all methods except private special member // functions are defined. 
hasParent(cxxRecordDecl(hasMethod(unless( - anyOf(PrivateSpecialFn, hasBody(stmt()), isPure(), + anyOf(PrivateSpecialFn, hasAnyBody(stmt()), isPure(), isDefaulted(), isDeleted())))))))) .bind(SpecialFunction), this); diff --git a/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.cpp index cc4bc05a35dd0..c4e7f12e74acb 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.cpp @@ -77,13 +77,16 @@ void UseNoexceptCheck::check(const MatchFinder::MatchResult &Result) { .getExceptionSpecRange(); } + assert(FnTy && "FunctionProtoType is null."); + if (isUnresolvedExceptionSpec(FnTy->getExceptionSpecType())) + return; + assert(Range.isValid() && "Exception Source Range is invalid."); CharSourceRange CRange = Lexer::makeFileCharRange( CharSourceRange::getTokenRange(Range), *Result.SourceManager, Result.Context->getLangOpts()); - assert(FnTy && "FunctionProtoType is null."); bool IsNoThrow = FnTy->isNothrow(); StringRef ReplacementStr = IsNoThrow diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp index f7b21a50203cb..03b4450d8ca8c 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp @@ -54,7 +54,8 @@ void UnnecessaryCopyInitialization::registerMatchers(MatchFinder *Finder) { on(declRefExpr(to(varDecl().bind("objectArg"))))); auto ConstRefReturningFunctionCall = callExpr(callee(functionDecl(returns(ConstReference))), - unless(callee(cxxMethodDecl()))); + unless(callee(cxxMethodDecl()))) + .bind("initFunctionCall"); auto localVarCopiedFrom = [this](const internal::Matcher<Expr> &CopyCtorArg) { return compoundStmt( @@ -96,6 +97,8 @@ void UnnecessaryCopyInitialization::check( const auto *ObjectArg = Result.Nodes.getNodeAs<VarDecl>("objectArg"); const auto *BlockStmt = Result.Nodes.getNodeAs<Stmt>("blockStmt"); const auto *CtorCall = Result.Nodes.getNodeAs<CXXConstructExpr>("ctorCall"); + const auto *InitFunctionCall = + Result.Nodes.getNodeAs<CallExpr>("initFunctionCall"); TraversalKindScope RAII(*Result.Context, ast_type_traits::TK_AsIs); @@ -113,6 +116,11 @@ void UnnecessaryCopyInitialization::check( return; if (OldVar == nullptr) { + // Only allow initialization of a const reference from a free function if it + // has no arguments. Otherwise it could return an alias to one of its + // arguments and the arguments need to be checked for const use as well. 
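+ // Hypothetical illustration: for "auto Copied = getRef(Container);" the + // returned reference may point into Container, so eliding the copy would be + // unsound if Container is modified later in the block; such calls are + // therefore skipped rather than analyzed further. 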
+ if (InitFunctionCall != nullptr && InitFunctionCall->getNumArgs() > 0) + return; handleCopyFromMethodReturn(*NewVar, *BlockStmt, IssueFix, ObjectArg, *Result.Context); } else { diff --git a/clang-tools-extra/clangd/CMakeLists.txt b/clang-tools-extra/clangd/CMakeLists.txt index 639441e8130ab..3a1a034ed17ba 100644 --- a/clang-tools-extra/clangd/CMakeLists.txt +++ b/clang-tools-extra/clangd/CMakeLists.txt @@ -33,6 +33,8 @@ if(MSVC AND NOT CLANG_CL) set_source_files_properties(CompileCommands.cpp PROPERTIES COMPILE_FLAGS -wd4130) # disables C4130: logical operation on address of string constant endif() +include_directories(BEFORE "${CMAKE_CURRENT_BINARY_DIR}/../clang-tidy") + add_clang_library(clangDaemon AST.cpp ClangdLSPServer.cpp diff --git a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp index 15ef89cb34faa..4cc1feabb15f7 100644 --- a/clang-tools-extra/clangd/ClangdLSPServer.cpp +++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp @@ -57,7 +57,7 @@ llvm::Optional<int64_t> decodeVersion(llvm::StringRef Encoded) { int64_t Result; if (llvm::to_integer(Encoded, Result, 10)) return Result; - else if (!Encoded.empty()) // Empty can be e.g. diagnostics on close. + if (!Encoded.empty()) // Empty can be e.g. diagnostics on close. elog("unexpected non-numeric version {0}", Encoded); return llvm::None; } @@ -147,13 +147,9 @@ llvm::Error validateEdits(const DraftStore &DraftMgr, const FileEdits &FE) { if (!InvalidFileCount) return llvm::Error::success(); if (InvalidFileCount == 1) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "File must be saved first: " + - LastInvalidFile); - return llvm::createStringError( - llvm::inconvertibleErrorCode(), - "Files must be saved first: " + LastInvalidFile + " (and " + - llvm::to_string(InvalidFileCount - 1) + " others)"); + return error("File must be saved first: {0}", LastInvalidFile); + return error("Files must be saved first: {0} (and {1} others)", + LastInvalidFile, InvalidFileCount - 1); } } // namespace @@ -284,10 +280,9 @@ class ClangdLSPServer::MessageHandler : public Transport::MessageHandler { } } if (OldestCB) - OldestCB->second(llvm::createStringError( - llvm::inconvertibleErrorCode(), - llvm::formatv("failed to receive a client reply for request ({0})", - OldestCB->first))); + OldestCB->second( + error("failed to receive a client reply for request ({0})", + OldestCB->first)); return ID; } @@ -661,8 +656,7 @@ void ClangdLSPServer::onSync(const NoParams &Params, if (Server->blockUntilIdleForTest(/*TimeoutSeconds=*/60)) Reply(nullptr); else - Reply(llvm::createStringError(llvm::inconvertibleErrorCode(), - "Not idle after a minute")); + Reply(error("Not idle after a minute")); } void ClangdLSPServer::onDocumentDidOpen( @@ -729,9 +723,7 @@ void ClangdLSPServer::onCommand(const ExecuteCommandParams &Params, std::string Reason = Response->failureReason ? 
*Response->failureReason : "unknown reason"; - return Reply(llvm::createStringError( - llvm::inconvertibleErrorCode(), - ("edits were not applied: " + Reason).c_str())); + return Reply(error("edits were not applied: {0}", Reason)); } return Reply(SuccessMessage); }); @@ -752,9 +744,7 @@ void ClangdLSPServer::onCommand(const ExecuteCommandParams &Params, Params.tweakArgs) { auto Code = DraftMgr.getDraft(Params.tweakArgs->file.file()); if (!Code) - return Reply(llvm::createStringError( - llvm::inconvertibleErrorCode(), - "trying to apply a code action for a non-added file")); + return Reply(error("trying to apply a code action for a non-added file")); auto Action = [this, ApplyEdit, Reply = std::move(Reply), File = Params.tweakArgs->file, Code = std::move(*Code)]( diff --git a/clang-tools-extra/clangd/ClangdServer.cpp b/clang-tools-extra/clangd/ClangdServer.cpp index d204e87c143b4..27d1a2dc7cdce 100644 --- a/clang-tools-extra/clangd/ClangdServer.cpp +++ b/clang-tools-extra/clangd/ClangdServer.cpp @@ -342,8 +342,7 @@ void ClangdServer::signatureHelp(PathRef File, Position Pos, const auto *PreambleData = IP->Preamble; if (!PreambleData) - return CB(llvm::createStringError(llvm::inconvertibleErrorCode(), - "Failed to parse includes")); + return CB(error("Failed to parse includes")); ParseInputs ParseInput{IP->Command, &TFS, IP->Contents.str()}; ParseInput.Index = Index; @@ -537,9 +536,12 @@ void ClangdServer::enumerateTweaks(PathRef File, Range Sel, void ClangdServer::applyTweak(PathRef File, Range Sel, StringRef TweakID, Callback<Tweak::Effect> CB) { - // Tracks number of times a tweak has been applied. + // Tracks number of times a tweak has been attempted. static constexpr trace::Metric TweakAttempt( "tweak_attempt", trace::Metric::Counter, "tweak_id"); + // Tracks number of times a tweak has failed to produce edits. 
+ static constexpr trace::Metric TweakFailed( + "tweak_failed", trace::Metric::Counter, "tweak_id"); TweakAttempt.record(1, TweakID); auto Action = [File = File.str(), Sel, TweakID = TweakID.str(), CB = std::move(CB), @@ -570,6 +572,8 @@ void ClangdServer::applyTweak(PathRef File, Range Sel, StringRef TweakID, if (llvm::Error Err = reformatEdit(E, Style)) elog("Failed to format {0}: {1}", It.first(), std::move(Err)); } + } else { + TweakFailed.record(1, TweakID); } return CB(std::move(*Effect)); }; diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp index 92ebc4c39f64c..4d5b2975c9aee 100644 --- a/clang-tools-extra/clangd/CodeComplete.cpp +++ b/clang-tools-extra/clangd/CodeComplete.cpp @@ -333,8 +333,7 @@ struct CodeCompletionBuilder { return ResolvedInserted.takeError(); auto Spelled = Includes.calculateIncludePath(*ResolvedInserted, FileName); if (!Spelled) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "Header not on include path"); + return error("Header not on include path"); return std::make_pair( std::move(*Spelled), Includes.shouldInsertInclude(*ResolvedDeclaring, *ResolvedInserted)); diff --git a/clang-tools-extra/clangd/ConfigYAML.cpp b/clang-tools-extra/clangd/ConfigYAML.cpp index 16639f6649c2b..9988fe3766480 100644 --- a/clang-tools-extra/clangd/ConfigYAML.cpp +++ b/clang-tools-extra/clangd/ConfigYAML.cpp @@ -38,6 +38,7 @@ class Parser { DictParser Dict("Config", this); Dict.handle("If", [&](Node &N) { parse(F.If, N); }); Dict.handle("CompileFlags", [&](Node &N) { parse(F.CompileFlags, N); }); + Dict.handle("Index", [&](Node &N) { parse(F.Index, N); }); Dict.parse(N); return !(N.failed() || HadError); } diff --git a/clang-tools-extra/clangd/DraftStore.cpp b/clang-tools-extra/clangd/DraftStore.cpp index bef48ddfa37d6..1299efbfba9fa 100644 --- a/clang-tools-extra/clangd/DraftStore.cpp +++ b/clang-tools-extra/clangd/DraftStore.cpp @@ -64,9 +64,9 @@ llvm::Expected DraftStore::updateDraft( auto EntryIt = Drafts.find(File); if (EntryIt == Drafts.end()) { - return llvm::make_error<llvm::StringError>( - "Trying to do incremental update on non-added document: " + File, - llvm::errc::invalid_argument); + return error(llvm::errc::invalid_argument, + "Trying to do incremental update on non-added document: {0}", + File); } Draft &D = EntryIt->second; std::string Contents = EntryIt->second.Contents; @@ -89,11 +89,9 @@ llvm::Expected DraftStore::updateDraft( return EndIndex.takeError(); if (*EndIndex < *StartIndex) - return llvm::make_error<llvm::StringError>( - llvm::formatv( - "Range's end position ({0}) is before start position ({1})", End, - Start), - llvm::errc::invalid_argument); + return error(llvm::errc::invalid_argument, + "Range's end position ({0}) is before start position ({1})", + End, Start); // Since the range length between two LSP positions is dependent on the // contents of the buffer we compute the range length between the start and @@ -106,11 +104,10 @@ llvm::Expected DraftStore::updateDraft( lspLength(Contents.substr(*StartIndex, *EndIndex - *StartIndex)); if (Change.rangeLength && ComputedRangeLength != *Change.rangeLength) - return llvm::make_error<llvm::StringError>( - llvm::formatv("Change's rangeLength ({0}) doesn't match the " - "computed range length ({1}).", - *Change.rangeLength, ComputedRangeLength), - llvm::errc::invalid_argument); + return error(llvm::errc::invalid_argument, + "Change's rangeLength ({0}) doesn't match the " + "computed range length ({1}).", + *Change.rangeLength, ComputedRangeLength); std::string NewContents; 
NewContents.reserve(*StartIndex + Change.text.length() + diff --git a/clang-tools-extra/clangd/FindSymbols.cpp b/clang-tools-extra/clangd/FindSymbols.cpp index 2471656988250..e37d73103e36d 100644 --- a/clang-tools-extra/clangd/FindSymbols.cpp +++ b/clang-tools-extra/clangd/FindSymbols.cpp @@ -43,12 +43,9 @@ struct ScoredSymbolGreater { llvm::Expected<Location> indexToLSPLocation(const SymbolLocation &Loc, llvm::StringRef TUPath) { auto Path = URI::resolve(Loc.FileURI, TUPath); - if (!Path) { - return llvm::make_error<llvm::StringError>( - llvm::formatv("Could not resolve path for file '{0}': {1}", Loc.FileURI, - llvm::toString(Path.takeError())), - llvm::inconvertibleErrorCode()); - } + if (!Path) + return error("Could not resolve path for file '{0}': {1}", Loc.FileURI, + Path.takeError()); Location L; L.uri = URIForFile::canonicalize(*Path, TUPath); Position Start, End; diff --git a/clang-tools-extra/clangd/IncludeFixer.cpp b/clang-tools-extra/clangd/IncludeFixer.cpp index 945f4eced88c4..7704ccb82c0f0 100644 --- a/clang-tools-extra/clangd/IncludeFixer.cpp +++ b/clang-tools-extra/clangd/IncludeFixer.cpp @@ -153,8 +153,7 @@ std::vector<Fix> IncludeFixer::fixesForSymbols(const SymbolSlab &Syms) const { return ResolvedInserted.takeError(); auto Spelled = Inserter->calculateIncludePath(*ResolvedInserted, File); if (!Spelled) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "Header not on include path"); + return error("Header not on include path"); return std::make_pair( std::move(*Spelled), Inserter->shouldInsertInclude(*ResolvedDeclaring, *ResolvedInserted)); diff --git a/clang-tools-extra/clangd/JSONTransport.cpp b/clang-tools-extra/clangd/JSONTransport.cpp index fa86baf6c5816..eb5a83882b2bd 100644 --- a/clang-tools-extra/clangd/JSONTransport.cpp +++ b/clang-tools-extra/clangd/JSONTransport.cpp @@ -12,6 +12,7 @@ #include "support/Shutdown.h" #include "llvm/Support/Errno.h" #include "llvm/Support/Error.h" +#include <system_error> namespace clang { namespace clangd { @@ -51,12 +52,10 @@ llvm::json::Object encodeError(llvm::Error E) { } llvm::Error decodeError(const llvm::json::Object &O) { - std::string Msg = - std::string(O.getString("message").getValueOr("Unspecified error")); + llvm::StringRef Msg = O.getString("message").getValueOr("Unspecified error"); if (auto Code = O.getInteger("code")) - return llvm::make_error<LSPError>(std::move(Msg), ErrorCode(*Code)); - return llvm::make_error<llvm::StringError>(std::move(Msg), - llvm::inconvertibleErrorCode()); + return llvm::make_error<LSPError>(Msg.str(), ErrorCode(*Code)); + return error(Msg.str()); } class JSONTransport : public Transport { @@ -102,9 +101,8 @@ class JSONTransport : public Transport { llvm::Error loop(MessageHandler &Handler) override { while (!feof(In)) { if (shutdownRequested()) - return llvm::createStringError( - std::make_error_code(std::errc::operation_canceled), - "Got signal, shutting down"); + return error(std::make_error_code(std::errc::operation_canceled), - "Got signal, shutting down"); wait + "Got signal, shutting down"); if (ferror(In)) return llvm::errorCodeToError( std::error_code(errno, std::system_category())); diff --git a/clang-tools-extra/clangd/PathMapping.cpp b/clang-tools-extra/clangd/PathMapping.cpp index eb568b917966d..0cd9d22b998ca 100644 --- a/clang-tools-extra/clangd/PathMapping.cpp +++ b/clang-tools-extra/clangd/PathMapping.cpp @@ -8,6 +8,7 @@ #include "PathMapping.h" #include "Transport.h" #include "URI.h" +#include "support/Logger.h" #include "llvm/ADT/None.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Errno.h" @@ -156,8 +157,7 @@ llvm::Expected<std::string> parsePath(llvm::StringRef Path) { Converted 
= "/" + Converted; return Converted; } - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "Path not absolute: " + Path); + return error("Path not absolute: {0}", Path); } } // namespace @@ -174,9 +174,7 @@ parsePathMappings(llvm::StringRef RawPathMappings) { std::tie(PathPair, Rest) = Rest.split(","); std::tie(ClientPath, ServerPath) = PathPair.split("="); if (ClientPath.empty() || ServerPath.empty()) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "Not a valid path mapping pair: " + - PathPair); + return error("Not a valid path mapping pair: {0}", PathPair); llvm::Expected ParsedClientPath = parsePath(ClientPath); if (!ParsedClientPath) return ParsedClientPath.takeError(); diff --git a/clang-tools-extra/clangd/Preamble.cpp b/clang-tools-extra/clangd/Preamble.cpp index b71afa0b16191..8e1ad7242eb01 100644 --- a/clang-tools-extra/clangd/Preamble.cpp +++ b/clang-tools-extra/clangd/Preamble.cpp @@ -243,8 +243,7 @@ scanPreamble(llvm::StringRef Contents, const tooling::CompileCommand &Cmd) { IgnoringDiagConsumer IgnoreDiags; auto CI = buildCompilerInvocation(PI, IgnoreDiags); if (!CI) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "failed to create compiler invocation"); + return error("failed to create compiler invocation"); CI->getDiagnosticOpts().IgnoreWarnings = true; auto ContentsBuffer = llvm::MemoryBuffer::getMemBuffer(Contents); // This means we're scanning (though not preprocessing) the preamble section @@ -260,14 +259,12 @@ scanPreamble(llvm::StringRef Contents, const tooling::CompileCommand &Cmd) { // also implies missing resolved paths for includes. FS.view(llvm::None), IgnoreDiags); if (Clang->getFrontendOpts().Inputs.empty()) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "compiler instance had no inputs"); + return error("compiler instance had no inputs"); // We are only interested in main file includes. 
Clang->getPreprocessorOpts().SingleFileParseMode = true; PreprocessOnlyAction Action; if (!Action.BeginSourceFile(*Clang, Clang->getFrontendOpts().Inputs[0])) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "failed BeginSourceFile"); + return error("failed BeginSourceFile"); const auto &SM = Clang->getSourceManager(); Preprocessor &PP = Clang->getPreprocessor(); IncludeStructure Includes; diff --git a/clang-tools-extra/clangd/RIFF.cpp b/clang-tools-extra/clangd/RIFF.cpp index f59200bd58561..8423580f9b46d 100644 --- a/clang-tools-extra/clangd/RIFF.cpp +++ b/clang-tools-extra/clangd/RIFF.cpp @@ -7,35 +7,28 @@ //===----------------------------------------------------------------------===// #include "RIFF.h" +#include "support/Logger.h" #include "llvm/Support/Endian.h" -#include "llvm/Support/Error.h" namespace clang { namespace clangd { namespace riff { -static llvm::Error makeError(const llvm::Twine &Msg) { - return llvm::make_error<llvm::StringError>(Msg, - llvm::inconvertibleErrorCode()); -} - llvm::Expected<Chunk> readChunk(llvm::StringRef &Stream) { if (Stream.size() < 8) - return makeError("incomplete chunk header: " + llvm::Twine(Stream.size()) + - " bytes available"); + return error("incomplete chunk header: {0} bytes available", Stream.size()); Chunk C; std::copy(Stream.begin(), Stream.begin() + 4, C.ID.begin()); Stream = Stream.drop_front(4); uint32_t Len = llvm::support::endian::read32le(Stream.take_front(4).begin()); Stream = Stream.drop_front(4); if (Stream.size() < Len) - return makeError("truncated chunk: want " + llvm::Twine(Len) + ", got " + - llvm::Twine(Stream.size())); + return error("truncated chunk: want {0}, got {1}", Len, Stream.size()); C.Data = Stream.take_front(Len); Stream = Stream.drop_front(Len); if ((Len % 2) && !Stream.empty()) { // Skip padding byte. 
if (Stream.front()) - return makeError("nonzero padding byte"); + return error("nonzero padding byte"); Stream = Stream.drop_front(); } return std::move(C); @@ -57,9 +50,9 @@ llvm::Expected<File> readFile(llvm::StringRef Stream) { if (!RIFF) return RIFF.takeError(); if (RIFF->ID != fourCC("RIFF")) - return makeError("not a RIFF container: root is " + fourCCStr(RIFF->ID)); + return error("not a RIFF container: root is {0}", fourCCStr(RIFF->ID)); if (RIFF->Data.size() < 4) - return makeError("RIFF chunk too short"); + return error("RIFF chunk too short"); File F; std::copy(RIFF->Data.begin(), RIFF->Data.begin() + 4, F.Type.begin()); for (llvm::StringRef Body = RIFF->Data.drop_front(4); !Body.empty();) diff --git a/clang-tools-extra/clangd/SourceCode.cpp b/clang-tools-extra/clangd/SourceCode.cpp index 2b50aea82fb28..0432097b43488 100644 --- a/clang-tools-extra/clangd/SourceCode.cpp +++ b/clang-tools-extra/clangd/SourceCode.cpp @@ -175,20 +175,17 @@ size_t lspLength(llvm::StringRef Code) { llvm::Expected<size_t> positionToOffset(llvm::StringRef Code, Position P, bool AllowColumnsBeyondLineLength) { if (P.line < 0) - return llvm::make_error<llvm::StringError>( - llvm::formatv("Line value can't be negative ({0})", P.line), - llvm::errc::invalid_argument); + return error(llvm::errc::invalid_argument, + "Line value can't be negative ({0})", P.line); if (P.character < 0) - return llvm::make_error<llvm::StringError>( - llvm::formatv("Character value can't be negative ({0})", P.character), - llvm::errc::invalid_argument); + return error(llvm::errc::invalid_argument, + "Character value can't be negative ({0})", P.character); size_t StartOfLine = 0; for (int I = 0; I != P.line; ++I) { size_t NextNL = Code.find('\n', StartOfLine); if (NextNL == llvm::StringRef::npos) - return llvm::make_error<llvm::StringError>( - llvm::formatv("Line value is out of range ({0})", P.line), - llvm::errc::invalid_argument); + return error(llvm::errc::invalid_argument, + "Line value is out of range ({0})", P.line); StartOfLine = NextNL + 1; } StringRef Line = @@ -198,10 +195,9 @@ llvm::Expected<size_t> positionToOffset(llvm::StringRef Code, Position P, bool Valid; size_t ByteInLine = measureUnits(Line, P.character, lspEncoding(), Valid); if (!Valid && !AllowColumnsBeyondLineLength) - return llvm::make_error<llvm::StringError>( - llvm::formatv("{0} offset {1} is invalid for line {2}", lspEncoding(), - P.character, P.line), - llvm::errc::invalid_argument); + return error(llvm::errc::invalid_argument, + "{0} offset {1} is invalid for line {2}", lspEncoding(), + P.character, P.line); return StartOfLine + ByteInLine; } diff --git a/clang-tools-extra/clangd/TUScheduler.cpp b/clang-tools-extra/clangd/TUScheduler.cpp index ed367005177b2..c408c8c0731de 100644 --- a/clang-tools-extra/clangd/TUScheduler.cpp +++ b/clang-tools-extra/clangd/TUScheduler.cpp @@ -717,8 +717,7 @@ void ASTWorker::runWithAST( [&AST, this]() { IdleASTs.put(this, std::move(*AST)); }); // Run the user-provided action. 
if (!*AST) - return Action(llvm::make_error<llvm::StringError>( - "invalid AST", llvm::errc::invalid_argument)); + return Action(error(llvm::errc::invalid_argument, "invalid AST")); vlog("ASTWorker running {0} on version {2} of {1}", Name, FileName, FileInputs.Version); Action(InputsAndAST{FileInputs, **AST}); diff --git a/clang-tools-extra/clangd/URI.cpp b/clang-tools-extra/clangd/URI.cpp index fad93143a30dd..f9e8fdc46fa7f 100644 --- a/clang-tools-extra/clangd/URI.cpp +++ b/clang-tools-extra/clangd/URI.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "URI.h" +#include "support/Logger.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/Error.h" @@ -21,11 +22,6 @@ namespace clang { namespace clangd { namespace { -inline llvm::Error make_string_error(const llvm::Twine &Message) { - return llvm::make_error<llvm::StringError>(Message, - llvm::inconvertibleErrorCode()); -} - bool isWindowsPath(llvm::StringRef Path) { return Path.size() > 1 && llvm::isAlpha(Path[0]) && Path[1] == ':'; } @@ -45,9 +41,9 @@ class FileSystemScheme : public URIScheme { getAbsolutePath(llvm::StringRef Authority, llvm::StringRef Body, llvm::StringRef /*HintPath*/) const override { if (!Body.startswith("/")) - return make_string_error("File scheme: expect body to be an absolute " - "path starting with '/': " + - Body); + return error("File scheme: expect body to be an absolute path starting " + "with '/': {0}", + Body); llvm::SmallString<128> Path; if (!Authority.empty()) { // Windows UNC paths e.g. file://server/share => \\server\share @@ -89,7 +85,7 @@ findSchemeByName(llvm::StringRef Scheme) { continue; return URIScheme.instantiate(); } - return make_string_error("Can't find scheme: " + Scheme); + return error("Can't find scheme: {0}", Scheme); } bool shouldEscape(unsigned char C) { @@ -187,12 +183,11 @@ llvm::Expected<URI> URI::parse(llvm::StringRef OrigUri) { auto Pos = Uri.find(':'); if (Pos == llvm::StringRef::npos) - return make_string_error("Scheme must be provided in URI: " + OrigUri); + return error("Scheme must be provided in URI: {0}", OrigUri); auto SchemeStr = Uri.substr(0, Pos); U.Scheme = percentDecode(SchemeStr); if (!isValidScheme(U.Scheme)) - return make_string_error(llvm::formatv("Invalid scheme: {0} (decoded: {1})", - SchemeStr, U.Scheme)); + return error("Invalid scheme: {0} (decoded: {1})", SchemeStr, U.Scheme); Uri = Uri.substr(Pos + 1); if (Uri.consume_front("//")) { Pos = Uri.find('/'); @@ -217,7 +212,7 @@ llvm::Expected<std::string> URI::resolve(llvm::StringRef FileURI, llvm::Expected<URI> URI::create(llvm::StringRef AbsolutePath, llvm::StringRef Scheme) { if (!llvm::sys::path::is_absolute(AbsolutePath)) - return make_string_error("Not a valid absolute path: " + AbsolutePath); + return error("Not a valid absolute path: {0}", AbsolutePath); auto S = findSchemeByName(Scheme); if (!S) return S.takeError(); diff --git a/clang-tools-extra/clangd/index/Background.cpp b/clang-tools-extra/clangd/index/Background.cpp index 2bac6ec39d308..a1aafeaf31a96 100644 --- a/clang-tools-extra/clangd/index/Background.cpp +++ b/clang-tools-extra/clangd/index/Background.cpp @@ -272,15 +272,13 @@ llvm::Error BackgroundIndex::index(tooling::CompileCommand Cmd) { IgnoreDiagnostics IgnoreDiags; auto CI = buildCompilerInvocation(Inputs, IgnoreDiags); if (!CI) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "Couldn't build compiler invocation"); + return error("Couldn't build compiler invocation"); auto Clang = prepareCompilerInstance(std::move(CI), 
/*Preamble=*/nullptr, std::move(*Buf), std::move(FS), IgnoreDiags); if (!Clang) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "Couldn't build compiler instance"); + return error("Couldn't build compiler instance"); SymbolCollector::Options IndexOpts; // Creates a filter to not collect index results from files with unchanged @@ -318,8 +316,7 @@ llvm::Error BackgroundIndex::index(tooling::CompileCommand Cmd) { const FrontendInputFile &Input = Clang->getFrontendOpts().Inputs.front(); if (!Action->BeginSourceFile(*Clang, Input)) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "BeginSourceFile() failed"); + return error("BeginSourceFile() failed"); if (llvm::Error Err = Action->Execute()) return Err; diff --git a/clang-tools-extra/clangd/index/Serialization.cpp b/clang-tools-extra/clangd/index/Serialization.cpp index 11d70b550642b..e7f65f087b1c4 100644 --- a/clang-tools-extra/clangd/index/Serialization.cpp +++ b/clang-tools-extra/clangd/index/Serialization.cpp @@ -25,10 +25,6 @@ namespace clang { namespace clangd { namespace { -llvm::Error makeError(const llvm::Twine &Msg) { - return llvm::make_error<llvm::StringError>(Msg, - llvm::inconvertibleErrorCode()); -} // IO PRIMITIVES // We use little-endian 32 bit ints, sometimes with variable-length encoding. @@ -199,18 +195,19 @@ llvm::Expected<StringTableIn> readStringTable(llvm::StringRef Data) { Reader R(Data); size_t UncompressedSize = R.consume32(); if (R.err()) - return makeError("Truncated string table"); + return error("Truncated string table"); llvm::StringRef Uncompressed; llvm::SmallString<1> UncompressedStorage; if (UncompressedSize == 0) // No compression Uncompressed = R.rest(); - else { + else if (llvm::zlib::isAvailable()) { if (llvm::Error E = llvm::zlib::uncompress(R.rest(), UncompressedStorage, UncompressedSize)) return std::move(E); Uncompressed = UncompressedStorage; - } + } else + return error("Compressed string table, but zlib is unavailable"); StringTableIn Table; llvm::StringSaver Saver(Table.Arena); @@ -218,12 +215,12 @@ llvm::Expected<StringTableIn> readStringTable(llvm::StringRef Data) { for (Reader R(Uncompressed); !R.eof();) { auto Len = R.rest().find(0); if (Len == llvm::StringRef::npos) - return makeError("Bad string table: not null terminated"); + return error("Bad string table: not null terminated"); Table.Strings.push_back(Saver.save(R.consume(Len))); R.consume8(); } if (R.err()) - return makeError("Truncated string table"); + return error("Truncated string table"); return std::move(Table); } @@ -426,24 +423,23 @@ llvm::Expected<IndexFileIn> readRIFF(llvm::StringRef Data) { if (!RIFF) return RIFF.takeError(); if (RIFF->Type != riff::fourCC("CdIx")) - return makeError("wrong RIFF filetype: " + riff::fourCCStr(RIFF->Type)); + return error("wrong RIFF filetype: {0}", riff::fourCCStr(RIFF->Type)); llvm::StringMap<llvm::StringRef> Chunks; for (const auto &Chunk : RIFF->Chunks) Chunks.try_emplace(llvm::StringRef(Chunk.ID.data(), Chunk.ID.size()), Chunk.Data); if (!Chunks.count("meta")) - return makeError("missing meta chunk"); + return error("missing meta chunk"); Reader Meta(Chunks.lookup("meta")); auto SeenVersion = Meta.consume32(); if (SeenVersion != Version) - return makeError("wrong version: want " + llvm::Twine(Version) + ", got " + - llvm::Twine(SeenVersion)); + return error("wrong version: want {0}, got {1}", Version, SeenVersion); // meta chunk is checked above, as we prefer the "version mismatch" error. 
for (llvm::StringRef RequiredChunk : {"stri"}) if (!Chunks.count(RequiredChunk)) - return makeError("missing required chunk " + RequiredChunk); + return error("missing required chunk {0}", RequiredChunk); auto Strings = readStringTable(Chunks.lookup("stri")); if (!Strings) @@ -464,7 +460,7 @@ llvm::Expected readRIFF(llvm::StringRef Data) { Include = Result.Sources->try_emplace(Include).first->getKey(); } if (SrcsReader.err()) - return makeError("malformed or truncated include uri"); + return error("malformed or truncated include uri"); } if (Chunks.count("symb")) { @@ -473,7 +469,7 @@ llvm::Expected readRIFF(llvm::StringRef Data) { while (!SymbolReader.eof()) Symbols.insert(readSymbol(SymbolReader, Strings->Strings)); if (SymbolReader.err()) - return makeError("malformed or truncated symbol"); + return error("malformed or truncated symbol"); Result.Symbols = std::move(Symbols).build(); } if (Chunks.count("refs")) { @@ -485,7 +481,7 @@ llvm::Expected readRIFF(llvm::StringRef Data) { Refs.insert(RefsBundle.first, Ref); } if (RefsReader.err()) - return makeError("malformed or truncated refs"); + return error("malformed or truncated refs"); Result.Refs = std::move(Refs).build(); } if (Chunks.count("rela")) { @@ -496,13 +492,13 @@ llvm::Expected readRIFF(llvm::StringRef Data) { Relations.insert(Relation); } if (RelationsReader.err()) - return makeError("malformed or truncated relations"); + return error("malformed or truncated relations"); Result.Relations = std::move(Relations).build(); } if (Chunks.count("cmdl")) { Reader CmdReader(Chunks.lookup("cmdl")); if (CmdReader.err()) - return makeError("malformed or truncated commandline section"); + return error("malformed or truncated commandline section"); InternedCompileCommand Cmd = readCompileCommand(CmdReader, Strings->Strings); Result.Cmd.emplace(); @@ -660,8 +656,8 @@ llvm::Expected readIndexFile(llvm::StringRef Data) { } else if (auto YAMLContents = readYAML(Data)) { return std::move(*YAMLContents); } else { - return makeError("Not a RIFF file and failed to parse as YAML: " + - llvm::toString(YAMLContents.takeError())); + return error("Not a RIFF file and failed to parse as YAML: {0}", + YAMLContents.takeError()); } } diff --git a/clang-tools-extra/clangd/index/SymbolID.cpp b/clang-tools-extra/clangd/index/SymbolID.cpp index b97103d377ca2..2bb3d4f0b6a0d 100644 --- a/clang-tools-extra/clangd/index/SymbolID.cpp +++ b/clang-tools-extra/clangd/index/SymbolID.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "SymbolID.h" +#include "support/Logger.h" #include "llvm/Support/SHA1.h" namespace clang { @@ -34,12 +35,10 @@ std::string SymbolID::str() const { return llvm::toHex(raw()); } llvm::Expected SymbolID::fromStr(llvm::StringRef Str) { if (Str.size() != RawSize * 2) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "Bad ID length"); + return error("Bad ID length"); for (char C : Str) if (!llvm::isHexDigit(C)) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "Bad hex ID"); + return error("Bad hex ID"); return fromRaw(llvm::fromHex(Str)); } diff --git a/clang-tools-extra/clangd/index/YAMLSerialization.cpp b/clang-tools-extra/clangd/index/YAMLSerialization.cpp index 4f6bd927cc196..d269a3b36eb48 100644 --- a/clang-tools-extra/clangd/index/YAMLSerialization.cpp +++ b/clang-tools-extra/clangd/index/YAMLSerialization.cpp @@ -18,6 +18,7 @@ #include "SymbolLocation.h" #include "SymbolOrigin.h" #include "dex/Dex.h" +#include "support/Logger.h" #include 
"support/Trace.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" @@ -533,9 +534,7 @@ symbolFromYAML(StringRef YAML, llvm::UniqueStringSaver *Strings) { clangd::Symbol Deserialized; llvm::yaml::Input YAMLInput(YAML, Strings); if (YAMLInput.error()) - return llvm::make_error( - llvm::formatv("Unable to deserialize Symbol from YAML: {0}", YAML), - llvm::inconvertibleErrorCode()); + return error("Unable to deserialize Symbol from YAML: {0}", YAML); YAMLInput >> Deserialized; return Deserialized; } @@ -545,9 +544,7 @@ llvm::Expected refFromYAML(StringRef YAML, clangd::Ref Deserialized; llvm::yaml::Input YAMLInput(YAML, Strings); if (YAMLInput.error()) - return llvm::make_error( - llvm::formatv("Unable to deserialize Symbol from YAML: {0}", YAML), - llvm::inconvertibleErrorCode()); + return error("Unable to deserialize Symbol from YAML: {0}", YAML); YAMLInput >> Deserialized; return Deserialized; } diff --git a/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.cpp b/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.cpp index cfc72ce87be61..839250982a03b 100644 --- a/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.cpp +++ b/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.cpp @@ -45,11 +45,6 @@ llvm::Expected> getIDs(IDRange IDs) { return Result; } -llvm::Error makeStringError(llvm::StringRef Message) { - return llvm::make_error(Message, - llvm::inconvertibleErrorCode()); -} - } // namespace Marshaller::Marshaller(llvm::StringRef RemoteIndexRoot, @@ -132,7 +127,7 @@ Marshaller::fromProtobuf(const RelationsRequest *Message) { llvm::Expected Marshaller::fromProtobuf(const Symbol &Message) { if (!Message.has_info() || !Message.has_canonical_declaration()) - return makeStringError("Missing info or declaration."); + return error("Missing info or declaration."); clangd::Symbol Result; auto ID = SymbolID::fromStr(Message.id()); if (!ID) @@ -170,7 +165,7 @@ llvm::Expected Marshaller::fromProtobuf(const Symbol &Message) { llvm::Expected Marshaller::fromProtobuf(const Ref &Message) { if (!Message.has_location()) - return makeStringError("Missing location."); + return error("Missing location."); clangd::Ref Result; auto Location = fromProtobuf(Message.location()); if (!Location) @@ -186,7 +181,7 @@ Marshaller::fromProtobuf(const Relation &Message) { if (!SubjectID) return SubjectID.takeError(); if (!Message.has_object()) - return makeStringError("Missing Object."); + return error("Missing Object."); auto Object = fromProtobuf(Message.object()); if (!Object) return Object.takeError(); @@ -304,10 +299,9 @@ Marshaller::relativePathToURI(llvm::StringRef RelativePath) { assert(RelativePath == llvm::sys::path::convert_to_slash( RelativePath, llvm::sys::path::Style::posix)); if (RelativePath.empty()) - return makeStringError("Empty relative path."); + return error("Empty relative path."); if (llvm::sys::path::is_absolute(RelativePath)) - return makeStringError( - llvm::formatv("RelativePath '{0}' is absolute.", RelativePath).str()); + return error("RelativePath '{0}' is absolute.", RelativePath); llvm::SmallString<256> FullPath = llvm::StringRef(*LocalIndexRoot); llvm::sys::path::append(FullPath, RelativePath); auto Result = URI::createFile(FullPath); @@ -320,16 +314,11 @@ llvm::Expected Marshaller::uriToRelativePath(llvm::StringRef URI) { if (!ParsedURI) return ParsedURI.takeError(); if (ParsedURI->scheme() != "file") - return makeStringError( - llvm::formatv("Can not use URI schemes other than file, given: '{0}'.", - URI) - .str()); + return 
error("Can not use URI schemes other than file, given: '{0}'.", URI); llvm::SmallString<256> Result = ParsedURI->body(); if (!llvm::sys::path::replace_path_prefix(Result, *RemoteIndexRoot, "")) - return makeStringError( - llvm::formatv("File path '{0}' doesn't start with '{1}'.", Result.str(), - *RemoteIndexRoot) - .str()); + return error("File path '{0}' doesn't start with '{1}'.", Result.str(), + *RemoteIndexRoot); // Make sure the result has UNIX slashes. return llvm::sys::path::convert_to_slash(Result, llvm::sys::path::Style::posix); diff --git a/clang-tools-extra/clangd/index/remote/server/Server.cpp b/clang-tools-extra/clangd/index/remote/server/Server.cpp index e9838cce85e3d..d8cf542496627 100644 --- a/clang-tools-extra/clangd/index/remote/server/Server.cpp +++ b/clang-tools-extra/clangd/index/remote/server/Server.cpp @@ -12,15 +12,25 @@ #include "index/Symbol.h" #include "index/remote/marshalling/Marshalling.h" #include "support/Logger.h" +#include "support/Shutdown.h" +#include "support/ThreadsafeFS.h" #include "support/Trace.h" +#include "llvm/ADT/IntrusiveRefCntPtr.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Chrono.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" #include "llvm/Support/Signals.h" +#include "llvm/Support/VirtualFileSystem.h" +#include #include #include +#include +#include #include "Index.grpc.pb.h" @@ -63,15 +73,10 @@ llvm::cl::opt ServerAddress( "server-address", llvm::cl::init("0.0.0.0:50051"), llvm::cl::desc("Address of the invoked server. Defaults to 0.0.0.0:50051")); -std::unique_ptr openIndex(llvm::StringRef Index) { - return loadIndex(Index, /*UseIndex=*/true); -} - class RemoteIndexServer final : public SymbolIndex::Service { public: - RemoteIndexServer(std::unique_ptr Index, - llvm::StringRef IndexRoot) - : Index(std::move(Index)) { + RemoteIndexServer(clangd::SymbolIndex &Index, llvm::StringRef IndexRoot) + : Index(Index) { llvm::SmallString<256> NativePath = IndexRoot; llvm::sys::path::native(NativePath); ProtobufMarshaller = std::unique_ptr(new Marshaller( @@ -91,7 +96,7 @@ class RemoteIndexServer final : public SymbolIndex::Service { } unsigned Sent = 0; unsigned FailedToSend = 0; - Index->lookup(*Req, [&](const clangd::Symbol &Item) { + Index.lookup(*Req, [&](const clangd::Symbol &Item) { auto SerializedItem = ProtobufMarshaller->toProtobuf(Item); if (!SerializedItem) { elog("Unable to convert Symbol to protobuf: {0}", @@ -124,7 +129,7 @@ class RemoteIndexServer final : public SymbolIndex::Service { } unsigned Sent = 0; unsigned FailedToSend = 0; - bool HasMore = Index->fuzzyFind(*Req, [&](const clangd::Symbol &Item) { + bool HasMore = Index.fuzzyFind(*Req, [&](const clangd::Symbol &Item) { auto SerializedItem = ProtobufMarshaller->toProtobuf(Item); if (!SerializedItem) { elog("Unable to convert Symbol to protobuf: {0}", @@ -155,7 +160,7 @@ class RemoteIndexServer final : public SymbolIndex::Service { } unsigned Sent = 0; unsigned FailedToSend = 0; - bool HasMore = Index->refs(*Req, [&](const clangd::Ref &Item) { + bool HasMore = Index.refs(*Req, [&](const clangd::Ref &Item) { auto SerializedItem = ProtobufMarshaller->toProtobuf(Item); if (!SerializedItem) { elog("Unable to convert Ref to protobuf: {0}", @@ -188,7 +193,7 @@ class RemoteIndexServer final : public SymbolIndex::Service { } unsigned Sent = 0; unsigned FailedToSend = 0; - Index->relations( + Index.relations( *Req, [&](const SymbolID &Subject, const 
clangd::Symbol &Object) { auto SerializedItem = ProtobufMarshaller->toProtobuf(Subject, Object); if (!SerializedItem) { @@ -210,22 +215,56 @@ class RemoteIndexServer final : public SymbolIndex::Service { return grpc::Status::OK; } - std::unique_ptr Index; std::unique_ptr ProtobufMarshaller; + clangd::SymbolIndex &Index; }; -void runServer(std::unique_ptr Index, - const std::string &ServerAddress) { - RemoteIndexServer Service(std::move(Index), IndexRoot); +// Detect changes in \p IndexPath file and load new versions of the index +// whenever they become available. +void hotReload(clangd::SwapIndex &Index, llvm::StringRef IndexPath, + llvm::vfs::Status &LastStatus, + llvm::IntrusiveRefCntPtr &FS) { + auto Status = FS->status(IndexPath); + // Requested file is same as loaded index: no reload is needed. + if (!Status || (Status->getLastModificationTime() == + LastStatus.getLastModificationTime() && + Status->getSize() == LastStatus.getSize())) + return; + vlog("Found different index version: existing index was modified at {0}, new " + "index was modified at {1}. Attempting to reload.", + LastStatus.getLastModificationTime(), Status->getLastModificationTime()); + LastStatus = *Status; + std::unique_ptr NewIndex = loadIndex(IndexPath); + if (!NewIndex) { + elog("Failed to load new index. Old index will be served."); + return; + } + Index.reset(std::move(NewIndex)); + log("New index version loaded. Last modification time: {0}, size: {1} bytes.", + Status->getLastModificationTime(), Status->getSize()); +} + +void runServerAndWait(clangd::SymbolIndex &Index, llvm::StringRef ServerAddress, + llvm::StringRef IndexPath) { + RemoteIndexServer Service(Index, IndexRoot); grpc::EnableDefaultHealthCheckService(true); grpc::ServerBuilder Builder; - Builder.AddListeningPort(ServerAddress, grpc::InsecureServerCredentials()); + Builder.AddListeningPort(ServerAddress.str(), + grpc::InsecureServerCredentials()); Builder.RegisterService(&Service); std::unique_ptr Server(Builder.BuildAndStart()); log("Server listening on {0}", ServerAddress); + std::thread ServerShutdownWatcher([&]() { + static constexpr auto WatcherFrequency = std::chrono::seconds(5); + while (!clang::clangd::shutdownRequested()) + std::this_thread::sleep_for(WatcherFrequency); + Server->Shutdown(); + }); + Server->Wait(); + ServerShutdownWatcher.join(); } } // namespace @@ -239,6 +278,7 @@ int main(int argc, char *argv[]) { using namespace clang::clangd::remote; llvm::cl::ParseCommandLineOptions(argc, argv, Overview); llvm::sys::PrintStackTraceOnErrorSignal(argv[0]); + llvm::sys::SetInterruptFunction(&clang::clangd::requestShutdown); if (!llvm::sys::path::is_absolute(IndexRoot)) { llvm::errs() << "Index root should be an absolute path.\n"; @@ -273,12 +313,32 @@ int main(int argc, char *argv[]) { if (Tracer) TracingSession.emplace(*Tracer); - std::unique_ptr Index = openIndex(IndexPath); + clang::clangd::RealThreadsafeFS TFS; + auto FS = TFS.view(llvm::None); + auto Status = FS->status(IndexPath); + if (!Status) { + elog("{0} does not exist.", IndexPath); + return Status.getError().value(); + } + + auto Index = std::make_unique( + clang::clangd::loadIndex(IndexPath)); if (!Index) { llvm::errs() << "Failed to open the index.\n"; return -1; } - runServer(std::move(Index), ServerAddress); + std::thread HotReloadThread([&Index, &Status, &FS]() { + llvm::vfs::Status LastStatus = *Status; + static constexpr auto RefreshFrequency = std::chrono::seconds(90); + while (!clang::clangd::shutdownRequested()) { + hotReload(*Index, llvm::StringRef(IndexPath), 
LastStatus, FS); + std::this_thread::sleep_for(RefreshFrequency); + } + }); + + runServerAndWait(*Index, ServerAddress, IndexPath); + + HotReloadThread.join(); } diff --git a/clang-tools-extra/clangd/refactor/Rename.cpp b/clang-tools-extra/clangd/refactor/Rename.cpp index ea75de6e86eac..2744caa586485 100644 --- a/clang-tools-extra/clangd/refactor/Rename.cpp +++ b/clang-tools-extra/clangd/refactor/Rename.cpp @@ -213,9 +213,7 @@ llvm::Error makeError(ReasonToReject Reason) { } llvm_unreachable("unhandled reason kind"); }; - return llvm::make_error( - llvm::formatv("Cannot rename symbol: {0}", Message(Reason)), - llvm::inconvertibleErrorCode()); + return error("Cannot rename symbol: {0}", Message(Reason)); } // Return all rename occurrences in the main file. @@ -319,16 +317,11 @@ findOccurrencesOutsideFile(const NamedDecl &RenameDecl, }); if (AffectedFiles.size() >= MaxLimitFiles) - return llvm::make_error( - llvm::formatv("The number of affected files exceeds the max limit {0}", - MaxLimitFiles), - llvm::inconvertibleErrorCode()); - if (HasMore) { - return llvm::make_error( - llvm::formatv("The symbol {0} has too many occurrences", - RenameDecl.getQualifiedNameAsString()), - llvm::inconvertibleErrorCode()); - } + return error("The number of affected files exceeds the max limit {0}", + MaxLimitFiles); + if (HasMore) + return error("The symbol {0} has too many occurrences", + RenameDecl.getQualifiedNameAsString()); // Sort and deduplicate the results, in case that index returns duplications. for (auto &FileAndOccurrences : AffectedFiles) { auto &Ranges = FileAndOccurrences.getValue(); @@ -379,20 +372,15 @@ llvm::Expected renameOutsideFile( // Our heuristics fails to adjust rename ranges to the current state of // the file, it is most likely the index is stale, so we give up the // entire rename. 
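The server changes above add hot reload: a watcher thread polls the index file's size and modification time and, when they change, loads a fresh index and swaps it in while in-flight requests keep the snapshot they already hold. A distilled, standard-library-only sketch of that polling-and-publish loop; Snapshot, watchAndReload, and the poll period are illustrative, not clangd's API:

  #include <atomic>
  #include <chrono>
  #include <filesystem>
  #include <memory>
  #include <mutex>
  #include <thread>

  struct Snapshot { /* parsed index data */ };

  std::mutex IndexMu;
  std::shared_ptr<Snapshot> Index; // Readers copy this pointer under the lock.

  void watchAndReload(const std::filesystem::path &File,
                      const std::atomic<bool> &ShutdownRequested) {
    std::error_code EC;
    auto LastWrite = std::filesystem::last_write_time(File, EC);
    while (!ShutdownRequested) {
      std::this_thread::sleep_for(std::chrono::seconds(5));
      auto Now = std::filesystem::last_write_time(File, EC);
      if (EC || Now == LastWrite)
        continue; // Missing or unchanged file: keep serving the old snapshot.
      LastWrite = Now;
      auto Fresh = std::make_shared<Snapshot>(); // ...load from File here...
      std::lock_guard<std::mutex> Lock(IndexMu);
      Index = std::move(Fresh); // Publish; existing readers are unaffected.
    }
  }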
- return llvm::make_error( - llvm::formatv("Index results don't match the content of file {0} " - "(the index may be stale)", - FilePath), - llvm::inconvertibleErrorCode()); + return error("Index results don't match the content of file {0} " + "(the index may be stale)", + FilePath); } auto RenameEdit = buildRenameEdit(FilePath, *AffectedFileCode, *RenameRanges, NewName); - if (!RenameEdit) { - return llvm::make_error( - llvm::formatv("fail to build rename edit for file {0}: {1}", FilePath, - llvm::toString(RenameEdit.takeError())), - llvm::inconvertibleErrorCode()); - } + if (!RenameEdit) + return error("failed to rename in file {0}: {1}", FilePath, + RenameEdit.takeError()); if (!RenameEdit->Replacements.empty()) Results.insert({FilePath, std::move(*RenameEdit)}); } @@ -455,14 +443,10 @@ llvm::Expected rename(const RenameInputs &RInputs) { auto Content = SM.getFileManager().getVirtualFileSystem().getBufferForFile(AbsPath); if (!Content) - return llvm::createStringError( - llvm::inconvertibleErrorCode(), - llvm::formatv("Fail to open file {0}: {1}", AbsPath, - Content.getError().message())); + return error("Fail to open file {0}: {1}", AbsPath, + Content.getError().message()); if (!*Content) - return llvm::createStringError( - llvm::inconvertibleErrorCode(), - llvm::formatv("Got no buffer for file {0}", AbsPath)); + return error("Got no buffer for file {0}", AbsPath); return (*Content)->getBuffer().str(); }; @@ -559,10 +543,8 @@ llvm::Expected buildRenameEdit(llvm::StringRef AbsFilePath, auto ShiftedOffset = positionToOffset(InitialCode.substr(LastOffset), Shifted); if (!ShiftedOffset) - return llvm::make_error( - llvm::formatv("fail to convert the position {0} to offset ({1})", P, - llvm::toString(ShiftedOffset.takeError())), - llvm::inconvertibleErrorCode()); + return error("fail to convert the position {0} to offset ({1})", P, + ShiftedOffset.takeError()); LastPos = P; LastOffset += *ShiftedOffset; return LastOffset; diff --git a/clang-tools-extra/clangd/refactor/Tweak.cpp b/clang-tools-extra/clangd/refactor/Tweak.cpp index b1f4dcd69af6b..34b5b2b544dff 100644 --- a/clang-tools-extra/clangd/refactor/Tweak.cpp +++ b/clang-tools-extra/clangd/refactor/Tweak.cpp @@ -80,12 +80,10 @@ llvm::Expected> prepareTweak(StringRef ID, TweakRegistry::entries(), [ID](const TweakRegistry::entry &E) { return E.getName() == ID; }); if (It == TweakRegistry::end()) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "id of the tweak is invalid"); + return error("tweak ID {0} is invalid", ID); std::unique_ptr T = It->instantiate(); if (!T->prepare(S)) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "failed to prepare() a check"); + return error("failed to prepare() tweak {0}", ID); return std::move(T); } @@ -95,10 +93,8 @@ Tweak::Effect::fileEdit(const SourceManager &SM, FileID FID, Edit Ed(SM.getBufferData(FID), std::move(Replacements)); if (auto FilePath = getCanonicalPath(SM.getFileEntryForID(FID), SM)) return std::make_pair(*FilePath, std::move(Ed)); - return llvm::createStringError( - llvm::inconvertibleErrorCode(), - "Failed to get absolute path for edited file: " + - SM.getFileEntryForID(FID)->getName()); + return error("Failed to get absolute path for edited file: {0}", + SM.getFileEntryForID(FID)->getName()); } llvm::Expected diff --git a/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp b/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp index e4900041671a4..d5e6e12b31aad 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp +++ 
b/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp @@ -169,8 +169,7 @@ findInsertionPoint(const Tweak::Selection &Inputs, return Tok.kind() == tok::l_brace; }); if (Tok == Toks.end() || Tok->endLocation().isInvalid()) { - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "Namespace with no {"); + return error("Namespace with no {"); } if (!Tok->endLocation().isMacroID()) { InsertionPointData Out; @@ -183,8 +182,7 @@ findInsertionPoint(const Tweak::Selection &Inputs, // top level decl. auto TLDs = Inputs.AST->getLocalTopLevelDecls(); if (TLDs.empty()) { - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "Cannot find place to insert \"using\""); + return error("Cannot find place to insert \"using\""); } InsertionPointData Out; Out.Loc = SM.getExpansionLoc(TLDs[0]->getBeginLoc()); @@ -272,9 +270,7 @@ Expected AddUsing::apply(const Selection &Inputs) { auto SpelledTokens = TB.spelledForExpanded( TB.expandedTokens(QualifierToRemove.getSourceRange())); if (!SpelledTokens) { - return llvm::createStringError( - llvm::inconvertibleErrorCode(), - "Could not determine length of the qualifier"); + return error("Could not determine length of the qualifier"); } unsigned Length = syntax::Token::range(SM, SpelledTokens->front(), SpelledTokens->back()) diff --git a/clang-tools-extra/clangd/refactor/tweaks/DefineInline.cpp b/clang-tools-extra/clangd/refactor/tweaks/DefineInline.cpp index 698d2a406811a..cdd5f9c6595b0 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/DefineInline.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/DefineInline.cpp @@ -205,18 +205,15 @@ llvm::Expected qualifyAllDecls(const FunctionDecl *FD, } }); - if (HadErrors) { - return llvm::createStringError( - llvm::inconvertibleErrorCode(), - "define inline: Failed to compute qualifiers see logs for details."); - } + if (HadErrors) + return error( + "define inline: Failed to compute qualifiers. See logs for details."); // Get new begin and end positions for the qualified body. auto OrigBodyRange = toHalfOpenFileRange( SM, FD->getASTContext().getLangOpts(), FD->getBody()->getSourceRange()); if (!OrigBodyRange) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "Couldn't get range func body."); + return error("Couldn't get range func body."); unsigned BodyBegin = SM.getFileOffset(OrigBodyRange->getBegin()); unsigned BodyEnd = Replacements.getShiftedCodePosition( @@ -311,9 +308,7 @@ renameParameters(const FunctionDecl *Dest, const FunctionDecl *Source) { ReplaceRange = Lexer::makeFileCharRange(ReplaceRange, SM, LangOpts); // Bail out if we need to replace macro bodies. 
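The refactoring hunks in this area all share one shape: helpers return llvm::Expected<T>, failures are built with the new error() helper, and callers forward failures unchanged via takeError(). A self-contained sketch of that shape, using plain llvm::createStringError so it compiles without clangd's headers; parseCount and doubled are invented names:

  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/Error.h"

  static llvm::Expected<int> parseCount(llvm::StringRef S) {
    int N = 0;
    if (S.getAsInteger(10, N)) // Returns true on parse failure.
      return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                     "not an integer: %s", S.str().c_str());
    return N;
  }

  static llvm::Expected<int> doubled(llvm::StringRef S) {
    auto N = parseCount(S);
    if (!N)
      return N.takeError(); // Forward the failure to our caller unchanged.
    return *N * 2;
  }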
if (ReplaceRange.isInvalid()) { - auto Err = llvm::createStringError( - llvm::inconvertibleErrorCode(), - "Cant rename parameter inside macro body."); + auto Err = error("Cant rename parameter inside macro body."); elog("define inline: {0}", Err); return std::move(Err); } @@ -450,11 +445,8 @@ class DefineInline : public Tweak { const auto &SM = AST.getSourceManager(); auto Semicolon = getSemicolonForDecl(Target); - if (!Semicolon) { - return llvm::createStringError( - llvm::inconvertibleErrorCode(), - "Couldn't find semicolon for target declaration."); - } + if (!Semicolon) + return error("Couldn't find semicolon for target declaration."); auto AddInlineIfNecessary = addInlineIfInHeader(Target); auto ParamReplacements = renameParameters(Target, Source); @@ -479,10 +471,8 @@ class DefineInline : public Tweak { SM.getExpansionRange(CharSourceRange::getCharRange(getBeginLoc(Source), Source->getEndLoc())) .getAsRange()); - if (!DefRange) { - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "Couldn't get range for the source."); - } + if (!DefRange) + return error("Couldn't get range for the source."); unsigned int SourceLen = SM.getFileOffset(DefRange->getEnd()) - SM.getFileOffset(DefRange->getBegin()); const tooling::Replacement DeleteFuncBody(SM, DefRange->getBegin(), diff --git a/clang-tools-extra/clangd/refactor/tweaks/DefineOutline.cpp b/clang-tools-extra/clangd/refactor/tweaks/DefineOutline.cpp index 66d9c4c36b122..ed4d0cc462692 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/DefineOutline.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/DefineOutline.cpp @@ -120,8 +120,7 @@ getFunctionSourceAfterReplacements(const FunctionDecl *FD, auto OrigFuncRange = toHalfOpenFileRange( SM, FD->getASTContext().getLangOpts(), FD->getSourceRange()); if (!OrigFuncRange) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "Couldn't get range for function."); + return error("Couldn't get range for function."); assert(!FD->getDescribedFunctionTemplate() && "Define out-of-line doesn't apply to function templates."); @@ -151,9 +150,7 @@ getFunctionSourceCode(const FunctionDecl *FD, llvm::StringRef TargetNamespace, auto &SM = AST.getSourceManager(); auto TargetContext = findContextForNS(TargetNamespace, FD->getDeclContext()); if (!TargetContext) - return llvm::createStringError( - llvm::inconvertibleErrorCode(), - "define outline: couldn't find a context for target"); + return error("define outline: couldn't find a context for target"); llvm::Error Errors = llvm::Error::success(); tooling::Replacements DeclarationCleanups; @@ -219,12 +216,9 @@ getFunctionSourceCode(const FunctionDecl *FD, llvm::StringRef TargetNamespace, assert(A->getLocation().isValid()); if (!AttrTokens || AttrTokens->empty()) { Errors = llvm::joinErrors( - std::move(Errors), - llvm::createStringError( - llvm::inconvertibleErrorCode(), - llvm::StringRef("define outline: Can't move out of line as " - "function has a macro `") + - A->getSpelling() + "` specifier.")); + std::move(Errors), error("define outline: Can't move out of line as " + "function has a macro `{0}` specifier.", + A->getSpelling())); return; } CharSourceRange DelRange = @@ -248,10 +242,8 @@ getFunctionSourceCode(const FunctionDecl *FD, llvm::StringRef TargetNamespace, if (!Spelling) { Errors = llvm::joinErrors( std::move(Errors), - llvm::createStringError( - llvm::inconvertibleErrorCode(), - llvm::formatv("define outline: couldn't remove `{0}` keyword.", - tok::getKeywordSpelling(Kind)))); + error("define outline: couldn't remove `{0}` 
keyword.", + tok::getKeywordSpelling(Kind))); break; } CharSourceRange DelRange = @@ -264,11 +256,8 @@ getFunctionSourceCode(const FunctionDecl *FD, llvm::StringRef TargetNamespace, if (!FoundAny) { Errors = llvm::joinErrors( std::move(Errors), - llvm::createStringError( - llvm::inconvertibleErrorCode(), - llvm::formatv( - "define outline: couldn't find `{0}` keyword to remove.", - tok::getKeywordSpelling(Kind)))); + error("define outline: couldn't find `{0}` keyword to remove.", + tok::getKeywordSpelling(Kind))); } }; @@ -411,15 +400,11 @@ class DefineOutline : public Tweak { auto MainFileName = getCanonicalPath(SM.getFileEntryForID(SM.getMainFileID()), SM); if (!MainFileName) - return llvm::createStringError( - llvm::inconvertibleErrorCode(), - "Couldn't get absolute path for mainfile."); + return error("Couldn't get absolute path for main file."); auto CCFile = getSourceFile(*MainFileName, Sel); if (!CCFile) - return llvm::createStringError( - llvm::inconvertibleErrorCode(), - "Couldn't find a suitable implementation file."); + return error("Couldn't find a suitable implementation file."); auto &FS = Sel.AST->getSourceManager().getFileManager().getVirtualFileSystem(); @@ -427,8 +412,7 @@ class DefineOutline : public Tweak { // FIXME: Maybe we should consider creating the implementation file if it // doesn't exist? if (!Buffer) - return llvm::createStringError(Buffer.getError(), - Buffer.getError().message()); + return llvm::errorCodeToError(Buffer.getError()); auto Contents = Buffer->get()->getBuffer(); auto InsertionPoint = getInsertionPoint( Contents, Source->getQualifiedNameAsString(), Sel.AST->getLangOpts()); diff --git a/clang-tools-extra/clangd/refactor/tweaks/ExpandAutoType.cpp b/clang-tools-extra/clangd/refactor/tweaks/ExpandAutoType.cpp index d2dfc4a537d4a..f9db50d934b09 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/ExpandAutoType.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/ExpandAutoType.cpp @@ -45,11 +45,6 @@ class ExpandAutoType : public Tweak { private: /// Cache the AutoTypeLoc, so that we do not need to search twice. llvm::Optional CachedLocation; - - /// Create an error message with filename and line number in it - llvm::Error createErrorMessage(const std::string& Message, - const Selection &Inputs); - }; REGISTER_TWEAK(ExpandAutoType) @@ -78,21 +73,19 @@ Expected ExpandAutoType::apply(const Selection& Inputs) { // if we can't resolve the type, return an error message if (DeducedType == llvm::None) - return createErrorMessage("Could not deduce type for 'auto' type", Inputs); + return error("Could not deduce type for 'auto' type"); // if it's a lambda expression, return an error message if (isa(*DeducedType) && dyn_cast(*DeducedType)->getDecl()->isLambda()) { - return createErrorMessage("Could not expand type of lambda expression", - Inputs); + return error("Could not expand type of lambda expression"); } // if it's a function expression, return an error message // naively replacing 'auto' with the type will break declarations. 
// FIXME: there are other types that have similar problems if (DeducedType->getTypePtr()->isFunctionPointerType()) { - return createErrorMessage("Could not expand type of function pointer", - Inputs); + return error("Could not expand type of function pointer"); } std::string PrettyTypeName = printType(*DeducedType, @@ -105,18 +98,6 @@ Expected ExpandAutoType::apply(const Selection& Inputs) { return Effect::mainFileEdit(SrcMgr, tooling::Replacements(Expansion)); } -llvm::Error ExpandAutoType::createErrorMessage(const std::string& Message, - const Selection& Inputs) { - auto &SrcMgr = Inputs.AST->getSourceManager(); - std::string ErrorMessage = - Message + ": " + - SrcMgr.getFilename(Inputs.Cursor).str() + " Line " + - std::to_string(SrcMgr.getExpansionLineNumber(Inputs.Cursor)); - - return llvm::createStringError(llvm::inconvertibleErrorCode(), - ErrorMessage.c_str()); -} - } // namespace } // namespace clangd } // namespace clang diff --git a/clang-tools-extra/clangd/refactor/tweaks/ExtractFunction.cpp b/clang-tools-extra/clangd/refactor/tweaks/ExtractFunction.cpp index d4c723e02eebe..6ee5aee37f51c 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/ExtractFunction.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/ExtractFunction.cpp @@ -625,9 +625,8 @@ llvm::Expected getExtractedFunction(ExtractionZone &ExtZone, CapturedZoneInfo CapturedInfo = captureZoneInfo(ExtZone); // Bail out if any break of continue exists if (CapturedInfo.BrokenControlFlow) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - +"Cannot extract break/continue without " - "corresponding loop/switch statement."); + return error("Cannot extract break/continue without corresponding " + "loop/switch statement."); NewFunction ExtractedFunc(getSemicolonPolicy(ExtZone, SM, LangOpts)); ExtractedFunc.BodyRange = ExtZone.ZoneRange; ExtractedFunc.InsertionPoint = ExtZone.getInsertionPoint(); @@ -637,8 +636,7 @@ llvm::Expected getExtractedFunction(ExtractionZone &ExtZone, if (!createParameters(ExtractedFunc, CapturedInfo) || !generateReturnProperties(ExtractedFunc, *ExtZone.EnclosingFunction, CapturedInfo)) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - +"Too complex to extract."); + return error("Too complex to extract."); return ExtractedFunc; } diff --git a/clang-tools-extra/clangd/refactor/tweaks/ObjCLocalizeStringLiteral.cpp b/clang-tools-extra/clangd/refactor/tweaks/ObjCLocalizeStringLiteral.cpp index 2534cf562daa8..894f018aa7968 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/ObjCLocalizeStringLiteral.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/ObjCLocalizeStringLiteral.cpp @@ -68,8 +68,7 @@ ObjCLocalizeStringLiteral::apply(const Selection &Inputs) { const auto &TB = AST->getTokens(); auto Toks = TB.spelledForExpanded(TB.expandedTokens(Str->getSourceRange())); if (!Toks || Toks->empty()) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "Failed to find tokens to replace."); + return error("Failed to find tokens to replace."); // Insert `NSLocalizedString(` before the literal. 
auto Reps = tooling::Replacements(tooling::Replacement( SM, Toks->front().location(), 0, "NSLocalizedString(")); diff --git a/clang-tools-extra/clangd/refactor/tweaks/RemoveUsingNamespace.cpp b/clang-tools-extra/clangd/refactor/tweaks/RemoveUsingNamespace.cpp index e054e33c046a0..9d1a9f12567c4 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/RemoveUsingNamespace.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/RemoveUsingNamespace.cpp @@ -10,6 +10,7 @@ #include "Selection.h" #include "SourceCode.h" #include "refactor/Tweak.h" +#include "support/Logger.h" #include "clang/AST/Decl.h" #include "clang/AST/DeclBase.h" #include "clang/AST/DeclCXX.h" @@ -73,8 +74,7 @@ removeUsingDirective(ASTContext &Ctx, const UsingDirectiveDecl *D) { llvm::Optional NextTok = Lexer::findNextToken(D->getEndLoc(), SM, Ctx.getLangOpts()); if (!NextTok || NextTok->isNot(tok::semi)) - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "no semicolon after using-directive"); + return error("no semicolon after using-directive"); // FIXME: removing the semicolon may be invalid in some obscure cases, e.g. // if (x) using namespace std; else using namespace bar; return tooling::Replacement( diff --git a/clang-tools-extra/clangd/refactor/tweaks/SwapIfBranches.cpp b/clang-tools-extra/clangd/refactor/tweaks/SwapIfBranches.cpp index d6966e699fdbc..d5299f014cc74 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/SwapIfBranches.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/SwapIfBranches.cpp @@ -69,15 +69,11 @@ Expected SwapIfBranches::apply(const Selection &Inputs) { auto ThenRng = toHalfOpenFileRange(SrcMgr, Ctx.getLangOpts(), If->getThen()->getSourceRange()); if (!ThenRng) - return llvm::createStringError( - llvm::inconvertibleErrorCode(), - "Could not obtain range of the 'then' branch. Macros?"); + return error("Could not obtain range of the 'then' branch. Macros?"); auto ElseRng = toHalfOpenFileRange(SrcMgr, Ctx.getLangOpts(), If->getElse()->getSourceRange()); if (!ElseRng) - return llvm::createStringError( - llvm::inconvertibleErrorCode(), - "Could not obtain range of the 'else' branch. Macros?"); + return error("Could not obtain range of the 'else' branch. Macros?"); auto ThenCode = toSourceCode(SrcMgr, *ThenRng); auto ElseCode = toSourceCode(SrcMgr, *ElseRng); diff --git a/clang-tools-extra/clangd/support/Logger.cpp b/clang-tools-extra/clangd/support/Logger.cpp index 768d2e52210b2..4a5d7d63bed46 100644 --- a/clang-tools-extra/clangd/support/Logger.cpp +++ b/clang-tools-extra/clangd/support/Logger.cpp @@ -9,6 +9,7 @@ #include "support/Logger.h" #include "support/Trace.h" #include "llvm/Support/Chrono.h" +#include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" #include @@ -58,5 +59,27 @@ void StreamLogger::log(Logger::Level Level, Logs.flush(); } +namespace { +// Like llvm::StringError but with fewer options and no gratuitous copies. 
+class SimpleStringError : public llvm::ErrorInfo { + std::error_code EC; + std::string Message; + +public: + SimpleStringError(std::error_code EC, std::string &&Message) + : EC(EC), Message(std::move(Message)) {} + void log(llvm::raw_ostream &OS) const override { OS << Message; } + std::string message() const override { return Message; } + std::error_code convertToErrorCode() const override { return EC; } + static char ID; +}; +char SimpleStringError::ID; + +} // namespace + +llvm::Error detail::error(std::error_code EC, std::string &&Msg) { + return llvm::make_error(EC, std::move(Msg)); +} + } // namespace clangd } // namespace clang diff --git a/clang-tools-extra/clangd/support/Logger.h b/clang-tools-extra/clangd/support/Logger.h index 72d1408bdc77c..0674671aa8e12 100644 --- a/clang-tools-extra/clangd/support/Logger.h +++ b/clang-tools-extra/clangd/support/Logger.h @@ -45,6 +45,8 @@ template void log(Logger::Level L, const char *Fmt, Ts &&... Vals) { detail::log(L, llvm::formatv(Fmt, detail::wrap(std::forward(Vals))...)); } + +llvm::Error error(std::error_code, std::string &&); } // namespace detail // Clangd logging functions write to a global logger set by LoggingSession. @@ -67,6 +69,30 @@ template void log(const char *Fmt, Ts &&... Vals) { template void vlog(const char *Fmt, Ts &&... Vals) { detail::log(Logger::Verbose, Fmt, std::forward(Vals)...); } +// error() constructs an llvm::Error object, using formatv()-style arguments. +// It is not automatically logged! (This function is a little out of place). +// The error simply embeds the message string. +template +llvm::Error error(std::error_code EC, const char *Fmt, Ts &&... Vals) { + // We must render the formatv_object eagerly, while references are valid. + return detail::error( + EC, llvm::formatv(Fmt, detail::wrap(std::forward(Vals))...).str()); +} +// Overload with no error_code conversion, the error will be inconvertible. +template llvm::Error error(const char *Fmt, Ts &&... Vals) { + return detail::error( + llvm::inconvertibleErrorCode(), + llvm::formatv(Fmt, detail::wrap(std::forward(Vals))...).str()); +} +// Overload to avoid formatv complexity for simple strings. +inline llvm::Error error(std::error_code EC, std::string Msg) { + return detail::error(EC, std::move(Msg)); +} +// Overload for simple strings with no error_code conversion. +inline llvm::Error error(std::string Msg) { + return detail::error(llvm::inconvertibleErrorCode(), std::move(Msg)); +} + // dlog only logs if --debug was passed, or --debug_only=Basename. // This level would be enabled in a targeted way when debugging. #define dlog(...) \ diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp index dcbaa35238226..cf74ded936320 100644 --- a/clang-tools-extra/clangd/tool/ClangdMain.cpp +++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp @@ -484,9 +484,9 @@ class TestScheme : public URIScheme { // Still require "/" in body to mimic file scheme, as we want lengths of an // equivalent URI in both schemes to be the same. 
if (!Body.startswith("/")) - return llvm::make_error( - "Expect URI body to be an absolute path starting with '/': " + Body, - llvm::inconvertibleErrorCode()); + return error( + "Expect URI body to be an absolute path starting with '/': {0}", + Body); Body = Body.ltrim('/'); llvm::SmallVector Path(Body.begin(), Body.end()); path::native(Path); @@ -497,11 +497,9 @@ class TestScheme : public URIScheme { llvm::Expected uriFromAbsolutePath(llvm::StringRef AbsolutePath) const override { llvm::StringRef Body = AbsolutePath; - if (!Body.consume_front(TestScheme::TestDir)) { - return llvm::make_error( - "Path " + AbsolutePath + " doesn't start with root " + TestDir, - llvm::inconvertibleErrorCode()); - } + if (!Body.consume_front(TestScheme::TestDir)) + return error("Path {0} doesn't start with root {1}", AbsolutePath, + TestDir); return URI("test", /*Authority=*/"", llvm::sys::path::convert_to_slash(Body)); diff --git a/clang-tools-extra/clangd/unittests/CMakeLists.txt b/clang-tools-extra/clangd/unittests/CMakeLists.txt index 966fa9630852b..2167b5e210e22 100644 --- a/clang-tools-extra/clangd/unittests/CMakeLists.txt +++ b/clang-tools-extra/clangd/unittests/CMakeLists.txt @@ -62,6 +62,7 @@ add_unittest(ClangdUnitTests ClangdTests IndexActionTests.cpp IndexTests.cpp JSONTransportTests.cpp + LoggerTests.cpp LSPClient.cpp ModulesTests.cpp ParsedASTTests.cpp diff --git a/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp b/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp index a9526ce2367c4..27b1c0cfc56dd 100644 --- a/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp +++ b/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp @@ -47,16 +47,21 @@ CompileFlags: { Add: [foo, bar] } Add: | b az +--- +Index: + Background: Skip )yaml"; auto Results = Fragment::parseYAML(YAML, "config.yaml", Diags.callback()); EXPECT_THAT(Diags.Diagnostics, IsEmpty()); - ASSERT_EQ(Results.size(), 2u); - EXPECT_FALSE(Results.front().If.HasUnrecognizedCondition); - EXPECT_THAT(Results.front().If.PathMatch, ElementsAre(Val("abc"))); - EXPECT_THAT(Results.front().CompileFlags.Add, - ElementsAre(Val("foo"), Val("bar"))); + ASSERT_EQ(Results.size(), 3u); + EXPECT_FALSE(Results[0].If.HasUnrecognizedCondition); + EXPECT_THAT(Results[0].If.PathMatch, ElementsAre(Val("abc"))); + EXPECT_THAT(Results[0].CompileFlags.Add, ElementsAre(Val("foo"), Val("bar"))); + + EXPECT_THAT(Results[1].CompileFlags.Add, ElementsAre(Val("b\naz\n"))); - EXPECT_THAT(Results.back().CompileFlags.Add, ElementsAre(Val("b\naz\n"))); + ASSERT_TRUE(Results[2].Index.Background); + EXPECT_EQ("Skip", *Results[2].Index.Background.getValue()); } TEST(ParseYAML, Locations) { diff --git a/clang-tools-extra/clangd/unittests/LoggerTests.cpp b/clang-tools-extra/clangd/unittests/LoggerTests.cpp new file mode 100644 index 0000000000000..3d2194d79090d --- /dev/null +++ b/clang-tools-extra/clangd/unittests/LoggerTests.cpp @@ -0,0 +1,62 @@ +//===-- LoggerTests.cpp ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "support/Logger.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace clangd {
+namespace {
+
+TEST(ErrorTest, Overloads) {
+  EXPECT_EQ("foo", llvm::toString(error("foo")));
+  // Inconvertible to error code when none is specified.
+  // Don't actually try to convert, it'll crash.
+  handleAllErrors(error("foo"), [&](const llvm::ErrorInfoBase &EI) {
+    EXPECT_EQ(llvm::inconvertibleErrorCode(), EI.convertToErrorCode());
+  });
+
+  EXPECT_EQ("foo 42", llvm::toString(error("foo {0}", 42)));
+  handleAllErrors(error("foo {0}", 42), [&](const llvm::ErrorInfoBase &EI) {
+    EXPECT_EQ(llvm::inconvertibleErrorCode(), EI.convertToErrorCode());
+  });
+
+  EXPECT_EQ("foo", llvm::toString(error(llvm::errc::invalid_argument, "foo")));
+  EXPECT_EQ(llvm::errc::invalid_argument,
+            llvm::errorToErrorCode(error(llvm::errc::invalid_argument, "foo")));
+
+  EXPECT_EQ("foo 42",
+            llvm::toString(error(llvm::errc::invalid_argument, "foo {0}", 42)));
+  EXPECT_EQ(llvm::errc::invalid_argument,
+            llvm::errorToErrorCode(
+                error(llvm::errc::invalid_argument, "foo {0}", 42)));
+}
+
+TEST(ErrorTest, Lifetimes) {
+  llvm::Optional<llvm::Error> Err;
+  {
+    // Check the error contains the value when error() was called.
+    std::string S = "hello, world";
+    Err = error("S={0}", llvm::StringRef(S));
+    S = "garbage";
+  }
+  EXPECT_EQ("S=hello, world", llvm::toString(std::move(*Err)));
+}
+
+TEST(ErrorTest, ConsumeError) {
+  llvm::Error Foo = error("foo");
+  llvm::Error Bar = error("bar: {0}", std::move(Foo));
+  EXPECT_EQ("bar: foo", llvm::toString(std::move(Bar)));
+  // No assert for unchecked Foo.
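The Lifetimes test above passes by design: error() converts the formatv object to a string immediately, while its reference arguments are still alive, rather than storing the format arguments. The same point as a standalone sketch; myError is illustrative, not clangd's helper:

  #include "llvm/Support/Error.h"
  #include "llvm/Support/FormatVariadic.h"

  template <typename... Ts>
  llvm::Error myError(const char *Fmt, Ts &&...Vals) {
    // Render now, while Vals are alive; keep only the finished string.
    std::string Msg = llvm::formatv(Fmt, std::forward<Ts>(Vals)...).str();
    return llvm::createStringError(llvm::inconvertibleErrorCode(), "%s",
                                   Msg.c_str());
  }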
+} + +} // namespace +} // namespace clangd +} // namespace clang diff --git a/clang-tools-extra/clangd/unittests/TestFS.cpp b/clang-tools-extra/clangd/unittests/TestFS.cpp index 3b2fbc142a28f..ba4010cb45817 100644 --- a/clang-tools-extra/clangd/unittests/TestFS.cpp +++ b/clang-tools-extra/clangd/unittests/TestFS.cpp @@ -100,13 +100,9 @@ class TestScheme : public URIScheme { getAbsolutePath(llvm::StringRef /*Authority*/, llvm::StringRef Body, llvm::StringRef HintPath) const override { if (!HintPath.startswith(testRoot())) - return llvm::make_error( - "Hint path doesn't start with test root: " + HintPath, - llvm::inconvertibleErrorCode()); + return error("Hint path doesn't start with test root: {0}", HintPath); if (!Body.consume_front("/")) - return llvm::make_error( - "Body of an unittest: URI must start with '/'", - llvm::inconvertibleErrorCode()); + return error("Body of an unittest: URI must start with '/'"); llvm::SmallString<16> Path(Body.begin(), Body.end()); llvm::sys::path::native(Path); return testPath(Path); @@ -116,9 +112,7 @@ class TestScheme : public URIScheme { uriFromAbsolutePath(llvm::StringRef AbsolutePath) const override { llvm::StringRef Body = AbsolutePath; if (!Body.consume_front(testRoot())) - return llvm::make_error( - AbsolutePath + "does not start with " + testRoot(), - llvm::inconvertibleErrorCode()); + return error("{0} does not start with {1}", AbsolutePath, testRoot()); return URI(Scheme, /*Authority=*/"", llvm::sys::path::convert_to_slash(Body)); diff --git a/clang-tools-extra/clangd/xpc/XPCTransport.cpp b/clang-tools-extra/clangd/xpc/XPCTransport.cpp index 50eacf2115eea..9eb083953b965 100644 --- a/clang-tools-extra/clangd/xpc/XPCTransport.cpp +++ b/clang-tools-extra/clangd/xpc/XPCTransport.cpp @@ -41,7 +41,7 @@ Error decodeError(const json::Object &O) { std::string(O.getString("message").getValueOr("Unspecified error")); if (auto Code = O.getInteger("code")) return make_error(std::move(Msg), ErrorCode(*Code)); - return make_error(std::move(Msg), inconvertibleErrorCode()); + return error("{0}", Msg); } // C "closure" for XPCTransport::loop() method diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 781fef27c4761..563c0eced92ef 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -67,18 +67,33 @@ The improvements are... Improvements to clang-tidy -------------------------- +New modules +^^^^^^^^^^^ + +- New ``altera`` module. + + Includes checks related to OpenCL for FPGA coding guidelines, based on the + `Altera SDK for OpenCL: Best Practices Guide + `_. + +New checks +^^^^^^^^^^ + +- New :doc:`altera-struct-pack-align + ` check. + + Finds structs that are inefficiently packed or aligned, and recommends + packing and/or aligning of said structs as needed. + +- New :doc:`bugprone-misplaced-pointer-arithmetic-in-alloc + ` check. + - New :doc:`bugprone-redundant-branch-condition ` check. Finds condition variables in nested ``if`` statements that were also checked in the outer ``if`` statement and were not changed. -- New :doc:`cppcoreguidelines-prefer-member-initializer - ` check. - - Finds member initializations in the constructor body which can be placed into - the initialization list instead. 
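To make the bugprone-redundant-branch-condition entry above concrete, this is the shape of code the check flags; the snippet is illustrative, not taken from the check's tests:

  bool isArmed();
  void launch();

  void fire() {
    bool Armed = isArmed();
    if (Armed) {
      if (Armed) // warning: redundant condition, Armed cannot have changed here
        launch();
    }
  }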
- Changes in existing checks ^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang-tools-extra/docs/clang-tidy/Contributing.rst b/clang-tools-extra/docs/clang-tidy/Contributing.rst index 6b7af479804de..c7e7e804a0ff4 100644 --- a/clang-tools-extra/docs/clang-tidy/Contributing.rst +++ b/clang-tools-extra/docs/clang-tidy/Contributing.rst @@ -27,7 +27,7 @@ There are a few tools particularly useful when developing clang-tidy checks: * `clang-check`_ with the ``-ast-dump`` (and optionally ``-ast-dump-filter``) provides a convenient way to dump AST of a C++ program. -If CMake is configured with ``CLANG_ENABLE_STATIC_ANALYZER``, +If CMake is configured with ``CLANG_TIDY_ENABLE_STATIC_ANALYZER=NO``, :program:`clang-tidy` will not be built with support for the ``clang-analyzer-*`` checks or the ``mpi-*`` checks. diff --git a/clang-tools-extra/docs/clang-tidy/Integrations.rst b/clang-tools-extra/docs/clang-tidy/Integrations.rst index bdd012aec89ee..c81a00deb68ad 100644 --- a/clang-tools-extra/docs/clang-tidy/Integrations.rst +++ b/clang-tools-extra/docs/clang-tidy/Integrations.rst @@ -2,12 +2,17 @@ Clang-tidy IDE/Editor Integrations ================================== -.. _Clangd: https://clang.llvm.org/extra/clangd.html +.. _clangd: http://clangd.llvm.org/ +.. _is available: https://clangd.llvm.org/installation.html#editor-plugins +.. _more: https://langserver.org/#implementations-client Apart from being a standalone tool, :program:`clang-tidy` is integrated into -various IDEs, code analyzers, and editors. Besides, it is currently being -integrated into Clangd_. The following table shows the most -well-known :program:`clang-tidy` integrations in detail. +various IDEs, code analyzers, and editors. We recommend using clangd_ which +integrates :program:`clang-tidy` and `is available`_ in most major editors +through plugins (Vim, Emacs, Visual Studio Code, Sublime Text and more_). + +The following table shows the most well-known :program:`clang-tidy` +integrations in detail. +--------------------------------------+------------------------+---------------------------------+--------------------------+-----------------------------------------+--------------------------+ | | Feature | @@ -18,7 +23,7 @@ well-known :program:`clang-tidy` integrations in detail. 
+--------------------------------------+------------------------+---------------------------------+--------------------------+-----------------------------------------+--------------------------+ |Clang Power Tools for Visual Studio | \-\ | \+\ | \-\ | \+\ | \-\ | +--------------------------------------+------------------------+---------------------------------+--------------------------+-----------------------------------------+--------------------------+ -|Clangd | \+\ | \-\ | \-\ | \-\ | \-\ | +|Clangd | \+\ | \-\ | \-\ | \+\ | \-\ | +--------------------------------------+------------------------+---------------------------------+--------------------------+-----------------------------------------+--------------------------+ |CLion IDE | \+\ | \+\ | \+\ | \+\ | \+\ | +--------------------------------------+------------------------+---------------------------------+--------------------------+-----------------------------------------+--------------------------+ diff --git a/clang-tools-extra/docs/clang-tidy/checks/altera-struct-pack-align.rst b/clang-tools-extra/docs/clang-tidy/checks/altera-struct-pack-align.rst new file mode 100644 index 0000000000000..b03a4fcf7fcf3 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/altera-struct-pack-align.rst @@ -0,0 +1,54 @@ +.. title:: clang-tidy - altera-struct-pack-align + +altera-struct-pack-align +======================== + +Finds structs that are inefficiently packed or aligned, and recommends +packing and/or aligning of said structs as needed. + +Structs that are not packed take up more space than they should, and accessing +structs that are not well aligned is inefficient. + +Fix-its are provided to fix both of these issues by inserting and/or amending +relevant struct attributes. + +Based on the `Altera SDK for OpenCL: Best Practices Guide +`_. + +.. code-block:: c++ + + // The following struct is originally aligned to 4 bytes, and thus takes up + // 12 bytes of memory instead of 10. Packing the struct will make it use + // only 10 bytes of memory, and aligning it to 16 bytes will make it + // efficient to access. + struct example { + char a; // 1 byte + double b; // 8 bytes + char c; // 1 byte + }; + + // The following struct is arranged in such a way that packing is not needed. + // However, it is aligned to 4 bytes instead of 8, and thus needs to be + // explicitly aligned. + struct implicitly_packed_example { + char a; // 1 byte + char b; // 1 byte + char c; // 1 byte + char d; // 1 byte + int e; // 4 bytes + }; + + // The following struct is explicitly aligned and packed. + struct good_example { + char a; // 1 byte + double b; // 8 bytes + char c; // 1 byte + } __attribute__((packed)) __attribute__((aligned(16)); + + // Explicitly aligning a struct to the wrong value will result in a warning. + // The following example should be aligned to 16 bytes, not 32. + struct badly_aligned_example { + char a; // 1 byte + double b; // 8 bytes + char c; // 1 byte + } __attribute__((packed)) __attribute__((aligned(32))); diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone-argument-comment.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone-argument-comment.rst index 8484c393a12bd..8c59541b8d42a 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone-argument-comment.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone-argument-comment.rst @@ -29,6 +29,7 @@ Options account. .. option:: IgnoreSingleArgument + When true, the check will ignore the single argument. .. 
option:: CommentBoolLiterals diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone-exception-escape.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone-exception-escape.rst index 9c7f113a1bf3c..52f3ceff28149 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone-exception-escape.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone-exception-escape.rst @@ -5,6 +5,7 @@ bugprone-exception-escape Finds functions which may throw an exception directly or indirectly, but they should not. The functions which should not throw exceptions are the following: + * Destructors * Move constructors * Move assignment operators diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone-forwarding-reference-overload.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone-forwarding-reference-overload.rst index 61255e7596b40..b2a9e0f3b3dfb 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone-forwarding-reference-overload.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone-forwarding-reference-overload.rst @@ -37,7 +37,7 @@ The check warns for constructors C1 and C2, because those can hide copy and move constructors. We suppress warnings if the copy and the move constructors are both disabled (deleted or private), because there is nothing the perfect forwarding constructor could hide in this case. We also suppress warnings for constructors -like C3 that are guarded with an enable_if, assuming the programmer was aware of +like C3 that are guarded with an ``enable_if``, assuming the programmer was aware of the possible hiding. Background @@ -45,5 +45,5 @@ Background For deciding whether a constructor is guarded with enable_if, we consider the default values of the type parameters and the types of the constructor -parameters. If any part of these types is std::enable_if or std::enable_if_t, we -assume the constructor is guarded. +parameters. If any part of these types is ``std::enable_if`` or ``std::enable_if_t``, +we assume the constructor is guarded. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone-lambda-function-name.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone-lambda-function-name.rst index 683977a3d2c06..6f0ba836fdf5c 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone-lambda-function-name.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone-lambda-function-name.rst @@ -10,7 +10,7 @@ is almost never what was intended. Example: .. code-block:: c++ - + void FancyFunction() { [] { printf("Called from %s\n", __func__); }(); [] { printf("Now called from %s\n", __FUNCTION__); }(); diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone-not-null-terminated-result.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone-not-null-terminated-result.rst index 9e5a702630c88..54e48268181ca 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone-not-null-terminated-result.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone-not-null-terminated-result.rst @@ -5,7 +5,7 @@ bugprone-not-null-terminated-result Finds function calls where it is possible to cause a not null-terminated result. Usually the proper length of a string is ``strlen(src) + 1`` or equal length of -this expression, because the null terminator needs an extra space. Without the +this expression, because the null terminator needs an extra space. Without the null terminator it can result in undefined behaviour when the string is read. 
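To spell out the sentence above: allocating strlen(src) bytes instead of strlen(src) + 1 leaves no room for the terminator, so later reads run off the end of the buffer. An illustrative sketch of the defect, not taken from the check's documentation:

  #include <cstdlib>
  #include <cstring>

  char *dupBad(const char *Src) {
    size_t Len = strlen(Src);                     // length WITHOUT the '\0'
    char *Dst = static_cast<char *>(malloc(Len)); // one byte too small
    memcpy(Dst, Src, Len);                        // no terminator is written
    return Dst;                                   // reading Dst as a string is UB
  }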
The following and their respective ``wchar_t`` based functions are checked: @@ -17,27 +17,27 @@ The following is a real-world example where the programmer forgot to increase the passed third argument, which is ``size_t length``. That is why the length of the allocated memory is not enough to hold the null terminator. - .. code-block:: c +.. code-block:: c - static char *stringCpy(const std::string &str) { - char *result = reinterpret_cast(malloc(str.size())); - memcpy(result, str.data(), str.size()); - return result; - } + static char *stringCpy(const std::string &str) { + char *result = reinterpret_cast(malloc(str.size())); + memcpy(result, str.data(), str.size()); + return result; + } In addition to issuing warnings, fix-it rewrites all the necessary code. It also tries to adjust the capacity of the destination array: - .. code-block:: c +.. code-block:: c - static char *stringCpy(const std::string &str) { - char *result = reinterpret_cast(malloc(str.size() + 1)); - strcpy(result, str.data()); - return result; - } + static char *stringCpy(const std::string &str) { + char *result = reinterpret_cast(malloc(str.size() + 1)); + strcpy(result, str.data()); + return result; + } Note: It cannot guarantee to rewrite every of the path-sensitive memory - allocations. +allocations. .. _MemcpyTransformation: diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone-suspicious-include.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone-suspicious-include.rst index 237823ce8558b..3c05f39db12d5 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone-suspicious-include.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone-suspicious-include.rst @@ -19,7 +19,7 @@ Options ------- .. option:: HeaderFileExtensions - Default value: `";h;hh;hpp;hxx"` + Default value: ``";h;hh;hpp;hxx"`` A semicolon-separated list of filename extensions of header files (the filename extensions should not contain a "." prefix). For extension-less header files, use an empty string or leave an empty string between ";" @@ -27,6 +27,6 @@ Options .. option:: ImplementationFileExtensions - Default value: `"c;cc;cpp;cxx"` + Default value: ``"c;cc;cpp;cxx"`` Likewise, a semicolon-separated list of filename extensions of implementation files. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone-suspicious-missing-comma.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone-suspicious-missing-comma.rst index 9fe9153117c2c..7455a2ef13509 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone-suspicious-missing-comma.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone-suspicious-missing-comma.rst @@ -46,14 +46,14 @@ Options .. option:: SizeThreshold An unsigned integer specifying the minimum size of a string literal to be - considered by the check. Default is `5U`. + considered by the check. Default is ``5U``. .. option:: RatioThreshold A string specifying the maximum threshold ratio [0, 1.0] of suspicious string - literals to be considered. Default is `".2"`. + literals to be considered. Default is ``".2"``. .. option:: MaxConcatenatedTokens An unsigned integer specifying the maximum number of concatenated tokens. - Default is `5U`. + Default is ``5U``. 
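For context on the options above, the defect that bugprone-suspicious-missing-comma exists to catch looks like this; the array is illustrative:

  const char *Fruits[] = {
      "apple",
      "banana" // missing comma: adjacent literals concatenate silently,
      "cherry", // so the array holds "bananacherry" and one fewer element
      "date",
  };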
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone-terminating-continue.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone-terminating-continue.rst index 1a6ae812f2aa1..222de90037336 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone-terminating-continue.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone-terminating-continue.rst @@ -3,15 +3,15 @@ bugprone-terminating-continue ============================= -Detects `do while` loops with a condition always evaluating to false that -have a `continue` statement, as this `continue` terminates the loop +Detects ``do while`` loops with a condition always evaluating to false that +have a ``continue`` statement, as this ``continue`` terminates the loop effectively. .. code-block:: c++ void f() { do { - // some code + // some code continue; // terminating continue // some other code } while(false); diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert-con36-c.rst b/clang-tools-extra/docs/clang-tidy/checks/cert-con36-c.rst index 7d74e05cf64d3..6fabd146993bc 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert-con36-c.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert-con36-c.rst @@ -1,10 +1,10 @@ .. title:: clang-tidy - cert-con36-c .. meta:: :http-equiv=refresh: 5;URL=bugprone-spuriously-wake-up-functions.html - + cert-con36-c ============ The cert-con36-c check is an alias, please see -`bugprone-spuriously-wake-up-functions `_ +`bugprone-spuriously-wake-up-functions `_ for more information. diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert-con54-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert-con54-cpp.rst index f74bc44962199..ff9237ef53a55 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert-con54-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert-con54-cpp.rst @@ -1,10 +1,10 @@ .. title:: clang-tidy - cert-con54-cpp .. meta:: :http-equiv=refresh: 5;URL=bugprone-spuriously-wake-up-functions.html - + cert-con54-cpp ============== The cert-con54-cpp check is an alias, please see -`bugprone-spuriously-wake-up-functions `_ +`bugprone-spuriously-wake-up-functions `_ for more information. diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines-avoid-non-const-global-variables.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines-avoid-non-const-global-variables.rst index 4d1ffde62dbb7..53dafc7f8b435 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines-avoid-non-const-global-variables.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines-avoid-non-const-global-variables.rst @@ -3,8 +3,8 @@ cppcoreguidelines-avoid-non-const-global-variables ================================================== -Finds non-const global variables as described in `I.2 of C++ Core Guidelines ` . -As `R.6 of C++ Core Guidelines ` is a duplicate of rule I.2 it also covers that rule. +Finds non-const global variables as described in `I.2 of C++ Core Guidelines `_ . +As `R.6 of C++ Core Guidelines `_ is a duplicate of rule I.2 it also covers that rule. .. code-block:: c++ diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines-prefer-member-initializer.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines-prefer-member-initializer.rst deleted file mode 100644 index 749be14182153..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines-prefer-member-initializer.rst +++ /dev/null @@ -1,102 +0,0 @@ -.. 
title:: clang-tidy - cppcoreguidelines-prefer-member-initializer - -cppcoreguidelines-prefer-member-initializer -=========================================== - -Finds member initializations in the constructor body which can be converted -into member initializers of the constructor instead. This not only improves -the readability of the code but also positively affects its performance. -Class-member assignments inside a control statement or following the first -control statement are ignored. - -This check implements `C.49 `_ from the CppCoreGuidelines. - -If the language version is `C++ 11` or above, the constructor is the default -constructor of the class, the field is not a bitfield (only in case of earlier -language version than `C++ 20`), furthermore the assigned value is a literal, -negated literal or ``enum`` constant then the preferred place of the -initialization is at the class member declaration. - -This latter rule is `C.48 `_ from CppCoreGuidelines. - -Please note, that this check does not enforce this latter rule for -initializations already implemented as member initializers. For that purpose -see check `modernize-use-default-member-init `_. - -Example 1 ---------- - -.. code-block:: c++ - - class C { - int n; - int m; - public: - C() { - n = 1; // Literal in default constructor - if (dice()) - return; - m = 1; - } - }; - -Here ``n`` can be initialized using a default member initializer, unlike -``m``, as ``m``'s initialization follows a control statement (``if``): - -.. code-block:: c++ - - class C { - int n{1}; - int m; - public: - C() { - if (dice()) - return; - m = 1; - } - -Example 2 ---------- - -.. code-block:: c++ - - class C { - int n; - int m; - public: - C(int nn, int mm) { - n = nn; // Neither default constructor nor literal - if (dice()) - return; - m = mm; - } - }; - -Here ``n`` can be initialized in the constructor initialization list, unlike -``m``, as ``m``'s initialization follows a control statement (``if``): - -.. code-block:: c++ - - C(int nn, int mm) : n(nn) { - if (dice()) - return; - m = mm; - } - -.. option:: UseAssignment - - If this option is set to non-zero (default is `0`), the check will initialize - members with an assignment. In this case the fix of the first example looks - like this: - -.. code-block:: c++ - - class C { - int n = 1; - int m; - public: - C() { - if (dice()) - return; - m = 1; - } diff --git a/clang-tools-extra/docs/clang-tidy/checks/google-objc-global-variable-declaration.rst b/clang-tools-extra/docs/clang-tidy/checks/google-objc-global-variable-declaration.rst index e4b41fbc723a2..15b59996e3d31 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/google-objc-global-variable-declaration.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/google-objc-global-variable-declaration.rst @@ -9,8 +9,8 @@ pattern of variable names in Google's Objective-C Style Guide. The corresponding style guide rule: https://google.github.io/styleguide/objcguide.html#variable-names -All the global variables should follow the pattern of `g[A-Z].*` (variables) or -`k[A-Z].*` (constants). The check will suggest a variable name that follows the +All the global variables should follow the pattern of ``g[A-Z].*`` (variables) or +``k[A-Z].*`` (constants). The check will suggest a variable name that follows the pattern if it can be inferred from the original name. 
For code: diff --git a/clang-tools-extra/docs/clang-tidy/checks/google-readability-casting.rst b/clang-tools-extra/docs/clang-tidy/checks/google-readability-casting.rst index 4c9d1bc4f99d6..d927e1ce29fce 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/google-readability-casting.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/google-readability-casting.rst @@ -9,6 +9,6 @@ https://google.github.io/styleguide/cppguide.html#Casting Corresponding cpplint.py check name: `readability/casting`. -This check is similar to `-Wold-style-cast`, but it suggests automated fixes +This check is similar to ``-Wold-style-cast``, but it suggests automated fixes in some cases. The reported locations should not be different from the -ones generated by `-Wold-style-cast`. +ones generated by ``-Wold-style-cast``. diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index 91414ee8c90f3..378e92cb66ddc 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -30,6 +30,7 @@ Clang-Tidy Checks `abseil-time-comparison `_, "Yes" `abseil-time-subtraction `_, "Yes" `abseil-upgrade-duration-conversions `_, "Yes" + `altera-struct-pack-align `_, `android-cloexec-accept `_, "Yes" `android-cloexec-accept4 `_, `android-cloexec-creat `_, "Yes" @@ -142,7 +143,6 @@ Clang-Tidy Checks `cppcoreguidelines-narrowing-conversions `_, `cppcoreguidelines-no-malloc `_, `cppcoreguidelines-owning-memory `_, - `cppcoreguidelines-prefer-member-initializer `_, `cppcoreguidelines-pro-bounds-array-to-pointer-decay `_, `cppcoreguidelines-pro-bounds-constant-array-index `_, "Yes" `cppcoreguidelines-pro-bounds-pointer-arithmetic `_, diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc-misplaced-const.rst b/clang-tools-extra/docs/clang-tidy/checks/misc-misplaced-const.rst index e583ecb54cac1..3b21a87069863 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc-misplaced-const.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc-misplaced-const.rst @@ -8,7 +8,7 @@ This check diagnoses when a ``const`` qualifier is applied to a ``typedef``/ are often misleading to developers because the ``const`` applies to the pointer rather than the pointee. -For instance, in the following code, the resulting type is ``int *`` ``const`` +For instance, in the following code, the resulting type is ``int * const`` rather than ``const int *``: .. code-block:: c++ diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc-no-recursion.rst b/clang-tools-extra/docs/clang-tidy/checks/misc-no-recursion.rst index dad6f74ef7f4d..c8281075ded8f 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc-no-recursion.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc-no-recursion.rst @@ -9,10 +9,12 @@ diagnoses each function in the cycle, and displays one example of a possible call graph loop (recursion). References: + * CERT C++ Coding Standard rule `DCL56-CPP. Avoid cycles during initialization of static objects `_. * JPL Institutional Coding Standard for the C Programming Language (JPL DOCID D-60411) rule `2.4 Do not use direct or indirect recursion`. * OpenCL Specification, Version 1.2 rule `6.9 Restrictions: i. Recursion is not supported. `_. 
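For illustration, a hypothetical mutual-recursion cycle of the kind the check diagnoses (sketch only, not from the patch):

.. code-block:: c++

   int isOdd(unsigned n);
   int isEven(unsigned n) { return n == 0 ? 1 : isOdd(n - 1); }
   int isOdd(unsigned n) { return n == 0 ? 0 : isEven(n - 1); }
   // Call-graph cycle: isEven -> isOdd -> isEven, so both are diagnosed.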
Limitations: + * The check does not handle calls done through function pointers * The check does not handle C++ destructors diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc-unused-parameters.rst b/clang-tools-extra/docs/clang-tidy/checks/misc-unused-parameters.rst index 3dfeb299de06b..d954c1ddb1c54 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc-unused-parameters.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc-unused-parameters.rst @@ -8,7 +8,7 @@ code (e.g. when a different parameter is used instead). The suggested fixes either comment parameter name out or remove the parameter completely, if all callers of the function are in the same translation unit and can be updated. -The check is similar to the `-Wunused-parameter` compiler diagnostic and can be +The check is similar to the ``-Wunused-parameter`` compiler diagnostic and can be used to prepare a codebase to enabling of that diagnostic. By default the check is more permissive (see :option:`StrictMode`). diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize-replace-disallow-copy-and-assign-macro.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize-replace-disallow-copy-and-assign-macro.rst index 6717c928506a7..c1c8ace0c937d 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize-replace-disallow-copy-and-assign-macro.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize-replace-disallow-copy-and-assign-macro.rst @@ -37,7 +37,7 @@ Known Limitations ----------------- * Notice that the migration example above leaves the ``private`` access - specification untouched. You might want to run the check:doc:`modernize-use-equals-delete + specification untouched. You might want to run the check :doc:`modernize-use-equals-delete ` to get warnings for deleted functions in private sections. diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize-use-noexcept.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize-use-noexcept.rst index 084dad74f8d5a..8addc8b4b66dd 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize-use-noexcept.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize-use-noexcept.rst @@ -15,25 +15,25 @@ Example .. code-block:: c++ void foo() throw(); - void bar() throw(int) {} + void bar() throw(int) {} transforms to: .. code-block:: c++ void foo() noexcept; - void bar() noexcept(false) {} + void bar() noexcept(false) {} Options ------- .. option:: ReplacementString -Users can use :option:`ReplacementString` to specify a macro to use -instead of ``noexcept``. This is useful when maintaining source code -that uses custom exception specification marking other than -``noexcept``. Fix-it hints will only be generated for non-throwing -specifications. + Users can use :option:`ReplacementString` to specify a macro to use + instead of ``noexcept``. This is useful when maintaining source code + that uses custom exception specification marking other than + ``noexcept``. Fix-it hints will only be generated for non-throwing + specifications. Example ^^^^^^^ diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize-use-uncaught-exceptions.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize-use-uncaught-exceptions.rst index 615f2e3f4a27f..d10556ff3b60e 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize-use-uncaught-exceptions.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize-use-uncaught-exceptions.rst @@ -12,53 +12,53 @@ they will be replaced with. .. 
code-block:: c++ - #define MACRO1 std::uncaught_exception - #define MACRO2 std::uncaught_exception - - int uncaught_exception() { - return 0; - } - - int main() { - int res; - - res = uncaught_exception(); - // No warning, since it is not the deprecated function from namespace std - - res = MACRO2(); - // Warning, but will not be replaced - - res = std::uncaught_exception(); - // Warning and replaced - - using std::uncaught_exception; - // Warning and replaced - - res = uncaught_exception(); - // Warning and replaced - } + #define MACRO1 std::uncaught_exception + #define MACRO2 std::uncaught_exception + + int uncaught_exception() { + return 0; + } + + int main() { + int res; + + res = uncaught_exception(); + // No warning, since it is not the deprecated function from namespace std + + res = MACRO2(); + // Warning, but will not be replaced + + res = std::uncaught_exception(); + // Warning and replaced + + using std::uncaught_exception; + // Warning and replaced + + res = uncaught_exception(); + // Warning and replaced + } After applying the fixes the code will look like the following: .. code-block:: c++ - #define MACRO1 std::uncaught_exception - #define MACRO2 std::uncaught_exception - - int uncaught_exception() { - return 0; - } - - int main() { - int res; - - res = uncaught_exception(); - - res = MACRO2(); - - res = std::uncaught_exceptions(); - - using std::uncaught_exceptions; - - res = uncaught_exceptions(); - } + #define MACRO1 std::uncaught_exception + #define MACRO2 std::uncaught_exception + + int uncaught_exception() { + return 0; + } + + int main() { + int res; + + res = uncaught_exception(); + + res = MACRO2(); + + res = std::uncaught_exceptions(); + + using std::uncaught_exceptions; + + res = uncaught_exceptions(); + } diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability-const-return-type.rst b/clang-tools-extra/docs/clang-tidy/checks/readability-const-return-type.rst index e236d8d00e627..6242e43818d48 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability-const-return-type.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability-const-return-type.rst @@ -11,7 +11,7 @@ return types. Examples: .. code-block:: c++ - + const int foo(); const Clazz foo(); Clazz *const foo(); diff --git a/clang-tools-extra/docs/clang-tidy/checks/zircon-temporary-objects.rst b/clang-tools-extra/docs/clang-tidy/checks/zircon-temporary-objects.rst index 7491f77e4b9f4..ab1225faa2139 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/zircon-temporary-objects.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/zircon-temporary-objects.rst @@ -3,12 +3,12 @@ zircon-temporary-objects ======================== -Warns on construction of specific temporary objects in the Zircon kernel. -If the object should be flagged, If the object should be flagged, the fully +Warns on construction of specific temporary objects in the Zircon kernel. +If the object should be flagged, the fully qualified type name must be explicitly passed to the check. -For example, given the list of classes "Foo" and "NS::Bar", all of the -following will trigger the warning: +For example, given the list of classes "Foo" and "NS::Bar", all of the +following will trigger the warning: .. code-block:: c++ @@ -26,14 +26,14 @@ With the same list, the following will not trigger the warning: .. 
code-block:: c++ - Foo F; // Non-temporary construction okay - Foo F(param); // Non-temporary construction okay - Foo *F = new Foo(); // New construction okay + Foo F; // Non-temporary construction okay + Foo F(param); // Non-temporary construction okay + Foo *F = new Foo(); // New construction okay - Bar(); // Not NS::Bar, so okay - NS::Bar B; // Non-temporary construction okay + Bar(); // Not NS::Bar, so okay + NS::Bar B; // Non-temporary construction okay -Note that objects must be explicitly specified in order to be flagged, +Note that objects must be explicitly specified in order to be flagged, and so objects that inherit a specified object will not be flagged. This check matches temporary objects without regard for inheritance and so a @@ -49,5 +49,5 @@ Options .. option:: Names - A semi-colon-separated list of fully-qualified names of C++ classes that + A semi-colon-separated list of fully-qualified names of C++ classes that should not be constructed as temporaries. Default is empty. diff --git a/clang-tools-extra/docs/clang-tidy/index.rst b/clang-tools-extra/docs/clang-tidy/index.rst index b9a4a7d694b4f..a85c721541784 100644 --- a/clang-tools-extra/docs/clang-tidy/index.rst +++ b/clang-tools-extra/docs/clang-tidy/index.rst @@ -58,6 +58,7 @@ There are currently the following groups of checks: Name prefix Description ====================== ========================================================= ``abseil-`` Checks related to Abseil library. +``altera-`` Checks related to OpenCL programming for FPGAs. ``android-`` Checks related to Android. ``boost-`` Checks related to Boost library. ``bugprone-`` Checks that target bugprone code constructs. diff --git a/clang-tools-extra/test/CMakeLists.txt b/clang-tools-extra/test/CMakeLists.txt index 60217b8c50cd4..15b756f0a3207 100644 --- a/clang-tools-extra/test/CMakeLists.txt +++ b/clang-tools-extra/test/CMakeLists.txt @@ -16,7 +16,7 @@ endif () string(REPLACE ${CMAKE_CFG_INTDIR} ${LLVM_BUILD_MODE} CLANG_TOOLS_DIR ${LLVM_RUNTIME_OUTPUT_INTDIR}) llvm_canonicalize_cmake_booleans( - CLANG_ENABLE_STATIC_ANALYZER + CLANG_TIDY_ENABLE_STATIC_ANALYZER LIBCLANG_INCLUDE_CLANG_TOOLS_EXTRA ) diff --git a/clang-tools-extra/test/clang-tidy/checkers/altera-struct-pack-align.cpp b/clang-tools-extra/test/clang-tidy/checkers/altera-struct-pack-align.cpp new file mode 100644 index 0000000000000..615b6cafe87a2 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/altera-struct-pack-align.cpp @@ -0,0 +1,101 @@ +// RUN: %check_clang_tidy %s altera-struct-pack-align %t -- -header-filter=.* + +// Struct needs both alignment and packing +struct error { + char a; + double b; + char c; +}; +// CHECK-MESSAGES: :[[@LINE-5]]:8: warning: accessing fields in struct 'error' is inefficient due to padding; only needs 10 bytes but is using 24 bytes [altera-struct-pack-align] +// CHECK-MESSAGES: :[[@LINE-6]]:8: note: use "__attribute__((packed))" to reduce the amount of padding applied to struct 'error' +// CHECK-MESSAGES: :[[@LINE-7]]:8: warning: accessing fields in struct 'error' is inefficient due to poor alignment; currently aligned to 8 bytes, but recommended alignment is 16 bytes [altera-struct-pack-align] +// CHECK-MESSAGES: :[[@LINE-8]]:8: note: use "__attribute__((aligned(16)))" to align struct 'error' to 16 bytes +// CHECK-FIXES: __attribute__((packed)) +// CHECK-FIXES: __attribute__((aligned(16))); + +// Struct is explicitly packed, but needs alignment +struct error_packed { + char a; + double b; + char c; +} __attribute__((packed)); +// CHECK-MESSAGES: 
:[[@LINE-5]]:8: warning: accessing fields in struct 'error_packed' is inefficient due to poor alignment; currently aligned to 1 bytes, but recommended alignment is 16 bytes [altera-struct-pack-align] +// CHECK-MESSAGES: :[[@LINE-6]]:8: note: use "__attribute__((aligned(16)))" to align struct 'error_packed' to 16 bytes +// CHECK-FIXES: __attribute__((aligned(16))) + +// Struct is properly packed, but needs alignment +struct align_only { + char a; + char b; + char c; + char d; + int e; + double f; +}; +// CHECK-MESSAGES: :[[@LINE-8]]:8: warning: accessing fields in struct 'align_only' is inefficient due to poor alignment; currently aligned to 8 bytes, but recommended alignment is 16 bytes [altera-struct-pack-align] +// CHECK-MESSAGES: :[[@LINE-9]]:8: note: use "__attribute__((aligned(16)))" to align struct 'align_only' to 16 bytes +// CHECK-FIXES: __attribute__((aligned(16))); + +// Struct is perfectly packed but wrongly aligned +struct bad_align { + char a; + double b; + char c; +} __attribute__((packed)) __attribute__((aligned(8))); +// CHECK-MESSAGES: :[[@LINE-5]]:8: warning: accessing fields in struct 'bad_align' is inefficient due to poor alignment; currently aligned to 8 bytes, but recommended alignment is 16 bytes [altera-struct-pack-align] +// CHECK-MESSAGES: :[[@LINE-6]]:8: note: use "__attribute__((aligned(16)))" to align struct 'bad_align' to 16 bytes +// CHECK-FIXES: __attribute__((aligned(16))); + +struct bad_align2 { + char a; + double b; + char c; +} __attribute__((packed)) __attribute__((aligned(32))); +// CHECK-MESSAGES: :[[@LINE-5]]:8: warning: accessing fields in struct 'bad_align2' is inefficient due to poor alignment; currently aligned to 32 bytes, but recommended alignment is 16 bytes [altera-struct-pack-align] +// CHECK-MESSAGES: :[[@LINE-6]]:8: note: use "__attribute__((aligned(16)))" to align struct 'bad_align2' to 16 bytes +// CHECK-FIXES: __attribute__((aligned(16))); + +struct bad_align3 { + char a; + double b; + char c; +} __attribute__((packed)) __attribute__((aligned(4))); +// CHECK-MESSAGES: :[[@LINE-5]]:8: warning: accessing fields in struct 'bad_align3' is inefficient due to poor alignment; currently aligned to 4 bytes, but recommended alignment is 16 bytes [altera-struct-pack-align] +// CHECK-MESSAGES: :[[@LINE-6]]:8: note: use "__attribute__((aligned(16)))" to align struct 'bad_align3' to 16 bytes +// CHECK-FIXES: __attribute__((aligned(16))); + +// Struct is both perfectly packed and aligned +struct success { + char a; + double b; + char c; +} __attribute__((packed)) __attribute__((aligned(16))); +//Should take 10 bytes and be aligned to 16 bytes + +// Struct is properly packed, and explicitly aligned +struct success2 { + int a; + int b; + int c; +} __attribute__((aligned(16))); + +// If struct is properly aligned, packing not needed +struct success3 { + char a; + double b; + char c; +} __attribute__((aligned(16))); + +// If struct is templated, warnings should not be triggered +template +struct success4 { + A a; + B b; + int c; +}; + +// Warnings should not trigger on struct instantiations +void no_trigger_on_instantiation() { + struct bad_align3 instantiated { 'a', 0.001, 'b' }; +} + diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone-misplaced-pointer-arithmetic-in-alloc.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone-misplaced-pointer-arithmetic-in-alloc.cpp index 42250da2610df..00d12891cde88 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone-misplaced-pointer-arithmetic-in-alloc.cpp +++ 
b/clang-tools-extra/test/clang-tidy/checkers/bugprone-misplaced-pointer-arithmetic-in-alloc.cpp @@ -51,3 +51,14 @@ void bad_new_array(int n, int m) { // CHECK-FIXES: p = new char[n - m] + 10; // FIXME: should be p = new char[n - m + 10]; } + +namespace std { +typedef decltype(sizeof(void*)) size_t; +} + +void* operator new(std::size_t, void*); + +void placement_new_ptr(void *buf, C *old) { + C **p = new (buf) C*(old) + 1; + // CHECK-MESSAGES-NOT: :[[@LINE-1]]:11: warning: arithmetic operation is applied to the result of operator new() instead of its size-like argument +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer-modernize-use-default-member-init-assignment.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer-modernize-use-default-member-init-assignment.cpp deleted file mode 100644 index dc6cb7606a0de..0000000000000 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer-modernize-use-default-member-init-assignment.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// RUN: %check_clang_tidy %s cppcoreguidelines-prefer-member-initializer,modernize-use-default-member-init %t -- \ -// RUN: -config="{CheckOptions: [{key: modernize-use-default-member-init.UseAssignment, value: 1}]}" - -class Simple1 { - int n; - // CHECK-FIXES: int n = 0; - double x; - // CHECK-FIXES: double x = 0.0; - -public: - Simple1() { - n = 0; - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in an in-class default member initializer [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - x = 0.0; - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x' should be initialized in an in-class default member initializer [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - } - - Simple1(int nn, double xx) { - // CHECK-FIXES: Simple1(int nn, double xx) : n(nn), x(xx) { - n = nn; - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - x = xx; - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - } - - ~Simple1() = default; -}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer-modernize-use-default-member-init.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer-modernize-use-default-member-init.cpp deleted file mode 100644 index fe5bb7c3bb989..0000000000000 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer-modernize-use-default-member-init.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// RUN: %check_clang_tidy %s cppcoreguidelines-prefer-member-initializer,modernize-use-default-member-init %t - -class Simple1 { - int n; - // CHECK-FIXES: int n{0}; - double x; - // CHECK-FIXES: double x{0.0}; - -public: - Simple1() { - n = 0; - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in an in-class default member initializer [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - x = 0.0; - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x' should be initialized in an in-class default member initializer [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - } - - Simple1(int nn, double xx) { - // CHECK-FIXES: 
Simple1(int nn, double xx) : n(nn), x(xx) { - n = nn; - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - x = xx; - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - } - - ~Simple1() = default; -}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer.cpp deleted file mode 100644 index a55a7d8208a6a..0000000000000 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer.cpp +++ /dev/null @@ -1,454 +0,0 @@ -// RUN: %check_clang_tidy %s cppcoreguidelines-prefer-member-initializer %t -- -- -fcxx-exceptions - -class Simple1 { - int n; - double x; - -public: - Simple1() { - // CHECK-FIXES: Simple1() : n(0), x(0.0) { - n = 0; - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - x = 0.0; - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - } - - Simple1(int nn, double xx) { - // CHECK-FIXES: Simple1(int nn, double xx) : n(nn), x(xx) { - n = nn; - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - x = xx; - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - } - - ~Simple1() = default; -}; - -class Simple2 { - int n; - double x; - -public: - Simple2() : n(0) { - // CHECK-FIXES: Simple2() : n(0), x(0.0) { - x = 0.0; - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - } - - Simple2(int nn, double xx) : n(nn) { - // CHECK-FIXES: Simple2(int nn, double xx) : n(nn), x(xx) { - x = xx; - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - } - - ~Simple2() = default; -}; - -class Simple3 { - int n; - double x; - -public: - Simple3() : x(0.0) { - // CHECK-FIXES: Simple3() : n(0), x(0.0) { - n = 0; - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - } - - Simple3(int nn, double xx) : x(xx) { - // CHECK-FIXES: Simple3(int nn, double xx) : n(nn), x(xx) { - n = nn; - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - } - - ~Simple3() = default; -}; - -int something_int(); -double something_double(); - -class Simple4 { - int n; - -public: - Simple4() { - // CHECK-FIXES: Simple4() : n(something_int()) { - n = something_int(); - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be 
initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - } - - ~Simple4() = default; -}; - -static bool dice(); - -class Complex1 { - int n; - int m; - -public: - Complex1() : n(0) { - if (dice()) - m = 1; - // NO-MESSAGES: initialization of 'm' is nested in a conditional expression - } - - ~Complex1() = default; -}; - -class Complex2 { - int n; - int m; - -public: - Complex2() : n(0) { - if (!dice()) - return; - m = 1; - // NO-MESSAGES: initialization of 'm' follows a conditional expression - } - - ~Complex2() = default; -}; - -class Complex3 { - int n; - int m; - -public: - Complex3() : n(0) { - while (dice()) - m = 1; - // NO-MESSAGES: initialization of 'm' is nested in a conditional loop - } - - ~Complex3() = default; -}; - -class Complex4 { - int n; - int m; - -public: - Complex4() : n(0) { - while (!dice()) - return; - m = 1; - // NO-MESSAGES: initialization of 'm' follows a conditional loop - } - - ~Complex4() = default; -}; - -class Complex5 { - int n; - int m; - -public: - Complex5() : n(0) { - do { - m = 1; - // NO-MESSAGES: initialization of 'm' is nested in a conditional loop - } while (dice()); - } - - ~Complex5() = default; -}; - -class Complex6 { - int n; - int m; - -public: - Complex6() : n(0) { - do { - return; - } while (!dice()); - m = 1; - // NO-MESSAGES: initialization of 'm' follows a conditional loop - } - - ~Complex6() = default; -}; - -class Complex7 { - int n; - int m; - -public: - Complex7() : n(0) { - for (int i = 2; i < 1; ++i) { - m = 1; - } - // NO-MESSAGES: initialization of 'm' is nested into a conditional loop - } - - ~Complex7() = default; -}; - -class Complex8 { - int n; - int m; - -public: - Complex8() : n(0) { - for (int i = 0; i < 2; ++i) { - return; - } - m = 1; - // NO-MESSAGES: initialization of 'm' follows a conditional loop - } - - ~Complex8() = default; -}; - -class Complex9 { - int n; - int m; - -public: - Complex9() : n(0) { - switch (dice()) { - case 1: - m = 1; - // NO-MESSAGES: initialization of 'm' is nested in a conditional expression - break; - default: - break; - } - } - - ~Complex9() = default; -}; - -class Complex10 { - int n; - int m; - -public: - Complex10() : n(0) { - switch (dice()) { - case 1: - return; - break; - default: - break; - } - m = 1; - // NO-MESSAGES: initialization of 'm' follows a conditional expression - } - - ~Complex10() = default; -}; - -class E {}; -int risky(); // may throw - -class Complex11 { - int n; - int m; - -public: - Complex11() : n(0) { - try { - risky(); - m = 1; - // NO-MESSAGES: initialization of 'm' follows is nested in a try-block - } catch (const E& e) { - return; - } - } - - ~Complex11() = default; -}; - -class Complex12 { - int n; - int m; - -public: - Complex12() : n(0) { - try { - risky(); - } catch (const E& e) { - return; - } - m = 1; - // NO-MESSAGES: initialization of 'm' follows a try-block - } - - ~Complex12() = default; -}; - -class Complex13 { - int n; - int m; - -public: - Complex13() : n(0) { - return; - m = 1; - // NO-MESSAGES: initialization of 'm' follows a return statement - } - - ~Complex13() = default; -}; - -class Complex14 { - int n; - int m; - -public: - Complex14() : n(0) { - goto X; - m = 1; - // NO-MESSAGES: initialization of 'm' follows a goto statement - X: - ; - } - - ~Complex14() = default; -}; - -void returning(); - -class Complex15 { - int n; - int m; - -public: - Complex15() : n(0) { - // CHECK-FIXES: Complex15() : n(0), m(1) { - returning(); - m = 1; - // CHECK-MESSAGES: :[[@LINE-1]]:5: 
warning: 'm' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - } - - ~Complex15() = default; -}; - -[[noreturn]] void not_returning(); - -class Complex16 { - int n; - int m; - -public: - Complex16() : n(0) { - not_returning(); - m = 1; - // NO-MESSAGES: initialization of 'm' follows a non-returning function call - } - - ~Complex16() = default; -}; - -class Complex17 { - int n; - int m; - -public: - Complex17() : n(0) { - throw 1; - m = 1; - // NO-MESSAGES: initialization of 'm' follows a 'throw' statement; - } - - ~Complex17() = default; -}; - -class Complex18 { - int n; - -public: - Complex18() try { - n = risky(); - // NO-MESSAGES: initialization of 'n' in a 'try' body; - } catch (const E& e) { - n = 0; - } - - ~Complex18() = default; -}; - -class Complex19 { - int n; -public: - Complex19() { - // CHECK-FIXES: Complex19() : n(0) { - n = 0; - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - } - - explicit Complex19(int) { - // CHECK-FIXES: Complex19(int) : n(12) { - n = 12; - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - } - - ~Complex19() = default; -}; - -class VeryComplex1 { - int n1, n2, n3; - double x1, x2, x3; - int n4, n5, n6; - double x4, x5, x6; - - VeryComplex1() : n3(something_int()), x3(something_double()), - n5(something_int()), x4(something_double()), - x5(something_double()) { - // CHECK-FIXES: VeryComplex1() : n2(something_int()), n1(something_int()), n3(something_int()), x2(something_double()), x1(something_double()), x3(something_double()), - // CHECK-FIXES: n4(something_int()), n5(something_int()), n6(something_int()), x4(something_double()), - // CHECK-FIXES: x5(something_double()), x6(something_double()) { - -// FIXME: Order of elements on the constructor initializer list should match -// the order of the declaration of the fields. Thus the correct fixes -// should look like these: -// - // C ECK-FIXES: VeryComplex1() : n2(something_int()), n1(something_int()), n3(something_int()), x2(something_double()), x1(something_double()), x3(something_double()), - // C ECK-FIXES: n4(something_int()), n5(something_int()), n6(something_int()), x4(something_double()), - // C ECK-FIXES: x5(something_double()), x6(something_double()) { -// -// However, the Diagnostics Engine processes fixes in the order of the -// diagnostics and insertions to the same position are handled in left to -// right order thus in the case two adjacent fields are initialized -// inside the constructor in reverse order the provided fix is a -// constructor initializer list that does not match the order of the -// declaration of the fields. 
- - x2 = something_double(); - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x2' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - n2 = something_int(); - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n2' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - x6 = something_double(); - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x6' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - x1 = something_double(); - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x1' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - n6 = something_int(); - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n6' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - n1 = something_int(); - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n1' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - n4 = something_int(); - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n4' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] - // CHECK-FIXES: {{^\ *$}} - } -}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize-use-noexcept-opt.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize-use-noexcept-opt.cpp index 92c1387d64d66..b0f52a18edf51 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize-use-noexcept-opt.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize-use-noexcept-opt.cpp @@ -4,6 +4,7 @@ // This test is not run in C++17 or later because dynamic exception // specifications were removed in C++17. 
+using size_t = __SIZE_TYPE__; class A {}; class B {}; @@ -19,6 +20,11 @@ void k() throw(int(int)); // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: dynamic exception specification 'throw(int(int))' is deprecated; consider removing it instead [modernize-use-noexcept] // CHECK-FIXES: void k() ; +// Shouldn't crash due to llvm_unreachable in canThrow() on EST_Uninstantiated +template <int> class c { void *operator new(size_t) throw (int);}; +void s() { c<1> doesnt_crash; } +// CHECK-MESSAGES: :[[@LINE-2]]:53: warning: dynamic exception specification 'throw (int)' is deprecated; consider removing it instead [modernize-use-noexcept] + void foobar() throw(A, B) {} // CHECK-MESSAGES: :[[@LINE-2]]:15: warning: dynamic exception specification 'throw(A, B)' is deprecated; consider removing it instead [modernize-use-noexcept] diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance-unnecessary-copy-initialization.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance-unnecessary-copy-initialization.cpp index 50dcfd8f8bf22..7a70bc18a28c8 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/performance-unnecessary-copy-initialization.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/performance-unnecessary-copy-initialization.cpp @@ -23,6 +23,9 @@ struct WeirdCopyCtorType { ExpensiveToCopyType global_expensive_to_copy_type; const ExpensiveToCopyType &ExpensiveTypeReference(); +const ExpensiveToCopyType &freeFunctionWithArg(const ExpensiveToCopyType &); +const ExpensiveToCopyType &freeFunctionWithDefaultArg( + const ExpensiveToCopyType *arg = nullptr); const TrivialToCopyType &TrivialTypeReference(); void mutate(ExpensiveToCopyType &); @@ -387,3 +390,18 @@ void implicitVarFalsePositive() { for (const Element &E : Container()) { } } + +// This should not trigger the check as the argument could introduce an alias. +void negativeInitializedFromFreeFunctionWithArg() { + ExpensiveToCopyType Orig; + const ExpensiveToCopyType Copy = freeFunctionWithArg(Orig); +} + +void negativeInitializedFromFreeFunctionWithDefaultArg() { + const ExpensiveToCopyType Copy = freeFunctionWithDefaultArg(); +} + +void negativeInitialzedFromFreeFunctionWithNonDefaultArg() { + ExpensiveToCopyType Orig; + const ExpensiveToCopyType Copy = freeFunctionWithDefaultArg(&Orig); +} diff --git a/clang-tools-extra/test/lit.cfg.py b/clang-tools-extra/test/lit.cfg.py index 2366f4613db23..24cabd823844e 100644 --- a/clang-tools-extra/test/lit.cfg.py +++ b/clang-tools-extra/test/lit.cfg.py @@ -115,7 +115,7 @@ if platform.system() not in ['Windows']: config.available_features.add('ansi-escape-sequences') -if config.clang_staticanalyzer: +if config.clang_tidy_staticanalyzer: config.available_features.add('static-analyzer') # Get shlex.quote if available (added in 3.3), and fall back to pipes.quote if diff --git a/clang-tools-extra/test/lit.site.cfg.py.in b/clang-tools-extra/test/lit.site.cfg.py.in index 31ce2eaa27d00..7eef661b85fd1 100644 --- a/clang-tools-extra/test/lit.site.cfg.py.in +++ b/clang-tools-extra/test/lit.site.cfg.py.in @@ -10,7 +10,7 @@ config.clang_tools_dir = "@CLANG_TOOLS_DIR@" config.clang_libs_dir = "@SHLIBDIR@" config.python_executable = "@Python3_EXECUTABLE@" config.target_triple = "@TARGET_TRIPLE@" -config.clang_staticanalyzer = @CLANG_ENABLE_STATIC_ANALYZER@ +config.clang_tidy_staticanalyzer = @CLANG_TIDY_ENABLE_STATIC_ANALYZER@ config.libclang_include_clang_tools_extra = @LIBCLANG_INCLUDE_CLANG_TOOLS_EXTRA@ # Support substitution of the tools and libs dirs with user parameters. 
This is diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt index 56b207916ff2f..b4ce45e0b0f97 100644 --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -136,38 +136,19 @@ if( CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR ) set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX} ) if(LLVM_INCLUDE_TESTS) - if(CMAKE_VERSION VERSION_LESS 3.12) - include(FindPythonInterp) - if(NOT PYTHONINTERP_FOUND) - message(FATAL_ERROR - "Unable to find Python interpreter, required for builds and testing. - - Please install Python or specify the PYTHON_EXECUTABLE CMake variable.") - endif() - - if( ${PYTHON_VERSION_STRING} VERSION_LESS 2.7 ) - message(FATAL_ERROR "Python 2.7 or newer is required") + find_package(Python3 COMPONENTS Interpreter) + if(NOT Python3_Interpreter_FOUND) + message(WARNING "Python3 not found, using python2 as a fallback") + find_package(Python2 COMPONENTS Interpreter REQUIRED) + if(Python2_VERSION VERSION_LESS 2.7) + message(SEND_ERROR "Python 2.7 or newer is required") endif() + # Treat python2 as python3 add_executable(Python3::Interpreter IMPORTED) set_target_properties(Python3::Interpreter PROPERTIES - IMPORTED_LOCATION ${PYTHON_EXECUTABLE}) - set(Python3_EXECUTABLE ${PYTHON_EXECUTABLE}) - else() - find_package(Python3 COMPONENTS Interpreter) - if(NOT Python3_Interpreter_FOUND) - message(WARNING "Python3 not found, using python2 as a fallback") - find_package(Python2 COMPONENTS Interpreter REQUIRED) - if(Python2_VERSION VERSION_LESS 2.7) - message(SEND_ERROR "Python 2.7 or newer is required") - endif() - - # Treat python2 as python3 - add_executable(Python3::Interpreter IMPORTED) - set_target_properties(Python3::Interpreter PROPERTIES - IMPORTED_LOCATION ${Python2_EXECUTABLE}) - set(Python3_EXECUTABLE ${Python2_EXECUTABLE}) - endif() + IMPORTED_LOCATION ${Python2_EXECUTABLE}) + set(Python3_EXECUTABLE ${Python2_EXECUTABLE}) endif() # Check prebuilt llvm/utils. @@ -495,7 +476,8 @@ option(CLANG_BUILD_TOOLS "Build the Clang tools. If OFF, just generate build targets." ON) option(CLANG_ENABLE_ARCMT "Build ARCMT." ON) -option(CLANG_ENABLE_STATIC_ANALYZER "Build static analyzer." ON) +option(CLANG_ENABLE_STATIC_ANALYZER + "Include static analyzer in clang binary." ON) option(CLANG_ENABLE_PROTO_FUZZER "Build Clang protobuf fuzzer." OFF) diff --git a/clang/cmake/caches/Android.cmake b/clang/cmake/caches/Android.cmake index 6fbc4a53951e3..9e15fff033761 100644 --- a/clang/cmake/caches/Android.cmake +++ b/clang/cmake/caches/Android.cmake @@ -4,6 +4,7 @@ set(LLVM_TARGETS_TO_BUILD X86 CACHE STRING "") set(CLANG_ENABLE_ARCMT OFF CACHE BOOL "") set(CLANG_ENABLE_STATIC_ANALYZER OFF CACHE BOOL "") +set(CLANG_TIDY_ENABLE_STATIC_ANALYZER OFF CACHE BOOL "") set(CLANG_VENDOR Android CACHE STRING "") set(CMAKE_BUILD_TYPE RELEASE CACHE STRING "") diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 72a25032151ff..20e829135b33c 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -758,10 +758,14 @@ the configuration (without a prefix: ``Auto``). int bbbbbbbbbbbbbbbbbbbbb) { } + + **AttributeMacros** (``std::vector``) A vector of strings that should be interpreted as attributes/qualifiers instead of identifiers. This can be useful for language extensions or - static analyzer annotations: + static analyzer annotations. + + For example: .. code-block:: c++ @@ -775,8 +779,6 @@ the configuration (without a prefix: ``Auto``). 
AttributeMacros: ['__capability', '__output', '__ununsed'] - For example: __capability. - **BinPackArguments** (``bool``) If ``false``, a function call's arguments will either be all on the same line or will have one line each. @@ -2246,7 +2248,7 @@ the configuration (without a prefix: ``Auto``). **ObjCBreakBeforeNestedBlockParam** (``bool``) Break parameters list into lines when there is nested block - parameters in a fuction call. + parameters in a function call. .. code-block:: c++ diff --git a/clang/docs/CommandGuide/clang.rst b/clang/docs/CommandGuide/clang.rst index 394bd1be24e87..a24e138e86a7d 100644 --- a/clang/docs/CommandGuide/clang.rst +++ b/clang/docs/CommandGuide/clang.rst @@ -338,12 +338,12 @@ number of cross compilers, or may only support a native target. .. option:: --print-supported-cpus Print out a list of supported processors for the given target (specified - through --target=<architecture> or -arch <architecture>). If no target is - specified, the system default target will be used. + through ``--target=<architecture>`` or :option:`-arch` ``<architecture>``). If no + target is specified, the system default target will be used. .. option:: -mcpu=?, -mtune=? - Aliases of --print-supported-cpus + Acts as an alias for :option:`--print-supported-cpus`. .. option:: -march=<cpu> @@ -385,7 +385,7 @@ Code Generation Options :option:`-Og` Like :option:`-O1`. In future versions, this option might disable different optimizations in order to improve debuggability. - :option:`-O` Equivalent to :option:`-O2`. + :option:`-O` Equivalent to :option:`-O1`. :option:`-O4` and higher diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index c89f924c58ba2..256f7e12364f8 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -2408,20 +2408,6 @@ with ``__has_feature(cxx_constexpr_string_builtins)``. Memory builtins --------------- - * ``__builtin_memcpy_inline`` - -.. code-block:: c - - void __builtin_memcpy_inline(void *dst, const void *src, size_t size); - -``__builtin_memcpy_inline(dst, src, size)`` is identical to -``__builtin_memcpy(dst, src, size)`` except that the generated code is -guaranteed not to call any external functions. See [LLVM IR 'llvm.memcpy.inline' -Intrinsic](https://llvm.org/docs/LangRef.html#llvm-memcpy-inline-intrinsic) for -more information. - -Note that the `size` argument must be a compile time constant. - Clang provides constant expression evaluation support for builtin forms of the following functions from the C standard library headers ``<string.h>`` and ``<wchar.h>``: @@ -2439,7 +2425,27 @@ are pointers to arrays with the same trivially copyable element type, and the given size is an exact multiple of the element size that is no greater than the number of elements accessible through the source and destination operands. -Constant evaluation support is not yet provided for ``__builtin_memcpy_inline``. +Guaranteed inlined copy +^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: c + + void __builtin_memcpy_inline(void *dst, const void *src, size_t size); + + +``__builtin_memcpy_inline`` has been designed as a building block for efficient +``memcpy`` implementations. It is identical to ``__builtin_memcpy`` but also +guarantees not to call any external functions. See LLVM IR `llvm.memcpy.inline +<https://llvm.org/docs/LangRef.html#llvm-memcpy-inline-intrinsic>`_ intrinsic +for more information. + +This is useful to implement a custom version of ``memcpy``, implement a +``libc`` memcpy, or work around the absence of a ``libc``. + +Note that the `size` argument must be a compile time constant. 
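For illustration, a minimal sketch (assuming a Clang build that provides the intrinsic) of wrapping it so the size is always a compile-time constant:

.. code-block:: c++

   // Size is a template parameter, so the third argument is always a
   // compile-time constant, as the builtin requires.
   template <unsigned long Size>
   void copyFixed(void *dst, const void *src) {
     __builtin_memcpy_inline(dst, src, Size);
   }
   // usage: copyFixed<16>(to, from);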
+ +Note that this intrinsic cannot yet be called in a ``constexpr`` context. + Atomic Min/Max builtins with memory ordering -------------------------------------------- diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html index eb85e420e7e4d..c4c6de117c1c0 100644 --- a/clang/docs/LibASTMatchersReference.html +++ b/clang/docs/LibASTMatchersReference.html @@ -649,6 +649,30 @@

Node Matchers

+Matcher<DecompositionDecl>decompositionDeclMatcher<DecompositionDecl>... +
Matches decomposition-declarations.
+
+Example matches the declaration node with foo and bar, but not
+number.
+(matcher = declStmt(has(decompositionDecl())))
+
+  int number = 42;
+  auto [foo, bar] = std::make_pair(42, 42);
+
+ + Matcher<NestedNameSpecifierLoc>nestedNameSpecifierLocMatcher<NestedNameSpecifierLoc>...
Same as nestedNameSpecifier but matches NestedNameSpecifierLoc.
 
@@ -5322,6 +5346,60 @@

AST Traversal Matchers

+Matcher<CXXConstructExpr>forEachArgumentWithParamTypeMatcher<Expr> ArgMatcher, Matcher<QualType> ParamMatcher +
Matches all arguments and their respective types for a CallExpr or
+CXXConstructExpr. It is very similar to forEachArgumentWithParam but
+it works on calls through function pointers as well.
+
+The difference is that function pointers do not provide access to a
+ParmVarDecl, but only the QualType for each argument.
+
+Given
+  void f(int i);
+  int y;
+  f(y);
+  void (*f_ptr)(int) = f;
+  f_ptr(y);
+callExpr(
+  forEachArgumentWithParamType(
+    declRefExpr(to(varDecl(hasName("y")))),
+    qualType(isInteger()).bind("type")
+))
+  matches f(y) and f_ptr(y)
+with declRefExpr(...)
+  matching int y
+and qualType(...)
+  matching int
+
+ + Matcher<CXXConstructExpr>hasAnyArgumentMatcher<Expr> InnerMatcher
Matches any argument of a call expression or a constructor call
 expression, or an ObjC-message-send expression.
@@ -5850,6 +5928,60 @@ 

AST Traversal Matchers

+Matcher<CallExpr>forEachArgumentWithParamTypeMatcher<Expr> ArgMatcher, Matcher<QualType> ParamMatcher +
Matches all arguments and their respective types for a CallExpr or
+CXXConstructExpr. It is very similar to forEachArgumentWithParam but
+it works on calls through function pointers as well.
+
+The difference is that function pointers do not provide access to a
+ParmVarDecl, but only the QualType for each argument.
+
+Given
+  void f(int i);
+  int y;
+  f(y);
+  void (*f_ptr)(int) = f;
+  f_ptr(y);
+callExpr(
+  forEachArgumentWithParamType(
+    declRefExpr(to(varDecl(hasName("y")))),
+    qualType(isInteger()).bind("type")
+))
+  matches f(y) and f_ptr(y)
+with declRefExpr(...)
+  matching int y
+and qualType(...)
+  matching int
+
+ + Matcher<CallExpr>hasAnyArgumentMatcher<Expr> InnerMatcher
Matches any argument of a call expression or a constructor call
 expression, or an ObjC-message-send expression.
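For illustration, a hedged sketch of building this matcher from C++ (it mirrors the documentation example above; the ``"type"`` binding id is an arbitrary name chosen here):

.. code-block:: c++

   #include "clang/ASTMatchers/ASTMatchers.h"

   using namespace clang::ast_matchers;

   StatementMatcher integerArgsMatcher() {
     return callExpr(forEachArgumentWithParamType(
         declRefExpr(to(varDecl(hasName("y")))),   // the argument expression
         qualType(isInteger()).bind("type")));     // the parameter's type
   }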
diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst
index 1a1aea2ae5382..2d0d71443dfda 100644
--- a/clang/docs/UsersManual.rst
+++ b/clang/docs/UsersManual.rst
@@ -1700,9 +1700,12 @@ are listed below.
 
 **-fbasic-block-sections=[labels, all, list=, none]**
 
-  Controls whether Clang emits a label for each basic block.  Further, with
-  values "all" and "list=arg", each basic block or a subset of basic blocks
-  can be placed in its own unique section.
+  Controls how Clang emits text sections for basic blocks. With values ``all``
+  and ``list=``, each basic block or a subset of basic blocks can be placed
+  in its own unique section. With the ``labels`` value, normal text sections are
+  emitted, but a ``.bb_addr_map`` section is also emitted, which includes address
+  offsets for each basic block in the program, relative to the parent function
+  address.
 
   With the ``list=`` option, a file containing the subset of basic blocks
that need to be placed in unique sections can be specified.  The format of the
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index 7a294f916bcf9..9fb6782cf5a5e 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -1491,6 +1491,23 @@ Warn about assigning non-{0,1} values to boolean variables.
 alpha.core
 ^^^^^^^^^^
 
+.. _alpha-core-C11Lock:
+
+alpha.core.C11Lock
+""""""""""""""""""
+Similarly to :ref:`alpha.unix.PthreadLock <alpha-unix-PthreadLock>`, checks for
+the locking/unlocking of ``mtx_t`` mutexes.
+
+.. code-block:: cpp
+
+ mtx_t mtx1;
+
+ void bad1(void)
+ {
+   mtx_lock(&mtx1);
+   mtx_lock(&mtx1); // warn: This lock has already been acquired
+ }
+
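For contrast, a hypothetical well-formed sequence that would not be flagged (assumes C11 ``<threads.h>``; the mutex is released before it is re-acquired):

.. code-block:: cpp

 mtx_t mtx2;

 void good1(void)
 {
   mtx_lock(&mtx2);
   mtx_unlock(&mtx2);
   mtx_lock(&mtx2); // no warning: the mutex was released in between
 }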
 .. _alpha-core-CallAndMessageUnInitRefArg:
 
 alpha.core.CallAndMessageUnInitRefArg (C,C++, ObjC)
@@ -1868,6 +1885,26 @@ Check for dereference of null smart pointers.
    *P; // warn: dereference of a default constructed smart unique_ptr
  }
 
+alpha.fuchsia
+^^^^^^^^^^^^^
+
+.. _alpha-fuchsia-lock:
+
+alpha.fuchsia.Lock
+""""""""""""""""""
+Similarly to :ref:`alpha.unix.PthreadLock <alpha-unix-PthreadLock>`, checks for
+the locking/unlocking of fuchsia mutexes.
+
+.. code-block:: cpp
+
+ spin_lock_t mtx1;
+
+ void bad1(void)
+ {
+   spin_lock(&mtx1);
+   spin_lock(&mtx1); // warn: This lock has already been acquired
+ }
+
 alpha.llvm
 ^^^^^^^^^^
 
diff --git a/clang/docs/analyzer/developer-docs/DebugChecks.rst b/clang/docs/analyzer/developer-docs/DebugChecks.rst
index 48b584a463072..45985a1dfd793 100644
--- a/clang/docs/analyzer/developer-docs/DebugChecks.rst
+++ b/clang/docs/analyzer/developer-docs/DebugChecks.rst
@@ -30,7 +30,7 @@ using a 'dot' format viewer (such as Graphviz on macOS) instead.
 - debug.DumpLiveVars: Show the results of live variable analysis for each
   top-level function being analyzed.
 
-- debug.DumpLiveStmts: Show the results of live statement analysis for each
+- debug.DumpLiveExprs: Show the results of live expression analysis for each
   top-level function being analyzed.
 
 - debug.ViewExplodedGraph: Show the Exploded Graphs generated for the
diff --git a/clang/docs/analyzer/user-docs/CrossTranslationUnit.rst b/clang/docs/analyzer/user-docs/CrossTranslationUnit.rst
index 36be82f209ef2..0606185f39e64 100644
--- a/clang/docs/analyzer/user-docs/CrossTranslationUnit.rst
+++ b/clang/docs/analyzer/user-docs/CrossTranslationUnit.rst
@@ -201,6 +201,8 @@ Example usage of scan-build-py:
   ^C
   $
 
+.. _ctu-on-demand:
+
 On-demand analysis
 __________________
 The analysis produces the necessary AST structure of external TUs during analysis. This requires the
diff --git a/clang/include/clang/AST/APValue.h b/clang/include/clang/AST/APValue.h
index 5103cfa8604e5..6307f8a92e5a2 100644
--- a/clang/include/clang/AST/APValue.h
+++ b/clang/include/clang/AST/APValue.h
@@ -174,6 +174,7 @@ class APValue {
       return !(LHS == RHS);
     }
     friend llvm::hash_code hash_value(const LValueBase &Base);
+    friend struct llvm::DenseMapInfo<LValueBase>;
 
   private:
     PtrTy Ptr;
@@ -201,8 +202,7 @@ class APValue {
 
   public:
     LValuePathEntry() : Value() {}
-    LValuePathEntry(BaseOrMemberType BaseOrMember)
-        : Value{reinterpret_cast<uintptr_t>(BaseOrMember.getOpaqueValue())} {}
+    LValuePathEntry(BaseOrMemberType BaseOrMember);
     static LValuePathEntry ArrayIndex(uint64_t Index) {
       LValuePathEntry Result;
       Result.Value = Index;
diff --git a/clang/include/clang/AST/ASTStructuralEquivalence.h b/clang/include/clang/AST/ASTStructuralEquivalence.h
index 36a42070fd281..c958a16aba213 100644
--- a/clang/include/clang/AST/ASTStructuralEquivalence.h
+++ b/clang/include/clang/AST/ASTStructuralEquivalence.h
@@ -97,6 +97,13 @@ struct StructuralEquivalenceContext {
   /// \c VisitedDecls members) and can cause faulty equivalent results.
   bool IsEquivalent(QualType T1, QualType T2);
 
+  /// Determine whether the two statements are structurally equivalent.
+  /// Implementation functions (all static functions in
+  /// ASTStructuralEquivalence.cpp) must never call this function because that
+  /// will wreak havoc the internal state (\c DeclsToCheck and
+  /// \c VisitedDecls members) and can cause faulty equivalent results.
+  bool IsEquivalent(Stmt *S1, Stmt *S2);
+
   /// Find the index of the given anonymous struct/union within its
   /// context.
   ///
diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
index 26e52ad367f81..1672fd707c6d2 100644
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -3440,9 +3440,11 @@ class CastExpr : public Expr {
   }
   CXXBaseSpecifier **path_buffer();
 
+  friend class ASTStmtReader;
+
 protected:
   CastExpr(StmtClass SC, QualType ty, ExprValueKind VK, const CastKind kind,
-           Expr *op, unsigned BasePathSize)
+           Expr *op, unsigned BasePathSize, bool HasFPFeatures)
       : Expr(SC, ty, VK, OK_Ordinary), Op(op) {
     CastExprBits.Kind = kind;
     CastExprBits.PartOfExplicitCast = false;
@@ -3451,17 +3453,27 @@ class CastExpr : public Expr {
            "BasePathSize overflow!");
     setDependence(computeDependence(this));
     assert(CastConsistency());
+    CastExprBits.HasFPFeatures = HasFPFeatures;
   }
 
   /// Construct an empty cast.
-  CastExpr(StmtClass SC, EmptyShell Empty, unsigned BasePathSize)
-    : Expr(SC, Empty) {
+  CastExpr(StmtClass SC, EmptyShell Empty, unsigned BasePathSize,
+           bool HasFPFeatures)
+      : Expr(SC, Empty) {
     CastExprBits.PartOfExplicitCast = false;
     CastExprBits.BasePathSize = BasePathSize;
+    CastExprBits.HasFPFeatures = HasFPFeatures;
     assert((CastExprBits.BasePathSize == BasePathSize) &&
            "BasePathSize overflow!");
   }
 
+  /// Return a pointer to the trailing FPOptions.
+  /// \pre hasStoredFPFeatures() == true
+  FPOptionsOverride *getTrailingFPFeatures();
+  const FPOptionsOverride *getTrailingFPFeatures() const {
+    return const_cast<CastExpr *>(this)->getTrailingFPFeatures();
+  }
+
 public:
   CastKind getCastKind() const { return (CastKind) CastExprBits.Kind; }
   void setCastKind(CastKind K) { CastExprBits.Kind = K; }
@@ -3506,6 +3518,28 @@ class CastExpr : public Expr {
     return getTargetFieldForToUnionCast(getType(), getSubExpr()->getType());
   }
 
+  bool hasStoredFPFeatures() const { return CastExprBits.HasFPFeatures; }
+
+  /// Get FPOptionsOverride from trailing storage.
+  FPOptionsOverride getStoredFPFeatures() const {
+    assert(hasStoredFPFeatures());
+    return *getTrailingFPFeatures();
+  }
+
+  // Get the FP features status of this operation. Only meaningful for
+  // operations on floating point types.
+  FPOptions getFPFeaturesInEffect(const LangOptions &LO) const {
+    if (hasStoredFPFeatures())
+      return getStoredFPFeatures().applyOverrides(LO);
+    return FPOptions::defaultWithoutTrailingStorage(LO);
+  }
+
+  FPOptionsOverride getFPFeatures() const {
+    if (hasStoredFPFeatures())
+      return getStoredFPFeatures();
+    return FPOptionsOverride();
+  }
+
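A minimal sketch (not part of the patch) of how a consumer might query the new
accessors; the LangOptions are assumed to come from the node's ASTContext:

  // Sketch only: reading the FP state of a cast via the new API.
  static void inspectCastFPState(const clang::CastExpr *CE,
                                 const clang::ASTContext &Ctx) {
    clang::FPOptions FPO = CE->getFPFeaturesInEffect(Ctx.getLangOpts());
    if (CE->hasStoredFPFeatures()) {
      // Explicit overrides live in the trailing storage added by this patch.
      clang::FPOptionsOverride Overrides = CE->getStoredFPFeatures();
      (void)Overrides;
    }
    (void)FPO;
  }
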
   static const FieldDecl *getTargetFieldForToUnionCast(QualType unionType,
                                                        QualType opType);
   static const FieldDecl *getTargetFieldForToUnionCast(const RecordDecl *RD,
@@ -3543,21 +3577,35 @@ class CastExpr : public Expr {
 /// @endcode
 class ImplicitCastExpr final
     : public CastExpr,
-      private llvm::TrailingObjects<ImplicitCastExpr, CXXBaseSpecifier *> {
+      private llvm::TrailingObjects<ImplicitCastExpr, CXXBaseSpecifier *,
+                                    FPOptionsOverride> {
 
   ImplicitCastExpr(QualType ty, CastKind kind, Expr *op,
-                   unsigned BasePathLength, ExprValueKind VK)
-    : CastExpr(ImplicitCastExprClass, ty, VK, kind, op, BasePathLength) { }
+                   unsigned BasePathLength, FPOptionsOverride FPO,
+                   ExprValueKind VK)
+      : CastExpr(ImplicitCastExprClass, ty, VK, kind, op, BasePathLength,
+                 FPO.requiresTrailingStorage()) {
+    if (hasStoredFPFeatures())
+      *getTrailingFPFeatures() = FPO;
+  }
 
   /// Construct an empty implicit cast.
-  explicit ImplicitCastExpr(EmptyShell Shell, unsigned PathSize)
-    : CastExpr(ImplicitCastExprClass, Shell, PathSize) { }
+  explicit ImplicitCastExpr(EmptyShell Shell, unsigned PathSize,
+                            bool HasFPFeatures)
+      : CastExpr(ImplicitCastExprClass, Shell, PathSize, HasFPFeatures) {}
+
+  unsigned numTrailingObjects(OverloadToken<CXXBaseSpecifier *>) const {
+    return path_size();
+  }
 
 public:
   enum OnStack_t { OnStack };
   ImplicitCastExpr(OnStack_t _, QualType ty, CastKind kind, Expr *op,
-                   ExprValueKind VK)
-    : CastExpr(ImplicitCastExprClass, ty, VK, kind, op, 0) {
+                   ExprValueKind VK, FPOptionsOverride FPO)
+      : CastExpr(ImplicitCastExprClass, ty, VK, kind, op, 0,
+                 FPO.requiresTrailingStorage()) {
+    if (hasStoredFPFeatures())
+      *getTrailingFPFeatures() = FPO;
   }
 
   bool isPartOfExplicitCast() const { return CastExprBits.PartOfExplicitCast; }
@@ -3568,10 +3616,10 @@ class ImplicitCastExpr final
   static ImplicitCastExpr *Create(const ASTContext &Context, QualType T,
                                   CastKind Kind, Expr *Operand,
                                   const CXXCastPath *BasePath,
-                                  ExprValueKind Cat);
+                                  ExprValueKind Cat, FPOptionsOverride FPO);
 
   static ImplicitCastExpr *CreateEmpty(const ASTContext &Context,
-                                       unsigned PathSize);
+                                       unsigned PathSize, bool HasFPFeatures);
 
   SourceLocation getBeginLoc() const LLVM_READONLY {
     return getSubExpr()->getBeginLoc();
@@ -3612,12 +3660,14 @@ class ExplicitCastExpr : public CastExpr {
 protected:
   ExplicitCastExpr(StmtClass SC, QualType exprTy, ExprValueKind VK,
                    CastKind kind, Expr *op, unsigned PathSize,
-                   TypeSourceInfo *writtenTy)
-    : CastExpr(SC, exprTy, VK, kind, op, PathSize), TInfo(writtenTy) {}
+                   bool HasFPFeatures, TypeSourceInfo *writtenTy)
+      : CastExpr(SC, exprTy, VK, kind, op, PathSize, HasFPFeatures),
+        TInfo(writtenTy) {}
 
   /// Construct an empty explicit cast.
-  ExplicitCastExpr(StmtClass SC, EmptyShell Shell, unsigned PathSize)
-    : CastExpr(SC, Shell, PathSize) { }
+  ExplicitCastExpr(StmtClass SC, EmptyShell Shell, unsigned PathSize,
+                   bool HasFPFeatures)
+      : CastExpr(SC, Shell, PathSize, HasFPFeatures) {}
 
 public:
   /// getTypeInfoAsWritten - Returns the type source info for the type
@@ -3640,29 +3690,38 @@ class ExplicitCastExpr : public CastExpr {
 /// (Type)expr. For example: @c (int)f.
 class CStyleCastExpr final
     : public ExplicitCastExpr,
-      private llvm::TrailingObjects<CStyleCastExpr, CXXBaseSpecifier *> {
+      private llvm::TrailingObjects<CStyleCastExpr, CXXBaseSpecifier *,
+                                    FPOptionsOverride> {
   SourceLocation LPLoc; // the location of the left paren
   SourceLocation RPLoc; // the location of the right paren
 
   CStyleCastExpr(QualType exprTy, ExprValueKind vk, CastKind kind, Expr *op,
-                 unsigned PathSize, TypeSourceInfo *writtenTy,
-                 SourceLocation l, SourceLocation r)
-    : ExplicitCastExpr(CStyleCastExprClass, exprTy, vk, kind, op, PathSize,
-                       writtenTy), LPLoc(l), RPLoc(r) {}
+                 unsigned PathSize, FPOptionsOverride FPO,
+                 TypeSourceInfo *writtenTy, SourceLocation l, SourceLocation r)
+      : ExplicitCastExpr(CStyleCastExprClass, exprTy, vk, kind, op, PathSize,
+                         FPO.requiresTrailingStorage(), writtenTy),
+        LPLoc(l), RPLoc(r) {
+    if (hasStoredFPFeatures())
+      *getTrailingFPFeatures() = FPO;
+  }
 
   /// Construct an empty C-style explicit cast.
-  explicit CStyleCastExpr(EmptyShell Shell, unsigned PathSize)
-    : ExplicitCastExpr(CStyleCastExprClass, Shell, PathSize) { }
+  explicit CStyleCastExpr(EmptyShell Shell, unsigned PathSize,
+                          bool HasFPFeatures)
+      : ExplicitCastExpr(CStyleCastExprClass, Shell, PathSize, HasFPFeatures) {}
+
+  unsigned numTrailingObjects(OverloadToken<CXXBaseSpecifier *>) const {
+    return path_size();
+  }
 
 public:
-  static CStyleCastExpr *Create(const ASTContext &Context, QualType T,
-                                ExprValueKind VK, CastKind K,
-                                Expr *Op, const CXXCastPath *BasePath,
-                                TypeSourceInfo *WrittenTy, SourceLocation L,
-                                SourceLocation R);
+  static CStyleCastExpr *
+  Create(const ASTContext &Context, QualType T, ExprValueKind VK, CastKind K,
+         Expr *Op, const CXXCastPath *BasePath, FPOptionsOverride FPO,
+         TypeSourceInfo *WrittenTy, SourceLocation L, SourceLocation R);
 
   static CStyleCastExpr *CreateEmpty(const ASTContext &Context,
-                                     unsigned PathSize);
+                                     unsigned PathSize, bool HasFPFeatures);
 
   SourceLocation getLParenLoc() const { return LPLoc; }
   void setLParenLoc(SourceLocation L) { LPLoc = L; }
diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h
index 6b4b57eca9bea..9658f37723e18 100644
--- a/clang/include/clang/AST/ExprCXX.h
+++ b/clang/include/clang/AST/ExprCXX.h
@@ -374,16 +374,17 @@ class CXXNamedCastExpr : public ExplicitCastExpr {
 protected:
   friend class ASTStmtReader;
 
-  CXXNamedCastExpr(StmtClass SC, QualType ty, ExprValueKind VK,
-                   CastKind kind, Expr *op, unsigned PathSize,
+  CXXNamedCastExpr(StmtClass SC, QualType ty, ExprValueKind VK, CastKind kind,
+                   Expr *op, unsigned PathSize, bool HasFPFeatures,
                    TypeSourceInfo *writtenTy, SourceLocation l,
-                   SourceLocation RParenLoc,
-                   SourceRange AngleBrackets)
-      : ExplicitCastExpr(SC, ty, VK, kind, op, PathSize, writtenTy), Loc(l),
-        RParenLoc(RParenLoc), AngleBrackets(AngleBrackets) {}
+                   SourceLocation RParenLoc, SourceRange AngleBrackets)
+      : ExplicitCastExpr(SC, ty, VK, kind, op, PathSize, HasFPFeatures,
+                         writtenTy),
+        Loc(l), RParenLoc(RParenLoc), AngleBrackets(AngleBrackets) {}
 
-  explicit CXXNamedCastExpr(StmtClass SC, EmptyShell Shell, unsigned PathSize)
-      : ExplicitCastExpr(SC, Shell, PathSize) {}
+  explicit CXXNamedCastExpr(StmtClass SC, EmptyShell Shell, unsigned PathSize,
+                            bool HasFPFeatures)
+      : ExplicitCastExpr(SC, Shell, PathSize, HasFPFeatures) {}
 
 public:
   const char *getCastName() const;
@@ -419,29 +420,39 @@ class CXXNamedCastExpr : public ExplicitCastExpr {
 /// \c static_cast(1.0).
 class CXXStaticCastExpr final
     : public CXXNamedCastExpr,
-      private llvm::TrailingObjects<CXXStaticCastExpr, CXXBaseSpecifier *> {
+      private llvm::TrailingObjects<CXXStaticCastExpr, CXXBaseSpecifier *,
+                                    FPOptionsOverride> {
   CXXStaticCastExpr(QualType ty, ExprValueKind vk, CastKind kind, Expr *op,
                     unsigned pathSize, TypeSourceInfo *writtenTy,
-                    SourceLocation l, SourceLocation RParenLoc,
-                    SourceRange AngleBrackets)
+                    FPOptionsOverride FPO, SourceLocation l,
+                    SourceLocation RParenLoc, SourceRange AngleBrackets)
       : CXXNamedCastExpr(CXXStaticCastExprClass, ty, vk, kind, op, pathSize,
-                         writtenTy, l, RParenLoc, AngleBrackets) {}
+                         FPO.requiresTrailingStorage(), writtenTy, l, RParenLoc,
+                         AngleBrackets) {
+    if (hasStoredFPFeatures())
+      *getTrailingFPFeatures() = FPO;
+  }
 
-  explicit CXXStaticCastExpr(EmptyShell Empty, unsigned PathSize)
-      : CXXNamedCastExpr(CXXStaticCastExprClass, Empty, PathSize) {}
+  explicit CXXStaticCastExpr(EmptyShell Empty, unsigned PathSize,
+                             bool HasFPFeatures)
+      : CXXNamedCastExpr(CXXStaticCastExprClass, Empty, PathSize,
+                         HasFPFeatures) {}
+
+  unsigned numTrailingObjects(OverloadToken<CXXBaseSpecifier *>) const {
+    return path_size();
+  }
 
 public:
   friend class CastExpr;
   friend TrailingObjects;
 
-  static CXXStaticCastExpr *Create(const ASTContext &Context, QualType T,
-                                   ExprValueKind VK, CastKind K, Expr *Op,
-                                   const CXXCastPath *Path,
-                                   TypeSourceInfo *Written, SourceLocation L,
-                                   SourceLocation RParenLoc,
-                                   SourceRange AngleBrackets);
+  static CXXStaticCastExpr *
+  Create(const ASTContext &Context, QualType T, ExprValueKind VK, CastKind K,
+         Expr *Op, const CXXCastPath *Path, TypeSourceInfo *Written,
+         FPOptionsOverride FPO, SourceLocation L, SourceLocation RParenLoc,
+         SourceRange AngleBrackets);
   static CXXStaticCastExpr *CreateEmpty(const ASTContext &Context,
-                                        unsigned PathSize);
+                                        unsigned PathSize, bool hasFPFeatures);
 
   static bool classof(const Stmt *T) {
     return T->getStmtClass() == CXXStaticCastExprClass;
@@ -456,15 +467,17 @@ class CXXStaticCastExpr final
 class CXXDynamicCastExpr final
     : public CXXNamedCastExpr,
       private llvm::TrailingObjects<CXXDynamicCastExpr, CXXBaseSpecifier *> {
-  CXXDynamicCastExpr(QualType ty, ExprValueKind VK, CastKind kind,
-                     Expr *op, unsigned pathSize, TypeSourceInfo *writtenTy,
+  CXXDynamicCastExpr(QualType ty, ExprValueKind VK, CastKind kind, Expr *op,
+                     unsigned pathSize, TypeSourceInfo *writtenTy,
                      SourceLocation l, SourceLocation RParenLoc,
                      SourceRange AngleBrackets)
       : CXXNamedCastExpr(CXXDynamicCastExprClass, ty, VK, kind, op, pathSize,
-                         writtenTy, l, RParenLoc, AngleBrackets) {}
+                         /*HasFPFeatures*/ false, writtenTy, l, RParenLoc,
+                         AngleBrackets) {}
 
   explicit CXXDynamicCastExpr(EmptyShell Empty, unsigned pathSize)
-      : CXXNamedCastExpr(CXXDynamicCastExprClass, Empty, pathSize) {}
+      : CXXNamedCastExpr(CXXDynamicCastExprClass, Empty, pathSize,
+                         /*HasFPFeatures*/ false) {}
 
 public:
   friend class CastExpr;
@@ -499,16 +512,17 @@ class CXXReinterpretCastExpr final
     : public CXXNamedCastExpr,
       private llvm::TrailingObjects<CXXReinterpretCastExpr, CXXBaseSpecifier *> {
-  CXXReinterpretCastExpr(QualType ty, ExprValueKind vk, CastKind kind,
-                         Expr *op, unsigned pathSize,
-                         TypeSourceInfo *writtenTy, SourceLocation l,
-                         SourceLocation RParenLoc,
+  CXXReinterpretCastExpr(QualType ty, ExprValueKind vk, CastKind kind, Expr *op,
+                         unsigned pathSize, TypeSourceInfo *writtenTy,
+                         SourceLocation l, SourceLocation RParenLoc,
                          SourceRange AngleBrackets)
       : CXXNamedCastExpr(CXXReinterpretCastExprClass, ty, vk, kind, op,
-                         pathSize, writtenTy, l, RParenLoc, AngleBrackets) {}
+                         pathSize, /*HasFPFeatures*/ false, writtenTy, l,
+                         RParenLoc, AngleBrackets) {}
 
   CXXReinterpretCastExpr(EmptyShell Empty, unsigned pathSize)
-      : CXXNamedCastExpr(CXXReinterpretCastExprClass, Empty, pathSize) {}
+      : CXXNamedCastExpr(CXXReinterpretCastExprClass, Empty, pathSize,
+                         /*HasFPFeatures*/ false) {}
 
 public:
   friend class CastExpr;
@@ -541,11 +555,13 @@ class CXXConstCastExpr final
   CXXConstCastExpr(QualType ty, ExprValueKind VK, Expr *op,
                    TypeSourceInfo *writtenTy, SourceLocation l,
                    SourceLocation RParenLoc, SourceRange AngleBrackets)
-      : CXXNamedCastExpr(CXXConstCastExprClass, ty, VK, CK_NoOp, op,
-                         0, writtenTy, l, RParenLoc, AngleBrackets) {}
+      : CXXNamedCastExpr(CXXConstCastExprClass, ty, VK, CK_NoOp, op, 0,
+                         /*HasFPFeatures*/ false, writtenTy, l, RParenLoc,
+                         AngleBrackets) {}
 
   explicit CXXConstCastExpr(EmptyShell Empty)
-      : CXXNamedCastExpr(CXXConstCastExprClass, Empty, 0) {}
+      : CXXNamedCastExpr(CXXConstCastExprClass, Empty, 0,
+                         /*HasFPFeatures*/ false) {}
 
 public:
   friend class CastExpr;
@@ -578,10 +594,12 @@ class CXXAddrspaceCastExpr final
                        TypeSourceInfo *writtenTy, SourceLocation l,
                        SourceLocation RParenLoc, SourceRange AngleBrackets)
       : CXXNamedCastExpr(CXXAddrspaceCastExprClass, ty, VK, Kind, op, 0,
-                         writtenTy, l, RParenLoc, AngleBrackets) {}
+                         /*HasFPFeatures*/ false, writtenTy, l, RParenLoc,
+                         AngleBrackets) {}
 
   explicit CXXAddrspaceCastExpr(EmptyShell Empty)
-      : CXXNamedCastExpr(CXXAddrspaceCastExprClass, Empty, 0) {}
+      : CXXNamedCastExpr(CXXAddrspaceCastExprClass, Empty, 0,
+                         /*HasFPFeatures*/ false) {}
 
 public:
   friend class CastExpr;
@@ -840,6 +858,10 @@ class CXXTypeidExpr : public Expr {
   /// evaluated, per C++11 [expr.typeid]p3.
   bool isPotentiallyEvaluated() const;
 
+  /// Best-effort check if the expression operand refers to a most derived
+  /// object. This is not a strong guarantee.
+  bool isMostDerived(ASTContext &Context) const;
+
   bool isTypeOperand() const { return Operand.is<TypeSourceInfo *>(); }
 
   /// Retrieves the type operand of this typeid() expression after
@@ -1693,34 +1715,43 @@ class CXXInheritedCtorInitExpr : public Expr {
 /// \endcode
 class CXXFunctionalCastExpr final
     : public ExplicitCastExpr,
-      private llvm::TrailingObjects<CXXFunctionalCastExpr, CXXBaseSpecifier *> {
+      private llvm::TrailingObjects<CXXFunctionalCastExpr, CXXBaseSpecifier *,
+                                    FPOptionsOverride> {
   SourceLocation LParenLoc;
   SourceLocation RParenLoc;
 
   CXXFunctionalCastExpr(QualType ty, ExprValueKind VK,
-                        TypeSourceInfo *writtenTy,
-                        CastKind kind, Expr *castExpr, unsigned pathSize,
-                        SourceLocation lParenLoc, SourceLocation rParenLoc)
-      : ExplicitCastExpr(CXXFunctionalCastExprClass, ty, VK, kind,
-                         castExpr, pathSize, writtenTy),
-        LParenLoc(lParenLoc), RParenLoc(rParenLoc) {}
+                        TypeSourceInfo *writtenTy, CastKind kind,
+                        Expr *castExpr, unsigned pathSize,
+                        FPOptionsOverride FPO, SourceLocation lParenLoc,
+                        SourceLocation rParenLoc)
+      : ExplicitCastExpr(CXXFunctionalCastExprClass, ty, VK, kind, castExpr,
+                         pathSize, FPO.requiresTrailingStorage(), writtenTy),
+        LParenLoc(lParenLoc), RParenLoc(rParenLoc) {
+    if (hasStoredFPFeatures())
+      *getTrailingFPFeatures() = FPO;
+  }
 
-  explicit CXXFunctionalCastExpr(EmptyShell Shell, unsigned PathSize)
-      : ExplicitCastExpr(CXXFunctionalCastExprClass, Shell, PathSize) {}
+  explicit CXXFunctionalCastExpr(EmptyShell Shell, unsigned PathSize,
+                                 bool HasFPFeatures)
+      : ExplicitCastExpr(CXXFunctionalCastExprClass, Shell, PathSize,
+                         HasFPFeatures) {}
+
+  unsigned numTrailingObjects(OverloadToken<CXXBaseSpecifier *>) const {
+    return path_size();
+  }
 
 public:
   friend class CastExpr;
   friend TrailingObjects;
 
-  static CXXFunctionalCastExpr *Create(const ASTContext &Context, QualType T,
-                                       ExprValueKind VK,
-                                       TypeSourceInfo *Written,
-                                       CastKind Kind, Expr *Op,
-                                       const CXXCastPath *Path,
-                                       SourceLocation LPLoc,
-                                       SourceLocation RPLoc);
-  static CXXFunctionalCastExpr *CreateEmpty(const ASTContext &Context,
-                                            unsigned PathSize);
+  static CXXFunctionalCastExpr *
+  Create(const ASTContext &Context, QualType T, ExprValueKind VK,
+         TypeSourceInfo *Written, CastKind Kind, Expr *Op,
+         const CXXCastPath *Path, FPOptionsOverride FPO, SourceLocation LPLoc,
+         SourceLocation RPLoc);
+  static CXXFunctionalCastExpr *
+  CreateEmpty(const ASTContext &Context, unsigned PathSize, bool HasFPFeatures);
 
   SourceLocation getLParenLoc() const { return LParenLoc; }
   void setLParenLoc(SourceLocation L) { LParenLoc = L; }
@@ -4828,11 +4859,11 @@ class BuiltinBitCastExpr final
   BuiltinBitCastExpr(QualType T, ExprValueKind VK, CastKind CK, Expr *SrcExpr,
                      TypeSourceInfo *DstType, SourceLocation KWLoc,
                      SourceLocation RParenLoc)
-      : ExplicitCastExpr(BuiltinBitCastExprClass, T, VK, CK, SrcExpr, 0,
+      : ExplicitCastExpr(BuiltinBitCastExprClass, T, VK, CK, SrcExpr, 0, false,
                          DstType),
         KWLoc(KWLoc), RParenLoc(RParenLoc) {}
   BuiltinBitCastExpr(EmptyShell Empty)
-      : ExplicitCastExpr(BuiltinBitCastExprClass, Empty, 0) {}
+      : ExplicitCastExpr(BuiltinBitCastExprClass, Empty, 0, false) {}
 
   SourceLocation getBeginLoc() const LLVM_READONLY { return KWLoc; }
   SourceLocation getEndLoc() const LLVM_READONLY { return RParenLoc; }
diff --git a/clang/include/clang/AST/ExprObjC.h b/clang/include/clang/AST/ExprObjC.h
index 4b39d9ab96a6a..17eec51726978 100644
--- a/clang/include/clang/AST/ExprObjC.h
+++ b/clang/include/clang/AST/ExprObjC.h
@@ -1639,12 +1639,12 @@ class ObjCBridgedCastExpr final
                       CastKind CK, SourceLocation BridgeKeywordLoc,
                       TypeSourceInfo *TSInfo, Expr *Operand)
       : ExplicitCastExpr(ObjCBridgedCastExprClass, TSInfo->getType(), VK_RValue,
-                         CK, Operand, 0, TSInfo),
+                         CK, Operand, 0, false, TSInfo),
         LParenLoc(LParenLoc), BridgeKeywordLoc(BridgeKeywordLoc), Kind(Kind) {}
 
   /// Construct an empty Objective-C bridged cast.
   explicit ObjCBridgedCastExpr(EmptyShell Shell)
-      : ExplicitCastExpr(ObjCBridgedCastExprClass, Shell, 0) {}
+      : ExplicitCastExpr(ObjCBridgedCastExprClass, Shell, 0, false) {}
 
   SourceLocation getLParenLoc() const { return LParenLoc; }
 
diff --git a/clang/include/clang/AST/IgnoreExpr.h b/clang/include/clang/AST/IgnoreExpr.h
index 15d31f3af9954..1c2b538e5b635 100644
--- a/clang/include/clang/AST/IgnoreExpr.h
+++ b/clang/include/clang/AST/IgnoreExpr.h
@@ -14,12 +14,13 @@
 #define LLVM_CLANG_AST_IGNOREEXPR_H
 
 #include "clang/AST/Expr.h"
+#include "clang/AST/ExprCXX.h"
 
 namespace clang {
 namespace detail {
 /// Given an expression E and functions Fn_1,...,Fn_n : Expr * -> Expr *,
 /// Return Fn_n(...(Fn_1(E)))
-inline Expr *IgnoreExprNodesImpl(Expr *E) { return E; };
+inline Expr *IgnoreExprNodesImpl(Expr *E) { return E; }
 template <typename FnTy, typename... FnTys>
 Expr *IgnoreExprNodesImpl(Expr *E, FnTy &&Fn, FnTys &&... Fns) {
   return IgnoreExprNodesImpl(Fn(E), std::forward<FnTys>(Fns)...);
@@ -38,23 +39,122 @@ template <typename... FnTys> Expr *IgnoreExprNodes(Expr *E, FnTys &&... Fns) {
   return E;
 }
 
-Expr *IgnoreImplicitCastsSingleStep(Expr *E);
+template <typename... FnTys>
+const Expr *IgnoreExprNodes(const Expr *E, FnTys &&...Fns) {
+  return IgnoreExprNodes(const_cast<Expr *>(E), std::forward<FnTys>(Fns)...);
+}
+
+inline Expr *IgnoreImplicitCastsSingleStep(Expr *E) {
+  if (auto *ICE = dyn_cast<ImplicitCastExpr>(E))
+    return ICE->getSubExpr();
+
+  if (auto *FE = dyn_cast<FullExpr>(E))
+    return FE->getSubExpr();
+
+  return E;
+}
+
+inline Expr *IgnoreImplicitCastsExtraSingleStep(Expr *E) {
+  // FIXME: Skip MaterializeTemporaryExpr and SubstNonTypeTemplateParmExpr in
+  // addition to what IgnoreImpCasts() skips to account for the current
+  // behaviour of IgnoreParenImpCasts().
+  Expr *SubE = IgnoreImplicitCastsSingleStep(E);
+  if (SubE != E)
+    return SubE;
+
+  if (auto *MTE = dyn_cast<MaterializeTemporaryExpr>(E))
+    return MTE->getSubExpr();
+
+  if (auto *NTTP = dyn_cast<SubstNonTypeTemplateParmExpr>(E))
+    return NTTP->getReplacement();
+
+  return E;
+}
+
+inline Expr *IgnoreCastsSingleStep(Expr *E) {
+  if (auto *CE = dyn_cast<CastExpr>(E))
+    return CE->getSubExpr();
+
+  if (auto *FE = dyn_cast<FullExpr>(E))
+    return FE->getSubExpr();
+
+  if (auto *MTE = dyn_cast<MaterializeTemporaryExpr>(E))
+    return MTE->getSubExpr();
+
+  if (auto *NTTP = dyn_cast<SubstNonTypeTemplateParmExpr>(E))
+    return NTTP->getReplacement();
+
+  return E;
+}
+
+inline Expr *IgnoreLValueCastsSingleStep(Expr *E) {
+  // Skip what IgnoreCastsSingleStep skips, except that only
+  // lvalue-to-rvalue casts are skipped.
+  if (auto *CE = dyn_cast<CastExpr>(E))
+    if (CE->getCastKind() != CK_LValueToRValue)
+      return E;
 
-Expr *IgnoreImplicitCastsExtraSingleStep(Expr *E);
+  return IgnoreCastsSingleStep(E);
+}
+
+inline Expr *IgnoreBaseCastsSingleStep(Expr *E) {
+  if (auto *CE = dyn_cast<CastExpr>(E))
+    if (CE->getCastKind() == CK_DerivedToBase ||
+        CE->getCastKind() == CK_UncheckedDerivedToBase ||
+        CE->getCastKind() == CK_NoOp)
+      return CE->getSubExpr();
+
+  return E;
+}
+
+inline Expr *IgnoreImplicitSingleStep(Expr *E) {
+  Expr *SubE = IgnoreImplicitCastsSingleStep(E);
+  if (SubE != E)
+    return SubE;
+
+  if (auto *MTE = dyn_cast<MaterializeTemporaryExpr>(E))
+    return MTE->getSubExpr();
+
+  if (auto *BTE = dyn_cast<CXXBindTemporaryExpr>(E))
+    return BTE->getSubExpr();
+
+  return E;
+}
+
+inline Expr *IgnoreImplicitAsWrittenSingleStep(Expr *E) {
+  if (auto *ICE = dyn_cast<ImplicitCastExpr>(E))
+    return ICE->getSubExprAsWritten();
 
-Expr *IgnoreCastsSingleStep(Expr *E);
+  return IgnoreImplicitSingleStep(E);
+}
 
-Expr *IgnoreLValueCastsSingleStep(Expr *E);
+inline Expr *IgnoreParensOnlySingleStep(Expr *E) {
+  if (auto *PE = dyn_cast<ParenExpr>(E))
+    return PE->getSubExpr();
+  return E;
+}
 
-Expr *IgnoreBaseCastsSingleStep(Expr *E);
+inline Expr *IgnoreParensSingleStep(Expr *E) {
+  if (auto *PE = dyn_cast<ParenExpr>(E))
+    return PE->getSubExpr();
 
-Expr *IgnoreImplicitSingleStep(Expr *E);
+  if (auto *UO = dyn_cast<UnaryOperator>(E)) {
+    if (UO->getOpcode() == UO_Extension)
+      return UO->getSubExpr();
+  }
 
-Expr *IgnoreImplicitAsWrittenSingleStep(Expr *E);
+  else if (auto *GSE = dyn_cast<GenericSelectionExpr>(E)) {
+    if (!GSE->isResultDependent())
+      return GSE->getResultExpr();
+  }
 
-Expr *IgnoreParensOnlySingleStep(Expr *E);
+  else if (auto *CE = dyn_cast<ChooseExpr>(E)) {
+    if (!CE->isConditionDependent())
+      return CE->getChosenSubExpr();
+  }
 
-Expr *IgnoreParensSingleStep(Expr *E);
+  return E;
+}
 
 } // namespace clang
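A minimal sketch (hypothetical helper, not part of the patch) of how the
single-step functions compose: IgnoreExprNodes re-applies each step until the
node stops changing, so two steps approximate Expr::IgnoreParenImpCasts():

  static clang::Expr *ignoreParensAndImplicit(clang::Expr *E) {
    return clang::IgnoreExprNodes(E, clang::IgnoreParensSingleStep,
                                  clang::IgnoreImplicitCastsExtraSingleStep);
  }
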
 
diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
index 35ab8ff39efa8..d101fcf214b5e 100644
--- a/clang/include/clang/AST/OpenMPClause.h
+++ b/clang/include/clang/AST/OpenMPClause.h
@@ -7856,6 +7856,23 @@ class OMPTraitInfo {
   /// Return a string representation identifying this context selector.
   std::string getMangledName() const;
 
+  /// Check the extension trait \p TP is active.
+  bool isExtensionActive(llvm::omp::TraitProperty TP) {
+    for (const OMPTraitSet &Set : Sets) {
+      if (Set.Kind != llvm::omp::TraitSet::implementation)
+        continue;
+      for (const OMPTraitSelector &Selector : Set.Selectors) {
+        if (Selector.Kind != llvm::omp::TraitSelector::implementation_extension)
+          continue;
+        for (const OMPTraitProperty &Property : Selector.Properties) {
+          if (Property.Kind == TP)
+            return true;
+        }
+      }
+    }
+    return false;
+  }
+
   /// Print a human readable representation into \p OS.
   void print(llvm::raw_ostream &OS, const PrintingPolicy &Policy) const;
 };
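A hypothetical caller sketch (not part of the patch; the TraitProperty
enumerator name is assumed from llvm/Frontend/OpenMP's generated kinds):

  static bool allowsTemplates(clang::OMPTraitInfo &TI) {
    return TI.isExtensionActive(
        llvm::omp::TraitProperty::implementation_extension_allow_templates);
  }
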
diff --git a/clang/include/clang/AST/Stmt.h b/clang/include/clang/AST/Stmt.h
index 726c61cb0126b..4a6e8182e5a06 100644
--- a/clang/include/clang/AST/Stmt.h
+++ b/clang/include/clang/AST/Stmt.h
@@ -521,6 +521,9 @@ class alignas(void *) Stmt {
     unsigned Kind : 6;
     unsigned PartOfExplicitCast : 1; // Only set for ImplicitCastExpr.
 
+    /// True if the cast expression has some floating-point features.
+    unsigned HasFPFeatures : 1;
+
     /// The number of CXXBaseSpecifiers in the cast. 14 bits would be enough
     /// here. ([implimits] Direct and indirect base classes [16384]).
     unsigned BasePathSize;
@@ -1098,6 +1101,14 @@ class alignas(void *) Stmt {
   /// de-serialization).
   struct EmptyShell {};
 
+  /// The likelihood of a branch being taken.
+  enum Likelihood {
+    LH_Unlikely = -1, ///< Branch has the [[unlikely]] attribute.
+    LH_None,          ///< No attribute set or branches of the IfStmt have
+                      ///< the same attribute.
+    LH_Likely         ///< Branch has the [[likely]] attribute.
+  };
+
 protected:
   /// Iterator for iterating over Stmt * arrays that contain only T *.
   ///
@@ -1166,6 +1177,20 @@ class alignas(void *) Stmt {
   static void EnableStatistics();
   static void PrintStats();
 
+  /// \returns the likelihood of a statement.
+  static Likelihood getLikelihood(const Stmt *S);
+
+  /// \returns the likelihood of the 'then' branch of an 'if' statement. The
+  /// 'else' branch is required to determine whether both branches specify the
+  /// same likelihood, which affects the result.
+  static Likelihood getLikelihood(const Stmt *Then, const Stmt *Else);
+
+  /// \returns whether the likelihoods of the branches of an if statement
+  /// conflict. When the first element is \c true there's a conflict, and
+  /// the Attrs are the conflicting attributes of the Then and Else Stmt.
+  static std::tuple
+  determineLikelihoodConflict(const Stmt *Then, const Stmt *Else);
+
   /// Dumps the specified AST fragment and all subtrees to
   /// \c llvm::errs().
   void dump() const;
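A sketch (not part of the patch) of how a consumer such as CodeGen might
consult the new likelihood API when lowering an if statement:

  static void emitBranchHint(const clang::IfStmt *If) {
    clang::Stmt::Likelihood LH =
        clang::Stmt::getLikelihood(If->getThen(), If->getElse());
    if (LH == clang::Stmt::LH_Likely) {
      // ... weight the branch toward the 'then' successor ...
    }
  }
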
diff --git a/clang/include/clang/AST/TextNodeDumper.h b/clang/include/clang/AST/TextNodeDumper.h
index f68a5dbfc2a0d..15ca348f47667 100644
--- a/clang/include/clang/AST/TextNodeDumper.h
+++ b/clang/include/clang/AST/TextNodeDumper.h
@@ -270,6 +270,7 @@ class TextNodeDumper
   void VisitCXXBoolLiteralExpr(const CXXBoolLiteralExpr *Node);
   void VisitCXXThisExpr(const CXXThisExpr *Node);
   void VisitCXXFunctionalCastExpr(const CXXFunctionalCastExpr *Node);
+  void VisitCXXStaticCastExpr(const CXXStaticCastExpr *Node);
   void VisitCXXUnresolvedConstructExpr(const CXXUnresolvedConstructExpr *Node);
   void VisitCXXConstructExpr(const CXXConstructExpr *Node);
   void VisitCXXBindTemporaryExpr(const CXXBindTemporaryExpr *Node);
diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h
index f5c4fe63182ff..bd89906eadb0f 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchers.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchers.h
@@ -334,6 +334,19 @@ AST_MATCHER_P(Stmt, isExpandedFromMacro, llvm::StringRef, MacroName) {
 /// \endcode
 extern const internal::VariadicAllOfMatcher<Decl> decl;
 
+/// Matches decomposition-declarations.
+///
+/// Example matches the declaration node with \c foo and \c bar, but not
+/// \c number.
+/// (matcher = declStmt(has(decompositionDecl())))
+///
+/// \code
+///   int number = 42;
+///   auto [foo, bar] = std::make_pair(42, 42);
+/// \endcode
+extern const internal::VariadicAllOfMatcher<DecompositionDecl>
+    decompositionDecl;
+
 /// Matches a declaration of a linkage specification.
 ///
 /// Given
@@ -4349,6 +4362,103 @@ AST_POLYMORPHIC_MATCHER_P2(forEachArgumentWithParam,
   return Matched;
 }
 
+/// Matches all arguments and their respective types for a \c CallExpr or
+/// \c CXXConstructExpr. It is very similar to \c forEachArgumentWithParam but
+/// it works on calls through function pointers as well.
+///
+/// The difference is that function pointers do not provide access to a
+/// \c ParmVarDecl, but only the \c QualType for each argument.
+///
+/// Given
+/// \code
+///   void f(int i);
+///   int y;
+///   f(y);
+///   void (*f_ptr)(int) = f;
+///   f_ptr(y);
+/// \endcode
+/// callExpr(
+///   forEachArgumentWithParamType(
+///     declRefExpr(to(varDecl(hasName("y")))),
+///     qualType(isInteger()).bind("type)
+/// ))
+///   matches f(y) and f_ptr(y)
+/// with declRefExpr(...)
+///   matching int y
+/// and qualType(...)
+///   matching int
+AST_POLYMORPHIC_MATCHER_P2(forEachArgumentWithParamType,
+                           AST_POLYMORPHIC_SUPPORTED_TYPES(CallExpr,
+                                                           CXXConstructExpr),
+                           internal::Matcher<Expr>, ArgMatcher,
+                           internal::Matcher<QualType>, ParamMatcher) {
+  BoundNodesTreeBuilder Result;
+  // The first argument of an overloaded member operator is the implicit object
+  // argument of the method which should not be matched against a parameter, so
+  // we skip over it here.
+  BoundNodesTreeBuilder Matches;
+  unsigned ArgIndex = cxxOperatorCallExpr(callee(cxxMethodDecl()))
+                              .matches(Node, Finder, &Matches)
+                          ? 1
+                          : 0;
+
+  const FunctionProtoType *FProto = nullptr;
+
+  if (const auto *Call = dyn_cast<CallExpr>(&Node)) {
+    if (const auto *Value =
+            dyn_cast_or_null<ValueDecl>(Call->getCalleeDecl())) {
+      QualType QT = Value->getType().getCanonicalType();
+
+      // This does not necessarily lead to a `FunctionProtoType`,
+      // e.g. K&R functions do not have a function prototype.
+      if (QT->isFunctionPointerType())
+        FProto = QT->getPointeeType()->getAs<FunctionProtoType>();
+
+      if (QT->isMemberFunctionPointerType()) {
+        const auto *MP = QT->getAs<MemberPointerType>();
+        assert(MP && "Must be a member pointer if it's a member function "
+                     "pointer");
+        FProto = MP->getPointeeType()->getAs<FunctionProtoType>();
+        assert(FProto &&
+               "The call must have happened through a member function "
+               "pointer");
+      }
+    }
+  }
+
+  int ParamIndex = 0;
+  bool Matched = false;
+
+  for (; ArgIndex < Node.getNumArgs(); ++ArgIndex, ++ParamIndex) {
+    BoundNodesTreeBuilder ArgMatches(*Builder);
+    if (ArgMatcher.matches(*(Node.getArg(ArgIndex)->IgnoreParenCasts()), Finder,
+                           &ArgMatches)) {
+      BoundNodesTreeBuilder ParamMatches(ArgMatches);
+
+      // This test is cheaper compared to the big matcher in the next if.
+      // Therefore, please keep this order.
+      if (FProto) {
+        QualType ParamType = FProto->getParamType(ParamIndex);
+        if (ParamMatcher.matches(ParamType, Finder, &ParamMatches)) {
+          Result.addMatch(ParamMatches);
+          Matched = true;
+          continue;
+        }
+      }
+      if (expr(anyOf(cxxConstructExpr(hasDeclaration(cxxConstructorDecl(
+                         hasParameter(ParamIndex, hasType(ParamMatcher))))),
+                     callExpr(callee(functionDecl(
+                         hasParameter(ParamIndex, hasType(ParamMatcher)))))))
+              .matches(Node, Finder, &ParamMatches)) {
+        Result.addMatch(ParamMatches);
+        Matched = true;
+        continue;
+      }
+    }
+  }
+  *Builder = std::move(Result);
+  return Matched;
+}
+
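A minimal usage sketch (not part of the patch), binding the parameter type of
each matching argument exactly as in the documentation above:

  using namespace clang::ast_matchers;
  static const auto IntArgThroughPointer = callExpr(
      forEachArgumentWithParamType(declRefExpr(to(varDecl(hasName("y")))),
                                   qualType(isInteger()).bind("type")));
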
 /// Matches the ParmVarDecl nodes that are at the N'th position in the parameter
 /// list. The parameter list could be that of either a block, function, or
 /// objc-method.
@@ -4769,7 +4879,9 @@ AST_MATCHER_P(ArraySubscriptExpr, hasBase,
 }
 
 /// Matches a 'for', 'while', 'do while' statement or a function
-/// definition that has a given body.
+/// definition that has a given body. Note that in the case of functions
+/// this matcher only matches the definition itself and not the other
+/// declarations of the same function.
 ///
 /// Given
 /// \code
@@ -4779,6 +4891,18 @@ AST_MATCHER_P(ArraySubscriptExpr, hasBase,
 ///   matches 'for (;;) {}'
 /// with compoundStmt()
 ///   matching '{}'
+///
+/// Given
+/// \code
+///   void f();
+///   void f() {}
+/// \endcode
+/// hasBody(functionDecl())
+///   matches 'void f() {}'
+/// with compoundStmt()
+///   matching '{}'
+///   but does not match 'void f();'
 AST_POLYMORPHIC_MATCHER_P(hasBody,
                           AST_POLYMORPHIC_SUPPORTED_TYPES(DoStmt, ForStmt,
                                                           WhileStmt,
@@ -4790,6 +4914,30 @@ AST_POLYMORPHIC_MATCHER_P(hasBody,
           InnerMatcher.matches(*Statement, Finder, Builder));
 }
 
+/// Matches a function declaration that has a given body present in the AST.
+/// Note that this matcher matches all the declarations of a function whose
+/// body is present in the AST.
+///
+/// Given
+/// \code
+///   void f();
+///   void f() {}
+///   void g();
+/// \endcode
+/// hasAnyBody(functionDecl())
+///   matches both 'void f();'
+///   and 'void f() {}'
+/// with compoundStmt()
+///   matching '{}'
+///   but does not match 'void g();'
+AST_MATCHER_P(FunctionDecl, hasAnyBody,
+              internal::Matcher, InnerMatcher) {
+  const Stmt *const Statement = Node.getBody();
+  return (Statement != nullptr &&
+          InnerMatcher.matches(*Statement, Finder, Builder));
+}
+
+
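A sketch (not part of the patch) contrasting the two matchers on
'void f(); void f() {}':

  using namespace clang::ast_matchers;
  static const auto DefinitionOnly = functionDecl(hasBody(compoundStmt()));
  static const auto AnyRedeclWithBody =
      functionDecl(hasAnyBody(compoundStmt()));
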
 /// Matches compound statements where at least one substatement matches
 /// a given matcher. Also matches StmtExprs that have CompoundStmt as children.
 ///
diff --git a/clang/include/clang/ASTMatchers/ASTMatchersInternal.h b/clang/include/clang/ASTMatchers/ASTMatchersInternal.h
index 09774b3c912c7..2a3f503f99516 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchersInternal.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchersInternal.h
@@ -1835,18 +1835,18 @@ struct NotEqualsBoundNodePredicate {
   DynTypedNode Node;
 };
 
+template <typename Ty, typename Enabler = void> struct GetBodyMatcher {
+  static const Stmt *get(const Ty &Node) { return Node.getBody(); }
+};
+
 template <typename Ty>
-struct GetBodyMatcher {
+struct GetBodyMatcher<
+    Ty, typename std::enable_if<std::is_base_of<FunctionDecl, Ty>::value>::type> {
   static const Stmt *get(const Ty &Node) {
-    return Node.getBody();
+    return Node.doesThisDeclarationHaveABody() ? Node.getBody() : nullptr;
   }
 };
 
-template <>
-inline const Stmt *GetBodyMatcher<FunctionDecl>::get(const FunctionDecl &Node) {
-  return Node.doesThisDeclarationHaveABody() ? Node.getBody() : nullptr;
-}
-
 template <typename Ty>
 struct HasSizeMatcher {
   static bool hasSize(const Ty &Node, unsigned int N) {
diff --git a/clang/include/clang/Analysis/Analyses/LiveVariables.h b/clang/include/clang/Analysis/Analyses/LiveVariables.h
index 2e7dd5d81678a..8a3dd0c35e64c 100644
--- a/clang/include/clang/Analysis/Analyses/LiveVariables.h
+++ b/clang/include/clang/Analysis/Analyses/LiveVariables.h
@@ -30,22 +30,22 @@ class LiveVariables : public ManagedAnalysis {
   class LivenessValues {
   public:
 
-    llvm::ImmutableSet<const Stmt *> liveStmts;
+    llvm::ImmutableSet<const Expr *> liveExprs;
     llvm::ImmutableSet<const VarDecl *> liveDecls;
     llvm::ImmutableSet<const BindingDecl *> liveBindings;
 
     bool equals(const LivenessValues &V) const;
 
     LivenessValues()
-      : liveStmts(nullptr), liveDecls(nullptr), liveBindings(nullptr) {}
+      : liveExprs(nullptr), liveDecls(nullptr), liveBindings(nullptr) {}
 
-    LivenessValues(llvm::ImmutableSet<const Stmt *> LiveStmts,
+    LivenessValues(llvm::ImmutableSet<const Expr *> liveExprs,
                    llvm::ImmutableSet<const VarDecl *> LiveDecls,
                    llvm::ImmutableSet<const BindingDecl *> LiveBindings)
-        : liveStmts(LiveStmts), liveDecls(LiveDecls),
+        : liveExprs(liveExprs), liveDecls(LiveDecls),
           liveBindings(LiveBindings) {}
 
-    bool isLive(const Stmt *S) const;
+    bool isLive(const Expr *E) const;
     bool isLive(const VarDecl *D) const;
 
     friend class LiveVariables;
@@ -83,17 +83,17 @@ class LiveVariables : public ManagedAnalysis {
   ///  only returns liveness information for block-level expressions.
   bool isLive(const Stmt *S, const VarDecl *D);
 
-  /// Returns true the block-level expression "value" is live
+  /// Returns true if the block-level expression value is live
   ///  before the given block-level expression (see runOnAllBlocks).
-  bool isLive(const Stmt *Loc, const Stmt *StmtVal);
+  bool isLive(const Stmt *Loc, const Expr *Val);
 
   /// Print to stderr the variable liveness information associated with
   /// each basic block.
   void dumpBlockLiveness(const SourceManager &M);
 
-  /// Print to stderr the statement liveness information associated with
+  /// Print to stderr the expression liveness information associated with
   /// each basic block.
-  void dumpStmtLiveness(const SourceManager &M);
+  void dumpExprLiveness(const SourceManager &M);
 
   void runOnAllBlocks(Observer &obs);
 
diff --git a/clang/include/clang/Basic/AlignedAllocation.h b/clang/include/clang/Basic/AlignedAllocation.h
index 88410c5cb51ff..ab9f19da5d598 100644
--- a/clang/include/clang/Basic/AlignedAllocation.h
+++ b/clang/include/clang/Basic/AlignedAllocation.h
@@ -33,6 +33,8 @@ inline llvm::VersionTuple alignedAllocMinVersion(llvm::Triple::OSType OS) {
     return llvm::VersionTuple(11U);
   case llvm::Triple::WatchOS: // Earliest supporting version is 4.0.0.
     return llvm::VersionTuple(4U);
+  case llvm::Triple::ZOS:
+    return llvm::VersionTuple(); // All z/OS versions have no support.
   }
 
   llvm_unreachable("Unexpected OS");
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 6e1d15bed74e6..946b43cd79a15 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -836,6 +836,7 @@ static llvm::StringRef getPlatformNameSourceSpelling(llvm::StringRef Platform) {
              .Case("macos_app_extension", "macOSApplicationExtension")
              .Case("tvos_app_extension", "tvOSApplicationExtension")
              .Case("watchos_app_extension", "watchOSApplicationExtension")
+             .Case("zos", "z/OS")
              .Default(Platform);
 }
 static llvm::StringRef canonicalizePlatformName(llvm::StringRef Platform) {
@@ -1441,6 +1442,18 @@ def FallThrough : StmtAttr {
   let Documentation = [FallthroughDocs];
 }
 
+def Likely : StmtAttr {
+  // FIXME: Change the date to 201803 once the implementation is finished.
+  let Spellings = [CXX11<"", "likely", 2>, C2x<"clang", "likely">];
+  let Documentation = [LikelihoodDocs];
+}
+
+def Unlikely : StmtAttr {
+  // FIXME: Change the date to 201803 once the implementation is finished.
+  let Spellings = [CXX11<"", "unlikely", 2>, C2x<"clang", "unlikely">];
+  let Documentation = [LikelihoodDocs];
+}
+
 def NoMerge : StmtAttr {
   let Spellings = [Clang<"nomerge">];
   let Documentation = [NoMergeDocs];
@@ -2607,6 +2620,37 @@ def Regparm : TypeAttr {
   let ASTNode = 0;
 }
 
+def SwiftBridge : InheritableAttr {
+  let Spellings = [GNU<"swift_bridge">];
+  let Args = [StringArgument<"SwiftType">];
+  let Subjects = SubjectList<[Tag, TypedefName, ObjCInterface, ObjCProtocol],
+                             ErrorDiag>;
+  let Documentation = [SwiftBridgeDocs];
+}
+
+def SwiftBridgedTypedef : InheritableAttr {
+  let Spellings = [GNU<"swift_bridged_typedef">];
+  let Subjects = SubjectList<[TypedefName], ErrorDiag>;
+  let Documentation = [SwiftBridgedTypedefDocs];
+}
+
+def SwiftObjCMembers : Attr {
+  let Spellings = [GNU<"swift_objc_members">];
+  let Subjects = SubjectList<[ObjCInterface], ErrorDiag>;
+  let Documentation = [SwiftObjCMembersDocs];
+}
+
+def SwiftError : InheritableAttr {
+  let Spellings = [GNU<"swift_error">];
+  let Args = [
+      EnumArgument<"Convention", "ConventionKind",
+                   ["none", "nonnull_error", "null_result", "zero_result", "nonzero_result"],
+                   ["None", "NonNullError", "NullResult", "ZeroResult", "NonZeroResult"]>
+  ];
+  let Subjects = SubjectList<[Function, ObjCMethod], ErrorDiag>;
+  let Documentation = [SwiftErrorDocs];
+}
+
 def NoDeref : TypeAttr {
   let Spellings = [Clang<"noderef">];
   let Documentation = [NoDerefDocs];
@@ -3974,3 +4018,11 @@ def ReleaseHandle : InheritableParamAttr {
   let Subjects = SubjectList<[ParmVar]>;
   let Documentation = [ReleaseHandleDocs];
 }
+
+def Builtin : InheritableAttr {
+  let Spellings = [];
+  let Args = [UnsignedArgument<"ID">];
+  let Subjects = SubjectList<[Function]>;
+  let SemaHandler = 0;
+  let Documentation = [Undocumented];
+}
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index f9bb41bf0635a..970b49ccd42fe 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -1708,6 +1708,101 @@ Here is an example:
   }];
 }
 
+def LikelihoodDocs : Documentation {
+  let Category = DocCatStmt;
+  let Heading = "likely and unlikely";
+  let Content = [{
+The ``likely`` and ``unlikely`` attributes are compiler hints: they aid the
+compiler in determining which branch is likely or unlikely to be taken. This
+is done by marking the branch substatement with one of the two attributes.
+
+A single statement may not be annotated with both ``likely`` and
+``unlikely``. Annotating the ``true`` and ``false`` branches of an ``if``
+statement with the same likelihood attribute results in a diagnostic, and
+the attributes are ignored on both branches.
+
+These attributes have no effect on the generated code when using
+PGO (Profile-Guided Optimization) or at optimization level 0.
+
+In Clang, the attributes will be ignored if they're not placed on the
+substatement of an ``if`` or ``else`` statement. The C++ Standard recommends
+honoring them on every statement in the path of execution, but that can be
+confusing:
+
+.. code-block:: c++
+
+  if (b) {
+    [[unlikely]] --b; // In the path of execution,
+                      // this branch is considered unlikely.
+  }
+
+  if (b) {
+    --b;
+    if(b)
+      return;
+    [[unlikely]] --b; // Not in the path of execution,
+  }                   // the branch has no likelihood information.
+
+  if (b) {
+    --b;
+    foo(b);
+    // Whether or not the next statement is in the path of execution depends
+    // on the declaration of foo():
+    // In the path of execution: void foo(int);
+    // Not in the path of execution: [[noreturn]] void foo(int);
+    // This means the likelihood of the branch depends on the declaration
+    // of foo().
+    [[unlikely]] --b;
+  }
+
+
+At the moment the attribute only has effect when used in an ``if`` or ``else``
+statement.
+
+.. code-block:: c++
+
+  if (b) [[likely]] { // Placement on the first statement in the branch.
+    // The compiler will optimize to execute the code here.
+  } else {
+  }
+
+  if (b)
+    [[unlikely]] b++; // Placement on the first statement in the branch.
+  else {
+    // The compiler will optimize to execute the code here.
+  }
+
+  if (b) {
+    [[unlikely]] b++; // Placement on the second statement in the branch.
+  }                   // The attribute will be ignored.
+
+  if (b) [[likely]] {
+    [[unlikely]] b++; // No contradiction since the second attribute
+  }                   // is ignored.
+
+  if (b)
+    ;
+  else [[likely]] {
+    // The compiler will optimize to execute the code here.
+  }
+
+  if (b)
+    ;
+  else
+    // The compiler will optimize to execute the next statement.
+    [[likely]] b = f();
+
+  if (b) [[likely]]; // Both branches are likely. A diagnostic is issued
+  else [[likely]];   // and the attributes are ignored.
+
+  if (b)
+    [[likely]] int i = 5; // Issues a diagnostic since the attribute
+                          // isn't allowed on a declaration.
+
+  }];
+}
+
 def ARMInterruptDocs : Documentation {
   let Category = DocCatFunction;
   let Heading = "interrupt (ARM)";
@@ -3837,6 +3932,109 @@ For example:
   }];
 }
 
+def SwiftDocs : DocumentationCategory<"Customizing Swift Import"> {
+  let Content = [{
+Clang supports additional attributes for customizing how APIs are imported into
+Swift.
+  }];
+}
+
+def SwiftBridgeDocs : Documentation {
+  let Category = SwiftDocs;
+  let Heading = "swift_bridge";
+  let Content = [{
+The ``swift_bridge`` attribute indicates that the declaration to which the
+attribute appertains is bridged to the named Swift type.
+
+  .. code-block:: c
+
+    __attribute__((__objc_root_class__))
+    @interface Base
+    - (instancetype)init;
+    @end
+
+    __attribute__((__swift_bridge__("BridgedI")))
+    @interface I : Base
+    @end
+
+In this example, the Objective-C interface ``I`` will be made available to Swift
+with the name ``BridgedI``. The compiler may still refer to ``I`` in order to
+bridge the type back to Objective-C.
+  }];
+}
+
+def SwiftBridgedTypedefDocs : Documentation {
+  let Category = SwiftDocs;
+  let Heading = "swift_bridged";
+  let Content = [{
+The ``swift_bridged_typedef`` attribute indicates that when the typedef to which
+the attribute appertains is imported into Swift, it should refer to the bridged
+Swift type (e.g. Swift's ``String``) rather than the Objective-C type as written
+(e.g. ``NSString``).
+
+  .. code-block:: c
+
+    @class NSString;
+    typedef NSString *AliasedString __attribute__((__swift_bridged_typedef__));
+
+    extern void acceptsAliasedString(AliasedString _Nonnull parameter);
+
+In this case, the function ``acceptsAliasedString`` will be imported into Swift
+as a function which accepts a ``String`` type parameter.
+  }];
+}
+
+def SwiftObjCMembersDocs : Documentation {
+  let Category = SwiftDocs;
+  let Heading = "swift_objc_members";
+  let Content = [{
+This attribute indicates that Swift subclasses and members of Swift extensions
+of this class will be implicitly marked with the ``@objcMembers`` Swift
+attribute, exposing them back to Objective-C.
+  }];
+}
+
+def SwiftErrorDocs : Documentation {
+  let Category = SwiftDocs;
+  let Heading = "swift_error";
+  let Content = [{
+The ``swift_error`` attribute controls whether a particular function (or
+Objective-C method) is imported into Swift as a throwing function, and if so,
+which dynamic convention it uses.
+
+All of these conventions except ``none`` require the function to have an error
+parameter. Currently, the error parameter is always the last parameter of type
+``NSError**`` or ``CFErrorRef*``.  Swift will remove the error parameter from
+the imported API. When calling the API, Swift will always pass a valid address
+initialized to a null pointer.
+
+* ``swift_error(none)`` means that the function should not be imported as
+throwing. The error parameter and result type will be imported normally.
+
+* ``swift_error(null_result)`` means that calls to the function should be
+considered to have thrown if they return a null value. The return type must be
+a pointer type, and it will be imported into Swift with a non-optional type.
+This is the default error convention for Objective-C methods that return
+pointers.
+
+* ``swift_error(zero_result)`` means that calls to the function should be
+considered to have thrown if they return a zero result. The return type must be
+an integral type. If the return type would have been imported as ``Bool``, it
+is instead imported as ``Void``. This is the default error convention for
+Objective-C methods that return a type that would be imported as ``Bool``.
+
+* ``swift_error(nonzero_result)`` means that calls to the function should be
+considered to have thrown if they return a non-zero result. The return type must
+be an integral type. If the return type would have been imported as ``Bool``,
+it is instead imported as ``Void``.
+
+* ``swift_error(nonnull_error)`` means that calls to the function should be
+considered to have thrown if they leave a non-null error in the error parameter.
+The return type is left unmodified.
+
+  }];
+}
+
 def OMPDeclareSimdDocs : Documentation {
   let Category = DocCatFunction;
   let Heading = "#pragma omp declare simd";
@@ -3943,12 +4141,24 @@ Clang provides the following context selector extensions, used via
     match_all
     match_any
     match_none
+    disable_implicit_base
+    allow_templates
 
 The match extensions change when the *entire* context selector is considered a
+match for an OpenMP context. The default is ``all``; with ``none``, no trait in
+the selector is allowed to be in the OpenMP context; with ``any``, a single
+trait in both the selector and the OpenMP context is sufficient. Only a single
+match extension trait is allowed per context selector.
+The disable extensions remove default effects of the ``begin declare variant``
+applied to a definition. If ``disable_implicit_base`` is given, we will not
+introduce an implicit base function for a variant if no base function was
+found. The variant is still generated but will never be called, due to the
+absence of a base function and, consequently, of any calls to a base function.
+The allow extensions change when the ``begin declare variant`` effect is
+applied to a definition. If ``allow_templates`` is given, template function
+definitions are considered as specializations of existing or assumed template
+declarations with the same name. The template parameters for the base functions
+are used to instantiate the specialization.
 
   }];
 }
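A sketch (not part of the patch) of a template definition that is only
accepted as a variant because ``allow_templates`` is active:

  #pragma omp begin declare variant match( \
      implementation = {extension(allow_templates)})
  template <typename T> T twice(T V) { return V + V; }
  #pragma omp end declare variant
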
diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
index 2a291ce1fe2c9..0d5290dd29ca0 100644
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -1020,6 +1020,7 @@ LIBBUILTIN(strncasecmp, "icC*cC*z", "f",   "strings.h", ALL_GNU_LANGUAGES)
 LIBBUILTIN(_exit, "vi",           "fr",    "unistd.h", ALL_GNU_LANGUAGES)
 LIBBUILTIN(vfork, "p",            "fj",    "unistd.h", ALL_LANGUAGES)
 // POSIX pthread.h
+// FIXME: Should specify argument types.
 LIBBUILTIN(pthread_create, "",  "fC<2,3>", "pthread.h", ALL_GNU_LANGUAGES)
 
 // POSIX setjmp.h
diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def
index 89dd03075b28f..4b97cbc092094 100644
--- a/clang/include/clang/Basic/BuiltinsPPC.def
+++ b/clang/include/clang/Basic/BuiltinsPPC.def
@@ -329,6 +329,12 @@ BUILTIN(__builtin_altivec_vexpandwm, "V4UiV4Ui", "")
 BUILTIN(__builtin_altivec_vexpanddm, "V2ULLiV2ULLi", "")
 BUILTIN(__builtin_altivec_vexpandqm, "V1ULLLiV1ULLLi", "")
 
+// P10 Vector Count with Mask built-ins.
+BUILTIN(__builtin_altivec_vcntmbb, "ULLiV16UcUi", "")
+BUILTIN(__builtin_altivec_vcntmbh, "ULLiV8UsUi", "")
+BUILTIN(__builtin_altivec_vcntmbw, "ULLiV4UiUi", "")
+BUILTIN(__builtin_altivec_vcntmbd, "ULLiV2ULLiUi", "")
+
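A usage sketch (not part of the patch): the 'ULLiV16UcUi' prototype string
decodes as unsigned long long (vector unsigned char, unsigned int), usable on
POWER10 targets (e.g. -mcpu=pwr10) with altivec enabled:

  static unsigned long long countMaskBytes(__vector unsigned char VC) {
    return __builtin_altivec_vcntmbb(VC, 1);
  }
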
 // P10 Vector Parallel Bits built-ins.
 BUILTIN(__builtin_altivec_vpdepd, "V2ULLiV2ULLiV2ULLi", "")
 BUILTIN(__builtin_altivec_vpextd, "V2ULLiV2ULLiV2ULLi", "")
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index ec77f68062e7a..b5da2a9cde1ac 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -145,7 +145,7 @@ CODEGENOPT(IncrementalLinkerCompatible, 1, 0) ///< Emit an object file which can
                                               ///< linker.
 CODEGENOPT(MergeAllConstants , 1, 1) ///< Merge identical constants.
 CODEGENOPT(MergeFunctions    , 1, 0) ///< Set when -fmerge-functions is enabled.
-CODEGENOPT(HeapProf          , 1, 0) ///< Set when -fmemory-profile is enabled.
+CODEGENOPT(MemProf           , 1, 0) ///< Set when -fmemory-profile is enabled.
 CODEGENOPT(MSVolatile        , 1, 0) ///< Set when /volatile:ms is enabled.
 CODEGENOPT(NoCommon          , 1, 0) ///< Set when -fno-common or C++ is enabled.
 CODEGENOPT(NoDwarfDirectoryAsm , 1, 0) ///< Set when -fno-dwarf-directory-asm is
@@ -162,6 +162,7 @@ CODEGENOPT(NoImplicitFloat   , 1, 0) ///< Set when -mno-implicit-float is enable
 CODEGENOPT(NullPointerIsValid , 1, 0) ///< Assume null pointer dereference is defined.
 CODEGENOPT(CorrectlyRoundedDivSqrt, 1, 0) ///< -cl-fp32-correctly-rounded-divide-sqrt
 CODEGENOPT(UniqueInternalLinkageNames, 1, 0) ///< Internal Linkage symbols get unique names.
+CODEGENOPT(SplitMachineFunctions, 1, 0) ///< Split machine functions using profile information.
 
 /// When false, this attempts to generate code as if the result of an
 /// overflowing conversion matches the overflowing behavior of a target's native
@@ -395,6 +396,10 @@ CODEGENOPT(KeepStaticConsts, 1, 0)
 /// Whether to not follow the AAPCS that enforce at least one read before storing to a volatile bitfield
 CODEGENOPT(ForceAAPCSBitfieldLoad, 1, 0)
 
+/// Assume that by-value parameters do not alias any other values.
+CODEGENOPT(PassByValueIsNoAlias, 1, 0)
+
 #undef CODEGENOPT
 #undef ENUM_CODEGENOPT
 #undef VALUE_CODEGENOPT
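
As context, this .def file is consumed through the usual X-macro pattern: each includer defines ``CODEGENOPT`` (and optionally the ENUM/VALUE variants, which otherwise fall back to it) before including the file. A sketch mirroring how the options class declares one bitfield per entry:

    struct CodeGenFlags {
    #define CODEGENOPT(Name, Bits, Default) unsigned Name : Bits;
    #include "clang/Basic/CodeGenOptions.def"
    };
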
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 13b2f97dda86c..8caa55ee14047 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -1245,3 +1245,5 @@ in addition with the pragmas or -fmax-tokens flag to get any warnings.
 }
 
 def WebAssemblyExceptionSpec : DiagGroup<"wasm-exception-spec">;
+
+def RTTI : DiagGroup<"rtti">;
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index 57e6f398f3507..befb82a5c944c 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -1293,6 +1293,11 @@ def err_omp_mapper_expected_declarator : Error<
   "expected declarator on 'omp declare mapper' directive">;
 def err_omp_declare_variant_wrong_clause : Error<
   "expected '%0' clause on 'omp declare variant' directive">;
+def err_omp_declare_variant_duplicate_nested_trait : Error<
+  "nested OpenMP context selector contains duplicated trait '%0'"
+  " in selector '%1' and set '%2' with different score">;
+def err_omp_declare_variant_nested_user_condition : Error<
+  "nested user conditions in OpenMP context selector not supported (yet)">;
 def warn_omp_declare_variant_string_literal_or_identifier
     : Warning<"expected identifier or string literal describing a context "
               "%select{set|selector|property}0; "
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 82654f9e19ef4..2ccdc126b5d31 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3182,6 +3182,9 @@ def warn_nocf_check_attribute_ignored :
 def warn_attribute_after_definition_ignored : Warning<
   "attribute %0 after definition is ignored">,
    InGroup;
+def warn_attributes_likelihood_ifstmt_conflict
+    : Warning<"conflicting attributes %0 are ignored">,
+      InGroup;
 def warn_cxx11_gnu_attribute_on_type : Warning<
   "attribute %0 ignored, because it cannot be applied to a type">,
   InGroup;
@@ -4012,6 +4015,13 @@ def err_objc_bridged_related_known_method : Error<
 def err_objc_attr_protocol_requires_definition : Error<
   "attribute %0 can only be applied to @protocol definitions, not forward declarations">;
 
+def err_attr_swift_error_no_error_parameter : Error<
+  "%0 attribute can only be applied to a %select{function|method}1 with an "
+  "error parameter">;
+def err_attr_swift_error_return_type : Error<
+  "%0 attribute with '%1' convention can only be applied to a "
+  "%select{function|method}2 returning %select{an integral type|a pointer}3">;
+
 def warn_ignored_objc_externally_retained : Warning<
   "'objc_externally_retained' can only be applied to local variables "
   "%select{of retainable type|with strong ownership}0">,
@@ -5133,6 +5143,9 @@ def err_fold_expression_empty : Error<
   "with no fallback value">;
 def err_fold_expression_bad_operand : Error<
   "expression not permitted as operand of fold expression">;
+def err_fold_expression_limit_exceeded: Error<
+  "instantiating fold expression with %0 arguments exceeded expression nesting "
+  "limit of %1">, DefaultFatal, NoSFINAE;
 
 def err_unexpected_typedef : Error<
   "unexpected type name %0: expected expression">;
@@ -7247,8 +7260,8 @@ def warn_overaligned_type : Warning<
   "guarantees %2 bytes">,
   InGroup, DefaultIgnore;
 def err_aligned_allocation_unavailable : Error<
-  "aligned %select{allocation|deallocation}0 function of type '%1' is only "
-  "available on %2 %3 or newer">;
+  "aligned %select{allocation|deallocation}0 function of type '%1' is "
+  "%select{only|not}4 available on %2%select{ %3 or newer|}4">;
 def note_silence_aligned_allocation_unavailable : Note<
   "if you supply your own aligned allocation functions, use "
   "-faligned-allocation to silence this diagnostic">;
@@ -7481,6 +7494,12 @@ def err_no_typeid_with_fno_rtti : Error<
   "use of typeid requires -frtti">;
 def err_no_dynamic_cast_with_fno_rtti : Error<
   "use of dynamic_cast requires -frtti">;
+def warn_no_dynamic_cast_with_rtti_disabled : Warning<
+  "dynamic_cast will not work since RTTI data is disabled by "
+  "%select{-fno-rtti-data|/GR-}0">, InGroup<RTTI>;
+def warn_no_typeid_with_rtti_disabled : Warning<
+  "typeid will not work since RTTI data is disabled by "
+  "%select{-fno-rtti-data|/GR-}0">, InGroup<RTTI>;
 
 def err_cannot_form_pointer_to_member_of_reference_type : Error<
   "cannot form a pointer-to-member to member %0 of reference type %1">;
@@ -10393,10 +10412,6 @@ def err_omp_non_lvalue_in_map_or_motion_clauses: Error<
   "expected addressable lvalue in '%0' clause">;
 def err_omp_var_expected : Error<
   "expected variable of the '%0' type%select{|, not %2}1">;
-def warn_nested_declare_variant
-    : Warning<"nesting `omp begin/end declare variant` is not supported yet; "
-              "nested context ignored">,
-      InGroup;
 def warn_unknown_declare_variant_isa_trait
     : Warning<"isa trait '%0' is not known to the current target; verify the "
               "spelling or consider restricting the context selector with the "
diff --git a/clang/include/clang/Basic/IdentifierTable.h b/clang/include/clang/Basic/IdentifierTable.h
index fc554a35e721b..204a0f0cc0a5d 100644
--- a/clang/include/clang/Basic/IdentifierTable.h
+++ b/clang/include/clang/Basic/IdentifierTable.h
@@ -225,18 +225,6 @@ class alignas(IdentifierInfoAlignment) IdentifierInfo {
   }
   void setObjCKeywordID(tok::ObjCKeywordKind ID) { ObjCOrBuiltinID = ID; }
 
-  /// True if setNotBuiltin() was called.
-  bool hasRevertedBuiltin() const {
-    return ObjCOrBuiltinID == tok::NUM_OBJC_KEYWORDS;
-  }
-
-  /// Revert the identifier to a non-builtin identifier. We do this if
-  /// the name of a known builtin library function is used to declare that
-  /// function, but an unexpected type is specified.
-  void revertBuiltin() {
-    setBuiltinID(0);
-  }
-
   /// Return a value indicating whether this is a builtin function.
   ///
   /// 0 is not-built-in. 1+ are specific builtin functions.
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index 9d51eb7ba597d..d60ec3d1f32b8 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -119,6 +119,7 @@ class LangOptions : public LangOptionsBase {
     MSVC2017 = 1910,
     MSVC2017_5 = 1912,
     MSVC2017_7 = 1914,
+    MSVC2019 = 1920,
   };
 
   enum class SYCLVersionList {
@@ -504,6 +505,8 @@ class FPOptionsOverride {
   FPOptionsOverride() {}
   FPOptionsOverride(const LangOptions &LO)
       : Options(LO), OverrideMask(OverrideMaskBits) {}
+  FPOptionsOverride(FPOptions FPO)
+      : Options(FPO), OverrideMask(OverrideMaskBits) {}
 
   bool requiresTrailingStorage() const { return OverrideMask != 0; }
 
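
A minimal sketch of the new converting constructor in use; because it sets the full override mask, every option carried by the ``FPOptions`` value is treated as explicitly overridden (function name is illustrative):

    #include "clang/Basic/LangOptions.h"

    void seedOverrides(clang::FPOptions FPO) {
      clang::FPOptionsOverride Overrides(FPO); // all override bits set
      bool B = Overrides.requiresTrailingStorage(); // true: mask is non-zero
      (void)B;
    }
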
diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h
index 73767da632430..c1fd06ea0ebe8 100644
--- a/clang/include/clang/Driver/Driver.h
+++ b/clang/include/clang/Driver/Driver.h
@@ -301,7 +301,7 @@ class Driver {
                                       StringRef CustomResourceDir = "");
 
   Driver(StringRef ClangExecutable, StringRef TargetTriple,
-         DiagnosticsEngine &Diags,
+         DiagnosticsEngine &Diags, std::string Title = "clang LLVM compiler",
          IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS = nullptr);
 
   /// @name Accessors
diff --git a/clang/include/clang/Driver/Options.h b/clang/include/clang/Driver/Options.h
index 9831efda4e580..06dd3652be940 100644
--- a/clang/include/clang/Driver/Options.h
+++ b/clang/include/clang/Driver/Options.h
@@ -34,7 +34,9 @@ enum ClangFlags {
   CC1AsOption = (1 << 11),
   NoDriverOption = (1 << 12),
   LinkOption = (1 << 13),
-  Ignored = (1 << 14),
+  FlangOption = (1 << 14),
+  FC1Option = (1 << 15),
+  Ignored = (1 << 16),
 };
 
 enum ID {
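
The flag enumerators are disjoint bits, so driver code can test an option's visibility with a simple mask; an illustrative helper (not part of the patch):

    #include "clang/Driver/Options.h"

    bool visibleToFlang(unsigned Flags) {
      using namespace clang::driver::options;
      return (Flags & (FlangOption | FC1Option)) != 0;
    }
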
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 6021c063ed232..20543d6c5d91c 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -56,6 +56,13 @@ def NoDriverOption : OptionFlag;
 // be used), add this flag.
 def LinkOption : OptionFlag;
 
+// FlangOption - This is considered a "core" Flang option, available in
+// flang mode.
+def FlangOption : OptionFlag;
+
+// FC1Option - This option should be accepted by flang -fc1.
+def FC1Option : OptionFlag;
+
 // A short name to show in documentation. The name will be interpreted as rST.
 class DocName { string DocName = name; }
 
@@ -2052,6 +2059,9 @@ defm unique_internal_linkage_names : OptInFFlag<"unique-internal-linkage-names",
 defm unique_section_names : OptOutFFlag<"unique-section-names",
   "", "Don't use unique names for text and data sections">;
 
+defm split_machine_functions: OptInFFlag<"split-machine-functions",
+  "Enable", "Disable", " late function splitting using profile information (x86 ELF)">;
+
 defm strict_return : OptOutFFlag<"strict-return", "",
   "Don't treat control flow paths that fall off the end of a non-void function as unreachable">;
 
@@ -2163,7 +2173,7 @@ def gno_embed_source : Flag<["-"], "gno-embed-source">, Group,
     Flags<[DriverOption]>,
     HelpText<"Restore the default behavior of not embedding source text in DWARF debug sections">;
 def headerpad__max__install__names : Joined<["-"], "headerpad_max_install_names">;
-def help : Flag<["-", "--"], "help">, Flags<[CC1Option,CC1AsOption]>,
+def help : Flag<["-", "--"], "help">, Flags<[CC1Option, CC1AsOption, FC1Option, FlangOption]>,
   HelpText<"Display available options">;
 def ibuiltininc : Flag<["-"], "ibuiltininc">,
   HelpText<"Enable builtin #include directories even when -nostdinc is used "
@@ -2438,6 +2448,9 @@ def mfix_cortex_a53_835769 : Flag<["-"], "mfix-cortex-a53-835769">,
 def mno_fix_cortex_a53_835769 : Flag<["-"], "mno-fix-cortex-a53-835769">,
   Group,
   HelpText<"Don't workaround Cortex-A53 erratum 835769 (AArch64 only)">;
+def mmark_bti_property : Flag<["-"], "mmark-bti-property">,
+  Group,
+  HelpText<"Add .note.gnu.property with BTI to assembly files (AArch64 only)">;
 foreach i = {1-31} in
   def ffixed_x#i : Flag<["-"], "ffixed-x"#i>, Group,
     HelpText<"Reserve the x"#i#" register (AArch64/RISC-V only)">;
@@ -3114,7 +3127,8 @@ def _rtlib : Separate<["--"], "rtlib">, Alias;
 def _serialize_diags : Separate<["-", "--"], "serialize-diagnostics">, Flags<[DriverOption]>,
   HelpText<"Serialize compiler diagnostics to a file">;
 // We give --version different semantics from -version.
-def _version : Flag<["--"], "version">, Flags<[CoreOption, CC1Option]>,
+def _version : Flag<["--"], "version">,
+  Flags<[CoreOption, CC1Option, FC1Option, FlangOption]>,
   HelpText<"Print version information">;
 def _signed_char : Flag<["--"], "signed-char">, Alias;
 def _std : Separate<["--"], "std">, Alias;
@@ -4407,6 +4421,9 @@ def fno_signed_wchar : Flag<["-"], "fno-signed-wchar">,
 def fcompatibility_qualified_id_block_param_type_checking : Flag<["-"], "fcompatibility-qualified-id-block-type-checking">,
   HelpText<"Allow using blocks with parameters of more specific type than "
            "the type system guarantees when a parameter is qualified id">;
+def fpass_by_value_is_noalias: Flag<["-"], "fpass-by-value-is-noalias">,
+  HelpText<"Allows assuming by-value parameters do not alias any other value. "
+           "Has no effect on non-trivially-copyable classes in C++.">, Group;
 
 // FIXME: Remove these entirely once functionality/tests have been excised.
 def fobjc_gc_only : Flag<["-"], "fobjc-gc-only">, Group,
diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h
index 95d6bcf35c786..ac2b817be1dc5 100644
--- a/clang/include/clang/Driver/SanitizerArgs.h
+++ b/clang/include/clang/Driver/SanitizerArgs.h
@@ -55,7 +55,7 @@ class SanitizerArgs {
   bool MinimalRuntime = false;
   // True if cross-dso CFI support if provided by the system (i.e. Android).
   bool ImplicitCfiRuntime = false;
-  bool NeedsHeapProfRt = false;
+  bool NeedsMemProfRt = false;
 
 public:
   /// Parses the sanitizer arguments from an argument list.
@@ -63,7 +63,7 @@ class SanitizerArgs {
 
   bool needsSharedRt() const { return SharedRuntime; }
 
-  bool needsHeapProfRt() const { return NeedsHeapProfRt; }
+  bool needsMemProfRt() const { return NeedsMemProfRt; }
   bool needsAsanRt() const { return Sanitizers.has(SanitizerKind::Address); }
   bool needsHwasanRt() const {
     return Sanitizers.has(SanitizerKind::HWAddress);
diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index 6bb828d60071f..c6c182b7bdcef 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -1860,7 +1860,7 @@ struct FormatStyle {
   bool ObjCSpaceAfterProperty;
 
   /// Break parameters list into lines when there is nested block
-  /// parameters in a fuction call.
+  /// parameters in a function call.
   /// \code
   ///   false:
   ///    - (void)_aMethod
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 66f22732e29cd..c48a1c3f4a3c3 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -3106,7 +3106,8 @@ class Parser : public CodeCompletionHandler {
 
   /// Parse a `match` clause for an '#pragma omp declare variant'. Return true
   /// if there was an error.
-  bool parseOMPDeclareVariantMatchClause(SourceLocation Loc, OMPTraitInfo &TI);
+  bool parseOMPDeclareVariantMatchClause(SourceLocation Loc, OMPTraitInfo &TI,
+                                         OMPTraitInfo *ParentTI);
 
   /// Parse clauses for '#pragma omp declare variant'.
   void ParseOMPDeclareVariantClauses(DeclGroupPtrTy Ptr, CachedTokens &Toks,
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 2632a92f91764..cba10a911a3a9 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -4126,6 +4126,8 @@ class Sema final {
   ObjCInterfaceDecl *getObjCInterfaceDecl(IdentifierInfo *&Id,
                                           SourceLocation IdLoc,
                                           bool TypoCorrection = false);
+  FunctionDecl *CreateBuiltin(IdentifierInfo *II, QualType Type, unsigned ID,
+                              SourceLocation Loc);
   NamedDecl *LazilyCreateBuiltin(IdentifierInfo *II, unsigned ID,
                                  Scope *S, bool ForRedeclaration,
                                  SourceLocation Loc);
@@ -10195,21 +10197,27 @@ class Sema final {
     OMPDeclareVariantScope(OMPTraitInfo &TI);
   };
 
+  /// Return the OMPTraitInfo for the surrounding scope, if any.
+  OMPTraitInfo *getOMPTraitInfoForSurroundingScope() {
+    return OMPDeclareVariantScopes.empty() ? nullptr
+                                           : OMPDeclareVariantScopes.back().TI;
+  }
+
   /// The current `omp begin/end declare variant` scopes.
   SmallVector<OMPDeclareVariantScope, 4> OMPDeclareVariantScopes;
 
   /// The declarator \p D defines a function in the scope \p S which is nested
   /// in an `omp begin/end declare variant` scope. In this method we create a
   /// declaration for \p D and rename \p D according to the OpenMP context
-  /// selector of the surrounding scope.
-  FunctionDecl *
-  ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope(Scope *S,
-                                                            Declarator &D);
+  /// selector of the surrounding scope. Return all base functions in \p Bases.
+  void ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope(
+      Scope *S, Declarator &D, MultiTemplateParamsArg TemplateParameterLists,
+      SmallVectorImpl<FunctionDecl *> &Bases);
 
-  /// Register \p FD as specialization of \p BaseFD in the current `omp
-  /// begin/end declare variant` scope.
+  /// Register \p D as specialization of all base functions in \p Bases in the
+  /// current `omp begin/end declare variant` scope.
   void ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(
-      FunctionDecl *FD, FunctionDecl *BaseFD);
+      Decl *D, SmallVectorImpl<FunctionDecl *> &Bases);
 
 public:
 
@@ -12597,6 +12605,7 @@ class Sema final {
 
   /// The struct behind the CFErrorRef pointer.
   RecordDecl *CFError = nullptr;
+  bool isCFError(RecordDecl *D);
 
   /// Retrieve the identifier "NSError".
   IdentifierInfo *getNSErrorIdent();
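
Illustrative use of the new scope query; the parser threads its result into ``parseOMPDeclareVariantMatchClause`` as ``ParentTI`` so a nested selector can be checked against the enclosing one (the helper below is hypothetical):

    #include "clang/Sema/Sema.h"

    clang::OMPTraitInfo *getParentTraits(clang::Sema &S) {
      // Null when we are not inside an `omp begin declare variant` scope.
      return S.getOMPTraitInfoForSurroundingScope();
    }
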
diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index a61af45231348..3540fe5fe55c5 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -349,6 +349,9 @@ let ParentPackage = APIModeling in {
 
 def StdCLibraryFunctionsChecker : Checker<"StdCLibraryFunctions">,
   HelpText<"Improve modeling of the C standard library functions">,
+  // Uninitialized value check is a mandatory dependency. This Checker asserts
+  // that arguments are always initialized.
+  Dependencies<[CallAndMessageModeling]>,
   CheckerOptions<[
     CmdLineOption,
   HelpText<"Print results of live variable analysis">,
   Documentation;
 
-def LiveStatementsDumper : Checker<"DumpLiveStmts">,
-  HelpText<"Print results of live statement analysis">,
+def LiveExpressionsDumper : Checker<"DumpLiveExprs">,
+  HelpText<"Print results of live expression analysis">,
   Documentation;
 
 def CFGViewer : Checker<"ViewCFG">,
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
index cdfe986355c56..582a56cbee1ee 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
@@ -869,6 +869,23 @@ class ExprEngine {
   void handleConstructor(const Expr *E, ExplodedNode *Pred,
                          ExplodedNodeSet &Dst);
 
+public:
+  /// Note whether this loop has any more iterations to model. These methods
+  /// are essentially an interface for a GDM trait. Further reading in
+  /// ExprEngine::VisitObjCForCollectionStmt().
+  LLVM_NODISCARD static ProgramStateRef
+  setWhetherHasMoreIteration(ProgramStateRef State,
+                             const ObjCForCollectionStmt *O,
+                             const LocationContext *LC, bool HasMoreIteration);
+
+  LLVM_NODISCARD static ProgramStateRef
+  removeIterationState(ProgramStateRef State, const ObjCForCollectionStmt *O,
+                       const LocationContext *LC);
+
+  LLVM_NODISCARD static bool hasMoreIteration(ProgramStateRef State,
+                                              const ObjCForCollectionStmt *O,
+                                              const LocationContext *LC);
+private:
   /// Store the location of a C++ object corresponding to a statement
   /// until the statement is actually encountered. For example, if a DeclStmt
   /// has CXXConstructExpr as its initializer, the object would be considered
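
These methods are thin wrappers over a GDM (generic data map) trait. A minimal sketch of such a trait, assuming a single-statement key for brevity (the real key presumably also involves the LocationContext):

    #include "clang/AST/StmtObjC.h"
    #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h"
    #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h"

    // One map entry per collection-loop statement: does it iterate again?
    REGISTER_MAP_WITH_PROGRAMSTATE(HasMoreIterationMap,
                                   const clang::ObjCForCollectionStmt *, bool)

    clang::ento::ProgramStateRef
    noteIteration(clang::ento::ProgramStateRef State,
                  const clang::ObjCForCollectionStmt *O, bool More) {
      return State->set<HasMoreIterationMap>(O, More);
    }
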
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConstraintManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConstraintManager.h
index 6a0f5f10874e3..07fc73a670f35 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConstraintManager.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConstraintManager.h
@@ -122,8 +122,7 @@ class SMTConstraintManager : public clang::ento::SimpleConstraintManager {
       // this method tries to get the interpretation (the actual value) from
       // the solver, which is currently not cached.
 
-      llvm::SMTExprRef Exp =
-          SMTConv::fromData(Solver, SD->getSymbolID(), Ty, Ctx.getTypeSize(Ty));
+      llvm::SMTExprRef Exp = SMTConv::fromData(Solver, Ctx, SD);
 
       Solver->reset();
       addStateConstraints(State);
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h
index bdebe238829e8..2d0f169260a45 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h
@@ -319,11 +319,16 @@ class SMTConv {
   }
 
   /// Construct an SMTSolverRef from a SymbolData.
-  static inline llvm::SMTExprRef fromData(llvm::SMTSolverRef &Solver,
-                                          const SymbolID ID, const QualType &Ty,
-                                          uint64_t BitWidth) {
-    llvm::Twine Name = "$" + llvm::Twine(ID);
-    return Solver->mkSymbol(Name.str().c_str(), mkSort(Solver, Ty, BitWidth));
+  static inline llvm::SMTExprRef
+  fromData(llvm::SMTSolverRef &Solver, ASTContext &Ctx, const SymbolData *Sym) {
+    const SymbolID ID = Sym->getSymbolID();
+    const QualType Ty = Sym->getType();
+    const uint64_t BitWidth = Ctx.getTypeSize(Ty);
+
+    llvm::SmallString<16> Str;
+    llvm::raw_svector_ostream OS(Str);
+    OS << Sym->getKindStr() << ID;
+    return Solver->mkSymbol(Str.c_str(), mkSort(Solver, Ty, BitWidth));
   }
 
   // Wrapper to generate SMTSolverRef from SymbolCast data.
@@ -422,8 +427,7 @@ class SMTConv {
       if (RetTy)
         *RetTy = Sym->getType();
 
-      return fromData(Solver, SD->getSymbolID(), Sym->getType(),
-                      Ctx.getTypeSize(Sym->getType()));
+      return fromData(Solver, Ctx, SD);
     }
 
     if (const SymbolCast *SC = dyn_cast<SymbolCast>(Sym)) {
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h
index abfcd1d80faa4..2f4ac6ba5f975 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h
@@ -126,6 +126,9 @@ class SymbolData : public SymExpr {
 public:
   ~SymbolData() override = default;
 
+  /// Get a string representation of the kind of the region.
+  virtual StringRef getKindStr() const = 0;
+
   SymbolID getSymbolID() const { return Sym; }
 
   unsigned computeComplexity() const override {
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h
index 390ced8c29f8f..c71cb88f5574c 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h
@@ -59,6 +59,8 @@ class SymbolRegionValue : public SymbolData {
     Profile(profile, R);
   }
 
+  StringRef getKindStr() const override;
+
   void dumpToStream(raw_ostream &os) const override;
   const MemRegion *getOriginRegion() const override { return getRegion(); }
 
@@ -99,6 +101,8 @@ class SymbolConjured : public SymbolData {
 
   QualType getType() const override;
 
+  StringRef getKindStr() const override;
+
   void dumpToStream(raw_ostream &os) const override;
 
   static void Profile(llvm::FoldingSetNodeID& profile, const Stmt *S,
@@ -141,6 +145,8 @@ class SymbolDerived : public SymbolData {
 
   QualType getType() const override;
 
+  StringRef getKindStr() const override;
+
   void dumpToStream(raw_ostream &os) const override;
   const MemRegion *getOriginRegion() const override { return getRegion(); }
 
@@ -177,6 +183,8 @@ class SymbolExtent : public SymbolData {
 
   QualType getType() const override;
 
+  StringRef getKindStr() const override;
+
   void dumpToStream(raw_ostream &os) const override;
 
   static void Profile(llvm::FoldingSetNodeID& profile, const SubRegion *R) {
@@ -226,6 +234,8 @@ class SymbolMetadata : public SymbolData {
 
   QualType getType() const override;
 
+  StringRef getKindStr() const override;
+
   void dumpToStream(raw_ostream &os) const override;
 
   static void Profile(llvm::FoldingSetNodeID& profile, const MemRegion *R,
@@ -529,7 +539,7 @@ class SymbolReaper {
 
   bool isLive(SymbolRef sym);
   bool isLiveRegion(const MemRegion *region);
-  bool isLive(const Stmt *ExprVal, const LocationContext *LCtx) const;
+  bool isLive(const Expr *ExprVal, const LocationContext *LCtx) const;
   bool isLive(const VarRegion *VR, bool includeStoreBindings = false) const;
 
   /// Unconditionally marks a symbol as live.
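
The overrides themselves land in SymbolManager.cpp; a sketch of their likely shape, with the strings assumed (chosen to match the existing dumpToStream() prefixes, not copied from the patch):

    StringRef SymbolRegionValue::getKindStr() const { return "reg_$"; }
    StringRef SymbolConjured::getKindStr() const { return "conj_$"; }
    StringRef SymbolDerived::getKindStr() const { return "derived_$"; }
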
diff --git a/clang/include/clang/Tooling/Syntax/BuildTree.h b/clang/include/clang/Tooling/Syntax/BuildTree.h
index b7ad50c941d18..452edf580ae17 100644
--- a/clang/include/clang/Tooling/Syntax/BuildTree.h
+++ b/clang/include/clang/Tooling/Syntax/BuildTree.h
@@ -24,9 +24,26 @@ syntax::TranslationUnit *buildSyntaxTree(Arena &A,
 
 // Create syntax trees from subtrees not backed by the source code.
 
-clang::syntax::Leaf *createPunctuation(clang::syntax::Arena &A,
-                                       clang::tok::TokenKind K);
-clang::syntax::EmptyStatement *createEmptyStatement(clang::syntax::Arena &A);
+// Synthesis of Leafs
+/// Create `Leaf` from token with `Spelling` and assert it has the desired
+/// `TokenKind`.
+syntax::Leaf *createLeaf(syntax::Arena &A, tok::TokenKind K,
+                         StringRef Spelling);
+
+/// Infer the token spelling from its `TokenKind`, then create `Leaf` from
+/// this token.
+syntax::Leaf *createLeaf(syntax::Arena &A, tok::TokenKind K);
+
+// Synthesis of Trees
+/// Creates the concrete syntax node according to the specified `NodeKind` `K`.
+/// Returns it as a pointer to the base class `Tree`.
+syntax::Tree *
+createTree(syntax::Arena &A,
+           std::vector<std::pair<syntax::Node *, syntax::NodeRole>> Children,
+           syntax::NodeKind K);
+
+// Synthesis of Syntax Nodes
+syntax::EmptyStatement *createEmptyStatement(syntax::Arena &A);
 
 } // namespace syntax
 } // namespace clang
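
A hypothetical use of the synthesis API above; the NodeRole value is illustrative:

    #include "clang/Tooling/Syntax/BuildTree.h"

    clang::syntax::Tree *makeExpr(clang::syntax::Arena &A) {
      // Leaf with an explicit spelling; the kind is asserted to match.
      auto *X = clang::syntax::createLeaf(A, clang::tok::identifier, "x");
      return clang::syntax::createTree(
          A, {{X, clang::syntax::NodeRole::Unknown}},
          clang::syntax::NodeKind::UnknownExpression);
    }
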
diff --git a/clang/include/clang/Tooling/Syntax/Nodes.h b/clang/include/clang/Tooling/Syntax/Nodes.h
index a6505c8167eed..8b393c5423b4d 100644
--- a/clang/include/clang/Tooling/Syntax/Nodes.h
+++ b/clang/include/clang/Tooling/Syntax/Nodes.h
@@ -190,7 +190,7 @@ class TranslationUnit final : public Tree {
 public:
   TranslationUnit() : Tree(NodeKind::TranslationUnit) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::TranslationUnit;
+    return N->getKind() == NodeKind::TranslationUnit;
   }
 };
 
@@ -200,8 +200,8 @@ class Expression : public Tree {
 public:
   Expression(NodeKind K) : Tree(K) {}
   static bool classof(const Node *N) {
-    return NodeKind::UnknownExpression <= N->kind() &&
-           N->kind() <= NodeKind::UnknownExpression;
+    return NodeKind::UnknownExpression <= N->getKind() &&
+           N->getKind() <= NodeKind::UnknownExpression;
   }
 };
 
@@ -211,10 +211,10 @@ class NameSpecifier : public Tree {
 public:
   NameSpecifier(NodeKind K) : Tree(K) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::GlobalNameSpecifier ||
-           N->kind() == NodeKind::DecltypeNameSpecifier ||
-           N->kind() == NodeKind::IdentifierNameSpecifier ||
-           N->kind() == NodeKind::SimpleTemplateNameSpecifier;
+    return N->getKind() == NodeKind::GlobalNameSpecifier ||
+           N->getKind() == NodeKind::DecltypeNameSpecifier ||
+           N->getKind() == NodeKind::IdentifierNameSpecifier ||
+           N->getKind() == NodeKind::SimpleTemplateNameSpecifier;
   }
 };
 
@@ -226,7 +226,7 @@ class GlobalNameSpecifier final : public NameSpecifier {
 public:
   GlobalNameSpecifier() : NameSpecifier(NodeKind::GlobalNameSpecifier) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::GlobalNameSpecifier;
+    return N->getKind() == NodeKind::GlobalNameSpecifier;
   }
 };
 
@@ -236,7 +236,7 @@ class DecltypeNameSpecifier final : public NameSpecifier {
 public:
   DecltypeNameSpecifier() : NameSpecifier(NodeKind::DecltypeNameSpecifier) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::DecltypeNameSpecifier;
+    return N->getKind() == NodeKind::DecltypeNameSpecifier;
   }
 };
 
@@ -247,7 +247,7 @@ class IdentifierNameSpecifier final : public NameSpecifier {
   IdentifierNameSpecifier()
       : NameSpecifier(NodeKind::IdentifierNameSpecifier) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::IdentifierNameSpecifier;
+    return N->getKind() == NodeKind::IdentifierNameSpecifier;
   }
 };
 
@@ -259,7 +259,7 @@ class SimpleTemplateNameSpecifier final : public NameSpecifier {
   SimpleTemplateNameSpecifier()
       : NameSpecifier(NodeKind::SimpleTemplateNameSpecifier) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::SimpleTemplateNameSpecifier;
+    return N->getKind() == NodeKind::SimpleTemplateNameSpecifier;
   }
 };
 
@@ -269,7 +269,7 @@ class NestedNameSpecifier final : public List {
 public:
   NestedNameSpecifier() : List(NodeKind::NestedNameSpecifier) {}
   static bool classof(const Node *N) {
-    return N->kind() <= NodeKind::NestedNameSpecifier;
+    return N->getKind() <= NodeKind::NestedNameSpecifier;
   }
   std::vector<NameSpecifier *> getSpecifiers();
   std::vector<std::pair<NameSpecifier *, Leaf *>>
   getSpecifiersAndDoubleColons();
@@ -282,7 +282,7 @@ class UnqualifiedId final : public Tree {
 public:
   UnqualifiedId() : Tree(NodeKind::UnqualifiedId) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::UnqualifiedId;
+    return N->getKind() == NodeKind::UnqualifiedId;
   }
 };
 
@@ -297,7 +297,7 @@ class IdExpression final : public Expression {
 public:
   IdExpression() : Expression(NodeKind::IdExpression) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::IdExpression;
+    return N->getKind() == NodeKind::IdExpression;
   }
   NestedNameSpecifier *getQualifier();
   Leaf *getTemplateKeyword();
@@ -310,7 +310,7 @@ class UnknownExpression final : public Expression {
 public:
   UnknownExpression() : Expression(NodeKind::UnknownExpression) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::UnknownExpression;
+    return N->getKind() == NodeKind::UnknownExpression;
   }
 };
 
@@ -319,7 +319,7 @@ class ThisExpression final : public Expression {
 public:
   ThisExpression() : Expression(NodeKind::ThisExpression) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::ThisExpression;
+    return N->getKind() == NodeKind::ThisExpression;
   }
   Leaf *getThisKeyword();
 };
@@ -333,7 +333,7 @@ class CallArguments final : public List {
 public:
   CallArguments() : List(NodeKind::CallArguments) {}
   static bool classof(const Node *N) {
-    return N->kind() <= NodeKind::CallArguments;
+    return N->getKind() <= NodeKind::CallArguments;
   }
   std::vector<Expression *> getArguments();
   std::vector<std::pair<Expression *, Leaf *>> getArgumentsAndCommas();
@@ -347,7 +347,7 @@ class CallExpression final : public Expression {
 public:
   CallExpression() : Expression(NodeKind::CallExpression) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::CallExpression;
+    return N->getKind() == NodeKind::CallExpression;
   }
   Expression *getCallee();
   Leaf *getOpenParen();
@@ -361,7 +361,7 @@ class ParenExpression final : public Expression {
 public:
   ParenExpression() : Expression(NodeKind::ParenExpression) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::ParenExpression;
+    return N->getKind() == NodeKind::ParenExpression;
   }
   Leaf *getOpenParen();
   Expression *getSubExpression();
@@ -380,7 +380,7 @@ class MemberExpression final : public Expression {
 public:
   MemberExpression() : Expression(NodeKind::MemberExpression) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::MemberExpression;
+    return N->getKind() == NodeKind::MemberExpression;
   }
   Expression *getObject();
   Leaf *getAccessToken();
@@ -393,16 +393,16 @@ class LiteralExpression : public Expression {
 public:
   LiteralExpression(NodeKind K) : Expression(K) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::IntegerLiteralExpression ||
-           N->kind() == NodeKind::CharacterLiteralExpression ||
-           N->kind() == NodeKind::FloatingLiteralExpression ||
-           N->kind() == NodeKind::StringLiteralExpression ||
-           N->kind() == NodeKind::BoolLiteralExpression ||
-           N->kind() == NodeKind::CxxNullPtrExpression ||
-           N->kind() == NodeKind::IntegerUserDefinedLiteralExpression ||
-           N->kind() == NodeKind::FloatUserDefinedLiteralExpression ||
-           N->kind() == NodeKind::CharUserDefinedLiteralExpression ||
-           N->kind() == NodeKind::StringUserDefinedLiteralExpression;
+    return N->getKind() == NodeKind::IntegerLiteralExpression ||
+           N->getKind() == NodeKind::CharacterLiteralExpression ||
+           N->getKind() == NodeKind::FloatingLiteralExpression ||
+           N->getKind() == NodeKind::StringLiteralExpression ||
+           N->getKind() == NodeKind::BoolLiteralExpression ||
+           N->getKind() == NodeKind::CxxNullPtrExpression ||
+           N->getKind() == NodeKind::IntegerUserDefinedLiteralExpression ||
+           N->getKind() == NodeKind::FloatUserDefinedLiteralExpression ||
+           N->getKind() == NodeKind::CharUserDefinedLiteralExpression ||
+           N->getKind() == NodeKind::StringUserDefinedLiteralExpression;
   }
   Leaf *getLiteralToken();
 };
@@ -413,7 +413,7 @@ class IntegerLiteralExpression final : public LiteralExpression {
   IntegerLiteralExpression()
       : LiteralExpression(NodeKind::IntegerLiteralExpression) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::IntegerLiteralExpression;
+    return N->getKind() == NodeKind::IntegerLiteralExpression;
   }
 };
 
@@ -423,7 +423,7 @@ class CharacterLiteralExpression final : public LiteralExpression {
   CharacterLiteralExpression()
       : LiteralExpression(NodeKind::CharacterLiteralExpression) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::CharacterLiteralExpression;
+    return N->getKind() == NodeKind::CharacterLiteralExpression;
   }
 };
 
@@ -433,7 +433,7 @@ class FloatingLiteralExpression final : public LiteralExpression {
   FloatingLiteralExpression()
       : LiteralExpression(NodeKind::FloatingLiteralExpression) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::FloatingLiteralExpression;
+    return N->getKind() == NodeKind::FloatingLiteralExpression;
   }
 };
 
@@ -443,7 +443,7 @@ class StringLiteralExpression final : public LiteralExpression {
   StringLiteralExpression()
       : LiteralExpression(NodeKind::StringLiteralExpression) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::StringLiteralExpression;
+    return N->getKind() == NodeKind::StringLiteralExpression;
   }
 };
 
@@ -453,7 +453,7 @@ class BoolLiteralExpression final : public LiteralExpression {
   BoolLiteralExpression()
       : LiteralExpression(NodeKind::BoolLiteralExpression) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::BoolLiteralExpression;
+    return N->getKind() == NodeKind::BoolLiteralExpression;
   }
 };
 
@@ -462,7 +462,7 @@ class CxxNullPtrExpression final : public LiteralExpression {
 public:
   CxxNullPtrExpression() : LiteralExpression(NodeKind::CxxNullPtrExpression) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::CxxNullPtrExpression;
+    return N->getKind() == NodeKind::CxxNullPtrExpression;
   }
 };
 
@@ -476,10 +476,10 @@ class UserDefinedLiteralExpression : public LiteralExpression {
 public:
   UserDefinedLiteralExpression(NodeKind K) : LiteralExpression(K) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::IntegerUserDefinedLiteralExpression ||
-           N->kind() == NodeKind::FloatUserDefinedLiteralExpression ||
-           N->kind() == NodeKind::CharUserDefinedLiteralExpression ||
-           N->kind() == NodeKind::StringUserDefinedLiteralExpression;
+    return N->getKind() == NodeKind::IntegerUserDefinedLiteralExpression ||
+           N->getKind() == NodeKind::FloatUserDefinedLiteralExpression ||
+           N->getKind() == NodeKind::CharUserDefinedLiteralExpression ||
+           N->getKind() == NodeKind::StringUserDefinedLiteralExpression;
   }
 };
 
@@ -491,7 +491,7 @@ class IntegerUserDefinedLiteralExpression final
       : UserDefinedLiteralExpression(
             NodeKind::IntegerUserDefinedLiteralExpression) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::IntegerUserDefinedLiteralExpression;
+    return N->getKind() == NodeKind::IntegerUserDefinedLiteralExpression;
   }
 };
 
@@ -503,7 +503,7 @@ class FloatUserDefinedLiteralExpression final
       : UserDefinedLiteralExpression(
             NodeKind::FloatUserDefinedLiteralExpression) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::FloatUserDefinedLiteralExpression;
+    return N->getKind() == NodeKind::FloatUserDefinedLiteralExpression;
   }
 };
 
@@ -515,7 +515,7 @@ class CharUserDefinedLiteralExpression final
       : UserDefinedLiteralExpression(
             NodeKind::CharUserDefinedLiteralExpression) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::CharUserDefinedLiteralExpression;
+    return N->getKind() == NodeKind::CharUserDefinedLiteralExpression;
   }
 };
 
@@ -527,7 +527,7 @@ class StringUserDefinedLiteralExpression final
       : UserDefinedLiteralExpression(
             NodeKind::StringUserDefinedLiteralExpression) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::StringUserDefinedLiteralExpression;
+    return N->getKind() == NodeKind::StringUserDefinedLiteralExpression;
   }
 };
 
@@ -536,8 +536,8 @@ class UnaryOperatorExpression : public Expression {
 public:
   UnaryOperatorExpression(NodeKind K) : Expression(K) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::PrefixUnaryOperatorExpression ||
-           N->kind() == NodeKind::PostfixUnaryOperatorExpression;
+    return N->getKind() == NodeKind::PrefixUnaryOperatorExpression ||
+           N->getKind() == NodeKind::PostfixUnaryOperatorExpression;
   }
   Leaf *getOperatorToken();
   Expression *getOperand();
@@ -557,7 +557,7 @@ class PrefixUnaryOperatorExpression final : public UnaryOperatorExpression {
   PrefixUnaryOperatorExpression()
       : UnaryOperatorExpression(NodeKind::PrefixUnaryOperatorExpression) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::PrefixUnaryOperatorExpression;
+    return N->getKind() == NodeKind::PrefixUnaryOperatorExpression;
   }
 };
 
@@ -571,7 +571,7 @@ class PostfixUnaryOperatorExpression final : public UnaryOperatorExpression {
   PostfixUnaryOperatorExpression()
       : UnaryOperatorExpression(NodeKind::PostfixUnaryOperatorExpression) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::PostfixUnaryOperatorExpression;
+    return N->getKind() == NodeKind::PostfixUnaryOperatorExpression;
   }
 };
 
@@ -586,7 +586,7 @@ class BinaryOperatorExpression final : public Expression {
 public:
   BinaryOperatorExpression() : Expression(NodeKind::BinaryOperatorExpression) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::BinaryOperatorExpression;
+    return N->getKind() == NodeKind::BinaryOperatorExpression;
   }
   Expression *getLhs();
   Leaf *getOperatorToken();
@@ -599,8 +599,8 @@ class Statement : public Tree {
 public:
   Statement(NodeKind K) : Tree(K) {}
   static bool classof(const Node *N) {
-    return NodeKind::UnknownStatement <= N->kind() &&
-           N->kind() <= NodeKind::CompoundStatement;
+    return NodeKind::UnknownStatement <= N->getKind() &&
+           N->getKind() <= NodeKind::CompoundStatement;
   }
 };
 
@@ -610,7 +610,7 @@ class UnknownStatement final : public Statement {
 public:
   UnknownStatement() : Statement(NodeKind::UnknownStatement) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::UnknownStatement;
+    return N->getKind() == NodeKind::UnknownStatement;
   }
 };
 
@@ -619,7 +619,7 @@ class DeclarationStatement final : public Statement {
 public:
   DeclarationStatement() : Statement(NodeKind::DeclarationStatement) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::DeclarationStatement;
+    return N->getKind() == NodeKind::DeclarationStatement;
   }
 };
 
@@ -628,7 +628,7 @@ class EmptyStatement final : public Statement {
 public:
   EmptyStatement() : Statement(NodeKind::EmptyStatement) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::EmptyStatement;
+    return N->getKind() == NodeKind::EmptyStatement;
   }
 };
 
@@ -637,7 +637,7 @@ class SwitchStatement final : public Statement {
 public:
   SwitchStatement() : Statement(NodeKind::SwitchStatement) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::SwitchStatement;
+    return N->getKind() == NodeKind::SwitchStatement;
   }
   Leaf *getSwitchKeyword();
   Statement *getBody();
@@ -648,7 +648,7 @@ class CaseStatement final : public Statement {
 public:
   CaseStatement() : Statement(NodeKind::CaseStatement) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::CaseStatement;
+    return N->getKind() == NodeKind::CaseStatement;
   }
   Leaf *getCaseKeyword();
   Expression *getCaseValue();
@@ -660,7 +660,7 @@ class DefaultStatement final : public Statement {
 public:
   DefaultStatement() : Statement(NodeKind::DefaultStatement) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::DefaultStatement;
+    return N->getKind() == NodeKind::DefaultStatement;
   }
   Leaf *getDefaultKeyword();
   Statement *getBody();
@@ -672,7 +672,7 @@ class IfStatement final : public Statement {
 public:
   IfStatement() : Statement(NodeKind::IfStatement) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::IfStatement;
+    return N->getKind() == NodeKind::IfStatement;
   }
   Leaf *getIfKeyword();
   Statement *getThenStatement();
@@ -685,7 +685,7 @@ class ForStatement final : public Statement {
 public:
   ForStatement() : Statement(NodeKind::ForStatement) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::ForStatement;
+    return N->getKind() == NodeKind::ForStatement;
   }
   Leaf *getForKeyword();
   Statement *getBody();
@@ -696,7 +696,7 @@ class WhileStatement final : public Statement {
 public:
   WhileStatement() : Statement(NodeKind::WhileStatement) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::WhileStatement;
+    return N->getKind() == NodeKind::WhileStatement;
   }
   Leaf *getWhileKeyword();
   Statement *getBody();
@@ -707,7 +707,7 @@ class ContinueStatement final : public Statement {
 public:
   ContinueStatement() : Statement(NodeKind::ContinueStatement) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::ContinueStatement;
+    return N->getKind() == NodeKind::ContinueStatement;
   }
   Leaf *getContinueKeyword();
 };
@@ -717,7 +717,7 @@ class BreakStatement final : public Statement {
 public:
   BreakStatement() : Statement(NodeKind::BreakStatement) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::BreakStatement;
+    return N->getKind() == NodeKind::BreakStatement;
   }
   Leaf *getBreakKeyword();
 };
@@ -728,7 +728,7 @@ class ReturnStatement final : public Statement {
 public:
   ReturnStatement() : Statement(NodeKind::ReturnStatement) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::ReturnStatement;
+    return N->getKind() == NodeKind::ReturnStatement;
   }
   Leaf *getReturnKeyword();
   Expression *getReturnValue();
@@ -739,7 +739,7 @@ class RangeBasedForStatement final : public Statement {
 public:
   RangeBasedForStatement() : Statement(NodeKind::RangeBasedForStatement) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::RangeBasedForStatement;
+    return N->getKind() == NodeKind::RangeBasedForStatement;
   }
   Leaf *getForKeyword();
   Statement *getBody();
@@ -751,7 +751,7 @@ class ExpressionStatement final : public Statement {
 public:
   ExpressionStatement() : Statement(NodeKind::ExpressionStatement) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::ExpressionStatement;
+    return N->getKind() == NodeKind::ExpressionStatement;
   }
   Expression *getExpression();
 };
@@ -761,7 +761,7 @@ class CompoundStatement final : public Statement {
 public:
   CompoundStatement() : Statement(NodeKind::CompoundStatement) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::CompoundStatement;
+    return N->getKind() == NodeKind::CompoundStatement;
   }
   Leaf *getLbrace();
   /// FIXME: use custom iterator instead of 'vector'.
@@ -777,8 +777,8 @@ class Declaration : public Tree {
 public:
   Declaration(NodeKind K) : Tree(K) {}
   static bool classof(const Node *N) {
-    return NodeKind::UnknownDeclaration <= N->kind() &&
-           N->kind() <= NodeKind::TypeAliasDeclaration;
+    return NodeKind::UnknownDeclaration <= N->getKind() &&
+           N->getKind() <= NodeKind::TypeAliasDeclaration;
   }
 };
 
@@ -787,7 +787,7 @@ class UnknownDeclaration final : public Declaration {
 public:
   UnknownDeclaration() : Declaration(NodeKind::UnknownDeclaration) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::UnknownDeclaration;
+    return N->getKind() == NodeKind::UnknownDeclaration;
   }
 };
 
@@ -796,7 +796,7 @@ class EmptyDeclaration final : public Declaration {
 public:
   EmptyDeclaration() : Declaration(NodeKind::EmptyDeclaration) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::EmptyDeclaration;
+    return N->getKind() == NodeKind::EmptyDeclaration;
   }
 };
 
@@ -806,7 +806,7 @@ class StaticAssertDeclaration final : public Declaration {
 public:
   StaticAssertDeclaration() : Declaration(NodeKind::StaticAssertDeclaration) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::StaticAssertDeclaration;
+    return N->getKind() == NodeKind::StaticAssertDeclaration;
   }
   Expression *getCondition();
   Expression *getMessage();
@@ -819,7 +819,7 @@ class LinkageSpecificationDeclaration final : public Declaration {
   LinkageSpecificationDeclaration()
       : Declaration(NodeKind::LinkageSpecificationDeclaration) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::LinkageSpecificationDeclaration;
+    return N->getKind() == NodeKind::LinkageSpecificationDeclaration;
   }
 };
 
@@ -830,7 +830,7 @@ class SimpleDeclaration final : public Declaration {
 public:
   SimpleDeclaration() : Declaration(NodeKind::SimpleDeclaration) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::SimpleDeclaration;
+    return N->getKind() == NodeKind::SimpleDeclaration;
   }
   /// FIXME: use custom iterator instead of 'vector'.
   std::vector<SimpleDeclarator *> getDeclarators();
@@ -841,7 +841,7 @@ class TemplateDeclaration final : public Declaration {
 public:
   TemplateDeclaration() : Declaration(NodeKind::TemplateDeclaration) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::TemplateDeclaration;
+    return N->getKind() == NodeKind::TemplateDeclaration;
   }
   Leaf *getTemplateKeyword();
   Declaration *getDeclaration();
@@ -857,7 +857,7 @@ class ExplicitTemplateInstantiation final : public Declaration {
   ExplicitTemplateInstantiation()
       : Declaration(NodeKind::ExplicitTemplateInstantiation) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::ExplicitTemplateInstantiation;
+    return N->getKind() == NodeKind::ExplicitTemplateInstantiation;
   }
   Leaf *getTemplateKeyword();
   Leaf *getExternKeyword();
@@ -869,7 +869,7 @@ class NamespaceDefinition final : public Declaration {
 public:
   NamespaceDefinition() : Declaration(NodeKind::NamespaceDefinition) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::NamespaceDefinition;
+    return N->getKind() == NodeKind::NamespaceDefinition;
   }
 };
 
@@ -879,7 +879,7 @@ class NamespaceAliasDefinition final : public Declaration {
   NamespaceAliasDefinition()
       : Declaration(NodeKind::NamespaceAliasDefinition) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::NamespaceAliasDefinition;
+    return N->getKind() == NodeKind::NamespaceAliasDefinition;
   }
 };
 
@@ -888,7 +888,7 @@ class UsingNamespaceDirective final : public Declaration {
 public:
   UsingNamespaceDirective() : Declaration(NodeKind::UsingNamespaceDirective) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::UsingNamespaceDirective;
+    return N->getKind() == NodeKind::UsingNamespaceDirective;
   }
 };
 
@@ -898,7 +898,7 @@ class UsingDeclaration final : public Declaration {
 public:
   UsingDeclaration() : Declaration(NodeKind::UsingDeclaration) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::UsingDeclaration;
+    return N->getKind() == NodeKind::UsingDeclaration;
   }
 };
 
@@ -907,7 +907,7 @@ class TypeAliasDeclaration final : public Declaration {
 public:
   TypeAliasDeclaration() : Declaration(NodeKind::TypeAliasDeclaration) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::TypeAliasDeclaration;
+    return N->getKind() == NodeKind::TypeAliasDeclaration;
   }
 };
 
@@ -927,8 +927,8 @@ class Declarator : public Tree {
 public:
   Declarator(NodeKind K) : Tree(K) {}
   static bool classof(const Node *N) {
-    return NodeKind::SimpleDeclarator <= N->kind() &&
-           N->kind() <= NodeKind::ParenDeclarator;
+    return NodeKind::SimpleDeclarator <= N->getKind() &&
+           N->getKind() <= NodeKind::ParenDeclarator;
   }
 };
 
@@ -938,7 +938,7 @@ class SimpleDeclarator final : public Declarator {
 public:
   SimpleDeclarator() : Declarator(NodeKind::SimpleDeclarator) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::SimpleDeclarator;
+    return N->getKind() == NodeKind::SimpleDeclarator;
   }
 };
 
@@ -949,7 +949,7 @@ class ParenDeclarator final : public Declarator {
 public:
   ParenDeclarator() : Declarator(NodeKind::ParenDeclarator) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::ParenDeclarator;
+    return N->getKind() == NodeKind::ParenDeclarator;
   }
   Leaf *getLparen();
   Leaf *getRparen();
@@ -963,7 +963,7 @@ class ArraySubscript final : public Tree {
 public:
   ArraySubscript() : Tree(NodeKind::ArraySubscript) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::ArraySubscript;
+    return N->getKind() == NodeKind::ArraySubscript;
   }
   // TODO: add an accessor for the "static" keyword.
   Leaf *getLbracket();
@@ -977,7 +977,7 @@ class TrailingReturnType final : public Tree {
 public:
   TrailingReturnType() : Tree(NodeKind::TrailingReturnType) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::TrailingReturnType;
+    return N->getKind() == NodeKind::TrailingReturnType;
   }
   // TODO: add accessors for specifiers.
   Leaf *getArrowToken();
@@ -992,7 +992,7 @@ class ParameterDeclarationList final : public List {
 public:
   ParameterDeclarationList() : List(NodeKind::ParameterDeclarationList) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::ParameterDeclarationList;
+    return N->getKind() == NodeKind::ParameterDeclarationList;
   }
   std::vector<SimpleDeclaration *> getParameterDeclarations();
   std::vector<std::pair<SimpleDeclaration *, Leaf *>>
   getParametersAndCommas();
@@ -1014,7 +1014,7 @@ class ParametersAndQualifiers final : public Tree {
 public:
   ParametersAndQualifiers() : Tree(NodeKind::ParametersAndQualifiers) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::ParametersAndQualifiers;
+    return N->getKind() == NodeKind::ParametersAndQualifiers;
   }
   Leaf *getLparen();
   ParameterDeclarationList *getParameters();
@@ -1028,7 +1028,7 @@ class MemberPointer final : public Tree {
 public:
   MemberPointer() : Tree(NodeKind::MemberPointer) {}
   static bool classof(const Node *N) {
-    return N->kind() == NodeKind::MemberPointer;
+    return N->getKind() == NodeKind::MemberPointer;
   }
 };
 
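
The ``classof`` predicates (now keyed on ``getKind()``) are what make LLVM-style RTTI work over syntax nodes, e.g.:

    #include "clang/Tooling/Syntax/Nodes.h"
    #include "llvm/Support/Casting.h"

    void inspect(const clang::syntax::Node *N) {
      if (const auto *If = llvm::dyn_cast<clang::syntax::IfStatement>(N))
        (void)If;  // N is an if-statement node
      else if (llvm::isa<clang::syntax::Expression>(N))
        ;  // N falls anywhere in the Expression kind range
    }
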
diff --git a/clang/include/clang/Tooling/Syntax/Tree.h b/clang/include/clang/Tooling/Syntax/Tree.h
index f7f9e6bdc5a09..a544fc1827b7d 100644
--- a/clang/include/clang/Tooling/Syntax/Tree.h
+++ b/clang/include/clang/Tooling/Syntax/Tree.h
@@ -41,17 +41,19 @@ class Arena {
   Arena(SourceManager &SourceMgr, const LangOptions &LangOpts,
         const TokenBuffer &Tokens);
 
-  const SourceManager &sourceManager() const { return SourceMgr; }
-  const LangOptions &langOptions() const { return LangOpts; }
+  const SourceManager &getSourceManager() const { return SourceMgr; }
+  const LangOptions &getLangOptions() const { return LangOpts; }
 
-  const TokenBuffer &tokenBuffer() const;
-  llvm::BumpPtrAllocator &allocator() { return Allocator; }
+  const TokenBuffer &getTokenBuffer() const;
+  llvm::BumpPtrAllocator &getAllocator() { return Allocator; }
 
+private:
   /// Add \p Buffer to the underlying source manager, tokenize it and store the
-  /// resulting tokens. Useful when there is a need to materialize tokens that
-  /// were not written in user code.
+  /// resulting tokens. Used exclusively in `FactoryImpl` to materialize tokens
+  /// that were not written in user code.
   std::pair<FileID, ArrayRef<syntax::Token>>
   lexBuffer(std::unique_ptr<llvm::MemoryBuffer> Buffer);
+  friend class FactoryImpl;
 
 private:
   SourceManager &SourceMgr;
@@ -79,8 +81,8 @@ class Node {
   /// set when the node is added as a child to another one.
   Node(NodeKind Kind);
 
-  NodeKind kind() const { return static_cast<NodeKind>(Kind); }
-  NodeRole role() const { return static_cast<NodeRole>(Role); }
+  NodeKind getKind() const { return static_cast<NodeKind>(Kind); }
+  NodeRole getRole() const { return static_cast<NodeRole>(Role); }
 
   /// Whether the node is detached from a tree, i.e. does not have a parent.
   bool isDetached() const;
@@ -99,11 +101,11 @@ class Node {
   /// modifiable.
   bool canModify() const { return CanModify; }
 
-  const Tree *parent() const { return Parent; }
-  Tree *parent() { return Parent; }
+  const Tree *getParent() const { return Parent; }
+  Tree *getParent() { return Parent; }
 
-  const Node *nextSibling() const { return NextSibling; }
-  Node *nextSibling() { return NextSibling; }
+  const Node *getNextSibling() const { return NextSibling; }
+  Node *getNextSibling() { return NextSibling; }
 
   /// Dumps the structure of a subtree. For debugging and testing purposes.
   std::string dump(const SourceManager &SM) const;
@@ -142,7 +144,7 @@ class Leaf final : public Node {
   Leaf(const Token *T);
   static bool classof(const Node *N);
 
-  const Token *token() const { return Tok; }
+  const Token *getToken() const { return Tok; }
 
 private:
   const Token *Tok;
@@ -154,16 +156,18 @@ class Tree : public Node {
   using Node::Node;
   static bool classof(const Node *N);
 
-  Node *firstChild() { return FirstChild; }
-  const Node *firstChild() const { return FirstChild; }
+  Node *getFirstChild() { return FirstChild; }
+  const Node *getFirstChild() const { return FirstChild; }
 
-  Leaf *firstLeaf();
-  const Leaf *firstLeaf() const {
-    return const_cast<Tree *>(this)->firstLeaf();
+  Leaf *findFirstLeaf();
+  const Leaf *findFirstLeaf() const {
+    return const_cast<Tree *>(this)->findFirstLeaf();
   }
 
-  Leaf *lastLeaf();
-  const Leaf *lastLeaf() const { return const_cast<Tree *>(this)->lastLeaf(); }
+  Leaf *findLastLeaf();
+  const Leaf *findLastLeaf() const {
+    return const_cast<Tree *>(this)->findLastLeaf();
+  }
 
 protected:
   /// Find the first node with a corresponding role.
@@ -209,6 +213,7 @@ class List : public Tree {
   };
 
   using Tree::Tree;
+  static bool classof(const Node *N);
   /// Returns the elements and corresponding delimiters. Missing elements
   /// and delimiters are represented as null pointers.
   ///
@@ -232,16 +237,16 @@ class List : public Tree {
   ///
   /// Useful for discovering the correct delimiter to use when adding
   /// elements to empty or one-element lists.
-  clang::tok::TokenKind getDelimiterTokenKind();
+  clang::tok::TokenKind getDelimiterTokenKind() const;
 
-  TerminationKind getTerminationKind();
+  TerminationKind getTerminationKind() const;
 
   /// Whether this list can be empty in syntactically and semantically correct
   /// code.
   ///
   /// This list may be empty when the source code has errors even if
   /// canBeEmpty() returns false.
-  bool canBeEmpty();
+  bool canBeEmpty() const;
 };
 
 } // namespace syntax
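
Aside (illustrative sketch, not part of the patch): client code now reads the tree through the get-prefixed accessors. Below is a minimal walk over the direct children of a node, assuming a valid syntax::Tree; the helper name visitChildren is hypothetical:

    #include "clang/Tooling/Syntax/Tree.h"
    #include "llvm/Support/Casting.h"

    // Visit each direct child via getFirstChild()/getNextSibling(); a Leaf
    // exposes its spelled token through getToken().
    static void visitChildren(const clang::syntax::Tree *T) {
      for (const clang::syntax::Node *N = T->getFirstChild(); N != nullptr;
           N = N->getNextSibling()) {
        if (const auto *L = llvm::dyn_cast<clang::syntax::Leaf>(N))
          (void)L->getToken();
      }
    }
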
diff --git a/clang/lib/AST/APValue.cpp b/clang/lib/AST/APValue.cpp
index 08ae0ff3c67d3..32d3ff7ce1d08 100644
--- a/clang/lib/AST/APValue.cpp
+++ b/clang/lib/AST/APValue.cpp
@@ -38,7 +38,7 @@ static_assert(
     "Type is insufficiently aligned");
 
 APValue::LValueBase::LValueBase(const ValueDecl *P, unsigned I, unsigned V)
-    : Ptr(P), Local{I, V} {}
+    : Ptr(P ? cast<ValueDecl>(P->getCanonicalDecl()) : nullptr), Local{I, V} {}
 APValue::LValueBase::LValueBase(const Expr *P, unsigned I, unsigned V)
     : Ptr(P), Local{I, V} {}
 
@@ -82,13 +82,19 @@ bool operator==(const APValue::LValueBase &LHS,
                 const APValue::LValueBase &RHS) {
   if (LHS.Ptr != RHS.Ptr)
     return false;
-  if (LHS.is<TypeInfoLValue>())
+  if (LHS.is<TypeInfoLValue>() || LHS.is<DynamicAllocLValue>())
     return true;
   return LHS.Local.CallIndex == RHS.Local.CallIndex &&
          LHS.Local.Version == RHS.Local.Version;
 }
 }
 
+APValue::LValuePathEntry::LValuePathEntry(BaseOrMemberType BaseOrMember) {
+  if (const Decl *D = BaseOrMember.getPointer())
+    BaseOrMember.setPointer(D->getCanonicalDecl());
+  Value = reinterpret_cast<uintptr_t>(BaseOrMember.getOpaqueValue());
+}
+
 namespace {
   struct LVBase {
     APValue::LValueBase Base;
@@ -113,14 +119,16 @@ APValue::LValueBase::operator bool () const {
 
 clang::APValue::LValueBase
 llvm::DenseMapInfo<clang::APValue::LValueBase>::getEmptyKey() {
-  return clang::APValue::LValueBase(
-      DenseMapInfo<const ValueDecl *>::getEmptyKey());
+  clang::APValue::LValueBase B;
+  B.Ptr = DenseMapInfo<const ValueDecl *>::getEmptyKey();
+  return B;
 }
 
 clang::APValue::LValueBase
 llvm::DenseMapInfo<clang::APValue::LValueBase>::getTombstoneKey() {
-  return clang::APValue::LValueBase(
-      DenseMapInfo<const ValueDecl *>::getTombstoneKey());
+  clang::APValue::LValueBase B;
+  B.Ptr = DenseMapInfo<const ValueDecl *>::getTombstoneKey();
+  return B;
 }
 
 namespace clang {
@@ -773,8 +781,10 @@ void APValue::MakeMemberPointer(const ValueDecl *Member, bool IsDerivedMember,
   assert(isAbsent() && "Bad state change");
   MemberPointerData *MPD = new ((void*)(char*)Data.buffer) MemberPointerData;
   Kind = MemberPointer;
-  MPD->MemberAndIsDerivedMember.setPointer(Member);
+  MPD->MemberAndIsDerivedMember.setPointer(
+      Member ? cast<ValueDecl>(Member->getCanonicalDecl()) : nullptr);
   MPD->MemberAndIsDerivedMember.setInt(IsDerivedMember);
   MPD->resizePath(Path.size());
-  memcpy(MPD->getPath(), Path.data(), Path.size()*sizeof(const CXXRecordDecl*));
+  for (unsigned I = 0; I != Path.size(); ++I)
+    MPD->getPath()[I] = Path[I]->getCanonicalDecl();
 }
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 59ca0b8c963f7..ee0eec1f2c6fd 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -8539,6 +8539,10 @@ bool ASTContext::areCompatibleSveTypes(QualType FirstType,
         else if (VT->getVectorKind() == VectorType::SveFixedLengthDataVector)
           return VT->getElementType().getCanonicalType() ==
                  FirstType->getSveEltType(*this);
+        else if (VT->getVectorKind() == VectorType::GenericVector)
+          return getTypeSize(SecondType) == getLangOpts().ArmSveVectorBits &&
+                 hasSameType(VT->getElementType(),
+                             getBuiltinVectorTypeInfo(BT).ElementType);
       }
     }
     return false;
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 71a70f2185722..c10b3f6be5522 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -6936,7 +6936,7 @@ ExpectedStmt ASTNodeImporter::VisitImplicitCastExpr(ImplicitCastExpr *E) {
 
   return ImplicitCastExpr::Create(
       Importer.getToContext(), *ToTypeOrErr, E->getCastKind(), *ToSubExprOrErr,
-      &(*ToBasePathOrErr), E->getValueKind());
+      &(*ToBasePathOrErr), E->getValueKind(), E->getFPFeatures());
 }
 
 ExpectedStmt ASTNodeImporter::VisitExplicitCastExpr(ExplicitCastExpr *E) {
@@ -6963,8 +6963,8 @@ ExpectedStmt ASTNodeImporter::VisitExplicitCastExpr(ExplicitCastExpr *E) {
       return ToRParenLocOrErr.takeError();
     return CStyleCastExpr::Create(
         Importer.getToContext(), ToType, E->getValueKind(), E->getCastKind(),
-        ToSubExpr, ToBasePath, ToTypeInfoAsWritten, *ToLParenLocOrErr,
-        *ToRParenLocOrErr);
+        ToSubExpr, ToBasePath, CCE->getFPFeatures(), ToTypeInfoAsWritten,
+        *ToLParenLocOrErr, *ToRParenLocOrErr);
   }
 
   case Stmt::CXXFunctionalCastExprClass: {
@@ -6977,8 +6977,8 @@ ExpectedStmt ASTNodeImporter::VisitExplicitCastExpr(ExplicitCastExpr *E) {
       return ToRParenLocOrErr.takeError();
     return CXXFunctionalCastExpr::Create(
         Importer.getToContext(), ToType, E->getValueKind(), ToTypeInfoAsWritten,
-        E->getCastKind(), ToSubExpr, ToBasePath, *ToLParenLocOrErr,
-        *ToRParenLocOrErr);
+        E->getCastKind(), ToSubExpr, ToBasePath, FCE->getFPFeatures(),
+        *ToLParenLocOrErr, *ToRParenLocOrErr);
   }
 
   case Stmt::ObjCBridgedCastExprClass: {
@@ -7821,10 +7821,11 @@ ExpectedStmt ASTNodeImporter::VisitCXXNamedCastExpr(CXXNamedCastExpr *E) {
   if (!ToBasePathOrErr)
     return ToBasePathOrErr.takeError();
 
-  if (isa<CXXStaticCastExpr>(E)) {
+  if (auto CCE = dyn_cast<CXXStaticCastExpr>(E)) {
     return CXXStaticCastExpr::Create(
         Importer.getToContext(), ToType, VK, CK, ToSubExpr, &(*ToBasePathOrErr),
-        ToTypeInfoAsWritten, ToOperatorLoc, ToRParenLoc, ToAngleBrackets);
+        ToTypeInfoAsWritten, CCE->getFPFeatures(), ToOperatorLoc, ToRParenLoc,
+        ToAngleBrackets);
   } else if (isa<CXXDynamicCastExpr>(E)) {
     return CXXDynamicCastExpr::Create(
         Importer.getToContext(), ToType, VK, CK, ToSubExpr, &(*ToBasePathOrErr),
diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp
index 8b5b2444f1e25..fafcfce269d75 100644
--- a/clang/lib/AST/ASTStructuralEquivalence.cpp
+++ b/clang/lib/AST/ASTStructuralEquivalence.cpp
@@ -68,7 +68,12 @@
 #include "clang/AST/DeclObjC.h"
 #include "clang/AST/DeclTemplate.h"
 #include "clang/AST/ExprCXX.h"
+#include "clang/AST/ExprConcepts.h"
+#include "clang/AST/ExprObjC.h"
+#include "clang/AST/ExprOpenMP.h"
 #include "clang/AST/NestedNameSpecifier.h"
+#include "clang/AST/StmtObjC.h"
+#include "clang/AST/StmtOpenMP.h"
 #include "clang/AST/TemplateBase.h"
 #include "clang/AST/TemplateName.h"
 #include "clang/AST/Type.h"
@@ -149,32 +154,230 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context,
   return true;
 }
 
-/// Determine structural equivalence of two expressions.
-static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context,
-                                     const Expr *E1, const Expr *E2) {
-  if (!E1 || !E2)
-    return E1 == E2;
+namespace {
+/// Encapsulates Stmt comparison logic.
+class StmtComparer {
+  StructuralEquivalenceContext &Context;
+
+  // IsStmtEquivalent overloads. Each overload compares a specific statement
+  // class and only has to compare the data that is specific to that class.
+  // Should only be called from TraverseStmt.
+
+  bool IsStmtEquivalent(const AddrLabelExpr *E1, const AddrLabelExpr *E2) {
+    return IsStructurallyEquivalent(Context, E1->getLabel(), E2->getLabel());
+  }
+
+  bool IsStmtEquivalent(const AtomicExpr *E1, const AtomicExpr *E2) {
+    return E1->getOp() == E2->getOp();
+  }
+
+  bool IsStmtEquivalent(const BinaryOperator *E1, const BinaryOperator *E2) {
+    return E1->getOpcode() == E2->getOpcode();
+  }
 
-  if (auto *DE1 = dyn_cast<DependentScopeDeclRefExpr>(E1)) {
-    auto *DE2 = dyn_cast<DependentScopeDeclRefExpr>(E2);
-    if (!DE2)
+  bool IsStmtEquivalent(const CallExpr *E1, const CallExpr *E2) {
+    // FIXME: IsStructurallyEquivalent requires non-const Decls.
+    Decl *Callee1 = const_cast<Decl *>(E1->getCalleeDecl());
+    Decl *Callee2 = const_cast<Decl *>(E2->getCalleeDecl());
+
+    // Compare whether both calls know their callee.
+    if (static_cast<bool>(Callee1) != static_cast<bool>(Callee2))
       return false;
+
+    // Both calls have no callee, so nothing to do.
+    if (!static_cast<bool>(Callee1))
+      return true;
+
+    assert(Callee2);
+    return IsStructurallyEquivalent(Context, Callee1, Callee2);
+  }
+
+  bool IsStmtEquivalent(const CharacterLiteral *E1,
+                        const CharacterLiteral *E2) {
+    return E1->getValue() == E2->getValue() && E1->getKind() == E2->getKind();
+  }
+
+  bool IsStmtEquivalent(const ChooseExpr *E1, const ChooseExpr *E2) {
+    return true; // Semantics only depend on children.
+  }
+
+  bool IsStmtEquivalent(const CompoundStmt *E1, const CompoundStmt *E2) {
+    // Number of children is actually checked by the generic children comparison
+    // code, but a CompoundStmt is one of the few statements where the number of
+    // children frequently differs and the number of statements is also always
+    // precomputed. Directly comparing the number of children here is thus
+    // just an optimization.
+    return E1->size() == E2->size();
+  }
+
+  bool IsStmtEquivalent(const DependentScopeDeclRefExpr *DE1,
+                        const DependentScopeDeclRefExpr *DE2) {
     if (!IsStructurallyEquivalent(Context, DE1->getDeclName(),
                                   DE2->getDeclName()))
       return false;
     return IsStructurallyEquivalent(Context, DE1->getQualifier(),
                                     DE2->getQualifier());
-  } else if (auto CastE1 = dyn_cast(E1)) {
-    auto *CastE2 = dyn_cast(E2);
-    if (!CastE2)
+  }
+
+  bool IsStmtEquivalent(const Expr *E1, const Expr *E2) {
+    return IsStructurallyEquivalent(Context, E1->getType(), E2->getType());
+  }
+
+  bool IsStmtEquivalent(const ExpressionTraitExpr *E1,
+                        const ExpressionTraitExpr *E2) {
+    return E1->getTrait() == E2->getTrait() && E1->getValue() == E2->getValue();
+  }
+
+  bool IsStmtEquivalent(const FloatingLiteral *E1, const FloatingLiteral *E2) {
+    return E1->isExact() == E2->isExact() && E1->getValue() == E2->getValue();
+  }
+
+  bool IsStmtEquivalent(const ImplicitCastExpr *CastE1,
+                        const ImplicitCastExpr *CastE2) {
+    return IsStructurallyEquivalent(Context, CastE1->getType(),
+                                    CastE2->getType());
+  }
+
+  bool IsStmtEquivalent(const IntegerLiteral *E1, const IntegerLiteral *E2) {
+    return E1->getValue() == E2->getValue();
+  }
+
+  bool IsStmtEquivalent(const ObjCStringLiteral *E1,
+                        const ObjCStringLiteral *E2) {
+    // Just wraps a StringLiteral child.
+    return true;
+  }
+
+  bool IsStmtEquivalent(const Stmt *S1, const Stmt *S2) { return true; }
+
+  bool IsStmtEquivalent(const SourceLocExpr *E1, const SourceLocExpr *E2) {
+    return E1->getIdentKind() == E2->getIdentKind();
+  }
+
+  bool IsStmtEquivalent(const StmtExpr *E1, const StmtExpr *E2) {
+    return E1->getTemplateDepth() == E2->getTemplateDepth();
+  }
+
+  bool IsStmtEquivalent(const StringLiteral *E1, const StringLiteral *E2) {
+    return E1->getBytes() == E2->getBytes();
+  }
+
+  bool IsStmtEquivalent(const SubstNonTypeTemplateParmExpr *E1,
+                        const SubstNonTypeTemplateParmExpr *E2) {
+    return IsStructurallyEquivalent(Context, E1->getParameter(),
+                                    E2->getParameter());
+  }
+
+  bool IsStmtEquivalent(const SubstNonTypeTemplateParmPackExpr *E1,
+                        const SubstNonTypeTemplateParmPackExpr *E2) {
+    return IsStructurallyEquivalent(Context, E1->getArgumentPack(),
+                                    E2->getArgumentPack());
+  }
+
+  bool IsStmtEquivalent(const TypeTraitExpr *E1, const TypeTraitExpr *E2) {
+    if (E1->getTrait() != E2->getTrait())
+      return false;
+
+    for (auto Pair : zip_longest(E1->getArgs(), E2->getArgs())) {
+      Optional<TypeSourceInfo *> Child1 = std::get<0>(Pair);
+      Optional<TypeSourceInfo *> Child2 = std::get<1>(Pair);
+      // Different number of args.
+      if (!Child1 || !Child2)
+        return false;
+
+      if (!IsStructurallyEquivalent(Context, (*Child1)->getType(),
+                                    (*Child2)->getType()))
+        return false;
+    }
+    return true;
+  }
+
+  bool IsStmtEquivalent(const UnaryExprOrTypeTraitExpr *E1,
+                        const UnaryExprOrTypeTraitExpr *E2) {
+    if (E1->getKind() != E2->getKind())
+      return false;
+    return IsStructurallyEquivalent(Context, E1->getTypeOfArgument(),
+                                    E2->getTypeOfArgument());
+  }
+
+  bool IsStmtEquivalent(const UnaryOperator *E1, const UnaryOperator *E2) {
+    return E1->getOpcode() == E2->getOpcode();
+  }
+
+  bool IsStmtEquivalent(const VAArgExpr *E1, const VAArgExpr *E2) {
+    // Semantics only depend on children.
+    return true;
+  }
+
+  /// End point of the traversal chain.
+  bool TraverseStmt(const Stmt *S1, const Stmt *S2) { return true; }
+
+  // Create traversal methods that traverse the class hierarchy and return
+  // the accumulated result of the comparison. Each TraverseStmt overload
+  // calls the TraverseStmt overload of the parent class. For example,
+  // the TraverseStmt overload for 'BinaryOperator' calls the TraverseStmt
+  // overload of 'Expr' which then calls the overload for 'Stmt'.
+#define STMT(CLASS, PARENT)                                                    \
+  bool TraverseStmt(const CLASS *S1, const CLASS *S2) {                        \
+    if (!TraverseStmt(static_cast<const PARENT *>(S1),                        \
+                      static_cast<const PARENT *>(S2)))                       \
+      return false;                                                            \
+    return IsStmtEquivalent(S1, S2);                                           \
+  }
+#include "clang/AST/StmtNodes.inc"
+
+public:
+  StmtComparer(StructuralEquivalenceContext &C) : Context(C) {}
+
+  /// Determine whether two statements are equivalent. The statements have to
+  /// be of the same kind. The children of the statements and their properties
+  /// are not compared by this function.
+  bool IsEquivalent(const Stmt *S1, const Stmt *S2) {
+    if (S1->getStmtClass() != S2->getStmtClass())
+      return false;
+
+    // Each TraverseStmt walks the class hierarchy from the leaf class to
+    // the root class 'Stmt' (e.g. 'BinaryOperator' -> 'Expr' -> 'Stmt'). Cast
+    // the Stmt we have here to its specific subclass so that we call the
+    // overload that walks the whole class hierarchy from leaf to root (e.g.,
+    // cast to 'BinaryOperator' so that 'Expr' and 'Stmt' are traversed).
+    switch (S1->getStmtClass()) {
+    case Stmt::NoStmtClass:
+      llvm_unreachable("Can't traverse NoStmtClass");
+#define STMT(CLASS, PARENT)                                                    \
+  case Stmt::StmtClass::CLASS##Class:                                          \
+    return TraverseStmt(static_cast<const CLASS *>(S1),                       \
+                        static_cast<const CLASS *>(S2));
+#define ABSTRACT_STMT(S)
+#include "clang/AST/StmtNodes.inc"
+    }
+    llvm_unreachable("Invalid statement kind");
+  }
+};
+} // namespace
+
+/// Determine structural equivalence of two statements.
+static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context,
+                                     const Stmt *S1, const Stmt *S2) {
+  if (!S1 || !S2)
+    return S1 == S2;
+
+  // Compare the statements themselves.
+  StmtComparer Comparer(Context);
+  if (!Comparer.IsEquivalent(S1, S2))
+    return false;
+
+  // Iterate over the children of both statements and also compare them.
+  for (auto Pair : zip_longest(S1->children(), S2->children())) {
+    Optional<const Stmt *> Child1 = std::get<0>(Pair);
+    Optional<const Stmt *> Child2 = std::get<1>(Pair);
+    // One of the statements has a different number of children than the other,
+    // so the statements can't be equivalent.
+    if (!Child1 || !Child2)
       return false;
-    if (!IsStructurallyEquivalent(Context, CastE1->getType(),
-                                  CastE2->getType()))
+    if (!IsStructurallyEquivalent(Context, *Child1, *Child2))
       return false;
-    return IsStructurallyEquivalent(Context, CastE1->getSubExpr(),
-                                    CastE2->getSubExpr());
   }
-  // FIXME: Handle other kind of expressions!
   return true;
 }
 
@@ -1790,6 +1993,15 @@ bool StructuralEquivalenceContext::IsEquivalent(QualType T1, QualType T2) {
   return !Finish();
 }
 
+bool StructuralEquivalenceContext::IsEquivalent(Stmt *S1, Stmt *S2) {
+  assert(DeclsToCheck.empty());
+  assert(VisitedDecls.empty());
+  if (!::IsStructurallyEquivalent(*this, S1, S2))
+    return false;
+
+  return !Finish();
+}
+
 bool StructuralEquivalenceContext::CheckCommonEquivalence(Decl *D1, Decl *D2) {
   // Check for equivalent described template.
   TemplateDecl *Template1 = D1->getDescribedTemplate();
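
Aside (self-contained analogue, not clang code): the STMT macro above expands to one TraverseStmt overload per statement class, each delegating to its parent class before comparing its own data. The same shape with two hypothetical classes:

    #include <cassert>

    struct Base { int Tag; };
    struct Derived : Base { int Extra; };

    static bool isEquivalent(const Base &A, const Base &B) {
      return A.Tag == B.Tag; // root layer compares only its own data
    }

    static bool isEquivalent(const Derived &A, const Derived &B) {
      if (!isEquivalent(static_cast<const Base &>(A),
                        static_cast<const Base &>(B)))
        return false;            // parent layer first
      return A.Extra == B.Extra; // then this layer's own data
    }

    int main() {
      Derived X{{1}, 2}, Y{{1}, 3};
      assert(!isEquivalent(X, Y)); // Tags match, Extras differ
    }
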
diff --git a/clang/lib/AST/CMakeLists.txt b/clang/lib/AST/CMakeLists.txt
index dfd26fd97bc6d..35099fd0dacf8 100644
--- a/clang/lib/AST/CMakeLists.txt
+++ b/clang/lib/AST/CMakeLists.txt
@@ -55,7 +55,6 @@ add_clang_library(clangAST
   ExternalASTMerger.cpp
   ExternalASTSource.cpp
   FormatString.cpp
-  IgnoreExpr.cpp
   InheritViz.cpp
   Interp/ByteCodeEmitter.cpp
   Interp/ByteCodeExprGen.cpp
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index 2a7017635b08c..e204ec8a77742 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -3167,44 +3167,24 @@ FunctionDecl *FunctionDecl::getCanonicalDecl() { return getFirstDecl(); }
 /// functions as their wrapped builtins. This shouldn't be done in general, but
 /// it's useful in Sema to diagnose calls to wrappers based on their semantics.
 unsigned FunctionDecl::getBuiltinID(bool ConsiderWrapperFunctions) const {
-  unsigned BuiltinID;
+  unsigned BuiltinID = 0;
 
   if (const auto *ABAA = getAttr<ArmBuiltinAliasAttr>()) {
     BuiltinID = ABAA->getBuiltinName()->getBuiltinID();
-  } else {
-    if (!getIdentifier())
-      return 0;
-
-    BuiltinID = getIdentifier()->getBuiltinID();
+  } else if (const auto *A = getAttr<BuiltinAttr>()) {
+    BuiltinID = A->getID();
   }
 
   if (!BuiltinID)
     return 0;
 
-  ASTContext &Context = getASTContext();
-  if (Context.getLangOpts().CPlusPlus) {
-    const auto *LinkageDecl =
-        dyn_cast<LinkageSpecDecl>(getFirstDecl()->getDeclContext());
-    // In C++, the first declaration of a builtin is always inside an implicit
-    // extern "C".
-    // FIXME: A recognised library function may not be directly in an extern "C"
-    // declaration, for instance "extern "C" { namespace std { decl } }".
-    if (!LinkageDecl) {
-      if (BuiltinID == Builtin::BI__GetExceptionInfo &&
-          Context.getTargetInfo().getCXXABI().isMicrosoft())
-        return Builtin::BI__GetExceptionInfo;
-      return 0;
-    }
-    if (LinkageDecl->getLanguage() != LinkageSpecDecl::lang_c)
-      return 0;
-  }
-
   // If the function is marked "overloadable", it has a different mangled name
   // and is not the C library function.
   if (!ConsiderWrapperFunctions && hasAttr<OverloadableAttr>() &&
       !hasAttr<ArmBuiltinAliasAttr>())
     return 0;
 
+  ASTContext &Context = getASTContext();
   if (!Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID))
     return BuiltinID;
 
@@ -4710,7 +4690,7 @@ char *Buffer = new (getASTContext(), 1) char[Name.size() + 1];
 void ValueDecl::anchor() {}
 
 bool ValueDecl::isWeak() const {
-  for (const auto *I : attrs())
+  for (const auto *I : getMostRecentDecl()->attrs())
     if (isa<WeakAttr>(I) || isa<WeakRefAttr>(I))
       return true;
 
diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp
index f4314d0bd9614..ab2b55c0762e7 100644
--- a/clang/lib/AST/DeclBase.cpp
+++ b/clang/lib/AST/DeclBase.cpp
@@ -720,7 +720,7 @@ bool Decl::isWeakImported() const {
   if (!canBeWeakImported(IsDefinition))
     return false;
 
-  for (const auto *A : attrs()) {
+  for (const auto *A : getMostRecentDecl()->attrs()) {
     if (isa<WeakImportAttr>(A))
       return true;
 
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index 15f3df0fd2168..b664224aa7323 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -1892,19 +1892,42 @@ const FieldDecl *CastExpr::getTargetFieldForToUnionCast(const RecordDecl *RD,
   return nullptr;
 }
 
+FPOptionsOverride *CastExpr::getTrailingFPFeatures() {
+  assert(hasStoredFPFeatures());
+  switch (getStmtClass()) {
+  case ImplicitCastExprClass:
+    return static_cast<ImplicitCastExpr *>(this)
+        ->getTrailingObjects<FPOptionsOverride>();
+  case CStyleCastExprClass:
+    return static_cast<CStyleCastExpr *>(this)
+        ->getTrailingObjects<FPOptionsOverride>();
+  case CXXFunctionalCastExprClass:
+    return static_cast<CXXFunctionalCastExpr *>(this)
+        ->getTrailingObjects<FPOptionsOverride>();
+  case CXXStaticCastExprClass:
+    return static_cast<CXXStaticCastExpr *>(this)
+        ->getTrailingObjects<FPOptionsOverride>();
+  default:
+    llvm_unreachable("Cast does not have FPFeatures");
+  }
+}
+
 ImplicitCastExpr *ImplicitCastExpr::Create(const ASTContext &C, QualType T,
                                            CastKind Kind, Expr *Operand,
                                            const CXXCastPath *BasePath,
-                                           ExprValueKind VK) {
+                                           ExprValueKind VK,
+                                           FPOptionsOverride FPO) {
   unsigned PathSize = (BasePath ? BasePath->size() : 0);
-  void *Buffer = C.Allocate(totalSizeToAlloc<CXXBaseSpecifier *>(PathSize));
+  void *Buffer =
+      C.Allocate(totalSizeToAlloc<CXXBaseSpecifier *, FPOptionsOverride>(
+          PathSize, FPO.requiresTrailingStorage()));
   // Per C++ [conv.lval]p3, lvalue-to-rvalue conversions on class and
   // std::nullptr_t have special semantics not captured by CK_LValueToRValue.
   assert((Kind != CK_LValueToRValue ||
           !(T->isNullPtrType() || T->getAsCXXRecordDecl())) &&
          "invalid type for lvalue-to-rvalue conversion");
   ImplicitCastExpr *E =
-    new (Buffer) ImplicitCastExpr(T, Kind, Operand, PathSize, VK);
+      new (Buffer) ImplicitCastExpr(T, Kind, Operand, PathSize, FPO, VK);
   if (PathSize)
     std::uninitialized_copy_n(BasePath->data(), BasePath->size(),
                               E->getTrailingObjects<CXXBaseSpecifier *>());
@@ -1912,21 +1935,26 @@ ImplicitCastExpr *ImplicitCastExpr::Create(const ASTContext &C, QualType T,
 }
 
 ImplicitCastExpr *ImplicitCastExpr::CreateEmpty(const ASTContext &C,
-                                                unsigned PathSize) {
-  void *Buffer = C.Allocate(totalSizeToAlloc<CXXBaseSpecifier *>(PathSize));
-  return new (Buffer) ImplicitCastExpr(EmptyShell(), PathSize);
+                                                unsigned PathSize,
+                                                bool HasFPFeatures) {
+  void *Buffer =
+      C.Allocate(totalSizeToAlloc<CXXBaseSpecifier *, FPOptionsOverride>(
+          PathSize, HasFPFeatures));
+  return new (Buffer) ImplicitCastExpr(EmptyShell(), PathSize, HasFPFeatures);
 }
 
-
 CStyleCastExpr *CStyleCastExpr::Create(const ASTContext &C, QualType T,
                                        ExprValueKind VK, CastKind K, Expr *Op,
                                        const CXXCastPath *BasePath,
+                                       FPOptionsOverride FPO,
                                        TypeSourceInfo *WrittenTy,
                                        SourceLocation L, SourceLocation R) {
   unsigned PathSize = (BasePath ? BasePath->size() : 0);
-  void *Buffer = C.Allocate(totalSizeToAlloc<CXXBaseSpecifier *>(PathSize));
+  void *Buffer =
+      C.Allocate(totalSizeToAlloc<CXXBaseSpecifier *, FPOptionsOverride>(
+          PathSize, FPO.requiresTrailingStorage()));
   CStyleCastExpr *E =
-    new (Buffer) CStyleCastExpr(T, VK, K, Op, PathSize, WrittenTy, L, R);
+      new (Buffer) CStyleCastExpr(T, VK, K, Op, PathSize, FPO, WrittenTy, L, R);
   if (PathSize)
     std::uninitialized_copy_n(BasePath->data(), BasePath->size(),
                               E->getTrailingObjects<CXXBaseSpecifier *>());
@@ -1934,9 +1962,12 @@ CStyleCastExpr *CStyleCastExpr::Create(const ASTContext &C, QualType T,
 }
 
 CStyleCastExpr *CStyleCastExpr::CreateEmpty(const ASTContext &C,
-                                            unsigned PathSize) {
-  void *Buffer = C.Allocate(totalSizeToAlloc<CXXBaseSpecifier *>(PathSize));
-  return new (Buffer) CStyleCastExpr(EmptyShell(), PathSize);
+                                            unsigned PathSize,
+                                            bool HasFPFeatures) {
+  void *Buffer =
+      C.Allocate(totalSizeToAlloc<CXXBaseSpecifier *, FPOptionsOverride>(
+          PathSize, HasFPFeatures));
+  return new (Buffer) CStyleCastExpr(EmptyShell(), PathSize, HasFPFeatures);
 }
 
 /// getOpcodeStr - Turn an Opcode enum value into the punctuation char it
diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp
index 3d61496f30e2a..1fd2b8e3b4e26 100644
--- a/clang/lib/AST/ExprCXX.cpp
+++ b/clang/lib/AST/ExprCXX.cpp
@@ -146,6 +146,18 @@ bool CXXTypeidExpr::isPotentiallyEvaluated() const {
   return false;
 }
 
+bool CXXTypeidExpr::isMostDerived(ASTContext &Context) const {
+  assert(!isTypeOperand() && "Cannot call isMostDerived for typeid(type)");
+  const Expr *E = getExprOperand()->IgnoreParenNoopCasts(Context);
+  if (const auto *DRE = dyn_cast<DeclRefExpr>(E)) {
+    QualType Ty = DRE->getDecl()->getType();
+    if (!Ty->isPointerType() && !Ty->isReferenceType())
+      return true;
+  }
+
+  return false;
+}
+
 QualType CXXTypeidExpr::getTypeOperand(ASTContext &Context) const {
   assert(isTypeOperand() && "Cannot call getTypeOperand for typeid(expr)");
   Qualifiers Quals;
@@ -690,19 +702,18 @@ const char *CXXNamedCastExpr::getCastName() const {
   }
 }
 
-CXXStaticCastExpr *CXXStaticCastExpr::Create(const ASTContext &C, QualType T,
-                                             ExprValueKind VK,
-                                             CastKind K, Expr *Op,
-                                             const CXXCastPath *BasePath,
-                                             TypeSourceInfo *WrittenTy,
-                                             SourceLocation L,
-                                             SourceLocation RParenLoc,
-                                             SourceRange AngleBrackets) {
+CXXStaticCastExpr *
+CXXStaticCastExpr::Create(const ASTContext &C, QualType T, ExprValueKind VK,
+                          CastKind K, Expr *Op, const CXXCastPath *BasePath,
+                          TypeSourceInfo *WrittenTy, FPOptionsOverride FPO,
+                          SourceLocation L, SourceLocation RParenLoc,
+                          SourceRange AngleBrackets) {
   unsigned PathSize = (BasePath ? BasePath->size() : 0);
-  void *Buffer = C.Allocate(totalSizeToAlloc<CXXBaseSpecifier *>(PathSize));
-  auto *E =
-      new (Buffer) CXXStaticCastExpr(T, VK, K, Op, PathSize, WrittenTy, L,
-                                     RParenLoc, AngleBrackets);
+  void *Buffer =
+      C.Allocate(totalSizeToAlloc<CXXBaseSpecifier *, FPOptionsOverride>(
+          PathSize, FPO.requiresTrailingStorage()));
+  auto *E = new (Buffer) CXXStaticCastExpr(T, VK, K, Op, PathSize, WrittenTy,
+                                           FPO, L, RParenLoc, AngleBrackets);
   if (PathSize)
     std::uninitialized_copy_n(BasePath->data(), BasePath->size(),
                               E->getTrailingObjects<CXXBaseSpecifier *>());
@@ -710,9 +721,12 @@ CXXStaticCastExpr *CXXStaticCastExpr::Create(const ASTContext &C, QualType T,
 }
 
 CXXStaticCastExpr *CXXStaticCastExpr::CreateEmpty(const ASTContext &C,
-                                                  unsigned PathSize) {
-  void *Buffer = C.Allocate(totalSizeToAlloc<CXXBaseSpecifier *>(PathSize));
-  return new (Buffer) CXXStaticCastExpr(EmptyShell(), PathSize);
+                                                  unsigned PathSize,
+                                                  bool HasFPFeatures) {
+  void *Buffer =
+      C.Allocate(totalSizeToAlloc<CXXBaseSpecifier *, FPOptionsOverride>(
+          PathSize, HasFPFeatures));
+  return new (Buffer) CXXStaticCastExpr(EmptyShell(), PathSize, HasFPFeatures);
 }
 
 CXXDynamicCastExpr *CXXDynamicCastExpr::Create(const ASTContext &C, QualType T,
@@ -823,25 +837,30 @@ CXXAddrspaceCastExpr *CXXAddrspaceCastExpr::CreateEmpty(const ASTContext &C) {
   return new (C) CXXAddrspaceCastExpr(EmptyShell());
 }
 
-CXXFunctionalCastExpr *
-CXXFunctionalCastExpr::Create(const ASTContext &C, QualType T, ExprValueKind VK,
-                              TypeSourceInfo *Written, CastKind K, Expr *Op,
-                              const CXXCastPath *BasePath,
-                              SourceLocation L, SourceLocation R) {
+CXXFunctionalCastExpr *CXXFunctionalCastExpr::Create(
+    const ASTContext &C, QualType T, ExprValueKind VK, TypeSourceInfo *Written,
+    CastKind K, Expr *Op, const CXXCastPath *BasePath, FPOptionsOverride FPO,
+    SourceLocation L, SourceLocation R) {
   unsigned PathSize = (BasePath ? BasePath->size() : 0);
-  void *Buffer = C.Allocate(totalSizeToAlloc<CXXBaseSpecifier *>(PathSize));
-  auto *E =
-      new (Buffer) CXXFunctionalCastExpr(T, VK, Written, K, Op, PathSize, L, R);
+  void *Buffer =
+      C.Allocate(totalSizeToAlloc<CXXBaseSpecifier *, FPOptionsOverride>(
+          PathSize, FPO.requiresTrailingStorage()));
+  auto *E = new (Buffer)
+      CXXFunctionalCastExpr(T, VK, Written, K, Op, PathSize, FPO, L, R);
   if (PathSize)
     std::uninitialized_copy_n(BasePath->data(), BasePath->size(),
                               E->getTrailingObjects<CXXBaseSpecifier *>());
   return E;
 }
 
-CXXFunctionalCastExpr *
-CXXFunctionalCastExpr::CreateEmpty(const ASTContext &C, unsigned PathSize) {
-  void *Buffer = C.Allocate(totalSizeToAlloc<CXXBaseSpecifier *>(PathSize));
-  return new (Buffer) CXXFunctionalCastExpr(EmptyShell(), PathSize);
+CXXFunctionalCastExpr *CXXFunctionalCastExpr::CreateEmpty(const ASTContext &C,
+                                                          unsigned PathSize,
+                                                          bool HasFPFeatures) {
+  void *Buffer =
+      C.Allocate(totalSizeToAlloc<CXXBaseSpecifier *, FPOptionsOverride>(
+          PathSize, HasFPFeatures));
+  return new (Buffer)
+      CXXFunctionalCastExpr(EmptyShell(), PathSize, HasFPFeatures);
 }
 
 SourceLocation CXXFunctionalCastExpr::getBeginLoc() const {
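
Aside (reduced sketch, assuming only llvm::TrailingObjects semantics): all four Create/CreateEmpty pairs above size their allocation so the FPOptionsOverride slot exists only when requiresTrailingStorage() is true. The hypothetical OptNode below shows the same recipe with a plain int payload:

    #include "llvm/Support/TrailingObjects.h"
    #include <new>

    class OptNode final : private llvm::TrailingObjects<OptNode, int> {
      friend class llvm::TrailingObjects<OptNode, int>;
      bool HasOpts;
      size_t numTrailingObjects(OverloadToken<int>) const {
        return HasOpts ? 1 : 0;
      }
      explicit OptNode(bool HasOpts) : HasOpts(HasOpts) {}

    public:
      static OptNode *create(bool HasOpts, int Opts = 0) {
        // Allocate room for the trailing payload only when it is present.
        void *Mem = ::operator new(totalSizeToAlloc<int>(HasOpts ? 1 : 0));
        auto *N = new (Mem) OptNode(HasOpts);
        if (HasOpts)
          *N->getTrailingObjects<int>() = Opts;
        return N;
      }
      int opts() const { return HasOpts ? *getTrailingObjects<int>() : 0; }
    };
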
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index b6083fdc16fcf..c06a7fb4cf6fa 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -1978,18 +1978,11 @@ static bool HasSameBase(const LValue &A, const LValue &B) {
     return false;
 
   if (A.getLValueBase().getOpaqueValue() !=
-      B.getLValueBase().getOpaqueValue()) {
-    const Decl *ADecl = GetLValueBaseDecl(A);
-    if (!ADecl)
-      return false;
-    const Decl *BDecl = GetLValueBaseDecl(B);
-    if (!BDecl || ADecl->getCanonicalDecl() != BDecl->getCanonicalDecl())
-      return false;
-  }
+      B.getLValueBase().getOpaqueValue())
+    return false;
 
-  return IsGlobalLValue(A.getLValueBase()) ||
-         (A.getLValueCallIndex() == B.getLValueCallIndex() &&
-          A.getLValueVersion() == B.getLValueVersion());
+  return A.getLValueCallIndex() == B.getLValueCallIndex() &&
+         A.getLValueVersion() == B.getLValueVersion();
 }
 
 static void NoteLValueLocation(EvalInfo &Info, APValue::LValueBase Base) {
@@ -3108,7 +3101,8 @@ static bool evaluateVarDeclInit(EvalInfo &Info, const Expr *E,
 
   // If we're currently evaluating the initializer of this declaration, use that
   // in-flight value.
-  if (Info.EvaluatingDecl.dyn_cast<const ValueDecl *>() == VD) {
+  if (declaresSameEntity(Info.EvaluatingDecl.dyn_cast<const ValueDecl *>(),
+                         VD)) {
     Result = Info.EvaluatingDeclValue;
     return true;
   }
diff --git a/clang/lib/AST/IgnoreExpr.cpp b/clang/lib/AST/IgnoreExpr.cpp
deleted file mode 100644
index 65aaaeb6a1ed0..0000000000000
--- a/clang/lib/AST/IgnoreExpr.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-//===--- IgnoreExpr.cpp - Ignore intermediate Expressions -----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements common functions to ignore intermediate expression nodes
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/AST/IgnoreExpr.h"
-#include "clang/AST/Expr.h"
-#include "clang/AST/ExprCXX.h"
-
-using namespace clang;
-
-Expr *clang::IgnoreImplicitCastsSingleStep(Expr *E) {
-  if (auto *ICE = dyn_cast<ImplicitCastExpr>(E))
-    return ICE->getSubExpr();
-
-  if (auto *FE = dyn_cast<FullExpr>(E))
-    return FE->getSubExpr();
-
-  return E;
-}
-
-Expr *clang::IgnoreImplicitCastsExtraSingleStep(Expr *E) {
-  // FIXME: Skip MaterializeTemporaryExpr and SubstNonTypeTemplateParmExpr in
-  // addition to what IgnoreImpCasts() skips to account for the current
-  // behaviour of IgnoreParenImpCasts().
-  Expr *SubE = IgnoreImplicitCastsSingleStep(E);
-  if (SubE != E)
-    return SubE;
-
-  if (auto *MTE = dyn_cast<MaterializeTemporaryExpr>(E))
-    return MTE->getSubExpr();
-
-  if (auto *NTTP = dyn_cast<SubstNonTypeTemplateParmExpr>(E))
-    return NTTP->getReplacement();
-
-  return E;
-}
-
-Expr *clang::IgnoreCastsSingleStep(Expr *E) {
-  if (auto *CE = dyn_cast<CastExpr>(E))
-    return CE->getSubExpr();
-
-  if (auto *FE = dyn_cast<FullExpr>(E))
-    return FE->getSubExpr();
-
-  if (auto *MTE = dyn_cast<MaterializeTemporaryExpr>(E))
-    return MTE->getSubExpr();
-
-  if (auto *NTTP = dyn_cast<SubstNonTypeTemplateParmExpr>(E))
-    return NTTP->getReplacement();
-
-  return E;
-}
-
-Expr *clang::IgnoreLValueCastsSingleStep(Expr *E) {
-  // Skip what IgnoreCastsSingleStep skips, except that only
-  // lvalue-to-rvalue casts are skipped.
-  if (auto *CE = dyn_cast<CastExpr>(E))
-    if (CE->getCastKind() != CK_LValueToRValue)
-      return E;
-
-  return IgnoreCastsSingleStep(E);
-}
-
-Expr *clang::IgnoreBaseCastsSingleStep(Expr *E) {
-  if (auto *CE = dyn_cast<CastExpr>(E))
-    if (CE->getCastKind() == CK_DerivedToBase ||
-        CE->getCastKind() == CK_UncheckedDerivedToBase ||
-        CE->getCastKind() == CK_NoOp)
-      return CE->getSubExpr();
-
-  return E;
-}
-
-Expr *clang::IgnoreImplicitSingleStep(Expr *E) {
-  Expr *SubE = IgnoreImplicitCastsSingleStep(E);
-  if (SubE != E)
-    return SubE;
-
-  if (auto *MTE = dyn_cast<MaterializeTemporaryExpr>(E))
-    return MTE->getSubExpr();
-
-  if (auto *BTE = dyn_cast<CXXBindTemporaryExpr>(E))
-    return BTE->getSubExpr();
-
-  return E;
-}
-
-Expr *clang::IgnoreImplicitAsWrittenSingleStep(Expr *E) {
-  if (auto *ICE = dyn_cast<ImplicitCastExpr>(E))
-    return ICE->getSubExprAsWritten();
-
-  return IgnoreImplicitSingleStep(E);
-}
-
-Expr *clang::IgnoreParensOnlySingleStep(Expr *E) {
-  if (auto *PE = dyn_cast<ParenExpr>(E))
-    return PE->getSubExpr();
-  return E;
-}
-
-Expr *clang::IgnoreParensSingleStep(Expr *E) {
-  if (auto *PE = dyn_cast<ParenExpr>(E))
-    return PE->getSubExpr();
-
-  if (auto *UO = dyn_cast<UnaryOperator>(E)) {
-    if (UO->getOpcode() == UO_Extension)
-      return UO->getSubExpr();
-  }
-
-  else if (auto *GSE = dyn_cast<GenericSelectionExpr>(E)) {
-    if (!GSE->isResultDependent())
-      return GSE->getResultExpr();
-  }
-
-  else if (auto *CE = dyn_cast<ChooseExpr>(E)) {
-    if (!CE->isConditionDependent())
-      return CE->getChosenSubExpr();
-  }
-
-  return E;
-}
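
Aside (sketch under stated assumptions): each deleted helper strips at most one wrapper and returns its argument unchanged otherwise, which is what lets callers compose them. A fixed-point driver in that style; the name ignoreExprNodes mirrors clang's, but this body is illustrative only and assumes at least one step is passed:

    #include "clang/AST/Expr.h"

    // Apply every step once per round until a whole round makes no progress.
    template <typename... FnTys>
    static clang::Expr *ignoreExprNodes(clang::Expr *E, FnTys &&...Fns) {
      clang::Expr *Last = nullptr;
      while (E != Last) {
        Last = E;
        clang::Expr *Round[] = {(E = Fns(E))...}; // evaluated left to right
        (void)Round;
      }
      return E;
    }
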
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 095278e7462d9..2627567f4c546 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -3283,7 +3283,7 @@ static StringRef mangleAArch64VectorBase(const BuiltinType *EltType) {
   case BuiltinType::Double:
     return "Float64";
   case BuiltinType::BFloat16:
-    return "BFloat16";
+    return "Bfloat16";
   default:
     llvm_unreachable("Unexpected vector element base type");
   }
@@ -3396,7 +3396,7 @@ void CXXNameMangler::mangleAArch64FixedSveVectorType(const VectorType *T) {
   case BuiltinType::ULong:
     TypeName = "__SVUint64_t";
     break;
-  case BuiltinType::Float16:
+  case BuiltinType::Half:
     TypeName = "__SVFloat16_t";
     break;
   case BuiltinType::Float:
diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index 2515af595eb1b..b7f80ec472e79 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -378,8 +378,10 @@ class MicrosoftCXXNameMangler {
   void mangleFunctionClass(const FunctionDecl *FD);
   void mangleCallingConvention(CallingConv CC);
   void mangleCallingConvention(const FunctionType *T);
-  void mangleIntegerLiteral(const llvm::APSInt &Number, bool IsBoolean);
-  void mangleExpression(const Expr *E);
+  void mangleIntegerLiteral(const llvm::APSInt &Number,
+                            const NonTypeTemplateParmDecl *PD = nullptr,
+                            QualType TemplateArgType = QualType());
+  void mangleExpression(const Expr *E, const NonTypeTemplateParmDecl *PD);
   void mangleThrowSpecification(const FunctionProtoType *T);
 
   void mangleTemplateArgs(const TemplateDecl *TD,
@@ -1357,24 +1359,36 @@ MicrosoftCXXNameMangler::mangleUnscopedTemplateName(const TemplateDecl *TD) {
   mangleUnqualifiedName(TD);
 }
 
-void MicrosoftCXXNameMangler::mangleIntegerLiteral(const llvm::APSInt &Value,
-                                                   bool IsBoolean) {
+void MicrosoftCXXNameMangler::mangleIntegerLiteral(
+    const llvm::APSInt &Value, const NonTypeTemplateParmDecl *PD,
+    QualType TemplateArgType) {
   // <integer-literal> ::= $0 <number>
-  Out << "$0";
-  // Make sure booleans are encoded as 0/1.
-  if (IsBoolean && Value.getBoolValue())
-    mangleNumber(1);
-  else if (Value.isSigned())
+  Out << "$";
+
+  // Since MSVC 2019, add 'M[<type>]' after '$' for auto template parameter when
+  // argument is integer.
+  if (getASTContext().getLangOpts().isCompatibleWithMSVC(
+          LangOptions::MSVC2019) &&
+      PD && PD->getType()->getTypeClass() == Type::Auto &&
+      !TemplateArgType.isNull()) {
+    Out << "M";
+    mangleType(TemplateArgType, SourceRange(), QMM_Drop);
+  }
+
+  Out << "0";
+
+  if (Value.isSigned())
     mangleNumber(Value.getSExtValue());
   else
     mangleNumber(Value.getZExtValue());
 }
 
-void MicrosoftCXXNameMangler::mangleExpression(const Expr *E) {
+void MicrosoftCXXNameMangler::mangleExpression(
+    const Expr *E, const NonTypeTemplateParmDecl *PD) {
   // See if this is a constant expression.
   if (Optional<llvm::APSInt> Value =
           E->getIntegerConstantExpr(Context.getASTContext())) {
-    mangleIntegerLiteral(*Value, E->getType()->isBooleanType());
+    mangleIntegerLiteral(*Value, PD, E->getType());
     return;
   }
 
@@ -1448,10 +1462,12 @@ void MicrosoftCXXNameMangler::mangleTemplateArg(const TemplateDecl *TD,
     }
     break;
   }
-  case TemplateArgument::Integral:
+  case TemplateArgument::Integral: {
+    QualType T = TA.getIntegralType();
     mangleIntegerLiteral(TA.getAsIntegral(),
-                         TA.getIntegralType()->isBooleanType());
+                         cast<NonTypeTemplateParmDecl>(Parm), T);
     break;
+  }
   case TemplateArgument::NullPtr: {
     QualType T = TA.getNullPtrType();
     if (const MemberPointerType *MPT = T->getAs<MemberPointerType>()) {
@@ -1473,16 +1489,18 @@ void MicrosoftCXXNameMangler::mangleTemplateArg(const TemplateDecl *TD,
         // However, we are free to use 0 *if* we would use multiple fields for
         // non-nullptr member pointers.
         if (!RD->nullFieldOffsetIsZero()) {
-          mangleIntegerLiteral(llvm::APSInt::get(-1), /*IsBoolean=*/false);
+          mangleIntegerLiteral(llvm::APSInt::get(-1),
+                               cast<NonTypeTemplateParmDecl>(Parm), T);
           return;
         }
       }
     }
-    mangleIntegerLiteral(llvm::APSInt::getUnsigned(0), /*IsBoolean=*/false);
+    mangleIntegerLiteral(llvm::APSInt::getUnsigned(0),
+                         cast<NonTypeTemplateParmDecl>(Parm), T);
     break;
   }
   case TemplateArgument::Expression:
-    mangleExpression(TA.getAsExpr());
+    mangleExpression(TA.getAsExpr(), cast<NonTypeTemplateParmDecl>(Parm));
     break;
   case TemplateArgument::Pack: {
     ArrayRef<TemplateArgument> TemplateArgs = TA.getPackAsArray();
@@ -1814,8 +1832,7 @@ void MicrosoftCXXNameMangler::mangleAddressSpaceType(QualType T,
   if (Context.getASTContext().addressSpaceMapManglingFor(AS)) {
     unsigned TargetAS = Context.getASTContext().getTargetAddressSpace(AS);
     Extra.mangleSourceName("_AS");
-    Extra.mangleIntegerLiteral(llvm::APSInt::getUnsigned(TargetAS),
-                               /*IsBoolean*/ false);
+    Extra.mangleIntegerLiteral(llvm::APSInt::getUnsigned(TargetAS));
   } else {
     switch (AS) {
     default:
@@ -2714,8 +2731,7 @@ void MicrosoftCXXNameMangler::mangleType(const VectorType *T, Qualifiers Quals,
     Stream << "?$";
     Extra.mangleSourceName("__vector");
     Extra.mangleType(QualType(ET, 0), Range, QMM_Escape);
-    Extra.mangleIntegerLiteral(llvm::APSInt::getUnsigned(T->getNumElements()),
-                               /*IsBoolean=*/false);
+    Extra.mangleIntegerLiteral(llvm::APSInt::getUnsigned(T->getNumElements()));
 
     mangleArtificialTagType(TTK_Union, TemplateMangling, {"__clang"});
   }
@@ -2954,7 +2970,7 @@ void MicrosoftCXXNameMangler::mangleType(const PipeType *T, Qualifiers,
   Stream << "?$";
   Extra.mangleSourceName("ocl_pipe");
   Extra.mangleType(ElementType, Range, QMM_Escape);
-  Extra.mangleIntegerLiteral(llvm::APSInt::get(T->isReadOnly()), true);
+  Extra.mangleIntegerLiteral(llvm::APSInt::get(T->isReadOnly()));
 
   mangleArtificialTagType(TTK_Struct, TemplateMangling, {"__clang"});
 }
@@ -2994,8 +3010,7 @@ void MicrosoftCXXNameMangler::mangleType(const ExtIntType *T, Qualifiers,
     Extra.mangleSourceName("_UExtInt");
   else
     Extra.mangleSourceName("_ExtInt");
-  Extra.mangleIntegerLiteral(llvm::APSInt::getUnsigned(T->getNumBits()),
-                             /*IsBoolean=*/false);
+  Extra.mangleIntegerLiteral(llvm::APSInt::getUnsigned(T->getNumBits()));
 
   mangleArtificialTagType(TTK_Struct, TemplateMangling, {"__clang"});
 }
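
Aside (hypothetical source, for orientation): the new PD/TemplateArgType parameters matter when an auto non-type template parameter is instantiated with an integer, e.g.:

    // Under MSVC 2019 compatibility, the '7' below is mangled as '$M' +
    // deduced type + '0' + value rather than plain '$0' + value.
    template <auto N> struct Wrap {};
    Wrap<7> W;
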
diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
index e846d325560d0..6590738268c60 100644
--- a/clang/lib/AST/OpenMPClause.cpp
+++ b/clang/lib/AST/OpenMPClause.cpp
@@ -2201,7 +2201,10 @@ void OMPTraitInfo::print(llvm::raw_ostream &OS,
 
       OS << "(";
       if (Selector.Kind == TraitSelector::user_condition) {
-        Selector.ScoreOrCondition->printPretty(OS, nullptr, Policy);
+        if (Selector.ScoreOrCondition)
+          Selector.ScoreOrCondition->printPretty(OS, nullptr, Policy);
+        else
+          OS << "...";
       } else {
 
         if (Selector.ScoreOrCondition) {
@@ -2278,7 +2281,7 @@ OMPTraitInfo::OMPTraitInfo(StringRef MangledName) {
         Property.RawString = PropRestPair.first;
         Property.Kind = getOpenMPContextTraitPropertyKind(
             Set.Kind, Selector.Kind, PropRestPair.first);
-        MangledName = PropRestPair.second;
+        MangledName = MangledName.drop_front(PropRestPair.first.size());
       } while (true);
     } while (true);
   } while (true);
diff --git a/clang/lib/AST/Stmt.cpp b/clang/lib/AST/Stmt.cpp
index 25078e7b00fae..bdfaf410131cc 100644
--- a/clang/lib/AST/Stmt.cpp
+++ b/clang/lib/AST/Stmt.cpp
@@ -13,11 +13,12 @@
 #include "clang/AST/Stmt.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTDiagnostic.h"
+#include "clang/AST/Attr.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/DeclGroup.h"
 #include "clang/AST/Expr.h"
-#include "clang/AST/ExprConcepts.h"
 #include "clang/AST/ExprCXX.h"
+#include "clang/AST/ExprConcepts.h"
 #include "clang/AST/ExprObjC.h"
 #include "clang/AST/ExprOpenMP.h"
 #include "clang/AST/StmtCXX.h"
@@ -41,8 +42,8 @@
 #include <cassert>
 #include <cstring>
 #include <string>
-#include <utility>
 #include <type_traits>
+#include <utility>
 
 using namespace clang;
 
@@ -129,6 +130,51 @@ void Stmt::EnableStatistics() {
   StatisticsEnabled = true;
 }
 
+static std::pair<Stmt::Likelihood, const Attr *> getLikelihood(const Stmt *S) {
+  if (const auto *AS = dyn_cast_or_null<AttributedStmt>(S))
+    for (const auto *A : AS->getAttrs()) {
+      if (isa<LikelyAttr>(A))
+        return std::make_pair(Stmt::LH_Likely, A);
+
+      if (isa<UnlikelyAttr>(A))
+        return std::make_pair(Stmt::LH_Unlikely, A);
+    }
+
+  return std::make_pair(Stmt::LH_None, nullptr);
+}
+
+Stmt::Likelihood Stmt::getLikelihood(const Stmt *S) {
+  return ::getLikelihood(S).first;
+}
+
+Stmt::Likelihood Stmt::getLikelihood(const Stmt *Then, const Stmt *Else) {
+  Likelihood LHT = ::getLikelihood(Then).first;
+  Likelihood LHE = ::getLikelihood(Else).first;
+  if (LHE == LH_None)
+    return LHT;
+
+  // If the same attribute is used on both branches there's a conflict.
+  if (LHT == LHE)
+    return LH_None;
+
+  if (LHT != LH_None)
+    return LHT;
+
+  // Invert the value of Else to get the value for Then.
+  return LHE == LH_Likely ? LH_Unlikely : LH_Likely;
+}
+
+std::tuple<bool, const Attr *, const Attr *>
+Stmt::determineLikelihoodConflict(const Stmt *Then, const Stmt *Else) {
+  std::pair<Likelihood, const Attr *> LHT = ::getLikelihood(Then);
+  std::pair<Likelihood, const Attr *> LHE = ::getLikelihood(Else);
+  // If the same attribute is used on both branches there's a conflict.
+  if (LHT.first != LH_None && LHT.first == LHE.first)
+    return std::make_tuple(true, LHT.second, LHE.second);
+
+  return std::make_tuple(false, nullptr, nullptr);
+}
+
 /// Skip no-op (attributed, compound) container stmts and skip captured
 /// stmt at the top, if \a IgnoreCaptured is true.
 Stmt *Stmt::IgnoreContainers(bool IgnoreCaptured) {
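
Aside (minimal sketch of the inputs): the likelihood helpers above classify branch substatements carrying the C++20 [[likely]]/[[unlikely]] attributes, e.g.:

    int classify(int X) {
      if (X > 0) [[likely]]   // getLikelihood(Then) yields LH_Likely
        return 1;
      else [[unlikely]]       // the inverse attribute here is consistent;
        return -1;            // the same one on both arms is a conflict
    }
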
diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp
index 16c4c3736a4a3..acbc0434931dc 100644
--- a/clang/lib/AST/TextNodeDumper.cpp
+++ b/clang/lib/AST/TextNodeDumper.cpp
@@ -964,6 +964,8 @@ void TextNodeDumper::VisitCastExpr(const CastExpr *Node) {
   }
   dumpBasePath(OS, Node);
   OS << ">";
+  if (Node->hasStoredFPFeatures())
+    printFPOptions(Node->getFPFeatures());
 }
 
 void TextNodeDumper::VisitImplicitCastExpr(const ImplicitCastExpr *Node) {
@@ -1132,6 +1134,14 @@ void TextNodeDumper::VisitCXXFunctionalCastExpr(
     const CXXFunctionalCastExpr *Node) {
   OS << " functional cast to " << Node->getTypeAsWritten().getAsString() << " <"
      << Node->getCastKindName() << ">";
+  if (Node->hasStoredFPFeatures())
+    printFPOptions(Node->getFPFeatures());
+}
+
+void TextNodeDumper::VisitCXXStaticCastExpr(const CXXStaticCastExpr *Node) {
+  VisitCXXNamedCastExpr(Node);
+  if (Node->hasStoredFPFeatures())
+    printFPOptions(Node->getFPFeatures());
 }
 
 void TextNodeDumper::VisitCXXUnresolvedConstructExpr(
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 3754a515f115a..8582284cbca63 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2317,38 +2317,13 @@ QualType Type::getSveEltType(const ASTContext &Ctx) const {
   assert(isVLSTBuiltinType() && "unsupported type!");
 
   const BuiltinType *BTy = getAs<BuiltinType>();
-  switch (BTy->getKind()) {
-  default:
-    llvm_unreachable("Unknown builtin SVE type!");
-  case BuiltinType::SveInt8:
-    return Ctx.SignedCharTy;
-  case BuiltinType::SveUint8:
-  case BuiltinType::SveBool:
+  if (BTy->getKind() == BuiltinType::SveBool)
     // Represent predicates as i8 rather than i1 to avoid any layout issues.
     // The type is bitcasted to a scalable predicate type when casting between
     // scalable and fixed-length vectors.
     return Ctx.UnsignedCharTy;
-  case BuiltinType::SveInt16:
-    return Ctx.ShortTy;
-  case BuiltinType::SveUint16:
-    return Ctx.UnsignedShortTy;
-  case BuiltinType::SveInt32:
-    return Ctx.IntTy;
-  case BuiltinType::SveUint32:
-    return Ctx.UnsignedIntTy;
-  case BuiltinType::SveInt64:
-    return Ctx.LongTy;
-  case BuiltinType::SveUint64:
-    return Ctx.UnsignedLongTy;
-  case BuiltinType::SveFloat16:
-    return Ctx.Float16Ty;
-  case BuiltinType::SveBFloat16:
-    return Ctx.BFloat16Ty;
-  case BuiltinType::SveFloat32:
-    return Ctx.FloatTy;
-  case BuiltinType::SveFloat64:
-    return Ctx.DoubleTy;
-  }
+  else
+    return Ctx.getBuiltinVectorTypeInfo(BTy).ElementType;
 }
 
 bool QualType::isPODType(const ASTContext &Context) const {
diff --git a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
index 6b17bd0cda0b3..4e4e43b2a94a6 100644
--- a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
+++ b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
 const internal::VariadicDynCastAllOfMatcher<Decl, TypeAliasDecl> typeAliasDecl;
 const internal::VariadicDynCastAllOfMatcher<Decl, TypeAliasTemplateDecl>
     typeAliasTemplateDecl;
 const internal::VariadicAllOfMatcher<Decl> decl;
+const internal::VariadicAllOfMatcher<DecompositionDecl> decompositionDecl;
 const internal::VariadicDynCastAllOfMatcher<Decl, LinkageSpecDecl>
     linkageSpecDecl;
 const internal::VariadicDynCastAllOfMatcher<Decl, NamedDecl> namedDecl;
diff --git a/clang/lib/ASTMatchers/Dynamic/Registry.cpp b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
index 058dab3333de1..8e62dce4fab52 100644
--- a/clang/lib/ASTMatchers/Dynamic/Registry.cpp
+++ b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
@@ -202,6 +202,7 @@ RegistryMaps::RegistryMaps() {
   REGISTER_MATCHER(cxxUnresolvedConstructExpr);
   REGISTER_MATCHER(decayedType);
   REGISTER_MATCHER(decl);
+  REGISTER_MATCHER(decompositionDecl);
   REGISTER_MATCHER(declCountIs);
   REGISTER_MATCHER(declRefExpr);
   REGISTER_MATCHER(declStmt);
@@ -227,6 +228,7 @@ RegistryMaps::RegistryMaps() {
   REGISTER_MATCHER(floatLiteral);
   REGISTER_MATCHER(forEach);
   REGISTER_MATCHER(forEachArgumentWithParam);
+  REGISTER_MATCHER(forEachArgumentWithParamType);
   REGISTER_MATCHER(forEachConstructorInitializer);
   REGISTER_MATCHER(forEachDescendant);
   REGISTER_MATCHER(forEachOverridden);
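
Aside (usage sketch, assuming only the matcher registered above): structured bindings can now be matched both from C++ and from dynamic tools such as clang-query:

    #include "clang/ASTMatchers/ASTMatchers.h"

    using namespace clang::ast_matchers;

    // Matches e.g. 'auto [A, B] = SomePair;'. The "decomp" binding name is
    // illustrative.
    static const auto DecompM = decompositionDecl().bind("decomp");
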
diff --git a/clang/lib/Analysis/BodyFarm.cpp b/clang/lib/Analysis/BodyFarm.cpp
index f68b06487f98e..603da67156254 100644
--- a/clang/lib/Analysis/BodyFarm.cpp
+++ b/clang/lib/Analysis/BodyFarm.cpp
@@ -166,23 +166,21 @@ ASTMaker::makeLvalueToRvalue(const VarDecl *Arg,
 ImplicitCastExpr *ASTMaker::makeImplicitCast(const Expr *Arg, QualType Ty,
                                              CastKind CK) {
   return ImplicitCastExpr::Create(C, Ty,
-                                  /* CastKind=*/ CK,
-                                  /* Expr=*/ const_cast<Expr *>(Arg),
-                                  /* CXXCastPath=*/ nullptr,
-                                  /* ExprValueKind=*/ VK_RValue);
+                                  /* CastKind=*/CK,
+                                  /* Expr=*/const_cast<Expr *>(Arg),
+                                  /* CXXCastPath=*/nullptr,
+                                  /* ExprValueKind=*/VK_RValue,
+                                  /* FPFeatures */ FPOptionsOverride());
 }
 
 Expr *ASTMaker::makeIntegralCast(const Expr *Arg, QualType Ty) {
   if (Arg->getType() == Ty)
     return const_cast<Expr *>(Arg);
-
-  return ImplicitCastExpr::Create(C, Ty, CK_IntegralCast,
-                                  const_cast<Expr *>(Arg), nullptr, VK_RValue);
+  return makeImplicitCast(Arg, Ty, CK_IntegralCast);
 }
 
 ImplicitCastExpr *ASTMaker::makeIntegralCastToBoolean(const Expr *Arg) {
-  return ImplicitCastExpr::Create(C, C.BoolTy, CK_IntegralToBoolean,
-                                  const_cast<Expr *>(Arg), nullptr, VK_RValue);
+  return makeImplicitCast(Arg, C.BoolTy, CK_IntegralToBoolean);
 }
 
 ObjCBoolLiteralExpr *ASTMaker::makeObjCBool(bool Val) {
diff --git a/clang/lib/Analysis/LiveVariables.cpp b/clang/lib/Analysis/LiveVariables.cpp
index d24c40b457b4b..8cdc4cc5bd613 100644
--- a/clang/lib/Analysis/LiveVariables.cpp
+++ b/clang/lib/Analysis/LiveVariables.cpp
@@ -27,7 +27,7 @@ namespace {
 class LiveVariablesImpl {
 public:
   AnalysisDeclContext &analysisContext;
-  llvm::ImmutableSet<const Stmt *>::Factory SSetFact;
+  llvm::ImmutableSet<const Expr *>::Factory ESetFact;
   llvm::ImmutableSet<const VarDecl *>::Factory DSetFact;
   llvm::ImmutableSet<const BindingDecl *>::Factory BSetFact;
   llvm::DenseMap<const CFGBlock *, LiveVariables::LivenessValues> blocksEndToLiveness;
@@ -45,16 +45,15 @@ class LiveVariablesImpl {
              LiveVariables::Observer *obs = nullptr);
 
   void dumpBlockLiveness(const SourceManager& M);
-  void dumpStmtLiveness(const SourceManager& M);
+  void dumpExprLiveness(const SourceManager& M);
 
   LiveVariablesImpl(AnalysisDeclContext &ac, bool KillAtAssign)
-    : analysisContext(ac),
-      SSetFact(false), // Do not canonicalize ImmutableSets by default.
-      DSetFact(false), // This is a *major* performance win.
-      BSetFact(false),
-      killAtAssign(KillAtAssign) {}
+      : analysisContext(ac),
+        ESetFact(false), // Do not canonicalize ImmutableSets by default.
+        DSetFact(false), // This is a *major* performance win.
+        BSetFact(false), killAtAssign(KillAtAssign) {}
 };
-}
+} // namespace
 
 static LiveVariablesImpl &getImpl(void *x) {
   return *((LiveVariablesImpl *) x);
@@ -64,8 +63,8 @@ static LiveVariablesImpl &getImpl(void *x) {
 // Operations and queries on LivenessValues.
 //===----------------------------------------------------------------------===//
 
-bool LiveVariables::LivenessValues::isLive(const Stmt *S) const {
-  return liveStmts.contains(S);
+bool LiveVariables::LivenessValues::isLive(const Expr *E) const {
+  return liveExprs.contains(E);
 }
 
 bool LiveVariables::LivenessValues::isLive(const VarDecl *D) const {
@@ -97,10 +96,10 @@ LiveVariables::LivenessValues
 LiveVariablesImpl::merge(LiveVariables::LivenessValues valsA,
                          LiveVariables::LivenessValues valsB) {
 
-  llvm::ImmutableSetRef<const Stmt *>
-    SSetRefA(valsA.liveStmts.getRootWithoutRetain(), SSetFact.getTreeFactory()),
-    SSetRefB(valsB.liveStmts.getRootWithoutRetain(), SSetFact.getTreeFactory());
-
+  llvm::ImmutableSetRef<const Expr *> SSetRefA(
+      valsA.liveExprs.getRootWithoutRetain(), ESetFact.getTreeFactory()),
+      SSetRefB(valsB.liveExprs.getRootWithoutRetain(),
+               ESetFact.getTreeFactory());
 
   llvm::ImmutableSetRef<const VarDecl *>
     DSetRefA(valsA.liveDecls.getRootWithoutRetain(), DSetFact.getTreeFactory()),
@@ -122,7 +121,7 @@ LiveVariablesImpl::merge(LiveVariables::LivenessValues valsA,
 }
 
 bool LiveVariables::LivenessValues::equals(const LivenessValues &V) const {
-  return liveStmts == V.liveStmts && liveDecls == V.liveDecls;
+  return liveExprs == V.liveExprs && liveDecls == V.liveDecls;
 }
 
 //===----------------------------------------------------------------------===//
@@ -141,8 +140,8 @@ bool LiveVariables::isLive(const Stmt *S, const VarDecl *D) {
   return isAlwaysAlive(D) || getImpl(impl).stmtsToLiveness[S].isLive(D);
 }
 
-bool LiveVariables::isLive(const Stmt *Loc, const Stmt *S) {
-  return getImpl(impl).stmtsToLiveness[Loc].isLive(S);
+bool LiveVariables::isLive(const Stmt *Loc, const Expr *Val) {
+  return getImpl(impl).stmtsToLiveness[Loc].isLive(Val);
 }
 
 //===----------------------------------------------------------------------===//
@@ -186,27 +185,27 @@ static const VariableArrayType *FindVA(QualType Ty) {
   return nullptr;
 }
 
-static const Stmt *LookThroughStmt(const Stmt *S) {
-  while (S) {
-    if (const Expr *Ex = dyn_cast<Expr>(S))
-      S = Ex->IgnoreParens();
-    if (const FullExpr *FE = dyn_cast<FullExpr>(S)) {
-      S = FE->getSubExpr();
+static const Expr *LookThroughExpr(const Expr *E) {
+  while (E) {
+    if (const Expr *Ex = dyn_cast<Expr>(E))
+      E = Ex->IgnoreParens();
+    if (const FullExpr *FE = dyn_cast<FullExpr>(E)) {
+      E = FE->getSubExpr();
       continue;
     }
-    if (const OpaqueValueExpr *OVE = dyn_cast<OpaqueValueExpr>(S)) {
-      S = OVE->getSourceExpr();
+    if (const OpaqueValueExpr *OVE = dyn_cast<OpaqueValueExpr>(E)) {
+      E = OVE->getSourceExpr();
       continue;
     }
     break;
   }
-  return S;
+  return E;
 }
 
-static void AddLiveStmt(llvm::ImmutableSet<const Stmt *> &Set,
-                        llvm::ImmutableSet<const Stmt *>::Factory &F,
-                        const Stmt *S) {
-  Set = F.add(Set, LookThroughStmt(S));
+static void AddLiveExpr(llvm::ImmutableSet<const Expr *> &Set,
+                        llvm::ImmutableSet<const Expr *>::Factory &F,
+                        const Expr *E) {
+  Set = F.add(Set, LookThroughExpr(E));
 }
 
 void TransferFunctions::Visit(Stmt *S) {
@@ -215,8 +214,8 @@ void TransferFunctions::Visit(Stmt *S) {
 
   StmtVisitor<TransferFunctions>::Visit(S);
 
-  if (isa<Expr>(S)) {
-    val.liveStmts = LV.SSetFact.remove(val.liveStmts, S);
+  if (const auto *E = dyn_cast<Expr>(S)) {
+    val.liveExprs = LV.ESetFact.remove(val.liveExprs, E);
   }
 
   // Mark all children expressions live.
@@ -233,7 +232,7 @@ void TransferFunctions::Visit(Stmt *S) {
       // Include the implicit "this" pointer as being live.
       CXXMemberCallExpr *CE = cast<CXXMemberCallExpr>(S);
       if (Expr *ImplicitObj = CE->getImplicitObjectArgument()) {
-        AddLiveStmt(val.liveStmts, LV.SSetFact, ImplicitObj);
+        AddLiveExpr(val.liveExprs, LV.ESetFact, ImplicitObj);
       }
       break;
     }
@@ -250,7 +249,7 @@ void TransferFunctions::Visit(Stmt *S) {
       if (const VarDecl *VD = dyn_cast<VarDecl>(DS->getSingleDecl())) {
         for (const VariableArrayType* VA = FindVA(VD->getType());
              VA != nullptr; VA = FindVA(VA->getElementType())) {
-          AddLiveStmt(val.liveStmts, LV.SSetFact, VA->getSizeExpr());
+          AddLiveExpr(val.liveExprs, LV.ESetFact, VA->getSizeExpr());
         }
       }
       break;
@@ -263,7 +262,7 @@ void TransferFunctions::Visit(Stmt *S) {
       if (OpaqueValueExpr *OV = dyn_cast<OpaqueValueExpr>(child))
         child = OV->getSourceExpr();
       child = child->IgnoreParens();
-      val.liveStmts = LV.SSetFact.add(val.liveStmts, child);
+      val.liveExprs = LV.ESetFact.add(val.liveExprs, child);
       return;
     }
 
@@ -284,36 +283,39 @@ void TransferFunctions::Visit(Stmt *S) {
       // If one of the branches is an expression rather than a compound
       // statement, it will be bad if we mark it as live at the terminator
       // of the if-statement (i.e., immediately after the condition expression).
-      AddLiveStmt(val.liveStmts, LV.SSetFact, cast<IfStmt>(S)->getCond());
+      AddLiveExpr(val.liveExprs, LV.ESetFact, cast<IfStmt>(S)->getCond());
       return;
     }
     case Stmt::WhileStmtClass: {
       // If the loop body is an expression rather than a compound statement,
       // it will be bad if we mark it as live at the terminator of the loop
       // (i.e., immediately after the condition expression).
-      AddLiveStmt(val.liveStmts, LV.SSetFact, cast<WhileStmt>(S)->getCond());
+      AddLiveExpr(val.liveExprs, LV.ESetFact, cast<WhileStmt>(S)->getCond());
       return;
     }
     case Stmt::DoStmtClass: {
       // If the loop body is an expression rather than a compound statement,
       // it will be bad if we mark it as live at the terminator of the loop
       // (i.e., immediately after the condition expression).
-      AddLiveStmt(val.liveStmts, LV.SSetFact, cast<DoStmt>(S)->getCond());
+      AddLiveExpr(val.liveExprs, LV.ESetFact, cast<DoStmt>(S)->getCond());
       return;
     }
     case Stmt::ForStmtClass: {
       // If the loop body is an expression rather than a compound statement,
       // it will be bad if we mark it as live at the terminator of the loop
       // (i.e., immediately after the condition expression).
-      AddLiveStmt(val.liveStmts, LV.SSetFact, cast<ForStmt>(S)->getCond());
+      AddLiveExpr(val.liveExprs, LV.ESetFact, cast<ForStmt>(S)->getCond());
       return;
     }
 
   }
 
+  // HACK + FIXME: What is this? One could only guess that this is an attempt to
+  // fish for live values, for example, arguments from a call expression.
+  // Maybe we could take inspiration from UninitializedVariable analysis?
   for (Stmt *Child : S->children()) {
-    if (Child)
-      AddLiveStmt(val.liveStmts, LV.SSetFact, Child);
+    if (const auto *E = dyn_cast_or_null<Expr>(Child))
+      AddLiveExpr(val.liveExprs, LV.ESetFact, E);
   }
 }
 
@@ -416,7 +418,7 @@ VisitUnaryExprOrTypeTraitExpr(UnaryExprOrTypeTraitExpr *UE)
   const Expr *subEx = UE->getArgumentExpr();
   if (subEx->getType()->isVariableArrayType()) {
     assert(subEx->isLValue());
-    val.liveStmts = LV.SSetFact.add(val.liveStmts, subEx->IgnoreParens());
+    val.liveExprs = LV.ESetFact.add(val.liveExprs, subEx->IgnoreParens());
   }
 }
 
@@ -613,19 +615,19 @@ void LiveVariablesImpl::dumpBlockLiveness(const SourceManager &M) {
   llvm::errs() << "\n";
 }
 
-void LiveVariables::dumpStmtLiveness(const SourceManager &M) {
-  getImpl(impl).dumpStmtLiveness(M);
+void LiveVariables::dumpExprLiveness(const SourceManager &M) {
+  getImpl(impl).dumpExprLiveness(M);
 }
 
-void LiveVariablesImpl::dumpStmtLiveness(const SourceManager &M) {
+void LiveVariablesImpl::dumpExprLiveness(const SourceManager &M) {
   // Don't iterate over blockEndsToLiveness directly because it's not sorted.
-  for (auto I : *analysisContext.getCFG()) {
+  for (const CFGBlock *B : *analysisContext.getCFG()) {
 
-    llvm::errs() << "\n[ B" << I->getBlockID()
-                 << " (live statements at block exit) ]\n";
-    for (auto S : blocksEndToLiveness[I].liveStmts) {
+    llvm::errs() << "\n[ B" << B->getBlockID()
+                 << " (live expressions at block exit) ]\n";
+    for (const Expr *E : blocksEndToLiveness[B].liveExprs) {
       llvm::errs() << "\n";
-      S->dump();
+      E->dump();
     }
     llvm::errs() << "\n";
   }
diff --git a/clang/lib/Analysis/ThreadSafety.cpp b/clang/lib/Analysis/ThreadSafety.cpp
index 5b97265a6d8ae..64e0da9e64b12 100644
--- a/clang/lib/Analysis/ThreadSafety.cpp
+++ b/clang/lib/Analysis/ThreadSafety.cpp
@@ -1266,21 +1266,13 @@ ClassifyDiagnostic(const AttrTy *A) {
 }
 
 bool ThreadSafetyAnalyzer::inCurrentScope(const CapabilityExpr &CapE) {
-  const threadSafety::til::SExpr *SExp = CapE.sexpr();
-  assert(SExp && "Null expressions should be ignored");
-
-  // Global variables are always in scope.
-  if (isa<til::LiteralPtr>(SExp))
-    return true;
-
-  // Members are in scope from methods of the same class.
-  if (const auto *P = dyn_cast<til::Project>(SExp)) {
-    if (!CurrentMethod)
+  if (!CurrentMethod)
       return false;
-    const ValueDecl *VD = P->clangDecl();
-    return VD->getDeclContext() == CurrentMethod->getDeclContext();
+  if (const auto *P = dyn_cast_or_null<til::Project>(CapE.sexpr())) {
+    const auto *VD = P->clangDecl();
+    if (VD)
+      return VD->getDeclContext() == CurrentMethod->getDeclContext();
   }
-
   return false;
 }
 
diff --git a/clang/lib/Analysis/ThreadSafetyCommon.cpp b/clang/lib/Analysis/ThreadSafetyCommon.cpp
index aee9185760071..1b8c55e56d470 100644
--- a/clang/lib/Analysis/ThreadSafetyCommon.cpp
+++ b/clang/lib/Analysis/ThreadSafetyCommon.cpp
@@ -274,7 +274,7 @@ til::SExpr *SExprBuilder::translateDeclRefExpr(const DeclRefExpr *DRE,
   const auto *VD = cast<ValueDecl>(DRE->getDecl()->getCanonicalDecl());
 
   // Function parameters require substitution and/or renaming.
-  if (const auto *PV = dyn_cast<ParmVarDecl>(VD)) {
+  if (const auto *PV = dyn_cast_or_null<ParmVarDecl>(VD)) {
     unsigned I = PV->getFunctionScopeIndex();
     const DeclContext *D = PV->getDeclContext();
     if (Ctx && Ctx->FunArgs) {
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index 709185707bd9c..2abbe3e81e0a2 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -84,7 +84,7 @@ CudaArchToStringMap arch_names[] = {
     GFX(810), // stoney
     GFX(900), // vega, instinct
     GFX(902), GFX(904), GFX(906), GFX(908), GFX(909),
-    GFX(1010), GFX(1011), GFX(1012),
+    GFX(1010), GFX(1011), GFX(1012), GFX(1030), GFX(1031)
     // clang-format on
 };
 #undef SM
diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp
index 0a76c78cd44fb..0f194403bf04a 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -1936,6 +1936,11 @@ SourceManager::getMacroArgExpandedLocation(SourceLocation Loc) const {
 
   assert(!MacroArgsCache->empty());
   MacroArgsMap::iterator I = MacroArgsCache->upper_bound(Offset);
+  // In case every element in MacroArgsCache is greater than Offset, we can't
+  // decrement the iterator.
+  if (I == MacroArgsCache->begin())
+    return Loc;
+
   --I;
 
   unsigned MacroArgBeginOffs = I->first;
diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h
index 9c206fc7e6a42..0c06ac3cd0350 100644
--- a/clang/lib/Basic/Targets/OSTargets.h
+++ b/clang/lib/Basic/Targets/OSTargets.h
@@ -770,6 +770,8 @@ class LLVM_LIBRARY_VISIBILITY ZOSTargetInfo : public OSTargetInfo<Target> {
       // type is not declared as a typedef in system headers.
       Builder.defineMacro("__wchar_t");
     }
+
+    this->PlatformName = llvm::Triple::getOSTypeName(Triple.getOS());
   }
 
 public:
diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h
index bca06a7a802dd..ec067d8811fc6 100644
--- a/clang/lib/Basic/Targets/PPC.h
+++ b/clang/lib/Basic/Targets/PPC.h
@@ -82,6 +82,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
     SimdDefaultAlign = 128;
     LongDoubleWidth = LongDoubleAlign = 128;
     LongDoubleFormat = &llvm::APFloat::PPCDoubleDouble();
+    HasStrictFP = true;
   }
 
   // Set the language option for altivec based on our value.
diff --git a/clang/lib/Basic/Targets/Sparc.cpp b/clang/lib/Basic/Targets/Sparc.cpp
index 48f36c5ba1c63..5eeb77406c342 100644
--- a/clang/lib/Basic/Targets/Sparc.cpp
+++ b/clang/lib/Basic/Targets/Sparc.cpp
@@ -147,19 +147,20 @@ void SparcTargetInfo::getTargetDefines(const LangOptions &Opts,
 void SparcV8TargetInfo::getTargetDefines(const LangOptions &Opts,
                                          MacroBuilder &Builder) const {
   SparcTargetInfo::getTargetDefines(Opts, Builder);
-  switch (getCPUGeneration(CPU)) {
-  case CG_V8:
+  if (getTriple().getOS() == llvm::Triple::Solaris)
     Builder.defineMacro("__sparcv8");
-    if (getTriple().getOS() != llvm::Triple::Solaris)
+  else {
+    switch (getCPUGeneration(CPU)) {
+    case CG_V8:
+      Builder.defineMacro("__sparcv8");
       Builder.defineMacro("__sparcv8__");
-    break;
-  case CG_V9:
-    Builder.defineMacro("__sparcv9");
-    if (getTriple().getOS() != llvm::Triple::Solaris) {
+      break;
+    case CG_V9:
+      Builder.defineMacro("__sparcv9");
       Builder.defineMacro("__sparcv9__");
       Builder.defineMacro("__sparc_v9__");
+      break;
     }
-    break;
   }
   if (getTriple().getVendor() == llvm::Triple::Myriad) {
     std::string MyriadArchValue, Myriad2Value;
@@ -227,6 +228,12 @@ void SparcV8TargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__myriad2__", Myriad2Value);
     Builder.defineMacro("__myriad2", Myriad2Value);
   }
+  if (getCPUGeneration(CPU) == CG_V9) {
+    Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1");
+    Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2");
+    Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4");
+    Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8");
+  }
 }
 
 void SparcV9TargetInfo::getTargetDefines(const LangOptions &Opts,
diff --git a/clang/lib/Basic/Targets/Sparc.h b/clang/lib/Basic/Targets/Sparc.h
index d24cf15d7cd65..07844abafe11b 100644
--- a/clang/lib/Basic/Targets/Sparc.h
+++ b/clang/lib/Basic/Targets/Sparc.h
@@ -166,10 +166,15 @@ class LLVM_LIBRARY_VISIBILITY SparcV8TargetInfo : public SparcTargetInfo {
       PtrDiffType = SignedLong;
       break;
     }
-    // Up to 32 bits are lock-free atomic, but we're willing to do atomic ops
-    // on up to 64 bits.
+    // Up to 32 bits (V8) or 64 bits (V9) are lock-free atomic, but we're
+    // willing to do atomic ops on up to 64 bits.
     MaxAtomicPromoteWidth = 64;
-    MaxAtomicInlineWidth = 32;
+    if (getCPUGeneration(CPU) == CG_V9)
+      MaxAtomicInlineWidth = 64;
+    else
+      // FIXME: This isn't correct for plain V8 which lacks CAS,
+      // only for LEON 3+ and Myriad.
+      MaxAtomicInlineWidth = 32;
   }
 
   void getTargetDefines(const LangOptions &Opts,
diff --git a/clang/lib/CMakeLists.txt b/clang/lib/CMakeLists.txt
index 23082789ff9a2..1068288100fd6 100644
--- a/clang/lib/CMakeLists.txt
+++ b/clang/lib/CMakeLists.txt
@@ -21,8 +21,6 @@ add_subdirectory(Tooling)
 add_subdirectory(DirectoryWatcher)
 add_subdirectory(Index)
 add_subdirectory(IndexSerialization)
-if(CLANG_ENABLE_STATIC_ANALYZER)
-  add_subdirectory(StaticAnalyzer)
-endif()
+add_subdirectory(StaticAnalyzer)
 add_subdirectory(Format)
 add_subdirectory(Testing)
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 1e3f264823f34..2eaee8274500d 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -69,8 +69,8 @@
 #include "llvm/Transforms/Instrumentation/BoundsChecking.h"
 #include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
 #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
-#include "llvm/Transforms/Instrumentation/HeapProfiler.h"
 #include "llvm/Transforms/Instrumentation/InstrProfiling.h"
+#include "llvm/Transforms/Instrumentation/MemProfiler.h"
 #include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
 #include "llvm/Transforms/Instrumentation/SanitizerCoverage.h"
 #include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"
@@ -270,10 +270,10 @@ static bool asanUseGlobalsGC(const Triple &T, const CodeGenOptions &CGOpts) {
   return false;
 }
 
-static void addHeapProfilerPasses(const PassManagerBuilder &Builder,
-                                  legacy::PassManagerBase &PM) {
-  PM.add(createHeapProfilerFunctionPass());
-  PM.add(createModuleHeapProfilerLegacyPassPass());
+static void addMemProfilerPasses(const PassManagerBuilder &Builder,
+                                 legacy::PassManagerBase &PM) {
+  PM.add(createMemProfilerFunctionPass());
+  PM.add(createModuleMemProfilerLegacyPassPass());
 }
 
 static void addAddressSanitizerPasses(const PassManagerBuilder &Builder,
@@ -516,6 +516,7 @@ static void initTargetOptions(DiagnosticsEngine &Diags,
       Options.BBSectionsFuncListBuf = std::move(*MBOrErr);
   }
 
+  Options.EnableMachineFunctionSplitter = CodeGenOpts.SplitMachineFunctions;
   Options.FunctionSections = CodeGenOpts.FunctionSections;
   Options.DataSections = CodeGenOpts.DataSections;
   Options.UniqueSectionNames = CodeGenOpts.UniqueSectionNames;
@@ -674,11 +675,11 @@ void EmitAssemblyHelper::CreatePasses(legacy::PassManager &MPM,
   if (LangOpts.Coroutines)
     addCoroutinePassesToExtensionPoints(PMBuilder);
 
-  if (CodeGenOpts.HeapProf) {
+  if (CodeGenOpts.MemProf) {
     PMBuilder.addExtension(PassManagerBuilder::EP_OptimizerLast,
-                           addHeapProfilerPasses);
+                           addMemProfilerPasses);
     PMBuilder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0,
-                           addHeapProfilerPasses);
+                           addMemProfilerPasses);
   }
 
   if (LangOpts.Sanitize.has(SanitizerKind::LocalBounds)) {
@@ -1422,9 +1423,9 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager(
       }
     }
 
-    if (CodeGenOpts.HeapProf) {
-      MPM.addPass(createModuleToFunctionPassAdaptor(HeapProfilerPass()));
-      MPM.addPass(ModuleHeapProfilerPass());
+    if (CodeGenOpts.MemProf) {
+      MPM.addPass(createModuleToFunctionPassAdaptor(MemProfilerPass()));
+      MPM.addPass(ModuleMemProfilerPass());
     }
 
     if (LangOpts.Sanitize.has(SanitizerKind::HWAddress)) {
@@ -1684,9 +1685,10 @@ static void runThinLTOBackend(
     Conf.CGFileType = getCodeGenFileType(Action);
     break;
   }
-  if (Error E = thinBackend(
-          Conf, -1, AddStream, *M, *CombinedIndex, ImportList,
-          ModuleToDefinedGVSummaries[M->getModuleIdentifier()], ModuleMap)) {
+  if (Error E =
+          thinBackend(Conf, -1, AddStream, *M, *CombinedIndex, ImportList,
+                      ModuleToDefinedGVSummaries[M->getModuleIdentifier()],
+                      ModuleMap, &CGOpts.CmdArgs)) {
     handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) {
       errs() << "Error running ThinLTO backend: " << EIB.message() << '\n';
     });
diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp
index 615b782350414..ee0c14641803b 100644
--- a/clang/lib/CodeGen/CGBlocks.cpp
+++ b/clang/lib/CodeGen/CGBlocks.cpp
@@ -580,7 +580,7 @@ static void computeBlockInfo(CodeGenModule &CGM, CodeGenFunction *CGF,
 
       // Since a __block variable cannot be captured by lambdas, its type and
       // the capture field type should always match.
-      assert(getCaptureFieldType(*CGF, CI) == variable->getType() &&
+      assert(CGF && getCaptureFieldType(*CGF, CI) == variable->getType() &&
              "capture type differs from the variable type");
       layout.push_back(BlockLayoutChunk(align, CGM.getPointerSize(),
                                         Qualifiers::OCL_None, &CI,
@@ -1024,7 +1024,7 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) {
                           type, VK_LValue, SourceLocation());
 
       ImplicitCastExpr l2r(ImplicitCastExpr::OnStack, type, CK_LValueToRValue,
-                           &declRef, VK_RValue);
+                           &declRef, VK_RValue, FPOptionsOverride());
       // FIXME: Pass a specific location for the expr init so that the store is
       // attributed to a reasonable location - otherwise it may be attributed to
       // locations of subexpressions in the initialization.
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index ff9dcd5022029..a1b09cc7632bd 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -11320,15 +11320,6 @@ static Value *EmitX86ConvertIntToFp(CodeGenFunction &CGF,
   return EmitX86Select(CGF, Ops[2], Res, Ops[1]);
 }
 
-static Value *EmitX86MinMax(CodeGenFunction &CGF, ICmpInst::Predicate Pred,
-                            ArrayRef<Value *> Ops) {
-  Value *Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]);
-  Value *Res = CGF.Builder.CreateSelect(Cmp, Ops[0], Ops[1]);
-
-  assert(Ops.size() == 2);
-  return Res;
-}
-
 // Lowers X86 FMA intrinsics to IR.
 static Value *EmitX86FMAExpr(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
                              unsigned BuiltinID, bool IsAddSub) {
@@ -13312,7 +13303,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_pmaxsw512:
   case X86::BI__builtin_ia32_pmaxsd512:
   case X86::BI__builtin_ia32_pmaxsq512:
-    return EmitX86MinMax(*this, ICmpInst::ICMP_SGT, Ops);
+    return EmitX86BinaryIntrinsic(*this, Ops, Intrinsic::smax);
   case X86::BI__builtin_ia32_pmaxub128:
   case X86::BI__builtin_ia32_pmaxuw128:
   case X86::BI__builtin_ia32_pmaxud128:
@@ -13325,7 +13316,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_pmaxuw512:
   case X86::BI__builtin_ia32_pmaxud512:
   case X86::BI__builtin_ia32_pmaxuq512:
-    return EmitX86MinMax(*this, ICmpInst::ICMP_UGT, Ops);
+    return EmitX86BinaryIntrinsic(*this, Ops, Intrinsic::umax);
   case X86::BI__builtin_ia32_pminsb128:
   case X86::BI__builtin_ia32_pminsw128:
   case X86::BI__builtin_ia32_pminsd128:
@@ -13338,7 +13329,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_pminsw512:
   case X86::BI__builtin_ia32_pminsd512:
   case X86::BI__builtin_ia32_pminsq512:
-    return EmitX86MinMax(*this, ICmpInst::ICMP_SLT, Ops);
+    return EmitX86BinaryIntrinsic(*this, Ops, Intrinsic::smin);
   case X86::BI__builtin_ia32_pminub128:
   case X86::BI__builtin_ia32_pminuw128:
   case X86::BI__builtin_ia32_pminud128:
@@ -13351,7 +13342,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_pminuw512:
   case X86::BI__builtin_ia32_pminud512:
   case X86::BI__builtin_ia32_pminuq512:
-    return EmitX86MinMax(*this, ICmpInst::ICMP_ULT, Ops);
+    return EmitX86BinaryIntrinsic(*this, Ops, Intrinsic::umin);
 
   case X86::BI__builtin_ia32_pmuludq128:
   case X86::BI__builtin_ia32_pmuludq256:
@@ -14279,8 +14270,8 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpic ||
              BuiltinID == PPC::BI__builtin_vsx_xvrspic)
       ID = Builder.getIsFPConstrained()
-               ? Intrinsic::experimental_constrained_nearbyint
-               : Intrinsic::nearbyint;
+               ? Intrinsic::experimental_constrained_rint
+               : Intrinsic::rint;
     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpip ||
              BuiltinID == PPC::BI__builtin_vsx_xvrspip)
       ID = Builder.getIsFPConstrained()
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 9f305e750f0c1..194b4f710d22c 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -2207,6 +2207,13 @@ void CodeGenModule::ConstructAttributeList(
       if (AI.getIndirectByVal())
         Attrs.addByValAttr(getTypes().ConvertTypeForMem(ParamType));
 
+      auto *Decl = ParamType->getAsRecordDecl();
+      if (CodeGenOpts.PassByValueIsNoAlias && Decl &&
+          Decl->getArgPassingRestrictions() == RecordDecl::APK_CanPassInRegs)
+        // When calling the function, the pointer passed in will be the only
+        // reference to the underlying object. Mark it accordingly.
+        Attrs.addAttribute(llvm::Attribute::NoAlias);
+
       // TODO: We could add the byref attribute if not byval, but it would
       // require updating many testcases.
 
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index 50b6079bd80bf..e33730b9ae901 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -2199,7 +2199,8 @@ llvm::Value *CodeGenFunction::EmitCXXTypeidExpr(const CXXTypeidExpr *E) {
   //   polymorphic class type, the result refers to a std::type_info object
   //   representing the type of the most derived object (that is, the dynamic
   //   type) to which the glvalue refers.
-  if (E->isPotentiallyEvaluated())
+  // If the operand is already the most derived object, skip the vtable lookup.
+  if (E->isPotentiallyEvaluated() && !E->isMostDerived(getContext()))
     return EmitTypeidFromVTable(*this, E->getExprOperand(),
                                 StdTypeInfoPtrTy);
 
diff --git a/clang/lib/CodeGen/CGObjC.cpp b/clang/lib/CodeGen/CGObjC.cpp
index 26dfb6259a290..99b896ae34886 100644
--- a/clang/lib/CodeGen/CGObjC.cpp
+++ b/clang/lib/CodeGen/CGObjC.cpp
@@ -1449,9 +1449,9 @@ CodeGenFunction::generateObjCSetterBody(const ObjCImplementationDecl *classImpl,
   ValueDecl *selfDecl = setterMethod->getSelfDecl();
   DeclRefExpr self(getContext(), selfDecl, false, selfDecl->getType(),
                    VK_LValue, SourceLocation());
-  ImplicitCastExpr selfLoad(ImplicitCastExpr::OnStack,
-                            selfDecl->getType(), CK_LValueToRValue, &self,
-                            VK_RValue);
+  ImplicitCastExpr selfLoad(ImplicitCastExpr::OnStack, selfDecl->getType(),
+                            CK_LValueToRValue, &self, VK_RValue,
+                            FPOptionsOverride());
   ObjCIvarRefExpr ivarRef(ivar, ivar->getType().getNonReferenceType(),
                           SourceLocation(), SourceLocation(),
                           &selfLoad, true, true);
@@ -1462,7 +1462,7 @@ CodeGenFunction::generateObjCSetterBody(const ObjCImplementationDecl *classImpl,
                   SourceLocation());
   ImplicitCastExpr argLoad(ImplicitCastExpr::OnStack,
                            argType.getUnqualifiedType(), CK_LValueToRValue,
-                           &arg, VK_RValue);
+                           &arg, VK_RValue, FPOptionsOverride());
 
   // The property type can differ from the ivar type in some situations with
   // Objective-C pointer types, we can always bit cast the RHS in these cases.
@@ -1483,9 +1483,8 @@ CodeGenFunction::generateObjCSetterBody(const ObjCImplementationDecl *classImpl,
   } else if (ivarRef.getType()->isPointerType()) {
     argCK = CK_BitCast;
   }
-  ImplicitCastExpr argCast(ImplicitCastExpr::OnStack,
-                           ivarRef.getType(), argCK, &argLoad,
-                           VK_RValue);
+  ImplicitCastExpr argCast(ImplicitCastExpr::OnStack, ivarRef.getType(), argCK,
+                           &argLoad, VK_RValue, FPOptionsOverride());
   Expr *finalArg = &argLoad;
   if (!getContext().hasSameUnqualifiedType(ivarRef.getType(),
                                            argLoad.getType()))
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index c55403920d8fa..d402e13c21347 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1526,6 +1526,7 @@ void CGOpenMPRuntime::functionFinished(CodeGenFunction &CGF) {
     FunctionUDMMap.erase(I);
   }
   LastprivateConditionalToTypes.erase(CGF.CurFn);
+  FunctionToUntiedTaskStackMap.erase(CGF.CurFn);
 }
 
 llvm::Type *CGOpenMPRuntime::getIdentTyPointerTy() {
@@ -3382,6 +3383,17 @@ struct PrivateHelpersTy {
 typedef std::pair<CharUnits /*Align*/, PrivateHelpersTy> PrivateDataTy;
 } // anonymous namespace
 
+static bool isAllocatableDecl(const VarDecl *VD) {
+  const VarDecl *CVD = VD->getCanonicalDecl();
+  if (!CVD->hasAttr<OMPAllocateDeclAttr>())
+    return false;
+  const auto *AA = CVD->getAttr<OMPAllocateDeclAttr>();
+  // Use the default allocation.
+  return !((AA->getAllocatorType() == OMPAllocateDeclAttr::OMPDefaultMemAlloc ||
+            AA->getAllocatorType() == OMPAllocateDeclAttr::OMPNullMemAlloc) &&
+           !AA->getAllocator());
+}
+
 static RecordDecl *
 createPrivatesRecordDecl(CodeGenModule &CGM, ArrayRef<PrivateDataTy> Privates) {
   if (!Privates.empty()) {
@@ -3396,9 +3408,12 @@ createPrivatesRecordDecl(CodeGenModule &CGM, ArrayRef Privates) {
       QualType Type = VD->getType().getNonReferenceType();
       // If the private variable is a local variable with lvalue ref type,
       // allocate the pointer instead of the pointee type.
-      if (Pair.second.isLocalPrivate() &&
-          VD->getType()->isLValueReferenceType())
-        Type = C.getPointerType(Type);
+      if (Pair.second.isLocalPrivate()) {
+        if (VD->getType()->isLValueReferenceType())
+          Type = C.getPointerType(Type);
+        if (isAllocatableDecl(VD))
+          Type = C.getPointerType(Type);
+      }
       FieldDecl *FD = addFieldToRecordDecl(C, RD, Type);
       if (VD->hasAttrs()) {
         for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
@@ -3700,6 +3715,8 @@ emitTaskPrivateMappingFunction(CodeGenModule &CGM, SourceLocation Loc,
     QualType Ty = VD->getType().getNonReferenceType();
     if (VD->getType()->isLValueReferenceType())
       Ty = C.getPointerType(Ty);
+    if (isAllocatableDecl(VD))
+      Ty = C.getPointerType(Ty);
     Args.push_back(ImplicitParamDecl::Create(
         C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
         C.getPointerType(C.getPointerType(Ty)).withConst().withRestrict(),
@@ -3767,9 +3784,9 @@ static void emitPrivatesInit(CodeGenFunction &CGF,
   bool IsTargetTask =
       isOpenMPTargetDataManagementDirective(D.getDirectiveKind()) ||
       isOpenMPTargetExecutionDirective(D.getDirectiveKind());
-  // For target-based directives skip 3 firstprivate arrays BasePointersArray,
-  // PointersArray and SizesArray. The original variables for these arrays are
-  // not captured and we get their addresses explicitly.
+  // For target-based directives skip 4 firstprivate arrays BasePointersArray,
+  // PointersArray, SizesArray, and MappersArray. The original variables for
+  // these arrays are not captured and we get their addresses explicitly.
   if ((!IsTargetTask && !Data.FirstprivateVars.empty() && ForDup) ||
       (IsTargetTask && KmpTaskSharedsPtr.isValid())) {
     SrcBase = CGF.MakeAddrLValue(
@@ -3780,8 +3797,10 @@ static void emitPrivatesInit(CodeGenFunction &CGF,
   FI = cast<RecordDecl>(FI->getType()->getAsTagDecl())->field_begin();
   for (const PrivateDataTy &Pair : Privates) {
     // Do not initialize private locals.
-    if (Pair.second.isLocalPrivate())
+    if (Pair.second.isLocalPrivate()) {
+      ++FI;
       continue;
+    }
     const VarDecl *VD = Pair.second.PrivateCopy;
     const Expr *Init = VD->getAnyInitializer();
     if (Init && (!ForDup || (isa<CXXConstructExpr>(Init) &&
@@ -3790,7 +3809,7 @@ static void emitPrivatesInit(CodeGenFunction &CGF,
       if (const VarDecl *Elem = Pair.second.PrivateElemInit) {
         const VarDecl *OriginalVD = Pair.second.Original;
         // Check if the variable is the target-based BasePointersArray,
-        // PointersArray or SizesArray.
+        // PointersArray, SizesArray, or MappersArray.
         LValue SharedRefLValue;
         QualType Type = PrivateLValue.getType();
         const FieldDecl *SharedField = CapturesInfo.lookup(OriginalVD);
@@ -4146,8 +4165,12 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
                          /*PrivateElemInit=*/nullptr));
     ++I;
   }
-  for (const VarDecl *VD : Data.PrivateLocals)
-    Privates.emplace_back(C.getDeclAlign(VD), PrivateHelpersTy(VD));
+  for (const VarDecl *VD : Data.PrivateLocals) {
+    if (isAllocatableDecl(VD))
+      Privates.emplace_back(CGM.getPointerAlign(), PrivateHelpersTy(VD));
+    else
+      Privates.emplace_back(C.getDeclAlign(VD), PrivateHelpersTy(VD));
+  }
   llvm::stable_sort(Privates,
                     [](const PrivateDataTy &L, const PrivateDataTy &R) {
                       return L.first > R.first;
@@ -7692,6 +7715,7 @@ class MappableExprsHandler {
                 break;
               }
             }
+            assert(Size && "Failed to determine structure size");
             CombinedInfo.BasePointers.push_back(BP.getPointer());
             CombinedInfo.Pointers.push_back(LB.getPointer());
             CombinedInfo.Sizes.push_back(CGF.Builder.CreateIntCast(
@@ -8436,10 +8460,12 @@ class MappableExprsHandler {
     if (DevPointersMap.count(VD)) {
       CombinedInfo.BasePointers.emplace_back(Arg, VD);
       CombinedInfo.Pointers.push_back(Arg);
-      CombinedInfo.Sizes.push_back(
-          CGF.Builder.CreateIntCast(CGF.getTypeSize(CGF.getContext().VoidPtrTy),
-                                    CGF.Int64Ty, /*isSigned=*/true));
-      CombinedInfo.Types.push_back(OMP_MAP_LITERAL | OMP_MAP_TARGET_PARAM);
+      CombinedInfo.Sizes.push_back(CGF.Builder.CreateIntCast(
+          CGF.getTypeSize(CGF.getContext().VoidPtrTy), CGF.Int64Ty,
+          /*isSigned=*/true));
+      CombinedInfo.Types.push_back(
+          (Cap->capturesVariable() ? OMP_MAP_TO : OMP_MAP_LITERAL) |
+          OMP_MAP_TARGET_PARAM);
       CombinedInfo.Mappers.push_back(nullptr);
       return;
     }
@@ -8840,6 +8866,17 @@ emitOffloadingArrays(CodeGenFunction &CGF,
   }
 }
 
+namespace {
+/// Additional arguments for emitOffloadingArraysArgument function.
+struct ArgumentsOptions {
+  bool ForEndCall = false;
+  bool IsTask = false;
+  ArgumentsOptions() = default;
+  ArgumentsOptions(bool ForEndCall, bool IsTask)
+      : ForEndCall(ForEndCall), IsTask(IsTask) {}
+};
+} // namespace
+
 /// Emit the arguments to be passed to the runtime library based on the
 /// arrays of base pointers, pointers, sizes, map types, and mappers.  If
 /// ForEndCall, emit map types to be passed for the end of the region instead of
@@ -8848,8 +8885,9 @@ static void emitOffloadingArraysArgument(
     CodeGenFunction &CGF, llvm::Value *&BasePointersArrayArg,
     llvm::Value *&PointersArrayArg, llvm::Value *&SizesArrayArg,
     llvm::Value *&MapTypesArrayArg, llvm::Value *&MappersArrayArg,
-    CGOpenMPRuntime::TargetDataInfo &Info, bool ForEndCall = false) {
-  assert((!ForEndCall || Info.separateBeginEndCalls()) &&
+    CGOpenMPRuntime::TargetDataInfo &Info,
+    const ArgumentsOptions &Options = ArgumentsOptions()) {
+  assert((!Options.ForEndCall || Info.separateBeginEndCalls()) &&
          "expected region end call to runtime only when end call is separate");
   CodeGenModule &CGM = CGF.CGM;
   if (Info.NumberOfPtrs) {
@@ -8867,14 +8905,17 @@ static void emitOffloadingArraysArgument(
         /*Idx0=*/0, /*Idx1=*/0);
     MapTypesArrayArg = CGF.Builder.CreateConstInBoundsGEP2_32(
         llvm::ArrayType::get(CGM.Int64Ty, Info.NumberOfPtrs),
-        ForEndCall && Info.MapTypesArrayEnd ? Info.MapTypesArrayEnd
-                                            : Info.MapTypesArray,
+        Options.ForEndCall && Info.MapTypesArrayEnd ? Info.MapTypesArrayEnd
+                                                    : Info.MapTypesArray,
         /*Idx0=*/0,
         /*Idx1=*/0);
-    MappersArrayArg =
-        Info.HasMapper
-            ? CGF.Builder.CreatePointerCast(Info.MappersArray, CGM.VoidPtrPtrTy)
-            : llvm::ConstantPointerNull::get(CGM.VoidPtrPtrTy);
+    // Always emit the mapper array address in case of a target task for
+    // privatization.
+    if (!Options.IsTask && !Info.HasMapper)
+      MappersArrayArg = llvm::ConstantPointerNull::get(CGM.VoidPtrPtrTy);
+    else
+      MappersArrayArg =
+          CGF.Builder.CreatePointerCast(Info.MappersArray, CGM.VoidPtrPtrTy);
   } else {
     BasePointersArrayArg = llvm::ConstantPointerNull::get(CGM.VoidPtrPtrTy);
     PointersArrayArg = llvm::ConstantPointerNull::get(CGM.VoidPtrPtrTy);
@@ -9622,9 +9663,11 @@ void CGOpenMPRuntime::emitTargetCall(
     TargetDataInfo Info;
     // Fill up the arrays and create the arguments.
     emitOffloadingArrays(CGF, CombinedInfo, Info);
+    bool HasDependClauses = D.hasClausesOfKind<OMPDependClause>();
     emitOffloadingArraysArgument(CGF, Info.BasePointersArray,
                                  Info.PointersArray, Info.SizesArray,
-                                 Info.MapTypesArray, Info.MappersArray, Info);
+                                 Info.MapTypesArray, Info.MappersArray, Info,
+                                 {/*ForEndCall=*/false, HasDependClauses});
     InputInfo.NumberOfTargetItems = Info.NumberOfPtrs;
     InputInfo.BasePointersArray =
         Address(Info.BasePointersArray, CGM.getPointerAlign());
@@ -10235,7 +10278,7 @@ void CGOpenMPRuntime::emitTargetDataCalls(
     llvm::Value *MappersArrayArg = nullptr;
     emitOffloadingArraysArgument(CGF, BasePointersArrayArg, PointersArrayArg,
                                  SizesArrayArg, MapTypesArrayArg,
-                                 MappersArrayArg, Info, /*ForEndCall=*/false);
+                                 MappersArrayArg, Info);
 
     // Emit device ID if any.
     llvm::Value *DeviceID = nullptr;
@@ -10275,7 +10318,8 @@ void CGOpenMPRuntime::emitTargetDataCalls(
     llvm::Value *MappersArrayArg = nullptr;
     emitOffloadingArraysArgument(CGF, BasePointersArrayArg, PointersArrayArg,
                                  SizesArrayArg, MapTypesArrayArg,
-                                 MappersArrayArg, Info, /*ForEndCall=*/true);
+                                 MappersArrayArg, Info,
+                                 {/*ForEndCall=*/true, /*IsTask=*/false});
 
     // Emit device ID if any.
     llvm::Value *DeviceID = nullptr;
@@ -10473,9 +10517,11 @@ void CGOpenMPRuntime::emitTargetDataStandAloneCall(
     TargetDataInfo Info;
     // Fill up the arrays and create the arguments.
     emitOffloadingArrays(CGF, CombinedInfo, Info);
+    bool HasDependClauses = D.hasClausesOfKind<OMPDependClause>();
     emitOffloadingArraysArgument(CGF, Info.BasePointersArray,
                                  Info.PointersArray, Info.SizesArray,
-                                 Info.MapTypesArray, Info.MappersArray, Info);
+                                 Info.MapTypesArray, Info.MappersArray, Info,
+                                 {/*ForEndCall=*/false, HasDependClauses});
     InputInfo.NumberOfTargetItems = Info.NumberOfPtrs;
     InputInfo.BasePointersArray =
         Address(Info.BasePointersArray, CGM.getPointerAlign());
@@ -10485,7 +10531,7 @@ void CGOpenMPRuntime::emitTargetDataStandAloneCall(
         Address(Info.SizesArray, CGM.getPointerAlign());
     InputInfo.MappersArray = Address(Info.MappersArray, CGM.getPointerAlign());
     MapTypesArray = Info.MapTypesArray;
-    if (D.hasClausesOfKind<OMPDependClause>())
+    if (HasDependClauses)
       CGF.EmitOMPTargetTaskBasedDirective(D, ThenGen, InputInfo);
     else
       emitInlinedDirective(CGF, D.getDirectiveKind(), ThenGen);
@@ -11224,44 +11270,27 @@ Address CGOpenMPRuntime::getParameterAddress(CodeGenFunction &CGF,
   return CGF.GetAddrOfLocalVar(NativeParam);
 }
 
-namespace {
-/// Cleanup action for allocate support.
-class OMPAllocateCleanupTy final : public EHScopeStack::Cleanup {
-public:
-  static const int CleanupArgs = 3;
-
-private:
-  llvm::FunctionCallee RTLFn;
-  llvm::Value *Args[CleanupArgs];
-
-public:
-  OMPAllocateCleanupTy(llvm::FunctionCallee RTLFn,
-                       ArrayRef<llvm::Value *> CallArgs)
-      : RTLFn(RTLFn) {
-    assert(CallArgs.size() == CleanupArgs &&
-           "Size of arguments does not match.");
-    std::copy(CallArgs.begin(), CallArgs.end(), std::begin(Args));
-  }
-  void Emit(CodeGenFunction &CGF, Flags /*flags*/) override {
-    if (!CGF.HaveInsertPoint())
-      return;
-    CGF.EmitRuntimeCall(RTLFn, Args);
-  }
-};
-} // namespace
-
 Address CGOpenMPRuntime::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                    const VarDecl *VD) {
   if (!VD)
     return Address::invalid();
+  Address UntiedAddr = Address::invalid();
+  Address UntiedRealAddr = Address::invalid();
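+  // Locals of an untied task are kept on the untied-task stack; if this
+  // variable already has addresses recorded for the current function, reuse
+  // them instead of emitting a fresh allocation.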
+  auto It = FunctionToUntiedTaskStackMap.find(CGF.CurFn);
+  if (It != FunctionToUntiedTaskStackMap.end()) {
+    const UntiedLocalVarsAddressesMap &UntiedData =
+        UntiedLocalVarsStack[It->second];
+    auto I = UntiedData.find(VD);
+    if (I != UntiedData.end()) {
+      UntiedAddr = I->second.first;
+      UntiedRealAddr = I->second.second;
+    }
+  }
   const VarDecl *CVD = VD->getCanonicalDecl();
   if (CVD->hasAttr<OMPAllocateDeclAttr>()) {
-    const auto *AA = CVD->getAttr<OMPAllocateDeclAttr>();
     // Use the default allocation.
-    if ((AA->getAllocatorType() == OMPAllocateDeclAttr::OMPDefaultMemAlloc ||
-         AA->getAllocatorType() == OMPAllocateDeclAttr::OMPNullMemAlloc) &&
-        !AA->getAllocator())
-      return Address::invalid();
+    if (!isAllocatableDecl(VD))
+      return UntiedAddr;
     llvm::Value *Size;
     CharUnits Align = CGM.getContext().getDeclAlign(CVD);
     if (CVD->getType()->isVariablyModifiedType()) {
@@ -11276,43 +11305,80 @@ Address CGOpenMPRuntime::getAddressOfLocalVariable(CodeGenFunction &CGF,
       Size = CGM.getSize(Sz.alignTo(Align));
     }
     llvm::Value *ThreadID = getThreadID(CGF, CVD->getBeginLoc());
+    const auto *AA = CVD->getAttr<OMPAllocateDeclAttr>();
     assert(AA->getAllocator() &&
            "Expected allocator expression for non-default allocator.");
     llvm::Value *Allocator = CGF.EmitScalarExpr(AA->getAllocator());
     // According to the standard, the original allocator type is an enum
     // (integer). Convert to pointer type, if required.
-    if (Allocator->getType()->isIntegerTy())
-      Allocator = CGF.Builder.CreateIntToPtr(Allocator, CGM.VoidPtrTy);
-    else if (Allocator->getType()->isPointerTy())
-      Allocator = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-          Allocator, CGM.VoidPtrTy);
+    Allocator = CGF.EmitScalarConversion(
+        Allocator, AA->getAllocator()->getType(), CGF.getContext().VoidPtrTy,
+        AA->getAllocator()->getExprLoc());
     llvm::Value *Args[] = {ThreadID, Size, Allocator};
 
     llvm::Value *Addr =
         CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                 CGM.getModule(), OMPRTL___kmpc_alloc),
                             Args, getName({CVD->getName(), ".void.addr"}));
-    llvm::Value *FiniArgs[OMPAllocateCleanupTy::CleanupArgs] = {ThreadID, Addr,
-                                                                Allocator};
     llvm::FunctionCallee FiniRTLFn = OMPBuilder.getOrCreateRuntimeFunction(
         CGM.getModule(), OMPRTL___kmpc_free);
-
-    CGF.EHStack.pushCleanup<OMPAllocateCleanupTy>(NormalAndEHCleanup, FiniRTLFn,
-                                                  llvm::makeArrayRef(FiniArgs));
+    QualType Ty = CGM.getContext().getPointerType(CVD->getType());
     Addr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-        Addr,
-        CGF.ConvertTypeForMem(CGM.getContext().getPointerType(CVD->getType())),
-        getName({CVD->getName(), ".addr"}));
-    return Address(Addr, Align);
+        Addr, CGF.ConvertTypeForMem(Ty), getName({CVD->getName(), ".addr"}));
+    if (UntiedAddr.isValid())
+      CGF.EmitStoreOfScalar(Addr, UntiedAddr, /*Volatile=*/false, Ty);
+
+    // Cleanup action for allocate support.
+    class OMPAllocateCleanupTy final : public EHScopeStack::Cleanup {
+      llvm::FunctionCallee RTLFn;
+      unsigned LocEncoding;
+      Address Addr;
+      const Expr *Allocator;
+
+    public:
+      OMPAllocateCleanupTy(llvm::FunctionCallee RTLFn, unsigned LocEncoding,
+                           Address Addr, const Expr *Allocator)
+          : RTLFn(RTLFn), LocEncoding(LocEncoding), Addr(Addr),
+            Allocator(Allocator) {}
+      void Emit(CodeGenFunction &CGF, Flags /*flags*/) override {
+        if (!CGF.HaveInsertPoint())
+          return;
+        llvm::Value *Args[3];
+        Args[0] = CGF.CGM.getOpenMPRuntime().getThreadID(
+            CGF, SourceLocation::getFromRawEncoding(LocEncoding));
+        Args[1] = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+            Addr.getPointer(), CGF.VoidPtrTy);
+        llvm::Value *AllocVal = CGF.EmitScalarExpr(Allocator);
+        // According to the standard, the original allocator type is an enum
+        // (integer). Convert to pointer type, if required.
+        AllocVal = CGF.EmitScalarConversion(AllocVal, Allocator->getType(),
+                                            CGF.getContext().VoidPtrTy,
+                                            Allocator->getExprLoc());
+        Args[2] = AllocVal;
+
+        CGF.EmitRuntimeCall(RTLFn, Args);
+      }
+    };
+    Address VDAddr =
+        UntiedRealAddr.isValid() ? UntiedRealAddr : Address(Addr, Align);
+    CGF.EHStack.pushCleanup<OMPAllocateCleanupTy>(
+        NormalAndEHCleanup, FiniRTLFn, CVD->getLocation().getRawEncoding(),
+        VDAddr, AA->getAllocator());
+    if (UntiedRealAddr.isValid())
+      if (auto *Region =
+              dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo))
+        Region->emitUntiedSwitch(CGF);
+    return VDAddr;
   }
-  if (UntiedLocalVarsStack.empty())
-    return Address::invalid();
-  const UntiedLocalVarsAddressesMap &UntiedData = UntiedLocalVarsStack.back();
-  auto It = UntiedData.find(VD);
-  if (It == UntiedData.end())
-    return Address::invalid();
+  return UntiedAddr;
+}
 
-  return It->second;
+bool CGOpenMPRuntime::isLocalVarInUntiedTask(CodeGenFunction &CGF,
+                                             const VarDecl *VD) const {
+  auto It = FunctionToUntiedTaskStackMap.find(CGF.CurFn);
+  if (It == FunctionToUntiedTaskStackMap.end())
+    return false;
+  return UntiedLocalVarsStack[It->second].count(VD) > 0;
 }
 
 CGOpenMPRuntime::NontemporalDeclsRAII::NontemporalDeclsRAII(
@@ -11348,11 +11414,14 @@ CGOpenMPRuntime::NontemporalDeclsRAII::~NontemporalDeclsRAII() {
 }
 
 CGOpenMPRuntime::UntiedTaskLocalDeclsRAII::UntiedTaskLocalDeclsRAII(
-    CodeGenModule &CGM,
-    const llvm::DenseMap<CanonicalDeclPtr<const VarDecl>, Address> &LocalVars)
-    : CGM(CGM), NeedToPush(!LocalVars.empty()) {
+    CodeGenFunction &CGF,
+    const llvm::DenseMap<CanonicalDeclPtr<const VarDecl>,
+                         std::pair<Address, Address>> &LocalVars)
+    : CGM(CGF.CGM), NeedToPush(!LocalVars.empty()) {
   if (!NeedToPush)
     return;
+  CGM.getOpenMPRuntime().FunctionToUntiedTaskStackMap.try_emplace(
+      CGF.CurFn, CGM.getOpenMPRuntime().UntiedLocalVarsStack.size());
   CGM.getOpenMPRuntime().UntiedLocalVarsStack.push_back(LocalVars);
 }
 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index 178acaec0aa1f..41fa9f5345aa8 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -253,9 +253,9 @@ class CGOpenMPRuntime {
 
   public:
     UntiedTaskLocalDeclsRAII(
-        CodeGenModule &CGM,
-        const llvm::DenseMap<CanonicalDeclPtr<const VarDecl>, Address>
-            &LocalVars);
+        CodeGenFunction &CGF,
+        const llvm::DenseMap<CanonicalDeclPtr<const VarDecl>,
+                             std::pair<Address, Address>> &LocalVars);
     ~UntiedTaskLocalDeclsRAII();
   };
 
@@ -432,6 +432,8 @@ class CGOpenMPRuntime {
                                 std::tuple<QualType, const FieldDecl *,
                                            const FieldDecl *, LValue>>>
       LastprivateConditionalToTypes;
+  /// Maps function to the position of the untied task locals stack.
+  llvm::DenseMap<llvm::Function *, unsigned> FunctionToUntiedTaskStackMap;
   /// Type kmp_critical_name, originally defined as typedef kmp_int32
   /// kmp_critical_name[8];
   llvm::ArrayType *KmpCriticalNameTy;
@@ -720,7 +722,8 @@ class CGOpenMPRuntime {
   llvm::SmallVector<NontemporalDeclsSet, 4> NontemporalDeclsStack;
 
   using UntiedLocalVarsAddressesMap =
-      llvm::DenseMap<CanonicalDeclPtr<const VarDecl>, Address>;
+      llvm::DenseMap<CanonicalDeclPtr<const VarDecl>,
+                     std::pair<Address, Address>>;
   llvm::SmallVector<UntiedLocalVarsAddressesMap, 4> UntiedLocalVarsStack;
 
   /// Stack for list of addresses of declarations in current context marked as
@@ -1882,6 +1885,9 @@ class CGOpenMPRuntime {
 
   /// Destroys user defined allocators specified in the uses_allocators clause.
   void emitUsesAllocatorsFini(CodeGenFunction &CGF, const Expr *Allocator);
+
+  /// Returns true if the variable is a local variable in untied task.
+  bool isLocalVarInUntiedTask(CodeGenFunction &CGF, const VarDecl *VD) const;
 };
 
 /// Class supports emission of SIMD-only code.
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 612d76d8ffd48..5bab731b98667 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -27,6 +27,7 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/SaveAndRestore.h"
+#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
 
 using namespace clang;
 using namespace CodeGen;
@@ -651,6 +652,20 @@ void CodeGenFunction::EmitIndirectGotoStmt(const IndirectGotoStmt &S) {
 
   EmitBranch(IndGotoBB);
 }
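+// Map the likelihood attribute on the branches of an if statement to the
+// branch weights used in !prof metadata; the weight constants come from
+// LowerExpectIntrinsic.h so they match __builtin_expect. Returns None when
+// neither branch is annotated.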
+static Optional<std::pair<uint32_t, uint32_t>>
+getLikelihoodWeights(const IfStmt &If) {
+  switch (Stmt::getLikelihood(If.getThen(), If.getElse())) {
+  case Stmt::LH_Unlikely:
+    return std::pair<uint32_t, uint32_t>(llvm::UnlikelyBranchWeight,
+                                         llvm::LikelyBranchWeight);
+  case Stmt::LH_None:
+    return None;
+  case Stmt::LH_Likely:
+    return std::pair<uint32_t, uint32_t>(llvm::LikelyBranchWeight,
+                                         llvm::UnlikelyBranchWeight);
+  }
+  llvm_unreachable("Unknown Likelihood");
+}
 
 void CodeGenFunction::EmitIfStmt(const IfStmt &S) {
   // C99 6.8.4.1: The first substatement is executed if the expression compares
@@ -695,8 +710,20 @@ void CodeGenFunction::EmitIfStmt(const IfStmt &S) {
   if (S.getElse())
     ElseBlock = createBasicBlock("if.else");
 
-  EmitBranchOnBoolExpr(S.getCond(), ThenBlock, ElseBlock,
-                       getProfileCount(S.getThen()));
+  // Prefer the PGO based weights over the likelihood attribute.
+  // When the build isn't optimized the metadata isn't used, so don't generate
+  // it.
+  llvm::MDNode *Weights = nullptr;
+  uint64_t Count = getProfileCount(S.getThen());
+  if (!Count && CGM.getCodeGenOpts().OptimizationLevel) {
+    Optional<std::pair<uint32_t, uint32_t>> LHW = getLikelihoodWeights(S);
+    if (LHW) {
+      llvm::MDBuilder MDHelper(CGM.getLLVMContext());
+      Weights = MDHelper.createBranchWeights(LHW->first, LHW->second);
+    }
+  }
+
+  EmitBranchOnBoolExpr(S.getCond(), ThenBlock, ElseBlock, Count, Weights);
 
   // Emit the 'then' code.
   EmitBlock(ThenBlock);
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index c1def6c88f0a6..d656792dea718 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1563,6 +1563,17 @@ static void emitCommonOMPParallelDirective(
                                               CapturedVars, IfCond);
 }
 
+static bool isAllocatableDecl(const VarDecl *VD) {
+  const VarDecl *CVD = VD->getCanonicalDecl();
+  if (!CVD->hasAttr<OMPAllocateDeclAttr>())
+    return false;
+  const auto *AA = CVD->getAttr<OMPAllocateDeclAttr>();
+  // Use the default allocation.
+  return !((AA->getAllocatorType() == OMPAllocateDeclAttr::OMPDefaultMemAlloc ||
+            AA->getAllocatorType() == OMPAllocateDeclAttr::OMPNullMemAlloc) &&
+           !AA->getAllocator());
+}
+
 static void emitEmptyBoundParameters(CodeGenFunction &,
                                      const OMPExecutableDirective &,
                                      llvm::SmallVectorImpl<llvm::Value *> &) {}
@@ -1575,12 +1586,7 @@ Address CodeGenFunction::OMPBuilderCBHelpers::getAddressOfLocalVariable(
   if (!VD)
     return Address::invalid();
   const VarDecl *CVD = VD->getCanonicalDecl();
-  if (!CVD->hasAttr<OMPAllocateDeclAttr>())
-    return Address::invalid();
-  const auto *AA = CVD->getAttr<OMPAllocateDeclAttr>();
-  // Use the default allocation.
-  if (AA->getAllocatorType() == OMPAllocateDeclAttr::OMPDefaultMemAlloc &&
-      !AA->getAllocator())
+  if (!isAllocatableDecl(CVD))
     return Address::invalid();
   llvm::Value *Size;
   CharUnits Align = CGM.getContext().getDeclAlign(CVD);
@@ -1596,6 +1602,7 @@ Address CodeGenFunction::OMPBuilderCBHelpers::getAddressOfLocalVariable(
     Size = CGM.getSize(Sz.alignTo(Align));
   }
 
+  const auto *AA = CVD->getAttr<OMPAllocateDeclAttr>();
   assert(AA->getAllocator() &&
          "Expected allocator expression for non-default allocator.");
   llvm::Value *Allocator = CGF.EmitScalarExpr(AA->getAllocator());
@@ -2982,7 +2989,7 @@ bool CodeGenFunction::EmitOMPWorksharingLoop(
           ((ScheduleKind.Schedule == OMPC_SCHEDULE_static ||
             ScheduleKind.Schedule == OMPC_SCHEDULE_unknown) &&
            !(ScheduleKind.M1 == OMPC_SCHEDULE_MODIFIER_nonmonotonic ||
-             ScheduleKind.M1 == OMPC_SCHEDULE_MODIFIER_nonmonotonic)) ||
+             ScheduleKind.M2 == OMPC_SCHEDULE_MODIFIER_nonmonotonic)) ||
           ScheduleKind.M1 == OMPC_SCHEDULE_MODIFIER_monotonic ||
           ScheduleKind.M2 == OMPC_SCHEDULE_MODIFIER_monotonic;
       if ((RT.isStaticNonchunked(ScheduleKind.Schedule,
@@ -3931,7 +3938,8 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(
   auto &&CodeGen = [&Data, &S, CS, &BodyGen, &LastprivateDstsOrigs,
                     CapturedRegion](CodeGenFunction &CGF,
                                     PrePostActionTy &Action) {
-    llvm::DenseMap<CanonicalDeclPtr<const VarDecl>, Address> UntiedLocalVars;
+    llvm::DenseMap<CanonicalDeclPtr<const VarDecl>, std::pair<Address, Address>>
+        UntiedLocalVars;
     // Set proper addresses for generated private copies.
     OMPPrivateScope Scope(CGF);
     llvm::SmallVector<std::pair<const VarDecl *, Address>, 16> FirstprivatePtrs;
@@ -3976,9 +3984,11 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(
         QualType Ty = VD->getType().getNonReferenceType();
         if (VD->getType()->isLValueReferenceType())
           Ty = CGF.getContext().getPointerType(Ty);
+        if (isAllocatableDecl(VD))
+          Ty = CGF.getContext().getPointerType(Ty);
         Address PrivatePtr = CGF.CreateMemTemp(
             CGF.getContext().getPointerType(Ty), ".local.ptr.addr");
-        UntiedLocalVars.try_emplace(VD, PrivatePtr);
+        UntiedLocalVars.try_emplace(VD, PrivatePtr, Address::invalid());
         CallArgs.push_back(PrivatePtr.getPointer());
       }
       CGF.CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
@@ -4002,9 +4012,18 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(
       // Adjust mapping for internal locals by mapping actual memory instead of
       // a pointer to this memory.
       for (auto &Pair : UntiedLocalVars) {
-        Address Replacement(CGF.Builder.CreateLoad(Pair.second),
-                            CGF.getContext().getDeclAlign(Pair.first));
-        Pair.getSecond() = Replacement;
+        if (isAllocatableDecl(Pair.first)) {
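+          // For an allocatable local the stack slot holds a pointer to the
+          // dynamically allocated memory, so peel off one more level of
+          // indirection to reach the variable's actual storage.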
+          llvm::Value *Ptr = CGF.Builder.CreateLoad(Pair.second.first);
+          Address Replacement(Ptr, CGF.getPointerAlign());
+          Pair.getSecond().first = Replacement;
+          Ptr = CGF.Builder.CreateLoad(Replacement);
+          Replacement = Address(Ptr, CGF.getContext().getDeclAlign(Pair.first));
+          Pair.getSecond().second = Replacement;
+        } else {
+          llvm::Value *Ptr = CGF.Builder.CreateLoad(Pair.second.first);
+          Address Replacement(Ptr, CGF.getContext().getDeclAlign(Pair.first));
+          Pair.getSecond().first = Replacement;
+        }
       }
     }
     if (Data.Reductions) {
@@ -4100,7 +4119,7 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(
     }
     (void)InRedScope.Privatize();
 
-    CGOpenMPRuntime::UntiedTaskLocalDeclsRAII LocalVarsScope(CGF.CGM,
+    CGOpenMPRuntime::UntiedTaskLocalDeclsRAII LocalVarsScope(CGF,
                                                              UntiedLocalVars);
     Action.Enter(CGF);
     BodyGen(CGF);
@@ -4137,7 +4156,7 @@ createImplicitFirstprivateForType(ASTContext &C, OMPTaskDataTy &Data,
   PrivateVD->setInitStyle(VarDecl::CInit);
   PrivateVD->setInit(ImplicitCastExpr::Create(C, ElemType, CK_LValueToRValue,
                                               InitRef, /*BasePath=*/nullptr,
-                                              VK_RValue));
+                                              VK_RValue, FPOptionsOverride()));
   Data.FirstprivateVars.emplace_back(OrigRef);
   Data.FirstprivateCopies.emplace_back(PrivateRef);
   Data.FirstprivateInits.emplace_back(InitRef);
diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt
index 0915f14e24a22..2e9af119d4160 100644
--- a/clang/lib/CodeGen/CMakeLists.txt
+++ b/clang/lib/CodeGen/CMakeLists.txt
@@ -108,7 +108,6 @@ add_clang_library(clangCodeGen
   LINK_LIBS
   clangAnalysis
   clangAST
-  clangASTMatchers
   clangBasic
   clangFrontend
   clangLex
diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp
index d1cc224dfcd76..341cddef23379 100644
--- a/clang/lib/CodeGen/CodeGenAction.cpp
+++ b/clang/lib/CodeGen/CodeGenAction.cpp
@@ -247,8 +247,13 @@ namespace clang {
     bool LinkInModules() {
       for (auto &LM : LinkModules) {
         if (LM.PropagateAttrs)
-          for (Function &F : *LM.Module)
+          for (Function &F : *LM.Module) {
+            // Skip intrinsics. Keep consistent with how intrinsics are created
+            // in LLVM IR.
+            if (F.isIntrinsic())
+              continue;
             Gen->CGM().addDefaultFunctionDefinitionAttributes(F);
+          }
 
         CurLinkModule = LM.Module.get();
 
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 5c9f60ae5f0df..b5c36fbd97935 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -1524,16 +1524,15 @@ bool CodeGenFunction::ConstantFoldsToSimpleInteger(const Expr *Cond,
   return true;
 }
 
-
-
 /// EmitBranchOnBoolExpr - Emit a branch on a boolean condition (e.g. for an if
 /// statement) to the specified blocks.  Based on the condition, this might try
 /// to simplify the codegen of the conditional based on the branch.
-///
+/// \param Weights The weights determined by the likelihood attributes.
 void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond,
                                            llvm::BasicBlock *TrueBlock,
                                            llvm::BasicBlock *FalseBlock,
-                                           uint64_t TrueCount) {
+                                           uint64_t TrueCount,
+                                           llvm::MDNode *Weights) {
   Cond = Cond->IgnoreParens();
 
   if (const BinaryOperator *CondBOp = dyn_cast<BinaryOperator>(Cond)) {
@@ -1548,7 +1547,7 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond,
         // br(1 && X) -> br(X).
         incrementProfileCounter(CondBOp);
         return EmitBranchOnBoolExpr(CondBOp->getRHS(), TrueBlock, FalseBlock,
-                                    TrueCount);
+                                    TrueCount, Weights);
       }
 
       // If we have "X && 1", simplify the code to use an uncond branch.
@@ -1557,7 +1556,7 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond,
           ConstantBool) {
         // br(X && 1) -> br(X).
         return EmitBranchOnBoolExpr(CondBOp->getLHS(), TrueBlock, FalseBlock,
-                                    TrueCount);
+                                    TrueCount, Weights);
       }
 
       // Emit the LHS as a conditional.  If the LHS conditional is false, we
@@ -1570,7 +1569,8 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond,
       ConditionalEvaluation eval(*this);
       {
         ApplyDebugLocation DL(*this, Cond);
-        EmitBranchOnBoolExpr(CondBOp->getLHS(), LHSTrue, FalseBlock, RHSCount);
+        EmitBranchOnBoolExpr(CondBOp->getLHS(), LHSTrue, FalseBlock, RHSCount,
+                             Weights);
         EmitBlock(LHSTrue);
       }
 
@@ -1579,7 +1579,8 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond,
 
       // Any temporaries created here are conditional.
       eval.begin(*this);
-      EmitBranchOnBoolExpr(CondBOp->getRHS(), TrueBlock, FalseBlock, TrueCount);
+      EmitBranchOnBoolExpr(CondBOp->getRHS(), TrueBlock, FalseBlock, TrueCount,
+                           Weights);
       eval.end(*this);
 
       return;
@@ -1594,7 +1595,7 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond,
         // br(0 || X) -> br(X).
         incrementProfileCounter(CondBOp);
         return EmitBranchOnBoolExpr(CondBOp->getRHS(), TrueBlock, FalseBlock,
-                                    TrueCount);
+                                    TrueCount, Weights);
       }
 
       // If we have "X || 0", simplify the code to use an uncond branch.
@@ -1603,7 +1604,7 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond,
           !ConstantBool) {
         // br(X || 0) -> br(X).
         return EmitBranchOnBoolExpr(CondBOp->getLHS(), TrueBlock, FalseBlock,
-                                    TrueCount);
+                                    TrueCount, Weights);
       }
 
       // Emit the LHS as a conditional.  If the LHS conditional is true, we
@@ -1619,7 +1620,8 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond,
       ConditionalEvaluation eval(*this);
       {
         ApplyDebugLocation DL(*this, Cond);
-        EmitBranchOnBoolExpr(CondBOp->getLHS(), TrueBlock, LHSFalse, LHSCount);
+        EmitBranchOnBoolExpr(CondBOp->getLHS(), TrueBlock, LHSFalse, LHSCount,
+                             Weights);
         EmitBlock(LHSFalse);
       }
 
@@ -1628,7 +1630,8 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond,
 
       // Any temporaries created here are conditional.
       eval.begin(*this);
-      EmitBranchOnBoolExpr(CondBOp->getRHS(), TrueBlock, FalseBlock, RHSCount);
+      EmitBranchOnBoolExpr(CondBOp->getRHS(), TrueBlock, FalseBlock, RHSCount,
+                           Weights);
 
       eval.end(*this);
 
@@ -1643,7 +1646,7 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond,
       uint64_t FalseCount = getCurrentProfileCount() - TrueCount;
       // Negate the condition and swap the destination blocks.
       return EmitBranchOnBoolExpr(CondUOp->getSubExpr(), FalseBlock, TrueBlock,
-                                  FalseCount);
+                                  FalseCount, Weights);
     }
   }
 
@@ -1654,7 +1657,7 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond,
 
     ConditionalEvaluation cond(*this);
     EmitBranchOnBoolExpr(CondOp->getCond(), LHSBlock, RHSBlock,
-                         getProfileCount(CondOp));
+                         getProfileCount(CondOp), Weights);
 
     // When computing PGO branch weights, we only know the overall count for
     // the true block. This code is essentially doing tail duplication of the
@@ -1674,14 +1677,14 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond,
     {
       ApplyDebugLocation DL(*this, Cond);
       EmitBranchOnBoolExpr(CondOp->getLHS(), TrueBlock, FalseBlock,
-                           LHSScaledTrueCount);
+                           LHSScaledTrueCount, Weights);
     }
     cond.end(*this);
 
     cond.begin(*this);
     EmitBlock(RHSBlock);
     EmitBranchOnBoolExpr(CondOp->getRHS(), TrueBlock, FalseBlock,
-                         TrueCount - LHSScaledTrueCount);
+                         TrueCount - LHSScaledTrueCount, Weights);
     cond.end(*this);
 
     return;
@@ -1712,9 +1715,10 @@ void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond,
 
   // Create branch weights based on the number of times we get here and the
   // number of times the condition should be true.
-  uint64_t CurrentCount = std::max(getCurrentProfileCount(), TrueCount);
-  llvm::MDNode *Weights =
-      createProfileWeights(TrueCount, CurrentCount - TrueCount);
+  if (!Weights) {
+    uint64_t CurrentCount = std::max(getCurrentProfileCount(), TrueCount);
+    Weights = createProfileWeights(TrueCount, CurrentCount - TrueCount);
+  }
 
   // Emit the code with the fully general case.
   llvm::Value *CondV;
@@ -2215,13 +2219,39 @@ void CodeGenFunction::emitAlignmentAssumption(llvm::Value *PtrValue,
                                               SourceLocation AssumptionLoc,
                                               llvm::Value *Alignment,
                                               llvm::Value *OffsetValue) {
-  llvm::Value *TheCheck;
-  llvm::Instruction *Assumption = Builder.CreateAlignmentAssumption(
-      CGM.getDataLayout(), PtrValue, Alignment, OffsetValue, &TheCheck);
+  if (Alignment->getType() != IntPtrTy)
+    Alignment =
+        Builder.CreateIntCast(Alignment, IntPtrTy, false, "casted.align");
+  if (OffsetValue && OffsetValue->getType() != IntPtrTy)
+    OffsetValue =
+        Builder.CreateIntCast(OffsetValue, IntPtrTy, true, "casted.offset");
+  llvm::Value *TheCheck = nullptr;
   if (SanOpts.has(SanitizerKind::Alignment)) {
-    emitAlignmentAssumptionCheck(PtrValue, Ty, Loc, AssumptionLoc, Alignment,
-                                 OffsetValue, TheCheck, Assumption);
+    llvm::Value *PtrIntValue =
+        Builder.CreatePtrToInt(PtrValue, IntPtrTy, "ptrint");
+
+    if (OffsetValue) {
+      bool IsOffsetZero = false;
+      if (const auto *CI = dyn_cast<llvm::ConstantInt>(OffsetValue))
+        IsOffsetZero = CI->isZero();
+
+      if (!IsOffsetZero)
+        PtrIntValue = Builder.CreateSub(PtrIntValue, OffsetValue, "offsetptr");
+    }
+
+    llvm::Value *Zero = llvm::ConstantInt::get(IntPtrTy, 0);
+    llvm::Value *Mask =
+        Builder.CreateSub(Alignment, llvm::ConstantInt::get(IntPtrTy, 1));
+    llvm::Value *MaskedPtr = Builder.CreateAnd(PtrIntValue, Mask, "maskedptr");
+    TheCheck = Builder.CreateICmpEQ(MaskedPtr, Zero, "maskcond");
   }
+  llvm::Instruction *Assumption = Builder.CreateAlignmentAssumption(
+      CGM.getDataLayout(), PtrValue, Alignment, OffsetValue);
+
+  if (!SanOpts.has(SanitizerKind::Alignment))
+    return;
+  emitAlignmentAssumptionCheck(PtrValue, Ty, Loc, AssumptionLoc, Alignment,
+                               OffsetValue, TheCheck, Assumption);
 }
 
 void CodeGenFunction::emitAlignmentAssumption(llvm::Value *PtrValue,
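The sanitizer path above open-codes the alignment predicate instead of taking
it back from CreateAlignmentAssumption. A minimal C++ sketch of the same
pointer math, assuming a power-of-two alignment (names mirror the IR values
ptrint, offsetptr, maskedptr, maskcond in the hunk):

  #include <cstdint>

  // Sketch only: mirrors the emitted check, not actual clang code.
  bool isSuitablyAligned(const void *Ptr, std::uintptr_t Alignment,
                         std::intptr_t Offset) {
    auto PtrInt = reinterpret_cast<std::uintptr_t>(Ptr); // "ptrint"
    if (Offset != 0)
      PtrInt -= Offset;                  // "offsetptr": undo the promised offset
    std::uintptr_t Mask = Alignment - 1; // power-of-two alignment assumed
    return (PtrInt & Mask) == 0;         // "maskedptr" == 0 is "maskcond"
  }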
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 3ac5cd668f21b..07dcf20dbc5be 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4372,7 +4372,8 @@ class CodeGenFunction : public CodeGenTypeCache {
   /// TrueCount should be the number of times we expect the condition to
   /// evaluate to true based on PGO data.
   void EmitBranchOnBoolExpr(const Expr *Cond, llvm::BasicBlock *TrueBlock,
-                            llvm::BasicBlock *FalseBlock, uint64_t TrueCount);
+                            llvm::BasicBlock *FalseBlock, uint64_t TrueCount,
+                            llvm::MDNode *Weights = nullptr);
 
   /// Given an assignment `*LHS = RHS`, emit a test that checks if \p RHS is
   /// nonnull, if \p LHS is marked _Nonnull.
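The new Weights parameter defaults to nullptr, in which case the PGO-based
weights are computed as before. A hedged sketch of how a caller could supply
likelihood-based weights instead (llvm::MDBuilder::createBranchWeights is the
real API; the weight values and block names here are illustrative assumptions):

  // Requires "llvm/IR/MDBuilder.h".
  llvm::MDBuilder MDHelper(CGM.getLLVMContext());
  llvm::MDNode *Weights =
      MDHelper.createBranchWeights(/*TrueWeight=*/2000, /*FalseWeight=*/1);
  EmitBranchOnBoolExpr(Cond, ThenBlock, ElseBlock, /*TrueCount=*/0, Weights);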
diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index 4a5efa77873f3..4ad2aff86d0d7 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -9,11 +9,11 @@
 // This provides C++ code generation targeting the Itanium C++ ABI.  The class
 // in this file generates structures that follow the Itanium C++ ABI, which is
 // documented at:
-//  http://www.codesourcery.com/public/cxx-abi/abi.html
-//  http://www.codesourcery.com/public/cxx-abi/abi-eh.html
+//  https://itanium-cxx-abi.github.io/cxx-abi/abi.html
+//  https://itanium-cxx-abi.github.io/cxx-abi/abi-eh.html
 //
 // It also supports the closely-related ARM ABI, documented at:
-// http://infocenter.arm.com/help/topic/com.arm.doc.ihi0041c/IHI0041C_cppabi.pdf
+// https://developer.arm.com/documentation/ihi0041/g/
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
index 3ca5ca2ffb4c6..e22f93c96e69a 100644
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -5627,7 +5627,7 @@ ABIArgInfo AArch64ABIInfo::coerceIllegalVector(QualType Ty) const {
       ResType = llvm::ScalableVectorType::get(
           llvm::Type::getInt64Ty(getVMContext()), 2);
       break;
-    case BuiltinType::Float16:
+    case BuiltinType::Half:
       ResType = llvm::ScalableVectorType::get(
           llvm::Type::getHalfTy(getVMContext()), 8);
       break;
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 9d400a945ba25..54d3ff9544a86 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -129,12 +129,12 @@ std::string Driver::GetResourcesPath(StringRef BinaryPath,
 }
 
 Driver::Driver(StringRef ClangExecutable, StringRef TargetTriple,
-               DiagnosticsEngine &Diags,
+               DiagnosticsEngine &Diags, std::string Title,
                IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS)
     : Diags(Diags), VFS(std::move(VFS)), Mode(GCCMode),
       SaveTemps(SaveTempsNone), BitcodeEmbed(EmbedNone), LTOMode(LTOK_None),
       ClangExecutable(ClangExecutable), SysRoot(DEFAULT_SYSROOT),
-      DriverTitle("clang LLVM compiler"), CCPrintOptionsFilename(nullptr),
+      DriverTitle(Title), CCPrintOptionsFilename(nullptr),
       CCPrintHeadersFilename(nullptr), CCLogDiagnosticsFilename(nullptr),
       CCCPrintBindings(false), CCPrintOptions(false), CCPrintHeaders(false),
       CCLogDiagnostics(false), CCGenDiagnostics(false),
@@ -1769,6 +1769,9 @@ void Driver::PrintHelp(bool ShowHidden) const {
   if (!ShowHidden)
     ExcludedFlagsBitmask |= HelpHidden;
 
+  if (IsFlangMode())
+    IncludedFlagsBitmask |= options::FlangOption;
+
   std::string Usage = llvm::formatv("{0} [options] file...", Name).str();
   getOpts().PrintHelp(llvm::outs(), Usage.c_str(), DriverTitle.c_str(),
                       IncludedFlagsBitmask, ExcludedFlagsBitmask,
@@ -1837,9 +1840,13 @@ void Driver::PrintSYCLToolHelp(const Compilation &C) const {
 }
 
 void Driver::PrintVersion(const Compilation &C, raw_ostream &OS) const {
-  // FIXME: The following handlers should use a callback mechanism, we don't
-  // know what the client would like to do.
-  OS << getClangFullVersion() << '\n';
+  if (IsFlangMode()) {
+    OS << getClangToolFullVersion("flang-new") << '\n';
+  } else {
+    // FIXME: The following handlers should use a callback mechanism, we don't
+    // know what the client would like to do.
+    OS << getClangFullVersion() << '\n';
+  }
   const ToolChain &TC = C.getDefaultToolChain();
   OS << "Target: " << TC.getTripleString() << '\n';
 
@@ -1877,7 +1884,7 @@ void Driver::HandleAutocompletions(StringRef PassedFlags) const {
   std::vector<std::string> SuggestedCompletions;
   std::vector<std::string> Flags;
 
-  unsigned short DisableFlags =
+  unsigned int DisableFlags =
       options::NoDriverOption | options::Unsupported | options::Ignored;
 
   // Distinguish "--autocomplete=-someflag" and "--autocomplete=-someflag,"
diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index 0f51443010ca4..90dbced3240d1 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -495,8 +495,10 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
         << lastArgumentForMask(D, Args, Kinds & NeedsLTO) << "-flto";
   }
 
-  if ((Kinds & SanitizerKind::ShadowCallStack) && TC.getTriple().isAArch64() &&
-      !llvm::AArch64::isX18ReservedByDefault(TC.getTriple()) &&
+  if ((Kinds & SanitizerKind::ShadowCallStack) &&
+      ((TC.getTriple().isAArch64() &&
+        !llvm::AArch64::isX18ReservedByDefault(TC.getTriple())) ||
+       TC.getTriple().isRISCV()) &&
       !Args.hasArg(options::OPT_ffixed_x18)) {
     D.Diag(diag::err_drv_argument_only_allowed_with)
         << lastArgumentForMask(D, Args, Kinds & SanitizerKind::ShadowCallStack)
@@ -866,8 +868,8 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
                                 LinkCXXRuntimes) ||
                     D.CCCIsCXX();
 
-  NeedsHeapProfRt = Args.hasFlag(options::OPT_fmemory_profile,
-                                 options::OPT_fno_memory_profile, false);
+  NeedsMemProfRt = Args.hasFlag(options::OPT_fmemory_profile,
+                                options::OPT_fno_memory_profile, false);
 
   // Finally, initialize the set of available and recoverable sanitizers.
   Sanitizers.Mask |= Kinds;
@@ -929,10 +931,10 @@ static bool hasTargetFeatureMTE(const llvm::opt::ArgStringList &CmdArgs) {
 void SanitizerArgs::addArgs(const ToolChain &TC, const llvm::opt::ArgList &Args,
                             llvm::opt::ArgStringList &CmdArgs,
                             types::ID InputType) const {
-  // NVPTX doesn't currently support sanitizers.  Bailing out here means that
-  // e.g. -fsanitize=address applies only to host code, which is what we want
-  // for now.
-  if (TC.getTriple().isNVPTX())
+  // NVPTX/AMDGPU doesn't currently support sanitizers.  Bailing out here means
+  // that e.g. -fsanitize=address applies only to host code, which is what we
+  // want for now.
+  if (TC.getTriple().isNVPTX() || TC.getTriple().isAMDGPU())
     return;
 
   // Translate available CoverageFeatures to corresponding clang-cc1 flags.
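For illustration (not part of the patch): with this change, RISC-V targets
accept -fsanitize=shadow-call-stack, but only together with -ffixed-x18,
presumably because the runtime keeps the shadow-call-stack pointer in x18;
without that flag the driver emits err_drv_argument_only_allowed_with, the
same behavior as on AArch64 targets where x18 is not reserved by default.

  clang --target=riscv64-unknown-linux-gnu -fsanitize=shadow-call-stack \
        -ffixed-x18 -c foo.c   # accepted with this patch (assumed invocation)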
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index faf0b84963926..11169518745b7 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1087,7 +1087,8 @@ SanitizerMask ToolChain::getSupportedSanitizers() const {
       getTriple().getArch() == llvm::Triple::arm || getTriple().isWasm() ||
       getTriple().isAArch64())
     Res |= SanitizerKind::CFIICall;
-  if (getTriple().getArch() == llvm::Triple::x86_64 || getTriple().isAArch64())
+  if (getTriple().getArch() == llvm::Triple::x86_64 ||
+      getTriple().isAArch64() || getTriple().isRISCV())
     Res |= SanitizerKind::ShadowCallStack;
   if (getTriple().isAArch64())
     Res |= SanitizerKind::MemTag;
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 71acf3ed32816..3616310c37bf7 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -351,6 +351,7 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
 
   std::string Linker = getToolChain().GetProgramPath(getShortName());
   ArgStringList CmdArgs;
+  addLinkerCompressDebugSectionsOption(getToolChain(), Args, CmdArgs);
   AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA);
   CmdArgs.push_back("-shared");
   CmdArgs.push_back("-o");
diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
index ce7c5348a4d57..6c5e43704cc49 100644
--- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
@@ -94,7 +94,7 @@ static bool DecodeAArch64Mcpu(const Driver &D, StringRef Mcpu, StringRef &CPU,
     if (!llvm::AArch64::getArchFeatures(ArchKind, Features))
       return false;
 
-    unsigned Extension = llvm::AArch64::getDefaultExtensions(CPU, ArchKind);
+    uint64_t Extension = llvm::AArch64::getDefaultExtensions(CPU, ArchKind);
     if (!llvm::AArch64::getExtensionFeatures(Extension, Features))
       return false;
    }
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 532909e8a0354..271a62728a579 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -4950,7 +4950,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   // Add the split debug info name to the command lines here so we
   // can propagate it to the backend.
   bool SplitDWARF = (DwarfFission != DwarfFissionKind::None) &&
-                    TC.getTriple().isOSBinFormatELF() &&
+                    (TC.getTriple().isOSBinFormatELF() ||
+                     TC.getTriple().isOSBinFormatWasm()) &&
                    (isa<AssembleJobAction>(JA) || isa<CompileJobAction>(JA) ||
                     isa<BackendJobAction>(JA));
   if (SplitDWARF) {
@@ -5025,13 +5026,18 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_fbasic_block_sections_EQ)) {
-    StringRef Val = A->getValue();
-    if (Val != "all" && Val != "labels" && Val != "none" &&
-        !(Val.startswith("list=") && llvm::sys::fs::exists(Val.substr(5))))
-      D.Diag(diag::err_drv_invalid_value)
-          << A->getAsString(Args) << A->getValue();
-    else
-      A->render(Args, CmdArgs);
+    if (Triple.isX86() && Triple.isOSBinFormatELF()) {
+      StringRef Val = A->getValue();
+      if (Val != "all" && Val != "labels" && Val != "none" &&
+          !(Val.startswith("list=") && llvm::sys::fs::exists(Val.substr(5))))
+        D.Diag(diag::err_drv_invalid_value)
+            << A->getAsString(Args) << A->getValue();
+      else
+        A->render(Args, CmdArgs);
+    } else {
+      D.Diag(diag::err_drv_unsupported_opt_for_target)
+          << A->getAsString(Args) << TripleStr;
+    }
   }
 
   if (Args.hasFlag(options::OPT_fdata_sections, options::OPT_fno_data_sections,
@@ -5051,6 +5057,26 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
                    options::OPT_fno_unique_basic_block_section_names, false))
     CmdArgs.push_back("-funique-basic-block-section-names");
 
+  if (Arg *A = Args.getLastArg(options::OPT_fsplit_machine_functions,
+                               options::OPT_fno_split_machine_functions)) {
+    // This codegen pass is only available on x86-elf targets.
+    if (Triple.isX86() && Triple.isOSBinFormatELF()) {
+      if (A->getOption().matches(options::OPT_fsplit_machine_functions)) {
+        // If the flag is enabled but no profile information is available,
+        // emit a warning.
+        if (getLastProfileUseArg(Args) || getLastProfileSampleUseArg(Args)) {
+          A->render(Args, CmdArgs);
+        } else {
+          D.Diag(diag::warn_drv_diagnostics_hotness_requires_pgo)
+              << A->getAsString(Args);
+        }
+      }
+    } else {
+      D.Diag(diag::err_drv_unsupported_opt_for_target)
+          << A->getAsString(Args) << TripleStr;
+    }
+  }
+
   Args.AddLastArg(CmdArgs, options::OPT_finstrument_functions,
                   options::OPT_finstrument_functions_after_inlining,
                   options::OPT_finstrument_function_entry_bare);
@@ -7270,6 +7296,15 @@ void ClangAs::ConstructJob(Compilation &C, const JobAction &JA,
     }
     break;
 
+  case llvm::Triple::aarch64:
+  case llvm::Triple::aarch64_32:
+  case llvm::Triple::aarch64_be:
+    if (Args.hasArg(options::OPT_mmark_bti_property)) {
+      CmdArgs.push_back("-mllvm");
+      CmdArgs.push_back("-aarch64-mark-bti-property");
+    }
+    break;
+
   case llvm::Triple::riscv32:
   case llvm::Triple::riscv64:
     AddRISCVTargetArgs(Args, CmdArgs);
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 53ef0e267fa38..816b9f3a2b3e7 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -222,6 +222,24 @@ void tools::AddLinkerInputs(const ToolChain &TC, const InputInfoList &Inputs,
   }
 }
 
+void tools::addLinkerCompressDebugSectionsOption(
+    const ToolChain &TC, const llvm::opt::ArgList &Args,
+    llvm::opt::ArgStringList &CmdArgs) {
+  // GNU ld supports --compress-debug-sections=none|zlib|zlib-gnu|zlib-gabi,
+  // where zlib is an alias for zlib-gabi. Therefore -gz=none|zlib|zlib-gnu
+  // are translated to --compress-debug-sections=none|zlib|zlib-gnu. A bare
+  // -gz is not translated, since ld's --compress-debug-sections option
+  // requires an explicit argument.
+  if (const Arg *A = Args.getLastArg(options::OPT_gz_EQ)) {
+    StringRef V = A->getValue();
+    if (V == "none" || V == "zlib" || V == "zlib-gnu")
+      CmdArgs.push_back(Args.MakeArgString("--compress-debug-sections=" + V));
+    else
+      TC.getDriver().Diag(diag::err_drv_unsupported_option_argument)
+          << A->getOption().getName() << V;
+  }
+}
+
 void tools::AddTargetFeature(const ArgList &Args,
                             std::vector<StringRef> &Features,
                              OptSpecifier OnOpt, OptSpecifier OffOpt,
@@ -355,6 +373,8 @@ std::string tools::getCPUName(const ArgList &Args, const llvm::Triple &T,
   case llvm::Triple::sparcv9:
     if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ))
       return A->getValue();
+    if (T.getArch() == llvm::Triple::sparc && T.isOSSolaris())
+      return "v9";
     return "";
 
   case llvm::Triple::x86:
@@ -694,10 +714,10 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args,
       if (!Args.hasArg(options::OPT_shared) && !TC.getTriple().isAndroid())
         HelperStaticRuntimes.push_back("asan-preinit");
     }
-    if (SanArgs.needsHeapProfRt() && SanArgs.linkRuntimes()) {
-      SharedRuntimes.push_back("heapprof");
+    if (SanArgs.needsMemProfRt() && SanArgs.linkRuntimes()) {
+      SharedRuntimes.push_back("memprof");
       if (!Args.hasArg(options::OPT_shared) && !TC.getTriple().isAndroid())
-        HelperStaticRuntimes.push_back("heapprof-preinit");
+        HelperStaticRuntimes.push_back("memprof-preinit");
     }
     if (SanArgs.needsUbsanRt() && SanArgs.linkRuntimes()) {
       if (SanArgs.requiresMinimalRuntime())
@@ -736,11 +756,11 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args,
       StaticRuntimes.push_back("asan_cxx");
   }
 
-  if (!SanArgs.needsSharedRt() && SanArgs.needsHeapProfRt() &&
+  if (!SanArgs.needsSharedRt() && SanArgs.needsMemProfRt() &&
       SanArgs.linkRuntimes()) {
-    StaticRuntimes.push_back("heapprof");
+    StaticRuntimes.push_back("memprof");
     if (SanArgs.linkCXXRuntimes())
-      StaticRuntimes.push_back("heapprof_cxx");
+      StaticRuntimes.push_back("memprof_cxx");
   }
 
   if (!SanArgs.needsSharedRt() && SanArgs.needsHwasanRt() && SanArgs.linkRuntimes()) {
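The -gz translation added above amounts to the following mapping (a sketch of
the intended behavior, not output captured from the tool):

  -gz=none      ->  --compress-debug-sections=none
  -gz=zlib      ->  --compress-debug-sections=zlib
  -gz=zlib-gnu  ->  --compress-debug-sections=zlib-gnu
  -gz           ->  nothing passed to the linker (ld needs an explicit value)
  -gz=<other>   ->  err_drv_unsupported_option_argument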
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h
index 29dedec9b09cd..0028ea0ca3373 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.h
+++ b/clang/lib/Driver/ToolChains/CommonArgs.h
@@ -27,6 +27,10 @@ void AddLinkerInputs(const ToolChain &TC, const InputInfoList &Inputs,
                      const llvm::opt::ArgList &Args,
                      llvm::opt::ArgStringList &CmdArgs, const JobAction &JA);
 
+void addLinkerCompressDebugSectionsOption(const ToolChain &TC,
+                                          const llvm::opt::ArgList &Args,
+                                          llvm::opt::ArgStringList &CmdArgs);
+
 void claimNoWarnArgs(const llvm::opt::ArgList &Args);
 
 bool addSanitizerRuntimes(const ToolChain &TC, const llvm::opt::ArgList &Args,
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index 9d22cda217116..8f2be2a343cc5 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -1197,7 +1197,6 @@ void Darwin::addProfileRTLibs(const ArgList &Args,
     if (ForGCOV) {
       addExportedSymbol(CmdArgs, "___gcov_dump");
       addExportedSymbol(CmdArgs, "___gcov_reset");
-      addExportedSymbol(CmdArgs, "_flush_fn_list");
       addExportedSymbol(CmdArgs, "_writeout_fn_list");
       addExportedSymbol(CmdArgs, "_reset_fn_list");
     } else {
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 80f6db7ea6427..93401c6626630 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -69,11 +69,13 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA,
   CmdArgs.push_back(Input.getFilename());
 
   const auto& D = C.getDriver();
-  const char* Exec = Args.MakeArgString(D.GetProgramPath("flang", TC));
+  // TODO: Replace flang-new with flang once the new driver replaces the
+  // throwaway driver
+  const char *Exec = Args.MakeArgString(D.GetProgramPath("flang-new", TC));
   C.addCommand(std::make_unique<Command>(
       JA, *this, ResponseFileSupport::AtFileUTF8(), Exec, CmdArgs, Inputs));
 }
 
-Flang::Flang(const ToolChain &TC) : Tool("flang", "flang frontend", TC) {}
+Flang::Flang(const ToolChain &TC) : Tool("flang-new", "flang frontend", TC) {}
 
 Flang::~Flang() {}
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 631206ae02dfb..9b03af22954a0 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -594,6 +594,7 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA,
 
   bool NeedsSanitizerDeps = addSanitizerRuntimes(ToolChain, Args, CmdArgs);
   bool NeedsXRayDeps = addXRayRuntime(ToolChain, Args, CmdArgs);
+  addLinkerCompressDebugSectionsOption(ToolChain, Args, CmdArgs);
   // When offloading, the input file(s) could be from unbundled partially
   // linked archives.  The unbundled information is a list of files and not
   // an actual object/archive.  Take that list and pass those to the linker
diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp
index f3e3976d715b7..43e557c980507 100644
--- a/clang/lib/Driver/ToolChains/HIP.cpp
+++ b/clang/lib/Driver/ToolChains/HIP.cpp
@@ -89,6 +89,8 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,
   if (C.getDriver().isSaveTempsEnabled())
     LldArgs.push_back("-save-temps");
 
+  addLinkerCompressDebugSectionsOption(TC, Args, LldArgs);
+
   LldArgs.append({"-o", Output.getFilename()});
   for (auto Input : Inputs)
     LldArgs.push_back(Input.getFilename());
diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp
index 10168736400f8..d953082470aab 100644
--- a/clang/lib/Driver/ToolChains/WebAssembly.cpp
+++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp
@@ -243,6 +243,27 @@ void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs,
     CC1Args.push_back("+sign-ext");
   }
 
+  if (!DriverArgs.hasFlag(options::OPT_mmutable_globals,
+                          options::OPT_mno_mutable_globals, false)) {
+    // -fPIC implies +mutable-globals because the PIC ABI used by the linker
+    // depends on importing and exporting mutable globals.
+    llvm::Reloc::Model RelocationModel;
+    unsigned PICLevel;
+    bool IsPIE;
+    std::tie(RelocationModel, PICLevel, IsPIE) =
+        ParsePICArgs(*this, DriverArgs);
+    if (RelocationModel == llvm::Reloc::PIC_) {
+      if (DriverArgs.hasFlag(options::OPT_mno_mutable_globals,
+                             options::OPT_mmutable_globals, false)) {
+        getDriver().Diag(diag::err_drv_argument_not_allowed_with)
+            << "-fPIC"
+            << "-mno-mutable-globals";
+      }
+      CC1Args.push_back("-target-feature");
+      CC1Args.push_back("+mutable-globals");
+    }
+  }
+
   if (DriverArgs.getLastArg(options::OPT_fwasm_exceptions)) {
     // '-fwasm-exceptions' is not compatible with '-mno-exception-handling'
     if (DriverArgs.hasFlag(options::OPT_mno_exception_handing,
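Net effect of the mutable-globals logic above, sketched for two assumed
invocations: plain -fPIC for a wasm target adds "-target-feature
+mutable-globals" to the cc1 job, while -fPIC combined with an explicit
-mno-mutable-globals raises err_drv_argument_not_allowed_with, since the PIC
ABI cannot work without importing and exporting mutable globals.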
diff --git a/clang/lib/Driver/ToolChains/ZOS.cpp b/clang/lib/Driver/ToolChains/ZOS.cpp
index d57686b8930a3..f921227076a5e 100644
--- a/clang/lib/Driver/ToolChains/ZOS.cpp
+++ b/clang/lib/Driver/ToolChains/ZOS.cpp
@@ -21,3 +21,13 @@ ZOS::ZOS(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
     : ToolChain(D, Triple, Args) {}
 
 ZOS::~ZOS() {}
+
+void ZOS::addClangTargetOptions(const ArgList &DriverArgs,
+                                ArgStringList &CC1Args,
+                                Action::OffloadKind DeviceOffloadKind) const {
+  // Pass "-faligned-alloc-unavailable" only when the user hasn't manually
+  // enabled or disabled aligned allocations.
+  if (!DriverArgs.hasArgNoClaim(options::OPT_faligned_allocation,
+                                options::OPT_fno_aligned_allocation))
+    CC1Args.push_back("-faligned-alloc-unavailable");
+}
diff --git a/clang/lib/Driver/ToolChains/ZOS.h b/clang/lib/Driver/ToolChains/ZOS.h
index 3a90f4a12428a..cace85d6da772 100644
--- a/clang/lib/Driver/ToolChains/ZOS.h
+++ b/clang/lib/Driver/ToolChains/ZOS.h
@@ -27,6 +27,10 @@ class LLVM_LIBRARY_VISIBILITY ZOS : public ToolChain {
   bool isPICDefaultForced() const override { return false; }
 
   bool IsIntegratedAssemblerDefault() const override { return true; }
+
+  void addClangTargetOptions(
+      const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,
+      Action::OffloadKind DeviceOffloadingKind) const override;
 };
 
 } // end namespace toolchains
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index f6db58acd8dbe..c1466196b4d64 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -401,7 +401,7 @@ bool FormatTokenLexer::tryTransformTryUsageForC() {
   if (!Try->is(tok::kw_try))
     return false;
   auto &Next = *(Tokens.end() - 1);
-  if (Next->isOneOf(tok::l_brace, tok::colon))
+  if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
     return false;
 
   if (Tokens.size() > 2) {
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 5dd6a7a9da40b..2fa3b28f3a390 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -56,6 +56,13 @@ static bool isLambdaParameterList(const FormatToken *Left) {
          Left->Previous->MatchingParen->is(TT_LambdaLSquare);
 }
 
+/// Returns \c true if the token is followed by a boolean condition, \c false
+/// otherwise.
+static bool isKeywordWithCondition(const FormatToken &Tok) {
+  return Tok.isOneOf(tok::kw_if, tok::kw_for, tok::kw_while, tok::kw_switch,
+                     tok::kw_constexpr, tok::kw_catch);
+}
+
 /// A parser that gathers additional information about tokens.
 ///
 /// The \c TokenAnnotator tries to match parentheses and square brackets and
@@ -108,6 +115,12 @@ class AnnotatingParser {
 
     while (CurrentToken) {
       if (CurrentToken->is(tok::greater)) {
+        // Try to do a better job at looking for ">>" within the condition of
+        // a statement.
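+        // For example, in "if (a < b >> c)" the ">>" is a shift, not a
+        // pair of closing angle brackets.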
+        if (CurrentToken->Next && CurrentToken->Next->is(tok::greater) &&
+            Left->ParentBracket != tok::less &&
+            isKeywordWithCondition(*Line.First))
+          return false;
         Left->MatchingParen = CurrentToken;
         CurrentToken->MatchingParen = Left;
         // In TT_Proto, we must distinguish between:
@@ -185,8 +198,8 @@ class AnnotatingParser {
     if (!CurrentToken)
       return false;
     FormatToken *Left = CurrentToken->Previous;
-    FormatToken *PrevNonComment =
-        Left ? Left->getPreviousNonComment() : nullptr;
+    assert(Left && "Unknown previous token");
+    FormatToken *PrevNonComment = Left->getPreviousNonComment();
     Left->ParentBracket = Contexts.back().ContextKind;
     ScopedContextCreator ContextCreator(*this, tok::l_paren, 1);
 
@@ -2768,13 +2781,6 @@ bool TokenAnnotator::spaceRequiredBeforeParens(const FormatToken &Right) const {
           Right.ParameterCount > 0);
 }
 
-/// Returns \c true if the token is followed by a boolean condition, \c false
-/// otherwise.
-static bool isKeywordWithCondition(const FormatToken &Tok) {
-  return Tok.isOneOf(tok::kw_if, tok::kw_for, tok::kw_while, tok::kw_switch,
-                     tok::kw_constexpr, tok::kw_catch);
-}
-
 bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line,
                                           const FormatToken &Left,
                                           const FormatToken &Right) {
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 794a5c96b0f4a..b774f6c58a769 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1002,6 +1002,8 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
   Opts.UniqueInternalLinkageNames =
       Args.hasArg(OPT_funique_internal_linkage_names);
 
+  Opts.SplitMachineFunctions = Args.hasArg(OPT_fsplit_machine_functions);
+
   Opts.MergeFunctions = Args.hasArg(OPT_fmerge_functions);
 
   Opts.NoUseJumpTables = Args.hasArg(OPT_fno_jump_tables);
@@ -1037,7 +1039,7 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
   Opts.ThinLinkBitcodeFile =
       std::string(Args.getLastArgValue(OPT_fthin_link_bitcode_EQ));
 
-  Opts.HeapProf = Args.hasArg(OPT_fmemory_profile);
+  Opts.MemProf = Args.hasArg(OPT_fmemory_profile);
 
   Opts.MSVolatile = Args.hasArg(OPT_fms_volatile);
 
@@ -1457,6 +1459,8 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
       std::string(Args.getLastArgValue(OPT_fsymbol_partition_EQ));
 
   Opts.ForceAAPCSBitfieldLoad = Args.hasArg(OPT_ForceAAPCSBitfieldLoad);
+
+  Opts.PassByValueIsNoAlias = Args.hasArg(OPT_fpass_by_value_is_noalias);
   return Success;
 }
 
diff --git a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
index 1d5a6c06b34fe..ff0aa6faf33f6 100644
--- a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
+++ b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
 std::unique_ptr<CompilerInvocation> clang::createInvocationFromCommandLine(
   Args.push_back("-fsyntax-only");
 
   // FIXME: We shouldn't have to pass in the path info.
-  driver::Driver TheDriver(Args[0], llvm::sys::getDefaultTargetTriple(),
-                           *Diags, VFS);
+  driver::Driver TheDriver(Args[0], llvm::sys::getDefaultTargetTriple(), *Diags,
+                           "clang LLVM compiler", VFS);
 
   // Don't check that inputs exist, they may have been remapped.
   TheDriver.setCheckInputsExist(false);
diff --git a/clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp b/clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp
index 8c41e71ef0187..c0c81221b2344 100644
--- a/clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp
+++ b/clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp
@@ -586,7 +586,8 @@ namespace {
                                              CastKind Kind, Expr *E) {
       TypeSourceInfo *TInfo = Ctx->getTrivialTypeSourceInfo(Ty, SourceLocation());
       return CStyleCastExpr::Create(*Ctx, Ty, VK_RValue, Kind, E, nullptr,
-                                    TInfo, SourceLocation(), SourceLocation());
+                                    FPOptionsOverride(), TInfo,
+                                    SourceLocation(), SourceLocation());
     }
 
     bool ImplementationIsNonLazy(const ObjCImplDecl *OD) const {
@@ -2105,8 +2106,8 @@ RewriteModernObjC::SynthesizeCallToFunctionDecl(FunctionDecl *FD,
   // Now, we cast the reference to a pointer to the objc_msgSend type.
   QualType pToFunc = Context->getPointerType(msgSendType);
   ImplicitCastExpr *ICE =
-    ImplicitCastExpr::Create(*Context, pToFunc, CK_FunctionToPointerDecay,
-                             DRE, nullptr, VK_RValue);
+      ImplicitCastExpr::Create(*Context, pToFunc, CK_FunctionToPointerDecay,
+                               DRE, nullptr, VK_RValue, FPOptionsOverride());
 
   const auto *FT = msgSendType->castAs<FunctionType>();
   CallExpr *Exp =
diff --git a/clang/lib/Frontend/Rewrite/RewriteObjC.cpp b/clang/lib/Frontend/Rewrite/RewriteObjC.cpp
index 4ecd6e95de10e..990509a84b06c 100644
--- a/clang/lib/Frontend/Rewrite/RewriteObjC.cpp
+++ b/clang/lib/Frontend/Rewrite/RewriteObjC.cpp
@@ -492,7 +492,8 @@ namespace {
                                              CastKind Kind, Expr *E) {
       TypeSourceInfo *TInfo = Ctx->getTrivialTypeSourceInfo(Ty, SourceLocation());
       return CStyleCastExpr::Create(*Ctx, Ty, VK_RValue, Kind, E, nullptr,
-                                    TInfo, SourceLocation(), SourceLocation());
+                                    FPOptionsOverride(), TInfo,
+                                    SourceLocation(), SourceLocation());
     }
 
     StringLiteral *getStringLiteral(StringRef Str) {
@@ -2022,8 +2023,8 @@ RewriteObjC::SynthesizeCallToFunctionDecl(FunctionDecl *FD,
   // Now, we cast the reference to a pointer to the objc_msgSend type.
   QualType pToFunc = Context->getPointerType(msgSendType);
   ImplicitCastExpr *ICE =
-    ImplicitCastExpr::Create(*Context, pToFunc, CK_FunctionToPointerDecay,
-                             DRE, nullptr, VK_RValue);
+      ImplicitCastExpr::Create(*Context, pToFunc, CK_FunctionToPointerDecay,
+                               DRE, nullptr, VK_RValue, FPOptionsOverride());
 
   const auto *FT = msgSendType->castAs<FunctionType>();
 
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 0692fe75a4417..a9761f0490675 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -154,6 +154,7 @@ set(openmp_wrapper_files
   openmp_wrappers/complex.h
   openmp_wrappers/complex
   openmp_wrappers/__clang_openmp_device_functions.h
+  openmp_wrappers/complex_cmath.h
   openmp_wrappers/new
 )
 
diff --git a/clang/lib/Headers/__clang_cuda_cmath.h b/clang/lib/Headers/__clang_cuda_cmath.h
index 8ba182689a4f9..f49463d72e042 100644
--- a/clang/lib/Headers/__clang_cuda_cmath.h
+++ b/clang/lib/Headers/__clang_cuda_cmath.h
@@ -66,10 +66,38 @@ __DEVICE__ float frexp(float __arg, int *__exp) {
 }
 
 // For inscrutable reasons, the CUDA headers define these functions for us on
-// Windows. For OpenMP we omit these as some old system headers have
-// non-conforming `isinf(float)` and `isnan(float)` implementations that return
-// an `int`. The system versions of these functions should be fine anyway.
-#if !defined(_MSC_VER) && !defined(__OPENMP_NVPTX__)
+// Windows.
+#if !defined(_MSC_VER) || defined(__OPENMP_NVPTX__)
+
+// For OpenMP we work around some old system headers that have non-conforming
+// `isinf(float)` and `isnan(float)` implementations that return an `int`. We do
+// this by providing two versions of these functions, differing only in the
+// return type. To avoid conflicting definitions we disable implicit base
+// function generation. That means we will end up with two specializations, one
+// per type, but only one has a base function defined by the system header.
+#if defined(__OPENMP_NVPTX__)
+#pragma omp begin declare variant match(                                       \
+    implementation = {extension(disable_implicit_base)})
+
+// FIXME: We lack an extension to customize the mangling of the variants, e.g.,
+//        add a suffix. This means we would clash with the names of the variants
+//        (note that we do not create implicit base functions here). To avoid
+//        this clash we add a new trait to some of them that is always true
+//        (this is LLVM after all ;)). It will only influence the mangled name
+//        of the variants inside the inner region and avoid the clash.
+#pragma omp begin declare variant match(implementation = {vendor(llvm)})
+
+__DEVICE__ int isinf(float __x) { return ::__isinff(__x); }
+__DEVICE__ int isinf(double __x) { return ::__isinf(__x); }
+__DEVICE__ int isfinite(float __x) { return ::__finitef(__x); }
+__DEVICE__ int isfinite(double __x) { return ::__isfinited(__x); }
+__DEVICE__ int isnan(float __x) { return ::__isnanf(__x); }
+__DEVICE__ int isnan(double __x) { return ::__isnan(__x); }
+
+#pragma omp end declare variant
+
+#endif
+
 __DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
 __DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
 __DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
@@ -79,6 +107,11 @@ __DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
 __DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); }
 __DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
 __DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
+
+#if defined(__OPENMP_NVPTX__)
+#pragma omp end declare variant
+#endif
+
 #endif
 
 __DEVICE__ bool isgreater(float __x, float __y) {
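The reason the return type matters can be seen with a tiny self-contained
host-side analogue (illustrative names, not part of the header):

  #include <cmath>
  #include <cstdio>

  static int legacy_isnan(float x) { return x != x; }   // old libc shape
  static bool strict_isnan(float x) { return x != x; }  // conforming shape

  int main() {
    float v = std::nanf("");
    std::printf("legacy=%d strict=%d\n", legacy_isnan(v),
                static_cast<int>(strict_isnan(v)));
    return 0;
  }

Both agree for NaN inputs; the problem the variant trick above solves is purely
one of conflicting declarations when an old system header already provides the
int-returning shape.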
diff --git a/clang/lib/Headers/altivec.h b/clang/lib/Headers/altivec.h
index 22744adefbefd..32b161d82d8e8 100644
--- a/clang/lib/Headers/altivec.h
+++ b/clang/lib/Headers/altivec.h
@@ -3368,6 +3368,18 @@ vec_dive(vector unsigned long long __a, vector unsigned long long __b) {
 }
 #endif
 
+#ifdef __POWER10_VECTOR__
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_div(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __a / __b;
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_div(vector signed __int128 __a, vector signed __int128 __b) {
+  return __a / __b;
+}
+#endif // __POWER10_VECTOR__
+
 /* vec_dss */
 
 #define vec_dss __builtin_altivec_dss
@@ -17068,6 +17080,18 @@ vec_expandm(vector unsigned __int128 __a) {
   return __builtin_altivec_vexpandqm(__a);
 }
 
+/* vec_cntm */
+
+#define vec_cntm(__a, __mp)                                                    \
+  _Generic((__a), vector unsigned char                                         \
+           : __builtin_altivec_vcntmbb((__a), (unsigned int)(__mp)),           \
+             vector unsigned short                                             \
+           : __builtin_altivec_vcntmbh((__a), (unsigned int)(__mp)),           \
+             vector unsigned int                                               \
+           : __builtin_altivec_vcntmbw((__a), (unsigned int)(__mp)),           \
+             vector unsigned long long                                         \
+           : __builtin_altivec_vcntmbd((__a), (unsigned int)(__mp)))
+
 /* vec_pdep */
 
 static __inline__ vector unsigned long long __ATTRS_o_ai
diff --git a/clang/lib/Headers/intrin.h b/clang/lib/Headers/intrin.h
index 871b47ca82674..e7b76a3bb2ed7 100644
--- a/clang/lib/Headers/intrin.h
+++ b/clang/lib/Headers/intrin.h
@@ -57,16 +57,11 @@ void __addfsbyte(unsigned long, unsigned char);
 void __addfsdword(unsigned long, unsigned long);
 void __addfsword(unsigned long, unsigned short);
 void __code_seg(const char *);
-static __inline__
 void __cpuid(int[4], int);
-static __inline__
 void __cpuidex(int[4], int, int);
-static __inline__
 __int64 __emul(int, int);
-static __inline__
 unsigned __int64 __emulu(unsigned int, unsigned int);
 unsigned int __getcallerseflags(void);
-static __inline__
 void __halt(void);
 unsigned char __inbyte(unsigned short);
 void __inbytestring(unsigned short, unsigned char *, unsigned long);
@@ -82,13 +77,9 @@ void __inwordstring(unsigned short, unsigned short *, unsigned long);
 void __lidt(void *);
 unsigned __int64 __ll_lshift(unsigned __int64, int);
 __int64 __ll_rshift(__int64, int);
-static __inline__
 void __movsb(unsigned char *, unsigned char const *, size_t);
-static __inline__
 void __movsd(unsigned long *, unsigned long const *, size_t);
-static __inline__
 void __movsw(unsigned short *, unsigned short const *, size_t);
-static __inline__
 void __nop(void);
 void __nvreg_restore_fence(void);
 void __nvreg_save_fence(void);
@@ -105,23 +96,16 @@ unsigned long __readcr4(void);
 unsigned long __readcr8(void);
 unsigned int __readdr(unsigned int);
 #ifdef __i386__
-static __inline__
 unsigned char __readfsbyte(unsigned long);
-static __inline__
 unsigned __int64 __readfsqword(unsigned long);
-static __inline__
 unsigned short __readfsword(unsigned long);
 #endif
-static __inline__
 unsigned __int64 __readmsr(unsigned long);
 unsigned __int64 __readpmc(unsigned long);
 unsigned long __segmentlimit(unsigned long);
 void __sidt(void *);
-static __inline__
 void __stosb(unsigned char *, unsigned char, size_t);
-static __inline__
 void __stosd(unsigned long *, unsigned long, size_t);
-static __inline__
 void __stosw(unsigned short *, unsigned short, size_t);
 void __svm_clgi(void);
 void __svm_invlpga(void *, int);
@@ -136,7 +120,6 @@ void __vmx_off(void);
 void __vmx_vmptrst(unsigned __int64 *);
 void __wbinvd(void);
 void __writecr0(unsigned int);
-static __inline__
 void __writecr3(unsigned __INTPTR_TYPE__);
 void __writecr4(unsigned int);
 void __writecr8(unsigned int);
@@ -146,11 +129,8 @@ void __writefsdword(unsigned long, unsigned long);
 void __writefsqword(unsigned long, unsigned __int64);
 void __writefsword(unsigned long, unsigned short);
 void __writemsr(unsigned long, unsigned __int64);
-static __inline__
 void *_AddressOfReturnAddress(void);
-static __inline__
 unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
-static __inline__
 unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
 unsigned char _bittest(long const *, long);
 unsigned char _bittestandcomplement(long *, long);
@@ -169,12 +149,10 @@ long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
 long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
 __int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
 __int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
-static __inline__ void
-__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
-_ReadBarrier(void);
-static __inline__ void
-__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
-_ReadWriteBarrier(void);
+void __attribute__((__deprecated__(
+    "use other intrinsics or C++11 atomics instead"))) _ReadBarrier(void);
+void __attribute__((__deprecated__(
+    "use other intrinsics or C++11 atomics instead"))) _ReadWriteBarrier(void);
 unsigned int _rorx_u32(unsigned int, const unsigned int);
 int _sarx_i32(int, unsigned int);
 #if __STDC_HOSTED__
@@ -185,9 +163,8 @@ unsigned int _shrx_u32(unsigned int, unsigned int);
 void _Store_HLERelease(long volatile *, long);
 void _Store64_HLERelease(__int64 volatile *, __int64);
 void _StorePointer_HLERelease(void *volatile *, void *);
-static __inline__ void
-__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
-_WriteBarrier(void);
+void __attribute__((__deprecated__(
+    "use other intrinsics or C++11 atomics instead"))) _WriteBarrier(void);
 unsigned __int32 xbegin(void);
 void _xend(void);
 
@@ -197,19 +174,14 @@ void __addgsbyte(unsigned long, unsigned char);
 void __addgsdword(unsigned long, unsigned long);
 void __addgsqword(unsigned long, unsigned __int64);
 void __addgsword(unsigned long, unsigned short);
-static __inline__
 void __faststorefence(void);
 void __incgsbyte(unsigned long);
 void __incgsdword(unsigned long);
 void __incgsqword(unsigned long);
 void __incgsword(unsigned long);
-static __inline__
 void __movsq(unsigned long long *, unsigned long long const *, size_t);
-static __inline__
 unsigned char __readgsbyte(unsigned long);
-static __inline__
 unsigned long __readgsdword(unsigned long);
-static __inline__
 unsigned __int64 __readgsqword(unsigned long);
 unsigned short __readgsword(unsigned long);
 unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
@@ -218,7 +190,6 @@ unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
 unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
                                  unsigned __int64 _HighPart,
                                  unsigned char _Shift);
-static __inline__
 void __stosq(unsigned __int64 *, unsigned __int64, size_t);
 unsigned char __vmx_on(unsigned __int64 *);
 unsigned char __vmx_vmclear(unsigned __int64 *);
@@ -269,13 +240,9 @@ unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int);
 __int64 _sarx_i64(__int64, unsigned int);
 unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
 unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
-static __inline__
 __int64 __mulh(__int64, __int64);
-static __inline__
 unsigned __int64 __umulh(unsigned __int64, unsigned __int64);
-static __inline__
 __int64 _mul128(__int64, __int64, __int64*);
-static __inline__
 unsigned __int64 _umul128(unsigned __int64,
                           unsigned __int64,
                           unsigned __int64*);
@@ -284,29 +251,19 @@ unsigned __int64 _umul128(unsigned __int64,
 
 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
 
-static __inline__
 unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
-static __inline__
 unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
 
 #endif
 
 #if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
-static __inline__
 __int64 _InterlockedDecrement64(__int64 volatile *_Addend);
-static __inline__
 __int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
-static __inline__
 __int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
-static __inline__
 __int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value);
-static __inline__
 __int64 _InterlockedIncrement64(__int64 volatile *_Addend);
-static __inline__
 __int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
-static __inline__
 __int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
-static __inline__
 __int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask);
 
 #endif
@@ -475,40 +432,56 @@ __int64 _InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
 |* movs, stos
 \*----------------------------------------------------------------------------*/
 #if defined(__i386__) || defined(__x86_64__)
-static __inline__ void __DEFAULT_FN_ATTRS
-__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
+static __inline__ void __DEFAULT_FN_ATTRS __movsb(unsigned char *__dst,
+                                                  unsigned char const *__src,
+                                                  size_t __n) {
   __asm__ __volatile__("rep movsb" : "+D"(__dst), "+S"(__src), "+c"(__n)
                        : : "memory");
 }
-static __inline__ void __DEFAULT_FN_ATTRS
-__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
-  __asm__ __volatile__("rep movsl" : "+D"(__dst), "+S"(__src), "+c"(__n)
-                       : : "memory");
+static __inline__ void __DEFAULT_FN_ATTRS __movsd(unsigned long *__dst,
+                                                  unsigned long const *__src,
+                                                  size_t __n) {
+  __asm__ __volatile__("rep movsl"
+                       : "+D"(__dst), "+S"(__src), "+c"(__n)
+                       :
+                       : "memory");
 }
-static __inline__ void __DEFAULT_FN_ATTRS
-__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
-  __asm__ __volatile__("rep movsw" : "+D"(__dst), "+S"(__src), "+c"(__n)
-                       : : "memory");
+static __inline__ void __DEFAULT_FN_ATTRS __movsw(unsigned short *__dst,
+                                                  unsigned short const *__src,
+                                                  size_t __n) {
+  __asm__ __volatile__("rep movsw"
+                       : "+D"(__dst), "+S"(__src), "+c"(__n)
+                       :
+                       : "memory");
 }
-static __inline__ void __DEFAULT_FN_ATTRS
-__stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
-  __asm__ __volatile__("rep stosl" : "+D"(__dst), "+c"(__n) : "a"(__x)
+static __inline__ void __DEFAULT_FN_ATTRS __stosd(unsigned long *__dst,
+                                                  unsigned long __x,
+                                                  size_t __n) {
+  __asm__ __volatile__("rep stosl"
+                       : "+D"(__dst), "+c"(__n)
+                       : "a"(__x)
                        : "memory");
 }
-static __inline__ void __DEFAULT_FN_ATTRS
-__stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
-  __asm__ __volatile__("rep stosw" : "+D"(__dst), "+c"(__n) : "a"(__x)
+static __inline__ void __DEFAULT_FN_ATTRS __stosw(unsigned short *__dst,
+                                                  unsigned short __x,
+                                                  size_t __n) {
+  __asm__ __volatile__("rep stosw"
+                       : "+D"(__dst), "+c"(__n)
+                       : "a"(__x)
                        : "memory");
 }
 #endif
 #ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS
-__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
-  __asm__ __volatile__("rep movsq" : "+D"(__dst), "+S"(__src), "+c"(__n)
-                       : : "memory");
+static __inline__ void __DEFAULT_FN_ATTRS __movsq(
+    unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
+  __asm__ __volatile__("rep movsq"
+                       : "+D"(__dst), "+S"(__src), "+c"(__n)
+                       :
+                       : "memory");
 }
-static __inline__ void __DEFAULT_FN_ATTRS
-__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
+static __inline__ void __DEFAULT_FN_ATTRS __stosq(unsigned __int64 *__dst,
+                                                  unsigned __int64 __x,
+                                                  size_t __n) {
   __asm__ __volatile__("rep stosq" : "+D"(__dst), "+c"(__n) : "a"(__x)
                        : "memory");
 }
@@ -518,26 +491,25 @@ __stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
 |* Misc
 \*----------------------------------------------------------------------------*/
 #if defined(__i386__) || defined(__x86_64__)
-static __inline__ void __DEFAULT_FN_ATTRS
-__cpuid(int __info[4], int __level) {
-  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
-                   : "a"(__level), "c"(0));
+static __inline__ void __DEFAULT_FN_ATTRS __cpuid(int __info[4], int __level) {
+  __asm__("cpuid"
+          : "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3])
+          : "a"(__level), "c"(0));
 }
-static __inline__ void __DEFAULT_FN_ATTRS
-__cpuidex(int __info[4], int __level, int __ecx) {
-  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
-                   : "a"(__level), "c"(__ecx));
+static __inline__ void __DEFAULT_FN_ATTRS __cpuidex(int __info[4], int __level,
+                                                    int __ecx) {
+  __asm__("cpuid"
+          : "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3])
+          : "a"(__level), "c"(__ecx));
 }
-static __inline__ void __DEFAULT_FN_ATTRS
-__halt(void) {
-  __asm__ volatile ("hlt");
+static __inline__ void __DEFAULT_FN_ATTRS __halt(void) {
+  __asm__ volatile("hlt");
 }
 #endif
 
 #if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__)
-static __inline__ void __DEFAULT_FN_ATTRS
-__nop(void) {
-  __asm__ volatile ("nop");
+static __inline__ void __DEFAULT_FN_ATTRS __nop(void) {
+  __asm__ volatile("nop");
 }
 #endif
 
@@ -574,8 +546,7 @@ __readmsr(unsigned long __register) {
 }
 #endif
 
-static __inline__ unsigned __LPTRINT_TYPE__ __DEFAULT_FN_ATTRS
-__readcr3(void) {
+static __inline__ unsigned __LPTRINT_TYPE__ __DEFAULT_FN_ATTRS __readcr3(void) {
   unsigned __LPTRINT_TYPE__ __cr3_val;
   __asm__ __volatile__ ("mov %%cr3, %0" : "=r"(__cr3_val) : : "memory");
   return __cr3_val;
diff --git a/clang/lib/Headers/openmp_wrappers/cmath b/clang/lib/Headers/openmp_wrappers/cmath
index bd6011eb6f6d5..1aff66af7d52d 100644
--- a/clang/lib/Headers/openmp_wrappers/cmath
+++ b/clang/lib/Headers/openmp_wrappers/cmath
@@ -24,8 +24,11 @@
 // which might live in cstdlib.
 #include <cstdlib>
 
+// We need limits because __clang_cuda_cmath.h below uses `std::numeric_limits`.
+#include <limits>
+
 #pragma omp begin declare variant match(                                       \
-    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any, allow_templates)})
 
 #define __CUDA__
 #define __OPENMP_NVPTX__
diff --git a/clang/lib/Headers/openmp_wrappers/complex b/clang/lib/Headers/openmp_wrappers/complex
index 1ed0b14879efb..306ffe2080534 100644
--- a/clang/lib/Headers/openmp_wrappers/complex
+++ b/clang/lib/Headers/openmp_wrappers/complex
@@ -23,3 +23,28 @@
 
 // Grab the host header too.
 #include_next <complex>
+
+
+#ifdef __cplusplus
+
+// If we are compiling against libc++, the macro _LIBCPP_STD_VER should be set
+// after including <complex> above. Since the complex header we use is a
+// simplified version of the libc++ one, we don't need it in this case. If we
+// compile against libstdc++, or any other standard library, we will overload
+// the (hopefully templated) functions in the <complex> header with the ones we
+// got from libc++, which decomposes math functions, like `std::sin`, into
+// arithmetic and calls to non-complex functions, all of which we can then
+// handle.
+#ifndef _LIBCPP_STD_VER
+
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)},                                           \
+    implementation = {extension(match_any, allow_templates)})
+
+#include <complex_cmath.h>
+
+#pragma omp end declare variant
+
+#endif
+
+#endif
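A hedged usage sketch of what this wrapper enables (assumes an OpenMP NVPTX
offload build against libstdc++; names are illustrative):

  #include <complex>

  double demo() {
    double result = 0.0;
  #pragma omp target map(tofrom : result)
    {
      std::complex<double> z(1.0, 2.0);
      result = std::abs(std::sin(z)); // decomposed into real-valued math calls
    }
    return result;
  }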
diff --git a/clang/lib/Headers/openmp_wrappers/complex_cmath.h b/clang/lib/Headers/openmp_wrappers/complex_cmath.h
new file mode 100644
index 0000000000000..e3d9aebbbc243
--- /dev/null
+++ b/clang/lib/Headers/openmp_wrappers/complex_cmath.h
@@ -0,0 +1,388 @@
+//===------------------------- __complex_cmath.h --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// std::complex header copied from the libcxx source and simplified for use in
+// OpenMP target offload regions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OPENMP
+#error "This file is for OpenMP compilation only."
+#endif
+
+#ifndef __cplusplus
+#error "This file is for C++ compilation only."
+#endif
+
+#ifndef _LIBCPP_COMPLEX
+#define _LIBCPP_COMPLEX
+
+#include <cmath>
+#include <type_traits>
+
+#define __DEVICE__ static constexpr __attribute__((nothrow))
+
+namespace std {
+
+// abs
+
+template <class _Tp> __DEVICE__ _Tp abs(const std::complex<_Tp> &__c) {
+  return hypot(__c.real(), __c.imag());
+}
+
+// arg
+
+template <class _Tp> __DEVICE__ _Tp arg(const std::complex<_Tp> &__c) {
+  return atan2(__c.imag(), __c.real());
+}
+
+template <class _Tp>
+typename enable_if<is_integral<_Tp>::value || is_same<_Tp, double>::value,
+                   double>::type
+arg(_Tp __re) {
+  return atan2(0., __re);
+}
+
+template <class _Tp>
+typename enable_if<is_same<_Tp, float>::value, float>::type arg(_Tp __re) {
+  return atan2f(0.F, __re);
+}
+
+// norm
+
+template <class _Tp> __DEVICE__ _Tp norm(const std::complex<_Tp> &__c) {
+  if (std::isinf(__c.real()))
+    return abs(__c.real());
+  if (std::isinf(__c.imag()))
+    return abs(__c.imag());
+  return __c.real() * __c.real() + __c.imag() * __c.imag();
+}
+
+// conj
+
+template <class _Tp> std::complex<_Tp> conj(const std::complex<_Tp> &__c) {
+  return std::complex<_Tp>(__c.real(), -__c.imag());
+}
+
+// proj
+
+template <class _Tp> std::complex<_Tp> proj(const std::complex<_Tp> &__c) {
+  std::complex<_Tp> __r = __c;
+  if (std::isinf(__c.real()) || std::isinf(__c.imag()))
+    __r = std::complex<_Tp>(INFINITY, copysign(_Tp(0), __c.imag()));
+  return __r;
+}
+
+// polar
+
+template <class _Tp>
+complex<_Tp> polar(const _Tp &__rho, const _Tp &__theta = _Tp()) {
+  if (std::isnan(__rho) || signbit(__rho))
+    return std::complex<_Tp>(_Tp(NAN), _Tp(NAN));
+  if (std::isnan(__theta)) {
+    if (std::isinf(__rho))
+      return std::complex<_Tp>(__rho, __theta);
+    return std::complex<_Tp>(__theta, __theta);
+  }
+  if (std::isinf(__theta)) {
+    if (std::isinf(__rho))
+      return std::complex<_Tp>(__rho, _Tp(NAN));
+    return std::complex<_Tp>(_Tp(NAN), _Tp(NAN));
+  }
+  _Tp __x = __rho * cos(__theta);
+  if (std::isnan(__x))
+    __x = 0;
+  _Tp __y = __rho * sin(__theta);
+  if (std::isnan(__y))
+    __y = 0;
+  return std::complex<_Tp>(__x, __y);
+}
+
+// log
+
+template <class _Tp> std::complex<_Tp> log(const std::complex<_Tp> &__x) {
+  return std::complex<_Tp>(log(abs(__x)), arg(__x));
+}
+
+// log10
+
+template <class _Tp> std::complex<_Tp> log10(const std::complex<_Tp> &__x) {
+  return log(__x) / log(_Tp(10));
+}
+
+// sqrt
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> sqrt(const std::complex<_Tp> &__x) {
+  if (std::isinf(__x.imag()))
+    return std::complex<_Tp>(_Tp(INFINITY), __x.imag());
+  if (std::isinf(__x.real())) {
+    if (__x.real() > _Tp(0))
+      return std::complex<_Tp>(__x.real(), std::isnan(__x.imag())
+                                               ? __x.imag()
+                                               : copysign(_Tp(0), __x.imag()));
+    return std::complex<_Tp>(std::isnan(__x.imag()) ? __x.imag() : _Tp(0),
+                             copysign(__x.real(), __x.imag()));
+  }
+  return polar(sqrt(abs(__x)), arg(__x) / _Tp(2));
+}
+
+// exp
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> exp(const std::complex<_Tp> &__x) {
+  _Tp __i = __x.imag();
+  if (std::isinf(__x.real())) {
+    if (__x.real() < _Tp(0)) {
+      if (!std::isfinite(__i))
+        __i = _Tp(1);
+    } else if (__i == 0 || !std::isfinite(__i)) {
+      if (std::isinf(__i))
+        __i = _Tp(NAN);
+      return std::complex<_Tp>(__x.real(), __i);
+    }
+  } else if (std::isnan(__x.real()) && __x.imag() == 0)
+    return __x;
+  _Tp __e = exp(__x.real());
+  return std::complex<_Tp>(__e * cos(__i), __e * sin(__i));
+}
+
+// pow
+
+template <class _Tp>
+std::complex<_Tp> pow(const std::complex<_Tp> &__x,
+                      const std::complex<_Tp> &__y) {
+  return exp(__y * log(__x));
+}
+
+// __sqr, computes pow(x, 2)
+
+template <class _Tp> std::complex<_Tp> __sqr(const std::complex<_Tp> &__x) {
+  return std::complex<_Tp>((__x.real() - __x.imag()) *
+                               (__x.real() + __x.imag()),
+                           _Tp(2) * __x.real() * __x.imag());
+}
+
+// asinh
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> asinh(const std::complex<_Tp> &__x) {
+  const _Tp __pi(atan2(+0., -0.));
+  if (std::isinf(__x.real())) {
+    if (std::isnan(__x.imag()))
+      return __x;
+    if (std::isinf(__x.imag()))
+      return std::complex<_Tp>(__x.real(),
+                               copysign(__pi * _Tp(0.25), __x.imag()));
+    return std::complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
+  }
+  if (std::isnan(__x.real())) {
+    if (std::isinf(__x.imag()))
+      return std::complex<_Tp>(__x.imag(), __x.real());
+    if (__x.imag() == 0)
+      return __x;
+    return std::complex<_Tp>(__x.real(), __x.real());
+  }
+  if (std::isinf(__x.imag()))
+    return std::complex<_Tp>(copysign(__x.imag(), __x.real()),
+                             copysign(__pi / _Tp(2), __x.imag()));
+  std::complex<_Tp> __z = log(__x + sqrt(__sqr(__x) + _Tp(1)));
+  return std::complex<_Tp>(copysign(__z.real(), __x.real()),
+                           copysign(__z.imag(), __x.imag()));
+}
+
+// acosh
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> acosh(const std::complex<_Tp> &__x) {
+  const _Tp __pi(atan2(+0., -0.));
+  if (std::isinf(__x.real())) {
+    if (std::isnan(__x.imag()))
+      return std::complex<_Tp>(abs(__x.real()), __x.imag());
+    if (std::isinf(__x.imag())) {
+      if (__x.real() > 0)
+        return std::complex<_Tp>(__x.real(),
+                                 copysign(__pi * _Tp(0.25), __x.imag()));
+      else
+        return std::complex<_Tp>(-__x.real(),
+                                 copysign(__pi * _Tp(0.75), __x.imag()));
+    }
+    if (__x.real() < 0)
+      return std::complex<_Tp>(-__x.real(), copysign(__pi, __x.imag()));
+    return std::complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
+  }
+  if (std::isnan(__x.real())) {
+    if (std::isinf(__x.imag()))
+      return std::complex<_Tp>(abs(__x.imag()), __x.real());
+    return std::complex<_Tp>(__x.real(), __x.real());
+  }
+  if (std::isinf(__x.imag()))
+    return std::complex<_Tp>(abs(__x.imag()),
+                             copysign(__pi / _Tp(2), __x.imag()));
+  std::complex<_Tp> __z = log(__x + sqrt(__sqr(__x) - _Tp(1)));
+  return std::complex<_Tp>(copysign(__z.real(), _Tp(0)),
+                           copysign(__z.imag(), __x.imag()));
+}
+
+// atanh
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> atanh(const std::complex<_Tp> &__x) {
+  const _Tp __pi(atan2(+0., -0.));
+  if (std::isinf(__x.imag())) {
+    return std::complex<_Tp>(copysign(_Tp(0), __x.real()),
+                             copysign(__pi / _Tp(2), __x.imag()));
+  }
+  if (std::isnan(__x.imag())) {
+    if (std::isinf(__x.real()) || __x.real() == 0)
+      return std::complex<_Tp>(copysign(_Tp(0), __x.real()), __x.imag());
+    return std::complex<_Tp>(__x.imag(), __x.imag());
+  }
+  if (std::isnan(__x.real())) {
+    return std::complex<_Tp>(__x.real(), __x.real());
+  }
+  if (std::isinf(__x.real())) {
+    return std::complex<_Tp>(copysign(_Tp(0), __x.real()),
+                             copysign(__pi / _Tp(2), __x.imag()));
+  }
+  if (abs(__x.real()) == _Tp(1) && __x.imag() == _Tp(0)) {
+    return std::complex<_Tp>(copysign(_Tp(INFINITY), __x.real()),
+                             copysign(_Tp(0), __x.imag()));
+  }
+  std::complex<_Tp> __z = log((_Tp(1) + __x) / (_Tp(1) - __x)) / _Tp(2);
+  return std::complex<_Tp>(copysign(__z.real(), __x.real()),
+                           copysign(__z.imag(), __x.imag()));
+}
+
+// sinh
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> sinh(const std::complex<_Tp> &__x) {
+  if (std::isinf(__x.real()) && !std::isfinite(__x.imag()))
+    return std::complex<_Tp>(__x.real(), _Tp(NAN));
+  if (__x.real() == 0 && !std::isfinite(__x.imag()))
+    return std::complex<_Tp>(__x.real(), _Tp(NAN));
+  if (__x.imag() == 0 && !std::isfinite(__x.real()))
+    return __x;
+  return std::complex<_Tp>(sinh(__x.real()) * cos(__x.imag()),
+                           cosh(__x.real()) * sin(__x.imag()));
+}
+
+// cosh
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> cosh(const std::complex<_Tp> &__x) {
+  if (std::isinf(__x.real()) && !std::isfinite(__x.imag()))
+    return std::complex<_Tp>(abs(__x.real()), _Tp(NAN));
+  if (__x.real() == 0 && !std::isfinite(__x.imag()))
+    return std::complex<_Tp>(_Tp(NAN), __x.real());
+  if (__x.real() == 0 && __x.imag() == 0)
+    return std::complex<_Tp>(_Tp(1), __x.imag());
+  if (__x.imag() == 0 && !std::isfinite(__x.real()))
+    return std::complex<_Tp>(abs(__x.real()), __x.imag());
+  return std::complex<_Tp>(cosh(__x.real()) * cos(__x.imag()),
+                           sinh(__x.real()) * sin(__x.imag()));
+}
+
+// tanh
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> tanh(const std::complex<_Tp> &__x) {
+  if (std::isinf(__x.real())) {
+    if (!std::isfinite(__x.imag()))
+      return std::complex<_Tp>(_Tp(1), _Tp(0));
+    return std::complex<_Tp>(_Tp(1),
+                             copysign(_Tp(0), sin(_Tp(2) * __x.imag())));
+  }
+  if (std::isnan(__x.real()) && __x.imag() == 0)
+    return __x;
+  _Tp __2r(_Tp(2) * __x.real());
+  _Tp __2i(_Tp(2) * __x.imag());
+  _Tp __d(cosh(__2r) + cos(__2i));
+  _Tp __2rsh(sinh(__2r));
+  if (std::isinf(__2rsh) && std::isinf(__d))
+    return std::complex<_Tp>(__2rsh > _Tp(0) ? _Tp(1) : _Tp(-1),
+                             __2i > _Tp(0) ? _Tp(0) : _Tp(-0.));
+  return std::complex<_Tp>(__2rsh / __d, sin(__2i) / __d);
+}
+
+// asin
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> asin(const std::complex<_Tp> &__x) {
+  std::complex<_Tp> __z = asinh(complex<_Tp>(-__x.imag(), __x.real()));
+  return std::complex<_Tp>(__z.imag(), -__z.real());
+}
+
+// acos
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> acos(const std::complex<_Tp> &__x) {
+  const _Tp __pi(atan2(+0., -0.));
+  if (std::isinf(__x.real())) {
+    if (std::isnan(__x.imag()))
+      return std::complex<_Tp>(__x.imag(), __x.real());
+    if (std::isinf(__x.imag())) {
+      if (__x.real() < _Tp(0))
+        return std::complex<_Tp>(_Tp(0.75) * __pi, -__x.imag());
+      return std::complex<_Tp>(_Tp(0.25) * __pi, -__x.imag());
+    }
+    if (__x.real() < _Tp(0))
+      return std::complex<_Tp>(__pi,
+                               signbit(__x.imag()) ? -__x.real() : __x.real());
+    return std::complex<_Tp>(_Tp(0),
+                             signbit(__x.imag()) ? __x.real() : -__x.real());
+  }
+  if (std::isnan(__x.real())) {
+    if (std::isinf(__x.imag()))
+      return std::complex<_Tp>(__x.real(), -__x.imag());
+    return std::complex<_Tp>(__x.real(), __x.real());
+  }
+  if (std::isinf(__x.imag()))
+    return std::complex<_Tp>(__pi / _Tp(2), -__x.imag());
+  if (__x.real() == 0 && (__x.imag() == 0 || isnan(__x.imag())))
+    return std::complex<_Tp>(__pi / _Tp(2), -__x.imag());
+  std::complex<_Tp> __z = log(__x + sqrt(__sqr(__x) - _Tp(1)));
+  if (signbit(__x.imag()))
+    return std::complex<_Tp>(abs(__z.imag()), abs(__z.real()));
+  return std::complex<_Tp>(abs(__z.imag()), -abs(__z.real()));
+}
+
+// atan
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> atan(const std::complex<_Tp> &__x) {
+  std::complex<_Tp> __z = atanh(complex<_Tp>(-__x.imag(), __x.real()));
+  return std::complex<_Tp>(__z.imag(), -__z.real());
+}
+
+// sin
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> sin(const std::complex<_Tp> &__x) {
+  std::complex<_Tp> __z = sinh(complex<_Tp>(-__x.imag(), __x.real()));
+  return std::complex<_Tp>(__z.imag(), -__z.real());
+}
+
+// cos
+
+template <class _Tp> std::complex<_Tp> cos(const std::complex<_Tp> &__x) {
+  return cosh(complex<_Tp>(-__x.imag(), __x.real()));
+}
+
+// tan
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> tan(const std::complex<_Tp> &__x) {
+  std::complex<_Tp> __z = tanh(complex<_Tp>(-__x.imag(), __x.real()));
+  return std::complex<_Tp>(__z.imag(), -__z.real());
+}
+
+} // namespace std
+
+#endif
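
A quick host-side sanity check of the `__sqr` factorization above, which
rewrites a*a - b*b as (a - b) * (a + b) (a sketch, not part of the patch):

    #include <cassert>
    #include <complex>

    int main() {
      std::complex<double> x(3.0, 4.0);
      // (a - b)(a + b) + (2ab)i == (a + bi)^2
      std::complex<double> sq((x.real() - x.imag()) * (x.real() + x.imag()),
                              2.0 * x.real() * x.imag());
      assert(sq == x * x); // both evaluate to (-7, 24)
      return 0;
    }
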
diff --git a/clang/lib/Lex/Pragma.cpp b/clang/lib/Lex/Pragma.cpp
index b512a547de7df..a05df060813e7 100644
--- a/clang/lib/Lex/Pragma.cpp
+++ b/clang/lib/Lex/Pragma.cpp
@@ -1356,7 +1356,7 @@ struct PragmaWarningHandler : public PragmaHandler {
         while (Tok.is(tok::numeric_constant)) {
           uint64_t Value;
           if (!PP.parseSimpleIntegerLiteral(Tok, Value) || Value == 0 ||
-              Value > std::numeric_limits<int>::max()) {
+              Value > INT_MAX) {
             PP.Diag(Tok, diag::warn_pragma_warning_expected_number);
             return;
           }
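
The bound itself is unchanged (INT_MAX equals std::numeric_limits<int>::max());
for reference, under MS-compatible pragma handling the check rejects zero and
out-of-int-range warning numbers:

    #pragma warning(disable : 4996)       // accepted: nonzero, fits in an int
    #pragma warning(disable : 0)          // rejected: Value == 0
    #pragma warning(disable : 4294967296) // rejected: Value > INT_MAX
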
diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
index 75bb78152e57b..290b3c5df9592 100644
--- a/clang/lib/Parse/ParseDeclCXX.cpp
+++ b/clang/lib/Parse/ParseDeclCXX.cpp
@@ -4018,6 +4018,8 @@ static bool IsBuiltInOrStandardCXX11Attribute(IdentifierInfo *AttrName,
   case ParsedAttr::AT_FallThrough:
   case ParsedAttr::AT_CXX11NoReturn:
   case ParsedAttr::AT_NoUniqueAddress:
+  case ParsedAttr::AT_Likely:
+  case ParsedAttr::AT_Unlikely:
     return true;
   case ParsedAttr::AT_WarnUnusedResult:
     return !ScopeName && AttrName->getName().equals("nodiscard");
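
With AT_Likely and AT_Unlikely recognized here, the standard C++20 spellings
are treated like the other built-in attributes; a typical use:

    int sign(int x) {
      if (x > 0) [[likely]]
        return 1;
      if (x < 0) [[unlikely]]
        return -1;
      return 0;
    }
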
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index ceb91dce186c7..34bddd2e10d76 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -935,6 +935,14 @@ static bool checkExtensionProperty(Parser &P, SourceLocation Loc,
   if (TIProperty.Kind == TraitProperty::invalid)
     return false;
 
+  if (TIProperty.Kind ==
+      TraitProperty::implementation_extension_disable_implicit_base)
+    return true;
+
+  if (TIProperty.Kind ==
+      TraitProperty::implementation_extension_allow_templates)
+    return true;
+
   auto IsMatchExtension = [](OMPTraitProperty &TP) {
     return (TP.Kind ==
                 llvm::omp::TraitProperty::implementation_extension_match_all ||
@@ -1385,8 +1393,10 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr,
     return;
   }
 
-  OMPTraitInfo &TI = Actions.getASTContext().getNewOMPTraitInfo();
-  if (parseOMPDeclareVariantMatchClause(Loc, TI))
+  OMPTraitInfo *ParentTI = Actions.getOMPTraitInfoForSurroundingScope();
+  ASTContext &ASTCtx = Actions.getASTContext();
+  OMPTraitInfo &TI = ASTCtx.getNewOMPTraitInfo();
+  if (parseOMPDeclareVariantMatchClause(Loc, TI, ParentTI))
     return;
 
   Optional<std::pair<FunctionDecl *, Expr *>> DeclVarData =
@@ -1407,7 +1417,8 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr,
 }
 
 bool Parser::parseOMPDeclareVariantMatchClause(SourceLocation Loc,
-                                               OMPTraitInfo &TI) {
+                                               OMPTraitInfo &TI,
+                                               OMPTraitInfo *ParentTI) {
   // Parse 'match'.
   OpenMPClauseKind CKind = Tok.isAnnotation()
                                ? OMPC_unknown
@@ -1438,6 +1449,66 @@ bool Parser::parseOMPDeclareVariantMatchClause(SourceLocation Loc,
 
   // Parse ')'
   (void)T.consumeClose();
+
+  if (!ParentTI)
+    return false;
+
+  // Merge the parent/outer trait info into the one we just parsed and diagnose
+  // problems.
+  // TODO: Keep some source location in the TI to provide better diagnostics.
+  // TODO: Perform some kind of equivalence check on the condition and score
+  //       expressions.
+  for (const OMPTraitSet &ParentSet : ParentTI->Sets) {
+    bool MergedSet = false;
+    for (OMPTraitSet &Set : TI.Sets) {
+      if (Set.Kind != ParentSet.Kind)
+        continue;
+      MergedSet = true;
+      for (const OMPTraitSelector &ParentSelector : ParentSet.Selectors) {
+        bool MergedSelector = false;
+        for (OMPTraitSelector &Selector : Set.Selectors) {
+          if (Selector.Kind != ParentSelector.Kind)
+            continue;
+          MergedSelector = true;
+          for (const OMPTraitProperty &ParentProperty :
+               ParentSelector.Properties) {
+            bool MergedProperty = false;
+            for (OMPTraitProperty &Property : Selector.Properties) {
+              // Ignore "equivalent" properties.
+              if (Property.Kind != ParentProperty.Kind)
+                continue;
+
+              // If the kind is the same but the raw string differs, we don't
+              // want to skip the property.
+              MergedProperty |= Property.RawString == ParentProperty.RawString;
+
+              if (Property.RawString == ParentProperty.RawString &&
+                  Selector.ScoreOrCondition == ParentSelector.ScoreOrCondition)
+                continue;
+
+              if (Selector.Kind == llvm::omp::TraitSelector::user_condition) {
+                Diag(Loc, diag::err_omp_declare_variant_nested_user_condition);
+              } else if (Selector.ScoreOrCondition !=
+                         ParentSelector.ScoreOrCondition) {
+                Diag(Loc, diag::err_omp_declare_variant_duplicate_nested_trait)
+                    << getOpenMPContextTraitPropertyName(
+                           ParentProperty.Kind, ParentProperty.RawString)
+                    << getOpenMPContextTraitSelectorName(ParentSelector.Kind)
+                    << getOpenMPContextTraitSetName(ParentSet.Kind);
+              }
+            }
+            if (!MergedProperty)
+              Selector.Properties.push_back(ParentProperty);
+          }
+        }
+        if (!MergedSelector)
+          Set.Selectors.push_back(ParentSelector);
+      }
+    }
+    if (!MergedSet)
+      TI.Sets.push_back(ParentSet);
+  }
+
   return false;
 }
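
A sketch of the nesting this merge supports: the traits of an enclosing region
are folded into the nested one, so the inner region below effectively matches
the union of both selectors.

    #pragma omp begin declare variant match(device = {kind(cpu)})
    #pragma omp begin declare variant match(implementation = {vendor(llvm)})
    // Effective context: device = {kind(cpu)}, implementation = {vendor(llvm)}
    int foo(void) { return 1; }
    #pragma omp end declare variant
    #pragma omp end declare variant
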
 
@@ -1811,8 +1882,10 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl(
     // { #pragma omp end declare variant }
     //
     ConsumeToken();
-    OMPTraitInfo &TI = Actions.getASTContext().getNewOMPTraitInfo();
-    if (parseOMPDeclareVariantMatchClause(Loc, TI))
+    OMPTraitInfo *ParentTI = Actions.getOMPTraitInfoForSurroundingScope();
+    ASTContext &ASTCtx = Actions.getASTContext();
+    OMPTraitInfo &TI = ASTCtx.getNewOMPTraitInfo();
+    if (parseOMPDeclareVariantMatchClause(Loc, TI, ParentTI))
       break;
 
     // Skip last tokens.
@@ -1821,7 +1894,6 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl(
     ParsingOpenMPDirectiveRAII NormalScope(*this, /*Value=*/false);
 
     VariantMatchInfo VMI;
-    ASTContext &ASTCtx = Actions.getASTContext();
     TI.getAsVariantMatchInfo(ASTCtx, VMI);
 
     std::function<void(StringRef)> DiagUnknownTrait = [this, Loc](
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index f0e0bf492a552..1e90c0103c7d6 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -614,7 +614,8 @@ ExprResult Sema::ImpCastExprToType(Expr *E, QualType Ty,
     }
   }
 
-  return ImplicitCastExpr::Create(Context, Ty, Kind, E, BasePath, VK);
+  return ImplicitCastExpr::Create(Context, Ty, Kind, E, BasePath, VK,
+                                  CurFPFeatureOverrides());
 }
 
 /// ScalarTypeToBooleanCastKind - Returns the cast kind corresponding
diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp
index 726900c59f20e..d59f1880a7fff 100644
--- a/clang/lib/Sema/SemaCast.cpp
+++ b/clang/lib/Sema/SemaCast.cpp
@@ -105,10 +105,10 @@ namespace {
       // If this is an unbridged cast, wrap the result in an implicit
       // cast that yields the unbridged-cast placeholder type.
       if (IsARCUnbridgedCast) {
-        castExpr = ImplicitCastExpr::Create(Self.Context,
-                                            Self.Context.ARCUnbridgedCastTy,
-                                            CK_Dependent, castExpr, nullptr,
-                                            castExpr->getValueKind());
+        castExpr = ImplicitCastExpr::Create(
+            Self.Context, Self.Context.ARCUnbridgedCastTy, CK_Dependent,
+            castExpr, nullptr, castExpr->getValueKind(),
+            Self.CurFPFeatureOverrides());
       }
       updatePartOfExplicitCastFlags(castExpr);
       return castExpr;
@@ -361,11 +361,10 @@ Sema::BuildCXXNamedCast(SourceLocation OpLoc, tok::TokenKind Kind,
       DiscardMisalignedMemberAddress(DestType.getTypePtr(), E);
     }
 
-    return Op.complete(CXXStaticCastExpr::Create(Context, Op.ResultType,
-                                   Op.ValueKind, Op.Kind, Op.SrcExpr.get(),
-                                                 &Op.BasePath, DestTInfo,
-                                                 OpLoc, Parens.getEnd(),
-                                                 AngleBrackets));
+    return Op.complete(CXXStaticCastExpr::Create(
+        Context, Op.ResultType, Op.ValueKind, Op.Kind, Op.SrcExpr.get(),
+        &Op.BasePath, DestTInfo, CurFPFeatureOverrides(), OpLoc,
+        Parens.getEnd(), AngleBrackets));
   }
   }
 }
@@ -890,6 +889,18 @@ void CastOperation::CheckDynamicCast() {
     return;
   }
 
+  // Warn when dynamic_cast is used with RTTI data disabled.
+  if (!Self.getLangOpts().RTTIData) {
+    bool MicrosoftABI =
+        Self.getASTContext().getTargetInfo().getCXXABI().isMicrosoft();
+    bool isClangCL = Self.getDiagnostics().getDiagnosticOptions().getFormat() ==
+                     DiagnosticOptions::MSVC;
+    if (MicrosoftABI || !DestPointee->isVoidType())
+      Self.Diag(OpRange.getBegin(),
+                diag::warn_no_dynamic_cast_with_rtti_disabled)
+          << isClangCL;
+  }
+
   // Done. Everything else is run-time checks.
   Kind = CK_Dynamic;
 }
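
A sketch of code the new diagnostic targets when RTTI data is suppressed
(clang-cl /GR- or -fno-rtti-data):

    struct Base { virtual ~Base(); };
    struct Derived : Base {};

    Derived *asDerived(Base *b) {
      return dynamic_cast<Derived *>(b); // warns: needs RTTI data at run time
    }
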
@@ -3033,9 +3044,9 @@ ExprResult Sema::BuildCStyleCastExpr(SourceLocation LPLoc,
   // -Wcast-qual
   DiagnoseCastQual(Op.Self, Op.SrcExpr, Op.DestType);
 
-  return Op.complete(CStyleCastExpr::Create(Context, Op.ResultType,
-                              Op.ValueKind, Op.Kind, Op.SrcExpr.get(),
-                              &Op.BasePath, CastTypeInfo, LPLoc, RPLoc));
+  return Op.complete(CStyleCastExpr::Create(
+      Context, Op.ResultType, Op.ValueKind, Op.Kind, Op.SrcExpr.get(),
+      &Op.BasePath, CurFPFeatureOverrides(), CastTypeInfo, LPLoc, RPLoc));
 }
 
 ExprResult Sema::BuildCXXFunctionalCastExpr(TypeSourceInfo *CastTypeInfo,
@@ -3058,7 +3069,7 @@ ExprResult Sema::BuildCXXFunctionalCastExpr(TypeSourceInfo *CastTypeInfo,
   if (auto *ConstructExpr = dyn_cast<CXXConstructExpr>(SubExpr))
     ConstructExpr->setParenOrBraceRange(SourceRange(LPLoc, RPLoc));
 
-  return Op.complete(CXXFunctionalCastExpr::Create(Context, Op.ResultType,
-                         Op.ValueKind, CastTypeInfo, Op.Kind,
-                         Op.SrcExpr.get(), &Op.BasePath, LPLoc, RPLoc));
+  return Op.complete(CXXFunctionalCastExpr::Create(
+      Context, Op.ResultType, Op.ValueKind, CastTypeInfo, Op.Kind,
+      Op.SrcExpr.get(), &Op.BasePath, CurFPFeatureOverrides(), LPLoc, RPLoc));
 }
diff --git a/clang/lib/Sema/SemaCoroutine.cpp b/clang/lib/Sema/SemaCoroutine.cpp
index 990ab26335209..565f907e05b28 100644
--- a/clang/lib/Sema/SemaCoroutine.cpp
+++ b/clang/lib/Sema/SemaCoroutine.cpp
@@ -398,6 +398,10 @@ static Expr *maybeTailCall(Sema &S, QualType RetType, Expr *E,
            diag::warn_coroutine_handle_address_invalid_return_type)
         << JustAddress->getType();
 
+  // The coroutine handle used to obtain the address is no longer needed
+  // at this point; clean it up to avoid an unnecessarily long lifetime,
+  // which could lead to unnecessary spilling.
+  JustAddress = S.MaybeCreateExprWithCleanups(JustAddress);
   return buildBuiltinCall(S, Loc, Builtin::BI__builtin_coro_resume,
                           JustAddress);
 }
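
For context, a minimal awaiter of the symmetric-transfer shape maybeTailCall
handles (a C++20 sketch): the coroutine_handle temporary returned by
await_suspend is exactly what the added ExprWithCleanups now destroys right
after .address() is taken.

    #include <coroutine>

    struct FinalAwaiter {
      std::coroutine_handle<> next;
      bool await_ready() noexcept { return false; }
      // Resumption of the returned handle is lowered through
      // __builtin_coro_resume(handle.address()).
      std::coroutine_handle<> await_suspend(std::coroutine_handle<>) noexcept {
        return next;
      }
      void await_resume() noexcept {}
    };
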
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index a036c520211a3..1e0356ec6e06a 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -2070,6 +2070,42 @@ static StringRef getHeaderName(Builtin::Context &BuiltinInfo, unsigned ID,
   llvm_unreachable("unhandled error kind");
 }
 
+FunctionDecl *Sema::CreateBuiltin(IdentifierInfo *II, QualType Type,
+                                  unsigned ID, SourceLocation Loc) {
+  DeclContext *Parent = Context.getTranslationUnitDecl();
+
+  if (getLangOpts().CPlusPlus) {
+    LinkageSpecDecl *CLinkageDecl = LinkageSpecDecl::Create(
+        Context, Parent, Loc, Loc, LinkageSpecDecl::lang_c, false);
+    CLinkageDecl->setImplicit();
+    Parent->addDecl(CLinkageDecl);
+    Parent = CLinkageDecl;
+  }
+
+  FunctionDecl *New = FunctionDecl::Create(Context, Parent, Loc, Loc, II, Type,
+                                           /*TInfo=*/nullptr, SC_Extern, false,
+                                           Type->isFunctionProtoType());
+  New->setImplicit();
+  New->addAttr(BuiltinAttr::CreateImplicit(Context, ID));
+
+  // Create Decl objects for each parameter, adding them to the
+  // FunctionDecl.
+  if (const FunctionProtoType *FT = dyn_cast<FunctionProtoType>(Type)) {
+    SmallVector<ParmVarDecl *, 16> Params;
+    for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) {
+      ParmVarDecl *parm = ParmVarDecl::Create(
+          Context, New, SourceLocation(), SourceLocation(), nullptr,
+          FT->getParamType(i), /*TInfo=*/nullptr, SC_None, nullptr);
+      parm->setScopeInfo(0, i);
+      Params.push_back(parm);
+    }
+    New->setParams(Params);
+  }
+
+  AddKnownFunctionAttributes(New);
+  return New;
+}
+
 /// LazilyCreateBuiltin - The specified Builtin-ID was first used at
 /// file scope.  Lazily create a decl for it. ForRedeclaration is true
 /// if we're creating this built-in in anticipation of redeclaring the
@@ -2121,40 +2157,7 @@ NamedDecl *Sema::LazilyCreateBuiltin(IdentifierInfo *II, unsigned ID,
   if (R.isNull())
     return nullptr;
 
-  DeclContext *Parent = Context.getTranslationUnitDecl();
-  if (getLangOpts().CPlusPlus) {
-    LinkageSpecDecl *CLinkageDecl =
-        LinkageSpecDecl::Create(Context, Parent, Loc, Loc,
-                                LinkageSpecDecl::lang_c, false);
-    CLinkageDecl->setImplicit();
-    Parent->addDecl(CLinkageDecl);
-    Parent = CLinkageDecl;
-  }
-
-  FunctionDecl *New = FunctionDecl::Create(Context,
-                                           Parent,
-                                           Loc, Loc, II, R, /*TInfo=*/nullptr,
-                                           SC_Extern,
-                                           false,
-                                           R->isFunctionProtoType());
-  New->setImplicit();
-
-  // Create Decl objects for each parameter, adding them to the
-  // FunctionDecl.
-  if (const FunctionProtoType *FT = dyn_cast<FunctionProtoType>(R)) {
-    SmallVector<ParmVarDecl*, 16> Params;
-    for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) {
-      ParmVarDecl *parm =
-          ParmVarDecl::Create(Context, New, SourceLocation(), SourceLocation(),
-                              nullptr, FT->getParamType(i), /*TInfo=*/nullptr,
-                              SC_None, nullptr);
-      parm->setScopeInfo(0, i);
-      Params.push_back(parm);
-    }
-    New->setParams(Params);
-  }
-
-  AddKnownFunctionAttributes(New);
+  FunctionDecl *New = CreateBuiltin(II, R, ID, Loc);
   RegisterLocallyScopedExternCDecl(New, S);
 
   // TUScope is the translation-unit scope to insert this function into.
@@ -2162,7 +2165,7 @@ NamedDecl *Sema::LazilyCreateBuiltin(IdentifierInfo *II, unsigned ID,
   // relate Scopes to DeclContexts, and probably eliminate CurContext
   // entirely, but we're not there yet.
   DeclContext *SavedContext = CurContext;
-  CurContext = Parent;
+  CurContext = New->getDeclContext();
   PushOnScopeChains(New, TUScope);
   CurContext = SavedContext;
   return New;
@@ -3389,7 +3392,10 @@ bool Sema::MergeFunctionDecl(FunctionDecl *New, NamedDecl *&OldD,
       // there but not here.
       NewTypeInfo = NewTypeInfo.withCallingConv(OldTypeInfo.getCC());
       RequiresAdjustment = true;
-    } else if (New->getBuiltinID()) {
+    } else if (Old->getBuiltinID()) {
+      // Builtin attribute isn't propagated to the new one yet at this point,
+      // so we check if the old one is a builtin.
+
       // Calling Conventions on a Builtin aren't really useful and setting a
       // default calling convention and cdecl'ing some builtin redeclarations is
       // common, so warn and ignore the calling convention on the redeclaration.
@@ -3822,18 +3828,6 @@ bool Sema::MergeFunctionDecl(FunctionDecl *New, NamedDecl *&OldD,
       Diag(New->getLocation(), diag::warn_redecl_library_builtin) << New;
       Diag(OldLocation, diag::note_previous_builtin_declaration)
         << Old << Old->getType();
-
-      // If this is a global redeclaration, just forget hereafter
-      // about the "builtin-ness" of the function.
-      //
-      // Doing this for local extern declarations is problematic.  If
-      // the builtin declaration remains visible, a second invalid
-      // local declaration will produce a hard error; if it doesn't
-      // remain visible, a single bogus local redeclaration (which is
-      // actually only a warning) could break all the downstream code.
-      if (!New->getLexicalDeclContext()->isFunctionOrMethod())
-        New->getIdentifier()->revertBuiltin();
-
       return false;
     }
 
@@ -9695,6 +9689,35 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
     }
   }
 
+  // In C, builtins get merged with implicitly lazily created declarations.
+  // In C++ we need to check if it's a builtin and add the BuiltinAttr here.
+  if (getLangOpts().CPlusPlus) {
+    if (IdentifierInfo *II = Previous.getLookupName().getAsIdentifierInfo()) {
+      if (unsigned BuiltinID = II->getBuiltinID()) {
+        if (NewFD->getLanguageLinkage() == CLanguageLinkage) {
+          // Declarations for builtins with custom typechecking by definition
+          // don't make sense. Don't attempt typechecking and simply add the
+          // attribute.
+          if (Context.BuiltinInfo.hasCustomTypechecking(BuiltinID)) {
+            NewFD->addAttr(BuiltinAttr::CreateImplicit(Context, BuiltinID));
+          } else {
+            ASTContext::GetBuiltinTypeError Error;
+            QualType BuiltinType = Context.GetBuiltinType(BuiltinID, Error);
+
+            if (!Error && !BuiltinType.isNull() &&
+                Context.hasSameFunctionTypeIgnoringExceptionSpec(
+                    NewFD->getType(), BuiltinType))
+              NewFD->addAttr(BuiltinAttr::CreateImplicit(Context, BuiltinID));
+          }
+        } else if (BuiltinID == Builtin::BI__GetExceptionInfo &&
+                   Context.getTargetInfo().getCXXABI().isMicrosoft()) {
+          // FIXME: We should consider this a builtin only in the std namespace.
+          NewFD->addAttr(BuiltinAttr::CreateImplicit(Context, BuiltinID));
+        }
+      }
+    }
+  }
+
   ProcessPragmaWeak(S, NewFD);
   checkAttributesAfterMerging(*this, *NewFD);
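
Illustrative of the new C++ path (the declaration is a sketch): an extern "C"
function whose type matches the builtin's signature is now tagged with
BuiltinAttr, rather than relying on the removed revertBuiltin() bookkeeping.

    extern "C" void *memcpy(void *dst, const void *src, decltype(sizeof(0)) n);
    // getBuiltinID() on this declaration now reports Builtin::BImemcpy, so
    // redeclarations and call checking continue to see it as a builtin.
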
 
@@ -13794,19 +13817,17 @@ Sema::ActOnStartOfFunctionDef(Scope *FnBodyScope, Declarator &D,
   // variant` annotation which specifies the mangled definition as a
   // specialization function under the OpenMP context defined as part of the
   // `omp begin declare variant`.
-  FunctionDecl *BaseFD = nullptr;
-  if (LangOpts.OpenMP && isInOpenMPDeclareVariantScope() &&
-      TemplateParameterLists.empty())
-    BaseFD = ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope(
-        ParentScope, D);
+  SmallVector<FunctionDecl *, 4> Bases;
+  if (LangOpts.OpenMP && isInOpenMPDeclareVariantScope())
+    ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope(
+        ParentScope, D, TemplateParameterLists, Bases);
 
   D.setFunctionDefinitionKind(FDK_Definition);
   Decl *DP = HandleDeclarator(ParentScope, D, TemplateParameterLists);
   Decl *Dcl = ActOnStartOfFunctionDef(FnBodyScope, DP, SkipBody);
 
-  if (BaseFD)
-    ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(
-        cast<FunctionDecl>(Dcl), BaseFD);
+  if (!Bases.empty())
+    ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(Dcl, Bases);
 
   return Dcl;
 }
@@ -18214,11 +18235,9 @@ void Sema::ActOnEnumBody(SourceLocation EnumLoc, SourceRange BraceRange,
     // Adjust the Expr initializer and type.
     if (ECD->getInitExpr() &&
         !Context.hasSameType(NewTy, ECD->getInitExpr()->getType()))
-      ECD->setInitExpr(ImplicitCastExpr::Create(Context, NewTy,
-                                                CK_IntegralCast,
-                                                ECD->getInitExpr(),
-                                                /*base paths*/ nullptr,
-                                                VK_RValue));
+      ECD->setInitExpr(ImplicitCastExpr::Create(
+          Context, NewTy, CK_IntegralCast, ECD->getInitExpr(),
+          /*base paths*/ nullptr, VK_RValue, FPOptionsOverride()));
     if (getLangOpts().CPlusPlus)
       // C++ [dcl.enum]p4: Following the closing brace of an
       // enum-specifier, each enumerator has the type of its
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 9aabf4aae0b4f..63eed048d2daf 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -6129,6 +6129,118 @@ static void handleObjCPreciseLifetimeAttr(Sema &S, Decl *D,
   D->addAttr(::new (S.Context) ObjCPreciseLifetimeAttr(S.Context, AL));
 }
 
+static void handleSwiftBridge(Sema &S, Decl *D, const ParsedAttr &AL) {
+  // Make sure that there is a string literal as the annotation's single
+  // argument.
+  StringRef BT;
+  if (!S.checkStringLiteralArgumentAttr(AL, 0, BT))
+    return;
+
+  // Don't duplicate annotations that are already set.
+  if (D->hasAttr<SwiftBridgeAttr>()) {
+    S.Diag(AL.getLoc(), diag::warn_duplicate_attribute) << AL;
+    return;
+  }
+
+  D->addAttr(::new (S.Context) SwiftBridgeAttr(S.Context, AL, BT));
+}
+
+static bool isErrorParameter(Sema &S, QualType QT) {
+  const auto *PT = QT->getAs<PointerType>();
+  if (!PT)
+    return false;
+
+  QualType Pointee = PT->getPointeeType();
+
+  // Check for NSError**.
+  if (const auto *OPT = Pointee->getAs<ObjCObjectPointerType>())
+    if (const auto *ID = OPT->getInterfaceDecl())
+      if (ID->getIdentifier() == S.getNSErrorIdent())
+        return true;
+
+  // Check for CFError**.
+  if (const auto *PT = Pointee->getAs<PointerType>())
+    if (const auto *RT = PT->getPointeeType()->getAs<RecordType>())
+      if (S.isCFError(RT->getDecl()))
+        return true;
+
+  return false;
+}
+
+static void handleSwiftError(Sema &S, Decl *D, const ParsedAttr &AL) {
+  auto hasErrorParameter = [](Sema &S, Decl *D, const ParsedAttr &AL) -> bool {
+    for (unsigned I = 0, E = getFunctionOrMethodNumParams(D); I != E; ++I) {
+      if (isErrorParameter(S, getFunctionOrMethodParamType(D, I)))
+        return true;
+    }
+
+    S.Diag(AL.getLoc(), diag::err_attr_swift_error_no_error_parameter)
+        << AL << isa<ObjCMethodDecl>(D);
+    return false;
+  };
+
+  auto hasPointerResult = [](Sema &S, Decl *D, const ParsedAttr &AL) -> bool {
+    // - C, ObjC, and block pointers are definitely okay.
+    // - References are definitely not okay.
+    // - nullptr_t is weird, but acceptable.
+    QualType RT = getFunctionOrMethodResultType(D);
+    if (RT->hasPointerRepresentation() && !RT->isReferenceType())
+      return true;
+
+    S.Diag(AL.getLoc(), diag::err_attr_swift_error_return_type)
+        << AL << AL.getArgAsIdent(0)->Ident->getName() << isa<ObjCMethodDecl>(D)
+        << /*pointer*/ 1;
+    return false;
+  };
+
+  auto hasIntegerResult = [](Sema &S, Decl *D, const ParsedAttr &AL) -> bool {
+    QualType RT = getFunctionOrMethodResultType(D);
+    if (RT->isIntegralType(S.Context))
+      return true;
+
+    S.Diag(AL.getLoc(), diag::err_attr_swift_error_return_type)
+        << AL << AL.getArgAsIdent(0)->Ident->getName() << isa<ObjCMethodDecl>(D)
+        << /*integral*/ 0;
+    return false;
+  };
+
+  if (D->isInvalidDecl())
+    return;
+
+  IdentifierLoc *Loc = AL.getArgAsIdent(0);
+  SwiftErrorAttr::ConventionKind Convention;
+  if (!SwiftErrorAttr::ConvertStrToConventionKind(Loc->Ident->getName(),
+                                                  Convention)) {
+    S.Diag(AL.getLoc(), diag::warn_attribute_type_not_supported)
+        << AL << Loc->Ident;
+    return;
+  }
+
+  switch (Convention) {
+  case SwiftErrorAttr::None:
+    // No additional validation required.
+    break;
+
+  case SwiftErrorAttr::NonNullError:
+    if (!hasErrorParameter(S, D, AL))
+      return;
+    break;
+
+  case SwiftErrorAttr::NullResult:
+    if (!hasErrorParameter(S, D, AL) || !hasPointerResult(S, D, AL))
+      return;
+    break;
+
+  case SwiftErrorAttr::NonZeroResult:
+  case SwiftErrorAttr::ZeroResult:
+    if (!hasErrorParameter(S, D, AL) || !hasIntegerResult(S, D, AL))
+      return;
+    break;
+  }
+
+  D->addAttr(::new (S.Context) SwiftErrorAttr(S.Context, AL, Convention));
+}
+
 //===----------------------------------------------------------------------===//
 // Microsoft specific attribute handlers.
 //===----------------------------------------------------------------------===//
@@ -8120,6 +8232,20 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D,
     handleSYCLIntelPipeIOAttr(S, D, AL);
     break;
 
+  // Swift attributes.
+  case ParsedAttr::AT_SwiftBridge:
+    handleSwiftBridge(S, D, AL);
+    break;
+  case ParsedAttr::AT_SwiftBridgedTypedef:
+    handleSimpleAttribute<SwiftBridgedTypedefAttr>(S, D, AL);
+    break;
+  case ParsedAttr::AT_SwiftError:
+    handleSwiftError(S, D, AL);
+    break;
+  case ParsedAttr::AT_SwiftObjCMembers:
+    handleSimpleAttribute<SwiftObjCMembersAttr>(S, D, AL);
+    break;
+
   // XRay attributes.
   case ParsedAttr::AT_XRayLogArgs:
     handleXRayLogArgsAttr(S, D, AL);
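
Usage sketch for the newly wired Swift attributes (Objective-C; the interface
is hypothetical and the convention spellings are inferred from the enumerators
handled above):

    #import <Foundation/Foundation.h>

    @interface Store : NSObject
    // zero_result: integral result plus an NSError ** parameter required.
    - (BOOL)saveAndReturnError:(NSError **)error
        __attribute__((swift_error(zero_result)));
    // null_result: the result type must have a pointer representation.
    - (NSString *)itemNamed:(NSString *)name
                      error:(NSError **)error
        __attribute__((swift_error(null_result)));
    @end
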
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 0a4f75ad341b1..6558a4f6d8b20 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -1185,7 +1185,8 @@ static bool checkTupleLikeDecomposition(Sema &S,
     //   an xvalue otherwise
     if (!Src->getType()->isLValueReferenceType())
       E = ImplicitCastExpr::Create(S.Context, E.get()->getType(), CK_NoOp,
-                                   E.get(), nullptr, VK_XValue);
+                                   E.get(), nullptr, VK_XValue,
+                                   FPOptionsOverride());
 
     TemplateArgumentListInfo Args(Loc, Loc);
     Args.addArgument(
@@ -14869,9 +14870,9 @@ void Sema::DefineImplicitLambdaToBlockPointerConversion(
   // (since it's unusable otherwise); in the case where we inline the
   // block literal, it has block literal lifetime semantics.
   if (!BuildBlock.isInvalid() && !getLangOpts().ObjCAutoRefCount)
-    BuildBlock = ImplicitCastExpr::Create(Context, BuildBlock.get()->getType(),
-                                          CK_CopyAndAutoreleaseBlockObject,
-                                          BuildBlock.get(), nullptr, VK_RValue);
+    BuildBlock = ImplicitCastExpr::Create(
+        Context, BuildBlock.get()->getType(), CK_CopyAndAutoreleaseBlockObject,
+        BuildBlock.get(), nullptr, VK_RValue, FPOptionsOverride());
 
   if (BuildBlock.isInvalid()) {
     Diag(CurrentLocation, diag::note_lambda_to_block_conv);
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 75200cdfd64ef..45dfd0fdd93a3 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -715,7 +715,8 @@ ExprResult Sema::DefaultLvalueConversion(Expr *E) {
   // C++ [conv.lval]p3:
   //   If T is cv std::nullptr_t, the result is a null pointer constant.
   CastKind CK = T->isNullPtrType() ? CK_NullToPointer : CK_LValueToRValue;
-  Res = ImplicitCastExpr::Create(Context, T, CK, E, nullptr, VK_RValue);
+  Res = ImplicitCastExpr::Create(Context, T, CK, E, nullptr, VK_RValue,
+                                 FPOptionsOverride());
 
   // C11 6.3.2.1p2:
   //   ... if the lvalue has atomic type, the value has the non-atomic version
@@ -723,7 +724,7 @@ ExprResult Sema::DefaultLvalueConversion(Expr *E) {
   if (const AtomicType *Atomic = T->getAs<AtomicType>()) {
     T = Atomic->getValueType().getUnqualifiedType();
     Res = ImplicitCastExpr::Create(Context, T, CK_AtomicToNonAtomic, Res.get(),
-                                   nullptr, VK_RValue);
+                                   nullptr, VK_RValue, FPOptionsOverride());
   }
 
   return Res;
@@ -6181,6 +6182,7 @@ static FunctionDecl *rewriteBuiltinFunctionDecl(Sema *Sema, ASTContext &Context,
     Params.push_back(Parm);
   }
   OverloadDecl->setParams(Params);
+  Sema->mergeDeclAttributes(OverloadDecl, FDecl);
   return OverloadDecl;
 }
 
@@ -6985,9 +6987,9 @@ void Sema::maybeExtendBlockObject(ExprResult &E) {
   // Only do this in an r-value context.
   if (!getLangOpts().ObjCAutoRefCount) return;
 
-  E = ImplicitCastExpr::Create(Context, E.get()->getType(),
-                               CK_ARCExtendBlockObject, E.get(),
-                               /*base path*/ nullptr, VK_RValue);
+  E = ImplicitCastExpr::Create(
+      Context, E.get()->getType(), CK_ARCExtendBlockObject, E.get(),
+      /*base path*/ nullptr, VK_RValue, FPOptionsOverride());
   Cleanup.setExprNeedsCleanups(true);
 }
 
@@ -16606,8 +16608,13 @@ static OdrUseContext isOdrUseContext(Sema &SemaRef) {
 }
 
 static bool isImplicitlyDefinableConstexprFunction(FunctionDecl *Func) {
-  return Func->isConstexpr() &&
-         (Func->isImplicitlyInstantiable() || !Func->isUserProvided());
+  if (!Func->isConstexpr())
+    return false;
+
+  if (Func->isImplicitlyInstantiable() || !Func->isUserProvided())
+    return true;
+  auto *CCD = dyn_cast<CXXConstructorDecl>(Func);
+  return CCD && CCD->getInheritedConstructor();
 }
 
 /// Mark a function referenced, and check whether it is odr-used
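
The new case covers inherited constructors: they are not "user provided" on
the derived class, yet may still need an implicit constexpr definition.
A minimal example:

    struct Base {
      constexpr Base(int) {}
    };
    struct Derived : Base {
      using Base::Base; // inherited constexpr constructor
    };
    constexpr Derived d(42); // forces the inherited constructor's definition
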
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 4dc6fd8f72042..2f971319b82d3 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -663,7 +663,16 @@ Sema::ActOnCXXTypeid(SourceLocation OpLoc, SourceLocation LParenLoc,
   }
 
   // The operand is an expression.
-  return BuildCXXTypeId(TypeInfoType, OpLoc, (Expr*)TyOrExpr, RParenLoc);
+  ExprResult Result =
+      BuildCXXTypeId(TypeInfoType, OpLoc, (Expr *)TyOrExpr, RParenLoc);
+
+  if (!getLangOpts().RTTIData && !Result.isInvalid())
+    if (auto *CTE = dyn_cast<CXXTypeidExpr>(Result.get()))
+      if (CTE->isPotentiallyEvaluated() && !CTE->isMostDerived(Context))
+        Diag(OpLoc, diag::warn_no_typeid_with_rtti_disabled)
+            << (getDiagnostics().getDiagnosticOptions().getFormat() ==
+                DiagnosticOptions::MSVC);
+  return Result;
 }
 
 /// Grabs __declspec(uuid()) off a type, or returns 0 if we cannot resolve to
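
And the typeid counterpart (a sketch, compiled with -fno-rtti-data): a
potentially evaluated typeid of a polymorphic glvalue reads RTTI from the
vtable at run time, which is exactly what is missing.

    #include <typeinfo>

    struct Shape { virtual ~Shape(); };

    const std::type_info &dynamicType(Shape &s) {
      return typeid(s); // warns: result unreliable without RTTI data
    }
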
@@ -1508,7 +1517,8 @@ Sema::BuildCXXTypeConstructExpr(TypeSourceInfo *TInfo,
                            : SourceRange(LParenOrBraceLoc, RParenOrBraceLoc);
     Result = CXXFunctionalCastExpr::Create(
         Context, ResultType, Expr::getValueKindForType(Ty), TInfo, CK_NoOp,
-        Result.get(), /*Path=*/nullptr, Locs.getBegin(), Locs.getEnd());
+        Result.get(), /*Path=*/nullptr, CurFPFeatureOverrides(),
+        Locs.getBegin(), Locs.getEnd());
   }
 
   return Result;
@@ -1838,12 +1848,13 @@ void Sema::diagnoseUnavailableAlignedAllocation(const FunctionDecl &FD,
     const llvm::Triple &T = getASTContext().getTargetInfo().getTriple();
     StringRef OSName = AvailabilityAttr::getPlatformNameSourceSpelling(
         getASTContext().getTargetInfo().getPlatformName());
+    VersionTuple OSVersion = alignedAllocMinVersion(T.getOS());
 
     OverloadedOperatorKind Kind = FD.getDeclName().getCXXOverloadedOperator();
     bool IsDelete = Kind == OO_Delete || Kind == OO_Array_Delete;
     Diag(Loc, diag::err_aligned_allocation_unavailable)
         << IsDelete << FD.getType().getAsString() << OSName
-        << alignedAllocMinVersion(T.getOS()).getAsString();
+        << OSVersion.getAsString() << OSVersion.empty();
     Diag(Loc, diag::note_silence_aligned_allocation_unavailable);
   }
 }
@@ -2209,7 +2220,7 @@ Sema::BuildCXXNew(SourceRange Range, bool UseGlobal,
         SizeTy, SourceLocation());
     ImplicitCastExpr DesiredAlignment(ImplicitCastExpr::OnStack, AlignValT,
                                       CK_IntegralCast, &AlignmentLiteral,
-                                      VK_RValue);
+                                      VK_RValue, FPOptionsOverride());
 
     // Adjust placement args by prepending conjured size and alignment exprs.
     llvm::SmallVector<Expr *, 8> CallArgs;
@@ -3924,7 +3935,8 @@ static ExprResult BuildCXXCastArgument(Sema &S,
     // Record usage of conversion in an implicit cast.
     Result = ImplicitCastExpr::Create(S.Context, Result.get()->getType(),
                                       CK_UserDefinedConversion, Result.get(),
-                                      nullptr, Result.get()->getValueKind());
+                                      nullptr, Result.get()->getValueKind(),
+                                      S.CurFPFeatureOverrides());
 
     return S.MaybeBindToTemporary(Result.get());
   }
@@ -4105,7 +4117,8 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType,
     if (const AtomicType *FromAtomic = FromType->getAs<AtomicType>()) {
       FromType = FromAtomic->getValueType().getUnqualifiedType();
       From = ImplicitCastExpr::Create(Context, FromType, CK_AtomicToNonAtomic,
-                                      From, /*BasePath=*/nullptr, VK_RValue);
+                                      From, /*BasePath=*/nullptr, VK_RValue,
+                                      FPOptionsOverride());
     }
     break;
 
@@ -6849,7 +6862,7 @@ ExprResult Sema::MaybeBindToTemporary(Expr *E) {
     CastKind ck = (ReturnsRetained ? CK_ARCConsumeObject
                                    : CK_ARCReclaimReturnedObject);
     return ImplicitCastExpr::Create(Context, E->getType(), ck, E, nullptr,
-                                    VK_RValue);
+                                    VK_RValue, FPOptionsOverride());
   }
 
   if (E->getType().isDestructedType() == QualType::DK_nontrivial_c_struct)
diff --git a/clang/lib/Sema/SemaExprObjC.cpp b/clang/lib/Sema/SemaExprObjC.cpp
index 228a1ec3ba1f9..2c088c8b15a3f 100644
--- a/clang/lib/Sema/SemaExprObjC.cpp
+++ b/clang/lib/Sema/SemaExprObjC.cpp
@@ -4462,8 +4462,8 @@ Sema::CheckObjCConversion(SourceRange castRange, QualType castType,
   // If the result is +1, consume it here.
   case ACC_plusOne:
     castExpr = ImplicitCastExpr::Create(Context, castExpr->getType(),
-                                        CK_ARCConsumeObject, castExpr,
-                                        nullptr, VK_RValue);
+                                        CK_ARCConsumeObject, castExpr, nullptr,
+                                        VK_RValue, FPOptionsOverride());
     Cleanup.setExprNeedsCleanups(true);
     return ACR_okay;
   }
@@ -4689,9 +4689,9 @@ ExprResult Sema::BuildObjCBridgedCast(SourceLocation LParenLoc,
 
     case OBC_BridgeRetained:
       // Produce the object before casting it.
-      SubExpr = ImplicitCastExpr::Create(Context, FromType,
-                                         CK_ARCProduceObject,
-                                         SubExpr, nullptr, VK_RValue);
+      SubExpr = ImplicitCastExpr::Create(Context, FromType, CK_ARCProduceObject,
+                                         SubExpr, nullptr, VK_RValue,
+                                         FPOptionsOverride());
       break;
 
     case OBC_BridgeTransfer: {
@@ -4730,7 +4730,7 @@ ExprResult Sema::BuildObjCBridgedCast(SourceLocation LParenLoc,
   if (MustConsume) {
     Cleanup.setExprNeedsCleanups(true);
     Result = ImplicitCastExpr::Create(Context, T, CK_ARCConsumeObject, Result,
-                                      nullptr, VK_RValue);
+                                      nullptr, VK_RValue, FPOptionsOverride());
   }
 
   return Result;
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 19d43402fb301..ecb305260ab7b 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -2890,8 +2890,9 @@ InitListChecker::CheckDesignatedInitializer(const InitializedEntity &Entity,
         Expr *Init = new (Context) IntegerLiteral(
             Context, CodeUnit, PromotedCharTy, SubExpr->getExprLoc());
         if (CharTy != PromotedCharTy)
-          Init = ImplicitCastExpr::Create(Context, CharTy, CK_IntegralCast,
-                                          Init, nullptr, VK_RValue);
+          Init =
+              ImplicitCastExpr::Create(Context, CharTy, CK_IntegralCast, Init,
+                                       nullptr, VK_RValue, FPOptionsOverride());
         StructuredList->updateInit(Context, i, Init);
       }
     } else {
@@ -2912,8 +2913,9 @@ InitListChecker::CheckDesignatedInitializer(const InitializedEntity &Entity,
         Expr *Init = new (Context) IntegerLiteral(
             Context, CodeUnit, PromotedCharTy, SubExpr->getExprLoc());
         if (CharTy != PromotedCharTy)
-          Init = ImplicitCastExpr::Create(Context, CharTy, CK_IntegralCast,
-                                          Init, nullptr, VK_RValue);
+          Init =
+              ImplicitCastExpr::Create(Context, CharTy, CK_IntegralCast, Init,
+                                       nullptr, VK_RValue, FPOptionsOverride());
         StructuredList->updateInit(Context, i, Init);
       }
     }
@@ -8019,9 +8021,9 @@ ExprResult InitializationSequence::Perform(Sema &S,
               (Step->Kind == SK_CastDerivedToBaseXValue ?
                    VK_XValue :
                    VK_RValue);
-      CurInit =
-          ImplicitCastExpr::Create(S.Context, Step->Type, CK_DerivedToBase,
-                                   CurInit.get(), &BasePath, VK);
+      CurInit = ImplicitCastExpr::Create(S.Context, Step->Type,
+                                         CK_DerivedToBase, CurInit.get(),
+                                         &BasePath, VK, FPOptionsOverride());
       break;
     }
 
@@ -8150,9 +8152,9 @@ ExprResult InitializationSequence::Perform(Sema &S,
       if (CreatedObject && checkAbstractType(CurInit.get()->getType()))
         return ExprError();
 
-      CurInit = ImplicitCastExpr::Create(S.Context, CurInit.get()->getType(),
-                                         CastKind, CurInit.get(), nullptr,
-                                         CurInit.get()->getValueKind());
+      CurInit = ImplicitCastExpr::Create(
+          S.Context, CurInit.get()->getType(), CastKind, CurInit.get(), nullptr,
+          CurInit.get()->getValueKind(), S.CurFPFeatureOverrides());
 
       if (shouldBindAsTemporary(Entity))
         // The overall entity is temporary, so this expression should be
@@ -8493,9 +8495,9 @@ ExprResult InitializationSequence::Perform(Sema &S,
       break;
 
     case SK_ProduceObjCObject:
-      CurInit =
-          ImplicitCastExpr::Create(S.Context, Step->Type, CK_ARCProduceObject,
-                                   CurInit.get(), nullptr, VK_RValue);
+      CurInit = ImplicitCastExpr::Create(
+          S.Context, Step->Type, CK_ARCProduceObject, CurInit.get(), nullptr,
+          VK_RValue, FPOptionsOverride());
       break;
 
     case SK_StdInitializerList: {
@@ -8550,9 +8552,9 @@ ExprResult InitializationSequence::Perform(Sema &S,
           // Case 1b and 1c
           // No cast from integer to sampler is needed.
           if (!Var->hasGlobalStorage()) {
-            CurInit = ImplicitCastExpr::Create(S.Context, Step->Type,
-                                               CK_LValueToRValue, Init,
-                                               /*BasePath=*/nullptr, VK_RValue);
+            CurInit = ImplicitCastExpr::Create(
+                S.Context, Step->Type, CK_LValueToRValue, Init,
+                /*BasePath=*/nullptr, VK_RValue, FPOptionsOverride());
             break;
           }
           // Case 1a
@@ -8572,9 +8574,9 @@ ExprResult InitializationSequence::Perform(Sema &S,
                  !Entity.isDefaultMemberInitializer() &&
                  isa(Init)) {
         // Case 2: Member initialization from a variable.
-        CurInit =
-            ImplicitCastExpr::Create(S.Context, Step->Type, CK_LValueToRValue,
-                                     Init, /*BasePath=*/nullptr, VK_RValue);
+        CurInit = ImplicitCastExpr::Create(
+            S.Context, Step->Type, CK_LValueToRValue, Init,
+            /*BasePath=*/nullptr, VK_RValue, FPOptionsOverride());
         break;
       } else {
         // Case 3
diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp
index c9f2854f7accf..0b081f39299e9 100644
--- a/clang/lib/Sema/SemaLambda.cpp
+++ b/clang/lib/Sema/SemaLambda.cpp
@@ -680,8 +680,9 @@ static void adjustBlockReturnsToEnum(Sema &S, ArrayRef returns,
     ExprWithCleanups *cleanups = dyn_cast<ExprWithCleanups>(retValue);
 
     Expr *E = (cleanups ? cleanups->getSubExpr() : retValue);
-    E = ImplicitCastExpr::Create(S.Context, returnType, CK_IntegralCast,
-                                 E, /*base path*/ nullptr, VK_RValue);
+    E = ImplicitCastExpr::Create(S.Context, returnType, CK_IntegralCast, E,
+                                 /*base path*/ nullptr, VK_RValue,
+                                 FPOptionsOverride());
     if (cleanups) {
       cleanups->setSubExpr(E);
     } else {
diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp
index 23eca88029c57..9dcd9f92bff64 100644
--- a/clang/lib/Sema/SemaLookup.cpp
+++ b/clang/lib/Sema/SemaLookup.cpp
@@ -932,10 +932,9 @@ bool Sema::LookupBuiltin(LookupResult &R) {
             Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID))
           return false;
 
-        if (NamedDecl *D = LazilyCreateBuiltin((IdentifierInfo *)II,
-                                               BuiltinID, TUScope,
-                                               R.isForRedeclaration(),
-                                               R.getNameLoc())) {
+        if (NamedDecl *D =
+                LazilyCreateBuiltin(II, BuiltinID, TUScope,
+                                    R.isForRedeclaration(), R.getNameLoc())) {
           R.addDecl(D);
           return true;
         }
diff --git a/clang/lib/Sema/SemaObjCProperty.cpp b/clang/lib/Sema/SemaObjCProperty.cpp
index e301c62dd2c0b..fdc30fe6f6576 100644
--- a/clang/lib/Sema/SemaObjCProperty.cpp
+++ b/clang/lib/Sema/SemaObjCProperty.cpp
@@ -1464,10 +1464,9 @@ Decl *Sema::ActOnPropertyImplDecl(Scope *S,
           DeclRefExpr(Context, SelfDecl, false, SelfDecl->getType(), VK_LValue,
                       PropertyDiagLoc);
       MarkDeclRefReferenced(SelfExpr);
-      Expr *LoadSelfExpr =
-        ImplicitCastExpr::Create(Context, SelfDecl->getType(),
-                                 CK_LValueToRValue, SelfExpr, nullptr,
-                                 VK_RValue);
+      Expr *LoadSelfExpr = ImplicitCastExpr::Create(
+          Context, SelfDecl->getType(), CK_LValueToRValue, SelfExpr, nullptr,
+          VK_RValue, FPOptionsOverride());
       Expr *IvarRefExpr =
         new (Context) ObjCIvarRefExpr(Ivar,
                                       Ivar->getUsageType(SelfDecl->getType()),
@@ -1528,10 +1527,9 @@ Decl *Sema::ActOnPropertyImplDecl(Scope *S,
           DeclRefExpr(Context, SelfDecl, false, SelfDecl->getType(), VK_LValue,
                       PropertyDiagLoc);
       MarkDeclRefReferenced(SelfExpr);
-      Expr *LoadSelfExpr =
-        ImplicitCastExpr::Create(Context, SelfDecl->getType(),
-                                 CK_LValueToRValue, SelfExpr, nullptr,
-                                 VK_RValue);
+      Expr *LoadSelfExpr = ImplicitCastExpr::Create(
+          Context, SelfDecl->getType(), CK_LValueToRValue, SelfExpr, nullptr,
+          VK_RValue, FPOptionsOverride());
       Expr *lhs =
         new (Context) ObjCIvarRefExpr(Ivar,
                                       Ivar->getUsageType(SelfDecl->getType()),
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 352f52d2f6260..92f6141b6d389 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -2194,6 +2194,7 @@ VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo,
             break;
           }
       }
+      assert(CSI && "Failed to find CapturedRegionScopeInfo");
       SmallVector<OpenMPDirectiveKind, 4> Regions;
       getOpenMPCaptureRegions(Regions,
                               DSAStack->getDirective(CSI->OpenMPLevel));
@@ -2440,10 +2441,6 @@ void Sema::DestroyDataSharingAttributesStack() { delete DSAStack; }
 
 void Sema::ActOnOpenMPBeginDeclareVariant(SourceLocation Loc,
                                           OMPTraitInfo &TI) {
-  if (!OMPDeclareVariantScopes.empty()) {
-    Diag(Loc, diag::warn_nested_declare_variant);
-    return;
-  }
   OMPDeclareVariantScopes.push_back(OMPDeclareVariantScope(TI));
 }
 
@@ -5871,9 +5868,21 @@ static void setPrototype(Sema &S, FunctionDecl *FD, FunctionDecl *FDWithProto,
 Sema::OMPDeclareVariantScope::OMPDeclareVariantScope(OMPTraitInfo &TI)
     : TI(&TI), NameSuffix(TI.getMangledName()) {}
 
-FunctionDecl *
-Sema::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope(Scope *S,
-                                                                Declarator &D) {
+void Sema::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope(
+    Scope *S, Declarator &D, MultiTemplateParamsArg TemplateParamLists,
+    SmallVectorImpl<FunctionDecl *> &Bases) {
+  if (!D.getIdentifier())
+    return;
+
+  OMPDeclareVariantScope &DVScope = OMPDeclareVariantScopes.back();
+
+  // Template specialization is an extension; check whether we may do it.
+  bool IsTemplated = !TemplateParamLists.empty();
+  if (IsTemplated &&
+      !DVScope.TI->isExtensionActive(
+          llvm::omp::TraitProperty::implementation_extension_allow_templates))
+    return;
+
   IdentifierInfo *BaseII = D.getIdentifier();
   LookupResult Lookup(*this, DeclarationName(BaseII), D.getIdentifierLoc(),
                       LookupOrdinaryName);
@@ -5885,9 +5894,13 @@ Sema::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope(Scope *S,
   bool IsConstexpr = D.getDeclSpec().getConstexprSpecifier() == CSK_constexpr;
   bool IsConsteval = D.getDeclSpec().getConstexprSpecifier() == CSK_consteval;
 
-  FunctionDecl *BaseFD = nullptr;
   for (auto *Candidate : Lookup) {
-    auto *UDecl = dyn_cast<FunctionDecl>(Candidate->getUnderlyingDecl());
+    auto *CandidateDecl = Candidate->getUnderlyingDecl();
+    FunctionDecl *UDecl = nullptr;
+    if (IsTemplated && isa<FunctionTemplateDecl>(CandidateDecl))
+      UDecl = cast<FunctionTemplateDecl>(CandidateDecl)->getTemplatedDecl();
+    else if (!IsTemplated)
+      UDecl = dyn_cast<FunctionDecl>(CandidateDecl);
     if (!UDecl)
       continue;
 
@@ -5898,22 +5911,33 @@ Sema::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope(Scope *S,
     if (UDecl->isConsteval() && !IsConsteval)
       continue;
 
-    QualType NewType = Context.mergeFunctionTypes(
-        FType, UDecl->getType(), /* OfBlockPointer */ false,
-        /* Unqualified */ false, /* AllowCXX */ true);
-    if (NewType.isNull())
-      continue;
+    QualType UDeclTy = UDecl->getType();
+    // TODO: Verify types for templates eventually.
+    if (!UDeclTy->isDependentType()) {
+      QualType NewType = Context.mergeFunctionTypes(
+          FType, UDeclTy, /* OfBlockPointer */ false,
+          /* Unqualified */ false, /* AllowCXX */ true);
+      if (NewType.isNull())
+        continue;
+    }
 
     // Found a base!
-    BaseFD = UDecl;
-    break;
-  }
-  if (!BaseFD) {
-    BaseFD = cast(ActOnDeclarator(S, D));
-    BaseFD->setImplicit(true);
+    Bases.push_back(UDecl);
+  }
+
+  bool UseImplicitBase = !DVScope.TI->isExtensionActive(
+      llvm::omp::TraitProperty::implementation_extension_disable_implicit_base);
+  // If no base was found, we create a declaration to use as the base.
+  if (Bases.empty() && UseImplicitBase) {
+    D.setFunctionDefinitionKind(FDK_Declaration);
+    Decl *BaseD = HandleDeclarator(S, D, TemplateParamLists);
+    BaseD->setImplicit(true);
+    if (auto *BaseTemplD = dyn_cast<FunctionTemplateDecl>(BaseD))
+      Bases.push_back(BaseTemplD->getTemplatedDecl());
+    else
+      Bases.push_back(cast<FunctionDecl>(BaseD));
   }
 
-  OMPDeclareVariantScope &DVScope = OMPDeclareVariantScopes.back();
   std::string MangledName;
   MangledName += D.getIdentifier()->getName();
   MangledName += getOpenMPVariantManglingSeparatorStr();
@@ -5922,17 +5946,21 @@ Sema::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope(Scope *S,
 
   VariantII.setMangledOpenMPVariantName(true);
   D.SetIdentifier(&VariantII, D.getBeginLoc());
-  return BaseFD;
 }
 
 void Sema::ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(
-    FunctionDecl *FD, FunctionDecl *BaseFD) {
+    Decl *D, SmallVectorImpl<FunctionDecl *> &Bases) {
   // Do not mark the function as used, to prevent its emission if this is the
   // only place where it is used.
   EnterExpressionEvaluationContext Unevaluated(
       *this, Sema::ExpressionEvaluationContext::Unevaluated);
 
-  Expr *VariantFuncRef = DeclRefExpr::Create(
+  FunctionDecl *FD = nullptr;
+  if (auto *UTemplDecl = dyn_cast<FunctionTemplateDecl>(D))
+    FD = UTemplDecl->getTemplatedDecl();
+  else
+    FD = cast<FunctionDecl>(D);
+  auto *VariantFuncRef = DeclRefExpr::Create(
       Context, NestedNameSpecifierLoc(), SourceLocation(), FD,
       /* RefersToEnclosingVariableOrCapture */ false,
       /* NameLoc */ FD->getLocation(), FD->getType(), ExprValueKind::VK_RValue);
@@ -5940,7 +5968,8 @@ void Sema::ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(
   OMPDeclareVariantScope &DVScope = OMPDeclareVariantScopes.back();
   auto *OMPDeclareVariantA = OMPDeclareVariantAttr::CreateImplicit(
       Context, VariantFuncRef, DVScope.TI);
-  BaseFD->addAttr(OMPDeclareVariantA);
+  for (FunctionDecl *BaseFD : Bases)
+    BaseFD->addAttr(OMPDeclareVariantA);
 }
 
 ExprResult Sema::ActOnOpenMPCall(ExprResult Call, Scope *Scope,
@@ -6128,7 +6157,7 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG,
 
   // Convert VariantRef expression to the type of the original function to
   // resolve possible conflicts.
-  ExprResult VariantRefCast;
+  ExprResult VariantRefCast = VariantRef;
   if (LangOpts.CPlusPlus) {
     QualType FnPtrType;
     auto *Method = dyn_cast<CXXMethodDecl>(FD);
@@ -6153,25 +6182,27 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG,
     } else {
       FnPtrType = Context.getPointerType(FD->getType());
     }
-    ImplicitConversionSequence ICS =
-        TryImplicitConversion(VariantRef, FnPtrType.getUnqualifiedType(),
-                              /*SuppressUserConversions=*/false,
-                              AllowedExplicit::None,
-                              /*InOverloadResolution=*/false,
-                              /*CStyle=*/false,
-                              /*AllowObjCWritebackConversion=*/false);
-    if (ICS.isFailure()) {
-      Diag(VariantRef->getExprLoc(),
-           diag::err_omp_declare_variant_incompat_types)
-          << VariantRef->getType()
-          << ((Method && !Method->isStatic()) ? FnPtrType : FD->getType())
-          << VariantRef->getSourceRange();
-      return None;
+    QualType VariantPtrType = Context.getPointerType(VariantRef->getType());
+    if (VariantPtrType.getUnqualifiedType() != FnPtrType.getUnqualifiedType()) {
+      ImplicitConversionSequence ICS = TryImplicitConversion(
+          VariantRef, FnPtrType.getUnqualifiedType(),
+          /*SuppressUserConversions=*/false, AllowedExplicit::None,
+          /*InOverloadResolution=*/false,
+          /*CStyle=*/false,
+          /*AllowObjCWritebackConversion=*/false);
+      if (ICS.isFailure()) {
+        Diag(VariantRef->getExprLoc(),
+             diag::err_omp_declare_variant_incompat_types)
+            << VariantRef->getType()
+            << ((Method && !Method->isStatic()) ? FnPtrType : FD->getType())
+            << VariantRef->getSourceRange();
+        return None;
+      }
+      VariantRefCast = PerformImplicitConversion(
+          VariantRef, FnPtrType.getUnqualifiedType(), AA_Converting);
+      if (!VariantRefCast.isUsable())
+        return None;
     }
-    VariantRefCast = PerformImplicitConversion(
-        VariantRef, FnPtrType.getUnqualifiedType(), AA_Converting);
-    if (!VariantRefCast.isUsable())
-      return None;
     // Drop previously built artificial addr_of unary op for member functions.
     if (Method && !Method->isStatic()) {
       Expr *PossibleAddrOfVariantRef = VariantRefCast.get();
@@ -6179,8 +6210,6 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG,
               PossibleAddrOfVariantRef->IgnoreImplicit()))
         VariantRefCast = UO->getSubExpr();
     }
-  } else {
-    VariantRefCast = VariantRef;
   }
 
   ExprResult ER = CheckPlaceholderExpr(VariantRefCast.get());
@@ -15119,6 +15148,17 @@ static bool actOnOMPReductionKindClause(
           continue;
         }
       }
+    } else {
+      // Threadprivates cannot be shared between threads, so diagnose if the base
+      // is a threadprivate variable.
+      DSAStackTy::DSAVarData DVar = Stack->getTopDSA(D, /*FromParent=*/false);
+      if (DVar.CKind == OMPC_threadprivate) {
+        S.Diag(ELoc, diag::err_omp_wrong_dsa)
+            << getOpenMPClauseName(DVar.CKind)
+            << getOpenMPClauseName(OMPC_reduction);
+        reportOriginalDsa(S, Stack, D, DVar);
+        continue;
+      }
     }
 
     // Try to find 'declare reduction' corresponding construct before using
@@ -15388,12 +15428,12 @@ static bool actOnOMPReductionKindClause(
       if (!BasePath.empty()) {
         LHS = S.DefaultLvalueConversion(LHS.get());
         RHS = S.DefaultLvalueConversion(RHS.get());
-        LHS = ImplicitCastExpr::Create(Context, PtrRedTy,
-                                       CK_UncheckedDerivedToBase, LHS.get(),
-                                       &BasePath, LHS.get()->getValueKind());
-        RHS = ImplicitCastExpr::Create(Context, PtrRedTy,
-                                       CK_UncheckedDerivedToBase, RHS.get(),
-                                       &BasePath, RHS.get()->getValueKind());
+        LHS = ImplicitCastExpr::Create(
+            Context, PtrRedTy, CK_UncheckedDerivedToBase, LHS.get(), &BasePath,
+            LHS.get()->getValueKind(), FPOptionsOverride());
+        RHS = ImplicitCastExpr::Create(
+            Context, PtrRedTy, CK_UncheckedDerivedToBase, RHS.get(), &BasePath,
+            RHS.get()->getValueKind(), FPOptionsOverride());
       }
       FunctionProtoType::ExtProtoInfo EPI;
       QualType Params[] = {PtrRedTy, PtrRedTy};
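Much of the SemaOpenMP churn above implements the `allow_templates` extension trait: function definitions inside a `begin/end declare variant` region may now be templates, and each definition is attached as a variant to every compatible base found by lookup (or to an implicit base when none exists). A sketch of the source-level construct this enables; the example is illustrative, not taken from the patch's tests:

  #pragma omp begin declare variant match(implementation = {extension(allow_templates)})
  template <typename T> T twice(T t) { return t + t; } // attached to any compatible base 'twice'
  #pragma omp end declare variant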
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 71341e5688fe0..95d110e754f45 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -5862,7 +5862,8 @@ diagnoseNoViableConversion(Sema &SemaRef, SourceLocation Loc, Expr *&From,
     // Record usage of conversion in an implicit cast.
     From = ImplicitCastExpr::Create(SemaRef.Context, Result.get()->getType(),
                                     CK_UserDefinedConversion, Result.get(),
-                                    nullptr, Result.get()->getValueKind());
+                                    nullptr, Result.get()->getValueKind(),
+                                    SemaRef.CurFPFeatureOverrides());
   }
   return false;
 }
@@ -5891,7 +5892,8 @@ static bool recordConversion(Sema &SemaRef, SourceLocation Loc, Expr *&From,
   // Record usage of conversion in an implicit cast.
   From = ImplicitCastExpr::Create(SemaRef.Context, Result.get()->getType(),
                                   CK_UserDefinedConversion, Result.get(),
-                                  nullptr, Result.get()->getValueKind());
+                                  nullptr, Result.get()->getValueKind(),
+                                  SemaRef.CurFPFeatureOverrides());
   return false;
 }
 
@@ -7296,8 +7298,8 @@ void Sema::AddConversionCandidate(
                             VK_LValue, From->getBeginLoc());
   ImplicitCastExpr ConversionFn(ImplicitCastExpr::OnStack,
                                 Context.getPointerType(Conversion->getType()),
-                                CK_FunctionToPointerDecay,
-                                &ConversionRef, VK_RValue);
+                                CK_FunctionToPointerDecay, &ConversionRef,
+                                VK_RValue, FPOptionsOverride());
 
   QualType ConversionType = Conversion->getConversionType();
   if (!isCompleteType(From->getBeginLoc(), ConversionType)) {
@@ -14422,9 +14424,9 @@ Sema::BuildCallToObjectOfClassType(Scope *S, Expr *Obj,
     if (Call.isInvalid())
       return ExprError();
     // Record usage of conversion in an implicit cast.
-    Call = ImplicitCastExpr::Create(Context, Call.get()->getType(),
-                                    CK_UserDefinedConversion, Call.get(),
-                                    nullptr, VK_RValue);
+    Call = ImplicitCastExpr::Create(
+        Context, Call.get()->getType(), CK_UserDefinedConversion, Call.get(),
+        nullptr, VK_RValue, CurFPFeatureOverrides());
 
     return BuildCallExpr(S, Call.get(), LParenLoc, Args, RParenLoc);
   }
@@ -14829,10 +14831,9 @@ Expr *Sema::FixOverloadedFunctionReference(Expr *E, DeclAccessPair Found,
     if (SubExpr == ICE->getSubExpr())
       return ICE;
 
-    return ImplicitCastExpr::Create(Context, ICE->getType(),
-                                    ICE->getCastKind(),
-                                    SubExpr, nullptr,
-                                    ICE->getValueKind());
+    return ImplicitCastExpr::Create(Context, ICE->getType(), ICE->getCastKind(),
+                                    SubExpr, nullptr, ICE->getValueKind(),
+                                    CurFPFeatureOverrides());
   }
 
   if (auto *GSE = dyn_cast(E)) {
diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp
index 11efd7577a2e9..ad8d259937025 100644
--- a/clang/lib/Sema/SemaSYCL.cpp
+++ b/clang/lib/Sema/SemaSYCL.cpp
@@ -1864,15 +1864,15 @@ class SyclKernelBodyCreator : public SyclKernelFieldHandler {
       ParamType = Pointer->getType();
     }
 
-    DRE =
-        ImplicitCastExpr::Create(SemaRef.Context, ParamType, CK_LValueToRValue,
-                                 DRE, /*BasePath=*/nullptr, VK_RValue);
+    DRE = ImplicitCastExpr::Create(SemaRef.Context, ParamType,
+                                   CK_LValueToRValue, DRE, /*BasePath=*/nullptr,
+                                   VK_RValue, FPOptionsOverride());
 
     if (PointerTy->getPointeeType().getAddressSpace() !=
         ParamType->getPointeeType().getAddressSpace())
       DRE = ImplicitCastExpr::Create(SemaRef.Context, PointerTy,
                                      CK_AddressSpaceConversion, DRE, nullptr,
-                                     VK_RValue);
+                                     VK_RValue, FPOptionsOverride());
 
     return DRE;
   }
@@ -2191,7 +2191,7 @@ class SyclKernelBodyCreator : public SyclKernelFieldHandler {
                                          /*IgnoreBaseAccess*/ true);
     auto Cast = ImplicitCastExpr::Create(
         SemaRef.Context, BaseTy, CK_DerivedToBase, MemberExprBases.back(),
-        /* CXXCastPath=*/&BasePath, VK_LValue);
+        /* CXXCastPath=*/&BasePath, VK_LValue, FPOptionsOverride());
     MemberExprBases.push_back(Cast);
 
     addCollectionInitListExpr(BaseTy->getAsCXXRecordDecl());
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index febdebca31879..baa7c4cf212df 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -597,6 +597,18 @@ StmtResult Sema::ActOnIfStmt(SourceLocation IfLoc, bool IsConstexpr,
     DiagnoseEmptyStmtBody(CondExpr->getEndLoc(), thenStmt,
                           diag::warn_empty_if_body);
 
+  std::tuple<bool, const Attr *, const Attr *> LHC =
+      Stmt::determineLikelihoodConflict(thenStmt, elseStmt);
+  if (std::get<0>(LHC)) {
+    const Attr *ThenAttr = std::get<1>(LHC);
+    const Attr *ElseAttr = std::get<2>(LHC);
+    Diags.Report(ThenAttr->getLocation(),
+                 diag::warn_attributes_likelihood_ifstmt_conflict)
+        << ThenAttr << ThenAttr->getRange();
+    Diags.Report(ElseAttr->getLocation(), diag::note_conflicting_attribute)
+        << ElseAttr << ElseAttr->getRange();
+  }
+
   return BuildIfStmt(IfLoc, IsConstexpr, LParenLoc, InitStmt, Cond, RParenLoc,
                      thenStmt, ElseLoc, elseStmt);
 }
@@ -3083,7 +3095,7 @@ static void TryMoveInitialization(Sema& S,
                                   bool ConvertingConstructorsOnly,
                                   ExprResult &Res) {
   ImplicitCastExpr AsRvalue(ImplicitCastExpr::OnStack, Value->getType(),
-                            CK_NoOp, Value, VK_XValue);
+                            CK_NoOp, Value, VK_XValue, FPOptionsOverride());
 
   Expr *InitExpr = &AsRvalue;
 
@@ -3138,8 +3150,9 @@ static void TryMoveInitialization(Sema& S,
 
     // Promote "AsRvalue" to the heap, since we now need this
     // expression node to persist.
-    Value = ImplicitCastExpr::Create(S.Context, Value->getType(), CK_NoOp,
-                                     Value, nullptr, VK_XValue);
+    Value =
+        ImplicitCastExpr::Create(S.Context, Value->getType(), CK_NoOp, Value,
+                                 nullptr, VK_XValue, FPOptionsOverride());
 
     // Complete type-checking the initialization of the return type
     // using the constructor we found.
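The new block in ActOnIfStmt, together with the SemaStmtAttr changes below, diagnoses contradictory likelihood hints on the two arms of an if statement. An illustrative C++20 input the added diagnostics are meant to flag (not from the patch's tests); marking both branches likely triggers the warning on one attribute and the note on the other:

  int classify(int x) {
    if (x > 0) [[likely]] {
      return 1;
    } else [[likely]] { // both arms marked likely: warning plus note
      return -1;
    }
  }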
diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp
index a0b6550136f92..387c0f9358888 100644
--- a/clang/lib/Sema/SemaStmtAttr.cpp
+++ b/clang/lib/Sema/SemaStmtAttr.cpp
@@ -413,6 +413,24 @@ static Attr *handleNoMergeAttr(Sema &S, Stmt *St, const ParsedAttr &A,
   return ::new (S.Context) NoMergeAttr(S.Context, A);
 }
 
+static Attr *handleLikely(Sema &S, Stmt *St, const ParsedAttr &A,
+                          SourceRange Range) {
+
+  if (!S.getLangOpts().CPlusPlus20 && A.isCXX11Attribute() && !A.getScopeName())
+    S.Diag(A.getLoc(), diag::ext_cxx20_attr) << A << Range;
+
+  return ::new (S.Context) LikelyAttr(S.Context, A);
+}
+
+static Attr *handleUnlikely(Sema &S, Stmt *St, const ParsedAttr &A,
+                            SourceRange Range) {
+
+  if (!S.getLangOpts().CPlusPlus20 && A.isCXX11Attribute() && !A.getScopeName())
+    S.Diag(A.getLoc(), diag::ext_cxx20_attr) << A << Range;
+
+  return ::new (S.Context) UnlikelyAttr(S.Context, A);
+}
+
 static void
 CheckForIncompatibleAttributes(Sema &S,
                                const SmallVectorImpl<const Attr *> &Attrs) {
@@ -518,6 +536,32 @@ CheckForIncompatibleAttributes(Sema &S,
           << CategoryState.NumericAttr->getDiagnosticName(Policy);
     }
   }
+
+  // C++20 [dcl.attr.likelihood]p1 The attribute-token likely shall not appear
+  // in an attribute-specifier-seq that contains the attribute-token unlikely.
+  const LikelyAttr *Likely = nullptr;
+  const UnlikelyAttr *Unlikely = nullptr;
+  for (const auto *I : Attrs) {
+    if (const auto *Attr = dyn_cast<LikelyAttr>(I)) {
+      if (Unlikely) {
+        S.Diag(Attr->getLocation(), diag::err_attributes_are_not_compatible)
+            << Attr << Unlikely << Attr->getRange();
+        S.Diag(Unlikely->getLocation(), diag::note_conflicting_attribute)
+            << Unlikely->getRange();
+        return;
+      }
+      Likely = Attr;
+    } else if (const auto *Attr = dyn_cast<UnlikelyAttr>(I)) {
+      if (Likely) {
+        S.Diag(Attr->getLocation(), diag::err_attributes_are_not_compatible)
+            << Attr << Likely << Attr->getRange();
+        S.Diag(Likely->getLocation(), diag::note_conflicting_attribute)
+            << Likely->getRange();
+        return;
+      }
+      Unlikely = Attr;
+    }
+  }
 }
 
 template 
@@ -715,6 +759,10 @@ static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A,
     return handleSuppressAttr(S, St, A, Range);
   case ParsedAttr::AT_NoMerge:
     return handleNoMergeAttr(S, St, A, Range);
+  case ParsedAttr::AT_Likely:
+    return handleLikely(S, St, A, Range);
+  case ParsedAttr::AT_Unlikely:
+    return handleUnlikely(S, St, A, Range);
   default:
     // if we're here, then we parsed a known attribute, but didn't recognize
     // it as a statement attribute => it is declaration attribute
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 6721b07253292..e1a563850970a 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -7478,7 +7478,7 @@ Sema::BuildExpressionFromIntegralTemplateArgument(const TemplateArgument &Arg,
     // FIXME: This is a hack. We need a better way to handle substituted
     // non-type template parameters.
     E = CStyleCastExpr::Create(Context, OrigT, VK_RValue, CK_IntegralCast, E,
-                               nullptr,
+                               nullptr, CurFPFeatureOverrides(),
                                Context.getTrivialTypeSourceInfo(OrigT, Loc),
                                Loc, Loc);
   }
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index a9526671fd5a3..bc4d68a4edd36 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -418,7 +418,9 @@ static void instantiateOMPDeclareVariantAttr(
   if (TI.anyScoreOrCondition(SubstScoreOrConditionExpr))
     return;
 
-  // Check function/variant ref.
+  Expr *E = VariantFuncRef.get();
+  // Check function/variant ref for `omp declare variant` but not for `omp
+  // begin declare variant` (which uses implicit attributes).
   Optional> DeclVarData =
       S.checkOpenMPDeclareVariantFunction(S.ConvertDeclToDeclGroup(New),
                                           VariantFuncRef.get(), TI,
@@ -427,9 +429,36 @@ static void instantiateOMPDeclareVariantAttr(
   if (!DeclVarData)
     return;
 
-  S.ActOnOpenMPDeclareVariantDirective(DeclVarData.getValue().first,
-                                       DeclVarData.getValue().second, TI,
-                                       Attr.getRange());
+  E = DeclVarData.getValue().second;
+  FD = DeclVarData.getValue().first;
+
+  if (auto *VariantDRE = dyn_cast<DeclRefExpr>(E->IgnoreParenImpCasts())) {
+    if (auto *VariantFD = dyn_cast<FunctionDecl>(VariantDRE->getDecl())) {
+      if (auto *VariantFTD = VariantFD->getDescribedFunctionTemplate()) {
+        if (!VariantFTD->isThisDeclarationADefinition())
+          return;
+        Sema::TentativeAnalysisScope Trap(S);
+        const TemplateArgumentList *TAL = TemplateArgumentList::CreateCopy(
+            S.Context, TemplateArgs.getInnermost());
+
+        auto *SubstFD = S.InstantiateFunctionDeclaration(VariantFTD, TAL,
+                                                         New->getLocation());
+        if (!SubstFD)
+          return;
+        S.InstantiateFunctionDefinition(
+            New->getLocation(), SubstFD, /* Recursive */ true,
+            /* DefinitionRequired */ false, /* AtEndOfTU */ false);
+        SubstFD->setInstantiationIsPending(!SubstFD->isDefined());
+        E = DeclRefExpr::Create(S.Context, NestedNameSpecifierLoc(),
+                                SourceLocation(), SubstFD,
+                                /* RefersToEnclosingVariableOrCapture */ false,
+                                /* NameLoc */ SubstFD->getLocation(),
+                                SubstFD->getType(), ExprValueKind::VK_RValue);
+      }
+    }
+  }
+
+  S.ActOnOpenMPDeclareVariantDirective(FD, E, TI, Attr.getRange());
 }
 
 static void instantiateDependentAMDGPUFlatWorkGroupSizeAttr(
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 2063425f46b16..9d3831583660b 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -4054,32 +4054,9 @@ classifyPointerDeclarator(Sema &S, QualType type, Declarator &declarator,
     if (auto recordType = type->getAs<RecordType>()) {
       RecordDecl *recordDecl = recordType->getDecl();
 
-      bool isCFError = false;
-      if (S.CFError) {
-        // If we already know about CFError, test it directly.
-        isCFError = (S.CFError == recordDecl);
-      } else {
-        // Check whether this is CFError, which we identify based on its bridge
-        // to NSError. CFErrorRef used to be declared with "objc_bridge" but is
-        // now declared with "objc_bridge_mutable", so look for either one of
-        // the two attributes.
-        if (recordDecl->getTagKind() == TTK_Struct && numNormalPointers > 0) {
-          IdentifierInfo *bridgedType = nullptr;
-          if (auto bridgeAttr = recordDecl->getAttr<ObjCBridgeAttr>())
-            bridgedType = bridgeAttr->getBridgedType();
-          else if (auto bridgeAttr =
-                       recordDecl->getAttr<ObjCBridgeMutableAttr>())
-            bridgedType = bridgeAttr->getBridgedType();
-
-          if (bridgedType == S.getNSErrorIdent()) {
-            S.CFError = recordDecl;
-            isCFError = true;
-          }
-        }
-      }
-
       // If this is CFErrorRef*, report it as such.
-      if (isCFError && numNormalPointers == 2 && numTypeSpecifierPointers < 2) {
+      if (numNormalPointers == 2 && numTypeSpecifierPointers < 2 &&
+          S.isCFError(recordDecl)) {
         return PointerDeclaratorKind::CFErrorRefPointer;
       }
       break;
@@ -4103,6 +4080,31 @@ classifyPointerDeclarator(Sema &S, QualType type, Declarator &declarator,
   }
 }
 
+bool Sema::isCFError(RecordDecl *RD) {
+  // If we already know about CFError, test it directly.
+  if (CFError)
+    return CFError == RD;
+
+  // Check whether this is CFError, which we identify based on its bridge to
+  // NSError. CFErrorRef used to be declared with "objc_bridge" but is now
+  // declared with "objc_bridge_mutable", so look for either one of the two
+  // attributes.
+  if (RD->getTagKind() == TTK_Struct) {
+    IdentifierInfo *bridgedType = nullptr;
+    if (auto bridgeAttr = RD->getAttr<ObjCBridgeAttr>())
+      bridgedType = bridgeAttr->getBridgedType();
+    else if (auto bridgeAttr = RD->getAttr<ObjCBridgeMutableAttr>())
+      bridgedType = bridgeAttr->getBridgedType();
+
+    if (bridgedType == getNSErrorIdent()) {
+      CFError = RD;
+      return true;
+    }
+  }
+
+  return false;
+}
+
 static FileID getNullabilityCompletenessCheckFileID(Sema &S,
                                                     SourceLocation loc) {
   // If we're anywhere in a function, method, or closure context, don't perform
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index bcfeeb0a501e7..8849cd8182b9b 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -28,6 +28,7 @@
 #include "clang/AST/StmtCXX.h"
 #include "clang/AST/StmtObjC.h"
 #include "clang/AST/StmtOpenMP.h"
+#include "clang/Basic/DiagnosticParse.h"
 #include "clang/Basic/OpenMPKinds.h"
 #include "clang/Sema/Designator.h"
 #include "clang/Sema/Lookup.h"
@@ -13200,6 +13201,18 @@ TreeTransform<Derived>::TransformCXXFoldExpr(CXXFoldExpr *E) {
         E->getEllipsisLoc(), RHS.get(), E->getEndLoc(), NumExpansions);
   }
 
+  // Formally a fold expression expands to nested parenthesized expressions.
+  // Enforce this limit to avoid creating trees so deep we can't safely traverse
+  // them.
+  if (NumExpansions && SemaRef.getLangOpts().BracketDepth < NumExpansions) {
+    SemaRef.Diag(E->getEllipsisLoc(),
+                 clang::diag::err_fold_expression_limit_exceeded)
+        << *NumExpansions << SemaRef.getLangOpts().BracketDepth
+        << E->getSourceRange();
+    SemaRef.Diag(E->getEllipsisLoc(), diag::note_bracket_depth);
+    return ExprError();
+  }
+
   // The transform has determined that we should perform an elementwise
   // expansion of the pattern. Do so.
   ExprResult Result = getDerived().TransformExpr(E->getInit());
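Since a fold over a pack of N elements expands to roughly N nested parenthesized subexpressions, the check rejects instantiations whose pack size exceeds the -fbracket-depth limit (256 by default) instead of building a tree too deep to traverse safely. A sketch of code that would trip the limit under the default setting:

  template <typename... Ts>
  constexpr auto sum(Ts... ts) {
    return (ts + ...); // expands to t0 + (t1 + (t2 + ...)), one level per element
  }
  // Calling sum with a pack of more than 256 arguments now produces
  // err_fold_expression_limit_exceeded rather than a pathologically deep AST.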
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 88073dfc39298..d97b2a7b85df6 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -910,9 +910,8 @@ ASTIdentifierLookupTraitBase::ReadKey(const unsigned char* d, unsigned n) {
 /// Whether the given identifier is "interesting".
 static bool isInterestingIdentifier(ASTReader &Reader, IdentifierInfo &II,
                                     bool IsModule) {
-  return II.hadMacroDefinition() ||
-         II.isPoisoned() ||
-         (IsModule ? II.hasRevertedBuiltin() : II.getObjCOrBuiltinID()) ||
+  return II.hadMacroDefinition() || II.isPoisoned() ||
+         (!IsModule && II.getObjCOrBuiltinID()) ||
          II.hasRevertedTokenIDToIdentifier() ||
          (!(IsModule && Reader.getPreprocessor().getLangOpts().CPlusPlus) &&
           II.getFETokenInfo());
@@ -972,7 +971,6 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k,
   unsigned Bits = endian::readNext<uint16_t, little, unaligned>(d);
   bool CPlusPlusOperatorKeyword = readBit(Bits);
   bool HasRevertedTokenIDToIdentifier = readBit(Bits);
-  bool HasRevertedBuiltin = readBit(Bits);
   bool Poisoned = readBit(Bits);
   bool ExtensionToken = readBit(Bits);
   bool HadMacroDefinition = readBit(Bits);
@@ -986,12 +984,6 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k,
     II->revertTokenIDToIdentifier();
   if (!F.isModule())
     II->setObjCOrBuiltinID(ObjCOrBuiltinID);
-  else if (HasRevertedBuiltin && II->getBuiltinID()) {
-    II->revertBuiltin();
-    assert((II->hasRevertedBuiltin() ||
-            II->getObjCOrBuiltinID() == ObjCOrBuiltinID) &&
-           "Incorrect ObjC keyword or builtin ID");
-  }
   assert(II->isExtensionToken() == ExtensionToken &&
          "Incorrect extension token flag");
   (void)ExtensionToken;
@@ -3950,7 +3942,7 @@ ASTReader::ReadModuleMapFileBlock(RecordData &Record, ModuleFile &F,
       return OutOfDate;
     }
 
-    assert(M->Name == F.ModuleName && "found module with different name");
+    assert(M && M->Name == F.ModuleName && "found module with different name");
 
     // Check the primary module map file.
     auto StoredModMap = FileMgr.getFile(F.ModuleMapPath);
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index e261044f7cb14..c154c146727e9 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -1082,6 +1082,8 @@ void ASTStmtReader::VisitCastExpr(CastExpr *E) {
   VisitExpr(E);
   unsigned NumBaseSpecs = Record.readInt();
   assert(NumBaseSpecs == E->path_size());
+  unsigned HasFPFeatures = Record.readInt();
+  assert(E->hasStoredFPFeatures() == HasFPFeatures);
   E->setSubExpr(Record.readSubExpr());
   E->setCastKind((CastKind)Record.readInt());
   CastExpr::path_iterator BaseI = E->path_begin();
@@ -1090,6 +1092,9 @@ void ASTStmtReader::VisitCastExpr(CastExpr *E) {
     *BaseSpec = Record.readCXXBaseSpecifier();
     *BaseI++ = BaseSpec;
   }
+  if (HasFPFeatures)
+    *E->getTrailingFPFeatures() =
+        FPOptionsOverride::getFromOpaqueInt(Record.readInt());
 }
 
 void ASTStmtReader::VisitBinaryOperator(BinaryOperator *E) {
@@ -2893,13 +2898,17 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
       break;
 
     case EXPR_IMPLICIT_CAST:
-      S = ImplicitCastExpr::CreateEmpty(Context,
-                       /*PathSize*/ Record[ASTStmtReader::NumExprFields]);
+      S = ImplicitCastExpr::CreateEmpty(
+          Context,
+          /*PathSize*/ Record[ASTStmtReader::NumExprFields],
+          /*HasFPFeatures*/ Record[ASTStmtReader::NumExprFields + 1]);
       break;
 
     case EXPR_CSTYLE_CAST:
-      S = CStyleCastExpr::CreateEmpty(Context,
-                       /*PathSize*/ Record[ASTStmtReader::NumExprFields]);
+      S = CStyleCastExpr::CreateEmpty(
+          Context,
+          /*PathSize*/ Record[ASTStmtReader::NumExprFields],
+          /*HasFPFeatures*/ Record[ASTStmtReader::NumExprFields + 1]);
       break;
 
     case EXPR_COMPOUND_LITERAL:
@@ -3501,8 +3510,10 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
       break;
 
     case EXPR_CXX_STATIC_CAST:
-      S = CXXStaticCastExpr::CreateEmpty(Context,
-                       /*PathSize*/ Record[ASTStmtReader::NumExprFields]);
+      S = CXXStaticCastExpr::CreateEmpty(
+          Context,
+          /*PathSize*/ Record[ASTStmtReader::NumExprFields],
+          /*HasFPFeatures*/ Record[ASTStmtReader::NumExprFields + 1]);
       break;
 
     case EXPR_CXX_DYNAMIC_CAST:
@@ -3524,8 +3535,10 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
       break;
 
     case EXPR_CXX_FUNCTIONAL_CAST:
-      S = CXXFunctionalCastExpr::CreateEmpty(Context,
-                       /*PathSize*/ Record[ASTStmtReader::NumExprFields]);
+      S = CXXFunctionalCastExpr::CreateEmpty(
+          Context,
+          /*PathSize*/ Record[ASTStmtReader::NumExprFields],
+          /*HasFPFeatures*/ Record[ASTStmtReader::NumExprFields + 1]);
       break;
 
     case EXPR_BUILTIN_BIT_CAST:
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 9a72108cb02c2..ea0e18211fd7e 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -3275,9 +3275,8 @@ class ASTIdentifierTableTrait {
   /// doesn't check whether the name has macros defined; use PublicMacroIterator
   /// to check that.
   bool isInterestingIdentifier(const IdentifierInfo *II, uint64_t MacroOffset) {
-    if (MacroOffset ||
-        II->isPoisoned() ||
-        (IsModule ? II->hasRevertedBuiltin() : II->getObjCOrBuiltinID()) ||
+    if (MacroOffset || II->isPoisoned() ||
+        (!IsModule && II->getObjCOrBuiltinID()) ||
         II->hasRevertedTokenIDToIdentifier() ||
         (NeedDecls && II->getFETokenInfo()))
       return true;
@@ -3384,7 +3383,6 @@ class ASTIdentifierTableTrait {
     Bits = (Bits << 1) | unsigned(HadMacroDefinition);
     Bits = (Bits << 1) | unsigned(II->isExtensionToken());
     Bits = (Bits << 1) | unsigned(II->isPoisoned());
-    Bits = (Bits << 1) | unsigned(II->hasRevertedBuiltin());
     Bits = (Bits << 1) | unsigned(II->hasRevertedTokenIDToIdentifier());
     Bits = (Bits << 1) | unsigned(II->isCPlusPlusOperatorKeyword());
     LE.write(Bits);
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index 2d250674057c3..911fcb4095474 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -2346,6 +2346,7 @@ void ASTWriter::WriteDeclAbbrevs() {
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); //GetObjectKind
   // CastExpr
   Abv->Add(BitCodeAbbrevOp(0)); // PathSize
+  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // HasFPFeatures
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 6)); // CastKind
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // PartOfExplicitCast
   // ImplicitCastExpr
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index 4e3e1fdc346fc..0121f25832073 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -946,12 +946,16 @@ void ASTStmtWriter::VisitObjCBridgedCastExpr(ObjCBridgedCastExpr *E) {
 void ASTStmtWriter::VisitCastExpr(CastExpr *E) {
   VisitExpr(E);
   Record.push_back(E->path_size());
+  Record.push_back(E->hasStoredFPFeatures());
   Record.AddStmt(E->getSubExpr());
   Record.push_back(E->getCastKind()); // FIXME: stable encoding
 
   for (CastExpr::path_iterator
          PI = E->path_begin(), PE = E->path_end(); PI != PE; ++PI)
     Record.AddCXXBaseSpecifier(**PI);
+
+  if (E->hasStoredFPFeatures())
+    Record.push_back(E->getFPFeatures().getAsOpaqueInt());
 }
 
 void ASTStmtWriter::VisitBinaryOperator(BinaryOperator *E) {
@@ -1003,7 +1007,7 @@ void ASTStmtWriter::VisitImplicitCastExpr(ImplicitCastExpr *E) {
   VisitCastExpr(E);
   Record.push_back(E->isPartOfExplicitCast());
 
-  if (E->path_size() == 0)
+  if (E->path_size() == 0 && !E->hasStoredFPFeatures())
     AbbrevToUse = Writer.getExprImplicitCastAbbrev();
 
   Code = serialization::EXPR_IMPLICIT_CAST;
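Reader and writer stay in sync on the new record layout: the HasFPFeatures flag is written immediately after the path size, and the packed overrides go at the very end of the record, matching the order VisitCastExpr consumes them in ASTReaderStmt.cpp above. Schematically:

  // CastExpr record layout after this patch (field order only):
  //   [Expr fields][path_size][HasFPFeatures][subexpr][cast kind]
  //   [base specifiers...][FPOptionsOverride opaque int, iff HasFPFeatures]
  // The abbreviation fast path for ImplicitCastExpr now requires both
  // path_size == 0 and HasFPFeatures == 0, since either adds trailing data.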
diff --git a/clang/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp b/clang/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp
index 918c6e361381e..a86a410ebcbc1 100644
--- a/clang/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp
@@ -978,8 +978,7 @@ void ObjCLoopChecker::checkPostStmt(const ObjCForCollectionStmt *FCS,
   ProgramStateRef State = C.getState();
 
   // Check if this is the branch for the end of the loop.
-  SVal CollectionSentinel = C.getSVal(FCS);
-  if (CollectionSentinel.isZeroConstant()) {
+  if (!ExprEngine::hasMoreIteration(State, FCS, C.getLocationContext())) {
     if (!alreadyExecutedAtLeastOneLoopIteration(C.getPredecessor(), FCS))
       State = assumeCollectionNonEmpty(C, State, FCS, /*Assumption*/false);
 
diff --git a/clang/lib/StaticAnalyzer/Checkers/DebugCheckers.cpp b/clang/lib/StaticAnalyzer/Checkers/DebugCheckers.cpp
index 03b7cbd1c833d..7cdd78b8adfb7 100644
--- a/clang/lib/StaticAnalyzer/Checkers/DebugCheckers.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/DebugCheckers.cpp
@@ -131,21 +131,21 @@ bool ento::shouldRegisterLiveVariablesDumper(const CheckerManager &mgr) {
 //===----------------------------------------------------------------------===//
 
 namespace {
-class LiveStatementsDumper : public Checker<check::ASTCodeBody> {
+class LiveExpressionsDumper : public Checker<check::ASTCodeBody> {
 public:
   void checkASTCodeBody(const Decl *D, AnalysisManager& Mgr,
                         BugReporter &BR) const {
     if (LiveVariables *L = Mgr.getAnalysis<LiveVariables>(D))
-      L->dumpStmtLiveness(Mgr.getSourceManager());
+      L->dumpExprLiveness(Mgr.getSourceManager());
   }
 };
 }
 
-void ento::registerLiveStatementsDumper(CheckerManager &mgr) {
-  mgr.registerChecker<LiveStatementsDumper>();
+void ento::registerLiveExpressionsDumper(CheckerManager &mgr) {
+  mgr.registerChecker<LiveExpressionsDumper>();
 }
 
-bool ento::shouldRegisterLiveStatementsDumper(const CheckerManager &mgr) {
+bool ento::shouldRegisterLiveExpressionsDumper(const CheckerManager &mgr) {
   return true;
 }
 
diff --git a/clang/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp
index 285d2da104f1a..88e80c481a5a7 100644
--- a/clang/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp
@@ -83,7 +83,7 @@ class PthreadLockChecker : public Checker
   CallDescriptionMap<FnCheck> PThreadCallbacks = {
       // Init.
       {{"pthread_mutex_init", 2}, &PthreadLockChecker::InitAnyLock},
@@ -167,46 +167,49 @@ class PthreadLockChecker : public Checker
+  void reportBug(CheckerContext &C, std::unique_ptr<BugType> BT[],
+                 const Expr *MtxExpr, CheckerKind CheckKind,
+                 StringRef Desc) const;
 
   // Init.
   void InitAnyLock(const CallEvent &Call, CheckerContext &C,
-                   CheckerKind checkkind) const;
-  void InitLockAux(const CallEvent &Call, CheckerContext &C, unsigned ArgNo,
-                   SVal Lock, CheckerKind checkkind) const;
+                   CheckerKind CheckKind) const;
+  void InitLockAux(const CallEvent &Call, CheckerContext &C,
+                   const Expr *MtxExpr, SVal MtxVal,
+                   CheckerKind CheckKind) const;
 
   // Lock, Try-lock.
   void AcquirePthreadLock(const CallEvent &Call, CheckerContext &C,
-                          CheckerKind checkkind) const;
+                          CheckerKind CheckKind) const;
   void AcquireXNULock(const CallEvent &Call, CheckerContext &C,
-                      CheckerKind checkkind) const;
+                      CheckerKind CheckKind) const;
   void TryPthreadLock(const CallEvent &Call, CheckerContext &C,
-                      CheckerKind checkkind) const;
+                      CheckerKind CheckKind) const;
   void TryXNULock(const CallEvent &Call, CheckerContext &C,
-                  CheckerKind checkkind) const;
+                  CheckerKind CheckKind) const;
   void TryFuchsiaLock(const CallEvent &Call, CheckerContext &C,
-                      CheckerKind checkkind) const;
+                      CheckerKind CheckKind) const;
   void TryC11Lock(const CallEvent &Call, CheckerContext &C,
-                  CheckerKind checkkind) const;
-  void AcquireLockAux(const CallEvent &Call, CheckerContext &C, unsigned ArgNo,
-                      SVal lock, bool isTryLock, LockingSemantics semantics,
-                      CheckerKind checkkind) const;
+                  CheckerKind CheckKind) const;
+  void AcquireLockAux(const CallEvent &Call, CheckerContext &C,
+                      const Expr *MtxExpr, SVal MtxVal, bool IsTryLock,
+                      LockingSemantics Semantics, CheckerKind CheckKind) const;
 
   // Release.
   void ReleaseAnyLock(const CallEvent &Call, CheckerContext &C,
-                      CheckerKind checkkind) const;
-  void ReleaseLockAux(const CallEvent &Call, CheckerContext &C, unsigned ArgNo,
-                      SVal lock, CheckerKind checkkind) const;
+                      CheckerKind CheckKind) const;
+  void ReleaseLockAux(const CallEvent &Call, CheckerContext &C,
+                      const Expr *MtxExpr, SVal MtxVal,
+                      CheckerKind CheckKind) const;
 
   // Destroy.
   void DestroyPthreadLock(const CallEvent &Call, CheckerContext &C,
-                          CheckerKind checkkind) const;
+                          CheckerKind CheckKind) const;
   void DestroyXNULock(const CallEvent &Call, CheckerContext &C,
-                      CheckerKind checkkind) const;
-  void DestroyLockAux(const CallEvent &Call, CheckerContext &C, unsigned ArgNo,
-                      SVal Lock, LockingSemantics semantics,
-                      CheckerKind checkkind) const;
+                      CheckerKind CheckKind) const;
+  void DestroyLockAux(const CallEvent &Call, CheckerContext &C,
+                      const Expr *MtxExpr, SVal MtxVal,
+                      LockingSemantics Semantics, CheckerKind CheckKind) const;
 
 public:
   void checkPostCall(const CallEvent &Call, CheckerContext &C) const;
@@ -226,18 +229,18 @@ class PthreadLockChecker : public Checker BT_initlock[CK_NumCheckKinds];
   mutable std::unique_ptr BT_lor[CK_NumCheckKinds];
 
-  void initBugType(CheckerKind checkKind) const {
-    if (BT_doublelock[checkKind])
+  void initBugType(CheckerKind CheckKind) const {
+    if (BT_doublelock[CheckKind])
       return;
-    BT_doublelock[checkKind].reset(
-        new BugType{CheckNames[checkKind], "Double locking", "Lock checker"});
-    BT_doubleunlock[checkKind].reset(
-        new BugType{CheckNames[checkKind], "Double unlocking", "Lock checker"});
-    BT_destroylock[checkKind].reset(new BugType{
-        CheckNames[checkKind], "Use destroyed lock", "Lock checker"});
-    BT_initlock[checkKind].reset(new BugType{
-        CheckNames[checkKind], "Init invalid lock", "Lock checker"});
-    BT_lor[checkKind].reset(new BugType{CheckNames[checkKind],
+    BT_doublelock[CheckKind].reset(
+        new BugType{CheckNames[CheckKind], "Double locking", "Lock checker"});
+    BT_doubleunlock[CheckKind].reset(
+        new BugType{CheckNames[CheckKind], "Double unlocking", "Lock checker"});
+    BT_destroylock[CheckKind].reset(new BugType{
+        CheckNames[CheckKind], "Use destroyed lock", "Lock checker"});
+    BT_initlock[CheckKind].reset(new BugType{
+        CheckNames[CheckKind], "Init invalid lock", "Lock checker"});
+    BT_lor[CheckKind].reset(new BugType{CheckNames[CheckKind],
                                         "Lock order reversal", "Lock checker"});
   }
 };
@@ -341,53 +344,53 @@ void PthreadLockChecker::printState(raw_ostream &Out, ProgramStateRef State,
 
 void PthreadLockChecker::AcquirePthreadLock(const CallEvent &Call,
                                             CheckerContext &C,
-                                            CheckerKind checkKind) const {
-  AcquireLockAux(Call, C, 0, Call.getArgSVal(0), false, PthreadSemantics,
-                 checkKind);
+                                            CheckerKind CheckKind) const {
+  AcquireLockAux(Call, C, Call.getArgExpr(0), Call.getArgSVal(0), false,
+                 PthreadSemantics, CheckKind);
 }
 
 void PthreadLockChecker::AcquireXNULock(const CallEvent &Call,
                                         CheckerContext &C,
-                                        CheckerKind checkKind) const {
-  AcquireLockAux(Call, C, 0, Call.getArgSVal(0), false, XNUSemantics,
-                 checkKind);
+                                        CheckerKind CheckKind) const {
+  AcquireLockAux(Call, C, Call.getArgExpr(0), Call.getArgSVal(0), false,
+                 XNUSemantics, CheckKind);
 }
 
 void PthreadLockChecker::TryPthreadLock(const CallEvent &Call,
                                         CheckerContext &C,
-                                        CheckerKind checkKind) const {
-  AcquireLockAux(Call, C, 0, Call.getArgSVal(0), true, PthreadSemantics,
-                 checkKind);
+                                        CheckerKind CheckKind) const {
+  AcquireLockAux(Call, C, Call.getArgExpr(0), Call.getArgSVal(0), true,
+                 PthreadSemantics, CheckKind);
 }
 
 void PthreadLockChecker::TryXNULock(const CallEvent &Call, CheckerContext &C,
-                                    CheckerKind checkKind) const {
-  AcquireLockAux(Call, C, 0, Call.getArgSVal(0), true, PthreadSemantics,
-                 checkKind);
+                                    CheckerKind CheckKind) const {
+  AcquireLockAux(Call, C, Call.getArgExpr(0), Call.getArgSVal(0), true,
+                 PthreadSemantics, CheckKind);
 }
 
 void PthreadLockChecker::TryFuchsiaLock(const CallEvent &Call,
                                         CheckerContext &C,
-                                        CheckerKind checkKind) const {
-  AcquireLockAux(Call, C, 0, Call.getArgSVal(0), true, PthreadSemantics,
-                 checkKind);
+                                        CheckerKind CheckKind) const {
+  AcquireLockAux(Call, C, Call.getArgExpr(0), Call.getArgSVal(0), true,
+                 PthreadSemantics, CheckKind);
 }
 
 void PthreadLockChecker::TryC11Lock(const CallEvent &Call, CheckerContext &C,
-                                    CheckerKind checkKind) const {
-  AcquireLockAux(Call, C, 0, Call.getArgSVal(0), true, PthreadSemantics,
-                 checkKind);
+                                    CheckerKind CheckKind) const {
+  AcquireLockAux(Call, C, Call.getArgExpr(0), Call.getArgSVal(0), true,
+                 PthreadSemantics, CheckKind);
 }
 
 void PthreadLockChecker::AcquireLockAux(const CallEvent &Call,
-                                        CheckerContext &C, unsigned ArgNo,
-                                        SVal lock, bool isTryLock,
-                                        enum LockingSemantics semantics,
-                                        CheckerKind checkKind) const {
-  if (!ChecksEnabled[checkKind])
+                                        CheckerContext &C, const Expr *MtxExpr,
+                                        SVal MtxVal, bool IsTryLock,
+                                        enum LockingSemantics Semantics,
+                                        CheckerKind CheckKind) const {
+  if (!ChecksEnabled[CheckKind])
     return;
 
-  const MemRegion *lockR = lock.getAsRegion();
+  const MemRegion *lockR = MtxVal.getAsRegion();
   if (!lockR)
     return;
 
@@ -398,28 +401,23 @@ void PthreadLockChecker::AcquireLockAux(const CallEvent &Call,
 
   if (const LockState *LState = state->get<LockMap>(lockR)) {
     if (LState->isLocked()) {
-      ExplodedNode *N = C.generateErrorNode();
-      if (!N)
-        return;
-      initBugType(checkKind);
-      auto report = std::make_unique<PathSensitiveBugReport>(
-          *BT_doublelock[checkKind], "This lock has already been acquired", N);
-      report->addRange(Call.getArgExpr(ArgNo)->getSourceRange());
-      C.emitReport(std::move(report));
+      reportBug(C, BT_doublelock, MtxExpr, CheckKind,
+                "This lock has already been acquired");
       return;
     } else if (LState->isDestroyed()) {
-      reportUseDestroyedBug(Call, C, ArgNo, checkKind);
+      reportBug(C, BT_destroylock, MtxExpr, CheckKind,
+                "This lock has already been destroyed");
       return;
     }
   }
 
   ProgramStateRef lockSucc = state;
-  if (isTryLock) {
+  if (IsTryLock) {
     // Bifurcate the state, and allow a mode where the lock acquisition fails.
     SVal RetVal = Call.getReturnValue();
     if (auto DefinedRetVal = RetVal.getAs<DefinedSVal>()) {
       ProgramStateRef lockFail;
-      switch (semantics) {
+      switch (Semantics) {
       case PthreadSemantics:
         std::tie(lockFail, lockSucc) = state->assume(*DefinedRetVal);
         break;
@@ -434,7 +432,7 @@ void PthreadLockChecker::AcquireLockAux(const CallEvent &Call,
     }
     // We might want to handle the case when the mutex lock function was inlined
     // and returned an Unknown or Undefined value.
-  } else if (semantics == PthreadSemantics) {
+  } else if (Semantics == PthreadSemantics) {
     // Assume that the return value was 0.
     SVal RetVal = Call.getReturnValue();
     if (auto DefinedRetVal = RetVal.getAs<DefinedSVal>()) {
@@ -447,7 +445,7 @@ void PthreadLockChecker::AcquireLockAux(const CallEvent &Call,
     // and returned an Unknown or Undefined value.
   } else {
     // XNU locking semantics return void on non-try locks
-    assert((semantics == XNUSemantics) && "Unknown locking semantics");
+    assert((Semantics == XNUSemantics) && "Unknown locking semantics");
     lockSucc = state;
   }
 
@@ -459,18 +457,18 @@ void PthreadLockChecker::AcquireLockAux(const CallEvent &Call,
 
 void PthreadLockChecker::ReleaseAnyLock(const CallEvent &Call,
                                         CheckerContext &C,
-                                        CheckerKind checkKind) const {
-  ReleaseLockAux(Call, C, 0, Call.getArgSVal(0), checkKind);
+                                        CheckerKind CheckKind) const {
+  ReleaseLockAux(Call, C, Call.getArgExpr(0), Call.getArgSVal(0), CheckKind);
 }
 
 void PthreadLockChecker::ReleaseLockAux(const CallEvent &Call,
-                                        CheckerContext &C, unsigned ArgNo,
-                                        SVal lock,
-                                        CheckerKind checkKind) const {
-  if (!ChecksEnabled[checkKind])
+                                        CheckerContext &C, const Expr *MtxExpr,
+                                        SVal MtxVal,
+                                        CheckerKind CheckKind) const {
+  if (!ChecksEnabled[CheckKind])
     return;
 
-  const MemRegion *lockR = lock.getAsRegion();
+  const MemRegion *lockR = MtxVal.getAsRegion();
   if (!lockR)
     return;
 
@@ -481,18 +479,12 @@ void PthreadLockChecker::ReleaseLockAux(const CallEvent &Call,
 
   if (const LockState *LState = state->get<LockMap>(lockR)) {
     if (LState->isUnlocked()) {
-      ExplodedNode *N = C.generateErrorNode();
-      if (!N)
-        return;
-      initBugType(checkKind);
-      auto Report = std::make_unique<PathSensitiveBugReport>(
-          *BT_doubleunlock[checkKind], "This lock has already been unlocked",
-          N);
-      Report->addRange(Call.getArgExpr(ArgNo)->getSourceRange());
-      C.emitReport(std::move(Report));
+      reportBug(C, BT_doubleunlock, MtxExpr, CheckKind,
+                "This lock has already been unlocked");
       return;
     } else if (LState->isDestroyed()) {
-      reportUseDestroyedBug(Call, C, ArgNo, checkKind);
+      reportBug(C, BT_destroylock, MtxExpr, CheckKind,
+                "This lock has already been destroyed");
       return;
     }
   }
@@ -502,17 +494,9 @@ void PthreadLockChecker::ReleaseLockAux(const CallEvent &Call,
   if (!LS.isEmpty()) {
     const MemRegion *firstLockR = LS.getHead();
     if (firstLockR != lockR) {
-      ExplodedNode *N = C.generateErrorNode();
-      if (!N)
-        return;
-      initBugType(checkKind);
-      auto report = std::make_unique<PathSensitiveBugReport>(
-          *BT_lor[checkKind],
-          "This was not the most recently acquired lock. Possible "
-          "lock order reversal",
-          N);
-      report->addRange(Call.getArgExpr(ArgNo)->getSourceRange());
-      C.emitReport(std::move(report));
+      reportBug(C, BT_lor, MtxExpr, CheckKind,
+                "This was not the most recently acquired lock. Possible lock "
+                "order reversal");
       return;
     }
     // Record that the lock was released.
@@ -525,25 +509,27 @@ void PthreadLockChecker::ReleaseLockAux(const CallEvent &Call,
 
 void PthreadLockChecker::DestroyPthreadLock(const CallEvent &Call,
                                             CheckerContext &C,
-                                            CheckerKind checkKind) const {
-  DestroyLockAux(Call, C, 0, Call.getArgSVal(0), PthreadSemantics, checkKind);
+                                            CheckerKind CheckKind) const {
+  DestroyLockAux(Call, C, Call.getArgExpr(0), Call.getArgSVal(0),
+                 PthreadSemantics, CheckKind);
 }
 
 void PthreadLockChecker::DestroyXNULock(const CallEvent &Call,
                                         CheckerContext &C,
-                                        CheckerKind checkKind) const {
-  DestroyLockAux(Call, C, 0, Call.getArgSVal(0), XNUSemantics, checkKind);
+                                        CheckerKind CheckKind) const {
+  DestroyLockAux(Call, C, Call.getArgExpr(0), Call.getArgSVal(0), XNUSemantics,
+                 CheckKind);
 }
 
 void PthreadLockChecker::DestroyLockAux(const CallEvent &Call,
-                                        CheckerContext &C, unsigned ArgNo,
-                                        SVal Lock,
-                                        enum LockingSemantics semantics,
-                                        CheckerKind checkKind) const {
-  if (!ChecksEnabled[checkKind])
+                                        CheckerContext &C, const Expr *MtxExpr,
+                                        SVal MtxVal,
+                                        enum LockingSemantics Semantics,
+                                        CheckerKind CheckKind) const {
+  if (!ChecksEnabled[CheckKind])
     return;
 
-  const MemRegion *LockR = Lock.getAsRegion();
+  const MemRegion *LockR = MtxVal.getAsRegion();
   if (!LockR)
     return;
 
@@ -556,7 +542,7 @@ void PthreadLockChecker::DestroyLockAux(const CallEvent &Call,
   const LockState *LState = State->get<LockMap>(LockR);
   // Checking the return value of the destroy method only in the case of
   // PthreadSemantics
-  if (semantics == PthreadSemantics) {
+  if (Semantics == PthreadSemantics) {
     if (!LState || LState->isUnlocked()) {
       SymbolRef sym = Call.getReturnValue().getAsSymbol();
       if (!sym) {
@@ -581,36 +567,26 @@ void PthreadLockChecker::DestroyLockAux(const CallEvent &Call,
       return;
     }
   }
-  StringRef Message;
 
-  if (LState->isLocked()) {
-    Message = "This lock is still locked";
-  } else {
-    Message = "This lock has already been destroyed";
-  }
+  StringRef Message = LState->isLocked()
+                          ? "This lock is still locked"
+                          : "This lock has already been destroyed";
 
-  ExplodedNode *N = C.generateErrorNode();
-  if (!N)
-    return;
-  initBugType(checkKind);
-  auto Report = std::make_unique<PathSensitiveBugReport>(
-      *BT_destroylock[checkKind], Message, N);
-  Report->addRange(Call.getArgExpr(ArgNo)->getSourceRange());
-  C.emitReport(std::move(Report));
+  reportBug(C, BT_destroylock, MtxExpr, CheckKind, Message);
 }
 
 void PthreadLockChecker::InitAnyLock(const CallEvent &Call, CheckerContext &C,
-                                     CheckerKind checkKind) const {
-  InitLockAux(Call, C, 0, Call.getArgSVal(0), checkKind);
+                                     CheckerKind CheckKind) const {
+  InitLockAux(Call, C, Call.getArgExpr(0), Call.getArgSVal(0), CheckKind);
 }
 
 void PthreadLockChecker::InitLockAux(const CallEvent &Call, CheckerContext &C,
-                                     unsigned ArgNo, SVal Lock,
-                                     CheckerKind checkKind) const {
-  if (!ChecksEnabled[checkKind])
+                                     const Expr *MtxExpr, SVal MtxVal,
+                                     CheckerKind CheckKind) const {
+  if (!ChecksEnabled[CheckKind])
     return;
 
-  const MemRegion *LockR = Lock.getAsRegion();
+  const MemRegion *LockR = MtxVal.getAsRegion();
   if (!LockR)
     return;
 
@@ -627,35 +603,24 @@ void PthreadLockChecker::InitLockAux(const CallEvent &Call, CheckerContext &C,
     return;
   }
 
-  StringRef Message;
-
-  if (LState->isLocked()) {
-    Message = "This lock is still being held";
-  } else {
-    Message = "This lock has already been initialized";
-  }
+  StringRef Message = LState->isLocked()
+                          ? "This lock is still being held"
+                          : "This lock has already been initialized";
 
-  ExplodedNode *N = C.generateErrorNode();
-  if (!N)
-    return;
-  initBugType(checkKind);
-  auto Report = std::make_unique<PathSensitiveBugReport>(
-      *BT_initlock[checkKind], Message, N);
-  Report->addRange(Call.getArgExpr(ArgNo)->getSourceRange());
-  C.emitReport(std::move(Report));
+  reportBug(C, BT_initlock, MtxExpr, CheckKind, Message);
 }
 
-void PthreadLockChecker::reportUseDestroyedBug(const CallEvent &Call,
-                                               CheckerContext &C,
-                                               unsigned ArgNo,
-                                               CheckerKind checkKind) const {
+void PthreadLockChecker::reportBug(CheckerContext &C,
+                                   std::unique_ptr BT[],
+                                   const Expr *MtxExpr, CheckerKind CheckKind,
+                                   StringRef Desc) const {
   ExplodedNode *N = C.generateErrorNode();
   if (!N)
     return;
-  initBugType(checkKind);
-  auto Report = std::make_unique<PathSensitiveBugReport>(
-      *BT_destroylock[checkKind], "This lock has already been destroyed", N);
-  Report->addRange(Call.getArgExpr(ArgNo)->getSourceRange());
+  initBugType(CheckKind);
+  auto Report =
+      std::make_unique<PathSensitiveBugReport>(*BT[CheckKind], Desc, N);
+  Report->addRange(MtxExpr->getSourceRange());
   C.emitReport(std::move(Report));
 }
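With reporting funneled through reportBug and keyed off the mutex expression rather than an argument index, every report now highlights the mutex argument's own source range. An illustrative input for the double-lock path (not from the patch's tests):

  #include <pthread.h>

  pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;

  void lock_twice(void) {
    pthread_mutex_lock(&mtx);
    pthread_mutex_lock(&mtx); // report: "This lock has already been acquired",
                              // with the range covering this '&mtx' argument
  }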
 
diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
index b71c19a80da90..45711cad56337 100644
--- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
@@ -126,6 +126,8 @@ class StdLibraryFunctionsChecker
     }
     ArgNo getArgNo() const { return ArgN; }
 
+    virtual StringRef getName() const = 0;
+
   protected:
     ArgNo ArgN; // Argument to which we apply the constraint.
 
@@ -152,6 +154,7 @@ class StdLibraryFunctionsChecker
     IntRangeVector Ranges;
 
   public:
+    StringRef getName() const override { return "Range"; }
     RangeConstraint(ArgNo ArgN, RangeKind Kind, const IntRangeVector &Ranges)
         : ValueConstraint(ArgN), Kind(Kind), Ranges(Ranges) {}
 
@@ -205,6 +208,7 @@ class StdLibraryFunctionsChecker
     ArgNo OtherArgN;
 
   public:
+    StringRef getName() const override { return "Comparison"; }
     ComparisonConstraint(ArgNo ArgN, BinaryOperator::Opcode Opcode,
                          ArgNo OtherArgN)
         : ValueConstraint(ArgN), Opcode(Opcode), OtherArgN(OtherArgN) {}
@@ -221,6 +225,7 @@ class StdLibraryFunctionsChecker
     bool CannotBeNull = true;
 
   public:
+    StringRef getName() const override { return "NonNull"; }
     ProgramStateRef apply(ProgramStateRef State, const CallEvent &Call,
                           const Summary &Summary,
                           CheckerContext &C) const override {
@@ -272,6 +277,7 @@ class StdLibraryFunctionsChecker
     BinaryOperator::Opcode Op = BO_LE;
 
   public:
+    StringRef getName() const override { return "BufferSize"; }
     BufferSizeConstraint(ArgNo Buffer, llvm::APSInt BufMinSize)
         : ValueConstraint(Buffer), ConcreteSize(BufMinSize) {}
     BufferSizeConstraint(ArgNo Buffer, ArgNo BufSize)
@@ -466,6 +472,8 @@ class StdLibraryFunctionsChecker
       return *this;
     }
     Summary &ArgConstraint(ValueConstraintPtr VC) {
+      assert(VC->getArgNo() != Ret &&
+             "Arg constraint should not refer to the return value");
       ArgConstraints.push_back(VC);
       return *this;
     }
@@ -549,17 +557,24 @@ class StdLibraryFunctionsChecker
   void initFunctionSummaries(CheckerContext &C) const;
 
   void reportBug(const CallEvent &Call, ExplodedNode *N,
-                 CheckerContext &C) const {
+                 const ValueConstraint *VC, CheckerContext &C) const {
     if (!ChecksEnabled[CK_StdCLibraryFunctionArgsChecker])
       return;
-    // TODO Add detailed diagnostic.
-    StringRef Msg = "Function argument constraint is not satisfied";
+    // TODO Add more detailed diagnostic.
+    std::string Msg =
+        (Twine("Function argument constraint is not satisfied, constraint: ") +
+         VC->getName().data() + ", ArgN: " + Twine(VC->getArgNo()))
+            .str();
     if (!BT_InvalidArg)
       BT_InvalidArg = std::make_unique<BugType>(
           CheckNames[CK_StdCLibraryFunctionArgsChecker],
           "Unsatisfied argument constraints", categories::LogicError);
    auto R = std::make_unique<PathSensitiveBugReport>(*BT_InvalidArg, Msg, N);
-    bugreporter::trackExpressionValue(N, Call.getArgExpr(0), *R);
+    bugreporter::trackExpressionValue(N, Call.getArgExpr(VC->getArgNo()), *R);
+
+    // Highlight the range of the argument that was violated.
+    R->addRange(Call.getArgSourceRange(VC->getArgNo()));
+
     C.emitReport(std::move(R));
   }
 };
@@ -696,7 +711,7 @@ void StdLibraryFunctionsChecker::checkPreCall(const CallEvent &Call,
     // The argument constraint is not satisfied.
     if (FailureSt && !SuccessSt) {
       if (ExplodedNode *N = C.generateErrorNode(NewState))
-        reportBug(Call, N, C);
+        reportBug(Call, N, Constraint.get(), C);
       break;
     } else {
       // We will apply the constraint even if we cannot reason about the
@@ -1075,35 +1090,12 @@ void StdLibraryFunctionsChecker::initFunctionSummaries(
   Optional<QualType> FilePtrRestrictTy = getRestrictTy(FilePtrTy);
 
   // Templates for summaries that are reused by many functions.
-  auto Getc = [&]() {
-    return Summary(ArgTypes{FilePtrTy}, RetType{IntTy}, NoEvalCall)
-        .Case({ReturnValueCondition(WithinRange,
-                                    {{EOFv, EOFv}, {0, UCharRangeMax}})});
-  };
   auto Read = [&](RetType R, RangeInt Max) {
     return Summary(ArgTypes{Irrelevant, Irrelevant, SizeTy}, RetType{R},
                    NoEvalCall)
         .Case({ReturnValueCondition(LessThanOrEq, ArgNo(2)),
                ReturnValueCondition(WithinRange, Range(-1, Max))});
   };
-  auto Fread = [&]() {
-    return Summary(
-               ArgTypes{VoidPtrRestrictTy, SizeTy, SizeTy, FilePtrRestrictTy},
-               RetType{SizeTy}, NoEvalCall)
-        .Case({
-            ReturnValueCondition(LessThanOrEq, ArgNo(2)),
-        })
-        .ArgConstraint(NotNull(ArgNo(0)));
-  };
-  auto Fwrite = [&]() {
-    return Summary(ArgTypes{ConstVoidPtrRestrictTy, SizeTy, SizeTy,
-                            FilePtrRestrictTy},
-                   RetType{SizeTy}, NoEvalCall)
-        .Case({
-            ReturnValueCondition(LessThanOrEq, ArgNo(2)),
-        })
-        .ArgConstraint(NotNull(ArgNo(0)));
-  };
   auto Getline = [&](RetType R, RangeInt Max) {
     return Summary(ArgTypes{Irrelevant, Irrelevant, Irrelevant}, RetType{R},
                    NoEvalCall)
@@ -1268,19 +1260,45 @@ void StdLibraryFunctionsChecker::initFunctionSummaries(
                          0U, WithinRange, {{EOFv, EOFv}, {0, UCharRangeMax}})));
 
   // The getc() family of functions that return either a char or EOF.
-    addToFunctionSummaryMap("getc", Getc());
-    addToFunctionSummaryMap("fgetc", Getc());
+  addToFunctionSummaryMap(
+      {"getc", "fgetc"}, Signature(ArgTypes{FilePtrTy}, RetType{IntTy}),
+      Summary(NoEvalCall)
+          .Case({ReturnValueCondition(WithinRange,
+                                      {{EOFv, EOFv}, {0, UCharRangeMax}})}));
   addToFunctionSummaryMap(
       "getchar", Summary(ArgTypes{}, RetType{IntTy}, NoEvalCall)
                      .Case({ReturnValueCondition(
                          WithinRange, {{EOFv, EOFv}, {0, UCharRangeMax}})}));
 
   // read()-like functions that never return more than buffer size.
-    addToFunctionSummaryMap("fread", Fread());
-    addToFunctionSummaryMap("fwrite", Fwrite());
+  auto FreadSummary =
+      Summary(NoEvalCall)
+          .Case({
+              ReturnValueCondition(LessThanOrEq, ArgNo(2)),
+          })
+          .ArgConstraint(NotNull(ArgNo(0)))
+          .ArgConstraint(NotNull(ArgNo(3)))
+          .ArgConstraint(BufferSize(/*Buffer=*/ArgNo(0), /*BufSize=*/ArgNo(1),
+                                    /*BufSizeMultiplier=*/ArgNo(2)));
+
+  // size_t fread(void *restrict ptr, size_t size, size_t nitems,
+  //              FILE *restrict stream);
+  addToFunctionSummaryMap(
+      "fread",
+      Signature(ArgTypes{VoidPtrRestrictTy, SizeTy, SizeTy, FilePtrRestrictTy},
+                RetType{SizeTy}),
+      FreadSummary);
+  // size_t fwrite(const void *restrict ptr, size_t size, size_t nitems,
+  //               FILE *restrict stream);
+  addToFunctionSummaryMap("fwrite",
+                          Signature(ArgTypes{ConstVoidPtrRestrictTy, SizeTy,
+                                             SizeTy, FilePtrRestrictTy},
+                                    RetType{SizeTy}),
+                          FreadSummary);
 
   // We are not sure how ssize_t is defined on every platform, so we
   // provide three variants that should cover common cases.
+  // FIXME Use lookupTy("ssize_t") instead of the `Read` lambda.
   // FIXME these are actually defined by POSIX and not by the C standard, we
   // should handle them together with the rest of the POSIX functions.
   addToFunctionSummaryMap("read", {Read(IntTy, IntMax), Read(LongTy, LongMax),
@@ -1289,11 +1307,13 @@ void StdLibraryFunctionsChecker::initFunctionSummaries(
                                     Read(LongLongTy, LongLongMax)});
 
   // getline()-like functions either fail or read at least the delimiter.
+  // FIXME Use lookupTy("ssize_t") instead of the `Getline` lambda.
   // FIXME these are actually defined by POSIX and not by the C standard, we
   // should handle them together with the rest of the POSIX functions.
   addToFunctionSummaryMap("getline",
                           {Getline(IntTy, IntMax), Getline(LongTy, LongMax),
                            Getline(LongLongTy, LongLongMax)});
+  // FIXME getdelim's signature is different from getline's!
   addToFunctionSummaryMap("getdelim",
                           {Getline(IntTy, IntMax), Getline(LongTy, LongMax),
                            Getline(LongLongTy, LongLongMax)});
@@ -1676,22 +1696,6 @@ void StdLibraryFunctionsChecker::initFunctionSummaries(
                                               RetType{IntTy}, NoEvalCall)
                                           .ArgConstraint(NotNull(ArgNo(0))));
 
-    // int strcasecmp(const char *s1, const char *s2);
-    addToFunctionSummaryMap("strcasecmp",
-                            Summary(ArgTypes{ConstCharPtrTy, ConstCharPtrTy},
-                                    RetType{IntTy}, EvalCallAsPure)
-                                .ArgConstraint(NotNull(ArgNo(0)))
-                                .ArgConstraint(NotNull(ArgNo(1))));
-
-    // int strncasecmp(const char *s1, const char *s2, size_t n);
-    addToFunctionSummaryMap(
-        "strncasecmp", Summary(ArgTypes{ConstCharPtrTy, ConstCharPtrTy, SizeTy},
-                               RetType{IntTy}, EvalCallAsPure)
-                           .ArgConstraint(NotNull(ArgNo(0)))
-                           .ArgConstraint(NotNull(ArgNo(1)))
-                           .ArgConstraint(ArgumentCondition(
-                               2, WithinRange, Range(0, SizeMax))));
-
     // int fileno(FILE *stream);
     addToFunctionSummaryMap(
         "fileno", Summary(ArgTypes{FilePtrTy}, RetType{IntTy}, NoEvalCall)
diff --git a/clang/lib/StaticAnalyzer/Checkers/UndefBranchChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/UndefBranchChecker.cpp
index 3e0caaf79ca09..ebe5ad53cc303 100644
--- a/clang/lib/StaticAnalyzer/Checkers/UndefBranchChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/UndefBranchChecker.cpp
@@ -11,6 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang/AST/StmtObjC.h"
+#include "clang/AST/Type.h"
 #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
 #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
 #include "clang/StaticAnalyzer/Core/Checker.h"
@@ -54,10 +56,13 @@ class UndefBranchChecker : public Checker<check::BranchCondition> {
   void checkBranchCondition(const Stmt *Condition, CheckerContext &Ctx) const;
 };
 
-}
+} // namespace
 
 void UndefBranchChecker::checkBranchCondition(const Stmt *Condition,
                                               CheckerContext &Ctx) const {
+  // ObjCForCollection is a loop, but has no actual condition.
+  if (isa<ObjCForCollectionStmt>(Condition))
+    return;
   SVal X = Ctx.getSVal(Condition);
   if (X.isUndef()) {
     // Generate a sink node, which implicitly marks both outgoing branches as
diff --git a/clang/lib/StaticAnalyzer/Core/Environment.cpp b/clang/lib/StaticAnalyzer/Core/Environment.cpp
index 1ccf4c6104a65..ee74745925283 100644
--- a/clang/lib/StaticAnalyzer/Core/Environment.cpp
+++ b/clang/lib/StaticAnalyzer/Core/Environment.cpp
@@ -15,6 +15,7 @@
 #include "clang/AST/ExprCXX.h"
 #include "clang/AST/PrettyPrinter.h"
 #include "clang/AST/Stmt.h"
+#include "clang/AST/StmtObjC.h"
 #include "clang/Analysis/AnalysisDeclContext.h"
 #include "clang/Basic/LLVM.h"
 #include "clang/Basic/LangOptions.h"
@@ -85,6 +86,12 @@ SVal Environment::lookupExpr(const EnvironmentEntry &E) const {
 SVal Environment::getSVal(const EnvironmentEntry &Entry,
                           SValBuilder& svalBuilder) const {
   const Stmt *S = Entry.getStmt();
+  assert(!isa<ObjCForCollectionStmt>(S) &&
+         "Use ExprEngine::hasMoreIteration()!");
+  assert((isa<Expr>(S) || isa<ReturnStmt>(S)) &&
+         "Environment can only argue about Exprs, since only they express "
+         "a value! Any non-expression statement stored in Environment is a "
+         "result of a hack!");
   const LocationContext *LCtx = Entry.getLocationContext();
 
   switch (S->getStmtClass()) {
@@ -109,6 +116,7 @@ SVal Environment::getSVal(const EnvironmentEntry &Entry,
   case Stmt::StringLiteralClass:
   case Stmt::TypeTraitExprClass:
   case Stmt::SizeOfPackExprClass:
+  case Stmt::PredefinedExprClass:
     // Known constants; defer to SValBuilder.
     return svalBuilder.getConstantVal(cast<Expr>(S)).getValue();
 
@@ -183,12 +191,15 @@ EnvironmentManager::removeDeadBindings(Environment Env,
              F.getTreeFactory());
 
   // Iterate over the block-expr bindings.
-  for (Environment::iterator I = Env.begin(), E = Env.end();
-       I != E; ++I) {
+  for (Environment::iterator I = Env.begin(), End = Env.end(); I != End; ++I) {
     const EnvironmentEntry &BlkExpr = I.getKey();
     const SVal &X = I.getData();
 
-    if (SymReaper.isLive(BlkExpr.getStmt(), BlkExpr.getLocationContext())) {
+    const Expr *E = dyn_cast<Expr>(BlkExpr.getStmt());
+    if (!E)
+      continue;
+
+    if (SymReaper.isLive(E, BlkExpr.getLocationContext())) {
       // Copy the binding to the new map.
       EBMapRef = EBMapRef.add(BlkExpr, X);
 
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index a4b11b5e8a961..409741cdb6e41 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -2129,6 +2129,83 @@ static const Stmt *ResolveCondition(const Stmt *Condition,
   llvm_unreachable("could not resolve condition");
 }
 
+using ObjCForLctxPair =
+    std::pair<const ObjCForCollectionStmt *, const LocationContext *>;
+
+REGISTER_MAP_WITH_PROGRAMSTATE(ObjCForHasMoreIterations, ObjCForLctxPair, bool)
+
+ProgramStateRef ExprEngine::setWhetherHasMoreIteration(
+    ProgramStateRef State, const ObjCForCollectionStmt *O,
+    const LocationContext *LC, bool HasMoreIteration) {
+  assert(!State->contains<ObjCForHasMoreIterations>({O, LC}));
+  return State->set<ObjCForHasMoreIterations>({O, LC}, HasMoreIteration);
+}
+
+ProgramStateRef
+ExprEngine::removeIterationState(ProgramStateRef State,
+                                 const ObjCForCollectionStmt *O,
+                                 const LocationContext *LC) {
+  assert(State->contains<ObjCForHasMoreIterations>({O, LC}));
+  return State->remove<ObjCForHasMoreIterations>({O, LC});
+}
+
+bool ExprEngine::hasMoreIteration(ProgramStateRef State,
+                                  const ObjCForCollectionStmt *O,
+                                  const LocationContext *LC) {
+  assert(State->contains<ObjCForHasMoreIterations>({O, LC}));
+  return *State->get<ObjCForHasMoreIterations>({O, LC});
+}
+
+/// Split the state on whether there are any more iterations left for this loop.
+/// Returns a (HasMoreIteration, HasNoMoreIteration) pair, or None when the
+/// acquisition of the loop condition value failed.
+static Optional<std::pair<ProgramStateRef, ProgramStateRef>>
+assumeCondition(const Stmt *Condition, ExplodedNode *N) {
+  ProgramStateRef State = N->getState();
+  if (const auto *ObjCFor = dyn_cast<ObjCForCollectionStmt>(Condition)) {
+    bool HasMoreIteration =
+        ExprEngine::hasMoreIteration(State, ObjCFor, N->getLocationContext());
+    // Checkers have already run on branch conditions, so the current
+    // information as to whether the loop has more iterations becomes outdated
+    // after this point.
+    State = ExprEngine::removeIterationState(State, ObjCFor,
+                                             N->getLocationContext());
+    if (HasMoreIteration)
+      return std::pair{State, nullptr};
+    else
+      return std::pair{nullptr, State};
+  }
+  SVal X = State->getSVal(Condition, N->getLocationContext());
+
+  if (X.isUnknownOrUndef()) {
+    // Give it a chance to recover from unknown.
+    if (const auto *Ex = dyn_cast<Expr>(Condition)) {
+      if (Ex->getType()->isIntegralOrEnumerationType()) {
+        // Try to recover some path-sensitivity.  Right now casts of symbolic
+        // integers that promote their values are currently not tracked well.
+        // If 'Condition' is such an expression, try and recover the
+        // underlying value and use that instead.
+        SVal recovered =
+            RecoverCastedSymbol(State, Condition, N->getLocationContext(),
+                                N->getState()->getStateManager().getContext());
+
+        if (!recovered.isUnknown()) {
+          X = recovered;
+        }
+      }
+    }
+  }
+
+  // If the condition is still unknown, give up.
+  if (X.isUnknownOrUndef())
+    return None;
+
+  DefinedSVal V = X.castAs<DefinedSVal>();
+
+  ProgramStateRef StTrue, StFalse;
+  return State->assume(V);
+}
+
 void ExprEngine::processBranch(const Stmt *Condition,
                                NodeBuilderContext& BldCtx,
                                ExplodedNode *Pred,
@@ -2165,48 +2242,28 @@ void ExprEngine::processBranch(const Stmt *Condition,
     return;
 
   BranchNodeBuilder builder(CheckersOutSet, Dst, BldCtx, DstT, DstF);
-  for (const auto PredI : CheckersOutSet) {
-    if (PredI->isSink())
+  for (ExplodedNode *PredN : CheckersOutSet) {
+    if (PredN->isSink())
       continue;
 
-    ProgramStateRef PrevState = PredI->getState();
-    SVal X = PrevState->getSVal(Condition, PredI->getLocationContext());
-
-    if (X.isUnknownOrUndef()) {
-      // Give it a chance to recover from unknown.
-      if (const auto *Ex = dyn_cast<Expr>(Condition)) {
-        if (Ex->getType()->isIntegralOrEnumerationType()) {
-          // Try to recover some path-sensitivity.  Right now casts of symbolic
-          // integers that promote their values are currently not tracked well.
-          // If 'Condition' is such an expression, try and recover the
-          // underlying value and use that instead.
-          SVal recovered = RecoverCastedSymbol(PrevState, Condition,
-                                               PredI->getLocationContext(),
-                                               getContext());
-
-          if (!recovered.isUnknown()) {
-            X = recovered;
-          }
-        }
-      }
-    }
+    ProgramStateRef PrevState = PredN->getState();
 
-    // If the condition is still unknown, give up.
-    if (X.isUnknownOrUndef()) {
-      builder.generateNode(PrevState, true, PredI);
-      builder.generateNode(PrevState, false, PredI);
+    ProgramStateRef StTrue, StFalse;
+    if (const auto KnownCondValueAssumption = assumeCondition(Condition, PredN))
+      std::tie(StTrue, StFalse) = *KnownCondValueAssumption;
+    else {
+      assert(!isa<ObjCForCollectionStmt>(Condition));
+      builder.generateNode(PrevState, true, PredN);
+      builder.generateNode(PrevState, false, PredN);
       continue;
     }
-
-    DefinedSVal V = X.castAs<DefinedSVal>();
-
-    ProgramStateRef StTrue, StFalse;
-    std::tie(StTrue, StFalse) = PrevState->assume(V);
+    if (StTrue && StFalse)
+      assert(!isa<ObjCForCollectionStmt>(Condition));
 
     // Process the true branch.
     if (builder.isFeasible(true)) {
       if (StTrue)
-        builder.generateNode(StTrue, true, PredI);
+        builder.generateNode(StTrue, true, PredN);
       else
         builder.markInfeasible(true);
     }
@@ -2214,7 +2271,7 @@ void ExprEngine::processBranch(const Stmt *Condition,
     // Process the false branch.
     if (builder.isFeasible(false)) {
       if (StFalse)
-        builder.generateNode(StFalse, false, PredI);
+        builder.generateNode(StFalse, false, PredN);
       else
         builder.markInfeasible(false);
     }
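
For readers unfamiliar with program-state traits: REGISTER_MAP_WITH_PROGRAMSTATE(ObjCForHasMoreIterations, ...) above generates an immutable map trait keyed by (statement, location-context) pairs. A condensed, non-compilable lifecycle fragment using the three helpers defined in this hunk (State, ObjCFor, and LCtx stand for values available at each call site):

```cpp
// 1. When the collection is advanced, the engine records whether another
//    iteration exists (see the ExprEngineObjC.cpp hunk below):
State = ExprEngine::setWhetherHasMoreIteration(State, ObjCFor, LCtx,
                                               /*HasMoreIteration=*/true);
// 2. processBranch() consumes that answer exactly once...
bool More = ExprEngine::hasMoreIteration(State, ObjCFor, LCtx);
// 3. ...and immediately erases it, so stale loop information cannot be
//    observed after checkers have run on the branch condition:
State = ExprEngine::removeIterationState(State, ObjCFor, LCtx);
```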
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineObjC.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineObjC.cpp
index eb9a0be2e5d6e..5a55e81497b03 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngineObjC.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngineObjC.cpp
@@ -53,10 +53,8 @@ static void populateObjCForDestinationSet(
     ProgramStateRef state = Pred->getState();
     const LocationContext *LCtx = Pred->getLocationContext();
 
-    SVal hasElementsV = svalBuilder.makeTruthVal(hasElements);
-
-    // FIXME: S is not an expression. We should not be binding values to it.
-    ProgramStateRef nextState = state->BindExpr(S, LCtx, hasElementsV);
+    ProgramStateRef nextState =
+        ExprEngine::setWhetherHasMoreIteration(state, S, LCtx, hasElements);
 
     if (auto MV = elementV.getAs<loc::MemRegionVal>())
       if (const auto *R = dyn_cast<TypedValueRegion>(MV->getRegion())) {
@@ -93,10 +91,9 @@ void ExprEngine::VisitObjCForCollectionStmt(const ObjCForCollectionStmt *S,
   //  (1) binds the next container value to 'element'.  This creates a new
   //      node in the ExplodedGraph.
   //
-  //  (2) binds the value 0/1 to the ObjCForCollectionStmt* itself, indicating
-  //      whether or not the container has any more elements.  This value
-  //      will be tested in ProcessBranch.  We need to explicitly bind
-  //      this value because a container can contain nil elements.
+  //  (2) note whether the collection has any more elements (or in other words,
+  //      whether the loop has more iterations). This will be tested in
+  //      processBranch.
   //
   // FIXME: Eventually this logic should actually do dispatches to
   //   'countByEnumeratingWithState:objects:count:' (NSFastEnumeration).
diff --git a/clang/lib/StaticAnalyzer/Core/PlistDiagnostics.cpp b/clang/lib/StaticAnalyzer/Core/PlistDiagnostics.cpp
index ed62778623a80..ce4addd2f9451 100644
--- a/clang/lib/StaticAnalyzer/Core/PlistDiagnostics.cpp
+++ b/clang/lib/StaticAnalyzer/Core/PlistDiagnostics.cpp
@@ -27,6 +27,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Support/Casting.h"
+#include <memory>
 
 using namespace clang;
 using namespace ento;
@@ -825,22 +826,36 @@ void PlistDiagnostics::FlushDiagnosticsImpl(
 
 namespace {
 
-using ExpArgTokens = llvm::SmallVector<Token, 2>;
+using ArgTokensTy = llvm::SmallVector<Token, 2>;
 
-/// Maps unexpanded macro arguments to expanded arguments. A macro argument may
+} // end of anonymous namespace
+
+LLVM_DUMP_METHOD static void dumpArgTokensToStream(llvm::raw_ostream &Out,
+                                                   const Preprocessor &PP,
+                                                   const ArgTokensTy &Toks);
+
+namespace {
+/// Maps unexpanded macro parameters to expanded arguments. A macro argument may
 /// need to be expanded further when it is nested inside another macro.
-class MacroArgMap : public std::map<const IdentifierInfo *, ExpArgTokens> {
+class MacroParamMap : public std::map<const IdentifierInfo *, ArgTokensTy> {
 public:
-  void expandFromPrevMacro(const MacroArgMap &Super);
+  void expandFromPrevMacro(const MacroParamMap &Super);
+
+  LLVM_DUMP_METHOD void dump(const Preprocessor &PP) const {
+    dumpToStream(llvm::errs(), PP);
+  }
+
+  LLVM_DUMP_METHOD void dumpToStream(llvm::raw_ostream &Out,
+                                     const Preprocessor &PP) const;
 };
 
-struct MacroNameAndArgs {
+struct MacroExpansionInfo {
   std::string Name;
   const MacroInfo *MI = nullptr;
-  MacroArgMap Args;
+  MacroParamMap ParamMap;
 
-  MacroNameAndArgs(std::string N, const MacroInfo *MI, MacroArgMap M)
-    : Name(std::move(N)), MI(MI), Args(std::move(M)) {}
+  MacroExpansionInfo(std::string N, const MacroInfo *MI, MacroParamMap M)
+      : Name(std::move(N)), MI(MI), ParamMap(std::move(M)) {}
 };
 
 class TokenPrinter {
@@ -860,6 +875,46 @@ class TokenPrinter {
   void printToken(const Token &Tok);
 };
 
+/// Wrapper around a Lexer object that can lex tokens one-by-one. It's possible
+/// to "inject" a range of tokens into the stream, in which case the next token
+/// is retrieved from the next element of the range, until the end of the range
+/// is reached.
+class TokenStream {
+public:
+  TokenStream(SourceLocation ExpanLoc, const SourceManager &SM,
+              const LangOptions &LangOpts)
+      : ExpanLoc(ExpanLoc) {
+    FileID File;
+    unsigned Offset;
+    std::tie(File, Offset) = SM.getDecomposedLoc(ExpanLoc);
+    const llvm::MemoryBuffer *MB = SM.getBuffer(File);
+    const char *MacroNameTokenPos = MB->getBufferStart() + Offset;
+
+    RawLexer = std::make_unique<Lexer>(SM.getLocForStartOfFile(File),
+                                       LangOpts, MB->getBufferStart(),
+                                       MacroNameTokenPos, MB->getBufferEnd());
+  }
+
+  void next(Token &Result) {
+    if (CurrTokenIt == TokenRange.end()) {
+      RawLexer->LexFromRawLexer(Result);
+      return;
+    }
+    Result = *CurrTokenIt;
+    CurrTokenIt++;
+  }
+
+  void injectRange(const ArgTokensTy &Range) {
+    TokenRange = Range;
+    CurrTokenIt = TokenRange.begin();
+  }
+
+  std::unique_ptr<Lexer> RawLexer;
+  ArgTokensTy TokenRange;
+  ArgTokensTy::iterator CurrTokenIt = TokenRange.begin();
+  SourceLocation ExpanLoc;
+};
+
 } // end of anonymous namespace
 
 /// The implementation method of getMacroExpansion: It prints the expansion of
@@ -878,7 +933,7 @@ class TokenPrinter {
 ///
 /// As we expand the last line, we'll immediately replace PRINT(str) with
 /// print(x). The information that both 'str' and 'x' refers to the same string
-/// is an information we have to forward, hence the argument \p PrevArgs.
+/// is information we have to forward, hence the argument \p PrevParamMap.
 ///
 /// To avoid infinite recursion we maintain the already processed tokens in
 /// a set. This is carried as a parameter through the recursive calls. The set
@@ -888,13 +943,11 @@ class TokenPrinter {
 /// #define f(y) x
 /// #define x f(x)
 static std::string getMacroNameAndPrintExpansion(
-    TokenPrinter &Printer,
-    SourceLocation MacroLoc,
-    const Preprocessor &PP,
-    const MacroArgMap &PrevArgs,
+    TokenPrinter &Printer, SourceLocation MacroLoc, const Preprocessor &PP,
+    const MacroParamMap &PrevParamMap,
     llvm::SmallPtrSet<IdentifierInfo *, 8> &AlreadyProcessedTokens);
 
-/// Retrieves the name of the macro and what it's arguments expand into
+/// Retrieves the name of the macro and what its parameters expand into
 /// at \p ExpanLoc.
 ///
 /// For example, for the following macro expansion:
@@ -916,8 +969,9 @@ static std::string getMacroNameAndPrintExpansion(
 /// When \p ExpanLoc references "SET_TO_NULL(a)" within the definition of
 /// "NOT_SUSPICOUS", the macro name "SET_TO_NULL" and the MacroArgMap map
 /// { (x, a) } will be returned.
-static MacroNameAndArgs getMacroNameAndArgs(SourceLocation ExpanLoc,
-                                            const Preprocessor &PP);
+static MacroExpansionInfo
+getMacroExpansionInfo(const MacroParamMap &PrevParamMap,
+                      SourceLocation ExpanLoc, const Preprocessor &PP);
 
 /// Retrieves the ')' token that matches '(' \p It points to.
 static MacroInfo::tokens_iterator getMatchingRParen(
@@ -951,21 +1005,20 @@ getExpandedMacro(SourceLocation MacroLoc, const Preprocessor &PP,
   llvm::SmallPtrSet<IdentifierInfo *, 8> AlreadyProcessedTokens;
 
   std::string MacroName = getMacroNameAndPrintExpansion(
-      Printer, MacroLoc, *PPToUse, MacroArgMap{}, AlreadyProcessedTokens);
+      Printer, MacroLoc, *PPToUse, MacroParamMap{}, AlreadyProcessedTokens);
   return {MacroName, std::string(OS.str())};
 }
 
 static std::string getMacroNameAndPrintExpansion(
-    TokenPrinter &Printer,
-    SourceLocation MacroLoc,
-    const Preprocessor &PP,
-    const MacroArgMap &PrevArgs,
+    TokenPrinter &Printer, SourceLocation MacroLoc, const Preprocessor &PP,
+    const MacroParamMap &PrevParamMap,
     llvm::SmallPtrSet<IdentifierInfo *, 8> &AlreadyProcessedTokens) {
 
   const SourceManager &SM = PP.getSourceManager();
 
-  MacroNameAndArgs Info = getMacroNameAndArgs(SM.getExpansionLoc(MacroLoc), PP);
-  IdentifierInfo* IDInfo = PP.getIdentifierInfo(Info.Name);
+  MacroExpansionInfo MExpInfo =
+      getMacroExpansionInfo(PrevParamMap, SM.getExpansionLoc(MacroLoc), PP);
+  IdentifierInfo *MacroNameII = PP.getIdentifierInfo(MExpInfo.Name);
 
   // TODO: If the macro definition contains another symbol then this function is
   // called recursively. In case this symbol is the one being defined, it will
@@ -973,18 +1026,18 @@ static std::string getMacroNameAndPrintExpansion(
   // in this case we don't get the full expansion text in the Plist file. See
   // the test file where "value" is expanded to "garbage_" instead of
   // "garbage_value".
-  if (!AlreadyProcessedTokens.insert(IDInfo).second)
-    return Info.Name;
+  if (!AlreadyProcessedTokens.insert(MacroNameII).second)
+    return MExpInfo.Name;
 
-  if (!Info.MI)
-    return Info.Name;
+  if (!MExpInfo.MI)
+    return MExpInfo.Name;
 
   // Manually expand its arguments from the previous macro.
-  Info.Args.expandFromPrevMacro(PrevArgs);
+  MExpInfo.ParamMap.expandFromPrevMacro(PrevParamMap);
 
   // Iterate over the macro's tokens and stringify them.
-  for (auto It = Info.MI->tokens_begin(), E = Info.MI->tokens_end(); It != E;
-       ++It) {
+  for (auto It = MExpInfo.MI->tokens_begin(), E = MExpInfo.MI->tokens_end();
+       It != E; ++It) {
     Token T = *It;
 
     // If this token is not an identifier, we only need to print it.
@@ -1000,8 +1053,8 @@ static std::string getMacroNameAndPrintExpansion(
     // If this token is a macro that should be expanded inside the current
     // macro.
     if (getMacroInfoForLocation(PP, SM, II, T.getLocation())) {
-      getMacroNameAndPrintExpansion(Printer, T.getLocation(), PP, Info.Args,
-                                    AlreadyProcessedTokens);
+      getMacroNameAndPrintExpansion(Printer, T.getLocation(), PP,
+                                    MExpInfo.ParamMap, AlreadyProcessedTokens);
 
       // If this is a function-like macro, skip its arguments, as
       // getExpandedMacro() already printed them. If this is the case, let's
@@ -1013,10 +1066,10 @@ static std::string getMacroNameAndPrintExpansion(
     }
 
     // If this token is the current macro's argument, we should expand it.
-    auto ArgMapIt = Info.Args.find(II);
-    if (ArgMapIt != Info.Args.end()) {
-      for (MacroInfo::tokens_iterator ArgIt = ArgMapIt->second.begin(),
-                                      ArgEnd = ArgMapIt->second.end();
+    auto ParamToArgIt = MExpInfo.ParamMap.find(II);
+    if (ParamToArgIt != MExpInfo.ParamMap.end()) {
+      for (MacroInfo::tokens_iterator ArgIt = ParamToArgIt->second.begin(),
+                                      ArgEnd = ParamToArgIt->second.end();
            ArgIt != ArgEnd; ++ArgIt) {
 
         // These tokens may still be macros, if that is the case, handle it the
@@ -1034,7 +1087,8 @@ static std::string getMacroNameAndPrintExpansion(
         }
 
         getMacroNameAndPrintExpansion(Printer, ArgIt->getLocation(), PP,
-                                      Info.Args, AlreadyProcessedTokens);
+                                      MExpInfo.ParamMap,
+                                      AlreadyProcessedTokens);
         // Peek the next token if it is a tok::l_paren. This way we can decide
         // if this is the application or just a reference to a function macro
         // symbol:
@@ -1055,29 +1109,25 @@ static std::string getMacroNameAndPrintExpansion(
     Printer.printToken(T);
   }
 
-  AlreadyProcessedTokens.erase(IDInfo);
+  AlreadyProcessedTokens.erase(MacroNameII);
 
-  return Info.Name;
+  return MExpInfo.Name;
 }
 
-static MacroNameAndArgs getMacroNameAndArgs(SourceLocation ExpanLoc,
-                                            const Preprocessor &PP) {
+static MacroExpansionInfo
+getMacroExpansionInfo(const MacroParamMap &PrevParamMap,
+                      SourceLocation ExpanLoc, const Preprocessor &PP) {
 
   const SourceManager &SM = PP.getSourceManager();
   const LangOptions &LangOpts = PP.getLangOpts();
 
   // First, we create a Lexer to lex *at the expansion location* the tokens
   // referring to the macro's name and its arguments.
-  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(ExpanLoc);
-  const llvm::MemoryBuffer *MB = SM.getBuffer(LocInfo.first);
-  const char *MacroNameTokenPos = MB->getBufferStart() + LocInfo.second;
-
-  Lexer RawLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
-                 MB->getBufferStart(), MacroNameTokenPos, MB->getBufferEnd());
+  TokenStream TStream(ExpanLoc, SM, LangOpts);
 
   // Acquire the macro's name.
   Token TheTok;
-  RawLexer.LexFromRawLexer(TheTok);
+  TStream.next(TheTok);
 
   std::string MacroName = PP.getSpelling(TheTok);
 
@@ -1094,18 +1144,18 @@ static MacroNameAndArgs getMacroNameAndArgs(SourceLocation ExpanLoc,
   if (!MI)
     return { MacroName, MI, {} };
 
-  // Acquire the macro's arguments.
+  // Acquire the macro's arguments at the expansion point.
   //
   // The rough idea here is to lex from the first left parentheses to the last
-  // right parentheses, and map the macro's unexpanded arguments to what they
-  // will be expanded to. An expanded macro argument may contain several tokens
-  // (like '3 + 4'), so we'll lex until we find a tok::comma or tok::r_paren, at
-  // which point we start lexing the next argument or finish.
-  ArrayRef<const IdentifierInfo *> MacroArgs = MI->params();
-  if (MacroArgs.empty())
+  // right parentheses, and map the macro's parameters to what they will be
+  // expanded to. A macro argument may contain several tokens (like '3 + 4'),
+  // so we'll lex until we find a tok::comma or tok::r_paren, at which point
+  // we start lexing the next argument or finish.
+  ArrayRef<const IdentifierInfo *> MacroParams = MI->params();
+  if (MacroParams.empty())
     return { MacroName, MI, {} };
 
-  RawLexer.LexFromRawLexer(TheTok);
+  TStream.next(TheTok);
+  // When this is a token which expands to another macro function, then its
   // parentheses are not at its expansion location. For example:
   //
@@ -1117,9 +1167,9 @@ static MacroNameAndArgs getMacroNameAndArgs(SourceLocation ExpanLoc,
   if (TheTok.isNot(tok::l_paren))
     return { MacroName, MI, {} };
 
-  MacroArgMap Args;
+  MacroParamMap ParamMap;
 
-  // When the macro's argument is a function call, like
+  // When the argument is a function call, like
   //   CALL_FN(someFunctionName(param1, param2))
   // we will find tok::l_paren, tok::r_paren, and tok::comma that do not divide
   // actual macro arguments, or do not represent the macro argument's closing
@@ -1134,8 +1184,8 @@ static MacroNameAndArgs getMacroNameAndArgs(SourceLocation ExpanLoc,
   // even if we lex a tok::comma and ParenthesesDepth == 1.
   const IdentifierInfo *__VA_ARGS__II = PP.getIdentifierInfo("__VA_ARGS__");
 
-  for (const IdentifierInfo *UnexpArgII : MacroArgs) {
-    MacroArgMap::mapped_type ExpandedArgTokens;
+  for (const IdentifierInfo *CurrParamII : MacroParams) {
+    MacroParamMap::mapped_type ArgTokens;
 
     // One could also simply not supply a single argument to __VA_ARGS__ -- this
     // results in a preprocessor warning, but is not an error:
@@ -1149,10 +1199,11 @@ static MacroNameAndArgs getMacroNameAndArgs(SourceLocation ExpanLoc,
     if (ParenthesesDepth != 0) {
 
       // Lex the first token of the next macro parameter.
-      RawLexer.LexFromRawLexer(TheTok);
+      TStream.next(TheTok);
 
-      while (!(ParenthesesDepth == 1 &&
-              (UnexpArgII == __VA_ARGS__II ? false : TheTok.is(tok::comma)))) {
+      while (
+          !(ParenthesesDepth == 1 &&
+            (CurrParamII == __VA_ARGS__II ? false : TheTok.is(tok::comma)))) {
         assert(TheTok.isNot(tok::eof) &&
                "EOF encountered while looking for expanded macro args!");
 
@@ -1165,24 +1216,51 @@ static MacroNameAndArgs getMacroNameAndArgs(SourceLocation ExpanLoc,
         if (ParenthesesDepth == 0)
           break;
 
-        if (TheTok.is(tok::raw_identifier))
+        if (TheTok.is(tok::raw_identifier)) {
           PP.LookUpIdentifierInfo(TheTok);
+          // This token is a variadic parameter:
+          //
+          //   #define PARAMS_RESOLVE_TO_VA_ARGS(i, fmt) foo(i, fmt); \
+          //     i = 0;
+          //   #define DISPATCH(...) \
+          //     PARAMS_RESOLVE_TO_VA_ARGS(__VA_ARGS__);
+          //                            // ^~~~~~~~~~~ Variadic parameter here
+          //
+          //   void multipleParamsResolveToVA_ARGS(void) {
+          //     int x = 1;
+          //     DISPATCH(x, "LF1M healer"); // Multiple arguments are mapped to
+          //                                 // a single __VA_ARGS__ parameter.
+          //     (void)(10 / x);
+          //   }
+          //
+          // We will stumble across this while trying to expand
+          // PARAMS_RESOLVE_TO_VA_ARGS. By this point, we already noted during
+          // the processing of DISPATCH what __VA_ARGS__ maps to, so we'll
+          // retrieve the next series of tokens from that.
+          if (TheTok.getIdentifierInfo() == __VA_ARGS__II) {
+            TStream.injectRange(PrevParamMap.at(__VA_ARGS__II));
+            TStream.next(TheTok);
+            continue;
+          }
+        }
 
-        ExpandedArgTokens.push_back(TheTok);
-        RawLexer.LexFromRawLexer(TheTok);
+        ArgTokens.push_back(TheTok);
+        TStream.next(TheTok);
       }
     } else {
-      assert(UnexpArgII == __VA_ARGS__II);
+      assert(CurrParamII == __VA_ARGS__II &&
+             "No more macro arguments are found, but the current parameter "
+             "isn't __VA_ARGS__!");
     }
 
-    Args.emplace(UnexpArgII, std::move(ExpandedArgTokens));
+    ParamMap.emplace(CurrParamII, std::move(ArgTokens));
   }
 
   assert(TheTok.is(tok::r_paren) &&
          "Expanded macro argument acquisition failed! After the end of the loop"
          " this token should be ')'!");
 
-  return { MacroName, MI, Args };
+  return {MacroName, MI, ParamMap};
 }
 
 static MacroInfo::tokens_iterator getMatchingRParen(
@@ -1222,14 +1300,14 @@ static const MacroInfo *getMacroInfoForLocation(const Preprocessor &PP,
   return MD->findDirectiveAtLoc(Loc, SM).getMacroInfo();
 }
 
-void MacroArgMap::expandFromPrevMacro(const MacroArgMap &Super) {
+void MacroParamMap::expandFromPrevMacro(const MacroParamMap &Super) {
 
   for (value_type &Pair : *this) {
-    ExpArgTokens &CurrExpArgTokens = Pair.second;
+    ArgTokensTy &CurrArgTokens = Pair.second;
 
     // For each token in the expanded macro argument.
-    auto It = CurrExpArgTokens.begin();
-    while (It != CurrExpArgTokens.end()) {
+    auto It = CurrArgTokens.begin();
+    while (It != CurrArgTokens.end()) {
       if (It->isNot(tok::identifier)) {
         ++It;
         continue;
@@ -1244,17 +1322,43 @@ void MacroArgMap::expandFromPrevMacro(const MacroArgMap &Super) {
         continue;
       }
 
-      const ExpArgTokens &SuperExpArgTokens = Super.at(II);
+      const ArgTokensTy &SuperArgTokens = Super.at(II);
 
-      It = CurrExpArgTokens.insert(
-          It, SuperExpArgTokens.begin(), SuperExpArgTokens.end());
-      std::advance(It, SuperExpArgTokens.size());
-      It = CurrExpArgTokens.erase(It);
+      It = CurrArgTokens.insert(It, SuperArgTokens.begin(),
+                                SuperArgTokens.end());
+      std::advance(It, SuperArgTokens.size());
+      It = CurrArgTokens.erase(It);
     }
   }
 }
 
+void MacroParamMap::dumpToStream(llvm::raw_ostream &Out,
+                                 const Preprocessor &PP) const {
+  for (const std::pair<const IdentifierInfo *, ArgTokensTy> Pair : *this) {
+    Out << Pair.first->getName() << " -> ";
+    dumpArgTokensToStream(Out, PP, Pair.second);
+    Out << '\n';
+  }
+}
+
+static void dumpArgTokensToStream(llvm::raw_ostream &Out,
+                                  const Preprocessor &PP,
+                                  const ArgTokensTy &Toks) {
+  TokenPrinter Printer(Out, PP);
+  for (Token Tok : Toks)
+    Printer.printToken(Tok);
+}
+
 void TokenPrinter::printToken(const Token &Tok) {
+  // TODO: Handle GNU extensions where hash and hashhash occur right before
+  // __VA_ARGS__.
+  // cppreference.com: "some compilers offer an extension that allows ## to
+  // appear after a comma and before __VA_ARGS__, in which case the ## does
+  // nothing when the variable arguments are present, but removes the comma when
+  // the variable arguments are not present: this makes it possible to define
+  // macros such as fprintf (stderr, format, ##__VA_ARGS__)"
+  // FIXME: Handle named variadic macro parameters (also a GNU extension).
+
   // If this is the first token to be printed, don't print space.
   if (PrevTok.isNot(tok::unknown)) {
     // If the tokens were already space separated, or if they must be to avoid
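
The essential trick in the PlistDiagnostics rework is TokenStream::injectRange(), which the __VA_ARGS__ handling above relies on. A condensed usage fragment with identifiers from this file (not compilable on its own):

```cpp
// Fragment: lexing macro arguments with the TokenStream defined above.
Token Tok;
TokenStream TStream(ExpanLoc, SM, LangOpts);
TStream.next(Tok); // ordinarily forwards to the raw lexer
// When the current token turns out to be __VA_ARGS__ from an enclosing
// expansion, splice in the argument tokens recorded for it; next() then
// drains the injected range before falling back to the raw lexer.
TStream.injectRange(PrevParamMap.at(__VA_ARGS__II));
TStream.next(Tok); // first token of the variadic argument list
```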
diff --git a/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp b/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp
index 32d2a3e30708e..72b8ada1dfab9 100644
--- a/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp
+++ b/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp
@@ -306,6 +306,14 @@ Optional<SVal> SValBuilder::getConstantVal(const Expr *E) {
     return makeLoc(getRegionManager().getStringRegion(SL));
   }
 
+  case Stmt::PredefinedExprClass: {
+    const auto *PE = cast<PredefinedExpr>(E);
+    assert(PE->getFunctionName() &&
+           "Since we analyze only instantiated functions, PredefinedExpr "
+           "should have a function name.");
+    return makeLoc(getRegionManager().getStringRegion(PE->getFunctionName()));
+  }
+
   // Fast-path some expressions to avoid the overhead of going through the AST's
   // constant evaluator
   case Stmt::CharacterLiteralClass: {
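
An illustration of what the new PredefinedExprClass case models (hypothetical input, not a test from this patch):

```cpp
// Illustrative analyzed code: __func__ is a PredefinedExpr. With the case
// above, getConstantVal() maps it to a StringRegion holding "foo", so the
// analyzer can constant-fold uses of it like any other string literal.
void log(const char *);
void foo() {
  log(__func__); // modeled as the concrete string "foo"
}
```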
diff --git a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp
index 6ca7aec9caeca..79a8eef305768 100644
--- a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp
+++ b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp
@@ -14,6 +14,7 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Expr.h"
+#include "clang/AST/StmtObjC.h"
 #include "clang/Analysis/Analyses/LiveVariables.h"
 #include "clang/Analysis/AnalysisDeclContext.h"
 #include "clang/Basic/LLVM.h"
@@ -34,6 +35,12 @@ using namespace ento;
 
 void SymExpr::anchor() {}
 
+StringRef SymbolConjured::getKindStr() const { return "conj_$"; }
+StringRef SymbolDerived::getKindStr() const { return "derived_$"; }
+StringRef SymbolExtent::getKindStr() const { return "extent_$"; }
+StringRef SymbolMetadata::getKindStr() const { return "meta_$"; }
+StringRef SymbolRegionValue::getKindStr() const { return "reg_$"; }
+
 LLVM_DUMP_METHOD void SymExpr::dump() const { dumpToStream(llvm::errs()); }
 
 void BinarySymExpr::dumpToStreamImpl(raw_ostream &OS, const SymExpr *Sym) {
@@ -64,7 +71,7 @@ void SymbolCast::dumpToStream(raw_ostream &os) const {
 }
 
 void SymbolConjured::dumpToStream(raw_ostream &os) const {
-  os << "conj_$" << getSymbolID() << '{' << T.getAsString() << ", LC"
+  os << getKindStr() << getSymbolID() << '{' << T.getAsString() << ", LC"
      << LCtx->getID();
   if (S)
     os << ", S" << S->getID(LCtx->getDecl()->getASTContext());
@@ -74,24 +81,24 @@ void SymbolConjured::dumpToStream(raw_ostream &os) const {
 }
 
 void SymbolDerived::dumpToStream(raw_ostream &os) const {
-  os << "derived_$" << getSymbolID() << '{'
-     << getParentSymbol() << ',' << getRegion() << '}';
+  os << getKindStr() << getSymbolID() << '{' << getParentSymbol() << ','
+     << getRegion() << '}';
 }
 
 void SymbolExtent::dumpToStream(raw_ostream &os) const {
-  os << "extent_$" << getSymbolID() << '{' << getRegion() << '}';
+  os << getKindStr() << getSymbolID() << '{' << getRegion() << '}';
 }
 
 void SymbolMetadata::dumpToStream(raw_ostream &os) const {
-  os << "meta_$" << getSymbolID() << '{'
-     << getRegion() << ',' << T.getAsString() << '}';
+  os << getKindStr() << getSymbolID() << '{' << getRegion() << ','
+     << T.getAsString() << '}';
 }
 
 void SymbolData::anchor() {}
 
 void SymbolRegionValue::dumpToStream(raw_ostream &os) const {
-  os << "reg_$" << getSymbolID()
-     << '<' << getType().getAsString() << ' ' << R << '>';
+  os << getKindStr() << getSymbolID() << '<' << getType().getAsString() << ' '
+     << R << '>';
 }
 
 bool SymExpr::symbol_iterator::operator==(const symbol_iterator &X) const {
@@ -482,7 +489,7 @@ bool SymbolReaper::isLive(SymbolRef sym) {
 }
 
 bool
-SymbolReaper::isLive(const Stmt *ExprVal, const LocationContext *ELCtx) const {
+SymbolReaper::isLive(const Expr *ExprVal, const LocationContext *ELCtx) const {
   if (LCtx == nullptr)
     return false;
 
@@ -494,7 +501,8 @@ SymbolReaper::isLive(const Stmt *ExprVal, const LocationContext *ELCtx) const {
     return true;
   }
 
-  // If no statement is provided, everything is this and parent contexts is live.
+  // If no statement is provided, everything in this and parent contexts is
+  // live.
   if (!Loc)
     return true;
 
diff --git a/clang/lib/Tooling/ArgumentsAdjusters.cpp b/clang/lib/Tooling/ArgumentsAdjusters.cpp
index a857b57fbf7bc..bcfb5b39a0770 100644
--- a/clang/lib/Tooling/ArgumentsAdjusters.cpp
+++ b/clang/lib/Tooling/ArgumentsAdjusters.cpp
@@ -21,6 +21,16 @@
 namespace clang {
 namespace tooling {
 
+static StringRef getDriverMode(const CommandLineArguments &Args) {
+  for (const auto &Arg : Args) {
+    StringRef ArgRef = Arg;
+    if (ArgRef.consume_front("--driver-mode=")) {
+      return ArgRef;
+    }
+  }
+  return StringRef();
+}
+
 /// Add -fsyntax-only option and drop options that trigger output generation.
 ArgumentsAdjuster getClangSyntaxOnlyAdjuster() {
   return [](const CommandLineArguments &Args, StringRef /*unused*/) {
@@ -93,20 +103,28 @@ ArgumentsAdjuster getClangStripSerializeDiagnosticAdjuster() {
 
 ArgumentsAdjuster getClangStripDependencyFileAdjuster() {
   return [](const CommandLineArguments &Args, StringRef /*unused*/) {
+    auto UsingClDriver = (getDriverMode(Args) == "cl");
+
     CommandLineArguments AdjustedArgs;
     for (size_t i = 0, e = Args.size(); i < e; ++i) {
       StringRef Arg = Args[i];
-      // All dependency-file options begin with -M. These include -MM,
-      // -MF, -MG, -MP, -MT, -MQ, -MD, and -MMD.
-      if (!Arg.startswith("-M") && !Arg.startswith("/showIncludes") &&
-          !Arg.startswith("-showIncludes")) {
-        AdjustedArgs.push_back(Args[i]);
+
+      // These flags take an argument: -MX foo. Skip the next argument also.
+      if (!UsingClDriver && (Arg == "-MF" || Arg == "-MT" || Arg == "-MQ")) {
+        ++i;
         continue;
       }
+      // When not using the cl driver mode, dependency file generation options
+      // begin with -M. These include -MM, -MF, -MG, -MP, -MT, -MQ, -MD, and
+      // -MMD.
+      if (!UsingClDriver && Arg.startswith("-M"))
+        continue;
+      // Under MSVC's cl driver mode, dependency file generation is controlled
+      // using /showIncludes.
+      if (Arg.startswith("/showIncludes") || Arg.startswith("-showIncludes"))
+        continue;
 
-      if (Arg == "-MF" || Arg == "-MT" || Arg == "-MQ")
-        // These flags take an argument: -MX foo. Skip the next argument also.
-        ++i;
+      AdjustedArgs.push_back(Args[i]);
     }
     return AdjustedArgs;
   };
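
Sketch of the adjuster's intended behavior after this change; the command lines below are illustrative assumptions, not tests from the patch:

```cpp
#include "clang/Tooling/ArgumentsAdjusters.h"
using namespace clang::tooling;

void demo() {
  ArgumentsAdjuster Adj = getClangStripDependencyFileAdjuster();
  // gcc-style driver: the whole -M family is dropped, including -MF's operand.
  CommandLineArguments A =
      Adj({"clang", "-MD", "-MF", "deps.d", "-c", "a.c"}, "a.c");
  // A == {"clang", "-c", "a.c"}
  // cl driver mode: -MD selects the MSVC runtime library, so it must survive;
  // only /showIncludes (and -showIncludes) are dependency-related here.
  CommandLineArguments B = Adj(
      {"clang", "--driver-mode=cl", "-MD", "/showIncludes", "a.c"}, "a.c");
  // B == {"clang", "--driver-mode=cl", "-MD", "a.c"}
}
```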
diff --git a/clang/lib/Tooling/Syntax/BuildTree.cpp b/clang/lib/Tooling/Syntax/BuildTree.cpp
index a9f326439a2a5..3e0573ac4ffcf 100644
--- a/clang/lib/Tooling/Syntax/BuildTree.cpp
+++ b/clang/lib/Tooling/Syntax/BuildTree.cpp
@@ -13,6 +13,8 @@
 #include "clang/AST/DeclarationName.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/ExprCXX.h"
+#include "clang/AST/IgnoreExpr.h"
+#include "clang/AST/OperationKinds.h"
 #include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/AST/Stmt.h"
 #include "clang/AST/TypeLoc.h"
@@ -44,8 +46,44 @@
 
 using namespace clang;
 
+// Ignores the implicit `CXXConstructExpr` for copy/move constructor calls
+// generated by the compiler, as well as in implicit conversions like the one
+// wrapping `1` in `X x = 1;`.
+static Expr *IgnoreImplicitConstructorSingleStep(Expr *E) {
+  if (auto *C = dyn_cast<CXXConstructExpr>(E)) {
+    auto NumArgs = C->getNumArgs();
+    if (NumArgs == 1 || (NumArgs > 1 && isa<CXXDefaultArgExpr>(C->getArg(1)))) {
+      Expr *A = C->getArg(0);
+      if (C->getParenOrBraceRange().isInvalid())
+        return A;
+    }
+  }
+  return E;
+}
+
+// In:
+// struct X {
+//   X(int);
+// };
+// X x = X(1);
+// Ignores the implicit `CXXFunctionalCastExpr` that wraps
+// `CXXConstructExpr X(1)`.
+static Expr *IgnoreCXXFunctionalCastExprWrappingConstructor(Expr *E) {
+  if (auto *F = dyn_cast<CXXFunctionalCastExpr>(E)) {
+    if (F->getCastKind() == CK_ConstructorConversion)
+      return F->getSubExpr();
+  }
+  return E;
+}
+
+static Expr *IgnoreImplicit(Expr *E) {
+  return IgnoreExprNodes(E, IgnoreImplicitSingleStep,
+                         IgnoreImplicitConstructorSingleStep,
+                         IgnoreCXXFunctionalCastExprWrappingConstructor);
+}
+
 LLVM_ATTRIBUTE_UNUSED
-static bool isImplicitExpr(Expr *E) { return E->IgnoreImplicit() != E; }
+static bool isImplicitExpr(Expr *E) { return IgnoreImplicit(E) != E; }
 
 namespace {
 /// Get start location of the Declarator from the TypeLoc.
@@ -116,6 +154,13 @@ struct GetStartLoc : TypeLocVisitor<GetStartLoc, SourceLocation> {
 };
 } // namespace
 
+static CallExpr::arg_range dropDefaultArgs(CallExpr::arg_range Args) {
+  auto firstDefaultArg = std::find_if(Args.begin(), Args.end(), [](auto it) {
+    return isa<CXXDefaultArgExpr>(it);
+  });
+  return llvm::make_range(Args.begin(), firstDefaultArg);
+}
+
 static syntax::NodeKind getOperatorNodeKind(const CXXOperatorCallExpr &E) {
   switch (E.getOperator()) {
   // Comparison
@@ -321,12 +366,14 @@ class ASTToSyntaxMapping {
 class syntax::TreeBuilder {
 public:
   TreeBuilder(syntax::Arena &Arena) : Arena(Arena), Pending(Arena) {
-    for (const auto &T : Arena.tokenBuffer().expandedTokens())
+    for (const auto &T : Arena.getTokenBuffer().expandedTokens())
       LocationToToken.insert({T.location().getRawEncoding(), &T});
   }
 
-  llvm::BumpPtrAllocator &allocator() { return Arena.allocator(); }
-  const SourceManager &sourceManager() const { return Arena.sourceManager(); }
+  llvm::BumpPtrAllocator &allocator() { return Arena.getAllocator(); }
+  const SourceManager &sourceManager() const {
+    return Arena.getSourceManager();
+  }
 
   /// Populate children for \p New node, assuming it covers tokens from \p
   /// Range.
@@ -376,13 +423,13 @@ class syntax::TreeBuilder {
 
   /// Finish building the tree and consume the root node.
   syntax::TranslationUnit *finalize() && {
-    auto Tokens = Arena.tokenBuffer().expandedTokens();
+    auto Tokens = Arena.getTokenBuffer().expandedTokens();
     assert(!Tokens.empty());
     assert(Tokens.back().kind() == tok::eof);
 
     // Build the root of the tree, consuming all the children.
     Pending.foldChildren(Arena, Tokens.drop_back(),
-                         new (Arena.allocator()) syntax::TranslationUnit);
+                         new (Arena.getAllocator()) syntax::TranslationUnit);
 
     auto *TU = cast<syntax::TranslationUnit>(std::move(Pending).finalize());
     TU->assertInvariantsRecursive();
@@ -406,7 +453,7 @@ class syntax::TreeBuilder {
     assert(First.isValid());
     assert(Last.isValid());
     assert(First == Last ||
-           Arena.sourceManager().isBeforeInTranslationUnit(First, Last));
+           Arena.getSourceManager().isBeforeInTranslationUnit(First, Last));
     return llvm::makeArrayRef(findToken(First), std::next(findToken(Last)));
   }
 
@@ -495,7 +542,7 @@ class syntax::TreeBuilder {
   }
 
   void setRole(syntax::Node *N, NodeRole R) {
-    assert(N->role() == NodeRole::Detached);
+    assert(N->getRole() == NodeRole::Detached);
     N->setRole(R);
   }
 
@@ -507,14 +554,14 @@ class syntax::TreeBuilder {
   /// Ensures that added nodes properly nest and cover the whole token stream.
   struct Forest {
     Forest(syntax::Arena &A) {
-      assert(!A.tokenBuffer().expandedTokens().empty());
-      assert(A.tokenBuffer().expandedTokens().back().kind() == tok::eof);
+      assert(!A.getTokenBuffer().expandedTokens().empty());
+      assert(A.getTokenBuffer().expandedTokens().back().kind() == tok::eof);
       // Create all leaf nodes.
       // Note that we do not have 'eof' in the tree.
-      for (auto &T : A.tokenBuffer().expandedTokens().drop_back()) {
-        auto *L = new (A.allocator()) syntax::Leaf(&T);
+      for (const auto &T : A.getTokenBuffer().expandedTokens().drop_back()) {
+        auto *L = new (A.getAllocator()) syntax::Leaf(&T);
         L->Original = true;
-        L->CanModify = A.tokenBuffer().spelledForExpanded(T).hasValue();
+        L->CanModify = A.getTokenBuffer().spelledForExpanded(T).hasValue();
         Trees.insert(Trees.end(), {&T, L});
       }
     }
@@ -527,7 +574,7 @@ class syntax::TreeBuilder {
       assert((std::next(It) == Trees.end() ||
               std::next(It)->first == Range.end()) &&
              "no child with the specified range");
-      assert(It->second->role() == NodeRole::Detached &&
+      assert(It->second->getRole() == NodeRole::Detached &&
              "re-assigning role for a child");
       It->second->setRole(Role);
     }
@@ -536,7 +583,7 @@ class syntax::TreeBuilder {
     void foldChildren(const syntax::Arena &A, ArrayRef<const syntax::Token *> Tokens,
                       syntax::Tree *Node) {
       // Attach children to `Node`.
-      assert(Node->firstChild() == nullptr && "node already has children");
+      assert(Node->getFirstChild() == nullptr && "node already has children");
 
       auto *FirstToken = Tokens.begin();
       auto BeginChildren = Trees.lower_bound(FirstToken);
@@ -552,14 +599,15 @@ class syntax::TreeBuilder {
       // We need to go in reverse order, because we can only prepend.
       for (auto It = EndChildren; It != BeginChildren; --It) {
         auto *C = std::prev(It)->second;
-        if (C->role() == NodeRole::Detached)
+        if (C->getRole() == NodeRole::Detached)
           C->setRole(NodeRole::Unknown);
         Node->prependChildLowLevel(C);
       }
 
       // Mark that this node came from the AST and is backed by the source code.
       Node->Original = true;
-      Node->CanModify = A.tokenBuffer().spelledForExpanded(Tokens).hasValue();
+      Node->CanModify =
+          A.getTokenBuffer().spelledForExpanded(Tokens).hasValue();
 
       Trees.erase(BeginChildren, EndChildren);
       Trees.insert({FirstToken, Node});
@@ -579,12 +627,12 @@ class syntax::TreeBuilder {
         unsigned CoveredTokens =
             It != Trees.end()
                 ? (std::next(It)->first - It->first)
-                : A.tokenBuffer().expandedTokens().end() - It->first;
+                : A.getTokenBuffer().expandedTokens().end() - It->first;
 
         R += std::string(
-            formatv("- '{0}' covers '{1}'+{2} tokens\n", It->second->kind(),
-                    It->first->text(A.sourceManager()), CoveredTokens));
-        R += It->second->dump(A.sourceManager());
+            formatv("- '{0}' covers '{1}'+{2} tokens\n", It->second->getKind(),
+                    It->first->text(A.getSourceManager()), CoveredTokens));
+        R += It->second->dump(A.getSourceManager());
       }
       return R;
     }
@@ -740,7 +788,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor<BuildTreeVisitor> {
       for (auto *D : DS->decls())
         Builder.noticeDeclWithoutSemicolon(D);
     } else if (auto *E = dyn_cast_or_null<Expr>(S)) {
-      return RecursiveASTVisitor::TraverseStmt(E->IgnoreImplicit());
+      return RecursiveASTVisitor::TraverseStmt(IgnoreImplicit(E));
     }
     return RecursiveASTVisitor::TraverseStmt(S);
   }
@@ -1073,8 +1121,12 @@ class BuildTreeVisitor : public RecursiveASTVisitor<BuildTreeVisitor> {
     return true;
   }
 
-  syntax::CallArguments *buildCallArguments(CallExpr::arg_range Args) {
-    for (const auto &Arg : Args) {
+  /// Builds `CallArguments` syntax node from arguments that appear in source
+  /// code, i.e. not default arguments.
+  syntax::CallArguments *
+  buildCallArguments(CallExpr::arg_range ArgsAndDefaultArgs) {
+    auto Args = dropDefaultArgs(ArgsAndDefaultArgs);
+    for (auto *Arg : Args) {
       Builder.markExprChild(Arg, syntax::NodeRole::ListElement);
       const auto *DelimiterToken =
           std::next(Builder.findToken(Arg->getEndLoc()));
@@ -1111,6 +1163,14 @@ class BuildTreeVisitor : public RecursiveASTVisitor<BuildTreeVisitor> {
     return true;
   }
 
+  bool WalkUpFromCXXConstructExpr(CXXConstructExpr *S) {
+    // Ignore the implicit calls to default constructors.
+    if ((S->getNumArgs() == 0 || isa<CXXDefaultArgExpr>(S->getArg(0))) &&
+        S->getParenOrBraceRange().isInvalid())
+      return true;
+    return RecursiveASTVisitor::WalkUpFromCXXConstructExpr(S);
+  }
+
   bool TraverseCXXOperatorCallExpr(CXXOperatorCallExpr *S) {
     // To construct a syntax tree of the same shape for calls to built-in and
     // user-defined operators, ignore the `DeclRefExpr` that refers to the
@@ -1187,6 +1247,8 @@ class BuildTreeVisitor : public RecursiveASTVisitor<BuildTreeVisitor> {
     }
   }
 
+  bool WalkUpFromCXXDefaultArgExpr(CXXDefaultArgExpr *S) { return true; }
+
   bool WalkUpFromNamespaceDecl(NamespaceDecl *S) {
     auto Tokens = Builder.getDeclarationRange(S);
     if (Tokens.front().kind() == tok::coloncolon) {
@@ -1579,7 +1641,7 @@ void syntax::TreeBuilder::markStmtChild(Stmt *Child, NodeRole Role) {
 void syntax::TreeBuilder::markExprChild(Expr *Child, NodeRole Role) {
   if (!Child)
     return;
-  Child = Child->IgnoreImplicit();
+  Child = IgnoreImplicit(Child);
 
   syntax::Tree *ChildNode = Mapping.find(Child);
   assert(ChildNode != nullptr);
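
The hunks above hand expression children to a free `IgnoreImplicit()` helper and filter call arguments through `dropDefaultArgs()`, both defined outside the shown context. A minimal sketch of what such a filter can look like (a hypothetical reconstruction, not the patch's verbatim helper; it relies on default arguments always forming a trailing run of `CXXDefaultArgExpr` nodes):

    // Keep only the arguments that were written in the source code: default
    // arguments are materialized as trailing CXXDefaultArgExpr nodes, so cut
    // the range at the first one.
    static CallExpr::arg_range dropDefaultArgs(CallExpr::arg_range Args) {
      auto FirstDefaultArg = llvm::find_if(
          Args, [](Expr *E) { return isa<CXXDefaultArgExpr>(E); });
      return llvm::make_range(Args.begin(), FirstDefaultArg);
    }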
diff --git a/clang/lib/Tooling/Syntax/ComputeReplacements.cpp b/clang/lib/Tooling/Syntax/ComputeReplacements.cpp
index 30b3ee17d0926..31e1a40c74b61 100644
--- a/clang/lib/Tooling/Syntax/ComputeReplacements.cpp
+++ b/clang/lib/Tooling/Syntax/ComputeReplacements.cpp
@@ -32,13 +32,14 @@ void enumerateTokenSpans(const syntax::Tree *Root, ProcessTokensFn Callback) {
   private:
     void process(const syntax::Node *N) {
       if (auto *T = dyn_cast<syntax::Tree>(N)) {
-        for (auto *C = T->firstChild(); C != nullptr; C = C->nextSibling())
+        for (const auto *C = T->getFirstChild(); C != nullptr;
+             C = C->getNextSibling())
           process(C);
         return;
       }
 
       auto *L = cast<syntax::Leaf>(N);
-      if (SpanEnd == L->token() && SpanIsOriginal == L->isOriginal()) {
+      if (SpanEnd == L->getToken() && SpanIsOriginal == L->isOriginal()) {
         // Extend the current span.
         ++SpanEnd;
         return;
@@ -47,7 +48,7 @@ void enumerateTokenSpans(const syntax::Tree *Root, ProcessTokensFn Callback) {
       if (SpanBegin)
         Callback(llvm::makeArrayRef(SpanBegin, SpanEnd), SpanIsOriginal);
       // Start recording a new span.
-      SpanBegin = L->token();
+      SpanBegin = L->getToken();
       SpanEnd = SpanBegin + 1;
       SpanIsOriginal = L->isOriginal();
     }
@@ -63,8 +64,8 @@ void enumerateTokenSpans(const syntax::Tree *Root, ProcessTokensFn Callback) {
 
 syntax::FileRange rangeOfExpanded(const syntax::Arena &A,
                                  llvm::ArrayRef<syntax::Token> Expanded) {
-  auto &Buffer = A.tokenBuffer();
-  auto &SM = A.sourceManager();
+  const auto &Buffer = A.getTokenBuffer();
+  const auto &SM = A.getSourceManager();
 
   // Check that \p Expanded actually points into expanded tokens.
   assert(Buffer.expandedTokens().begin() <= Expanded.begin());
@@ -84,8 +85,8 @@ syntax::FileRange rangeOfExpanded(const syntax::Arena &A,
 tooling::Replacements
 syntax::computeReplacements(const syntax::Arena &A,
                             const syntax::TranslationUnit &TU) {
-  auto &Buffer = A.tokenBuffer();
-  auto &SM = A.sourceManager();
+  const auto &Buffer = A.getTokenBuffer();
+  const auto &SM = A.getSourceManager();
 
   tooling::Replacements Replacements;
   // Text inserted by the replacement we are building now.
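
For orientation, `enumerateTokenSpans` (earlier in this file) visits the leaves left to right and reports each maximal run of consecutive tokens that share the same provenance. A hypothetical caller, assuming `Root` is a tree built over the same arena:

    // Print every token span together with whether it is backed by the
    // original source or was synthesized.
    enumerateTokenSpans(
        Root, [](llvm::ArrayRef<syntax::Token> Span, bool IsOriginal) {
          llvm::errs() << (IsOriginal ? "original" : "synthesized")
                       << " span of " << Span.size() << " token(s)\n";
        });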
diff --git a/clang/lib/Tooling/Syntax/Mutations.cpp b/clang/lib/Tooling/Syntax/Mutations.cpp
index 24048b297a112..bf1bcda26455b 100644
--- a/clang/lib/Tooling/Syntax/Mutations.cpp
+++ b/clang/lib/Tooling/Syntax/Mutations.cpp
@@ -36,7 +36,7 @@ class syntax::MutationsImpl {
     assert(Role != NodeRole::Detached);
 
     New->setRole(Role);
-    auto *P = Anchor->parent();
+    auto *P = Anchor->getParent();
     P->replaceChildRangeLowLevel(Anchor, Anchor, New);
 
     P->assertInvariants();
@@ -52,16 +52,16 @@ class syntax::MutationsImpl {
     assert(New->isDetached());
 
     New->Role = Old->Role;
-    auto *P = Old->parent();
-    P->replaceChildRangeLowLevel(findPrevious(Old), Old->nextSibling(), New);
+    auto *P = Old->getParent();
+    P->replaceChildRangeLowLevel(findPrevious(Old), Old->getNextSibling(), New);
 
     P->assertInvariants();
   }
 
   /// Completely remove the node from its parent.
   static void remove(syntax::Node *N) {
-    auto *P = N->parent();
-    P->replaceChildRangeLowLevel(findPrevious(N), N->nextSibling(),
+    auto *P = N->getParent();
+    P->replaceChildRangeLowLevel(findPrevious(N), N->getNextSibling(),
                                  /*New=*/nullptr);
 
     P->assertInvariants();
@@ -70,11 +70,11 @@ class syntax::MutationsImpl {
 
 private:
   static syntax::Node *findPrevious(syntax::Node *N) {
-    if (N->parent()->firstChild() == N)
+    if (N->getParent()->getFirstChild() == N)
       return nullptr;
-    for (syntax::Node *C = N->parent()->firstChild(); C != nullptr;
-         C = C->nextSibling()) {
-      if (C->nextSibling() == N)
+    for (syntax::Node *C = N->getParent()->getFirstChild(); C != nullptr;
+         C = C->getNextSibling()) {
+      if (C->getNextSibling() == N)
         return C;
     }
     llvm_unreachable("could not find a child node");
@@ -85,7 +85,7 @@ void syntax::removeStatement(syntax::Arena &A, syntax::Statement *S) {
   assert(S);
   assert(S->canModify());
 
-  if (isa<CompoundStatement>(S->parent())) {
+  if (isa<CompoundStatement>(S->getParent())) {
     // A child of CompoundStatement can just be safely removed.
     MutationsImpl::remove(S);
     return;
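
A short usage sketch for the entry point touched above (hypothetical; assumes `A` is the `syntax::Arena` the tree was built in and `Body` is a `syntax::CompoundStatement` with at least one statement):

    // Delete the first statement of a compound statement, provided the node
    // is backed by source code and can therefore be modified.
    syntax::Statement *First = Body->getStatements().front();
    if (First->canModify())
      syntax::removeStatement(A, First);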
diff --git a/clang/lib/Tooling/Syntax/Nodes.cpp b/clang/lib/Tooling/Syntax/Nodes.cpp
index 6102c45a08e4d..bb63585cbd7c4 100644
--- a/clang/lib/Tooling/Syntax/Nodes.cpp
+++ b/clang/lib/Tooling/Syntax/Nodes.cpp
@@ -501,8 +501,8 @@ syntax::Leaf *syntax::CompoundStatement::getLbrace() {
 
 std::vector<syntax::Statement *> syntax::CompoundStatement::getStatements() {
   std::vector<syntax::Statement *> Children;
-  for (auto *C = firstChild(); C; C = C->nextSibling()) {
-    assert(C->role() == syntax::NodeRole::Statement);
+  for (auto *C = getFirstChild(); C; C = C->getNextSibling()) {
+    assert(C->getRole() == syntax::NodeRole::Statement);
     Children.push_back(cast<syntax::Statement>(C));
   }
   return Children;
@@ -524,8 +524,8 @@ syntax::Expression *syntax::StaticAssertDeclaration::getMessage() {
 std::vector<syntax::SimpleDeclarator *>
 syntax::SimpleDeclaration::getDeclarators() {
   std::vector<syntax::SimpleDeclarator *> Children;
-  for (auto *C = firstChild(); C; C = C->nextSibling()) {
-    if (C->role() == syntax::NodeRole::Declarator)
+  for (auto *C = getFirstChild(); C; C = C->getNextSibling()) {
+    if (C->getRole() == syntax::NodeRole::Declarator)
       Children.push_back(cast<syntax::SimpleDeclarator>(C));
   }
   return Children;
diff --git a/clang/lib/Tooling/Syntax/Synthesis.cpp b/clang/lib/Tooling/Syntax/Synthesis.cpp
index aa01a34c761fd..f171d26512d95 100644
--- a/clang/lib/Tooling/Syntax/Synthesis.cpp
+++ b/clang/lib/Tooling/Syntax/Synthesis.cpp
@@ -5,13 +5,15 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+#include "clang/Basic/TokenKinds.h"
 #include "clang/Tooling/Syntax/BuildTree.h"
+#include "clang/Tooling/Syntax/Tree.h"
 
 using namespace clang;
 
 /// Exposes private syntax tree APIs required to implement node synthesis.
 /// Should not be used for anything else.
-class syntax::FactoryImpl {
+class clang::syntax::FactoryImpl {
 public:
   static void setCanModify(syntax::Node *N) { N->CanModify = true; }
 
@@ -19,27 +21,187 @@ class syntax::FactoryImpl {
                                    syntax::NodeRole R) {
     T->prependChildLowLevel(Child, R);
   }
+
+  static std::pair<FileID, ArrayRef<syntax::Token>>
+  lexBuffer(syntax::Arena &A, std::unique_ptr<llvm::MemoryBuffer> Buffer) {
+    return A.lexBuffer(std::move(Buffer));
+  }
 };
 
-clang::syntax::Leaf *syntax::createPunctuation(clang::syntax::Arena &A,
-                                               clang::tok::TokenKind K) {
-  auto Tokens = A.lexBuffer(llvm::MemoryBuffer::getMemBuffer(
-                                clang::tok::getPunctuatorSpelling(K)))
-                    .second;
+syntax::Leaf *clang::syntax::createLeaf(syntax::Arena &A, tok::TokenKind K,
+                                        StringRef Spelling) {
+  auto Tokens =
+      FactoryImpl::lexBuffer(A, llvm::MemoryBuffer::getMemBuffer(Spelling))
+          .second;
   assert(Tokens.size() == 1);
-  assert(Tokens.front().kind() == K);
-  auto *L = new (A.allocator()) clang::syntax::Leaf(Tokens.begin());
-  FactoryImpl::setCanModify(L);
-  L->assertInvariants();
-  return L;
+  assert(Tokens.front().kind() == K &&
+         "spelling is not lexed into the expected kind of token");
+
+  auto *Leaf = new (A.getAllocator()) syntax::Leaf(Tokens.begin());
+  syntax::FactoryImpl::setCanModify(Leaf);
+  Leaf->assertInvariants();
+  return Leaf;
+}
+
+syntax::Leaf *clang::syntax::createLeaf(syntax::Arena &A, tok::TokenKind K) {
+  const auto *Spelling = tok::getPunctuatorSpelling(K);
+  if (!Spelling)
+    Spelling = tok::getKeywordSpelling(K);
+  assert(Spelling &&
+         "Cannot infer the spelling of the token from its token kind.");
+  return createLeaf(A, K, Spelling);
+}
+
+namespace {
+// Allocates the concrete syntax `Tree` according to its `NodeKind`.
+syntax::Tree *allocateTree(syntax::Arena &A, syntax::NodeKind Kind) {
+  switch (Kind) {
+  case syntax::NodeKind::Leaf:
+    assert(false);
+    break;
+  case syntax::NodeKind::TranslationUnit:
+    return new (A.getAllocator()) syntax::TranslationUnit;
+  case syntax::NodeKind::UnknownExpression:
+    return new (A.getAllocator()) syntax::UnknownExpression;
+  case syntax::NodeKind::ParenExpression:
+    return new (A.getAllocator()) syntax::ParenExpression;
+  case syntax::NodeKind::ThisExpression:
+    return new (A.getAllocator()) syntax::ThisExpression;
+  case syntax::NodeKind::IntegerLiteralExpression:
+    return new (A.getAllocator()) syntax::IntegerLiteralExpression;
+  case syntax::NodeKind::CharacterLiteralExpression:
+    return new (A.getAllocator()) syntax::CharacterLiteralExpression;
+  case syntax::NodeKind::FloatingLiteralExpression:
+    return new (A.getAllocator()) syntax::FloatingLiteralExpression;
+  case syntax::NodeKind::StringLiteralExpression:
+    return new (A.getAllocator()) syntax::StringLiteralExpression;
+  case syntax::NodeKind::BoolLiteralExpression:
+    return new (A.getAllocator()) syntax::BoolLiteralExpression;
+  case syntax::NodeKind::CxxNullPtrExpression:
+    return new (A.getAllocator()) syntax::CxxNullPtrExpression;
+  case syntax::NodeKind::IntegerUserDefinedLiteralExpression:
+    return new (A.getAllocator()) syntax::IntegerUserDefinedLiteralExpression;
+  case syntax::NodeKind::FloatUserDefinedLiteralExpression:
+    return new (A.getAllocator()) syntax::FloatUserDefinedLiteralExpression;
+  case syntax::NodeKind::CharUserDefinedLiteralExpression:
+    return new (A.getAllocator()) syntax::CharUserDefinedLiteralExpression;
+  case syntax::NodeKind::StringUserDefinedLiteralExpression:
+    return new (A.getAllocator()) syntax::StringUserDefinedLiteralExpression;
+  case syntax::NodeKind::PrefixUnaryOperatorExpression:
+    return new (A.getAllocator()) syntax::PrefixUnaryOperatorExpression;
+  case syntax::NodeKind::PostfixUnaryOperatorExpression:
+    return new (A.getAllocator()) syntax::PostfixUnaryOperatorExpression;
+  case syntax::NodeKind::BinaryOperatorExpression:
+    return new (A.getAllocator()) syntax::BinaryOperatorExpression;
+  case syntax::NodeKind::UnqualifiedId:
+    return new (A.getAllocator()) syntax::UnqualifiedId;
+  case syntax::NodeKind::IdExpression:
+    return new (A.getAllocator()) syntax::IdExpression;
+  case syntax::NodeKind::CallExpression:
+    return new (A.getAllocator()) syntax::CallExpression;
+  case syntax::NodeKind::UnknownStatement:
+    return new (A.getAllocator()) syntax::UnknownStatement;
+  case syntax::NodeKind::DeclarationStatement:
+    return new (A.getAllocator()) syntax::DeclarationStatement;
+  case syntax::NodeKind::EmptyStatement:
+    return new (A.getAllocator()) syntax::EmptyStatement;
+  case syntax::NodeKind::SwitchStatement:
+    return new (A.getAllocator()) syntax::SwitchStatement;
+  case syntax::NodeKind::CaseStatement:
+    return new (A.getAllocator()) syntax::CaseStatement;
+  case syntax::NodeKind::DefaultStatement:
+    return new (A.getAllocator()) syntax::DefaultStatement;
+  case syntax::NodeKind::IfStatement:
+    return new (A.getAllocator()) syntax::IfStatement;
+  case syntax::NodeKind::ForStatement:
+    return new (A.getAllocator()) syntax::ForStatement;
+  case syntax::NodeKind::WhileStatement:
+    return new (A.getAllocator()) syntax::WhileStatement;
+  case syntax::NodeKind::ContinueStatement:
+    return new (A.getAllocator()) syntax::ContinueStatement;
+  case syntax::NodeKind::BreakStatement:
+    return new (A.getAllocator()) syntax::BreakStatement;
+  case syntax::NodeKind::ReturnStatement:
+    return new (A.getAllocator()) syntax::ReturnStatement;
+  case syntax::NodeKind::RangeBasedForStatement:
+    return new (A.getAllocator()) syntax::RangeBasedForStatement;
+  case syntax::NodeKind::ExpressionStatement:
+    return new (A.getAllocator()) syntax::ExpressionStatement;
+  case syntax::NodeKind::CompoundStatement:
+    return new (A.getAllocator()) syntax::CompoundStatement;
+  case syntax::NodeKind::UnknownDeclaration:
+    return new (A.getAllocator()) syntax::UnknownDeclaration;
+  case syntax::NodeKind::EmptyDeclaration:
+    return new (A.getAllocator()) syntax::EmptyDeclaration;
+  case syntax::NodeKind::StaticAssertDeclaration:
+    return new (A.getAllocator()) syntax::StaticAssertDeclaration;
+  case syntax::NodeKind::LinkageSpecificationDeclaration:
+    return new (A.getAllocator()) syntax::LinkageSpecificationDeclaration;
+  case syntax::NodeKind::SimpleDeclaration:
+    return new (A.getAllocator()) syntax::SimpleDeclaration;
+  case syntax::NodeKind::TemplateDeclaration:
+    return new (A.getAllocator()) syntax::TemplateDeclaration;
+  case syntax::NodeKind::ExplicitTemplateInstantiation:
+    return new (A.getAllocator()) syntax::ExplicitTemplateInstantiation;
+  case syntax::NodeKind::NamespaceDefinition:
+    return new (A.getAllocator()) syntax::NamespaceDefinition;
+  case syntax::NodeKind::NamespaceAliasDefinition:
+    return new (A.getAllocator()) syntax::NamespaceAliasDefinition;
+  case syntax::NodeKind::UsingNamespaceDirective:
+    return new (A.getAllocator()) syntax::UsingNamespaceDirective;
+  case syntax::NodeKind::UsingDeclaration:
+    return new (A.getAllocator()) syntax::UsingDeclaration;
+  case syntax::NodeKind::TypeAliasDeclaration:
+    return new (A.getAllocator()) syntax::TypeAliasDeclaration;
+  case syntax::NodeKind::SimpleDeclarator:
+    return new (A.getAllocator()) syntax::SimpleDeclarator;
+  case syntax::NodeKind::ParenDeclarator:
+    return new (A.getAllocator()) syntax::ParenDeclarator;
+  case syntax::NodeKind::ArraySubscript:
+    return new (A.getAllocator()) syntax::ArraySubscript;
+  case syntax::NodeKind::TrailingReturnType:
+    return new (A.getAllocator()) syntax::TrailingReturnType;
+  case syntax::NodeKind::ParametersAndQualifiers:
+    return new (A.getAllocator()) syntax::ParametersAndQualifiers;
+  case syntax::NodeKind::MemberPointer:
+    return new (A.getAllocator()) syntax::MemberPointer;
+  case syntax::NodeKind::GlobalNameSpecifier:
+    return new (A.getAllocator()) syntax::GlobalNameSpecifier;
+  case syntax::NodeKind::DecltypeNameSpecifier:
+    return new (A.getAllocator()) syntax::DecltypeNameSpecifier;
+  case syntax::NodeKind::IdentifierNameSpecifier:
+    return new (A.getAllocator()) syntax::IdentifierNameSpecifier;
+  case syntax::NodeKind::SimpleTemplateNameSpecifier:
+    return new (A.getAllocator()) syntax::SimpleTemplateNameSpecifier;
+  case syntax::NodeKind::NestedNameSpecifier:
+    return new (A.getAllocator()) syntax::NestedNameSpecifier;
+  case syntax::NodeKind::MemberExpression:
+    return new (A.getAllocator()) syntax::MemberExpression;
+  case syntax::NodeKind::CallArguments:
+    return new (A.getAllocator()) syntax::CallArguments;
+  case syntax::NodeKind::ParameterDeclarationList:
+    return new (A.getAllocator()) syntax::ParameterDeclarationList;
+  }
+  llvm_unreachable("unknown node kind");
+}
+} // namespace
+
+syntax::Tree *clang::syntax::createTree(
+    syntax::Arena &A,
+    std::vector<std::pair<syntax::Node *, syntax::NodeRole>> Children,
+    syntax::NodeKind K) {
+  auto *T = allocateTree(A, K);
+  FactoryImpl::setCanModify(T);
+  for (auto ChildIt = Children.rbegin(); ChildIt != Children.rend();
+       std::advance(ChildIt, 1))
+    FactoryImpl::prependChildLowLevel(T, ChildIt->first, ChildIt->second);
+
+  T->assertInvariants();
+  return T;
 }
 
-clang::syntax::EmptyStatement *
-syntax::createEmptyStatement(clang::syntax::Arena &A) {
-  auto *S = new (A.allocator()) clang::syntax::EmptyStatement;
-  FactoryImpl::setCanModify(S);
-  FactoryImpl::prependChildLowLevel(S, createPunctuation(A, clang::tok::semi),
-                                    NodeRole::Unknown);
-  S->assertInvariants();
-  return S;
+syntax::EmptyStatement *clang::syntax::createEmptyStatement(syntax::Arena &A) {
+  return cast<syntax::EmptyStatement>(
+      createTree(A, {{createLeaf(A, tok::semi), NodeRole::Unknown}},
+                 NodeKind::EmptyStatement));
 }
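
The new factory functions compose: `createLeaf` lexes a spelling into exactly one token, `createTree` allocates the node for a `NodeKind` and attaches the children in order, and `createEmptyStatement` is now a one-liner on top of both. A usage sketch, assuming an already-initialized `syntax::Arena A`:

    // Build the statement `;` by hand, mirroring what createEmptyStatement()
    // does internally.
    syntax::Leaf *Semi = syntax::createLeaf(A, tok::semi);
    syntax::Tree *T = syntax::createTree(
        A, {{Semi, syntax::NodeRole::Unknown}}, syntax::NodeKind::EmptyStatement);
    assert(isa<syntax::EmptyStatement>(T));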
diff --git a/clang/lib/Tooling/Syntax/Tree.cpp b/clang/lib/Tooling/Syntax/Tree.cpp
index 2cef806937bfc..1edd2583105aa 100644
--- a/clang/lib/Tooling/Syntax/Tree.cpp
+++ b/clang/lib/Tooling/Syntax/Tree.cpp
@@ -19,7 +19,7 @@ namespace {
 static void traverse(const syntax::Node *N,
                      llvm::function_ref<void(const syntax::Node *)> Visit) {
   if (auto *T = dyn_cast<syntax::Tree>(N)) {
-    for (auto *C = T->firstChild(); C; C = C->nextSibling())
+    for (const auto *C = T->getFirstChild(); C; C = C->getNextSibling())
       traverse(C, Visit);
   }
   Visit(N);
@@ -36,7 +36,9 @@ syntax::Arena::Arena(SourceManager &SourceMgr, const LangOptions &LangOpts,
                      const TokenBuffer &Tokens)
     : SourceMgr(SourceMgr), LangOpts(LangOpts), Tokens(Tokens) {}
 
-const syntax::TokenBuffer &syntax::Arena::tokenBuffer() const { return Tokens; }
+const syntax::TokenBuffer &syntax::Arena::getTokenBuffer() const {
+  return Tokens;
+}
 
 std::pair<FileID, ArrayRef<syntax::Token>>
 syntax::Arena::lexBuffer(std::unique_ptr<llvm::MemoryBuffer> Input) {
@@ -51,7 +53,7 @@ syntax::Leaf::Leaf(const syntax::Token *Tok) : Node(NodeKind::Leaf), Tok(Tok) {
 }
 
 bool syntax::Leaf::classof(const Node *N) {
-  return N->kind() == NodeKind::Leaf;
+  return N->getKind() == NodeKind::Leaf;
 }
 
 syntax::Node::Node(NodeKind Kind)
@@ -60,16 +62,20 @@ syntax::Node::Node(NodeKind Kind)
   this->setRole(NodeRole::Detached);
 }
 
-bool syntax::Node::isDetached() const { return role() == NodeRole::Detached; }
+bool syntax::Node::isDetached() const {
+  return getRole() == NodeRole::Detached;
+}
 
 void syntax::Node::setRole(NodeRole NR) {
   this->Role = static_cast<unsigned>(NR);
 }
 
-bool syntax::Tree::classof(const Node *N) { return N->kind() > NodeKind::Leaf; }
+bool syntax::Tree::classof(const Node *N) {
+  return N->getKind() > NodeKind::Leaf;
+}
 
 void syntax::Tree::prependChildLowLevel(Node *Child, NodeRole Role) {
-  assert(Child->role() == NodeRole::Detached);
+  assert(Child->getRole() == NodeRole::Detached);
   assert(Role != NodeRole::Detached);
 
   Child->setRole(Role);
@@ -79,7 +85,7 @@ void syntax::Tree::prependChildLowLevel(Node *Child, NodeRole Role) {
 void syntax::Tree::prependChildLowLevel(Node *Child) {
   assert(Child->Parent == nullptr);
   assert(Child->NextSibling == nullptr);
-  assert(Child->role() != NodeRole::Detached);
+  assert(Child->getRole() != NodeRole::Detached);
 
   Child->Parent = this;
   Child->NextSibling = this->FirstChild;
@@ -91,15 +97,15 @@ void syntax::Tree::replaceChildRangeLowLevel(Node *BeforeBegin, Node *End,
   assert(!BeforeBegin || BeforeBegin->Parent == this);
 
 #ifndef NDEBUG
-  for (auto *N = New; N; N = N->nextSibling()) {
+  for (auto *N = New; N; N = N->getNextSibling()) {
     assert(N->Parent == nullptr);
-    assert(N->role() != NodeRole::Detached && "Roles must be set");
+    assert(N->getRole() != NodeRole::Detached && "Roles must be set");
     // FIXME: sanity-check the role.
   }
 #endif
 
   // Detach old nodes.
-  for (auto *N = !BeforeBegin ? FirstChild : BeforeBegin->nextSibling();
+  for (auto *N = !BeforeBegin ? FirstChild : BeforeBegin->getNextSibling();
        N != End;) {
     auto *Next = N->NextSibling;
 
@@ -120,7 +126,7 @@ void syntax::Tree::replaceChildRangeLowLevel(Node *BeforeBegin, Node *End,
 
   if (New) {
     auto *Last = New;
-    for (auto *N = New; N != nullptr; N = N->nextSibling()) {
+    for (auto *N = New; N != nullptr; N = N->getNextSibling()) {
       Last = N;
       N->Parent = this;
     }
@@ -136,7 +142,7 @@ namespace {
 static void dumpLeaf(raw_ostream &OS, const syntax::Leaf *L,
                      const SourceManager &SM) {
   assert(L);
-  const auto *Token = L->token();
+  const auto *Token = L->getToken();
   assert(Token);
   // Handle 'eof' separately, calling text() on it produces an empty string.
   if (Token->kind() == tok::eof)
@@ -148,8 +154,8 @@ static void dumpLeaf(raw_ostream &OS, const syntax::Leaf *L,
 static void dumpNode(raw_ostream &OS, const syntax::Node *N,
                     const SourceManager &SM, std::vector<bool> IndentMask) {
   auto dumpExtraInfo = [&OS](const syntax::Node *N) {
-    if (N->role() != syntax::NodeRole::Unknown)
-      OS << " " << N->role();
+    if (N->getRole() != syntax::NodeRole::Unknown)
+      OS << " " << N->getRole();
     if (!N->isOriginal())
       OS << " synthesized";
     if (!N->canModify())
@@ -167,18 +173,18 @@ static void dumpNode(raw_ostream &OS, const syntax::Node *N,
   }
 
   const auto *T = cast<syntax::Tree>(N);
-  OS << T->kind();
+  OS << T->getKind();
   dumpExtraInfo(N);
   OS << "\n";
 
-  for (const auto *It = T->firstChild(); It; It = It->nextSibling()) {
+  for (const auto *It = T->getFirstChild(); It; It = It->getNextSibling()) {
     for (bool Filled : IndentMask) {
       if (Filled)
         OS << "| ";
       else
         OS << "  ";
     }
-    if (!It->nextSibling()) {
+    if (!It->getNextSibling()) {
       OS << "`-";
       IndentMask.push_back(false);
     } else {
@@ -213,19 +219,32 @@ std::string syntax::Node::dumpTokens(const SourceManager &SM) const {
 void syntax::Node::assertInvariants() const {
 #ifndef NDEBUG
   if (isDetached())
-    assert(parent() == nullptr);
+    assert(getParent() == nullptr);
   else
-    assert(parent() != nullptr);
+    assert(getParent() != nullptr);
 
-  auto *T = dyn_cast<Tree>(this);
+  const auto *T = dyn_cast<Tree>(this);
   if (!T)
     return;
-  for (auto *C = T->firstChild(); C; C = C->nextSibling()) {
+  for (const auto *C = T->getFirstChild(); C; C = C->getNextSibling()) {
     if (T->isOriginal())
       assert(C->isOriginal());
     assert(!C->isDetached());
-    assert(C->parent() == T);
+    assert(C->getParent() == T);
+  }
+
+  const auto *L = dyn_cast<List>(T);
+  if (!L)
+    return;
+  for (const auto *C = T->getFirstChild(); C; C = C->getNextSibling()) {
+    assert(C->getRole() == NodeRole::ListElement ||
+           C->getRole() == NodeRole::ListDelimiter);
+    if (C->getRole() == NodeRole::ListDelimiter) {
+      assert(isa<Leaf>(C));
+      assert(cast<Leaf>(C)->getToken()->kind() == L->getDelimiterTokenKind());
+    }
   }
+
 #endif
 }
 
@@ -235,9 +254,9 @@ void syntax::Node::assertInvariantsRecursive() const {
 #endif
 }
 
-syntax::Leaf *syntax::Tree::firstLeaf() {
+syntax::Leaf *syntax::Tree::findFirstLeaf() {
   auto *T = this;
-  while (auto *C = T->firstChild()) {
+  while (auto *C = T->getFirstChild()) {
     if (auto *L = dyn_cast<syntax::Leaf>(C))
       return L;
     T = cast<syntax::Tree>(C);
@@ -245,11 +264,11 @@ syntax::Leaf *syntax::Tree::firstLeaf() {
   return nullptr;
 }
 
-syntax::Leaf *syntax::Tree::lastLeaf() {
+syntax::Leaf *syntax::Tree::findLastLeaf() {
   auto *T = this;
-  while (auto *C = T->firstChild()) {
+  while (auto *C = T->getFirstChild()) {
     // Find the last child.
-    while (auto *Next = C->nextSibling())
+    while (auto *Next = C->getNextSibling())
       C = Next;
 
     if (auto *L = dyn_cast<syntax::Leaf>(C))
@@ -260,22 +279,33 @@ syntax::Leaf *syntax::Tree::lastLeaf() {
 }
 
 syntax::Node *syntax::Tree::findChild(NodeRole R) {
-  for (auto *C = FirstChild; C; C = C->nextSibling()) {
-    if (C->role() == R)
+  for (auto *C = FirstChild; C; C = C->getNextSibling()) {
+    if (C->getRole() == R)
       return C;
   }
   return nullptr;
 }
 
+bool syntax::List::classof(const syntax::Node *N) {
+  switch (N->getKind()) {
+  case syntax::NodeKind::NestedNameSpecifier:
+  case syntax::NodeKind::CallArguments:
+  case syntax::NodeKind::ParameterDeclarationList:
+    return true;
+  default:
+    return false;
+  }
+}
+
 std::vector<syntax::List::ElementAndDelimiter<syntax::Node>>
 syntax::List::getElementsAsNodesAndDelimiters() {
-  if (!firstChild())
+  if (!getFirstChild())
     return {};
 
   auto children = std::vector<syntax::List::ElementAndDelimiter<Node>>();
   syntax::Node *elementWithoutDelimiter = nullptr;
-  for (auto *C = firstChild(); C; C = C->nextSibling()) {
-    switch (C->role()) {
+  for (auto *C = getFirstChild(); C; C = C->getNextSibling()) {
+    switch (C->getRole()) {
     case syntax::NodeRole::ListElement: {
       if (elementWithoutDelimiter) {
         children.push_back({elementWithoutDelimiter, nullptr});
@@ -314,13 +344,13 @@ syntax::List::getElementsAsNodesAndDelimiters() {
 // Almost the same implementation of `getElementsAsNodesAndDelimiters` but
 // ignoring delimiters
 std::vector<syntax::Node *> syntax::List::getElementsAsNodes() {
-  if (!firstChild())
+  if (!getFirstChild())
     return {};
 
   auto children = std::vector<syntax::Node *>();
   syntax::Node *elementWithoutDelimiter = nullptr;
-  for (auto *C = firstChild(); C; C = C->nextSibling()) {
-    switch (C->role()) {
+  for (auto *C = getFirstChild(); C; C = C->getNextSibling()) {
+    switch (C->getRole()) {
     case syntax::NodeRole::ListElement: {
       if (elementWithoutDelimiter) {
         children.push_back(elementWithoutDelimiter);
@@ -355,12 +385,12 @@ std::vector syntax::List::getElementsAsNodes() {
   return children;
 }
 
-clang::tok::TokenKind syntax::List::getDelimiterTokenKind() {
-  switch (this->kind()) {
+clang::tok::TokenKind syntax::List::getDelimiterTokenKind() const {
+  switch (this->getKind()) {
   case NodeKind::NestedNameSpecifier:
     return clang::tok::coloncolon;
   case NodeKind::CallArguments:
-  case NodeKind::ParametersAndQualifiers:
+  case NodeKind::ParameterDeclarationList:
     return clang::tok::comma;
   default:
     llvm_unreachable("This is not a subclass of List, thus "
@@ -368,12 +398,12 @@ clang::tok::TokenKind syntax::List::getDelimiterTokenKind() {
   }
 }
 
-syntax::List::TerminationKind syntax::List::getTerminationKind() {
-  switch (this->kind()) {
+syntax::List::TerminationKind syntax::List::getTerminationKind() const {
+  switch (this->getKind()) {
   case NodeKind::NestedNameSpecifier:
     return TerminationKind::Terminated;
   case NodeKind::CallArguments:
-  case NodeKind::ParametersAndQualifiers:
+  case NodeKind::ParameterDeclarationList:
     return TerminationKind::Separated;
   default:
     llvm_unreachable("This is not a subclass of List, thus "
@@ -381,13 +411,13 @@ syntax::List::TerminationKind syntax::List::getTerminationKind() {
   }
 }
 
-bool syntax::List::canBeEmpty() {
-  switch (this->kind()) {
+bool syntax::List::canBeEmpty() const {
+  switch (this->getKind()) {
   case NodeKind::NestedNameSpecifier:
     return false;
   case NodeKind::CallArguments:
     return true;
-  case NodeKind::ParametersAndQualifiers:
+  case NodeKind::ParameterDeclarationList:
     return true;
   default:
     llvm_unreachable("This is not a subclass of List, thus canBeEmpty() "
diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp
index 1ee8ce28c2efa..b0d3f5caf67a3 100644
--- a/clang/lib/Tooling/Tooling.cpp
+++ b/clang/lib/Tooling/Tooling.cpp
@@ -78,7 +78,7 @@ newDriver(DiagnosticsEngine *Diagnostics, const char *BinaryName,
           IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS) {
   driver::Driver *CompilerDriver =
       new driver::Driver(BinaryName, llvm::sys::getDefaultTargetTriple(),
-                         *Diagnostics, std::move(VFS));
+                         *Diagnostics, "clang LLVM compiler", std::move(VFS));
   CompilerDriver->setTitle("clang_based_tool");
   return CompilerDriver;
 }
diff --git a/clang/lib/Tooling/Transformer/Parsing.cpp b/clang/lib/Tooling/Transformer/Parsing.cpp
index fb5fd4a800bbb..66fa04a15594a 100644
--- a/clang/lib/Tooling/Transformer/Parsing.cpp
+++ b/clang/lib/Tooling/Transformer/Parsing.cpp
@@ -148,7 +148,7 @@ static ParseState advance(ParseState S, size_t N) {
 }
 
 static StringRef consumeWhitespace(StringRef S) {
-  return S.drop_while([](char c) { return c >= 0 && isWhitespace(c); });
+  return S.drop_while([](char c) { return isASCII(c) && isWhitespace(c); });
 }
 
 // Parses a single expected character \c c from \c State, skipping preceding
 static ExpectedProgress<llvm::NoneType> parseChar(char c, ParseState State) {
 static ExpectedProgress<std::string> parseId(ParseState State) {
   State.Input = consumeWhitespace(State.Input);
   auto Id = State.Input.take_while(
-      [](char c) { return c >= 0 && isIdentifierBody(c); });
+      [](char c) { return isASCII(c) && isIdentifierBody(c); });
   if (Id.empty())
     return makeParseError(State, "failed to parse name");
   return makeParseProgress(advance(State, Id.size()), Id.str());
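
The switch from `c >= 0` to `isASCII(c)` keeps behavior on signed-char targets while avoiding a comparison that is tautologically true (and warns) where `char` is unsigned; both guards restrict `isWhitespace`/`isIdentifierBody` to plain ASCII. A small illustration (not from the patch):

    // '\xC3' starts a multi-byte UTF-8 sequence. Where `char` is unsigned,
    // the old `c >= 0` guard would always pass; isASCII() expresses the
    // intended check on every target.
    char C = '\xC3';
    bool InName = clang::isASCII(C) && clang::isIdentifierBody(C); // false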
diff --git a/clang/test/AST/ast-dump-attr.cpp b/clang/test/AST/ast-dump-attr.cpp
index 95491a02f8b2d..c2bd768dc2adf 100644
--- a/clang/test/AST/ast-dump-attr.cpp
+++ b/clang/test/AST/ast-dump-attr.cpp
@@ -119,6 +119,7 @@ namespace Test {
 extern "C" int printf(const char *format, ...);
 // CHECK: FunctionDecl{{.*}}printf
 // CHECK-NEXT: ParmVarDecl{{.*}}format{{.*}}'const char *'
+// CHECK-NEXT: BuiltinAttr{{.*}}Implicit
 // CHECK-NEXT: FormatAttr{{.*}}Implicit printf 1 2
 
 alignas(8) extern int x;
diff --git a/clang/test/AST/ast-dump-fpfeatures.cpp b/clang/test/AST/ast-dump-fpfeatures.cpp
index f3925aebbe752..01af3a8fd7e9c 100644
--- a/clang/test/AST/ast-dump-fpfeatures.cpp
+++ b/clang/test/AST/ast-dump-fpfeatures.cpp
@@ -36,8 +36,49 @@ float func_03(float x) {
 // CHECK-NEXT:       ReturnStmt
 // CHECK-NEXT:         CallExpr {{.*}} FPContractMode=0
 
+int func_04(float x) {
+#pragma STDC FP_CONTRACT ON
+  return x;
+}
+
+// CHECK:      FunctionDecl {{.*}} func_04 'int (float)'
+// CHECK-NEXT:   ParmVarDecl {{.*}} x 'float'
+// CHECK-NEXT:   CompoundStmt
+// CHECK-NEXT:     ReturnStmt
+// CHECK-NEXT:       ImplicitCastExpr {{.*}} 'int'  FPContractMode=1
 
+float func_05(double x) {
+#pragma STDC FP_CONTRACT ON
+  return (float)x;
+}
 
+// CHECK:      FunctionDecl {{.*}} func_05 'float (double)'
+// CHECK-NEXT:   ParmVarDecl {{.*}} x 'double'
+// CHECK-NEXT:   CompoundStmt
+// CHECK-NEXT:     ReturnStmt
+// CHECK-NEXT:       CStyleCastExpr {{.*}} FPContractMode=1
+
+float func_06(double x) {
+#pragma STDC FP_CONTRACT ON
+  return float(x);
+}
+
+// CHECK:      FunctionDecl {{.*}} func_06 'float (double)'
+// CHECK-NEXT:   ParmVarDecl {{.*}} x 'double'
+// CHECK-NEXT:   CompoundStmt
+// CHECK-NEXT:     ReturnStmt
+// CHECK-NEXT:       CXXFunctionalCastExpr {{.*}} FPContractMode=1
+
+float func_07(double x) {
+#pragma STDC FP_CONTRACT ON
+  return static_cast<float>(x);
+}
+
+// CHECK:      FunctionDecl {{.*}} func_07 'float (double)'
+// CHECK-NEXT:   ParmVarDecl {{.*}} x 'double'
+// CHECK-NEXT:   CompoundStmt
+// CHECK-NEXT:     ReturnStmt
+// CHECK-NEXT:       CXXStaticCastExpr {{.*}} FPContractMode=1
 
 #pragma STDC FENV_ROUND FE_DOWNWARD
 
@@ -87,7 +128,7 @@ T func_14(T x, T y) {
 }
 
 float func_15(float x, float y) {
-#pragma STDC FPENV_ROUND FE_DOWNWARD
+#pragma STDC FENV_ROUND FE_DOWNWARD
   return func_14(x, y);
 }
 
diff --git a/clang/test/AST/ast-dump-openmp-begin-declare-variant-varying-return.c b/clang/test/AST/ast-dump-openmp-begin-declare-variant-varying-return.c
new file mode 100644
index 0000000000000..dd81e2ee98c17
--- /dev/null
+++ b/clang/test/AST/ast-dump-openmp-begin-declare-variant-varying-return.c
@@ -0,0 +1,401 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -verify -ast-dump %s        -DUSE_FLOAT | FileCheck %s --check-prefix=C_FLOAT
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -verify -ast-dump %s -x c++ -DUSE_FLOAT | FileCheck %s --check-prefix=CXX_FLOAT
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -verify -ast-dump %s                    | FileCheck %s --check-prefix=C_INT
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -verify -ast-dump %s -x c++             | FileCheck %s --check-prefix=CXX_INT
+// expected-no-diagnostics
+
+#ifdef __cplusplus
+#define OVERLOADABLE
+#else
+#define OVERLOADABLE __attribute__((overloadable))
+#endif
+
+#ifdef USE_FLOAT
+#define RETURN_TY float
+#define BEFORE_BASE_RETURN_VALUE 0
+#define BEFORE_VARIANT_RETURN_VALUE 1
+#define AFTER__BASE_RETURN_VALUE 1
+#define AFTER__VARIANT_RETURN_VALUE 0
+#else
+#define RETURN_TY int
+#define BEFORE_BASE_RETURN_VALUE 1
+#define BEFORE_VARIANT_RETURN_VALUE 0
+#define AFTER__BASE_RETURN_VALUE 0
+#define AFTER__VARIANT_RETURN_VALUE 1
+#endif
+
+OVERLOADABLE
+RETURN_TY also_before(void) {
+  return BEFORE_BASE_RETURN_VALUE;
+}
+OVERLOADABLE
+RETURN_TY also_before(int i) {
+  return BEFORE_BASE_RETURN_VALUE;
+}
+
+#pragma omp begin declare variant match(implementation = {extension(disable_implicit_base)})
+OVERLOADABLE
+int also_before(void) {
+  return BEFORE_VARIANT_RETURN_VALUE;
+}
+OVERLOADABLE
+int also_before(int i) {
+  return BEFORE_VARIANT_RETURN_VALUE;
+}
+
+OVERLOADABLE
+int also_after(double d) {
+  return AFTER__VARIANT_RETURN_VALUE;
+}
+OVERLOADABLE
+int also_after(long l) {
+  return AFTER__VARIANT_RETURN_VALUE;
+}
+#pragma omp end declare variant
+
+OVERLOADABLE
+RETURN_TY also_after(double d) {
+  return AFTER__BASE_RETURN_VALUE;
+}
+OVERLOADABLE
+RETURN_TY also_after(long l) {
+  return AFTER__BASE_RETURN_VALUE;
+}
+
+int main() {
+  // Should return 0.
+  return also_before() + also_before(1) + also_before(2.0f) + also_after(3.0) + also_after(4L);
+}
+
+// Make sure we see base calls in the FLOAT versions, that is, no
+// PseudoObjectExpr in those. In the INT versions we want PseudoObjectExpr (=
+// variant calls) for the `*_before` functions but not for the `*_after` ones
+// (the first three calls vs. the last two).
+
+// C_FLOAT:      |-FunctionDecl [[ADDR_0:0x[a-z0-9]*]] <{{.*}}, line:30:1> line:28:11 used also_before 'float ({{.*}})'
+// C_FLOAT-NEXT: | |-CompoundStmt [[ADDR_1:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: | | `-ReturnStmt [[ADDR_2:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: | |   `-ImplicitCastExpr [[ADDR_3:0x[a-z0-9]*]]  'float' 
+// C_FLOAT-NEXT: | |     `-IntegerLiteral [[ADDR_4:0x[a-z0-9]*]]  'int' 0
+// C_FLOAT-NEXT: | `-OverloadableAttr [[ADDR_5:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: |-FunctionDecl [[ADDR_6:0x[a-z0-9]*]]  line:32:11 used also_before 'float (int)'
+// C_FLOAT-NEXT: | |-ParmVarDecl [[ADDR_7:0x[a-z0-9]*]]  col:27 i 'int'
+// C_FLOAT-NEXT: | |-CompoundStmt [[ADDR_8:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: | | `-ReturnStmt [[ADDR_9:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: | |   `-ImplicitCastExpr [[ADDR_10:0x[a-z0-9]*]]  'float' 
+// C_FLOAT-NEXT: | |     `-IntegerLiteral [[ADDR_11:0x[a-z0-9]*]]  'int' 0
+// C_FLOAT-NEXT: | `-OverloadableAttr [[ADDR_12:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: |-FunctionDecl [[ADDR_13:0x[a-z0-9]*]]  line:10:22 also_before[implementation={extension(disable_implicit_base)}] 'int ({{.*}})'
+// C_FLOAT-NEXT: | |-CompoundStmt [[ADDR_14:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: | | `-ReturnStmt [[ADDR_15:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: | |   `-IntegerLiteral [[ADDR_16:0x[a-z0-9]*]]  'int' 1
+// C_FLOAT-NEXT: | `-OverloadableAttr [[ADDR_17:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: |-FunctionDecl [[ADDR_18:0x[a-z0-9]*]]  line:10:22 also_before[implementation={extension(disable_implicit_base)}] 'int (int)'
+// C_FLOAT-NEXT: | |-ParmVarDecl [[ADDR_19:0x[a-z0-9]*]]  col:21 i 'int'
+// C_FLOAT-NEXT: | |-CompoundStmt [[ADDR_20:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: | | `-ReturnStmt [[ADDR_21:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: | |   `-IntegerLiteral [[ADDR_22:0x[a-z0-9]*]]  'int' 1
+// C_FLOAT-NEXT: | `-OverloadableAttr [[ADDR_23:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: |-FunctionDecl [[ADDR_24:0x[a-z0-9]*]]  line:10:22 also_after[implementation={extension(disable_implicit_base)}] 'int (double)'
+// C_FLOAT-NEXT: | |-ParmVarDecl [[ADDR_25:0x[a-z0-9]*]]  col:23 d 'double'
+// C_FLOAT-NEXT: | |-CompoundStmt [[ADDR_26:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: | | `-ReturnStmt [[ADDR_27:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: | |   `-IntegerLiteral [[ADDR_28:0x[a-z0-9]*]]  'int' 0
+// C_FLOAT-NEXT: | `-OverloadableAttr [[ADDR_29:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: |-FunctionDecl [[ADDR_30:0x[a-z0-9]*]]  line:10:22 also_after[implementation={extension(disable_implicit_base)}] 'int (long)'
+// C_FLOAT-NEXT: | |-ParmVarDecl [[ADDR_31:0x[a-z0-9]*]]  col:21 l 'long'
+// C_FLOAT-NEXT: | |-CompoundStmt [[ADDR_32:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: | | `-ReturnStmt [[ADDR_33:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: | |   `-IntegerLiteral [[ADDR_34:0x[a-z0-9]*]]  'int' 0
+// C_FLOAT-NEXT: | `-OverloadableAttr [[ADDR_35:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: |-FunctionDecl [[ADDR_36:0x[a-z0-9]*]]  line:57:11 used also_after 'float (double)'
+// C_FLOAT-NEXT: | |-ParmVarDecl [[ADDR_37:0x[a-z0-9]*]]  col:29 d 'double'
+// C_FLOAT-NEXT: | |-CompoundStmt [[ADDR_38:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: | | `-ReturnStmt [[ADDR_39:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: | |   `-ImplicitCastExpr [[ADDR_40:0x[a-z0-9]*]]  'float' 
+// C_FLOAT-NEXT: | |     `-IntegerLiteral [[ADDR_41:0x[a-z0-9]*]]  'int' 1
+// C_FLOAT-NEXT: | `-OverloadableAttr [[ADDR_42:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: |-FunctionDecl [[ADDR_43:0x[a-z0-9]*]]  line:61:11 used also_after 'float (long)'
+// C_FLOAT-NEXT: | |-ParmVarDecl [[ADDR_44:0x[a-z0-9]*]]  col:27 l 'long'
+// C_FLOAT-NEXT: | |-CompoundStmt [[ADDR_45:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: | | `-ReturnStmt [[ADDR_46:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: | |   `-ImplicitCastExpr [[ADDR_47:0x[a-z0-9]*]]  'float' 
+// C_FLOAT-NEXT: | |     `-IntegerLiteral [[ADDR_48:0x[a-z0-9]*]]  'int' 1
+// C_FLOAT-NEXT: | `-OverloadableAttr [[ADDR_49:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT: `-FunctionDecl [[ADDR_50:0x[a-z0-9]*]]  line:65:5 main 'int ({{.*}})'
+// C_FLOAT-NEXT:   `-CompoundStmt [[ADDR_51:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT:     `-ReturnStmt [[ADDR_52:0x[a-z0-9]*]] 
+// C_FLOAT-NEXT:       `-ImplicitCastExpr [[ADDR_53:0x[a-z0-9]*]]  'int' 
+// C_FLOAT-NEXT:         `-BinaryOperator [[ADDR_54:0x[a-z0-9]*]]  'float' '+'
+// C_FLOAT-NEXT:           |-BinaryOperator [[ADDR_55:0x[a-z0-9]*]]  'float' '+'
+// C_FLOAT-NEXT:           | |-BinaryOperator [[ADDR_56:0x[a-z0-9]*]]  'float' '+'
+// C_FLOAT-NEXT:           | | |-BinaryOperator [[ADDR_57:0x[a-z0-9]*]]  'float' '+'
+// C_FLOAT-NEXT:           | | | |-CallExpr [[ADDR_58:0x[a-z0-9]*]]  'float'
+// C_FLOAT-NEXT:           | | | | `-ImplicitCastExpr [[ADDR_59:0x[a-z0-9]*]]  'float (*)({{.*}})' 
+// C_FLOAT-NEXT:           | | | |   `-DeclRefExpr [[ADDR_60:0x[a-z0-9]*]]  'float ({{.*}})' {{.*}}Function [[ADDR_0]] 'also_before' 'float ({{.*}})'
+// C_FLOAT-NEXT:           | | | `-CallExpr [[ADDR_61:0x[a-z0-9]*]]  'float'
+// C_FLOAT-NEXT:           | | |   |-ImplicitCastExpr [[ADDR_62:0x[a-z0-9]*]]  'float (*)(int)' 
+// C_FLOAT-NEXT:           | | |   | `-DeclRefExpr [[ADDR_63:0x[a-z0-9]*]]  'float (int)' {{.*}}Function [[ADDR_6]] 'also_before' 'float (int)'
+// C_FLOAT-NEXT:           | | |   `-IntegerLiteral [[ADDR_64:0x[a-z0-9]*]]  'int' 1
+// C_FLOAT-NEXT:           | | `-CallExpr [[ADDR_65:0x[a-z0-9]*]]  'float'
+// C_FLOAT-NEXT:           | |   |-ImplicitCastExpr [[ADDR_66:0x[a-z0-9]*]]  'float (*)(int)' 
+// C_FLOAT-NEXT:           | |   | `-DeclRefExpr [[ADDR_67:0x[a-z0-9]*]]  'float (int)' {{.*}}Function [[ADDR_6]] 'also_before' 'float (int)'
+// C_FLOAT-NEXT:           | |   `-ImplicitCastExpr [[ADDR_68:0x[a-z0-9]*]]  'int' 
+// C_FLOAT-NEXT:           | |     `-FloatingLiteral [[ADDR_69:0x[a-z0-9]*]]  'float' 2.000000e+00
+// C_FLOAT-NEXT:           | `-CallExpr [[ADDR_70:0x[a-z0-9]*]]  'float'
+// C_FLOAT-NEXT:           |   |-ImplicitCastExpr [[ADDR_71:0x[a-z0-9]*]]  'float (*)(double)' 
+// C_FLOAT-NEXT:           |   | `-DeclRefExpr [[ADDR_72:0x[a-z0-9]*]]  'float (double)' {{.*}}Function [[ADDR_36]] 'also_after' 'float (double)'
+// C_FLOAT-NEXT:           |   `-FloatingLiteral [[ADDR_73:0x[a-z0-9]*]]  'double' 3.000000e+00
+// C_FLOAT-NEXT:           `-CallExpr [[ADDR_74:0x[a-z0-9]*]]  'float'
+// C_FLOAT-NEXT:             |-ImplicitCastExpr [[ADDR_75:0x[a-z0-9]*]]  'float (*)(long)' 
+// C_FLOAT-NEXT:             | `-DeclRefExpr [[ADDR_76:0x[a-z0-9]*]]  'float (long)' {{.*}}Function [[ADDR_43]] 'also_after' 'float (long)'
+// C_FLOAT-NEXT:             `-IntegerLiteral [[ADDR_77:0x[a-z0-9]*]]  'long' 4
+
+// CXX_FLOAT:      |-FunctionDecl [[ADDR_0:0x[a-z0-9]*]] <{{.*}}, line:30:1> line:28:11 used also_before 'float ({{.*}})'
+// CXX_FLOAT-NEXT: | `-CompoundStmt [[ADDR_1:0x[a-z0-9]*]] 
+// CXX_FLOAT-NEXT: |   `-ReturnStmt [[ADDR_2:0x[a-z0-9]*]] 
+// CXX_FLOAT-NEXT: |     `-ImplicitCastExpr [[ADDR_3:0x[a-z0-9]*]]  'float' 
+// CXX_FLOAT-NEXT: |       `-IntegerLiteral [[ADDR_4:0x[a-z0-9]*]]  'int' 0
+// CXX_FLOAT-NEXT: |-FunctionDecl [[ADDR_5:0x[a-z0-9]*]]  line:32:11 used also_before 'float (int)'
+// CXX_FLOAT-NEXT: | |-ParmVarDecl [[ADDR_6:0x[a-z0-9]*]]  col:27 i 'int'
+// CXX_FLOAT-NEXT: | `-CompoundStmt [[ADDR_7:0x[a-z0-9]*]] 
+// CXX_FLOAT-NEXT: |   `-ReturnStmt [[ADDR_8:0x[a-z0-9]*]] 
+// CXX_FLOAT-NEXT: |     `-ImplicitCastExpr [[ADDR_9:0x[a-z0-9]*]]  'float' 
+// CXX_FLOAT-NEXT: |       `-IntegerLiteral [[ADDR_10:0x[a-z0-9]*]]  'int' 0
+// CXX_FLOAT-NEXT: |-FunctionDecl [[ADDR_11:0x[a-z0-9]*]]  line:38:1 also_before[implementation={extension(disable_implicit_base)}] 'int ({{.*}})'
+// CXX_FLOAT-NEXT: | `-CompoundStmt [[ADDR_12:0x[a-z0-9]*]] 
+// CXX_FLOAT-NEXT: |   `-ReturnStmt [[ADDR_13:0x[a-z0-9]*]] 
+// CXX_FLOAT-NEXT: |     `-IntegerLiteral [[ADDR_14:0x[a-z0-9]*]]  'int' 1
+// CXX_FLOAT-NEXT: |-FunctionDecl [[ADDR_15:0x[a-z0-9]*]]  line:42:1 also_before[implementation={extension(disable_implicit_base)}] 'int (int)'
+// CXX_FLOAT-NEXT: | |-ParmVarDecl [[ADDR_16:0x[a-z0-9]*]]  col:21 i 'int'
+// CXX_FLOAT-NEXT: | `-CompoundStmt [[ADDR_17:0x[a-z0-9]*]] 
+// CXX_FLOAT-NEXT: |   `-ReturnStmt [[ADDR_18:0x[a-z0-9]*]] 
+// CXX_FLOAT-NEXT: |     `-IntegerLiteral [[ADDR_19:0x[a-z0-9]*]]  'int' 1
+// CXX_FLOAT-NEXT: |-FunctionDecl [[ADDR_20:0x[a-z0-9]*]]  line:47:1 also_after[implementation={extension(disable_implicit_base)}] 'int (double)'
+// CXX_FLOAT-NEXT: | |-ParmVarDecl [[ADDR_21:0x[a-z0-9]*]]  col:23 d 'double'
+// CXX_FLOAT-NEXT: | `-CompoundStmt [[ADDR_22:0x[a-z0-9]*]] 
+// CXX_FLOAT-NEXT: |   `-ReturnStmt [[ADDR_23:0x[a-z0-9]*]] 
+// CXX_FLOAT-NEXT: |     `-IntegerLiteral [[ADDR_24:0x[a-z0-9]*]]  'int' 0
+// CXX_FLOAT-NEXT: |-FunctionDecl [[ADDR_25:0x[a-z0-9]*]]  line:51:1 also_after[implementation={extension(disable_implicit_base)}] 'int (long)'
+// CXX_FLOAT-NEXT: | |-ParmVarDecl [[ADDR_26:0x[a-z0-9]*]]  col:21 l 'long'
+// CXX_FLOAT-NEXT: | `-CompoundStmt [[ADDR_27:0x[a-z0-9]*]] 
+// CXX_FLOAT-NEXT: |   `-ReturnStmt [[ADDR_28:0x[a-z0-9]*]] 
+// CXX_FLOAT-NEXT: |     `-IntegerLiteral [[ADDR_29:0x[a-z0-9]*]]  'int' 0
+// CXX_FLOAT-NEXT: |-FunctionDecl [[ADDR_30:0x[a-z0-9]*]]  line:57:11 used also_after 'float (double)'
+// CXX_FLOAT-NEXT: | |-ParmVarDecl [[ADDR_31:0x[a-z0-9]*]]  col:29 d 'double'
+// CXX_FLOAT-NEXT: | `-CompoundStmt [[ADDR_32:0x[a-z0-9]*]] 
+// CXX_FLOAT-NEXT: |   `-ReturnStmt [[ADDR_33:0x[a-z0-9]*]] 
+// CXX_FLOAT-NEXT: |     `-ImplicitCastExpr [[ADDR_34:0x[a-z0-9]*]]  'float' 
+// CXX_FLOAT-NEXT: |       `-IntegerLiteral [[ADDR_35:0x[a-z0-9]*]]  'int' 1
+// CXX_FLOAT-NEXT: |-FunctionDecl [[ADDR_36:0x[a-z0-9]*]]  line:61:11 used also_after 'float (long)'
+// CXX_FLOAT-NEXT: | |-ParmVarDecl [[ADDR_37:0x[a-z0-9]*]]  col:27 l 'long'
+// CXX_FLOAT-NEXT: | `-CompoundStmt [[ADDR_38:0x[a-z0-9]*]] 
+// CXX_FLOAT-NEXT: |   `-ReturnStmt [[ADDR_39:0x[a-z0-9]*]] 
+// CXX_FLOAT-NEXT: |     `-ImplicitCastExpr [[ADDR_40:0x[a-z0-9]*]]  'float' 
+// CXX_FLOAT-NEXT: |       `-IntegerLiteral [[ADDR_41:0x[a-z0-9]*]]  'int' 1
+// CXX_FLOAT-NEXT: `-FunctionDecl [[ADDR_42:0x[a-z0-9]*]]  line:65:5 main 'int ({{.*}})'
+// CXX_FLOAT-NEXT:   `-CompoundStmt [[ADDR_43:0x[a-z0-9]*]] 
+// CXX_FLOAT-NEXT:     `-ReturnStmt [[ADDR_44:0x[a-z0-9]*]] 
+// CXX_FLOAT-NEXT:       `-ImplicitCastExpr [[ADDR_45:0x[a-z0-9]*]]  'int' 
+// CXX_FLOAT-NEXT:         `-BinaryOperator [[ADDR_46:0x[a-z0-9]*]]  'float' '+'
+// CXX_FLOAT-NEXT:           |-BinaryOperator [[ADDR_47:0x[a-z0-9]*]]  'float' '+'
+// CXX_FLOAT-NEXT:           | |-BinaryOperator [[ADDR_48:0x[a-z0-9]*]]  'float' '+'
+// CXX_FLOAT-NEXT:           | | |-BinaryOperator [[ADDR_49:0x[a-z0-9]*]]  'float' '+'
+// CXX_FLOAT-NEXT:           | | | |-CallExpr [[ADDR_50:0x[a-z0-9]*]]  'float'
+// CXX_FLOAT-NEXT:           | | | | `-ImplicitCastExpr [[ADDR_51:0x[a-z0-9]*]]  'float (*)({{.*}})' 
+// CXX_FLOAT-NEXT:           | | | |   `-DeclRefExpr [[ADDR_52:0x[a-z0-9]*]]  'float ({{.*}})' {{.*}}Function [[ADDR_0]] 'also_before' 'float ({{.*}})'
+// CXX_FLOAT-NEXT:           | | | `-CallExpr [[ADDR_53:0x[a-z0-9]*]]  'float'
+// CXX_FLOAT-NEXT:           | | |   |-ImplicitCastExpr [[ADDR_54:0x[a-z0-9]*]]  'float (*)(int)' 
+// CXX_FLOAT-NEXT:           | | |   | `-DeclRefExpr [[ADDR_55:0x[a-z0-9]*]]  'float (int)' {{.*}}Function [[ADDR_5]] 'also_before' 'float (int)'
+// CXX_FLOAT-NEXT:           | | |   `-IntegerLiteral [[ADDR_56:0x[a-z0-9]*]]  'int' 1
+// CXX_FLOAT-NEXT:           | | `-CallExpr [[ADDR_57:0x[a-z0-9]*]]  'float'
+// CXX_FLOAT-NEXT:           | |   |-ImplicitCastExpr [[ADDR_58:0x[a-z0-9]*]]  'float (*)(int)' 
+// CXX_FLOAT-NEXT:           | |   | `-DeclRefExpr [[ADDR_59:0x[a-z0-9]*]]  'float (int)' {{.*}}Function [[ADDR_5]] 'also_before' 'float (int)'
+// CXX_FLOAT-NEXT:           | |   `-ImplicitCastExpr [[ADDR_60:0x[a-z0-9]*]]  'int' 
+// CXX_FLOAT-NEXT:           | |     `-FloatingLiteral [[ADDR_61:0x[a-z0-9]*]]  'float' 2.000000e+00
+// CXX_FLOAT-NEXT:           | `-CallExpr [[ADDR_62:0x[a-z0-9]*]]  'float'
+// CXX_FLOAT-NEXT:           |   |-ImplicitCastExpr [[ADDR_63:0x[a-z0-9]*]]  'float (*)(double)' 
+// CXX_FLOAT-NEXT:           |   | `-DeclRefExpr [[ADDR_64:0x[a-z0-9]*]]  'float (double)' {{.*}}Function [[ADDR_30]] 'also_after' 'float (double)'
+// CXX_FLOAT-NEXT:           |   `-FloatingLiteral [[ADDR_65:0x[a-z0-9]*]]  'double' 3.000000e+00
+// CXX_FLOAT-NEXT:           `-CallExpr [[ADDR_66:0x[a-z0-9]*]]  'float'
+// CXX_FLOAT-NEXT:             |-ImplicitCastExpr [[ADDR_67:0x[a-z0-9]*]]  'float (*)(long)' 
+// CXX_FLOAT-NEXT:             | `-DeclRefExpr [[ADDR_68:0x[a-z0-9]*]]  'float (long)' {{.*}}Function [[ADDR_36]] 'also_after' 'float (long)'
+// CXX_FLOAT-NEXT:             `-IntegerLiteral [[ADDR_69:0x[a-z0-9]*]]  'long' 4
+
+// C_INT:      |-FunctionDecl [[ADDR_0:0x[a-z0-9]*]] <{{.*}}, line:30:1> line:28:11 used also_before 'int ({{.*}})'
+// C_INT-NEXT: | |-CompoundStmt [[ADDR_1:0x[a-z0-9]*]] 
+// C_INT-NEXT: | | `-ReturnStmt [[ADDR_2:0x[a-z0-9]*]] 
+// C_INT-NEXT: | |   `-IntegerLiteral [[ADDR_3:0x[a-z0-9]*]]  'int' 1
+// C_INT-NEXT: | |-OverloadableAttr [[ADDR_4:0x[a-z0-9]*]] 
+// C_INT-NEXT: | `-OMPDeclareVariantAttr [[ADDR_5:0x[a-z0-9]*]] <> Implicit implementation={extension(disable_implicit_base)}
+// C_INT-NEXT: |   `-DeclRefExpr [[ADDR_6:0x[a-z0-9]*]]  'int ({{.*}})' Function [[ADDR_7:0x[a-z0-9]*]] 'also_before[implementation={extension(disable_implicit_base)}]' 'int ({{.*}})'
+// C_INT-NEXT: |-FunctionDecl [[ADDR_8:0x[a-z0-9]*]]  line:32:11 used also_before 'int (int)'
+// C_INT-NEXT: | |-ParmVarDecl [[ADDR_9:0x[a-z0-9]*]]  col:27 i 'int'
+// C_INT-NEXT: | |-CompoundStmt [[ADDR_10:0x[a-z0-9]*]] 
+// C_INT-NEXT: | | `-ReturnStmt [[ADDR_11:0x[a-z0-9]*]] 
+// C_INT-NEXT: | |   `-IntegerLiteral [[ADDR_12:0x[a-z0-9]*]]  'int' 1
+// C_INT-NEXT: | |-OverloadableAttr [[ADDR_13:0x[a-z0-9]*]] 
+// C_INT-NEXT: | `-OMPDeclareVariantAttr [[ADDR_14:0x[a-z0-9]*]] <> Implicit implementation={extension(disable_implicit_base)}
+// C_INT-NEXT: |   `-DeclRefExpr [[ADDR_15:0x[a-z0-9]*]]  'int (int)' Function [[ADDR_16:0x[a-z0-9]*]] 'also_before[implementation={extension(disable_implicit_base)}]' 'int (int)'
+// C_INT-NEXT: |-FunctionDecl [[ADDR_7]]  line:10:22 also_before[implementation={extension(disable_implicit_base)}] 'int ({{.*}})'
+// C_INT-NEXT: | |-CompoundStmt [[ADDR_17:0x[a-z0-9]*]] 
+// C_INT-NEXT: | | `-ReturnStmt [[ADDR_18:0x[a-z0-9]*]] 
+// C_INT-NEXT: | |   `-IntegerLiteral [[ADDR_19:0x[a-z0-9]*]]  'int' 0
+// C_INT-NEXT: | `-OverloadableAttr [[ADDR_20:0x[a-z0-9]*]] 
+// C_INT-NEXT: |-FunctionDecl [[ADDR_16]]  line:10:22 also_before[implementation={extension(disable_implicit_base)}] 'int (int)'
+// C_INT-NEXT: | |-ParmVarDecl [[ADDR_21:0x[a-z0-9]*]]  col:21 i 'int'
+// C_INT-NEXT: | |-CompoundStmt [[ADDR_22:0x[a-z0-9]*]] 
+// C_INT-NEXT: | | `-ReturnStmt [[ADDR_23:0x[a-z0-9]*]] 
+// C_INT-NEXT: | |   `-IntegerLiteral [[ADDR_24:0x[a-z0-9]*]]  'int' 0
+// C_INT-NEXT: | `-OverloadableAttr [[ADDR_25:0x[a-z0-9]*]] 
+// C_INT-NEXT: |-FunctionDecl [[ADDR_26:0x[a-z0-9]*]]  line:10:22 also_after[implementation={extension(disable_implicit_base)}] 'int (double)'
+// C_INT-NEXT: | |-ParmVarDecl [[ADDR_27:0x[a-z0-9]*]]  col:23 d 'double'
+// C_INT-NEXT: | |-CompoundStmt [[ADDR_28:0x[a-z0-9]*]] 
+// C_INT-NEXT: | | `-ReturnStmt [[ADDR_29:0x[a-z0-9]*]] 
+// C_INT-NEXT: | |   `-IntegerLiteral [[ADDR_30:0x[a-z0-9]*]]  'int' 1
+// C_INT-NEXT: | `-OverloadableAttr [[ADDR_31:0x[a-z0-9]*]] 
+// C_INT-NEXT: |-FunctionDecl [[ADDR_32:0x[a-z0-9]*]]  line:10:22 also_after[implementation={extension(disable_implicit_base)}] 'int (long)'
+// C_INT-NEXT: | |-ParmVarDecl [[ADDR_33:0x[a-z0-9]*]]  col:21 l 'long'
+// C_INT-NEXT: | |-CompoundStmt [[ADDR_34:0x[a-z0-9]*]] 
+// C_INT-NEXT: | | `-ReturnStmt [[ADDR_35:0x[a-z0-9]*]] 
+// C_INT-NEXT: | |   `-IntegerLiteral [[ADDR_36:0x[a-z0-9]*]]  'int' 1
+// C_INT-NEXT: | `-OverloadableAttr [[ADDR_37:0x[a-z0-9]*]] 
+// C_INT-NEXT: |-FunctionDecl [[ADDR_38:0x[a-z0-9]*]]  line:57:11 used also_after 'int (double)'
+// C_INT-NEXT: | |-ParmVarDecl [[ADDR_39:0x[a-z0-9]*]]  col:29 d 'double'
+// C_INT-NEXT: | |-CompoundStmt [[ADDR_40:0x[a-z0-9]*]] 
+// C_INT-NEXT: | | `-ReturnStmt [[ADDR_41:0x[a-z0-9]*]] 
+// C_INT-NEXT: | |   `-IntegerLiteral [[ADDR_42:0x[a-z0-9]*]]  'int' 0
+// C_INT-NEXT: | `-OverloadableAttr [[ADDR_43:0x[a-z0-9]*]] 
+// C_INT-NEXT: |-FunctionDecl [[ADDR_44:0x[a-z0-9]*]]  line:61:11 used also_after 'int (long)'
+// C_INT-NEXT: | |-ParmVarDecl [[ADDR_45:0x[a-z0-9]*]]  col:27 l 'long'
+// C_INT-NEXT: | |-CompoundStmt [[ADDR_46:0x[a-z0-9]*]] 
+// C_INT-NEXT: | | `-ReturnStmt [[ADDR_47:0x[a-z0-9]*]] 
+// C_INT-NEXT: | |   `-IntegerLiteral [[ADDR_48:0x[a-z0-9]*]]  'int' 0
+// C_INT-NEXT: | `-OverloadableAttr [[ADDR_49:0x[a-z0-9]*]] 
+// C_INT-NEXT: `-FunctionDecl [[ADDR_50:0x[a-z0-9]*]]  line:65:5 main 'int ({{.*}})'
+// C_INT-NEXT:   `-CompoundStmt [[ADDR_51:0x[a-z0-9]*]] 
+// C_INT-NEXT:     `-ReturnStmt [[ADDR_52:0x[a-z0-9]*]] 
+// C_INT-NEXT:       `-BinaryOperator [[ADDR_53:0x[a-z0-9]*]]  'int' '+'
+// C_INT-NEXT:         |-BinaryOperator [[ADDR_54:0x[a-z0-9]*]]  'int' '+'
+// C_INT-NEXT:         | |-BinaryOperator [[ADDR_55:0x[a-z0-9]*]]  'int' '+'
+// C_INT-NEXT:         | | |-BinaryOperator [[ADDR_56:0x[a-z0-9]*]]  'int' '+'
+// C_INT-NEXT:         | | | |-PseudoObjectExpr [[ADDR_57:0x[a-z0-9]*]]  'int'
+// C_INT-NEXT:         | | | | |-CallExpr [[ADDR_58:0x[a-z0-9]*]]  'int'
+// C_INT-NEXT:         | | | | | `-ImplicitCastExpr [[ADDR_59:0x[a-z0-9]*]]  'int (*)({{.*}})' 
+// C_INT-NEXT:         | | | | |   `-DeclRefExpr [[ADDR_60:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_0]] 'also_before' 'int ({{.*}})'
+// C_INT-NEXT:         | | | | `-CallExpr [[ADDR_61:0x[a-z0-9]*]]  'int'
+// C_INT-NEXT:         | | | |   `-ImplicitCastExpr [[ADDR_62:0x[a-z0-9]*]]  'int (*)({{.*}})' 
+// C_INT-NEXT:         | | | |     `-DeclRefExpr [[ADDR_6]]  'int ({{.*}})' Function [[ADDR_7]] 'also_before[implementation={extension(disable_implicit_base)}]' 'int ({{.*}})'
+// C_INT-NEXT:         | | | `-PseudoObjectExpr [[ADDR_63:0x[a-z0-9]*]]  'int'
+// C_INT-NEXT:         | | |   |-CallExpr [[ADDR_64:0x[a-z0-9]*]]  'int'
+// C_INT-NEXT:         | | |   | |-ImplicitCastExpr [[ADDR_65:0x[a-z0-9]*]]  'int (*)(int)' 
+// C_INT-NEXT:         | | |   | | `-DeclRefExpr [[ADDR_66:0x[a-z0-9]*]]  'int (int)' {{.*}}Function [[ADDR_8]] 'also_before' 'int (int)'
+// C_INT-NEXT:         | | |   | `-IntegerLiteral [[ADDR_67:0x[a-z0-9]*]]  'int' 1
+// C_INT-NEXT:         | | |   `-CallExpr [[ADDR_68:0x[a-z0-9]*]]  'int'
+// C_INT-NEXT:         | | |     |-ImplicitCastExpr [[ADDR_69:0x[a-z0-9]*]]  'int (*)(int)' 
+// C_INT-NEXT:         | | |     | `-DeclRefExpr [[ADDR_15]]  'int (int)' Function [[ADDR_16]] 'also_before[implementation={extension(disable_implicit_base)}]' 'int (int)'
+// C_INT-NEXT:         | | |     `-IntegerLiteral [[ADDR_67]]  'int' 1
+// C_INT-NEXT:         | | `-PseudoObjectExpr [[ADDR_70:0x[a-z0-9]*]]  'int'
+// C_INT-NEXT:         | |   |-CallExpr [[ADDR_71:0x[a-z0-9]*]]  'int'
+// C_INT-NEXT:         | |   | |-ImplicitCastExpr [[ADDR_72:0x[a-z0-9]*]]  'int (*)(int)' 
+// C_INT-NEXT:         | |   | | `-DeclRefExpr [[ADDR_73:0x[a-z0-9]*]]  'int (int)' {{.*}}Function [[ADDR_8]] 'also_before' 'int (int)'
+// C_INT-NEXT:         | |   | `-ImplicitCastExpr [[ADDR_74:0x[a-z0-9]*]]  'int' 
+// C_INT-NEXT:         | |   |   `-FloatingLiteral [[ADDR_75:0x[a-z0-9]*]]  'float' 2.000000e+00
+// C_INT-NEXT:         | |   `-CallExpr [[ADDR_76:0x[a-z0-9]*]]  'int'
+// C_INT-NEXT:         | |     |-ImplicitCastExpr [[ADDR_77:0x[a-z0-9]*]]  'int (*)(int)' 
+// C_INT-NEXT:         | |     | `-DeclRefExpr [[ADDR_15]]  'int (int)' Function [[ADDR_16]] 'also_before[implementation={extension(disable_implicit_base)}]' 'int (int)'
+// C_INT-NEXT:         | |     `-ImplicitCastExpr [[ADDR_78:0x[a-z0-9]*]]  'int' 
+// C_INT-NEXT:         | |       `-FloatingLiteral [[ADDR_75]]  'float' 2.000000e+00
+// C_INT-NEXT:         | `-CallExpr [[ADDR_79:0x[a-z0-9]*]]  'int'
+// C_INT-NEXT:         |   |-ImplicitCastExpr [[ADDR_80:0x[a-z0-9]*]]  'int (*)(double)' 
+// C_INT-NEXT:         |   | `-DeclRefExpr [[ADDR_81:0x[a-z0-9]*]]  'int (double)' {{.*}}Function [[ADDR_38]] 'also_after' 'int (double)'
+// C_INT-NEXT:         |   `-FloatingLiteral [[ADDR_82:0x[a-z0-9]*]]  'double' 3.000000e+00
+// C_INT-NEXT:         `-CallExpr [[ADDR_83:0x[a-z0-9]*]]  'int'
+// C_INT-NEXT:           |-ImplicitCastExpr [[ADDR_84:0x[a-z0-9]*]]  'int (*)(long)' 
+// C_INT-NEXT:           | `-DeclRefExpr [[ADDR_85:0x[a-z0-9]*]]  'int (long)' {{.*}}Function [[ADDR_44]] 'also_after' 'int (long)'
+// C_INT-NEXT:           `-IntegerLiteral [[ADDR_86:0x[a-z0-9]*]]  'long' 4
+
+// CXX_INT:      |-FunctionDecl [[ADDR_0:0x[a-z0-9]*]] <{{.*}}, line:30:1> line:28:11 used also_before 'int ({{.*}})'
+// CXX_INT-NEXT: | |-CompoundStmt [[ADDR_1:0x[a-z0-9]*]] 
+// CXX_INT-NEXT: | | `-ReturnStmt [[ADDR_2:0x[a-z0-9]*]] 
+// CXX_INT-NEXT: | |   `-IntegerLiteral [[ADDR_3:0x[a-z0-9]*]]  'int' 1
+// CXX_INT-NEXT: | `-OMPDeclareVariantAttr [[ADDR_4:0x[a-z0-9]*]] <> Implicit implementation={extension(disable_implicit_base)}
+// CXX_INT-NEXT: |   `-DeclRefExpr [[ADDR_5:0x[a-z0-9]*]]  'int ({{.*}})' Function [[ADDR_6:0x[a-z0-9]*]] 'also_before[implementation={extension(disable_implicit_base)}]' 'int ({{.*}})'
+// CXX_INT-NEXT: |-FunctionDecl [[ADDR_7:0x[a-z0-9]*]]  line:32:11 used also_before 'int (int)'
+// CXX_INT-NEXT: | |-ParmVarDecl [[ADDR_8:0x[a-z0-9]*]]  col:27 i 'int'
+// CXX_INT-NEXT: | |-CompoundStmt [[ADDR_9:0x[a-z0-9]*]] 
+// CXX_INT-NEXT: | | `-ReturnStmt [[ADDR_10:0x[a-z0-9]*]] 
+// CXX_INT-NEXT: | |   `-IntegerLiteral [[ADDR_11:0x[a-z0-9]*]]  'int' 1
+// CXX_INT-NEXT: | `-OMPDeclareVariantAttr [[ADDR_12:0x[a-z0-9]*]] <> Implicit implementation={extension(disable_implicit_base)}
+// CXX_INT-NEXT: |   `-DeclRefExpr [[ADDR_13:0x[a-z0-9]*]]  'int (int)' Function [[ADDR_14:0x[a-z0-9]*]] 'also_before[implementation={extension(disable_implicit_base)}]' 'int (int)'
+// CXX_INT-NEXT: |-FunctionDecl [[ADDR_6]]  line:38:1 also_before[implementation={extension(disable_implicit_base)}] 'int ({{.*}})'
+// CXX_INT-NEXT: | `-CompoundStmt [[ADDR_15:0x[a-z0-9]*]] 
+// CXX_INT-NEXT: |   `-ReturnStmt [[ADDR_16:0x[a-z0-9]*]] 
+// CXX_INT-NEXT: |     `-IntegerLiteral [[ADDR_17:0x[a-z0-9]*]]  'int' 0
+// CXX_INT-NEXT: |-FunctionDecl [[ADDR_14]]  line:42:1 also_before[implementation={extension(disable_implicit_base)}] 'int (int)'
+// CXX_INT-NEXT: | |-ParmVarDecl [[ADDR_18:0x[a-z0-9]*]]  col:21 i 'int'
+// CXX_INT-NEXT: | `-CompoundStmt [[ADDR_19:0x[a-z0-9]*]] 
+// CXX_INT-NEXT: |   `-ReturnStmt [[ADDR_20:0x[a-z0-9]*]] 
+// CXX_INT-NEXT: |     `-IntegerLiteral [[ADDR_21:0x[a-z0-9]*]]  'int' 0
+// CXX_INT-NEXT: |-FunctionDecl [[ADDR_22:0x[a-z0-9]*]]  line:47:1 also_after[implementation={extension(disable_implicit_base)}] 'int (double)'
+// CXX_INT-NEXT: | |-ParmVarDecl [[ADDR_23:0x[a-z0-9]*]]  col:23 d 'double'
+// CXX_INT-NEXT: | `-CompoundStmt [[ADDR_24:0x[a-z0-9]*]] 
+// CXX_INT-NEXT: |   `-ReturnStmt [[ADDR_25:0x[a-z0-9]*]] 
+// CXX_INT-NEXT: |     `-IntegerLiteral [[ADDR_26:0x[a-z0-9]*]]  'int' 1
+// CXX_INT-NEXT: |-FunctionDecl [[ADDR_27:0x[a-z0-9]*]]  line:51:1 also_after[implementation={extension(disable_implicit_base)}] 'int (long)'
+// CXX_INT-NEXT: | |-ParmVarDecl [[ADDR_28:0x[a-z0-9]*]]  col:21 l 'long'
+// CXX_INT-NEXT: | `-CompoundStmt [[ADDR_29:0x[a-z0-9]*]] 
+// CXX_INT-NEXT: |   `-ReturnStmt [[ADDR_30:0x[a-z0-9]*]] 
+// CXX_INT-NEXT: |     `-IntegerLiteral [[ADDR_31:0x[a-z0-9]*]]  'int' 1
+// CXX_INT-NEXT: |-FunctionDecl [[ADDR_32:0x[a-z0-9]*]]  line:57:11 used also_after 'int (double)'
+// CXX_INT-NEXT: | |-ParmVarDecl [[ADDR_33:0x[a-z0-9]*]]  col:29 d 'double'
+// CXX_INT-NEXT: | `-CompoundStmt [[ADDR_34:0x[a-z0-9]*]] 
+// CXX_INT-NEXT: |   `-ReturnStmt [[ADDR_35:0x[a-z0-9]*]] 
+// CXX_INT-NEXT: |     `-IntegerLiteral [[ADDR_36:0x[a-z0-9]*]]  'int' 0
+// CXX_INT-NEXT: |-FunctionDecl [[ADDR_37:0x[a-z0-9]*]]  line:61:11 used also_after 'int (long)'
+// CXX_INT-NEXT: | |-ParmVarDecl [[ADDR_38:0x[a-z0-9]*]]  col:27 l 'long'
+// CXX_INT-NEXT: | `-CompoundStmt [[ADDR_39:0x[a-z0-9]*]] 
+// CXX_INT-NEXT: |   `-ReturnStmt [[ADDR_40:0x[a-z0-9]*]] 
+// CXX_INT-NEXT: |     `-IntegerLiteral [[ADDR_41:0x[a-z0-9]*]]  'int' 0
+// CXX_INT-NEXT: `-FunctionDecl [[ADDR_42:0x[a-z0-9]*]]  line:65:5 main 'int ({{.*}})'
+// CXX_INT-NEXT:   `-CompoundStmt [[ADDR_43:0x[a-z0-9]*]] 
+// CXX_INT-NEXT:     `-ReturnStmt [[ADDR_44:0x[a-z0-9]*]] 
+// CXX_INT-NEXT:       `-BinaryOperator [[ADDR_45:0x[a-z0-9]*]]  'int' '+'
+// CXX_INT-NEXT:         |-BinaryOperator [[ADDR_46:0x[a-z0-9]*]]  'int' '+'
+// CXX_INT-NEXT:         | |-BinaryOperator [[ADDR_47:0x[a-z0-9]*]]  'int' '+'
+// CXX_INT-NEXT:         | | |-BinaryOperator [[ADDR_48:0x[a-z0-9]*]]  'int' '+'
+// CXX_INT-NEXT:         | | | |-PseudoObjectExpr [[ADDR_49:0x[a-z0-9]*]]  'int'
+// CXX_INT-NEXT:         | | | | |-CallExpr [[ADDR_50:0x[a-z0-9]*]]  'int'
+// CXX_INT-NEXT:         | | | | | `-ImplicitCastExpr [[ADDR_51:0x[a-z0-9]*]]  'int (*)({{.*}})' 
+// CXX_INT-NEXT:         | | | | |   `-DeclRefExpr [[ADDR_52:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_0]] 'also_before' 'int ({{.*}})'
+// CXX_INT-NEXT:         | | | | `-CallExpr [[ADDR_53:0x[a-z0-9]*]]  'int'
+// CXX_INT-NEXT:         | | | |   `-ImplicitCastExpr [[ADDR_54:0x[a-z0-9]*]]  'int (*)({{.*}})' 
+// CXX_INT-NEXT:         | | | |     `-DeclRefExpr [[ADDR_5]]  'int ({{.*}})' Function [[ADDR_6]] 'also_before[implementation={extension(disable_implicit_base)}]' 'int ({{.*}})'
+// CXX_INT-NEXT:         | | | `-PseudoObjectExpr [[ADDR_55:0x[a-z0-9]*]]  'int'
+// CXX_INT-NEXT:         | | |   |-CallExpr [[ADDR_56:0x[a-z0-9]*]]  'int'
+// CXX_INT-NEXT:         | | |   | |-ImplicitCastExpr [[ADDR_57:0x[a-z0-9]*]]  'int (*)(int)' 
+// CXX_INT-NEXT:         | | |   | | `-DeclRefExpr [[ADDR_58:0x[a-z0-9]*]]  'int (int)' {{.*}}Function [[ADDR_7]] 'also_before' 'int (int)'
+// CXX_INT-NEXT:         | | |   | `-IntegerLiteral [[ADDR_59:0x[a-z0-9]*]]  'int' 1
+// CXX_INT-NEXT:         | | |   `-CallExpr [[ADDR_60:0x[a-z0-9]*]]  'int'
+// CXX_INT-NEXT:         | | |     |-ImplicitCastExpr [[ADDR_61:0x[a-z0-9]*]]  'int (*)(int)' 
+// CXX_INT-NEXT:         | | |     | `-DeclRefExpr [[ADDR_13]]  'int (int)' Function [[ADDR_14]] 'also_before[implementation={extension(disable_implicit_base)}]' 'int (int)'
+// CXX_INT-NEXT:         | | |     `-IntegerLiteral [[ADDR_59]]  'int' 1
+// CXX_INT-NEXT:         | | `-PseudoObjectExpr [[ADDR_62:0x[a-z0-9]*]]  'int'
+// CXX_INT-NEXT:         | |   |-CallExpr [[ADDR_63:0x[a-z0-9]*]]  'int'
+// CXX_INT-NEXT:         | |   | |-ImplicitCastExpr [[ADDR_64:0x[a-z0-9]*]]  'int (*)(int)' 
+// CXX_INT-NEXT:         | |   | | `-DeclRefExpr [[ADDR_65:0x[a-z0-9]*]]  'int (int)' {{.*}}Function [[ADDR_7]] 'also_before' 'int (int)'
+// CXX_INT-NEXT:         | |   | `-ImplicitCastExpr [[ADDR_66:0x[a-z0-9]*]]  'int' 
+// CXX_INT-NEXT:         | |   |   `-FloatingLiteral [[ADDR_67:0x[a-z0-9]*]]  'float' 2.000000e+00
+// CXX_INT-NEXT:         | |   `-CallExpr [[ADDR_68:0x[a-z0-9]*]]  'int'
+// CXX_INT-NEXT:         | |     |-ImplicitCastExpr [[ADDR_69:0x[a-z0-9]*]]  'int (*)(int)' 
+// CXX_INT-NEXT:         | |     | `-DeclRefExpr [[ADDR_13]]  'int (int)' Function [[ADDR_14]] 'also_before[implementation={extension(disable_implicit_base)}]' 'int (int)'
+// CXX_INT-NEXT:         | |     `-ImplicitCastExpr [[ADDR_70:0x[a-z0-9]*]]  'int' 
+// CXX_INT-NEXT:         | |       `-FloatingLiteral [[ADDR_67]]  'float' 2.000000e+00
+// CXX_INT-NEXT:         | `-CallExpr [[ADDR_71:0x[a-z0-9]*]]  'int'
+// CXX_INT-NEXT:         |   |-ImplicitCastExpr [[ADDR_72:0x[a-z0-9]*]]  'int (*)(double)' 
+// CXX_INT-NEXT:         |   | `-DeclRefExpr [[ADDR_73:0x[a-z0-9]*]]  'int (double)' {{.*}}Function [[ADDR_32]] 'also_after' 'int (double)'
+// CXX_INT-NEXT:         |   `-FloatingLiteral [[ADDR_74:0x[a-z0-9]*]]  'double' 3.000000e+00
+// CXX_INT-NEXT:         `-CallExpr [[ADDR_75:0x[a-z0-9]*]]  'int'
+// CXX_INT-NEXT:           |-ImplicitCastExpr [[ADDR_76:0x[a-z0-9]*]]  'int (*)(long)' 
+// CXX_INT-NEXT:           | `-DeclRefExpr [[ADDR_77:0x[a-z0-9]*]]  'int (long)' {{.*}}Function [[ADDR_37]] 'also_after' 'int (long)'
+// CXX_INT-NEXT:           `-IntegerLiteral [[ADDR_78:0x[a-z0-9]*]]  'long' 4
diff --git a/clang/test/AST/ast-dump-openmp-begin-declare-variant_13.c b/clang/test/AST/ast-dump-openmp-begin-declare-variant_13.c
new file mode 100644
index 0000000000000..93d847a077779
--- /dev/null
+++ b/clang/test/AST/ast-dump-openmp-begin-declare-variant_13.c
@@ -0,0 +1,67 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -verify -ast-dump %s       | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -verify -ast-dump %s -x c++| FileCheck %s
+// expected-no-diagnostics
+
+int also_before(void) {
+  return 1;
+}
+
+#pragma omp begin declare variant match(user = {condition(1)})
+int also_after(void) {
+  return 0;
+}
+int also_before(void) {
+  return 0;
+}
+#pragma omp end declare variant
+
+int also_after(void) {
+  return 2;
+}
+
+int test() {
+  // Should return 0.
+  return also_after() + also_before();
+}
+
+// CHECK:      |-FunctionDecl [[ADDR_0:0x[a-z0-9]*]] <{{.*}}, line:7:1> line:5:5 used also_before 'int ({{.*}})'
+// CHECK-NEXT: | |-CompoundStmt [[ADDR_1:0x[a-z0-9]*]] 
+// CHECK-NEXT: | | `-ReturnStmt [[ADDR_2:0x[a-z0-9]*]] 
+// CHECK-NEXT: | |   `-IntegerLiteral [[ADDR_3:0x[a-z0-9]*]]  'int' 1
+// CHECK-NEXT: | `-OMPDeclareVariantAttr [[ADDR_4:0x[a-z0-9]*]] <<invalid sloc>> Implicit user={condition(1)}
+// CHECK-NEXT: |   `-DeclRefExpr [[ADDR_5:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_6:0x[a-z0-9]*]] 'also_before[user={condition(...)}]' 'int ({{.*}})'
+// CHECK-NEXT: |-FunctionDecl [[ADDR_7:0x[a-z0-9]*]]  col:5 implicit used also_after 'int ({{.*}})'
+// CHECK-NEXT: | `-OMPDeclareVariantAttr [[ADDR_8:0x[a-z0-9]*]] <<invalid sloc>> Implicit user={condition(1)}
+// CHECK-NEXT: |   `-DeclRefExpr [[ADDR_9:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_10:0x[a-z0-9]*]] 'also_after[user={condition(...)}]' 'int ({{.*}})'
+// CHECK-NEXT: |-FunctionDecl [[ADDR_10]]  line:10:1 also_after[user={condition(...)}] 'int ({{.*}})'
+// CHECK-NEXT: | `-CompoundStmt [[ADDR_11:0x[a-z0-9]*]] 
+// CHECK-NEXT: |   `-ReturnStmt [[ADDR_12:0x[a-z0-9]*]] 
+// CHECK-NEXT: |     `-IntegerLiteral [[ADDR_13:0x[a-z0-9]*]]  'int' 0
+// CHECK-NEXT: |-FunctionDecl [[ADDR_6]]  line:13:1 also_before[user={condition(...)}] 'int ({{.*}})'
+// CHECK-NEXT: | `-CompoundStmt [[ADDR_14:0x[a-z0-9]*]] 
+// CHECK-NEXT: |   `-ReturnStmt [[ADDR_15:0x[a-z0-9]*]] 
+// CHECK-NEXT: |     `-IntegerLiteral [[ADDR_16:0x[a-z0-9]*]]  'int' 0
+// CHECK-NEXT: |-FunctionDecl [[ADDR_17:0x[a-z0-9]*]] prev [[ADDR_7]]  line:18:5 used also_after 'int ({{.*}})'
+// CHECK-NEXT: | |-CompoundStmt [[ADDR_18:0x[a-z0-9]*]] 
+// CHECK-NEXT: | | `-ReturnStmt [[ADDR_19:0x[a-z0-9]*]] 
+// CHECK-NEXT: | |   `-IntegerLiteral [[ADDR_20:0x[a-z0-9]*]]  'int' 2
+// CHECK-NEXT: | `-OMPDeclareVariantAttr [[ADDR_21:0x[a-z0-9]*]] <<invalid sloc>> Inherited Implicit user={condition(1)}
+// CHECK-NEXT: |   `-DeclRefExpr [[ADDR_9]]  'int ({{.*}})' {{.*}}Function [[ADDR_10]] 'also_after[user={condition(...)}]' 'int ({{.*}})'
+// CHECK-NEXT: `-FunctionDecl [[ADDR_22:0x[a-z0-9]*]]  line:22:5 test 'int ({{.*}})'
+// CHECK-NEXT:   `-CompoundStmt [[ADDR_23:0x[a-z0-9]*]] 
+// CHECK-NEXT:     `-ReturnStmt [[ADDR_24:0x[a-z0-9]*]] 
+// CHECK-NEXT:       `-BinaryOperator [[ADDR_25:0x[a-z0-9]*]]  'int' '+'
+// CHECK-NEXT:         |-PseudoObjectExpr [[ADDR_26:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT:         | |-CallExpr [[ADDR_27:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT:         | | `-ImplicitCastExpr [[ADDR_28:0x[a-z0-9]*]]  'int (*)({{.*}})' 
+// CHECK-NEXT:         | |   `-DeclRefExpr [[ADDR_29:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_17]] 'also_after' 'int ({{.*}})'
+// CHECK-NEXT:         | `-CallExpr [[ADDR_30:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT:         |   `-ImplicitCastExpr [[ADDR_31:0x[a-z0-9]*]]  'int (*)({{.*}})' 
+// CHECK-NEXT:         |     `-DeclRefExpr [[ADDR_9]]  'int ({{.*}})' {{.*}}Function [[ADDR_10]] 'also_after[user={condition(...)}]' 'int ({{.*}})'
+// CHECK-NEXT:         `-PseudoObjectExpr [[ADDR_32:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT:           |-CallExpr [[ADDR_33:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT:           | `-ImplicitCastExpr [[ADDR_34:0x[a-z0-9]*]]  'int (*)({{.*}})' 
+// CHECK-NEXT:           |   `-DeclRefExpr [[ADDR_35:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_0]] 'also_before' 'int ({{.*}})'
+// CHECK-NEXT:           `-CallExpr [[ADDR_36:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT:             `-ImplicitCastExpr [[ADDR_37:0x[a-z0-9]*]]  'int (*)({{.*}})' 
+// CHECK-NEXT:               `-DeclRefExpr [[ADDR_5]]  'int ({{.*}})' {{.*}}Function [[ADDR_6]] 'also_before[user={condition(...)}]' 'int ({{.*}})'
diff --git a/clang/test/AST/ast-dump-openmp-begin-declare-variant_nested.c b/clang/test/AST/ast-dump-openmp-begin-declare-variant_nested.c
new file mode 100644
index 0000000000000..e4b5b39ae87a0
--- /dev/null
+++ b/clang/test/AST/ast-dump-openmp-begin-declare-variant_nested.c
@@ -0,0 +1,87 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -verify -ast-dump %s       | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -verify -ast-dump %s -x c++| FileCheck %s
+// expected-no-diagnostics
+
+int also_before(void) {
+  return 1;
+}
+
+#pragma omp begin declare variant match(user = {condition(1)}, device = {kind(cpu)}, implementation = {vendor(llvm)})
+#pragma omp begin declare variant match(device = {kind(cpu)}, implementation = {vendor(llvm, pgi), extension(match_any)})
+#pragma omp begin declare variant match(device = {kind(any)}, implementation = {dynamic_allocators})
+int also_after(void) {
+  return 0;
+}
+int also_before(void) {
+  return 0;
+}
+#pragma omp end declare variant
+#pragma omp end declare variant
+#pragma omp end declare variant
+
+int also_after(void) {
+  return 2;
+}
+
+int test() {
+  // Should return 0.
+  return also_after() + also_before();
+}
+
+#pragma omp begin declare variant match(device = {isa("sse")})
+#pragma omp declare variant(test) match(device = {isa(sse)})
+int equivalent_isa_trait(void);
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {isa("sse")})
+#pragma omp declare variant(test) match(device = {isa("sse2")})
+int non_equivalent_isa_trait(void);
+#pragma omp end declare variant
+
+// CHECK:      |-FunctionDecl [[ADDR_0:0x[a-z0-9]*]] <{{.*}}, line:7:1> line:5:5 used also_before 'int ({{.*}})'
+// CHECK-NEXT: | |-CompoundStmt [[ADDR_1:0x[a-z0-9]*]] 
+// CHECK-NEXT: | | `-ReturnStmt [[ADDR_2:0x[a-z0-9]*]] 
+// CHECK-NEXT: | |   `-IntegerLiteral [[ADDR_3:0x[a-z0-9]*]]  'int' 1
+// CHECK-NEXT: | `-OMPDeclareVariantAttr [[ADDR_4:0x[a-z0-9]*]] <<invalid sloc>> Implicit device={kind(any, cpu)}, implementation={dynamic_allocators, vendor(llvm, pgi), extension(match_any)}, user={condition(1)}
+// CHECK-NEXT: |   `-DeclRefExpr [[ADDR_5:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_6:0x[a-z0-9]*]] 'also_before[device={kind(any, cpu)}, implementation={dynamic_allocators, vendor(llvm, pgi), extension(match_any)}, user={condition(...)}]' 'int ({{.*}})'
+// CHECK-NEXT: |-FunctionDecl [[ADDR_7:0x[a-z0-9]*]]  col:5 implicit used also_after 'int ({{.*}})'
+// CHECK-NEXT: | `-OMPDeclareVariantAttr [[ADDR_8:0x[a-z0-9]*]] <<invalid sloc>> Implicit device={kind(any, cpu)}, implementation={dynamic_allocators, vendor(llvm, pgi), extension(match_any)}, user={condition(1)}
+// CHECK-NEXT: |   `-DeclRefExpr [[ADDR_9:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_10:0x[a-z0-9]*]] 'also_after[device={kind(any, cpu)}, implementation={dynamic_allocators, vendor(llvm, pgi), extension(match_any)}, user={condition(...)}]' 'int ({{.*}})'
+// CHECK-NEXT: |-FunctionDecl [[ADDR_10]]  line:12:1 also_after[device={kind(any, cpu)}, implementation={dynamic_allocators, vendor(llvm, pgi), extension(match_any)}, user={condition(...)}] 'int ({{.*}})'
+// CHECK-NEXT: | `-CompoundStmt [[ADDR_11:0x[a-z0-9]*]] 
+// CHECK-NEXT: |   `-ReturnStmt [[ADDR_12:0x[a-z0-9]*]] 
+// CHECK-NEXT: |     `-IntegerLiteral [[ADDR_13:0x[a-z0-9]*]]  'int' 0
+// CHECK-NEXT: |-FunctionDecl [[ADDR_6]]  line:15:1 also_before[device={kind(any, cpu)}, implementation={dynamic_allocators, vendor(llvm, pgi), extension(match_any)}, user={condition(...)}] 'int ({{.*}})'
+// CHECK-NEXT: | `-CompoundStmt [[ADDR_14:0x[a-z0-9]*]] 
+// CHECK-NEXT: |   `-ReturnStmt [[ADDR_15:0x[a-z0-9]*]] 
+// CHECK-NEXT: |     `-IntegerLiteral [[ADDR_16:0x[a-z0-9]*]]  'int' 0
+// CHECK-NEXT: |-FunctionDecl [[ADDR_17:0x[a-z0-9]*]] prev [[ADDR_7]]  line:22:5 used also_after 'int ({{.*}})'
+// CHECK-NEXT: | |-CompoundStmt [[ADDR_18:0x[a-z0-9]*]] 
+// CHECK-NEXT: | | `-ReturnStmt [[ADDR_19:0x[a-z0-9]*]] 
+// CHECK-NEXT: | |   `-IntegerLiteral [[ADDR_20:0x[a-z0-9]*]]  'int' 2
+// CHECK-NEXT: | `-OMPDeclareVariantAttr [[ADDR_21:0x[a-z0-9]*]] <<invalid sloc>> Inherited Implicit device={kind(any, cpu)}, implementation={dynamic_allocators, vendor(llvm, pgi), extension(match_any)}, user={condition(1)}
+// CHECK-NEXT: |   `-DeclRefExpr [[ADDR_9]]  'int ({{.*}})' {{.*}}Function [[ADDR_10]] 'also_after[device={kind(any, cpu)}, implementation={dynamic_allocators, vendor(llvm, pgi), extension(match_any)}, user={condition(...)}]' 'int ({{.*}})'
+// CHECK-NEXT: |-FunctionDecl [[ADDR_22:0x[a-z0-9]*]]  line:26:5 referenced test 'int ({{.*}})'
+// CHECK-NEXT: | `-CompoundStmt [[ADDR_23:0x[a-z0-9]*]] 
+// CHECK-NEXT: |   `-ReturnStmt [[ADDR_24:0x[a-z0-9]*]] 
+// CHECK-NEXT: |     `-BinaryOperator [[ADDR_25:0x[a-z0-9]*]]  'int' '+'
+// CHECK-NEXT: |       |-PseudoObjectExpr [[ADDR_26:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT: |       | |-CallExpr [[ADDR_27:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT: |       | | `-ImplicitCastExpr [[ADDR_28:0x[a-z0-9]*]]  'int (*)({{.*}})' 
+// CHECK-NEXT: |       | |   `-DeclRefExpr [[ADDR_29:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_17]] 'also_after' 'int ({{.*}})'
+// CHECK-NEXT: |       | `-CallExpr [[ADDR_30:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT: |       |   `-ImplicitCastExpr [[ADDR_31:0x[a-z0-9]*]]  'int (*)({{.*}})' 
+// CHECK-NEXT: |       |     `-DeclRefExpr [[ADDR_9]]  'int ({{.*}})' {{.*}}Function [[ADDR_10]] 'also_after[device={kind(any, cpu)}, implementation={dynamic_allocators, vendor(llvm, pgi), extension(match_any)}, user={condition(...)}]' 'int ({{.*}})'
+// CHECK-NEXT: |       `-PseudoObjectExpr [[ADDR_32:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT: |         |-CallExpr [[ADDR_33:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT: |         | `-ImplicitCastExpr [[ADDR_34:0x[a-z0-9]*]]  'int (*)({{.*}})' 
+// CHECK-NEXT: |         |   `-DeclRefExpr [[ADDR_35:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_0]] 'also_before' 'int ({{.*}})'
+// CHECK-NEXT: |         `-CallExpr [[ADDR_36:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT: |           `-ImplicitCastExpr [[ADDR_37:0x[a-z0-9]*]]  'int (*)({{.*}})' 
+// CHECK-NEXT: |             `-DeclRefExpr [[ADDR_5]]  'int ({{.*}})' {{.*}}Function [[ADDR_6]] 'also_before[device={kind(any, cpu)}, implementation={dynamic_allocators, vendor(llvm, pgi), extension(match_any)}, user={condition(...)}]' 'int ({{.*}})'
+// CHECK-NEXT: |-FunctionDecl [[ADDR_38:0x[a-z0-9]*]]  col:5 equivalent_isa_trait 'int ({{.*}})'
+// CHECK-NEXT: | `-OMPDeclareVariantAttr [[ADDR_39:0x[a-z0-9]*]]  Implicit device={isa(sse)}
+// CHECK-NEXT: |   `-DeclRefExpr [[ADDR_40:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_22]] 'test' 'int ({{.*}})' non_odr_use_unevaluated
+// CHECK-NEXT: `-FunctionDecl [[ADDR_41:0x[a-z0-9]*]]  col:5 non_equivalent_isa_trait 'int ({{.*}})'
+// CHECK-NEXT:   `-OMPDeclareVariantAttr [[ADDR_42:0x[a-z0-9]*]]  Implicit device={isa(sse2, sse)}
+// CHECK-NEXT:     `-DeclRefExpr [[ADDR_43:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_22]] 'test' 'int ({{.*}})' non_odr_use_unevaluated
diff --git a/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_2.cpp b/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_2.cpp
new file mode 100644
index 0000000000000..9613e86634927
--- /dev/null
+++ b/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_2.cpp
@@ -0,0 +1,264 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -verify -ast-dump %s -x c++ | FileCheck %s
+// expected-no-diagnostics
+
+template <typename T>
+int also_before(T) {
+  return 1;
+}
+template <int V>
+int also_before_mismatch(void) {
+  return 0;
+}
+int also_before_non_template(void) {
+  return 0;
+}
+
+#pragma omp begin declare variant match(implementation = {extension(allow_templates)})
+template <typename T>
+int also_before(T) {
+  return 0;
+}
+template <typename T>
+int also_after(T) {
+  return 0;
+}
+template <typename T, typename Q>
+int also_after_mismatch(T, Q) {
+  return 2;
+}
+template <typename T>
+int also_before_mismatch(T) {
+  return 3;
+}
+template <typename T>
+int also_before_non_template(T) {
+  return 4;
+}
+template <int V>
+int only_def(void) {
+  return 0;
+}
+#pragma omp end declare variant
+
+template <typename T>
+int also_after(T) {
+  return 6;
+}
+template <typename T>
+int also_after_mismatch(T) {
+  return 0;
+}
+
+int test() {
+  // Should return 0.
+  return also_before(0.) + also_before_mismatch<0>() + also_before_non_template() + also_after<char>(0) + also_after_mismatch(0) + only_def<0>();
+}
+
+// CHECK:      |-FunctionTemplateDecl [[ADDR_0:0x[a-z0-9]*]] <{{.*}}, line:7:1> line:5:5 also_before
+// CHECK-NEXT: | |-TemplateTypeParmDecl [[ADDR_1:0x[a-z0-9]*]]  col:20 referenced typename depth 0 index 0 T
+// CHECK-NEXT: | |-FunctionDecl [[ADDR_2:0x[a-z0-9]*]]  line:5:5 also_before 'int (T)'
+// CHECK-NEXT: | | |-ParmVarDecl [[ADDR_3:0x[a-z0-9]*]]  col:18 'T'
+// CHECK-NEXT: | | |-CompoundStmt [[ADDR_4:0x[a-z0-9]*]] 
+// CHECK-NEXT: | | | `-ReturnStmt [[ADDR_5:0x[a-z0-9]*]] 
+// CHECK-NEXT: | | |   `-IntegerLiteral [[ADDR_6:0x[a-z0-9]*]]  'int' 1
+// CHECK-NEXT: | | `-OMPDeclareVariantAttr [[ADDR_7:0x[a-z0-9]*]] <<invalid sloc>> Implicit implementation={extension(allow_templates)}
+// CHECK-NEXT: | |   `-DeclRefExpr [[ADDR_8:0x[a-z0-9]*]]  'int (T)' {{.*}}Function [[ADDR_9:0x[a-z0-9]*]] 'also_before[implementation={extension(allow_templates)}]' 'int (T)'
+// CHECK-NEXT: | `-FunctionDecl [[ADDR_10:0x[a-z0-9]*]]  line:5:5 used also_before 'int (double)'
+// CHECK-NEXT: |   |-TemplateArgument type 'double'
+// CHECK-NEXT: |   | `-BuiltinType [[ADDR_11:0x[a-z0-9]*]] 'double'
+// CHECK-NEXT: |   |-ParmVarDecl [[ADDR_12:0x[a-z0-9]*]]  col:18 'double':'double'
+// CHECK-NEXT: |   |-CompoundStmt [[ADDR_13:0x[a-z0-9]*]] 
+// CHECK-NEXT: |   | `-ReturnStmt [[ADDR_14:0x[a-z0-9]*]] 
+// CHECK-NEXT: |   |   `-IntegerLiteral [[ADDR_6]]  'int' 1
+// CHECK-NEXT: |   `-OMPDeclareVariantAttr [[ADDR_15:0x[a-z0-9]*]] <<invalid sloc>> Implicit implementation={extension(allow_templates)}
+// CHECK-NEXT: |     `-DeclRefExpr [[ADDR_16:0x[a-z0-9]*]]  'int (double)' {{.*}}Function [[ADDR_17:0x[a-z0-9]*]] 'also_before[implementation={extension(allow_templates)}]' 'int (double)'
+// CHECK-NEXT: |-FunctionTemplateDecl [[ADDR_18:0x[a-z0-9]*]]  line:9:5 also_before_mismatch
+// CHECK-NEXT: | |-NonTypeTemplateParmDecl [[ADDR_19:0x[a-z0-9]*]]  col:15 'int' depth 0 index 0 V
+// CHECK-NEXT: | |-FunctionDecl [[ADDR_20:0x[a-z0-9]*]]  line:9:5 also_before_mismatch 'int ({{.*}})'
+// CHECK-NEXT: | | `-CompoundStmt [[ADDR_21:0x[a-z0-9]*]] 
+// CHECK-NEXT: | |   `-ReturnStmt [[ADDR_22:0x[a-z0-9]*]] 
+// CHECK-NEXT: | |     `-IntegerLiteral [[ADDR_23:0x[a-z0-9]*]]  'int' 0
+// CHECK-NEXT: | `-FunctionDecl [[ADDR_24:0x[a-z0-9]*]]  line:9:5 used also_before_mismatch 'int ({{.*}})'
+// CHECK-NEXT: |   |-TemplateArgument integral 0
+// CHECK-NEXT: |   `-CompoundStmt [[ADDR_25:0x[a-z0-9]*]] 
+// CHECK-NEXT: |     `-ReturnStmt [[ADDR_26:0x[a-z0-9]*]] 
+// CHECK-NEXT: |       `-IntegerLiteral [[ADDR_23]]  'int' 0
+// CHECK-NEXT: |-FunctionDecl [[ADDR_27:0x[a-z0-9]*]]  line:12:5 used also_before_non_template 'int ({{.*}})'
+// CHECK-NEXT: | `-CompoundStmt [[ADDR_28:0x[a-z0-9]*]] 
+// CHECK-NEXT: |   `-ReturnStmt [[ADDR_29:0x[a-z0-9]*]] 
+// CHECK-NEXT: |     `-IntegerLiteral [[ADDR_30:0x[a-z0-9]*]]  'int' 0
+// CHECK-NEXT: |-FunctionTemplateDecl [[ADDR_31:0x[a-z0-9]*]]  line:18:1 also_before[implementation={extension(allow_templates)}]
+// CHECK-NEXT: | |-TemplateTypeParmDecl [[ADDR_32:0x[a-z0-9]*]]  col:20 referenced typename depth 0 index 0 T
+// CHECK-NEXT: | |-FunctionDecl [[ADDR_9]]  line:18:1 referenced also_before[implementation={extension(allow_templates)}] 'int (T)'
+// CHECK-NEXT: | | |-ParmVarDecl [[ADDR_33:0x[a-z0-9]*]]  col:18 'T'
+// CHECK-NEXT: | | `-CompoundStmt [[ADDR_34:0x[a-z0-9]*]] 
+// CHECK-NEXT: | |   `-ReturnStmt [[ADDR_35:0x[a-z0-9]*]] 
+// CHECK-NEXT: | |     `-IntegerLiteral [[ADDR_36:0x[a-z0-9]*]]  'int' 0
+// CHECK-NEXT: | `-FunctionDecl [[ADDR_17]]  line:18:1 also_before[implementation={extension(allow_templates)}] 'int (double)'
+// CHECK-NEXT: |   |-TemplateArgument type 'double'
+// CHECK-NEXT: |   | `-BuiltinType [[ADDR_11]] 'double'
+// CHECK-NEXT: |   |-ParmVarDecl [[ADDR_37:0x[a-z0-9]*]]  col:18 'double':'double'
+// CHECK-NEXT: |   `-CompoundStmt [[ADDR_38:0x[a-z0-9]*]] 
+// CHECK-NEXT: |     `-ReturnStmt [[ADDR_39:0x[a-z0-9]*]] 
+// CHECK-NEXT: |       `-IntegerLiteral [[ADDR_36]]  'int' 0
+// CHECK-NEXT: |-FunctionTemplateDecl [[ADDR_40:0x[a-z0-9]*]]  col:5 implicit also_after
+// CHECK-NEXT: | |-TemplateTypeParmDecl [[ADDR_41:0x[a-z0-9]*]]  col:20 referenced typename depth 0 index 0 T
+// CHECK-NEXT: | |-FunctionDecl [[ADDR_42:0x[a-z0-9]*]]  col:5 also_after 'int (T)'
+// CHECK-NEXT: | | |-ParmVarDecl [[ADDR_43:0x[a-z0-9]*]]  col:17 'T'
+// CHECK-NEXT: | | `-OMPDeclareVariantAttr [[ADDR_44:0x[a-z0-9]*]] <<invalid sloc>> Implicit implementation={extension(allow_templates)}
+// CHECK-NEXT: | |   `-DeclRefExpr [[ADDR_45:0x[a-z0-9]*]]  'int (T)' {{.*}}Function [[ADDR_46:0x[a-z0-9]*]] 'also_after[implementation={extension(allow_templates)}]' 'int (T)'
+// CHECK-NEXT: | `-FunctionDecl [[ADDR_47:0x[a-z0-9]*]]  line:44:5 used also_after 'int (char)'
+// CHECK-NEXT: |   |-TemplateArgument type 'char'
+// CHECK-NEXT: |   | `-BuiltinType [[ADDR_48:0x[a-z0-9]*]] 'char'
+// CHECK-NEXT: |   |-ParmVarDecl [[ADDR_49:0x[a-z0-9]*]]  col:17 'char':'char'
+// CHECK-NEXT: |   |-CompoundStmt [[ADDR_50:0x[a-z0-9]*]] 
+// CHECK-NEXT: |   | `-ReturnStmt [[ADDR_51:0x[a-z0-9]*]] 
+// CHECK-NEXT: |   |   `-IntegerLiteral [[ADDR_52:0x[a-z0-9]*]]  'int' 6
+// CHECK-NEXT: |   `-OMPDeclareVariantAttr [[ADDR_53:0x[a-z0-9]*]] <<invalid sloc>> Implicit implementation={extension(allow_templates)}
+// CHECK-NEXT: |     `-DeclRefExpr [[ADDR_54:0x[a-z0-9]*]]  'int (char)' {{.*}}Function [[ADDR_55:0x[a-z0-9]*]] 'also_after[implementation={extension(allow_templates)}]' 'int (char)'
+// CHECK-NEXT: |-FunctionTemplateDecl [[ADDR_56:0x[a-z0-9]*]]  line:22:1 also_after[implementation={extension(allow_templates)}]
+// CHECK-NEXT: | |-TemplateTypeParmDecl [[ADDR_41]]  col:20 referenced typename depth 0 index 0 T
+// CHECK-NEXT: | |-FunctionDecl [[ADDR_46]]  line:22:1 referenced also_after[implementation={extension(allow_templates)}] 'int (T)'
+// CHECK-NEXT: | | |-ParmVarDecl [[ADDR_43]]  col:17 'T'
+// CHECK-NEXT: | | `-CompoundStmt [[ADDR_57:0x[a-z0-9]*]] 
+// CHECK-NEXT: | |   `-ReturnStmt [[ADDR_58:0x[a-z0-9]*]] 
+// CHECK-NEXT: | |     `-IntegerLiteral [[ADDR_59:0x[a-z0-9]*]]  'int' 0
+// CHECK-NEXT: | `-FunctionDecl [[ADDR_55]]  line:22:1 also_after[implementation={extension(allow_templates)}] 'int (char)'
+// CHECK-NEXT: |   |-TemplateArgument type 'char'
+// CHECK-NEXT: |   | `-BuiltinType [[ADDR_48]] 'char'
+// CHECK-NEXT: |   |-ParmVarDecl [[ADDR_60:0x[a-z0-9]*]]  col:17 'char':'char'
+// CHECK-NEXT: |   `-CompoundStmt [[ADDR_61:0x[a-z0-9]*]] 
+// CHECK-NEXT: |     `-ReturnStmt [[ADDR_62:0x[a-z0-9]*]] 
+// CHECK-NEXT: |       `-IntegerLiteral [[ADDR_59]]  'int' 0
+// CHECK-NEXT: |-FunctionTemplateDecl [[ADDR_63:0x[a-z0-9]*]]  col:5 implicit also_after_mismatch
+// CHECK-NEXT: | |-TemplateTypeParmDecl [[ADDR_64:0x[a-z0-9]*]]  col:20 referenced typename depth 0 index 0 T
+// CHECK-NEXT: | |-TemplateTypeParmDecl [[ADDR_65:0x[a-z0-9]*]]  col:32 referenced typename depth 0 index 1 Q
+// CHECK-NEXT: | `-FunctionDecl [[ADDR_66:0x[a-z0-9]*]]  col:5 also_after_mismatch 'int (T, Q)'
+// CHECK-NEXT: |   |-ParmVarDecl [[ADDR_67:0x[a-z0-9]*]]  col:26 'T'
+// CHECK-NEXT: |   |-ParmVarDecl [[ADDR_68:0x[a-z0-9]*]]  col:29 'Q'
+// CHECK-NEXT: |   `-OMPDeclareVariantAttr [[ADDR_69:0x[a-z0-9]*]] <<invalid sloc>> Implicit implementation={extension(allow_templates)}
+// CHECK-NEXT: |     `-DeclRefExpr [[ADDR_70:0x[a-z0-9]*]]  'int (T, Q)' {{.*}}Function [[ADDR_71:0x[a-z0-9]*]] 'also_after_mismatch[implementation={extension(allow_templates)}]' 'int (T, Q)'
+// CHECK-NEXT: |-FunctionTemplateDecl [[ADDR_72:0x[a-z0-9]*]]  line:26:1 also_after_mismatch[implementation={extension(allow_templates)}]
+// CHECK-NEXT: | |-TemplateTypeParmDecl [[ADDR_64]]  col:20 referenced typename depth 0 index 0 T
+// CHECK-NEXT: | |-TemplateTypeParmDecl [[ADDR_65]]  col:32 referenced typename depth 0 index 1 Q
+// CHECK-NEXT: | `-FunctionDecl [[ADDR_71]]  line:26:1 also_after_mismatch[implementation={extension(allow_templates)}] 'int (T, Q)'
+// CHECK-NEXT: |   |-ParmVarDecl [[ADDR_67]]  col:26 'T'
+// CHECK-NEXT: |   |-ParmVarDecl [[ADDR_68]]  col:29 'Q'
+// CHECK-NEXT: |   `-CompoundStmt [[ADDR_73:0x[a-z0-9]*]] 
+// CHECK-NEXT: |     `-ReturnStmt [[ADDR_74:0x[a-z0-9]*]] 
+// CHECK-NEXT: |       `-IntegerLiteral [[ADDR_75:0x[a-z0-9]*]]  'int' 2
+// CHECK-NEXT: |-FunctionTemplateDecl [[ADDR_76:0x[a-z0-9]*]]  col:5 implicit also_before_mismatch
+// CHECK-NEXT: | |-TemplateTypeParmDecl [[ADDR_77:0x[a-z0-9]*]]  col:20 referenced typename depth 0 index 0 T
+// CHECK-NEXT: | `-FunctionDecl [[ADDR_78:0x[a-z0-9]*]]  col:5 also_before_mismatch 'int (T)'
+// CHECK-NEXT: |   |-ParmVarDecl [[ADDR_79:0x[a-z0-9]*]]  col:27 'T'
+// CHECK-NEXT: |   `-OMPDeclareVariantAttr [[ADDR_80:0x[a-z0-9]*]] <<invalid sloc>> Implicit implementation={extension(allow_templates)}
+// CHECK-NEXT: |     `-DeclRefExpr [[ADDR_81:0x[a-z0-9]*]]  'int (T)' {{.*}}Function [[ADDR_82:0x[a-z0-9]*]] 'also_before_mismatch[implementation={extension(allow_templates)}]' 'int (T)'
+// CHECK-NEXT: |-FunctionTemplateDecl [[ADDR_83:0x[a-z0-9]*]]  line:30:1 also_before_mismatch[implementation={extension(allow_templates)}]
+// CHECK-NEXT: | |-TemplateTypeParmDecl [[ADDR_77]]  col:20 referenced typename depth 0 index 0 T
+// CHECK-NEXT: | `-FunctionDecl [[ADDR_82]]  line:30:1 also_before_mismatch[implementation={extension(allow_templates)}] 'int (T)'
+// CHECK-NEXT: |   |-ParmVarDecl [[ADDR_79]]  col:27 'T'
+// CHECK-NEXT: |   `-CompoundStmt [[ADDR_84:0x[a-z0-9]*]] 
+// CHECK-NEXT: |     `-ReturnStmt [[ADDR_85:0x[a-z0-9]*]] 
+// CHECK-NEXT: |       `-IntegerLiteral [[ADDR_86:0x[a-z0-9]*]]  'int' 3
+// CHECK-NEXT: |-FunctionTemplateDecl [[ADDR_87:0x[a-z0-9]*]]  col:5 implicit also_before_non_template
+// CHECK-NEXT: | |-TemplateTypeParmDecl [[ADDR_88:0x[a-z0-9]*]]  col:20 referenced typename depth 0 index 0 T
+// CHECK-NEXT: | `-FunctionDecl [[ADDR_89:0x[a-z0-9]*]]  col:5 also_before_non_template 'int (T)'
+// CHECK-NEXT: |   |-ParmVarDecl [[ADDR_90:0x[a-z0-9]*]]  col:31 'T'
+// CHECK-NEXT: |   `-OMPDeclareVariantAttr [[ADDR_91:0x[a-z0-9]*]] <<invalid sloc>> Implicit implementation={extension(allow_templates)}
+// CHECK-NEXT: |     `-DeclRefExpr [[ADDR_92:0x[a-z0-9]*]]  'int (T)' {{.*}}Function [[ADDR_93:0x[a-z0-9]*]] 'also_before_non_template[implementation={extension(allow_templates)}]' 'int (T)'
+// CHECK-NEXT: |-FunctionTemplateDecl [[ADDR_94:0x[a-z0-9]*]]  line:34:1 also_before_non_template[implementation={extension(allow_templates)}]
+// CHECK-NEXT: | |-TemplateTypeParmDecl [[ADDR_88]]  col:20 referenced typename depth 0 index 0 T
+// CHECK-NEXT: | `-FunctionDecl [[ADDR_93]]  line:34:1 also_before_non_template[implementation={extension(allow_templates)}] 'int (T)'
+// CHECK-NEXT: |   |-ParmVarDecl [[ADDR_90]]  col:31 'T'
+// CHECK-NEXT: |   `-CompoundStmt [[ADDR_95:0x[a-z0-9]*]] 
+// CHECK-NEXT: |     `-ReturnStmt [[ADDR_96:0x[a-z0-9]*]] 
+// CHECK-NEXT: |       `-IntegerLiteral [[ADDR_97:0x[a-z0-9]*]]  'int' 4
+// CHECK-NEXT: |-FunctionTemplateDecl [[ADDR_98:0x[a-z0-9]*]]  col:5 implicit only_def
+// CHECK-NEXT: | |-NonTypeTemplateParmDecl [[ADDR_99:0x[a-z0-9]*]]  col:15 'int' depth 0 index 0 V
+// CHECK-NEXT: | |-FunctionDecl [[ADDR_100:0x[a-z0-9]*]]  col:5 only_def 'int ({{.*}})'
+// CHECK-NEXT: | | `-OMPDeclareVariantAttr [[ADDR_101:0x[a-z0-9]*]] <<invalid sloc>> Implicit implementation={extension(allow_templates)}
+// CHECK-NEXT: | |   `-DeclRefExpr [[ADDR_102:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_103:0x[a-z0-9]*]] 'only_def[implementation={extension(allow_templates)}]' 'int ({{.*}})'
+// CHECK-NEXT: | `-FunctionDecl [[ADDR_104:0x[a-z0-9]*]]  col:5 used only_def 'int ({{.*}})'
+// CHECK-NEXT: |   |-TemplateArgument integral 0
+// CHECK-NEXT: |   `-OMPDeclareVariantAttr [[ADDR_105:0x[a-z0-9]*]] <<invalid sloc>> Implicit implementation={extension(allow_templates)}
+// CHECK-NEXT: |     `-DeclRefExpr [[ADDR_106:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_107:0x[a-z0-9]*]] 'only_def[implementation={extension(allow_templates)}]' 'int ({{.*}})'
+// CHECK-NEXT: |-FunctionTemplateDecl [[ADDR_108:0x[a-z0-9]*]]  line:38:1 only_def[implementation={extension(allow_templates)}]
+// CHECK-NEXT: | |-NonTypeTemplateParmDecl [[ADDR_99]]  col:15 'int' depth 0 index 0 V
+// CHECK-NEXT: | |-FunctionDecl [[ADDR_103]]  line:38:1 referenced only_def[implementation={extension(allow_templates)}] 'int ({{.*}})'
+// CHECK-NEXT: | | `-CompoundStmt [[ADDR_109:0x[a-z0-9]*]] 
+// CHECK-NEXT: | |   `-ReturnStmt [[ADDR_110:0x[a-z0-9]*]] 
+// CHECK-NEXT: | |     `-IntegerLiteral [[ADDR_111:0x[a-z0-9]*]]  'int' 0
+// CHECK-NEXT: | `-FunctionDecl [[ADDR_107]]  line:38:1 only_def[implementation={extension(allow_templates)}] 'int ({{.*}})'
+// CHECK-NEXT: |   |-TemplateArgument integral 0
+// CHECK-NEXT: |   `-CompoundStmt [[ADDR_112:0x[a-z0-9]*]] 
+// CHECK-NEXT: |     `-ReturnStmt [[ADDR_113:0x[a-z0-9]*]] 
+// CHECK-NEXT: |       `-IntegerLiteral [[ADDR_111]]  'int' 0
+// CHECK-NEXT: |-FunctionTemplateDecl [[ADDR_114:0x[a-z0-9]*]] prev [[ADDR_40]]  line:44:5 also_after
+// CHECK-NEXT: | |-TemplateTypeParmDecl [[ADDR_115:0x[a-z0-9]*]]  col:20 referenced typename depth 0 index 0 T
+// CHECK-NEXT: | |-FunctionDecl [[ADDR_116:0x[a-z0-9]*]] prev [[ADDR_42]]  line:44:5 also_after 'int (T)'
+// CHECK-NEXT: | | |-ParmVarDecl [[ADDR_117:0x[a-z0-9]*]]  col:17 'T'
+// CHECK-NEXT: | | |-CompoundStmt [[ADDR_118:0x[a-z0-9]*]] 
+// CHECK-NEXT: | | | `-ReturnStmt [[ADDR_119:0x[a-z0-9]*]] 
+// CHECK-NEXT: | | |   `-IntegerLiteral [[ADDR_52]]  'int' 6
+// CHECK-NEXT: | | `-OMPDeclareVariantAttr [[ADDR_120:0x[a-z0-9]*]] <<invalid sloc>> Inherited Implicit implementation={extension(allow_templates)}
+// CHECK-NEXT: | |   `-DeclRefExpr [[ADDR_45]]  'int (T)' {{.*}}Function [[ADDR_46]] 'also_after[implementation={extension(allow_templates)}]' 'int (T)'
+// CHECK-NEXT: | `-Function [[ADDR_47]] 'also_after' 'int (char)'
+// CHECK-NEXT: |-FunctionTemplateDecl [[ADDR_121:0x[a-z0-9]*]]  line:48:5 also_after_mismatch
+// CHECK-NEXT: | |-TemplateTypeParmDecl [[ADDR_122:0x[a-z0-9]*]]  col:20 referenced typename depth 0 index 0 T
+// CHECK-NEXT: | |-FunctionDecl [[ADDR_123:0x[a-z0-9]*]]  line:48:5 also_after_mismatch 'int (T)'
+// CHECK-NEXT: | | |-ParmVarDecl [[ADDR_124:0x[a-z0-9]*]]  col:26 'T'
+// CHECK-NEXT: | | `-CompoundStmt [[ADDR_125:0x[a-z0-9]*]] 
+// CHECK-NEXT: | |   `-ReturnStmt [[ADDR_126:0x[a-z0-9]*]] 
+// CHECK-NEXT: | |     `-IntegerLiteral [[ADDR_127:0x[a-z0-9]*]]  'int' 0
+// CHECK-NEXT: | `-FunctionDecl [[ADDR_128:0x[a-z0-9]*]]  line:48:5 used also_after_mismatch 'int (int)'
+// CHECK-NEXT: |   |-TemplateArgument type 'int'
+// CHECK-NEXT: |   | `-BuiltinType [[ADDR_129:0x[a-z0-9]*]] 'int'
+// CHECK-NEXT: |   |-ParmVarDecl [[ADDR_130:0x[a-z0-9]*]]  col:26 'int':'int'
+// CHECK-NEXT: |   `-CompoundStmt [[ADDR_131:0x[a-z0-9]*]] 
+// CHECK-NEXT: |     `-ReturnStmt [[ADDR_132:0x[a-z0-9]*]] 
+// CHECK-NEXT: |       `-IntegerLiteral [[ADDR_127]]  'int' 0
+// CHECK-NEXT: `-FunctionDecl [[ADDR_133:0x[a-z0-9]*]]  line:52:5 test 'int ({{.*}})'
+// CHECK-NEXT:   `-CompoundStmt [[ADDR_134:0x[a-z0-9]*]] 
+// CHECK-NEXT:     `-ReturnStmt [[ADDR_135:0x[a-z0-9]*]] 
+// CHECK-NEXT:       `-BinaryOperator [[ADDR_136:0x[a-z0-9]*]]  'int' '+'
+// CHECK-NEXT:         |-BinaryOperator [[ADDR_137:0x[a-z0-9]*]]  'int' '+'
+// CHECK-NEXT:         | |-BinaryOperator [[ADDR_138:0x[a-z0-9]*]]  'int' '+'
+// CHECK-NEXT:         | | |-BinaryOperator [[ADDR_139:0x[a-z0-9]*]]  'int' '+'
+// CHECK-NEXT:         | | | |-BinaryOperator [[ADDR_140:0x[a-z0-9]*]]  'int' '+'
+// CHECK-NEXT:         | | | | |-PseudoObjectExpr [[ADDR_141:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT:         | | | | | |-CallExpr [[ADDR_142:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT:         | | | | | | |-ImplicitCastExpr [[ADDR_143:0x[a-z0-9]*]]  'int (*)(double)' 
+// CHECK-NEXT:         | | | | | | | `-DeclRefExpr [[ADDR_144:0x[a-z0-9]*]]  'int (double)' {{.*}}Function [[ADDR_10]] 'also_before' 'int (double)' (FunctionTemplate [[ADDR_0]] 'also_before')
+// CHECK-NEXT:         | | | | | | `-FloatingLiteral [[ADDR_145:0x[a-z0-9]*]]  'double' 0.000000e+00
+// CHECK-NEXT:         | | | | | `-CallExpr [[ADDR_146:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT:         | | | | |   |-ImplicitCastExpr [[ADDR_147:0x[a-z0-9]*]]  'int (*)(double)' 
+// CHECK-NEXT:         | | | | |   | `-DeclRefExpr [[ADDR_16]]  'int (double)' {{.*}}Function [[ADDR_17]] 'also_before[implementation={extension(allow_templates)}]' 'int (double)'
+// CHECK-NEXT:         | | | | |   `-FloatingLiteral [[ADDR_145]]  'double' 0.000000e+00
+// CHECK-NEXT:         | | | | `-CallExpr [[ADDR_148:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT:         | | | |   `-ImplicitCastExpr [[ADDR_149:0x[a-z0-9]*]]  'int (*)({{.*}})' 
+// CHECK-NEXT:         | | | |     `-DeclRefExpr [[ADDR_150:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_24]] 'also_before_mismatch' 'int ({{.*}})' (FunctionTemplate [[ADDR_18]] 'also_before_mismatch')
+// CHECK-NEXT:         | | | `-CallExpr [[ADDR_151:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT:         | | |   `-ImplicitCastExpr [[ADDR_152:0x[a-z0-9]*]]  'int (*)({{.*}})' 
+// CHECK-NEXT:         | | |     `-DeclRefExpr [[ADDR_153:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_27]] 'also_before_non_template' 'int ({{.*}})'
+// CHECK-NEXT:         | | `-PseudoObjectExpr [[ADDR_154:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT:         | |   |-CallExpr [[ADDR_155:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT:         | |   | |-ImplicitCastExpr [[ADDR_156:0x[a-z0-9]*]]  'int (*)(char)' 
+// CHECK-NEXT:         | |   | | `-DeclRefExpr [[ADDR_157:0x[a-z0-9]*]]  'int (char)' {{.*}}Function [[ADDR_47]] 'also_after' 'int (char)' (FunctionTemplate [[ADDR_114]] 'also_after')
+// CHECK-NEXT:         | |   | `-ImplicitCastExpr [[ADDR_158:0x[a-z0-9]*]]  'char':'char' 
+// CHECK-NEXT:         | |   |   `-IntegerLiteral [[ADDR_159:0x[a-z0-9]*]]  'int' 0
+// CHECK-NEXT:         | |   `-CallExpr [[ADDR_160:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT:         | |     |-ImplicitCastExpr [[ADDR_161:0x[a-z0-9]*]]  'int (*)(char)' 
+// CHECK-NEXT:         | |     | `-DeclRefExpr [[ADDR_54]]  'int (char)' {{.*}}Function [[ADDR_55]] 'also_after[implementation={extension(allow_templates)}]' 'int (char)'
+// CHECK-NEXT:         | |     `-ImplicitCastExpr [[ADDR_162:0x[a-z0-9]*]]  'char':'char' 
+// CHECK-NEXT:         | |       `-IntegerLiteral [[ADDR_159]]  'int' 0
+// CHECK-NEXT:         | `-CallExpr [[ADDR_163:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT:         |   |-ImplicitCastExpr [[ADDR_164:0x[a-z0-9]*]]  'int (*)(int)' 
+// CHECK-NEXT:         |   | `-DeclRefExpr [[ADDR_165:0x[a-z0-9]*]]  'int (int)' {{.*}}Function [[ADDR_128]] 'also_after_mismatch' 'int (int)' (FunctionTemplate [[ADDR_121]] 'also_after_mismatch')
+// CHECK-NEXT:         |   `-IntegerLiteral [[ADDR_166:0x[a-z0-9]*]]  'int' 0
+// CHECK-NEXT:         `-PseudoObjectExpr [[ADDR_167:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT:           |-CallExpr [[ADDR_168:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT:           | `-ImplicitCastExpr [[ADDR_169:0x[a-z0-9]*]]  'int (*)({{.*}})' 
+// CHECK-NEXT:           |   `-DeclRefExpr [[ADDR_170:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_104]] 'only_def' 'int ({{.*}})' (FunctionTemplate [[ADDR_98]] 'only_def')
+// CHECK-NEXT:           `-CallExpr [[ADDR_171:0x[a-z0-9]*]]  'int'
+// CHECK-NEXT:             `-ImplicitCastExpr [[ADDR_172:0x[a-z0-9]*]]  'int (*)({{.*}})' 
+// CHECK-NEXT:               `-DeclRefExpr [[ADDR_106]]  'int ({{.*}})' {{.*}}Function [[ADDR_107]] 'only_def[implementation={extension(allow_templates)}]' 'int ({{.*}})'
diff --git a/clang/test/AST/ast-dump-openmp-declare-variant-extensions.c b/clang/test/AST/ast-dump-openmp-declare-variant-extensions.c
index 4a755282e39d3..577abbc5fe0b0 100644
--- a/clang/test/AST/ast-dump-openmp-declare-variant-extensions.c
+++ b/clang/test/AST/ast-dump-openmp-declare-variant-extensions.c
@@ -200,8 +200,8 @@ int test() {
 // CHECK-NEXT: |   `-DeclRefExpr [[ADDR_111:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_18]] 'picked7' 'int ({{.*}})' non_odr_use_unevaluated
 // CHECK-NEXT: |-FunctionDecl [[ADDR_112:0x[a-z0-9]*]]  col:5 implicit used overloaded1 'int ({{.*}})'
 // CHECK-NEXT: | `-OMPDeclareVariantAttr [[ADDR_113:0x[a-z0-9]*]] <<invalid sloc>> Implicit implementation={extension(match_any)}, device={kind(cpu, gpu)}
-// CHECK-NEXT: |   `-DeclRefExpr [[ADDR_114:0x[a-z0-9]*]]  'int ({{.*}})' Function [[ADDR_115:0x[a-z0-9]*]] 'overloaded1[implementation={extension(match_any)}]' 'int ({{.*}})'
-// CHECK-NEXT: |-FunctionDecl [[ADDR_115]]  col:1 overloaded1[implementation={extension(match_any)}] 'int ({{.*}})'
+// CHECK-NEXT: |   `-DeclRefExpr [[ADDR_114:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_115:0x[a-z0-9]*]] 'overloaded1[implementation={extension(match_any)}, device={kind(cpu, gpu)}]' 'int ({{.*}})'
+// CHECK-NEXT: |-FunctionDecl [[ADDR_115]]  col:1 overloaded1[implementation={extension(match_any)}, device={kind(cpu, gpu)}] 'int ({{.*}})'
 // CHECK-NEXT: | `-CompoundStmt [[ADDR_116:0x[a-z0-9]*]] 
 // CHECK-NEXT: |   `-ReturnStmt [[ADDR_117:0x[a-z0-9]*]] 
 // CHECK-NEXT: |     `-IntegerLiteral [[ADDR_118:0x[a-z0-9]*]]  'int' 0
@@ -210,8 +210,8 @@ int test() {
 // CHECK-NEXT: | | `-ReturnStmt [[ADDR_121:0x[a-z0-9]*]] 
 // CHECK-NEXT: | |   `-IntegerLiteral [[ADDR_122:0x[a-z0-9]*]]  'int' 1
 // CHECK-NEXT: | `-OMPDeclareVariantAttr [[ADDR_123:0x[a-z0-9]*]] <<invalid sloc>> Implicit implementation={extension(match_none)}, device={kind(fpga, gpu)}
-// CHECK-NEXT: |   `-DeclRefExpr [[ADDR_124:0x[a-z0-9]*]]  'int ({{.*}})' Function [[ADDR_125:0x[a-z0-9]*]] 'overloaded2[implementation={extension(match_none)}]' 'int ({{.*}})'
-// CHECK-NEXT: |-FunctionDecl [[ADDR_125]]  col:1 overloaded2[implementation={extension(match_none)}] 'int ({{.*}})'
+// CHECK-NEXT: |   `-DeclRefExpr [[ADDR_124:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_125:0x[a-z0-9]*]] 'overloaded2[implementation={extension(match_none)}, device={kind(fpga, gpu)}]' 'int ({{.*}})'
+// CHECK-NEXT: |-FunctionDecl [[ADDR_125]]  col:1 overloaded2[implementation={extension(match_none)}, device={kind(fpga, gpu)}] 'int ({{.*}})'
 // CHECK-NEXT: | `-CompoundStmt [[ADDR_126:0x[a-z0-9]*]] 
 // CHECK-NEXT: |   `-ReturnStmt [[ADDR_127:0x[a-z0-9]*]] 
 // CHECK-NEXT: |     `-IntegerLiteral [[ADDR_128:0x[a-z0-9]*]]  'int' 0
@@ -333,11 +333,11 @@ int test() {
 // CHECK-NEXT:         |   |   `-DeclRefExpr [[ADDR_236:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_112]] 'overloaded1' 'int ({{.*}})'
 // CHECK-NEXT:         |   `-CallExpr [[ADDR_237:0x[a-z0-9]*]]  'int'
 // CHECK-NEXT:         |     `-ImplicitCastExpr [[ADDR_238:0x[a-z0-9]*]]  'int (*)({{.*}})' 
-// CHECK-NEXT:         |       `-DeclRefExpr [[ADDR_114]]  'int ({{.*}})' Function [[ADDR_115]] 'overloaded1[implementation={extension(match_any)}]' 'int ({{.*}})'
+// CHECK-NEXT:         |       `-DeclRefExpr [[ADDR_114]]  'int ({{.*}})' {{.*}}Function [[ADDR_115]] 'overloaded1[implementation={extension(match_any)}, device={kind(cpu, gpu)}]' 'int ({{.*}})'
 // CHECK-NEXT:         `-PseudoObjectExpr [[ADDR_239:0x[a-z0-9]*]]  'int'
 // CHECK-NEXT:           |-CallExpr [[ADDR_240:0x[a-z0-9]*]]  'int'
 // CHECK-NEXT:           | `-ImplicitCastExpr [[ADDR_241:0x[a-z0-9]*]]  'int (*)({{.*}})' 
 // CHECK-NEXT:           |   `-DeclRefExpr [[ADDR_242:0x[a-z0-9]*]]  'int ({{.*}})' {{.*}}Function [[ADDR_119]] 'overloaded2' 'int ({{.*}})'
 // CHECK-NEXT:           `-CallExpr [[ADDR_243:0x[a-z0-9]*]]  'int'
 // CHECK-NEXT:             `-ImplicitCastExpr [[ADDR_244:0x[a-z0-9]*]]  'int (*)({{.*}})' 
-// CHECK-NEXT:               `-DeclRefExpr [[ADDR_124]]  'int ({{.*}})' Function [[ADDR_125]] 'overloaded2[implementation={extension(match_none)}]' 'int ({{.*}})'
+// CHECK-NEXT:               `-DeclRefExpr [[ADDR_124]]  'int ({{.*}})' {{.*}}Function [[ADDR_125]] 'overloaded2[implementation={extension(match_none)}, device={kind(fpga, gpu)}]' 'int ({{.*}})'
diff --git a/clang/test/AST/attr-swift_bridge.m b/clang/test/AST/attr-swift_bridge.m
new file mode 100644
index 0000000000000..2caa86bef4c0e
--- /dev/null
+++ b/clang/test/AST/attr-swift_bridge.m
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -fsyntax-only -ast-dump %s | FileCheck %s
+
+struct __attribute__((__swift_bridge__("BridgedS"))) S;
+// CHECK: RecordDecl {{.*}} struct S
+// CHECK: SwiftBridgeAttr {{.*}} "BridgedS"
+
+struct S {
+};
+
+// CHECK: RecordDecl {{.*}} struct S definition
+// CHECK: SwiftBridgeAttr {{.*}} Inherited "BridgedS"
diff --git a/clang/test/AST/attr-swift_bridged_typedef.m b/clang/test/AST/attr-swift_bridged_typedef.m
new file mode 100644
index 0000000000000..8c7c0987569ec
--- /dev/null
+++ b/clang/test/AST/attr-swift_bridged_typedef.m
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -fsyntax-only -ast-dump %s | FileCheck %s
+
+typedef struct T TBridged __attribute((__swift_bridged_typedef__));
+// CHECK: TypedefDecl {{.*}} TBridged 'struct T'
+// CHECK: SwiftBridgedTypedefAttr
+
+typedef struct T TBridged;
+// CHECK: TypedefDecl {{.*}} TBridged 'struct T'
+// CHECK: SwiftBridgedTypedefAttr
diff --git a/clang/test/AST/attr-swift_bridged_typedef.mm b/clang/test/AST/attr-swift_bridged_typedef.mm
new file mode 100644
index 0000000000000..44fd022d5ea79
--- /dev/null
+++ b/clang/test/AST/attr-swift_bridged_typedef.mm
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -fsyntax-only %s -ast-dump | FileCheck %s
+
+@interface NSString
+@end
+
+using NSStringAlias __attribute__((__swift_bridged_typedef__)) = NSString *;
+// CHECK: TypeAliasDecl {{.*}} NSStringAlias 'NSString *'
+// CHECK: SwiftBridgedTypedefAttr
diff --git a/clang/test/Analysis/Inputs/expected-plists/plist-macros-with-expansion.cpp.plist b/clang/test/Analysis/Inputs/expected-plists/plist-macros-with-expansion.cpp.plist
index 2988f8504fcf7..4a2741f0d4937 100644
--- a/clang/test/Analysis/Inputs/expected-plists/plist-macros-with-expansion.cpp.plist
+++ b/clang/test/Analysis/Inputs/expected-plists/plist-macros-with-expansion.cpp.plist
@@ -16,12 +16,12 @@
         start
          
           
-           line25
+           line23
            col3
            file0
           
           
-           line25
+           line23
            col5
            file0
           
@@ -29,12 +29,12 @@
         end
          
           
-           line26
+           line24
            col3
            file0
           
           
-           line26
+           line24
            col21
            file0
           
@@ -46,7 +46,7 @@
      kindevent
      location
      
-      line26
+      line24
       col3
       file0
      
@@ -54,12 +54,12 @@
      
        
         
-         line26
+         line24
          col3
          file0
         
         
-         line26
+         line24
          col21
          file0
         
@@ -79,12 +79,12 @@
         start
          
           
-           line27
+           line25
            col3
            file0
           
           
-           line27
+           line25
            col3
            file0
           
@@ -92,12 +92,12 @@
         end
          
           
-           line27
+           line25
            col8
            file0
           
           
-           line27
+           line25
            col8
            file0
           
@@ -109,7 +109,7 @@
      kindevent
      location
      
-      line27
+      line25
       col8
       file0
      
@@ -117,12 +117,12 @@
      
        
         
-         line27
+         line25
          col4
          file0
         
         
-         line27
+         line25
          col6
          file0
         
@@ -140,7 +140,7 @@
     
      location
      
-      line26
+      line24
       col3
       file0
      
@@ -159,7 +159,7 @@
   issue_hash_function_offset3
   location
   
-   line27
+   line25
    col8
    file0
   
@@ -167,10 +167,10 @@
   
    0
    
+    22
+    23
     24
     25
-    26
-    27
    
   
   
@@ -185,12 +185,12 @@
         start
          
           
-           line38
+           line36
            col3
            file0
           
           
-           line38
+           line36
            col5
            file0
           
@@ -198,12 +198,12 @@
         end
          
           
-           line39
+           line37
            col3
            file0
           
           
-           line39
+           line37
            col39
            file0
           
@@ -215,7 +215,7 @@
      kindevent
      location
      
-      line39
+      line37
       col3
       file0
      
@@ -223,12 +223,12 @@
      
        
         
-         line39
+         line37
          col3
          file0
         
         
-         line39
+         line37
          col39
          file0
         
@@ -248,12 +248,12 @@
         start
          
           
-           line40
+           line38
            col3
            file0
           
           
-           line40
+           line38
            col3
            file0
           
@@ -261,12 +261,12 @@
         end
          
           
-           line40
+           line38
            col8
            file0
           
           
-           line40
+           line38
            col8
            file0
           
@@ -278,7 +278,7 @@
      kindevent
      location
      
-      line40
+      line38
       col8
       file0
      
@@ -286,12 +286,12 @@
      
        
         
-         line40
+         line38
          col4
          file0
         
         
-         line40
+         line38
          col6
          file0
         
@@ -309,7 +309,7 @@
     
      location
      
-      line39
+      line37
       col3
       file0
      
@@ -328,7 +328,7 @@
   issue_hash_function_offset3
   location
   
-   line40
+   line38
    col8
    file0
   
@@ -336,10 +336,10 @@
   
    0
    
+    35
+    36
     37
     38
-    39
-    40
    
   
   
@@ -354,12 +354,12 @@
         start
          
           
-           line58
+           line56
            col3
            file0
           
           
-           line58
+           line56
            col5
            file0
           
@@ -367,12 +367,12 @@
         end
          
           
-           line59
+           line57
            col3
            file0
           
           
-           line59
+           line57
            col9
            file0
           
@@ -384,7 +384,7 @@
      kindevent
      location
      
-      line59
+      line57
       col3
       file0
      
@@ -392,12 +392,12 @@
      
        
         
-         line59
+         line57
          col3
          file0
         
         
-         line59
+         line57
          col15
          file0
         
@@ -413,7 +413,7 @@
      kindevent
      location
      
-      line50
+      line48
       col1
       file0
      
@@ -431,12 +431,12 @@
         start
          
           
-           line50
+           line48
            col1
            file0
           
           
-           line50
+           line48
            col4
            file0
           
@@ -444,12 +444,12 @@
         end
          
           
-           line51
+           line49
            col3
            file0
           
           
-           line51
+           line49
            col3
            file0
           
@@ -461,7 +461,7 @@
      kindevent
      location
      
-      line51
+      line49
       col3
       file0
      
@@ -469,12 +469,12 @@
      
        
         
-         line51
+         line49
          col3
          file0
         
         
-         line51
+         line49
          col17
          file0
         
@@ -490,7 +490,7 @@
      kindevent
      location
      
-      line59
+      line57
       col3
       file0
      
@@ -498,12 +498,12 @@
      
        
         
-         line59
+         line57
          col3
          file0
         
         
-         line59
+         line57
          col15
          file0
         
@@ -523,12 +523,12 @@
         start
          
           
-           line60
+           line58
            col3
            file0
           
           
-           line60
+           line58
            col3
            file0
           
@@ -536,12 +536,12 @@
         end
          
           
-           line60
+           line58
            col8
            file0
           
           
-           line60
+           line58
            col8
            file0
           
@@ -553,7 +553,7 @@
      kindevent
      location
      
-      line60
+      line58
       col8
       file0
      
@@ -561,12 +561,12 @@
      
        
         
-         line60
+         line58
          col4
          file0
         
         
-         line60
+         line58
          col6
          file0
         
@@ -584,7 +584,7 @@
     
      location
      
-      line59
+      line57
       col3
       file0
      
@@ -603,7 +603,7 @@
   issue_hash_function_offset3
   location
   
-   line60
+   line58
    col8
    file0
   
@@ -611,12 +611,12 @@
   
    0
    
-    50
-    51
+    48
+    49
+    55
+    56
     57
     58
-    59
-    60
    
   
   
@@ -631,12 +631,12 @@
         start
          
           
-           line78
+           line76
            col3
            file0
           
           
-           line78
+           line76
            col5
            file0
           
@@ -644,12 +644,12 @@
         end
          
           
-           line79
+           line77
            col3
            file0
           
           
-           line79
+           line77
            col9
            file0
           
@@ -661,7 +661,7 @@
      kindevent
      location
      
-      line79
+      line77
       col3
       file0
      
@@ -669,12 +669,12 @@
      
        
         
-         line79
+         line77
          col3
          file0
         
         
-         line79
+         line77
          col13
          file0
         
@@ -690,7 +690,7 @@
      kindevent
      location
      
-      line50
+      line48
       col1
       file0
      
@@ -708,12 +708,12 @@
         start
          
           
-           line50
+           line48
            col1
            file0
           
           
-           line50
+           line48
            col4
            file0
           
@@ -721,12 +721,12 @@
         end
          
           
-           line51
+           line49
            col3
            file0
           
           
-           line51
+           line49
            col3
            file0
           
@@ -738,7 +738,7 @@
      kindevent
      location
      
-      line51
+      line49
       col3
       file0
      
@@ -746,12 +746,12 @@
      
        
         
-         line51
+         line49
          col3
          file0
         
         
-         line51
+         line49
          col17
          file0
         
@@ -767,7 +767,7 @@
      kindevent
      location
      
-      line79
+      line77
       col3
       file0
      
@@ -775,12 +775,12 @@
      
        
         
-         line79
+         line77
          col3
          file0
         
         
-         line79
+         line77
          col13
          file0
         
@@ -796,7 +796,7 @@
      kindevent
      location
      
-      line80
+      line78
       col12
       file0
      
@@ -804,12 +804,12 @@
      
        
         
-         line80
+         line78
          col3
          file0
         
         
-         line80
+         line78
          col10
          file0
         
@@ -827,7 +827,7 @@
     
      location
      
-      line79
+      line77
       col3
       file0
      
@@ -837,7 +837,7 @@
     
      location
      
-      line80
+      line78
       col3
       file0
      
@@ -856,7 +856,7 @@
   issue_hash_function_offset3
   location
   
-   line80
+   line78
    col12
    file0
   
@@ -864,12 +864,12 @@
   
    0
    
-    50
-    51
+    48
+    49
+    75
+    76
     77
     78
-    79
-    80
    
   
   
@@ -884,12 +884,12 @@
         start
          
           
-           line97
+           line95
            col3
            file0
           
           
-           line97
+           line95
            col5
            file0
           
@@ -897,12 +897,12 @@
         end
          
           
-           line98
+           line96
            col3
            file0
           
           
-           line98
+           line96
            col28
            file0
           
@@ -914,7 +914,7 @@
      kindevent
      location
      
-      line98
+      line96
       col3
       file0
      
@@ -922,12 +922,12 @@
      
        
         
-         line98
+         line96
          col3
          file0
         
         
-         line98
+         line96
          col33
          file0
         
@@ -947,12 +947,12 @@
         start
          
           
-           line99
+           line97
            col3
            file0
           
           
-           line99
+           line97
            col3
            file0
           
@@ -960,12 +960,12 @@
         end
          
           
-           line99
+           line97
            col8
            file0
           
           
-           line99
+           line97
            col8
            file0
           
@@ -977,7 +977,7 @@
      kindevent
      location
      
-      line99
+      line97
       col8
       file0
      
@@ -985,12 +985,12 @@
      
        
         
-         line99
+         line97
          col4
          file0
         
         
-         line99
+         line97
          col6
          file0
         
@@ -1008,7 +1008,7 @@
     
      location
      
-      line98
+      line96
       col3
       file0
      
@@ -1027,7 +1027,7 @@
   issue_hash_function_offset3
   location
   
-   line99
+   line97
    col8
    file0
   
@@ -1035,10 +1035,10 @@
   
    0
    
+    94
+    95
     96
     97
-    98
-    99
    
   
   
@@ -1053,12 +1053,12 @@
         start
          
           
-           line114
+           line112
            col3
            file0
           
           
-           line114
+           line112
            col5
            file0
           
@@ -1066,12 +1066,12 @@
         end
          
           
-           line115
+           line113
            col3
            file0
           
           
-           line115
+           line113
            col42
            file0
           
@@ -1083,7 +1083,7 @@
      kindevent
      location
      
-      line115
+      line113
       col3
       file0
      
@@ -1091,12 +1091,12 @@
      
        
         
-         line115
+         line113
          col3
          file0
         
         
-         line115
+         line113
          col47
          file0
         
@@ -1116,12 +1116,12 @@
         start
          
           
-           line116
+           line114
            col3
            file0
           
           
-           line116
+           line114
            col3
            file0
           
@@ -1129,12 +1129,12 @@
         end
          
           
-           line116
+           line114
            col8
            file0
           
           
-           line116
+           line114
            col8
            file0
           
@@ -1146,7 +1146,7 @@
      kindevent
      location
      
-      line116
+      line114
       col8
       file0
      
@@ -1154,12 +1154,12 @@
      
        
         
-         line116
+         line114
          col4
          file0
         
         
-         line116
+         line114
          col6
          file0
         
@@ -1177,7 +1177,7 @@
     
      location
      
-      line115
+      line113
       col3
       file0
      
@@ -1196,7 +1196,7 @@
   issue_hash_function_offset3
   location
   
-   line116
+   line114
    col8
    file0
   
@@ -1204,10 +1204,10 @@
   
    0
    
+    111
+    112
     113
     114
-    115
-    116
    
   
   
@@ -1222,12 +1222,12 @@
         start
          
           
-           line134
+           line132
            col3
            file0
           
           
-           line134
+           line132
            col5
            file0
           
@@ -1235,12 +1235,12 @@
         end
          
           
-           line135
+           line133
            col3
            file0
           
           
-           line135
+           line133
            col39
            file0
           
@@ -1252,7 +1252,7 @@
      kindevent
      location
      
-      line135
+      line133
       col3
       file0
      
@@ -1260,12 +1260,12 @@
      
        
         
-         line135
+         line133
          col3
          file0
         
         
-         line135
+         line133
          col44
          file0
         
@@ -1285,12 +1285,12 @@
         start
          
           
-           line136
+           line134
            col3
            file0
           
           
-           line136
+           line134
            col3
            file0
           
@@ -1298,12 +1298,12 @@
         end
          
           
-           line136
+           line134
            col8
            file0
           
           
-           line136
+           line134
            col8
            file0
           
@@ -1315,7 +1315,7 @@
      kindevent
      location
      
-      line136
+      line134
       col8
       file0
      
@@ -1323,12 +1323,12 @@
      
        
         
-         line136
+         line134
          col4
          file0
         
         
-         line136
+         line134
          col6
          file0
         
@@ -1346,7 +1346,7 @@
     
      location
      
-      line135
+      line133
       col3
       file0
      
@@ -1365,7 +1365,7 @@
   issue_hash_function_offset3
   location
   
-   line136
+   line134
    col8
    file0
   
@@ -1373,10 +1373,10 @@
   
    0
    
+    131
+    132
     133
     134
-    135
-    136
    
   
   
@@ -1391,12 +1391,12 @@
         start
          
           
-           line161
+           line159
            col3
            file0
           
           
-           line161
+           line159
            col5
            file0
           
@@ -1404,12 +1404,12 @@
         end
          
           
-           line162
+           line160
            col3
            file0
           
           
-           line162
+           line160
            col19
            file0
           
@@ -1421,7 +1421,7 @@
      kindevent
      location
      
-      line162
+      line160
       col3
       file0
      
@@ -1429,12 +1429,12 @@
      
        
         
-         line162
+         line160
          col3
          file0
         
         
-         line162
+         line160
          col52
          file0
         
@@ -1454,12 +1454,12 @@
         start
          
           
-           line163
+           line161
            col3
            file0
           
           
-           line163
+           line161
            col3
            file0
           
@@ -1467,12 +1467,12 @@
         end
          
           
-           line163
+           line161
            col6
            file0
           
           
-           line163
+           line161
            col6
            file0
           
@@ -1484,7 +1484,7 @@
      kindevent
      location
      
-      line163
+      line161
       col6
       file0
      
@@ -1492,12 +1492,12 @@
      
        
         
-         line163
+         line161
          col4
          file0
         
         
-         line163
+         line161
          col4
          file0
         
@@ -1515,7 +1515,7 @@
     
      location
      
-      line162
+      line160
       col3
       file0
      
@@ -1534,7 +1534,7 @@
   issue_hash_function_offset3
   location
   
-   line163
+   line161
    col6
    file0
   
@@ -1542,10 +1542,10 @@
   
    0
    
+    158
+    159
     160
     161
-    162
-    163
    
   
   
@@ -1560,12 +1560,12 @@
         start
          
           
-           line170
+           line168
            col3
            file0
           
           
-           line170
+           line168
            col5
            file0
           
@@ -1573,12 +1573,12 @@
         end
          
           
-           line171
+           line169
            col3
            file0
           
           
-           line171
+           line169
            col19
            file0
           
@@ -1590,7 +1590,7 @@
      kindevent
      location
      
-      line171
+      line169
       col3
       file0
      
@@ -1598,12 +1598,12 @@
      
        
         
-         line171
+         line169
          col3
          file0
         
         
-         line171
+         line169
          col52
          file0
         
@@ -1623,12 +1623,12 @@
         start
          
           
-           line172
+           line170
            col3
            file0
           
           
-           line172
+           line170
            col3
            file0
           
@@ -1636,12 +1636,12 @@
         end
          
           
-           line172
+           line170
            col6
            file0
           
           
-           line172
+           line170
            col6
            file0
           
@@ -1653,7 +1653,7 @@
      kindevent
      location
      
-      line172
+      line170
       col6
       file0
      
@@ -1661,12 +1661,12 @@
      
        
         
-         line172
+         line170
          col4
          file0
         
         
-         line172
+         line170
          col4
          file0
         
@@ -1684,7 +1684,7 @@
     
      location
      
-      line171
+      line169
       col3
       file0
      
@@ -1703,7 +1703,7 @@
   issue_hash_function_offset3
   location
   
-   line172
+   line170
    col6
    file0
   
@@ -1711,10 +1711,10 @@
   
    0
    
+    167
+    168
     169
     170
-    171
-    172
    
   
   
@@ -1729,12 +1729,12 @@
         start
          
           
-           line179
+           line177
            col3
            file0
           
           
-           line179
+           line177
            col5
            file0
           
@@ -1742,12 +1742,12 @@
         end
          
           
-           line180
+           line178
            col3
            file0
           
           
-           line180
+           line178
            col19
            file0
           
@@ -1759,7 +1759,7 @@
      kindevent
      location
      
-      line180
+      line178
       col3
       file0
      
@@ -1767,12 +1767,12 @@
      
        
         
-         line180
+         line178
          col3
          file0
         
         
-         line180
+         line178
          col52
          file0
         
@@ -1792,12 +1792,12 @@
         start
          
           
-           line181
+           line179
            col3
            file0
           
           
-           line181
+           line179
            col3
            file0
           
@@ -1805,12 +1805,12 @@
         end
          
           
-           line181
+           line179
            col6
            file0
           
           
-           line181
+           line179
            col6
            file0
           
@@ -1822,7 +1822,7 @@
      kindevent
      location
      
-      line181
+      line179
       col6
       file0
      
@@ -1830,12 +1830,12 @@
      
        
         
-         line181
+         line179
          col4
          file0
         
         
-         line181
+         line179
          col4
          file0
         
@@ -1853,7 +1853,7 @@
     
      location
      
-      line180
+      line178
       col3
       file0
      
@@ -1872,7 +1872,7 @@
   issue_hash_function_offset3
   location
   
-   line181
+   line179
    col6
    file0
   
@@ -1880,10 +1880,10 @@
   
    0
    
+    176
+    177
     178
     179
-    180
-    181
    
   
   
@@ -1898,12 +1898,12 @@
         start
          
           
-           line193
+           line191
            col3
            file0
           
           
-           line193
+           line191
            col5
            file0
           
@@ -1911,12 +1911,12 @@
         end
          
           
-           line194
+           line192
            col3
            file0
           
           
-           line194
+           line192
            col15
            file0
           
@@ -1928,7 +1928,7 @@
      kindevent
      location
      
-      line194
+      line192
       col3
       file0
      
@@ -1936,12 +1936,12 @@
      
        
         
-         line194
+         line192
          col3
          file0
         
         
-         line194
+         line192
          col30
          file0
         
@@ -1957,7 +1957,7 @@
      kindevent
      location
      
-      line50
+      line48
       col1
       file0
      
@@ -1975,12 +1975,12 @@
         start
          
           
-           line50
+           line48
            col1
            file0
           
           
-           line50
+           line48
            col4
            file0
           
@@ -1988,12 +1988,12 @@
         end
          
           
-           line51
+           line49
            col3
            file0
           
           
-           line51
+           line49
            col3
            file0
           
@@ -2005,7 +2005,7 @@
      kindevent
      location
      
-      line51
+      line49
       col3
       file0
      
@@ -2013,12 +2013,12 @@
      
        
         
-         line51
+         line49
          col3
          file0
         
         
-         line51
+         line49
          col17
          file0
         
@@ -2034,7 +2034,7 @@
      kindevent
      location
      
-      line194
+      line192
       col3
       file0
      
@@ -2042,12 +2042,12 @@
      
        
         
-         line194
+         line192
          col3
          file0
         
         
-         line194
+         line192
          col30
          file0
         
@@ -2067,12 +2067,12 @@
         start
          
           
-           line195
+           line193
            col3
            file0
           
           
-           line195
+           line193
            col3
            file0
           
@@ -2080,12 +2080,12 @@
         end
          
           
-           line195
+           line193
            col6
            file0
           
           
-           line195
+           line193
            col6
            file0
           
@@ -2097,7 +2097,7 @@
      kindevent
      location
      
-      line195
+      line193
       col6
       file0
      
@@ -2105,12 +2105,12 @@
      
        
         
-         line195
+         line193
          col4
          file0
         
         
-         line195
+         line193
          col4
          file0
         
@@ -2128,7 +2128,7 @@
     
      location
      
-      line194
+      line192
       col3
       file0
      
@@ -2147,7 +2147,7 @@
   issue_hash_function_offset3
   location
   
-   line195
+   line193
    col6
    file0
   
@@ -2155,12 +2155,12 @@
   
    0
    
-    50
-    51
+    48
+    49
+    190
+    191
     192
     193
-    194
-    195
    
   
   
@@ -2175,12 +2175,12 @@
         start
          
           
-           line207
+           line205
            col3
            file0
           
           
-           line207
+           line205
            col5
            file0
           
@@ -2188,12 +2188,12 @@
         end
          
           
-           line208
+           line206
            col3
            file0
           
           
-           line208
+           line206
            col15
            file0
           
@@ -2205,7 +2205,7 @@
      kindevent
      location
      
-      line208
+      line206
       col3
       file0
      
@@ -2213,12 +2213,12 @@
      
        
         
-         line208
+         line206
          col3
          file0
         
         
-         line208
+         line206
          col48
          file0
         
@@ -2234,7 +2234,7 @@
      kindevent
      location
      
-      line201
+      line199
       col1
       file0
      
@@ -2252,12 +2252,12 @@
         start
          
           
-           line201
+           line199
            col1
            file0
           
           
-           line201
+           line199
            col4
            file0
           
@@ -2265,12 +2265,12 @@
         end
          
           
-           line202
+           line200
            col3
            file0
           
           
-           line202
+           line200
            col11
            file0
           
@@ -2282,7 +2282,7 @@
      kindevent
      location
      
-      line202
+      line200
       col3
       file0
      
@@ -2290,12 +2290,12 @@
      
        
         
-         line202
+         line200
          col3
          file0
         
         
-         line202
+         line200
          col17
          file0
         
@@ -2311,7 +2311,7 @@
      kindevent
      location
      
-      line50
+      line48
       col1
       file0
      
@@ -2329,12 +2329,12 @@
         start
          
           
-           line50
+           line48
            col1
            file0
           
           
-           line50
+           line48
            col4
            file0
           
@@ -2342,12 +2342,12 @@
         end
          
           
-           line51
+           line49
            col3
            file0
           
           
-           line51
+           line49
            col3
            file0
           
@@ -2359,7 +2359,7 @@
      kindevent
      location
      
-      line51
+      line49
       col3
       file0
      
@@ -2367,12 +2367,12 @@
      
        
         
-         line51
+         line49
          col3
          file0
         
         
-         line51
+         line49
          col17
          file0
         
@@ -2388,7 +2388,7 @@
      kindevent
      location
      
-      line202
+      line200
       col3
       file0
      
@@ -2396,12 +2396,12 @@
      
        
         
-         line202
+         line200
          col3
          file0
         
         
-         line202
+         line200
          col17
          file0
         
@@ -2421,12 +2421,12 @@
         start
          
           
-           line202
+           line200
            col3
            file0
           
           
-           line202
+           line200
            col11
            file0
           
@@ -2434,12 +2434,12 @@
         end
          
           
-           line203
+           line201
            col3
            file0
           
           
-           line203
+           line201
            col7
            file0
           
@@ -2451,7 +2451,7 @@
      kindevent
      location
      
-      line208
+      line206
       col3
       file0
      
@@ -2459,12 +2459,12 @@
      
        
         
-         line208
+         line206
          col3
          file0
         
         
-         line208
+         line206
          col48
          file0
         
@@ -2484,12 +2484,12 @@
         start
          
           
-           line209
+           line207
            col3
            file0
           
           
-           line209
+           line207
            col3
            file0
           
@@ -2497,12 +2497,12 @@
         end
          
           
-           line209
+           line207
            col6
            file0
           
           
-           line209
+           line207
            col6
            file0
           
@@ -2514,7 +2514,7 @@
      kindevent
      location
      
-      line209
+      line207
       col6
       file0
      
@@ -2522,12 +2522,12 @@
      
        
         
-         line209
+         line207
          col4
          file0
         
         
-         line209
+         line207
          col4
          file0
         
@@ -2545,7 +2545,7 @@
     
      location
      
-      line208
+      line206
       col3
       file0
      
@@ -2564,7 +2564,7 @@
   issue_hash_function_offset3
   location
   
-   line209
+   line207
    col6
    file0
   
@@ -2572,15 +2572,15 @@
   
    0
    
-    50
-    51
+    48
+    49
+    199
+    200
     201
-    202
-    203
+    204
+    205
     206
     207
-    208
-    209
    
   
   
@@ -2595,12 +2595,12 @@
         start
          
           
-           line219
+           line217
            col3
            file0
           
           
-           line219
+           line217
            col5
            file0
           
@@ -2608,12 +2608,12 @@
         end
          
           
-           line220
+           line218
            col3
            file0
           
           
-           line220
+           line218
            col31
            file0
           
@@ -2625,7 +2625,7 @@
      kindevent
      location
      
-      line220
+      line218
       col3
       file0
      
@@ -2633,12 +2633,12 @@
      
        
         
-         line220
+         line218
          col3
          file0
         
         
-         line220
+         line218
          col64
          file0
         
@@ -2654,7 +2654,7 @@
      kindevent
      location
      
-      line201
+      line199
       col1
       file0
      
@@ -2672,12 +2672,12 @@
         start
          
           
-           line201
+           line199
            col1
            file0
           
           
-           line201
+           line199
            col4
            file0
           
@@ -2685,12 +2685,12 @@
         end
          
           
-           line202
+           line200
            col3
            file0
           
           
-           line202
+           line200
            col11
            file0
           
@@ -2702,7 +2702,7 @@
      kindevent
      location
      
-      line202
+      line200
       col3
       file0
      
@@ -2710,12 +2710,12 @@
      
        
         
-         line202
+         line200
          col3
          file0
         
         
-         line202
+         line200
          col17
          file0
         
@@ -2731,7 +2731,7 @@
      kindevent
      location
      
-      line50
+      line48
       col1
       file0
      
@@ -2749,12 +2749,12 @@
         start
          
           
-           line50
+           line48
            col1
            file0
           
           
-           line50
+           line48
            col4
            file0
           
@@ -2762,12 +2762,12 @@
         end
          
           
-           line51
+           line49
            col3
            file0
           
           
-           line51
+           line49
            col3
            file0
           
@@ -2779,7 +2779,7 @@
      kindevent
      location
      
-      line51
+      line49
       col3
       file0
      
@@ -2787,12 +2787,12 @@
      
        
         
-         line51
+         line49
          col3
          file0
         
         
-         line51
+         line49
          col17
          file0
         
@@ -2808,7 +2808,7 @@
      kindevent
      location
      
-      line202
+      line200
       col3
       file0
      
@@ -2816,12 +2816,12 @@
      
        
         
-         line202
+         line200
          col3
          file0
         
         
-         line202
+         line200
          col17
          file0
         
@@ -2841,12 +2841,12 @@
         start
          
           
-           line202
+           line200
            col3
            file0
           
           
-           line202
+           line200
            col11
            file0
           
@@ -2854,12 +2854,12 @@
         end
          
           
-           line203
+           line201
            col3
            file0
           
           
-           line203
+           line201
            col7
            file0
           
@@ -2871,7 +2871,7 @@
      kindevent
      location
      
-      line220
+      line218
       col3
       file0
      
@@ -2879,12 +2879,12 @@
      
        
         
-         line220
+         line218
          col3
          file0
         
         
-         line220
+         line218
          col64
          file0
         
@@ -2904,12 +2904,12 @@
         start
          
           
-           line221
+           line219
            col3
            file0
           
           
-           line221
+           line219
            col3
            file0
           
@@ -2917,12 +2917,12 @@
         end
          
           
-           line221
+           line219
            col6
            file0
           
           
-           line221
+           line219
            col6
            file0
           
@@ -2934,7 +2934,7 @@
      kindevent
      location
      
-      line221
+      line219
       col6
       file0
      
@@ -2942,12 +2942,12 @@
      
        
         
-         line221
+         line219
          col4
          file0
         
         
-         line221
+         line219
          col4
          file0
         
@@ -2965,7 +2965,7 @@
     
      location
      
-      line220
+      line218
       col3
       file0
      
@@ -2984,7 +2984,7 @@
   issue_hash_function_offset3
   location
   
-   line221
+   line219
    col6
    file0
   
@@ -2992,15 +2992,15 @@
   
    0
    
-    50
-    51
+    48
+    49
+    199
+    200
     201
-    202
-    203
+    216
+    217
     218
     219
-    220
-    221
    
   
   
@@ -3015,12 +3015,12 @@
         start
          
           
-           line231
+           line229
            col3
            file0
           
           
-           line231
+           line229
            col5
            file0
           
@@ -3028,12 +3028,12 @@
         end
          
           
-           line235
+           line233
            col3
            file0
           
           
-           line235
+           line233
            col13
            file0
           
@@ -3045,7 +3045,7 @@
      kindevent
      location
      
-      line235
+      line233
       col3
       file0
      
@@ -3053,12 +3053,12 @@
      
        
         
-         line235
+         line233
          col3
          file0
         
         
-         line235
+         line233
          col58
          file0
         
@@ -3074,7 +3074,7 @@
      kindevent
      location
      
-      line235
+      line233
       col3
       file0
      
@@ -3088,7 +3088,7 @@
      kindevent
      location
      
-      line235
+      line233
       col3
       file0
      
@@ -3096,12 +3096,12 @@
      
        
         
-         line235
+         line233
          col3
          file0
         
         
-         line235
+         line233
          col58
          file0
         
@@ -3117,7 +3117,7 @@
      kindevent
      location
      
-      line50
+      line48
       col1
       file0
      
@@ -3135,12 +3135,12 @@
         start
          
           
-           line50
+           line48
            col1
            file0
           
           
-           line50
+           line48
            col4
            file0
           
@@ -3148,12 +3148,12 @@
         end
          
           
-           line51
+           line49
            col3
            file0
           
           
-           line51
+           line49
            col3
            file0
           
@@ -3165,7 +3165,7 @@
      kindevent
      location
      
-      line51
+      line49
       col3
       file0
      
@@ -3173,12 +3173,12 @@
      
        
         
-         line51
+         line49
          col3
          file0
         
         
-         line51
+         line49
          col17
          file0
         
@@ -3194,7 +3194,7 @@
      kindevent
      location
      
-      line235
+      line233
       col3
       file0
      
@@ -3202,12 +3202,12 @@
      
        
         
-         line235
+         line233
          col3
          file0
         
         
-         line235
+         line233
          col58
          file0
         
@@ -3223,7 +3223,7 @@
      kindevent
      location
      
-      line235
+      line233
       col3
       file0
      
@@ -3231,12 +3231,12 @@
      
        
         
-         line235
+         line233
          col3
          file0
         
         
-         line235
+         line233
          col58
          file0
         
@@ -3256,12 +3256,12 @@
         start
          
           
-           line236
+           line234
            col3
            file0
           
           
-           line236
+           line234
            col3
            file0
           
@@ -3269,12 +3269,12 @@
         end
          
           
-           line236
+           line234
            col8
            file0
           
           
-           line236
+           line234
            col8
            file0
           
@@ -3286,7 +3286,7 @@
      kindevent
      location
      
-      line236
+      line234
       col8
       file0
      
@@ -3294,12 +3294,12 @@
      
        
         
-         line236
+         line234
          col4
          file0
         
         
-         line236
+         line234
          col6
          file0
         
@@ -3317,7 +3317,7 @@
     
      location
      
-      line235
+      line233
       col3
       file0
      
@@ -3327,7 +3327,7 @@
     
      location
      
-      line235
+      line233
       col3
       file0
      
@@ -3346,7 +3346,7 @@
   issue_hash_function_offset6
   location
   
-   line236
+   line234
    col8
    file0
   
@@ -3354,13 +3354,13 @@
   
    0
    
-    50
-    51
+    48
+    49
+    228
+    229
     230
-    231
-    232
-    235
-    236
+    233
+    234
    
   
   
@@ -3371,7 +3371,7 @@
      kindevent
      location
      
-      line246
+      line244
       col3
       file0
      
@@ -3379,12 +3379,12 @@
      
        
         
-         line246
+         line244
          col3
          file0
         
         
-         line254
+         line252
          col4
          file0
         
@@ -3400,7 +3400,7 @@
      kindevent
      location
      
-      line246
+      line244
       col3
       file0
      
@@ -3408,12 +3408,12 @@
      
        
         
-         line246
+         line244
          col3
          file0
         
         
-         line254
+         line252
          col4
          file0
         
@@ -3431,7 +3431,7 @@
     
      location
      
-      line246
+      line244
       col3
       file0
      
@@ -3450,7 +3450,7 @@
   issue_hash_function_offset1
   location
   
-   line246
+   line244
    col3
    file0
   
@@ -3458,8 +3458,8 @@
   
    0
    
-    245
-    246
+    243
+    244
    
   
   
@@ -3474,12 +3474,12 @@
         start
          
           
-           line268
+           line266
            col3
            file0
           
           
-           line268
+           line266
            col5
            file0
           
@@ -3487,12 +3487,12 @@
         end
          
           
-           line270
+           line268
            col3
            file0
           
           
-           line270
+           line268
            col25
            file0
           
@@ -3504,7 +3504,7 @@
      kindevent
      location
      
-      line270
+      line268
       col3
       file0
      
@@ -3512,12 +3512,12 @@
      
        
         
-         line270
+         line268
          col3
          file0
         
         
-         line270
+         line268
          col31
          file0
         
@@ -3537,12 +3537,12 @@
         start
          
           
-           line271
+           line269
            col3
            file0
           
           
-           line271
+           line269
            col3
            file0
           
@@ -3550,12 +3550,12 @@
         end
          
           
-           line271
+           line269
            col8
            file0
           
           
-           line271
+           line269
            col8
            file0
           
@@ -3567,7 +3567,7 @@
      kindevent
      location
      
-      line271
+      line269
       col8
       file0
      
@@ -3575,12 +3575,12 @@
      
        
         
-         line271
+         line269
          col4
          file0
         
         
-         line271
+         line269
          col6
          file0
         
@@ -3598,7 +3598,7 @@
     
      location
      
-      line270
+      line268
       col3
       file0
      
@@ -3617,7 +3617,7 @@
   issue_hash_function_offset4
   location
   
-   line271
+   line269
    col8
    file0
   
@@ -3625,10 +3625,10 @@
   
    0
    
-    267
+    265
+    266
     268
-    270
-    271
+    269
    
   
   
@@ -3643,12 +3643,12 @@
         start
          
           
-           line282
+           line280
            col3
            file0
           
           
-           line282
+           line280
            col5
            file0
           
@@ -3656,12 +3656,12 @@
         end
          
           
-           line284
+           line282
            col3
            file0
           
           
-           line284
+           line282
            col20
            file0
           
@@ -3673,7 +3673,7 @@
      kindevent
      location
      
-      line284
+      line282
       col3
       file0
      
@@ -3681,12 +3681,12 @@
      
        
         
-         line284
+         line282
          col3
          file0
         
         
-         line284
+         line282
          col27
          file0
         
@@ -3706,12 +3706,12 @@
         start
          
           
-           line285
+           line283
            col3
            file0
           
           
-           line285
+           line283
            col3
            file0
           
@@ -3719,12 +3719,12 @@
         end
          
           
-           line285
+           line283
            col8
            file0
           
           
-           line285
+           line283
            col8
            file0
           
@@ -3736,7 +3736,7 @@
      kindevent
      location
      
-      line285
+      line283
       col8
       file0
      
@@ -3744,12 +3744,12 @@
      
        
         
-         line285
+         line283
          col4
          file0
         
         
-         line285
+         line283
          col6
          file0
         
@@ -3767,7 +3767,7 @@
     
      location
      
-      line284
+      line282
       col3
       file0
      
@@ -3786,7 +3786,7 @@
   issue_hash_function_offset4
   location
   
-   line285
+   line283
    col8
    file0
   
@@ -3794,10 +3794,10 @@
   
    0
    
-    281
+    279
+    280
     282
-    284
-    285
+    283
    
   
   
@@ -3812,12 +3812,12 @@
         start
          
           
-           line295
+           line293
            col3
            file0
           
           
-           line295
+           line293
            col5
            file0
           
@@ -3825,12 +3825,12 @@
         end
          
           
-           line296
+           line294
            col3
            file0
           
           
-           line296
+           line294
            col44
            file0
           
@@ -3842,7 +3842,7 @@
      kindevent
      location
      
-      line296
+      line294
       col3
       file0
      
@@ -3850,12 +3850,12 @@
      
        
         
-         line296
+         line294
          col3
          file0
         
         
-         line296
+         line294
          col61
          file0
         
@@ -3871,7 +3871,7 @@
      kindevent
      location
      
-      line50
+      line48
       col1
       file0
      
@@ -3889,12 +3889,12 @@
         start
          
           
-           line50
+           line48
            col1
            file0
           
           
-           line50
+           line48
            col4
            file0
           
@@ -3902,12 +3902,12 @@
         end
          
           
-           line51
+           line49
            col3
            file0
           
           
-           line51
+           line49
            col3
            file0
           
@@ -3919,7 +3919,7 @@
      kindevent
      location
      
-      line51
+      line49
       col3
       file0
      
@@ -3927,12 +3927,12 @@
      
        
         
-         line51
+         line49
          col3
          file0
         
         
-         line51
+         line49
          col17
          file0
         
@@ -3948,7 +3948,7 @@
      kindevent
      location
      
-      line296
+      line294
       col3
       file0
      
@@ -3956,12 +3956,12 @@
      
        
         
-         line296
+         line294
          col3
          file0
         
         
-         line296
+         line294
          col61
          file0
         
@@ -3981,12 +3981,12 @@
         start
          
           
-           line297
+           line295
            col3
            file0
           
           
-           line297
+           line295
            col3
            file0
           
@@ -3994,12 +3994,12 @@
         end
          
           
-           line297
+           line295
            col8
            file0
           
           
-           line297
+           line295
            col8
            file0
           
@@ -4011,7 +4011,7 @@
      kindevent
      location
      
-      line297
+      line295
       col8
       file0
      
@@ -4019,12 +4019,12 @@
      
        
         
-         line297
+         line295
          col4
          file0
         
         
-         line297
+         line295
          col6
          file0
         
@@ -4042,7 +4042,7 @@
     
      location
      
-      line296
+      line294
       col3
       file0
      
@@ -4061,7 +4061,7 @@
   issue_hash_function_offset3
   location
   
-   line297
+   line295
    col8
    file0
   
@@ -4069,12 +4069,12 @@
   
    0
    
-    50
-    51
+    48
+    49
+    292
+    293
     294
     295
-    296
-    297
    
   
   
@@ -4089,12 +4089,12 @@
         start
          
           
-           line315
+           line313
            col3
            file0
           
           
-           line315
+           line313
            col5
            file0
           
@@ -4102,12 +4102,12 @@
         end
          
           
-           line316
+           line314
            col3
            file0
           
           
-           line316
+           line314
            col22
            file0
           
@@ -4119,7 +4119,7 @@
      kindevent
      location
      
-      line316
+      line314
       col3
       file0
      
@@ -4127,12 +4127,12 @@
      
        
         
-         line316
+         line314
          col3
          file0
         
         
-         line316
+         line314
          col42
          file0
         
@@ -4152,12 +4152,12 @@
         start
          
           
-           line317
+           line315
            col3
            file0
           
           
-           line317
+           line315
            col3
            file0
           
@@ -4165,12 +4165,12 @@
         end
          
           
-           line317
+           line315
            col8
            file0
           
           
-           line317
+           line315
            col8
            file0
           
@@ -4182,7 +4182,7 @@
      kindevent
      location
      
-      line317
+      line315
       col8
       file0
      
@@ -4190,12 +4190,12 @@
      
        
         
-         line317
+         line315
          col4
          file0
         
         
-         line317
+         line315
          col6
          file0
         
@@ -4213,7 +4213,7 @@
     
      location
      
-      line316
+      line314
       col3
       file0
      
@@ -4232,7 +4232,7 @@
   issue_hash_function_offset3
   location
   
-   line317
+   line315
    col8
    file0
   
@@ -4240,10 +4240,10 @@
   
    0
    
+    312
+    313
     314
     315
-    316
-    317
    
   
   
@@ -4258,12 +4258,12 @@
         start
          
           
-           line324
+           line322
            col3
            file0
           
           
-           line324
+           line322
            col5
            file0
           
@@ -4271,12 +4271,12 @@
         end
          
           
-           line327
+           line325
            col3
            file0
           
           
-           line327
+           line325
            col22
            file0
           
@@ -4288,7 +4288,7 @@
      kindevent
      location
      
-      line327
+      line325
       col3
       file0
      
@@ -4296,12 +4296,12 @@
      
        
         
-         line327
+         line325
          col3
          file0
         
         
-         line327
+         line325
          col27
          file0
         
@@ -4321,12 +4321,12 @@
         start
          
           
-           line328
+           line326
            col3
            file0
           
           
-           line328
+           line326
            col3
            file0
           
@@ -4334,12 +4334,12 @@
         end
          
           
-           line328
+           line326
            col8
            file0
           
           
-           line328
+           line326
            col8
            file0
           
@@ -4351,7 +4351,7 @@
      kindevent
      location
      
-      line328
+      line326
       col8
       file0
      
@@ -4359,12 +4359,12 @@
      
        
         
-         line328
+         line326
          col4
          file0
         
         
-         line328
+         line326
          col6
          file0
         
@@ -4382,7 +4382,7 @@
     
      location
      
-      line327
+      line325
       col3
       file0
      
@@ -4401,7 +4401,7 @@
   issue_hash_function_offset5
   location
   
-   line328
+   line326
    col8
    file0
   
@@ -4409,10 +4409,10 @@
   
    0
    
-    323
-    324
-    327
-    328
+    321
+    322
+    325
+    326
    
   
   
@@ -4427,12 +4427,12 @@
         start
          
           
-           line343
+           line341
            col3
            file0
           
           
-           line343
+           line341
            col5
            file0
           
@@ -4440,12 +4440,12 @@
         end
          
           
-           line344
+           line342
            col3
            file0
           
           
-           line344
+           line342
            col30
            file0
           
@@ -4457,7 +4457,7 @@
      kindevent
      location
      
-      line344
+      line342
       col3
       file0
      
@@ -4465,12 +4465,12 @@
      
        
         
-         line344
+         line342
          col3
          file0
         
         
-         line344
+         line342
          col45
          file0
         
@@ -4490,12 +4490,12 @@
         start
          
           
-           line345
+           line343
            col3
            file0
           
           
-           line345
+           line343
            col3
            file0
           
@@ -4503,12 +4503,12 @@
         end
          
           
-           line345
+           line343
            col8
            file0
           
           
-           line345
+           line343
            col8
            file0
           
@@ -4520,7 +4520,7 @@
      kindevent
      location
      
-      line345
+      line343
       col8
       file0
      
@@ -4528,12 +4528,12 @@
      
        
         
-         line345
+         line343
          col4
          file0
         
         
-         line345
+         line343
          col6
          file0
         
@@ -4551,7 +4551,7 @@
     
      location
      
-      line344
+      line342
       col3
       file0
      
@@ -4570,7 +4570,7 @@
   issue_hash_function_offset3
   location
   
-   line345
+   line343
    col8
    file0
   
@@ -4578,10 +4578,10 @@
   
    0
    
+    340
+    341
     342
     343
-    344
-    345
    
   
   
@@ -4596,12 +4596,12 @@
         start
          
           
-           line352
+           line350
            col3
            file0
           
           
-           line352
+           line350
            col5
            file0
           
@@ -4609,12 +4609,12 @@
         end
          
           
-           line353
+           line351
            col3
            file0
           
           
-           line353
+           line351
            col19
            file0
           
@@ -4626,7 +4626,7 @@
      kindevent
      location
      
-      line353
+      line351
       col3
       file0
      
@@ -4634,12 +4634,12 @@
      
        
         
-         line353
+         line351
          col3
          file0
         
         
-         line353
+         line351
          col53
          file0
         
@@ -4659,12 +4659,12 @@
         start
          
           
-           line354
+           line352
            col3
            file0
           
           
-           line354
+           line352
            col3
            file0
           
@@ -4672,12 +4672,12 @@
         end
          
           
-           line354
+           line352
            col6
            file0
           
           
-           line354
+           line352
            col6
            file0
           
@@ -4689,7 +4689,7 @@
      kindevent
      location
      
-      line354
+      line352
       col6
       file0
      
@@ -4697,12 +4697,12 @@
      
        
         
-         line354
+         line352
          col4
          file0
         
         
-         line354
+         line352
          col4
          file0
         
@@ -4720,7 +4720,7 @@
     
      location
      
-      line353
+      line351
       col3
       file0
      
@@ -4739,7 +4739,7 @@
   issue_hash_function_offset3
   location
   
-   line354
+   line352
    col6
    file0
   
@@ -4747,10 +4747,10 @@
   
    0
    
+    349
+    350
     351
     352
-    353
-    354
    
   
   
@@ -4765,12 +4765,12 @@
         start
          
           
-           line365
+           line363
            col3
            file0
           
           
-           line365
+           line363
            col5
            file0
           
@@ -4778,12 +4778,12 @@
         end
          
           
-           line366
+           line364
            col3
            file0
           
           
-           line366
+           line364
            col11
            file0
           
@@ -4795,7 +4795,7 @@
      kindevent
      location
      
-      line366
+      line364
       col3
       file0
      
@@ -4803,12 +4803,12 @@
      
        
         
-         line366
+         line364
          col3
          file0
         
         
-         line366
+         line364
          col23
          file0
         
@@ -4828,12 +4828,12 @@
         start
          
           
-           line367
+           line365
            col3
            file0
           
           
-           line367
+           line365
            col3
            file0
           
@@ -4841,12 +4841,12 @@
         end
          
           
-           line367
+           line365
            col8
            file0
           
           
-           line367
+           line365
            col8
            file0
           
@@ -4858,7 +4858,7 @@
      kindevent
      location
      
-      line367
+      line365
       col8
       file0
      
@@ -4866,12 +4866,12 @@
      
        
         
-         line367
+         line365
          col4
          file0
         
         
-         line367
+         line365
          col6
          file0
         
@@ -4889,7 +4889,7 @@
     
      location
      
-      line366
+      line364
       col3
       file0
      
@@ -4908,7 +4908,7 @@
   issue_hash_function_offset3
   location
   
-   line367
+   line365
    col8
    file0
   
@@ -4916,10 +4916,10 @@
   
    0
    
+    362
+    363
     364
     365
-    366
-    367
    
   
   
@@ -4934,12 +4934,12 @@
         start
          
           
-           line374
+           line372
            col3
            file0
           
           
-           line374
+           line372
            col5
            file0
           
@@ -4947,12 +4947,12 @@
         end
          
           
-           line375
+           line373
            col3
            file0
           
           
-           line375
+           line373
            col19
            file0
           
@@ -4964,7 +4964,7 @@
      kindevent
      location
      
-      line375
+      line373
       col3
       file0
      
@@ -4972,12 +4972,12 @@
      
        
         
-         line375
+         line373
          col3
          file0
         
         
-         line375
+         line373
          col52
          file0
         
@@ -4997,12 +4997,12 @@
         start
          
           
-           line376
+           line374
            col3
            file0
           
           
-           line376
+           line374
            col3
            file0
           
@@ -5010,12 +5010,12 @@
         end
          
           
-           line376
+           line374
            col6
            file0
           
           
-           line376
+           line374
            col6
            file0
           
@@ -5027,7 +5027,7 @@
      kindevent
      location
      
-      line376
+      line374
       col6
       file0
      
@@ -5035,12 +5035,12 @@
      
        
         
-         line376
+         line374
          col4
          file0
         
         
-         line376
+         line374
          col4
          file0
         
@@ -5058,7 +5058,7 @@
     
      location
      
-      line375
+      line373
       col3
       file0
      
@@ -5077,7 +5077,7 @@
   issue_hash_function_offset3
   location
   
-   line376
+   line374
    col6
    file0
   
@@ -5085,10 +5085,10 @@
   
    0
    
+    371
+    372
     373
     374
-    375
-    376
    
   
   
@@ -5103,12 +5103,12 @@
         start
          
           
-           line422
+           line420
            col3
            file0
           
           
-           line422
+           line420
            col5
            file0
           
@@ -5116,12 +5116,12 @@
         end
          
           
-           line422
+           line420
            col18
            file0
           
           
-           line422
+           line420
            col43
            file0
           
@@ -5133,7 +5133,7 @@
      kindevent
      location
      
-      line422
+      line420
       col18
       file0
      
@@ -5141,12 +5141,12 @@
      
        
         
-         line422
+         line420
          col18
          file0
         
         
-         line422
+         line420
          col49
          file0
         
@@ -5162,7 +5162,7 @@
      kindevent
      location
      
-      line417
+      line415
       col1
       file0
      
@@ -5180,12 +5180,12 @@
         start
          
           
-           line417
+           line415
            col1
            file0
           
           
-           line417
+           line415
            col3
            file0
           
@@ -5193,12 +5193,12 @@
         end
          
           
-           line418
+           line416
            col3
            file0
           
           
-           line418
+           line416
            col21
            file0
           
@@ -5210,7 +5210,7 @@
      kindpop-up
      location
      
-      line418
+      line416
       col3
       file0
      
@@ -5218,12 +5218,12 @@
      
        
         
-         line418
+         line416
          col3
          file0
         
         
-         line418
+         line416
          col27
          file0
         
@@ -5238,7 +5238,7 @@
      kindpop-up
      location
      
-      line418
+      line416
       col3
       file0
      
@@ -5246,12 +5246,12 @@
      
        
         
-         line418
+         line416
          col3
          file0
         
         
-         line418
+         line416
          col27
          file0
         
@@ -5266,7 +5266,7 @@
      kindevent
      location
      
-      line418
+      line416
       col3
       file0
      
@@ -5274,12 +5274,12 @@
      
        
         
-         line418
+         line416
          col3
          file0
         
         
-         line418
+         line416
          col27
          file0
         
@@ -5297,7 +5297,7 @@
     
      location
      
-      line418
+      line416
       col3
       file0
      
@@ -5316,7 +5316,7 @@
   issue_hash_function_offset1
   location
   
-   line418
+   line416
    col3
    file0
   
@@ -5324,10 +5324,10 @@
   
    0
    
-    417
-    418
-    421
-    422
+    415
+    416
+    419
+    420
    
   
   
@@ -5342,12 +5342,12 @@
         start
          
           
-           line437
+           line435
            col3
            file0
           
           
-           line437
+           line435
            col5
            file0
           
@@ -5355,12 +5355,12 @@
         end
          
           
-           line438
+           line436
            col3
            file0
           
           
-           line438
+           line436
            col25
            file0
           
@@ -5372,7 +5372,7 @@
      kindevent
      location
      
-      line438
+      line436
       col3
       file0
      
@@ -5380,12 +5380,12 @@
      
        
         
-         line438
+         line436
          col3
          file0
         
         
-         line438
+         line436
          col67
          file0
         
@@ -5405,12 +5405,12 @@
         start
          
           
-           line439
+           line437
            col3
            file0
           
           
-           line439
+           line437
            col3
            file0
           
@@ -5418,12 +5418,12 @@
         end
          
           
-           line439
+           line437
            col8
            file0
           
           
-           line439
+           line437
            col8
            file0
           
@@ -5435,7 +5435,7 @@
      kindevent
      location
      
-      line439
+      line437
       col8
       file0
      
@@ -5443,12 +5443,12 @@
      
        
         
-         line439
+         line437
          col4
          file0
         
         
-         line439
+         line437
          col6
          file0
         
@@ -5466,7 +5466,7 @@
     
      location
      
-      line438
+      line436
       col3
       file0
      
@@ -5485,7 +5485,7 @@
   issue_hash_function_offset3
   location
   
-   line439
+   line437
    col8
    file0
   
@@ -5493,10 +5493,10 @@
   
    0
    
+    434
+    435
     436
     437
-    438
-    439
    
   
   
@@ -5511,12 +5511,12 @@
         start
          
           
-           line450
+           line448
            col3
            file0
           
           
-           line450
+           line448
            col4
            file0
           
@@ -5524,12 +5524,12 @@
         end
          
           
-           line450
+           line448
            col7
            file0
           
           
-           line450
+           line448
            col11
            file0
           
@@ -5541,7 +5541,7 @@
      kindevent
      location
      
-      line450
+      line448
       col7
       file0
      
@@ -5549,12 +5549,12 @@
      
        
         
-         line450
+         line448
          col7
          file0
         
         
-         line450
+         line448
          col16
          file0
         
@@ -5570,7 +5570,7 @@
      kindevent
      location
      
-      line451
+      line449
       col7
       file0
      
@@ -5578,12 +5578,12 @@
      
        
         
-         line451
+         line449
          col5
          file0
         
         
-         line451
+         line449
          col13
          file0
         
@@ -5601,7 +5601,7 @@
     
      location
      
-      line450
+      line448
       col7
       file0
      
@@ -5620,7 +5620,7 @@
   issue_hash_function_offset2
   location
   
-   line451
+   line449
    col7
    file0
   
@@ -5628,9 +5628,9 @@
   
    0
    
+    447
+    448
     449
-    450
-    451
    
   
   
@@ -5645,12 +5645,12 @@
         start
          
           
-           line459
+           line460
            col33
            file0
           
           
-           line459
+           line460
            col33
            file0
           
@@ -5658,12 +5658,12 @@
         end
          
           
-           line459
+           line460
            col37
            file0
           
           
-           line459
+           line460
            col39
            file0
           
@@ -5675,7 +5675,7 @@
      kindevent
      location
      
-      line459
+      line460
       col37
       file0
      
@@ -5683,12 +5683,12 @@
      
        
         
-         line459
+         line460
          col37
          file0
         
         
-         line459
+         line460
          col41
          file0
         
@@ -5704,7 +5704,7 @@
      kindevent
      location
      
-      line458
+      line459
       col1
       file0
      
@@ -5718,7 +5718,7 @@
      kindevent
      location
      
-      line458
+      line459
       col1
       file0
      
@@ -5726,12 +5726,12 @@
      
        
         
-         line458
+         line459
          col1
          file0
         
         
-         line458
+         line459
          col16
          file0
         
@@ -5747,7 +5747,7 @@
      kindevent
      location
      
-      line459
+      line460
       col37
       file0
      
@@ -5755,12 +5755,12 @@
      
        
         
-         line459
+         line460
          col37
          file0
         
         
-         line459
+         line460
          col41
          file0
         
@@ -5780,12 +5780,12 @@
         start
          
           
-           line459
+           line460
            col37
            file0
           
           
-           line459
+           line460
            col39
            file0
           
@@ -5793,12 +5793,12 @@
         end
          
           
-           line459
+           line460
            col35
            file0
           
           
-           line459
+           line460
            col35
            file0
           
@@ -5810,7 +5810,7 @@
      kindevent
      location
      
-      line459
+      line460
       col35
       file0
      
@@ -5818,12 +5818,12 @@
      
        
         
-         line459
+         line460
          col33
          file0
         
         
-         line459
+         line460
          col41
          file0
         
@@ -5841,7 +5841,7 @@
     
      location
      
-      line458
+      line459
       col1
       file0
      
@@ -5860,7 +5860,7 @@
   issue_hash_function_offset0
   location
   
-   line459
+   line460
    col35
    file0
   
@@ -5868,8 +5868,8 @@
   
    0
    
-    458
     459
+    460
    
   
   
@@ -5884,12 +5884,12 @@
         start
          
           
-           line468
+           line469
            col33
            file0
           
           
-           line468
+           line469
            col33
            file0
           
@@ -5897,12 +5897,12 @@
         end
          
           
-           line468
+           line469
            col37
            file0
           
           
-           line468
+           line469
            col39
            file0
           
@@ -5914,7 +5914,7 @@
      kindevent
      location
      
-      line468
+      line469
       col37
       file0
      
@@ -5922,12 +5922,12 @@
      
        
         
-         line468
+         line469
          col37
          file0
         
         
-         line468
+         line469
          col41
          file0
         
@@ -5943,7 +5943,7 @@
      kindevent
      location
      
-      line467
+      line468
       col1
       file0
      
@@ -5957,7 +5957,7 @@
      kindevent
      location
      
-      line467
+      line468
       col1
       file0
      
@@ -5965,12 +5965,12 @@
      
        
         
-         line467
+         line468
          col1
          file0
         
         
-         line467
+         line468
          col11
          file0
         
@@ -5986,7 +5986,7 @@
      kindevent
      location
      
-      line468
+      line469
       col37
       file0
      
@@ -5994,12 +5994,12 @@
      
        
         
-         line468
+         line469
          col37
          file0
         
         
-         line468
+         line469
          col41
          file0
         
@@ -6019,12 +6019,12 @@
         start
          
           
-           line468
+           line469
            col37
            file0
           
           
-           line468
+           line469
            col39
            file0
           
@@ -6032,12 +6032,12 @@
         end
          
           
-           line468
+           line469
            col35
            file0
           
           
-           line468
+           line469
            col35
            file0
           
@@ -6049,7 +6049,7 @@
      kindevent
      location
      
-      line468
+      line469
       col35
       file0
      
@@ -6057,12 +6057,12 @@
      
        
         
-         line468
+         line469
          col33
          file0
         
         
-         line468
+         line469
          col41
          file0
         
@@ -6080,7 +6080,7 @@
     
      location
      
-      line467
+      line468
       col1
       file0
      
@@ -6099,7 +6099,7 @@
   issue_hash_function_offset0
   location
   
-   line468
+   line469
    col35
    file0
   
@@ -6107,8 +6107,683 @@
   
    0
    
-    467
     468
+    469
+   
+  
+  
+  
+   path
+   
+    
+     kindcontrol
+     edges
+      
+       
+        start
+         
+          
+           line481
+           col3
+           file0
+          
+          
+           line481
+           col5
+           file0
+          
+         
+        end
+         
+          
+           line482
+           col3
+           file0
+          
+          
+           line482
+           col10
+           file0
+          
+         
+       
+      
+    
+    
+     kindevent
+     location
+     
+      line482
+      col3
+      file0
+     
+     ranges
+     
+       
+        
+         line482
+         col3
+         file0
+        
+        
+         line482
+         col28
+         file0
+        
+       
+     
+     depth0
+     extended_message
+     The value 0 is assigned to 'x'
+     message
+     The value 0 is assigned to 'x'
+    
+    
+     kindevent
+     location
+     
+      line483
+      col13
+      file0
+     
+     ranges
+     
+       
+        
+         line483
+         col10
+         file0
+        
+        
+         line483
+         col15
+         file0
+        
+       
+     
+     depth0
+     extended_message
+     Division by zero
+     message
+     Division by zero
+    
+   
+   macro_expansions
+   
+    
+     location
+     
+      line482
+      col3
+      file0
+     
+     nameDISPATCH
+     expansionfoo(x, "LF1M healer");x = 0;;
+    
+   
+   descriptionDivision by zero
+   categoryLogic error
+   typeDivision by zero
+   check_namecore.DivideZero
+   
+   issue_hash_content_of_line_in_context0911a97774745d4fa0ac03cd9680dfe1
+  issue_context_kindfunction
+  issue_contextmulitpleParamsResolveToVA_ARGS
+  issue_hash_function_offset3
+  location
+  
+   line483
+   col13
+   file0
+  
+  ExecutedLines
+  
+   0
+   
+    480
+    481
+    482
+    483
+   
+  
+  
+  
+   path
+   
+    
+     kindcontrol
+     edges
+      
+       
+        start
+         
+          
+           line494
+           col3
+           file0
+          
+          
+           line494
+           col5
+           file0
+          
+         
+        end
+         
+          
+           line495
+           col3
+           file0
+          
+          
+           line495
+           col16
+           file0
+          
+         
+       
+      
+    
+    
+     kindevent
+     location
+     
+      line495
+      col3
+      file0
+     
+     ranges
+     
+       
+        
+         line495
+         col3
+         file0
+        
+        
+         line495
+         col71
+         file0
+        
+       
+     
+     depth0
+     extended_message
+     The value 0 is assigned to 'x'
+     message
+     The value 0 is assigned to 'x'
+    
+    
+     kindevent
+     location
+     
+      line496
+      col13
+      file0
+     
+     ranges
+     
+       
+        
+         line496
+         col10
+         file0
+        
+        
+         line496
+         col15
+         file0
+        
+       
+     
+     depth0
+     extended_message
+     Division by zero
+     message
+     Division by zero
+    
+   
+   macro_expansions
+   
+    
+     location
+     
+      line495
+      col3
+      file0
+     
+     nameCONCAT_VA_ARGS
+     expansionvariadicCFunction(x, "You need to construct additional pylons.",'c', 9);x = 0;
+    
+   
+   descriptionDivision by zero
+   categoryLogic error
+   typeDivision by zero
+   check_namecore.DivideZero
+   
+   issue_hash_content_of_line_in_contexted592fb952ed786e7efdc81bbc538e94
+  issue_context_kindfunction
+  issue_contextconcatVA_ARGS
+  issue_hash_function_offset3
+  location
+  
+   line496
+   col13
+   file0
+  
+  ExecutedLines
+  
+   0
+   
+    493
+    494
+    495
+    496
+   
+  
+  
+  
+   path
+   
+    
+     kindcontrol
+     edges
+      
+       
+        start
+         
+          
+           line502
+           col3
+           file0
+          
+          
+           line502
+           col5
+           file0
+          
+         
+        end
+         
+          
+           line503
+           col3
+           file0
+          
+          
+           line503
+           col16
+           file0
+          
+         
+       
+      
+    
+    
+     kindevent
+     location
+     
+      line503
+      col3
+      file0
+     
+     ranges
+     
+       
+        
+         line503
+         col3
+         file0
+        
+        
+         line503
+         col44
+         file0
+        
+       
+     
+     depth0
+     extended_message
+     The value 0 is assigned to 'x'
+     message
+     The value 0 is assigned to 'x'
+    
+    
+     kindevent
+     location
+     
+      line504
+      col13
+      file0
+     
+     ranges
+     
+       
+        
+         line504
+         col10
+         file0
+        
+        
+         line504
+         col15
+         file0
+        
+       
+     
+     depth0
+     extended_message
+     Division by zero
+     message
+     Division by zero
+    
+   
+   macro_expansions
+   
+    
+     location
+     
+      line503
+      col3
+      file0
+     
+     nameCONCAT_VA_ARGS
+     expansionvariadicCFunction(x, "You need to construct",);x = 0;
+    
+   
+   descriptionDivision by zero
+   categoryLogic error
+   typeDivision by zero
+   check_namecore.DivideZero
+   
+   issue_hash_content_of_line_in_context4b0ab46d7a972d0a388b4bb59351480a
+  issue_context_kindfunction
+  issue_contextconcatVA_ARGSEmpty
+  issue_hash_function_offset3
+  location
+  
+   line504
+   col13
+   file0
+  
+  ExecutedLines
+  
+   0
+   
+    501
+    502
+    503
+    504
+   
+  
+  
+  
+   path
+   
+    
+     kindcontrol
+     edges
+      
+       
+        start
+         
+          
+           line514
+           col3
+           file0
+          
+          
+           line514
+           col5
+           file0
+          
+         
+        end
+         
+          
+           line515
+           col3
+           file0
+          
+          
+           line515
+           col21
+           file0
+          
+         
+       
+      
+    
+    
+     kindevent
+     location
+     
+      line515
+      col3
+      file0
+     
+     ranges
+     
+       
+        
+         line515
+         col3
+         file0
+        
+        
+         line515
+         col71
+         file0
+        
+       
+     
+     depth0
+     extended_message
+     The value 0 is assigned to 'x'
+     message
+     The value 0 is assigned to 'x'
+    
+    
+     kindevent
+     location
+     
+      line516
+      col13
+      file0
+     
+     ranges
+     
+       
+        
+         line516
+         col10
+         file0
+        
+        
+         line516
+         col15
+         file0
+        
+       
+     
+     depth0
+     extended_message
+     Division by zero
+     message
+     Division by zero
+    
+   
+   macro_expansions
+   
+    
+     location
+     
+      line515
+      col3
+      file0
+     
+     nameSTRINGIFIED_VA_ARGS
+     expansionvariadicCFunction(x, "Additional supply depots required.",  "'a'", 10);x = 0;
+    
+   
+   descriptionDivision by zero
+   categoryLogic error
+   typeDivision by zero
+   check_namecore.DivideZero
+   
+   issue_hash_content_of_line_in_context6622e3f0651f97e6cbf4e075e6b07707
+  issue_context_kindfunction
+  issue_contextstringifyVA_ARGS
+  issue_hash_function_offset3
+  location
+  
+   line516
+   col13
+   file0
+  
+  ExecutedLines
+  
+   0
+   
+    513
+    514
+    515
+    516
+   
+  
+  
+  
+   path
+   
+    
+     kindcontrol
+     edges
+      
+       
+        start
+         
+          
+           line524
+           col3
+           file0
+          
+          
+           line524
+           col5
+           file0
+          
+         
+        end
+         
+          
+           line525
+           col3
+           file0
+          
+          
+           line525
+           col21
+           file0
+          
+         
+       
+      
+    
+    
+     kindevent
+     location
+     
+      line525
+      col3
+      file0
+     
+     ranges
+     
+       
+        
+         line525
+         col3
+         file0
+        
+        
+         line525
+         col62
+         file0
+        
+       
+     
+     depth0
+     extended_message
+     The value 0 is assigned to 'x'
+     message
+     The value 0 is assigned to 'x'
+    
+    
+     kindevent
+     location
+     
+      line526
+      col13
+      file0
+     
+     ranges
+     
+       
+        
+         line526
+         col10
+         file0
+        
+        
+         line526
+         col15
+         file0
+        
+       
+     
+     depth0
+     extended_message
+     Division by zero
+     message
+     Division by zero
+    
+   
+   macro_expansions
+   
+    
+     location
+     
+      line525
+      col3
+      file0
+     
+     nameSTRINGIFIED_VA_ARGS
+     expansionvariadicCFunction(x, "Additional supply depots required.", ")";x = 0;
+    
+   
+   descriptionDivision by zero
+   categoryLogic error
+   typeDivision by zero
+   check_namecore.DivideZero
+   
+   issue_hash_content_of_line_in_context86c6e52c81f1129e6c9f51e6938d9ee7
+  issue_context_kindfunction
+  issue_contextstringifyVA_ARGSEmpty
+  issue_hash_function_offset3
+  location
+  
+   line526
+   col13
+   file0
+  
+  ExecutedLines
+  
+   0
+   
+    523
+    524
+    525
+    526
    
   
   
diff --git a/clang/test/Analysis/Inputs/system-header-simulator.h b/clang/test/Analysis/Inputs/system-header-simulator.h
index a98546c7056c9..b72f45a9b0e55 100644
--- a/clang/test/Analysis/Inputs/system-header-simulator.h
+++ b/clang/test/Analysis/Inputs/system-header-simulator.h
@@ -46,8 +46,8 @@ FILE *fopen(const char *path, const char *mode);
 FILE *tmpfile(void);
 FILE *freopen(const char *pathname, const char *mode, FILE *stream);
 int fclose(FILE *fp);
-size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream);
-size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
+size_t fread(void *restrict, size_t, size_t, FILE *restrict);
+size_t fwrite(const void *restrict, size_t, size_t, FILE *restrict);
 int fputc(int ch, FILE *stream);
 int fseek(FILE *__stream, long int __off, int __whence);
 long int ftell(FILE *__stream);
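
The restrict-qualified prototypes above match the C99/POSIX declarations of fread and fwrite: each restrict pointer is a promise that the pointed-to object is not aliased by the other arguments. A minimal caller-side sketch (the function and buffer names are hypothetical):

#include <stdio.h>

/* copy_block: `buf` and the two streams refer to distinct objects,
   which is exactly what the restrict qualifiers let the library assume. */
static int copy_block(FILE *in, FILE *out) {
  char buf[4096];
  size_t n = fread(buf, 1, sizeof(buf), in);
  return fwrite(buf, 1, n, out) == n ? 0 : -1;
}
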
diff --git a/clang/test/Analysis/analyzer-enabled-checkers.c b/clang/test/Analysis/analyzer-enabled-checkers.c
index bef786a1a59b6..7c00e78c16acd 100644
--- a/clang/test/Analysis/analyzer-enabled-checkers.c
+++ b/clang/test/Analysis/analyzer-enabled-checkers.c
@@ -6,11 +6,11 @@
 
 // CHECK:      OVERVIEW: Clang Static Analyzer Enabled Checkers List
 // CHECK-EMPTY:
+// CHECK-NEXT: core.CallAndMessageModeling
 // CHECK-NEXT: apiModeling.StdCLibraryFunctions
 // CHECK-NEXT: apiModeling.TrustNonnull
 // CHECK-NEXT: apiModeling.llvm.CastValue
 // CHECK-NEXT: apiModeling.llvm.ReturnValue
-// CHECK-NEXT: core.CallAndMessageModeling
 // CHECK-NEXT: core.CallAndMessage
 // CHECK-NEXT: core.DivideZero
 // CHECK-NEXT: core.DynamicTypePropagation
diff --git a/clang/test/Analysis/eval-predefined-exprs.cpp b/clang/test/Analysis/eval-predefined-exprs.cpp
new file mode 100644
index 0000000000000..cc48a264f2d32
--- /dev/null
+++ b/clang/test/Analysis/eval-predefined-exprs.cpp
@@ -0,0 +1,109 @@
+// RUN: %clang_analyze_cc1 -std=c++17 -analyzer-checker=core,debug.ExprInspection -verify %s
+//
+// RUN: %clang_analyze_cc1 -std=c++17 -analyzer-checker=core,debug.ExprInspection -verify \
+// RUN:   -triple i386-pc-win32 -fms-compatibility -fms-extensions -DANALYZER_MS %s
+
+template <typename T>
+void clang_analyzer_dump(const T *);
+void clang_analyzer_warnIfReached();
+
+void builtin_unique_stable_name_of_lambda() {
+  auto y = [] {};
+  clang_analyzer_dump(__builtin_unique_stable_name(y));
+  // expected-warning@-1 {{&Element{"_ZTSZ36builtin_unique_stable_name_of_lambdavEUlvE11_12",0 S64b,char}}}
+}
+
+template <typename T, int Value, typename U>
+void func(U param) {
+  clang_analyzer_dump(__func__);
+  clang_analyzer_dump(__FUNCTION__);
+  clang_analyzer_dump(__PRETTY_FUNCTION__);
+  // expected-warning@-3 {{&Element{"func",0 S64b,char}}}
+  // expected-warning@-3 {{&Element{"func",0 S64b,char}}}
+  // expected-warning@-3 {{&Element{"void func(U) [T = Class, Value = 42, U = char]",0 S64b,char}}}
+
+#ifdef ANALYZER_MS
+  clang_analyzer_dump(__FUNCDNAME__);
+  clang_analyzer_dump(L__FUNCTION__);
+  clang_analyzer_dump(__FUNCSIG__);
+  clang_analyzer_dump(L__FUNCSIG__);
+  // expected-warning@-4 {{&Element{"??$func@UClass@?1??foo@@YAXXZ@$0CK@D@@YAXD@Z",0 S64b,char}}}
+  // expected-warning@-4 {{&Element{L"func",0 S64b,wchar_t}}}
+  // expected-warning@-4 {{&Element{"void __cdecl func(U) [T = Class, Value = 42, U = char]",0 S64b,char}}}
+  // expected-warning@-4 {{&Element{L"void __cdecl func(U) [T = Class, Value = 42, U = char]",0 S64b,wchar_t}}}
+#endif
+}
+
+void foo() {
+  clang_analyzer_dump(__func__);
+  clang_analyzer_dump(__FUNCTION__);
+  clang_analyzer_dump(__PRETTY_FUNCTION__);
+  // expected-warning@-3 {{&Element{"foo",0 S64b,char}}}
+  // expected-warning@-3 {{&Element{"foo",0 S64b,char}}}
+  // expected-warning@-3 {{&Element{"void foo()",0 S64b,char}}}
+
+#ifdef ANALYZER_MS
+  clang_analyzer_dump(__FUNCDNAME__);
+  clang_analyzer_dump(L__FUNCTION__);
+  clang_analyzer_dump(__FUNCSIG__);
+  clang_analyzer_dump(L__FUNCSIG__);
+  // expected-warning@-4 {{&Element{"?foo@@YAXXZ",0 S64b,char}}}
+  // expected-warning@-4 {{&Element{L"foo",0 S64b,wchar_t}}}
+  // expected-warning@-4 {{&Element{"void __cdecl foo(void)",0 S64b,char}}}
+  // expected-warning@-4 {{&Element{L"void __cdecl foo(void)",0 S64b,wchar_t}}}
+#endif
+
+  func<struct Class, 42>('b'); // instantiate template
+}
+
+void test_builtin_unique_stable_name(int a) {
+  clang_analyzer_dump(__builtin_unique_stable_name(a));
+  // expected-warning@-1 {{&Element{"_ZTSi",0 S64b,char}}}
+}
+
+struct A {
+  A() {
+    clang_analyzer_dump(__func__);
+    clang_analyzer_dump(__FUNCTION__);
+    clang_analyzer_dump(__PRETTY_FUNCTION__);
+    // expected-warning@-3 {{&Element{"A",0 S64b,char}}}
+    // expected-warning@-3 {{&Element{"A",0 S64b,char}}}
+    // expected-warning@-3 {{&Element{"A::A()",0 S64b,char}}}
+
+#ifdef ANALYZER_MS
+    clang_analyzer_dump(__FUNCDNAME__);
+    clang_analyzer_dump(L__FUNCTION__);
+    clang_analyzer_dump(__FUNCSIG__);
+    clang_analyzer_dump(L__FUNCSIG__);
+    // expected-warning@-4 {{&Element{"??0A@@QAE@XZ",0 S64b,char}}}
+    // expected-warning@-4 {{&Element{L"A",0 S64b,wchar_t}}}
+    // expected-warning@-4 {{&Element{"__thiscall A::A(void)",0 S64b,char}}}
+    // expected-warning@-4 {{&Element{L"__thiscall A::A(void)",0 S64b,wchar_t}}}
+#endif
+  }
+  ~A() {
+    clang_analyzer_dump(__func__);
+    clang_analyzer_dump(__FUNCTION__);
+    clang_analyzer_dump(__PRETTY_FUNCTION__);
+    // expected-warning@-3 {{&Element{"~A",0 S64b,char}}}
+    // expected-warning@-3 {{&Element{"~A",0 S64b,char}}}
+    // expected-warning@-3 {{&Element{"A::~A()",0 S64b,char}}}
+
+#ifdef ANALYZER_MS
+    clang_analyzer_dump(__FUNCDNAME__);
+    clang_analyzer_dump(L__FUNCTION__);
+    clang_analyzer_dump(__FUNCSIG__);
+    clang_analyzer_dump(L__FUNCSIG__);
+    // expected-warning@-4 {{&Element{"??1A@@QAE@XZ",0 S64b,char}}}
+    // expected-warning@-4 {{&Element{L"~A",0 S64b,wchar_t}}}
+    // expected-warning@-4 {{&Element{"__thiscall A::~A(void)",0 S64b,char}}}
+    // expected-warning@-4 {{&Element{L"__thiscall A::~A(void)",0 S64b,wchar_t}}}
+#endif
+  }
+
+  template <typename T> int dependent() {
+    // We should not analyze dependent functions.
+    // Such functions have no function name for predefined expressions such as '__func__'.
+    clang_analyzer_warnIfReached(); // no-warning
+  }
+};
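
Outside the analyzer, the same predefined expressions are ordinary string constants; a plain C sketch of what the test above expects Clang to constant-fold (the output comments assume a Clang or GCC build, and `greet` is a hypothetical name):

#include <stdio.h>

void greet(void) {
  printf("%s\n", __func__);            /* "greet" (C99) */
  printf("%s\n", __FUNCTION__);        /* "greet" (GNU/MS extension) */
  printf("%s\n", __PRETTY_FUNCTION__); /* "void greet(void)" (GNU extension) */
}

int main(void) { greet(); return 0; }
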
diff --git a/clang/test/Analysis/live-stmts.cpp b/clang/test/Analysis/live-stmts.cpp
index 1b8a750c5e5ca..16954f30129f7 100644
--- a/clang/test/Analysis/live-stmts.cpp
+++ b/clang/test/Analysis/live-stmts.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -w -analyzer-checker=debug.DumpLiveStmts %s 2>&1\
+// RUN: %clang_analyze_cc1 -w -analyzer-checker=debug.DumpLiveExprs %s 2>&1\
 // RUN:   | FileCheck %s
 
 int coin();
@@ -7,13 +7,24 @@ int coin();
 int testThatDumperWorks(int x, int y, int z) {
   return x ? y : z;
 }
-// CHECK: [ B0 (live statements at block exit) ]
+
+// [B5 (ENTRY)]
+//    |
+//    V
+// [B4 (x)] ? [B2 (y)] : [B3 (z)]
+//                \        /
+//                 ---|----
+//                    V
+//                   [B1] --> [B0 (EXIT)]
+//                  return
+
+// CHECK: [ B0 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B1 (live statements at block exit) ]
+// CHECK: [ B1 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B2 (live statements at block exit) ]
+// CHECK: [ B2 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-NEXT: DeclRefExpr {{.*}} 'y' 'int'
 // CHECK-EMPTY:
@@ -24,7 +35,7 @@ int testThatDumperWorks(int x, int y, int z) {
 // CHECK-NEXT:   `-DeclRefExpr {{.*}} 'x' 'int'
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B3 (live statements at block exit) ]
+// CHECK: [ B3 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-NEXT: DeclRefExpr {{.*}} 'y' 'int'
 // CHECK-EMPTY:
@@ -33,7 +44,7 @@ int testThatDumperWorks(int x, int y, int z) {
 // CHECK-NEXT: ImplicitCastExpr {{.*}} <IntegralToBoolean>
 // CHECK-NEXT: `-ImplicitCastExpr {{.*}} <LValueToRValue>
 // CHECK-NEXT:   `-DeclRefExpr {{.*}} 'x' 'int'
-// CHECK: [ B4 (live statements at block exit) ]
+// CHECK: [ B4 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-NEXT: DeclRefExpr {{.*}} 'y' 'int'
 // CHECK-EMPTY:
@@ -44,7 +55,7 @@ int testThatDumperWorks(int x, int y, int z) {
 // CHECK-NEXT:   `-DeclRefExpr {{.*}} 'x' 'int'
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B5 (live statements at block exit) ]
+// CHECK: [ B5 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-NEXT: DeclRefExpr {{.*}} 'y' 'int'
 // CHECK-EMPTY:
@@ -61,22 +72,22 @@ void testIfBranchExpression(bool flag) {
       e;
   }
 }
-// CHECK: [ B0 (live statements at block exit) ]
+// CHECK: [ B0 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B1 (live statements at block exit) ]
+// CHECK: [ B1 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B2 (live statements at block exit) ]
+// CHECK: [ B2 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B3 (live statements at block exit) ]
+// CHECK: [ B3 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B4 (live statements at block exit) ]
+// CHECK: [ B4 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B5 (live statements at block exit) ]
+// CHECK: [ B5 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
 
@@ -89,22 +100,22 @@ void testWhileBodyExpression(bool flag) {
       e;
   }
 }
-// CHECK: [ B0 (live statements at block exit) ]
+// CHECK: [ B0 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B1 (live statements at block exit) ]
+// CHECK: [ B1 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B2 (live statements at block exit) ]
+// CHECK: [ B2 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B3 (live statements at block exit) ]
+// CHECK: [ B3 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B4 (live statements at block exit) ]
+// CHECK: [ B4 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B5 (live statements at block exit) ]
+// CHECK: [ B5 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
 
@@ -118,22 +129,22 @@ void testDoWhileBodyExpression(bool flag) {
     while (coin());
   }
 }
-// CHECK: [ B0 (live statements at block exit) ]
+// CHECK: [ B0 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B1 (live statements at block exit) ]
+// CHECK: [ B1 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B2 (live statements at block exit) ]
+// CHECK: [ B2 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B3 (live statements at block exit) ]
+// CHECK: [ B3 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B4 (live statements at block exit) ]
+// CHECK: [ B4 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B5 (live statements at block exit) ]
+// CHECK: [ B5 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
 
@@ -146,22 +157,39 @@ void testForBodyExpression(bool flag) {
       e;
   }
 }
-// CHECK: [ B0 (live statements at block exit) ]
+// CHECK: [ B0 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B1 (live statements at block exit) ]
+// CHECK: [ B1 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B2 (live statements at block exit) ]
+// CHECK: [ B2 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B3 (live statements at block exit) ]
+// CHECK: [ B3 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B4 (live statements at block exit) ]
+// CHECK: [ B4 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK: [ B5 (live statements at block exit) ]
+// CHECK: [ B5 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
 
+void clang_analyzer_eval(bool);
+
+void test_lambda_refcapture() {
+  int a = 6;
+  [&](int &a) { a = 42; }(a);
+  clang_analyzer_eval(a == 42); // expected-warning{{TRUE}}
+}
+
+// CHECK: [ B0 (live expressions at block exit) ]
+// CHECK-EMPTY:
+// CHECK-EMPTY:
+// CHECK-NEXT: [ B1 (live expressions at block exit) ]
+// CHECK-EMPTY:
+// CHECK-EMPTY:
+// CHECK-NEXT: [ B2 (live expressions at block exit) ]
+// CHECK-EMPTY:
+// CHECK-EMPTY:
diff --git a/clang/test/Analysis/live-stmts.mm b/clang/test/Analysis/live-stmts.mm
index a6ddd03ca5d85..8acdd77149ebe 100644
--- a/clang/test/Analysis/live-stmts.mm
+++ b/clang/test/Analysis/live-stmts.mm
@@ -1,5 +1,5 @@
 // RUN: %clang_analyze_cc1 -w -fblocks %s \
-// RUN:   -analyzer-checker=debug.DumpLiveStmts \
+// RUN:   -analyzer-checker=debug.DumpLiveExprs \
 // RUN:   2>&1 | FileCheck %s
 
 @interface Item
@@ -18,25 +18,25 @@ @interface Collection
 public:
   RAII(Blk blk): blk(blk) {}
 
-// CHECK: [ B0 (live statements at block exit) ]
+// CHECK: [ B0 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK-NEXT: [ B1 (live statements at block exit) ]
+// CHECK-NEXT: [ B1 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK-NEXT: [ B2 (live statements at block exit) ]
+// CHECK-NEXT: [ B2 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
 
   ~RAII() { blk(); }
 
-// CHECK-NEXT: [ B0 (live statements at block exit) ]
+// CHECK-NEXT: [ B0 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK-NEXT: [ B1 (live statements at block exit) ]
+// CHECK-NEXT: [ B1 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK-NEXT: [ B2 (live statements at block exit) ]
+// CHECK-NEXT: [ B2 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
 };
@@ -45,57 +45,37 @@ void foo(Collection *coll) {
   RAII raii(^{});
   for (Item *item in coll) {}
 }
-// CHECK-NEXT: [ B0 (live statements at block exit) ]
+// CHECK-NEXT: [ B0 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK-NEXT: [ B1 (live statements at block exit) ]
+// CHECK-NEXT: [ B1 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK-NEXT: [ B2 (live statements at block exit) ]
-// CHECK-EMPTY:
-// CHECK-NEXT: DeclStmt {{.*}}
-// CHECK-NEXT: `-VarDecl {{.*}}  item 'Item *'
+// CHECK-NEXT: [ B2 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'Collection *' <LValueToRValue>
 // CHECK-NEXT: `-DeclRefExpr {{.*}} 'Collection *' lvalue ParmVar {{.*}} 'coll' 'Collection *'
 // CHECK-EMPTY:
-// CHECK-NEXT: CompoundStmt {{.*}}
-// CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK-NEXT: [ B3 (live statements at block exit) ]
-// CHECK-EMPTY:
-// CHECK-NEXT: DeclStmt {{.*}}
-// CHECK-NEXT: `-VarDecl {{.*}}  item 'Item *'
+// CHECK-NEXT: [ B3 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'Collection *' <LValueToRValue>
 // CHECK-NEXT: `-DeclRefExpr {{.*}} 'Collection *' lvalue ParmVar {{.*}} 'coll' 'Collection *'
 // CHECK-EMPTY:
-// CHECK-NEXT: CompoundStmt {{.*}}
-// CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK-NEXT: [ B4 (live statements at block exit) ]
-// CHECK-EMPTY:
-// CHECK-NEXT: DeclStmt {{.*}}
-// CHECK-NEXT: `-VarDecl {{.*}}  item 'Item *'
+// CHECK-NEXT: [ B4 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'Collection *' <LValueToRValue>
 // CHECK-NEXT: `-DeclRefExpr {{.*}} 'Collection *' lvalue ParmVar {{.*}} 'coll' 'Collection *'
 // CHECK-EMPTY:
-// CHECK-NEXT: CompoundStmt {{.*}}
-// CHECK-EMPTY:
-// CHECK-EMPTY:
-// CHECK-NEXT: [ B5 (live statements at block exit) ]
-// CHECK-EMPTY:
-// CHECK-NEXT: DeclStmt {{.*}}
-// CHECK-NEXT: `-VarDecl {{.*}}  item 'Item *'
 // CHECK-EMPTY:
-// CHECK-NEXT: CompoundStmt {{.*}}
+// CHECK-NEXT: [ B5 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK-NEXT: [ B0 (live statements at block exit) ]
+// CHECK-NEXT: [ B0 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
-// CHECK-NEXT: [ B1 (live statements at block exit) ]
+// CHECK-NEXT: [ B1 (live expressions at block exit) ]
 // CHECK-EMPTY:
 // CHECK-EMPTY:
 
diff --git a/clang/test/Analysis/objc-live-crash.mm b/clang/test/Analysis/objc-live-crash.mm
new file mode 100644
index 0000000000000..b3b4f19bfc0dd
--- /dev/null
+++ b/clang/test/Analysis/objc-live-crash.mm
@@ -0,0 +1,30 @@
+// RUN: %clang --analyze %s -fblocks
+
+// https://reviews.llvm.org/D82598#2171312
+
+@interface Item
+// ...
+@end
+
+@interface Collection
+// ...
+@end
+
+typedef void (^Blk)();
+
+struct RAII {
+  Blk blk;
+
+public:
+  RAII(Blk blk): blk(blk) {}
+  ~RAII() { blk(); }
+};
+
+void foo(Collection *coll) {
+  RAII raii(^{});
+  for (Item *item in coll) {}
+  int i;
+  {
+    int j;
+  }
+}
diff --git a/clang/test/Analysis/plist-macros-with-expansion.cpp b/clang/test/Analysis/plist-macros-with-expansion.cpp
index e07747eaec74d..f79070095385d 100644
--- a/clang/test/Analysis/plist-macros-with-expansion.cpp
+++ b/clang/test/Analysis/plist-macros-with-expansion.cpp
@@ -1,5 +1,3 @@
-// RUN: %clang_analyze_cc1 -std=c++14 -analyzer-checker=core -verify %s
-//
 // RUN: %clang_analyze_cc1 -std=c++14 -analyzer-checker=core %s  \
 // RUN:   -analyzer-output=plist -o %t.plist \
 // RUN:   -analyzer-config expand-macros=true
@@ -452,6 +450,9 @@ void recursiveMacroUser() {
                // expected-warning@-1{{expression result unused}}
 }
 
+// CHECK: <key>name</key><string>value</string>
+// CHECK-NEXT: <key>expansion</key><string>garbage_</string>
+
 #define FOO(x) int foo() { return x; }
 #define APPLY_ZERO1(function) function(0)
 
@@ -469,3 +470,62 @@ void useZeroApplier2() { (void)(1 / bar()); } // expected-warning{{Division by z
 
 // CHECK: nameAPPLY_ZERO2
 // CHECK-NEXT: expansionint bar() { return 0; }
+
+void foo(int &x, const char *str);
+
+#define PARAMS_RESOLVE_TO_VA_ARGS(i, fmt) foo(i, fmt); \
+  i = 0;
+#define DISPATCH(...) PARAMS_RESOLVE_TO_VA_ARGS(__VA_ARGS__);
+
+void mulitpleParamsResolveToVA_ARGS(void) {
+  int x = 1;
+  DISPATCH(x, "LF1M healer");
+  (void)(10 / x); // expected-warning{{Division by zero}}
+}
+// CHECK: <key>name</key><string>DISPATCH</string>
+// CHECK-NEXT: <key>expansion</key><string>foo(x, &quot;LF1M healer&quot;);x = 0;;</string>
+
+void variadicCFunction(int &x, const char *str, ...);
+
+#define CONCAT_VA_ARGS(i, fmt, ...) variadicCFunction(i, fmt, ##__VA_ARGS__); \
+  i = 0;
+
+void concatVA_ARGS(void) {
+  int x = 1;
+  CONCAT_VA_ARGS(x, "You need to construct additional pylons.", 'c', 9);
+  (void)(10 / x); // expected-warning{{Division by zero}}
+}
+// CHECK: <key>name</key><string>CONCAT_VA_ARGS</string>
+// CHECK-NEXT: <key>expansion</key><string>variadicCFunction(x, &quot;You need to construct additional pylons.&quot;,&apos;c&apos;, 9);x = 0;</string>
+
+void concatVA_ARGSEmpty(void) {
+  int x = 1;
+  CONCAT_VA_ARGS(x, "You need to construct");
+  (void)(10 / x); // expected-warning{{Division by zero}}
+}
+// FIXME: The comma shouldn't be present after the last argument.
+// CHECK: <key>name</key><string>CONCAT_VA_ARGS</string>
+// CHECK-NEXT: <key>expansion</key><string>variadicCFunction(x, &quot;You need to construct&quot;,);x = 0;</string>
+
+#define STRINGIFIED_VA_ARGS(i, fmt, ...) variadicCFunction(i, fmt, #__VA_ARGS__); \
+  i = 0;
+
+void stringifyVA_ARGS(void) {
+  int x = 1;
+  STRINGIFIED_VA_ARGS(x, "Additional supply depots required.", 'a', 10);
+  (void)(10 / x); // expected-warning{{Division by zero}}
+}
+
+// FIXME: Stringify and escape __VA_ARGS__ correctly.
+// CHECK: <key>name</key><string>STRINGIFIED_VA_ARGS</string>
+// CHECK-NEXT: <key>expansion</key><string>variadicCFunction(x, &quot;Additional supply depots required.&quot;,  &quot;&apos;a&apos;&quot;, 10);x = 0;</string>
+
+void stringifyVA_ARGSEmpty(void) {
+  int x = 1;
+  STRINGIFIED_VA_ARGS(x, "Additional supply depots required.");
+  (void)(10 / x); // expected-warning{{Division by zero}}
+}
+
+// FIXME: Stringify and escape __VA_ARGS__ correctly.
+// CHECK: <key>name</key><string>STRINGIFIED_VA_ARGS</string>
+// CHECK-NEXT: <key>expansion</key><string>variadicCFunction(x, &quot;Additional supply depots required.&quot;, &quot;)&quot;;x = 0;</string>
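
The two FIXMEs above track preprocessor corner cases the expansion printer does not yet reproduce faithfully: ##__VA_ARGS__ swallows the preceding comma when the variadic part is empty, and #__VA_ARGS__ stringifies the whole list. A standalone sketch of the intended behavior (`sink`, `SINK`, and `SHOW` are hypothetical names):

#include <stdio.h>

void sink(int i, const char *fmt, ...) { (void)i; (void)fmt; }

/* ##__VA_ARGS__ drops the trailing comma when no variadic arguments are
   passed (a GNU extension that Clang honors). */
#define SINK(i, fmt, ...) sink(i, fmt, ##__VA_ARGS__)

/* #__VA_ARGS__ stringifies the entire variadic list, commas included. */
#define SHOW(...) puts(#__VA_ARGS__)

int main(void) {
  SINK(1, "msg");         /* expands to sink(1, "msg") -- no dangling comma */
  SINK(1, "msg", 'c', 9); /* expands to sink(1, "msg", 'c', 9) */
  SHOW('a', 10);          /* prints: 'a', 10 */
  return 0;
}
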
diff --git a/clang/test/Analysis/std-c-library-functions-POSIX.c b/clang/test/Analysis/std-c-library-functions-POSIX.c
index c2c98df864899..9285aee6178bc 100644
--- a/clang/test/Analysis/std-c-library-functions-POSIX.c
+++ b/clang/test/Analysis/std-c-library-functions-POSIX.c
@@ -63,8 +63,6 @@
 // CHECK: Loaded summary for: void rewinddir(DIR *dir)
 // CHECK: Loaded summary for: void seekdir(DIR *dirp, long loc)
 // CHECK: Loaded summary for: int rand_r(unsigned int *seedp)
-// CHECK: Loaded summary for: int strcasecmp(const char *s1, const char *s2)
-// CHECK: Loaded summary for: int strncasecmp(const char *s1, const char *s2, size_t n)
 // CHECK: Loaded summary for: int fileno(FILE *stream)
 // CHECK: Loaded summary for: int fseeko(FILE *stream, off_t offset, int whence)
 // CHECK: Loaded summary for: off_t ftello(FILE *stream)
@@ -195,8 +193,6 @@ FILE *fdopen(int fd, const char *mode);
 void rewinddir(DIR *dir);
 void seekdir(DIR *dirp, long loc);
 int rand_r(unsigned int *seedp);
-int strcasecmp(const char *s1, const char *s2);
-int strncasecmp(const char *s1, const char *s2, size_t n);
 int fileno(FILE *stream);
 int fseeko(FILE *stream, off_t offset, int whence);
 off_t ftello(FILE *stream);
diff --git a/clang/test/Analysis/std-c-library-functions-arg-constraints.c b/clang/test/Analysis/std-c-library-functions-arg-constraints.c
index 28979abd43b58..afc2ce28efc62 100644
--- a/clang/test/Analysis/std-c-library-functions-arg-constraints.c
+++ b/clang/test/Analysis/std-c-library-functions-arg-constraints.c
@@ -194,6 +194,22 @@ void test_notnull_symbolic2(FILE *fp, int *buf) {
     // bugpath-warning{{Function argument constraint is not satisfied}} \
     // bugpath-note{{Function argument constraint is not satisfied}}
 }
+typedef __WCHAR_TYPE__ wchar_t;
+// This is one test case for the ARR38-C SEI-CERT rule.
+void ARR38_C_F(FILE *file) {
+  enum { BUFFER_SIZE = 1024 };
+  wchar_t wbuf[BUFFER_SIZE]; // bugpath-note{{'wbuf' initialized here}}
+
+  const size_t size = sizeof(*wbuf);
+  const size_t nitems = sizeof(wbuf);
+
+  // The 3rd parameter should be the number of elements to read, not
+  // the size in bytes.
+  fread(wbuf, size, nitems, file); // \
+  // report-warning{{Function argument constraint is not satisfied}} \
+  // bugpath-warning{{Function argument constraint is not satisfied}} \
+  // bugpath-note{{Function argument constraint is not satisfied}}
+}
 
 int __two_constrained_args(int, int);
 void test_constraints_on_multiple_args(int x, int y) {
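
For contrast with the violating call in ARR38_C_F above, the conforming fread pattern passes the element size as the second argument and the element count as the third, so size * nmemb cannot exceed the buffer. A sketch (`read_wchars` is a hypothetical name):

#include <stdio.h>
#include <wchar.h>

enum { BUFFER_SIZE = 1024 };

size_t read_wchars(FILE *file) {
  wchar_t wbuf[BUFFER_SIZE];
  /* size = bytes per element, nmemb = number of elements */
  return fread(wbuf, sizeof(wbuf[0]), BUFFER_SIZE, file);
}
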
diff --git a/clang/test/Analysis/std-c-library-functions-arg-cstring-dependency.c b/clang/test/Analysis/std-c-library-functions-arg-cstring-dependency.c
new file mode 100644
index 0000000000000..37425e4e3e169
--- /dev/null
+++ b/clang/test/Analysis/std-c-library-functions-arg-cstring-dependency.c
@@ -0,0 +1,21 @@
+// This test case crashes if strncasecmp is modeled in StdCLibraryFunctions.
+// Either we fix CStringChecker to handle the call prerequisites in
+// checkPreCall, or we must not evaluate any pure functions in
+// StdCLibraryFunctions that are also handled in CStringChecker.
+
+// RUN: %clang_analyze_cc1 %s \
+// RUN:   -analyzer-checker=core \
+// RUN:   -analyzer-checker=apiModeling.StdCLibraryFunctions \
+// RUN:   -analyzer-checker=unix.cstring.NullArg \
+// RUN:   -analyzer-config apiModeling.StdCLibraryFunctions:ModelPOSIX=true \
+// RUN:   -analyzer-checker=alpha.unix.StdCLibraryFunctionArgs \
+// RUN:   -triple x86_64-unknown-linux-gnu \
+// RUN:   -verify
+
+typedef __typeof(sizeof(int)) size_t;
+int strncasecmp(const char *s1, const char *s2, size_t n);
+
+int strncasecmp_null_argument(char *a, size_t n) {
+  char *b = 0;
+  return strncasecmp(a, b, n); // expected-warning{{Null pointer passed as 2nd argument to string comparison function}}
+}
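
The diagnostic above comes from the CStringChecker side: strncasecmp has undefined behavior on null operands, so a caller must guard both pointers first. A sketch of the checked variant (`safe_strncasecmp` is a hypothetical helper):

#include <strings.h>
#include <stddef.h>

int safe_strncasecmp(const char *a, const char *b, size_t n) {
  if (a == NULL || b == NULL)      /* refuse the case the checker flags */
    return (a == b) ? 0 : (a ? 1 : -1);
  return strncasecmp(a, b, n);
}
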
diff --git a/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c b/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c
new file mode 100644
index 0000000000000..61106f1f8d6bc
--- /dev/null
+++ b/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c
@@ -0,0 +1,58 @@
+// Check the case when only the StreamChecker is enabled.
+// RUN: %clang_analyze_cc1 %s \
+// RUN:   -analyzer-checker=core,alpha.unix.Stream \
+// RUN:   -analyzer-checker=debug.ExprInspection \
+// RUN:   -analyzer-config eagerly-assume=false \
+// RUN:   -triple x86_64-unknown-linux \
+// RUN:   -verify=stream
+
+// Check the case when only the StdLibraryFunctionsChecker is enabled.
+// RUN: %clang_analyze_cc1 %s \
+// RUN:   -analyzer-checker=apiModeling.StdCLibraryFunctions \
+// RUN:   -analyzer-config apiModeling.StdCLibraryFunctions:DisplayLoadedSummaries=true \
+// RUN:   -analyzer-checker=debug.ExprInspection \
+// RUN:   -analyzer-config eagerly-assume=false \
+// RUN:   -triple x86_64-unknown-linux \
+// RUN:   -verify=stdLib 2>&1 | FileCheck %s
+
+// Check the case when both the StreamChecker and the
+// StdLibraryFunctionsChecker are enabled.
+// RUN: %clang_analyze_cc1 %s \
+// RUN:   -analyzer-checker=core,alpha.unix.Stream \
+// RUN:   -analyzer-checker=apiModeling.StdCLibraryFunctions \
+// RUN:   -analyzer-config apiModeling.StdCLibraryFunctions:DisplayLoadedSummaries=true \
+// RUN:   -analyzer-checker=debug.ExprInspection \
+// RUN:   -analyzer-config eagerly-assume=false \
+// RUN:   -triple x86_64-unknown-linux \
+// RUN:   -verify=both 2>&1 | FileCheck %s
+
+// Verify that the summaries are loaded when the StdLibraryFunctionsChecker is
+// enabled.
+//      CHECK: Loaded summary for: int getchar()
+// CHECK-NEXT: Loaded summary for: unsigned long fread(void *restrict, size_t, size_t, FILE *restrict)
+// CHECK-NEXT: Loaded summary for: unsigned long fwrite(const void *restrict, size_t, size_t, FILE *restrict)
+
+#include "Inputs/system-header-simulator.h"
+
+void clang_analyzer_eval(int);
+
+void test_fread_fwrite(FILE *fp, int *buf) {
+  fp = fopen("foo", "r");
+  if (!fp)
+    return;
+  size_t x = fwrite(buf, sizeof(int), 10, fp);
+
+  clang_analyzer_eval(x <= 10); // \
+ // stream-warning{{TRUE}} \
+ // stdLib-warning{{TRUE}} \
+ // both-warning{{TRUE}} \
+
+  clang_analyzer_eval(x == 10); // \
+  // stream-warning{{TRUE}} \
+  // stream-warning{{FALSE}} \
+  // stdLib-warning{{UNKNOWN}} \
+  // both-warning{{TRUE}} \
+  // both-warning{{FALSE}}
+
+  fclose(fp);
+}
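
The TRUE/UNKNOWN split in the test follows from the two models: apiModeling.StdCLibraryFunctions only constrains fwrite's return value to the range [0, nmemb], so x <= 10 is known but x == 10 is not, while alpha.unix.Stream additionally bifurcates into a success path (return == nmemb) and a short-write path, which is why x == 10 yields both TRUE and FALSE. Caller-side logic mirroring that contract (`write_all` is a hypothetical name):

#include <stdio.h>

int write_all(FILE *fp, const int *buf, size_t nitems) {
  size_t written = fwrite(buf, sizeof(*buf), nitems, fp);
  if (written == nitems)
    return 0;  /* success path: modeled as written == nitems */
  return -1;   /* short write: written < nitems */
}
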
diff --git a/clang/test/Analysis/z3/pretty-dump.c b/clang/test/Analysis/z3/pretty-dump.c
new file mode 100644
index 0000000000000..811da172e7490
--- /dev/null
+++ b/clang/test/Analysis/z3/pretty-dump.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -analyze -analyzer-constraints=z3 -setup-static-analyzer \
+// RUN:   -analyzer-checker=core,debug.ExprInspection %s 2>&1 | FileCheck %s
+//
+// REQUIRES: z3
+//
+// Works only with the z3 constraint manager.
+
+void clang_analyzer_printState();
+
+void foo(int x) {
+  if (x == 3) {
+    clang_analyzer_printState();
+    (void)x;
+    // CHECK: "constraints": [
+    // CHECK-NEXT: { "symbol": "(reg_$[[#]]) == 3", "range": "(= reg_$[[#]] #x00000003)" }
+  }
+}
diff --git a/clang/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p9.cpp b/clang/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p9.cpp
index 8d51dbde71776..3720b277af7a9 100644
--- a/clang/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p9.cpp
+++ b/clang/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p9.cpp
@@ -24,11 +24,10 @@ constexpr double &ni3; // expected-error {{declaration of reference variable 'ni
 
 constexpr int nc1 = i; // expected-error {{constexpr variable 'nc1' must be initialized by a constant expression}} expected-note {{read of non-const variable 'i' is not allowed in a constant expression}}
 constexpr C nc2 = C(); // expected-error {{cannot have non-literal type 'const C'}}
-int &f(); // expected-note {{declared here}}
+int &f(); // expected-note 2{{declared here}}
 constexpr int &nc3 = f(); // expected-error {{constexpr variable 'nc3' must be initialized by a constant expression}} expected-note {{non-constexpr function 'f' cannot be used in a constant expression}}
 constexpr int nc4(i); // expected-error {{constexpr variable 'nc4' must be initialized by a constant expression}} expected-note {{read of non-const variable 'i' is not allowed in a constant expression}}
 constexpr C nc5((C())); // expected-error {{cannot have non-literal type 'const C'}}
-int &f(); // expected-note {{here}}
 constexpr int &nc6(f()); // expected-error {{constexpr variable 'nc6' must be initialized by a constant expression}} expected-note {{non-constexpr function 'f'}}
 
 struct pixel {
diff --git a/clang/test/CodeGen/Inputs/start-lib1.ll b/clang/test/CodeGen/Inputs/start-lib1.ll
new file mode 100644
index 0000000000000..18b6ea25386f5
--- /dev/null
+++ b/clang/test/CodeGen/Inputs/start-lib1.ll
@@ -0,0 +1,9 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @bar()
+
+define void @foo() {
+  call void @bar()
+  ret void
+}
diff --git a/clang/test/CodeGen/Inputs/start-lib2.ll b/clang/test/CodeGen/Inputs/start-lib2.ll
new file mode 100644
index 0000000000000..68b3c8362808e
--- /dev/null
+++ b/clang/test/CodeGen/Inputs/start-lib2.ll
@@ -0,0 +1,6 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @bar() {
+  ret void
+}
diff --git a/clang/test/CodeGen/3dnow-builtins.c b/clang/test/CodeGen/X86/3dnow-builtins.c
similarity index 100%
rename from clang/test/CodeGen/3dnow-builtins.c
rename to clang/test/CodeGen/X86/3dnow-builtins.c
diff --git a/clang/test/CodeGen/adc-builtins.c b/clang/test/CodeGen/X86/adc-builtins.c
similarity index 100%
rename from clang/test/CodeGen/adc-builtins.c
rename to clang/test/CodeGen/X86/adc-builtins.c
diff --git a/clang/test/CodeGen/adx-builtins.c b/clang/test/CodeGen/X86/adx-builtins.c
similarity index 100%
rename from clang/test/CodeGen/adx-builtins.c
rename to clang/test/CodeGen/X86/adx-builtins.c
diff --git a/clang/test/CodeGen/AMX/amx.c b/clang/test/CodeGen/X86/amx.c
similarity index 100%
rename from clang/test/CodeGen/AMX/amx.c
rename to clang/test/CodeGen/X86/amx.c
diff --git a/clang/test/CodeGen/AMX/amx_errors.c b/clang/test/CodeGen/X86/amx_errors.c
similarity index 100%
rename from clang/test/CodeGen/AMX/amx_errors.c
rename to clang/test/CodeGen/X86/amx_errors.c
diff --git a/clang/test/CodeGen/AMX/amx_inline_asm.c b/clang/test/CodeGen/X86/amx_inline_asm.c
similarity index 100%
rename from clang/test/CodeGen/AMX/amx_inline_asm.c
rename to clang/test/CodeGen/X86/amx_inline_asm.c
diff --git a/clang/test/CodeGen/avx-builtins-constrained-cmp.c b/clang/test/CodeGen/X86/avx-builtins-constrained-cmp.c
similarity index 100%
rename from clang/test/CodeGen/avx-builtins-constrained-cmp.c
rename to clang/test/CodeGen/X86/avx-builtins-constrained-cmp.c
diff --git a/clang/test/CodeGen/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
similarity index 100%
rename from clang/test/CodeGen/avx-builtins.c
rename to clang/test/CodeGen/X86/avx-builtins.c
diff --git a/clang/test/CodeGen/avx-cmp-builtins.c b/clang/test/CodeGen/X86/avx-cmp-builtins.c
similarity index 100%
rename from clang/test/CodeGen/avx-cmp-builtins.c
rename to clang/test/CodeGen/X86/avx-cmp-builtins.c
diff --git a/clang/test/CodeGen/avx-shuffle-builtins.c b/clang/test/CodeGen/X86/avx-shuffle-builtins.c
similarity index 100%
rename from clang/test/CodeGen/avx-shuffle-builtins.c
rename to clang/test/CodeGen/X86/avx-shuffle-builtins.c
diff --git a/clang/test/CodeGen/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
similarity index 96%
rename from clang/test/CodeGen/avx2-builtins.c
rename to clang/test/CodeGen/X86/avx2-builtins.c
index f3de6d1b87474..46717a78b49ed 100644
--- a/clang/test/CodeGen/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -727,85 +727,73 @@ void test_mm256_maskstore_epi64(long long *a, __m256i m, __m256i b) {
 
 __m256i test_mm256_max_epi8(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_max_epi8
-  // CHECK:       [[CMP:%.*]] = icmp sgt <32 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  // CHECK: call <32 x i8> @llvm.smax.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_max_epi8(a, b);
 }
 
 __m256i test_mm256_max_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_max_epi16
-  // CHECK:       [[CMP:%.*]] = icmp sgt <16 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  // CHECK: call <16 x i16> @llvm.smax.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_max_epi16(a, b);
 }
 
 __m256i test_mm256_max_epi32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_max_epi32
-  // CHECK:       [[CMP:%.*]] = icmp sgt <8 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  // CHECK: call <8 x i32> @llvm.smax.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_max_epi32(a, b);
 }
 
 __m256i test_mm256_max_epu8(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_max_epu8
-  // CHECK:       [[CMP:%.*]] = icmp ugt <32 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  // CHECK: call <32 x i8> @llvm.umax.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_max_epu8(a, b);
 }
 
 __m256i test_mm256_max_epu16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_max_epu16
-  // CHECK:       [[CMP:%.*]] = icmp ugt <16 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  // CHECK: call <16 x i16> @llvm.umax.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_max_epu16(a, b);
 }
 
 __m256i test_mm256_max_epu32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_max_epu32
-  // CHECK:       [[CMP:%.*]] = icmp ugt <8 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  // CHECK: call <8 x i32> @llvm.umax.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_max_epu32(a, b);
 }
 
 __m256i test_mm256_min_epi8(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_min_epi8
-  // CHECK:       [[CMP:%.*]] = icmp slt <32 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  // CHECK: call <32 x i8> @llvm.smin.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_min_epi8(a, b);
 }
 
 __m256i test_mm256_min_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_min_epi16
-  // CHECK:       [[CMP:%.*]] = icmp slt <16 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  // CHECK: call <16 x i16> @llvm.smin.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_min_epi16(a, b);
 }
 
 __m256i test_mm256_min_epi32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_min_epi32
-  // CHECK:       [[CMP:%.*]] = icmp slt <8 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  // CHECK: call <8 x i32> @llvm.smin.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_min_epi32(a, b);
 }
 
 __m256i test_mm256_min_epu8(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_min_epu8
-  // CHECK:       [[CMP:%.*]] = icmp ult <32 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  // CHECK: call <32 x i8> @llvm.umin.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_min_epu8(a, b);
 }
 
 __m256i test_mm256_min_epu16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_min_epu16
-  // CHECK:       [[CMP:%.*]] = icmp ult <16 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  // CHECK: call <16 x i16> @llvm.umin.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_min_epu16(a, b);
 }
 
 __m256i test_mm256_min_epu32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_min_epu32
-  // CHECK:       [[CMP:%.*]] = icmp ult <8 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  // CHECK: call <8 x i32> @llvm.umin.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_min_epu32(a, b);
 }
 
diff --git a/clang/test/CodeGen/avx512-inline-asm-kregisters-basics.c b/clang/test/CodeGen/X86/avx512-inline-asm-kregisters-basics.c
similarity index 100%
rename from clang/test/CodeGen/avx512-inline-asm-kregisters-basics.c
rename to clang/test/CodeGen/X86/avx512-inline-asm-kregisters-basics.c
diff --git a/clang/test/CodeGen/avx512-kconstraints-att_inline_asm.c b/clang/test/CodeGen/X86/avx512-kconstraints-att_inline_asm.c
similarity index 100%
rename from clang/test/CodeGen/avx512-kconstraints-att_inline_asm.c
rename to clang/test/CodeGen/X86/avx512-kconstraints-att_inline_asm.c
diff --git a/clang/test/CodeGen/avx512-reduceIntrin.c b/clang/test/CodeGen/X86/avx512-reduceIntrin.c
similarity index 100%
rename from clang/test/CodeGen/avx512-reduceIntrin.c
rename to clang/test/CodeGen/X86/avx512-reduceIntrin.c
diff --git a/clang/test/CodeGen/X86/avx512-reduceMinMaxIntrin.c b/clang/test/CodeGen/X86/avx512-reduceMinMaxIntrin.c
new file mode 100644
index 0000000000000..923672bb80953
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx512-reduceMinMaxIntrin.c
@@ -0,0 +1,372 @@
+// RUN: %clang_cc1 -fexperimental-new-pass-manager -ffreestanding %s -O0 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+#include <immintrin.h>
+
+long long test_mm512_reduce_max_epi64(__m512i __W){
+// CHECK-LABEL: @test_mm512_reduce_max_epi64(
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.smax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.smax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.smax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    extractelement <8 x i64> %{{.*}}, i32 0
+  return _mm512_reduce_max_epi64(__W);
+}
+
+unsigned long long test_mm512_reduce_max_epu64(__m512i __W){
+// CHECK-LABEL: @test_mm512_reduce_max_epu64(
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.umax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.umax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.umax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    extractelement <8 x i64> %{{.*}}, i32 0
+  return _mm512_reduce_max_epu64(__W);
+}
+
+double test_mm512_reduce_max_pd(__m512d __W){
+// CHECK-LABEL: @test_mm512_reduce_max_pd(
+// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> 
+// CHECK:    call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+// CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> 
+// CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> 
+// CHECK:    call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+// CHECK:    shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> 
+// CHECK:    call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+// CHECK:    extractelement <2 x double> %{{.*}}, i32 0
+  return _mm512_reduce_max_pd(__W); 
+}
+
+long long test_mm512_reduce_min_epi64(__m512i __W){
+// CHECK-LABEL: @test_mm512_reduce_min_epi64(
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.smin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.smin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.smin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    extractelement <8 x i64> %{{.*}}, i32 0
+  return _mm512_reduce_min_epi64(__W);
+}
+
+unsigned long long test_mm512_reduce_min_epu64(__m512i __W){
+// CHECK-LABEL: @test_mm512_reduce_min_epu64(
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.umin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.umin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.umin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    extractelement <8 x i64> %{{.*}}, i32 0
+  return _mm512_reduce_min_epu64(__W);
+}
+
+double test_mm512_reduce_min_pd(__m512d __W){
+// CHECK-LABEL: @test_mm512_reduce_min_pd(
+// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> 
+// CHECK:    call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+// CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> 
+// CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> 
+// CHECK:    call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+// CHECK:    shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> 
+// CHECK:    call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+// CHECK:    extractelement <2 x double> %{{.*}}, i32 0
+  return _mm512_reduce_min_pd(__W); 
+}
+
+long long test_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __W){
+// CHECK-LABEL: @test_mm512_mask_reduce_max_epi64(
+// CHECK:    bitcast i8 %{{.*}} to <8 x i1>
+// CHECK:    select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.smax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.smax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.smax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    extractelement <8 x i64> %{{.*}}, i32 0
+  return _mm512_mask_reduce_max_epi64(__M, __W); 
+}
+
+unsigned long test_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __W){
+// CHECK-LABEL: @test_mm512_mask_reduce_max_epu64(
+// CHECK:    bitcast i8 %{{.*}} to <8 x i1>
+// CHECK:    select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.umax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.umax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.umax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    extractelement <8 x i64> %{{.*}}, i32 0
+  return _mm512_mask_reduce_max_epu64(__M, __W); 
+}
+
+double test_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __W){
+// CHECK-LABEL: @test_mm512_mask_reduce_max_pd(
+// CHECK:    bitcast i8 %{{.*}} to <8 x i1>
+// CHECK:    select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> 
+// CHECK:    call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+// CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> 
+// CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> 
+// CHECK:    call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+// CHECK:    shufflevector <2 x double> %{{.*}}, <2 x double>  %{{.*}}, <2 x i32> 
+// CHECK:    call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+// CHECK:    extractelement <2 x double> %{{.*}}, i32 0
+  return _mm512_mask_reduce_max_pd(__M, __W); 
+}
+
+long long test_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __W){
+// CHECK-LABEL: @test_mm512_mask_reduce_min_epi64(
+// CHECK:    bitcast i8 %{{.*}} to <8 x i1>
+// CHECK:    select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.smin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.smin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.smin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    extractelement <8 x i64> %{{.*}}, i32 0
+  return _mm512_mask_reduce_min_epi64(__M, __W); 
+}
+
+unsigned long long test_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __W){
+// CHECK-LABEL: @test_mm512_mask_reduce_min_epu64(
+// CHECK:    bitcast i8 %{{.*}} to <8 x i1>
+// CHECK:    select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.umin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.umin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+// CHECK:    call <8 x i64> @llvm.umin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
+// CHECK:    extractelement <8 x i64> %{{.*}}, i32 0
+  return _mm512_mask_reduce_min_epu64(__M, __W); 
+}
+
+double test_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __W){
+// CHECK-LABEL: @test_mm512_mask_reduce_min_pd(
+// CHECK:    bitcast i8 %{{.*}} to <8 x i1>
+// CHECK:    select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> 
+// CHECK:    call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+// CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> 
+// CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> 
+// CHECK:    call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+// CHECK:    shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> 
+// CHECK:    call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+// CHECK:    extractelement <2 x double> %{{.*}}, i32 0
+  return _mm512_mask_reduce_min_pd(__M, __W); 
+}
+
+int test_mm512_reduce_max_epi32(__m512i __W){
+// CHECK-LABEL: @test_mm512_reduce_max_epi32(
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> 
+// CHECK:    call <8 x i32> @llvm.smax.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> 
+// CHECK:    shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> 
+// CHECK:    call <4 x i32> @llvm.smax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x i32> @llvm.smax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x i32> @llvm.smax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    extractelement <4 x i32> %{{.*}}, i32 0
+  return _mm512_reduce_max_epi32(__W);
+}
+
+unsigned int test_mm512_reduce_max_epu32(__m512i __W){
+// CHECK-LABEL: @test_mm512_reduce_max_epu32(
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> 
+// CHECK:    call <8 x i32> @llvm.umax.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> 
+// CHECK:    shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> 
+// CHECK:    call <4 x i32> @llvm.umax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x i32> @llvm.umax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x i32> @llvm.umax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    extractelement <4 x i32> %{{.*}}, i32 0
+  return _mm512_reduce_max_epu32(__W);
+}
+
+float test_mm512_reduce_max_ps(__m512 __W){
+// CHECK-LABEL: define float @test_mm512_reduce_max_ps(
+// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> 
+// CHECK:    call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+// CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> 
+// CHECK:    call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+// CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+// CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+// CHECK:    extractelement <4 x float> %{{.*}}, i32 0
+  return _mm512_reduce_max_ps(__W); 
+}
+
+int test_mm512_reduce_min_epi32(__m512i __W){
+// CHECK-LABEL: @test_mm512_reduce_min_epi32(
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> 
+// CHECK:    call <8 x i32> @llvm.smin.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> 
+// CHECK:    shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> 
+// CHECK:    call <4 x i32> @llvm.smin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x i32> @llvm.smin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x i32> @llvm.smin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    extractelement <4 x i32> %{{.*}}, i32 0
+  return _mm512_reduce_min_epi32(__W);
+}
+
+unsigned int test_mm512_reduce_min_epu32(__m512i __W){
+// CHECK-LABEL: @test_mm512_reduce_min_epu32(
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> 
+// CHECK:    call <8 x i32> @llvm.umin.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> 
+// CHECK:    shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> 
+// CHECK:    call <4 x i32> @llvm.umin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x i32> @llvm.umin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x i32> @llvm.umin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    extractelement <4 x i32> %{{.*}}, i32 0
+  return _mm512_reduce_min_epu32(__W);
+}
+
+float test_mm512_reduce_min_ps(__m512 __W){
+// CHECK-LABEL: define float @test_mm512_reduce_min_ps(
+// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> 
+// CHECK:    call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+// CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> 
+// CHECK:    call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+// CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+// CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+// CHECK:    extractelement <4 x float> %{{.*}}, i32 0
+  return _mm512_reduce_min_ps(__W); 
+}
+
+int test_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __W){
+// CHECK-LABEL: @test_mm512_mask_reduce_max_epi32(
+// CHECK:    bitcast i16 %{{.*}} to <16 x i1>
+// CHECK:    select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> 
+// CHECK:    call <8 x i32> @llvm.smax.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> 
+// CHECK:    shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> 
+// CHECK:    call <4 x i32> @llvm.smax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x i32> @llvm.smax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x i32> @llvm.smax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    extractelement <4 x i32> %{{.*}}, i32 0
+  return _mm512_mask_reduce_max_epi32(__M, __W); 
+}
+
+unsigned int test_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __W){
+// CHECK-LABEL: @test_mm512_mask_reduce_max_epu32(
+// CHECK:    bitcast i16 %{{.*}} to <16 x i1>
+// CHECK:    select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> 
+// CHECK:    call <8 x i32> @llvm.umax.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> 
+// CHECK:    shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> 
+// CHECK:    call <4 x i32> @llvm.umax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x i32> @llvm.umax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x i32> @llvm.umax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    extractelement <4 x i32> %{{.*}}, i32 0
+  return _mm512_mask_reduce_max_epu32(__M, __W); 
+}
+
+float test_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __W){
+// CHECK-LABEL: define float @test_mm512_mask_reduce_max_ps(
+// CHECK:    bitcast i16 %{{.*}} to <16 x i1>
+// CHECK:    select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> 
+// CHECK:    call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+// CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> 
+// CHECK:    call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+// CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+// CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+// CHECK:    extractelement <4 x float> %{{.*}}, i32 0
+  return _mm512_mask_reduce_max_ps(__M, __W); 
+}
+
+int test_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __W){
+// CHECK-LABEL: @test_mm512_mask_reduce_min_epi32(
+// CHECK:    bitcast i16 %{{.*}} to <16 x i1>
+// CHECK:    select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> 
+// CHECK:    call <8 x i32> @llvm.smin.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> 
+// CHECK:    shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> 
+// CHECK:    call <4 x i32> @llvm.smin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x i32> @llvm.smin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x i32> @llvm.smin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    extractelement <4 x i32> %{{.*}}, i32 0
+  return _mm512_mask_reduce_min_epi32(__M, __W); 
+}
+
+unsigned int test_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __W){
+// CHECK-LABEL: @test_mm512_mask_reduce_min_epu32(
+// CHECK:    bitcast i16 %{{.*}} to <16 x i1>
+// CHECK:    select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> 
+// CHECK:    call <8 x i32> @llvm.umin.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> 
+// CHECK:    shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> 
+// CHECK:    call <4 x i32> @llvm.umin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x i32> @llvm.umin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x i32> @llvm.umin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK:    extractelement <4 x i32> %{{.*}}, i32 0
+  return _mm512_mask_reduce_min_epu32(__M, __W); 
+}
+
+float test_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __W){
+// CHECK-LABEL: define float @test_mm512_mask_reduce_min_ps(
+// CHECK:    bitcast i16 %{{.*}} to <16 x i1>
+// CHECK:    select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> 
+// CHECK:    call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+// CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> 
+// CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> 
+// CHECK:    call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+// CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+// CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> 
+// CHECK:    call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+// CHECK:    extractelement <4 x float> %{{.*}}, i32 0
+  return _mm512_mask_reduce_min_ps(__M, __W); 
+}
+
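[Editorial note] The reduction tests above encode the header expansion of the _mm512_(mask_)reduce_{max,min} family: masked-off lanes are first replaced by the operation's identity element (the bitcast-to-<8 x i1> plus select pair), the survivors are folded by log2(width) rounds of shufflevector plus pairwise max/min, and the result is read from lane 0. A minimal scalar sketch of those semantics, assuming that expansion (the helper name is illustrative, not part of the header):

  #include <stdint.h>

  /* Scalar model of _mm512_mask_reduce_max_epi64: off lanes take the
     identity for signed max, on lanes are folded pairwise. */
  static int64_t reduce_max_epi64_ref(uint8_t mask, const int64_t v[8]) {
    int64_t acc = INT64_MIN;            /* identity element for signed max */
    for (int i = 0; i < 8; ++i)
      if (mask & (1u << i))             /* models the <8 x i1> select */
        acc = v[i] > acc ? v[i] : acc;  /* models the pairwise smax tree */
    return acc;
  }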
diff --git a/clang/test/CodeGen/avx512bf16-builtins.c b/clang/test/CodeGen/X86/avx512bf16-builtins.c
similarity index 100%
rename from clang/test/CodeGen/avx512bf16-builtins.c
rename to clang/test/CodeGen/X86/avx512bf16-builtins.c
diff --git a/clang/test/CodeGen/avx512bitalg-builtins.c b/clang/test/CodeGen/X86/avx512bitalg-builtins.c
similarity index 100%
rename from clang/test/CodeGen/avx512bitalg-builtins.c
rename to clang/test/CodeGen/X86/avx512bitalg-builtins.c
diff --git a/clang/test/CodeGen/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
similarity index 96%
rename from clang/test/CodeGen/avx512bw-builtins.c
rename to clang/test/CodeGen/X86/avx512bw-builtins.c
index cc173f1a9cfe6..58b2488f3caf0 100644
--- a/clang/test/CodeGen/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -1088,161 +1088,137 @@ __m512i test_mm512_maskz_avg_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
 }
 __m512i test_mm512_max_epi8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_max_epi8
-  // CHECK:       [[CMP:%.*]] = icmp sgt <64 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.smax.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   return _mm512_max_epi8(__A,__B); 
 }
 __m512i test_mm512_maskz_max_epi8(__mmask64 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_max_epi8
-  // CHECK:       [[CMP:%.*]] = icmp sgt <64 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.smax.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK:       select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}}
   return _mm512_maskz_max_epi8(__M,__A,__B); 
 }
 __m512i test_mm512_mask_max_epi8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_max_epi8
-  // CHECK:       [[CMP:%.*]] = icmp sgt <64 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.smax.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK:       select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}}
   return _mm512_mask_max_epi8(__W,__M,__A,__B); 
 }
 __m512i test_mm512_max_epi16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_max_epi16
-  // CHECK:       [[CMP:%.*]] = icmp sgt <32 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.smax.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_max_epi16(__A,__B); 
 }
 __m512i test_mm512_maskz_max_epi16(__mmask32 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_max_epi16
-  // CHECK:       [[CMP:%.*]] = icmp sgt <32 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.smax.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK:       select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}}
   return _mm512_maskz_max_epi16(__M,__A,__B); 
 }
 __m512i test_mm512_mask_max_epi16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_max_epi16
-  // CHECK:       [[CMP:%.*]] = icmp sgt <32 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.smax.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK:       select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}}
   return _mm512_mask_max_epi16(__W,__M,__A,__B); 
 }
 __m512i test_mm512_max_epu8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_max_epu8
-  // CHECK:       [[CMP:%.*]] = icmp ugt <64 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.umax.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   return _mm512_max_epu8(__A,__B); 
 }
 __m512i test_mm512_maskz_max_epu8(__mmask64 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_max_epu8
-  // CHECK:       [[CMP:%.*]] = icmp ugt <64 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.umax.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK:       select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}}
   return _mm512_maskz_max_epu8(__M,__A,__B); 
 }
 __m512i test_mm512_mask_max_epu8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_max_epu8
-  // CHECK:       [[CMP:%.*]] = icmp ugt <64 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.umax.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK:       select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}}
   return _mm512_mask_max_epu8(__W,__M,__A,__B); 
 }
 __m512i test_mm512_max_epu16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_max_epu16
-  // CHECK:       [[CMP:%.*]] = icmp ugt <32 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.umax.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_max_epu16(__A,__B); 
 }
 __m512i test_mm512_maskz_max_epu16(__mmask32 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_max_epu16
-  // CHECK:       [[CMP:%.*]] = icmp ugt <32 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.umax.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK:       select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}}
   return _mm512_maskz_max_epu16(__M,__A,__B); 
 }
 __m512i test_mm512_mask_max_epu16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_max_epu16
-  // CHECK:       [[CMP:%.*]] = icmp ugt <32 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.umax.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK:       select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}}
   return _mm512_mask_max_epu16(__W,__M,__A,__B); 
 }
 __m512i test_mm512_min_epi8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_min_epi8
-  // CHECK:       [[CMP:%.*]] = icmp slt <64 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.smin.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   return _mm512_min_epi8(__A,__B); 
 }
 __m512i test_mm512_maskz_min_epi8(__mmask64 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_min_epi8
-  // CHECK:       [[CMP:%.*]] = icmp slt <64 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.smin.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK:       select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}}
   return _mm512_maskz_min_epi8(__M,__A,__B); 
 }
 __m512i test_mm512_mask_min_epi8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_min_epi8
-  // CHECK:       [[CMP:%.*]] = icmp slt <64 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.smin.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK:       select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}}
   return _mm512_mask_min_epi8(__W,__M,__A,__B); 
 }
 __m512i test_mm512_min_epi16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_min_epi16
-  // CHECK:       [[CMP:%.*]] = icmp slt <32 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.smin.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_min_epi16(__A,__B); 
 }
 __m512i test_mm512_maskz_min_epi16(__mmask32 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_min_epi16
-  // CHECK:       [[CMP:%.*]] = icmp slt <32 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.smin.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK:       select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}}
   return _mm512_maskz_min_epi16(__M,__A,__B); 
 }
 __m512i test_mm512_mask_min_epi16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_min_epi16
-  // CHECK:       [[CMP:%.*]] = icmp slt <32 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.smin.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK:       select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}}
   return _mm512_mask_min_epi16(__W,__M,__A,__B); 
 }
 __m512i test_mm512_min_epu8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_min_epu8
-  // CHECK:       [[CMP:%.*]] = icmp ult <64 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.umin.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   return _mm512_min_epu8(__A,__B); 
 }
 __m512i test_mm512_maskz_min_epu8(__mmask64 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_min_epu8
-  // CHECK:       [[CMP:%.*]] = icmp ult <64 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.umin.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK:       select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}}
   return _mm512_maskz_min_epu8(__M,__A,__B); 
 }
 __m512i test_mm512_mask_min_epu8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_min_epu8
-  // CHECK:       [[CMP:%.*]] = icmp ult <64 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[X]], <64 x i8> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.umin.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK:       select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}}
   return _mm512_mask_min_epu8(__W,__M,__A,__B); 
 }
 __m512i test_mm512_min_epu16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_min_epu16
-  // CHECK:       [[CMP:%.*]] = icmp ult <32 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.umin.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_min_epu16(__A,__B); 
 }
 __m512i test_mm512_maskz_min_epu16(__mmask32 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_min_epu16
-  // CHECK:       [[CMP:%.*]] = icmp ult <32 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.umin.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK:       select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}}
   return _mm512_maskz_min_epu16(__M,__A,__B); 
 }
 __m512i test_mm512_mask_min_epu16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_min_epu16
-  // CHECK:       [[CMP:%.*]] = icmp ult <32 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[X]], <32 x i16> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.umin.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK:       select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}}
   return _mm512_mask_min_epu16(__W,__M,__A,__B); 
 }
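[Editorial note] The hunk above tracks a clang CodeGen change: the AVX-512BW byte/word max/min builtins are now lowered to the generic llvm.smax/umax/smin/umin intrinsics rather than the older icmp-plus-select idiom, so the FileCheck patterns are updated to match the call form. The two IR spellings are equivalent lane-wise; a hedged C model of the v32i16 case (the function name is illustrative):

  /* Each lane computes the same value under both IR forms:
     old: icmp sgt + select      new: call @llvm.smax.v32i16 */
  static void smax_v32i16_ref(const short a[32], const short b[32],
                              short out[32]) {
    for (int i = 0; i < 32; ++i)
      out[i] = a[i] > b[i] ? a[i] : b[i];
  }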
diff --git a/clang/test/CodeGen/avx512cdintrin.c b/clang/test/CodeGen/X86/avx512cdintrin.c
similarity index 100%
rename from clang/test/CodeGen/avx512cdintrin.c
rename to clang/test/CodeGen/X86/avx512cdintrin.c
diff --git a/clang/test/CodeGen/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c
similarity index 100%
rename from clang/test/CodeGen/avx512dq-builtins.c
rename to clang/test/CodeGen/X86/avx512dq-builtins.c
diff --git a/clang/test/CodeGen/avx512er-builtins.c b/clang/test/CodeGen/X86/avx512er-builtins.c
similarity index 100%
rename from clang/test/CodeGen/avx512er-builtins.c
rename to clang/test/CodeGen/X86/avx512er-builtins.c
diff --git a/clang/test/CodeGen/avx512f-builtins-constrained-cmp.c b/clang/test/CodeGen/X86/avx512f-builtins-constrained-cmp.c
similarity index 100%
rename from clang/test/CodeGen/avx512f-builtins-constrained-cmp.c
rename to clang/test/CodeGen/X86/avx512f-builtins-constrained-cmp.c
diff --git a/clang/test/CodeGen/avx512f-builtins-constrained.c b/clang/test/CodeGen/X86/avx512f-builtins-constrained.c
similarity index 100%
rename from clang/test/CodeGen/avx512f-builtins-constrained.c
rename to clang/test/CodeGen/X86/avx512f-builtins-constrained.c
diff --git a/clang/test/CodeGen/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c
similarity index 99%
rename from clang/test/CodeGen/avx512f-builtins.c
rename to clang/test/CodeGen/X86/avx512f-builtins.c
index fb5db4c321748..a4b23eb1cf5e2 100644
--- a/clang/test/CodeGen/avx512f-builtins.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins.c
@@ -9882,16 +9882,14 @@ __m512d test_mm512_roundscale_round_pd(__m512d __A)
 __m512i test_mm512_max_epi32 (__m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_max_epi32 
-  // CHECK:       [[CMP:%.*]] = icmp sgt <16 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
   return _mm512_max_epi32 (__A,__B);
 }
 
 __m512i test_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_mask_max_epi32 
-  // CHECK:       [[CMP:%.*]] = icmp sgt <16 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
   // CHECK:       select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}}
   return _mm512_mask_max_epi32 (__W,__M,__A,__B);
 }
@@ -9899,8 +9897,7 @@ __m512i test_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m5
 __m512i test_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_maskz_max_epi32 
-  // CHECK:       [[CMP:%.*]] = icmp sgt <16 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
   // CHECK:       select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}}
   return _mm512_maskz_max_epi32 (__M,__A,__B);
 }
@@ -9908,16 +9905,14 @@ __m512i test_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
 __m512i test_mm512_max_epi64 (__m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_max_epi64 
-  // CHECK:       [[CMP:%.*]] = icmp sgt <8 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.smax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
   return _mm512_max_epi64 (__A,__B);
 }
 
 __m512i test_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_mask_max_epi64 
-  // CHECK:       [[CMP:%.*]] = icmp sgt <8 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.smax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
   // CHECK:       select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}}
   return _mm512_mask_max_epi64 (__W,__M,__A,__B);
 }
@@ -9925,8 +9920,7 @@ __m512i test_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m51
 __m512i test_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_maskz_max_epi64 
-  // CHECK:       [[CMP:%.*]] = icmp sgt <8 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.smax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
   // CHECK:       select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}}
   return _mm512_maskz_max_epi64 (__M,__A,__B);
 }
@@ -9934,16 +9928,14 @@ __m512i test_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
 __m512i test_mm512_max_epu64 (__m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_max_epu64 
-  // CHECK:       [[CMP:%.*]] = icmp ugt <8 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.umax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
   return _mm512_max_epu64 (__A,__B);
 }
 
 __m512i test_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_mask_max_epu64 
-  // CHECK:       [[CMP:%.*]] = icmp ugt <8 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.umax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
   // CHECK:       select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}}
   return _mm512_mask_max_epu64 (__W,__M,__A,__B);
 }
@@ -9951,8 +9943,7 @@ __m512i test_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m51
 __m512i test_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_maskz_max_epu64 
-  // CHECK:       [[CMP:%.*]] = icmp ugt <8 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.umax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
   // CHECK:       select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}}
   return _mm512_maskz_max_epu64 (__M,__A,__B);
 }
@@ -9960,16 +9951,14 @@ __m512i test_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
 __m512i test_mm512_max_epu32 (__m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_max_epu32 
-  // CHECK:       [[CMP:%.*]] = icmp ugt <16 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.umax.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
   return _mm512_max_epu32 (__A,__B);
 }
 
 __m512i test_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_mask_max_epu32 
-  // CHECK:       [[CMP:%.*]] = icmp ugt <16 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.umax.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
   // CHECK:       select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}}
   return _mm512_mask_max_epu32 (__W,__M,__A,__B);
 }
@@ -9977,8 +9966,7 @@ __m512i test_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m5
 __m512i test_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_maskz_max_epu32 
-  // CHECK:       [[CMP:%.*]] = icmp ugt <16 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.umax.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
   // CHECK:       select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}}
   return _mm512_maskz_max_epu32 (__M,__A,__B);
 }
@@ -9986,16 +9974,14 @@ __m512i test_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
 __m512i test_mm512_min_epi32 (__m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_min_epi32 
-  // CHECK:       [[CMP:%.*]] = icmp slt <16 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
   return _mm512_min_epi32 (__A,__B);
 }
 
 __m512i test_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_mask_min_epi32 
-  // CHECK:       [[CMP:%.*]] = icmp slt <16 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
   // CHECK:       select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}}
   return _mm512_mask_min_epi32 (__W,__M,__A,__B);
 }
@@ -10003,8 +9989,7 @@ __m512i test_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m5
 __m512i test_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_maskz_min_epi32 
-  // CHECK:       [[CMP:%.*]] = icmp slt <16 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
   // CHECK:       select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}}
   return _mm512_maskz_min_epi32 (__M,__A,__B);
 }
@@ -10012,16 +9997,14 @@ __m512i test_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
 __m512i test_mm512_min_epu32 (__m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_min_epu32 
-  // CHECK:       [[CMP:%.*]] = icmp ult <16 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
   return _mm512_min_epu32 (__A,__B);
 }
 
 __m512i test_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_mask_min_epu32 
-  // CHECK:       [[CMP:%.*]] = icmp ult <16 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
   // CHECK:       select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}}
   return _mm512_mask_min_epu32 (__W,__M,__A,__B);
 }
@@ -10029,8 +10012,7 @@ __m512i test_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m5
 __m512i test_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_maskz_min_epu32 
-  // CHECK:       [[CMP:%.*]] = icmp ult <16 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[X]], <16 x i32> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
   // CHECK:       select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}}
   return _mm512_maskz_min_epu32 (__M,__A,__B);
 }
@@ -10038,16 +10020,14 @@ __m512i test_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
 __m512i test_mm512_min_epi64 (__m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_min_epi64 
-  // CHECK:       [[CMP:%.*]] = icmp slt <8 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.smin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
   return _mm512_min_epi64 (__A,__B);
 }
 
 __m512i test_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_mask_min_epi64 
-  // CHECK:       [[CMP:%.*]] = icmp slt <8 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.smin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
   // CHECK:       select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}}
   return _mm512_mask_min_epi64 (__W,__M,__A,__B);
 }
@@ -10055,8 +10035,7 @@ __m512i test_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m51
 __m512i test_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_maskz_min_epi64 
-  // CHECK:       [[CMP:%.*]] = icmp slt <8 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.smin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
   // CHECK:       select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}}
   return _mm512_maskz_min_epi64 (__M,__A,__B);
 }
@@ -10064,16 +10043,14 @@ __m512i test_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
 __m512i test_mm512_min_epu64 (__m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_min_epu64 
-  // CHECK:       [[CMP:%.*]] = icmp ult <8 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.umin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
   return _mm512_min_epu64 (__A,__B);
 }
 
 __m512i test_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_mask_min_epu64 
-  // CHECK:       [[CMP:%.*]] = icmp ult <8 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.umin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
   // CHECK:       select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}}
   return _mm512_mask_min_epu64 (__W,__M,__A,__B);
 }
@@ -10081,8 +10058,7 @@ __m512i test_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m51
 __m512i test_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
 {
   // CHECK-LABEL: @test_mm512_maskz_min_epu64 
-  // CHECK:       [[CMP:%.*]] = icmp ult <8 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[X]], <8 x i64> [[Y]]
+  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.umin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
   // CHECK:       select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}}
   return _mm512_maskz_min_epu64 (__M,__A,__B);
 }
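[Editorial note] The avx512f hunks above make the same intrinsic switch for the 512-bit dword/qword max/min builtins, including the mask and maskz forms: the full-width result is computed first and then blended under the mask, which is why every masked test still checks a trailing select <N x i1> after the intrinsic call. A sketch of the maskz semantics under that blend order (names illustrative):

  #include <stdint.h>

  /* Scalar model of _mm512_maskz_max_epi32: max first, then mask blend
     with zero for the lanes the mask turns off. */
  static void maskz_max_epi32_ref(uint16_t m, const int32_t a[16],
                                  const int32_t b[16], int32_t out[16]) {
    for (int i = 0; i < 16; ++i) {
      int32_t mx = a[i] > b[i] ? a[i] : b[i]; /* @llvm.smax.v16i32 lane */
      out[i] = (m & (1u << i)) ? mx : 0;      /* trailing select; 0 = maskz */
    }
  }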
diff --git a/clang/test/CodeGen/avx512ifma-builtins.c b/clang/test/CodeGen/X86/avx512ifma-builtins.c
similarity index 100%
rename from clang/test/CodeGen/avx512ifma-builtins.c
rename to clang/test/CodeGen/X86/avx512ifma-builtins.c
diff --git a/clang/test/CodeGen/avx512ifmavl-builtins.c b/clang/test/CodeGen/X86/avx512ifmavl-builtins.c
similarity index 100%
rename from clang/test/CodeGen/avx512ifmavl-builtins.c
rename to clang/test/CodeGen/X86/avx512ifmavl-builtins.c
diff --git a/clang/test/CodeGen/avx512pf-builtins.c b/clang/test/CodeGen/X86/avx512pf-builtins.c
similarity index 100%
rename from clang/test/CodeGen/avx512pf-builtins.c
rename to clang/test/CodeGen/X86/avx512pf-builtins.c
diff --git a/clang/test/CodeGen/avx512vbmi-builtins.c b/clang/test/CodeGen/X86/avx512vbmi-builtins.c
similarity index 100%
rename from clang/test/CodeGen/avx512vbmi-builtins.c
rename to clang/test/CodeGen/X86/avx512vbmi-builtins.c
diff --git a/clang/test/CodeGen/avx512vbmi2-builtins.c b/clang/test/CodeGen/X86/avx512vbmi2-builtins.c
similarity index 100%
rename from clang/test/CodeGen/avx512vbmi2-builtins.c
rename to clang/test/CodeGen/X86/avx512vbmi2-builtins.c
diff --git a/clang/test/CodeGen/avx512vbmivl-builtin.c b/clang/test/CodeGen/X86/avx512vbmivl-builtin.c
similarity index 100%
rename from clang/test/CodeGen/avx512vbmivl-builtin.c
rename to clang/test/CodeGen/X86/avx512vbmivl-builtin.c
diff --git a/clang/test/CodeGen/avx512vl-builtins-constrained-cmp.c b/clang/test/CodeGen/X86/avx512vl-builtins-constrained-cmp.c
similarity index 100%
rename from clang/test/CodeGen/avx512vl-builtins-constrained-cmp.c
rename to clang/test/CodeGen/X86/avx512vl-builtins-constrained-cmp.c
diff --git a/clang/test/CodeGen/avx512vl-builtins-constrained.c b/clang/test/CodeGen/X86/avx512vl-builtins-constrained.c
similarity index 100%
rename from clang/test/CodeGen/avx512vl-builtins-constrained.c
rename to clang/test/CodeGen/X86/avx512vl-builtins-constrained.c
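[Editorial note] In the avx512vl hunks that follow, the 32-bit element tests additionally match a bitcast pair (<4 x i32> to <2 x i64> and back, or the <8 x i32>/<4 x i64> analogue) between the intrinsic call and the mask select. That round trip is value-preserving: __m128i and __m256i are declared in terms of 64-bit elements, so the result passes through the builtin vector type before blending. A short sketch under that reading (name illustrative):

  #include <stdint.h>
  #include <string.h>

  /* The bitcast round trip the tests match is the identity on the
     underlying 128 bits; in C it corresponds to copying through the
     64-bit-element representation and back. */
  static void bitcast_roundtrip_identity(const int32_t in[4], int32_t out[4]) {
    int64_t tmp[2];
    memcpy(tmp, in, sizeof tmp);   /* <4 x i32> -> <2 x i64> */
    memcpy(out, tmp, sizeof tmp);  /* <2 x i64> -> <4 x i32>: bits unchanged */
  }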
diff --git a/clang/test/CodeGen/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c
similarity index 98%
rename from clang/test/CodeGen/avx512vl-builtins.c
rename to clang/test/CodeGen/X86/avx512vl-builtins.c
index e7965119fb4b9..248cb61d97ae4 100644
--- a/clang/test/CodeGen/avx512vl-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vl-builtins.c
@@ -4603,8 +4603,7 @@ __m256i test_mm256_maskz_abs_epi64(__mmask8 __U, __m256i __A) {
 }
 __m128i test_mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_max_epi32
-  // CHECK:       [[CMP:%.*]] = icmp sgt <4 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast <4 x i32> [[RES]] to <2 x i64>
   // CHECK: [[RES:%.*]] = bitcast <2 x i64> [[TMP]] to <4 x i32>
   // CHECK:       select <4 x i1> {{.*}}, <4 x i32> [[RES]], <4 x i32> {{.*}}
@@ -4612,8 +4611,7 @@ __m128i test_mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
 }
 __m128i test_mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_max_epi32
-  // CHECK:       [[CMP:%.*]] = icmp sgt <4 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast <4 x i32> [[RES]] to <2 x i64>
   // CHECK: [[RES:%.*]] = bitcast <2 x i64> [[TMP]] to <4 x i32>
   // CHECK:       select <4 x i1> {{.*}}, <4 x i32> [[RES]], <4 x i32> {{.*}}
@@ -4621,8 +4619,7 @@ __m128i test_mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i _
 }
 __m256i test_mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_max_epi32
-  // CHECK:       [[CMP:%.*]] = icmp sgt <8 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  // CHECK: [[RES:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast <8 x i32> [[RES]] to <4 x i64>
   // CHECK: [[RES:%.*]] = bitcast <4 x i64> [[TMP]] to <8 x i32>
   // CHECK:       select <8 x i1> {{.*}}, <8 x i32> [[RES]], <8 x i32> {{.*}}
@@ -4630,8 +4627,7 @@ __m256i test_mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
 }
 __m256i test_mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_max_epi32
-  // CHECK:       [[CMP:%.*]] = icmp sgt <8 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  // CHECK: [[RES:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast <8 x i32> [[RES]] to <4 x i64>
   // CHECK: [[RES:%.*]] = bitcast <4 x i64> [[TMP]] to <8 x i32>
   // CHECK:       select <8 x i1> {{.*}}, <8 x i32> [[RES]], <8 x i32> {{.*}}
@@ -4639,48 +4635,41 @@ __m256i test_mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256
 }
 __m128i test_mm_maskz_max_epi64(__mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_max_epi64
-  // CHECK:       [[CMP:%.*]] = icmp sgt <2 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   // CHECK:       select <2 x i1> {{.*}}, <2 x i64> [[RES]], <2 x i64> {{.*}}
   return _mm_maskz_max_epi64(__M,__A,__B); 
 }
 __m128i test_mm_mask_max_epi64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_max_epi64
-  // CHECK:       [[CMP:%.*]] = icmp sgt <2 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   // CHECK:       select <2 x i1> {{.*}}, <2 x i64> [[RES]], <2 x i64> {{.*}}
   return _mm_mask_max_epi64(__W,__M,__A,__B); 
 }
 __m128i test_mm_max_epi64(__m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_max_epi64
-  // CHECK:       [[CMP:%.*]] = icmp sgt <2 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_max_epi64(__A,__B); 
 }
 __m256i test_mm256_maskz_max_epi64(__mmask8 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_max_epi64
-  // CHECK:       [[CMP:%.*]] = icmp sgt <4 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   // CHECK:       select <4 x i1> {{.*}}, <4 x i64> [[RES]], <4 x i64> {{.*}}
   return _mm256_maskz_max_epi64(__M,__A,__B); 
 }
 __m256i test_mm256_mask_max_epi64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_max_epi64
-  // CHECK:       [[CMP:%.*]] = icmp sgt <4 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   // CHECK:       select <4 x i1> {{.*}}, <4 x i64> [[RES]], <4 x i64> {{.*}}
   return _mm256_mask_max_epi64(__W,__M,__A,__B); 
 }
 __m256i test_mm256_max_epi64(__m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_max_epi64
-  // CHECK:       [[CMP:%.*]] = icmp sgt <4 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   return _mm256_max_epi64(__A,__B); 
 }
 __m128i test_mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_max_epu32
-  // CHECK:       [[CMP:%.*]] = icmp ugt <4 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast <4 x i32> [[RES]] to <2 x i64>
   // CHECK: [[RES:%.*]] = bitcast <2 x i64> [[TMP]] to <4 x i32>
   // CHECK:       select <4 x i1> {{.*}}, <4 x i32> [[RES]], <4 x i32> {{.*}}
@@ -4688,8 +4677,7 @@ __m128i test_mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
 }
 __m128i test_mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_max_epu32
-  // CHECK:       [[CMP:%.*]] = icmp ugt <4 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast <4 x i32> [[RES]] to <2 x i64>
   // CHECK: [[RES:%.*]] = bitcast <2 x i64> [[TMP]] to <4 x i32>
   // CHECK:       select <4 x i1> {{.*}}, <4 x i32> [[RES]], <4 x i32> {{.*}}
@@ -4697,8 +4685,7 @@ __m128i test_mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i _
 }
 __m256i test_mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_max_epu32
-  // CHECK:       [[CMP:%.*]] = icmp ugt <8 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  // CHECK: [[RES:%.*]] = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast <8 x i32> [[RES]] to <4 x i64>
   // CHECK: [[RES:%.*]] = bitcast <4 x i64> [[TMP]] to <8 x i32>
   // CHECK:       select <8 x i1> {{.*}}, <8 x i32> [[RES]], <8 x i32> {{.*}}
@@ -4706,8 +4693,7 @@ __m256i test_mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
 }
 __m256i test_mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_max_epu32
-  // CHECK:       [[CMP:%.*]] = icmp ugt <8 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  // CHECK: [[RES:%.*]] = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast <8 x i32> [[RES]] to <4 x i64>
   // CHECK: [[RES:%.*]] = bitcast <4 x i64> [[TMP]] to <8 x i32>
   // CHECK:       select <8 x i1> {{.*}}, <8 x i32> [[RES]], <8 x i32> {{.*}}
@@ -4715,48 +4701,41 @@ __m256i test_mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256
 }
 __m128i test_mm_maskz_max_epu64(__mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_max_epu64
-  // CHECK:       [[CMP:%.*]] = icmp ugt <2 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   // CHECK:       select <2 x i1> {{.*}}, <2 x i64> [[RES]], <2 x i64> {{.*}}
   return _mm_maskz_max_epu64(__M,__A,__B); 
 }
 __m128i test_mm_max_epu64(__m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_max_epu64
-  // CHECK:       [[CMP:%.*]] = icmp ugt <2 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_max_epu64(__A,__B); 
 }
 __m128i test_mm_mask_max_epu64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_max_epu64
-  // CHECK:       [[CMP:%.*]] = icmp ugt <2 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   // CHECK:       select <2 x i1> {{.*}}, <2 x i64> [[RES]], <2 x i64> {{.*}}
   return _mm_mask_max_epu64(__W,__M,__A,__B); 
 }
 __m256i test_mm256_maskz_max_epu64(__mmask8 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_max_epu64
-  // CHECK:       [[CMP:%.*]] = icmp ugt <4 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   // CHECK:       select <4 x i1> {{.*}}, <4 x i64> [[RES]], <4 x i64> {{.*}}
   return _mm256_maskz_max_epu64(__M,__A,__B); 
 }
 __m256i test_mm256_max_epu64(__m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_max_epu64
-  // CHECK:       [[CMP:%.*]] = icmp ugt <4 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   return _mm256_max_epu64(__A,__B); 
 }
 __m256i test_mm256_mask_max_epu64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_max_epu64
-  // CHECK:       [[CMP:%.*]] = icmp ugt <4 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   // CHECK:       select <4 x i1> {{.*}}, <4 x i64> [[RES]], <4 x i64> {{.*}}
   return _mm256_mask_max_epu64(__W,__M,__A,__B); 
 }
 __m128i test_mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_min_epi32
-  // CHECK:       [[CMP:%.*]] = icmp slt <4 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast <4 x i32> [[RES]] to <2 x i64>
   // CHECK: [[RES:%.*]] = bitcast <2 x i64> [[TMP]] to <4 x i32>
   // CHECK:       select <4 x i1> {{.*}}, <4 x i32> [[RES]], <4 x i32> {{.*}}
@@ -4764,8 +4743,7 @@ __m128i test_mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
 }
 __m128i test_mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_min_epi32
-  // CHECK:       [[CMP:%.*]] = icmp slt <4 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast <4 x i32> [[RES]] to <2 x i64>
   // CHECK: [[RES:%.*]] = bitcast <2 x i64> [[TMP]] to <4 x i32>
   // CHECK:       select <4 x i1> {{.*}}, <4 x i32> [[RES]], <4 x i32> {{.*}}
@@ -4773,8 +4751,7 @@ __m128i test_mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i _
 }
 __m256i test_mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_min_epi32
-  // CHECK:       [[CMP:%.*]] = icmp slt <8 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  // CHECK: [[RES:%.*]] = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast <8 x i32> [[RES]] to <4 x i64>
   // CHECK: [[RES:%.*]] = bitcast <4 x i64> [[TMP]] to <8 x i32>
   // CHECK:       select <8 x i1> {{.*}}, <8 x i32> [[RES]], <8 x i32> {{.*}}
@@ -4782,8 +4759,7 @@ __m256i test_mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
 }
 __m256i test_mm256_mask_min_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_min_epi32
-  // CHECK:       [[CMP:%.*]] = icmp slt <8 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  // CHECK: [[RES:%.*]] = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast <8 x i32> [[RES]] to <4 x i64>
   // CHECK: [[RES:%.*]] = bitcast <4 x i64> [[TMP]] to <8 x i32>
   // CHECK:       select <8 x i1> {{.*}}, <8 x i32> [[RES]], <8 x i32> {{.*}}
@@ -4791,48 +4767,41 @@ __m256i test_mm256_mask_min_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256
 }
 __m128i test_mm_min_epi64(__m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_min_epi64
-  // CHECK:       [[CMP:%.*]] = icmp slt <2 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_min_epi64(__A,__B); 
 }
 __m128i test_mm_mask_min_epi64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_min_epi64
-  // CHECK:       [[CMP:%.*]] = icmp slt <2 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   // CHECK:       select <2 x i1> {{.*}}, <2 x i64> [[RES]], <2 x i64> {{.*}}
   return _mm_mask_min_epi64(__W,__M,__A,__B); 
 }
 __m128i test_mm_maskz_min_epi64(__mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_min_epi64
-  // CHECK:       [[CMP:%.*]] = icmp slt <2 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   // CHECK:       select <2 x i1> {{.*}}, <2 x i64> [[RES]], <2 x i64> {{.*}}
   return _mm_maskz_min_epi64(__M,__A,__B); 
 }
 __m256i test_mm256_min_epi64(__m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_min_epi64
-  // CHECK:       [[CMP:%.*]] = icmp slt <4 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   return _mm256_min_epi64(__A,__B); 
 }
 __m256i test_mm256_mask_min_epi64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_min_epi64
-  // CHECK:       [[CMP:%.*]] = icmp slt <4 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   // CHECK:       select <4 x i1> {{.*}}, <4 x i64> [[RES]], <4 x i64> {{.*}}
   return _mm256_mask_min_epi64(__W,__M,__A,__B); 
 }
 __m256i test_mm256_maskz_min_epi64(__mmask8 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_min_epi64
-  // CHECK:       [[CMP:%.*]] = icmp slt <4 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   // CHECK:       select <4 x i1> {{.*}}, <4 x i64> [[RES]], <4 x i64> {{.*}}
   return _mm256_maskz_min_epi64(__M,__A,__B); 
 }
 __m128i test_mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_min_epu32
-  // CHECK:       [[CMP:%.*]] = icmp ult <4 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast <4 x i32> [[RES]] to <2 x i64>
   // CHECK: [[RES:%.*]] = bitcast <2 x i64> [[TMP]] to <4 x i32>
   // CHECK:       select <4 x i1> {{.*}}, <4 x i32> [[RES]], <4 x i32> {{.*}}
@@ -4840,8 +4809,7 @@ __m128i test_mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
 }
 __m128i test_mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_min_epu32
-  // CHECK:       [[CMP:%.*]] = icmp ult <4 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast <4 x i32> [[RES]] to <2 x i64>
   // CHECK: [[RES:%.*]] = bitcast <2 x i64> [[TMP]] to <4 x i32>
   // CHECK:       select <4 x i1> {{.*}}, <4 x i32> [[RES]], <4 x i32> {{.*}}
@@ -4849,8 +4817,7 @@ __m128i test_mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i _
 }
 __m256i test_mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_min_epu32
-  // CHECK:       [[CMP:%.*]] = icmp ult <8 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  // CHECK: [[RES:%.*]] = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast <8 x i32> [[RES]] to <4 x i64>
   // CHECK: [[RES:%.*]] = bitcast <4 x i64> [[TMP]] to <8 x i32>
   // CHECK:       select <8 x i1> {{.*}}, <8 x i32> [[RES]], <8 x i32> {{.*}}
@@ -4858,8 +4825,7 @@ __m256i test_mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
 }
 __m256i test_mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_min_epu32
-  // CHECK:       [[CMP:%.*]] = icmp ult <8 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  // CHECK: [[RES:%.*]] = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast <8 x i32> [[RES]] to <4 x i64>
   // CHECK: [[RES:%.*]] = bitcast <4 x i64> [[TMP]] to <8 x i32>
   // CHECK:       select <8 x i1> {{.*}}, <8 x i32> [[RES]], <8 x i32> {{.*}}
@@ -4867,41 +4833,35 @@ __m256i test_mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256
 }
 __m128i test_mm_min_epu64(__m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_min_epu64
-  // CHECK:       [[CMP:%.*]] = icmp ult <2 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_min_epu64(__A,__B); 
 }
 __m128i test_mm_mask_min_epu64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_min_epu64
-  // CHECK:       [[CMP:%.*]] = icmp ult <2 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   // CHECK:       select <2 x i1> {{.*}}, <2 x i64> [[RES]], <2 x i64> {{.*}}
   return _mm_mask_min_epu64(__W,__M,__A,__B); 
 }
 __m128i test_mm_maskz_min_epu64(__mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_min_epu64
-  // CHECK:       [[CMP:%.*]] = icmp ult <2 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i64> [[X]], <2 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   // CHECK:       select <2 x i1> {{.*}}, <2 x i64> [[RES]], <2 x i64> {{.*}}
   return _mm_maskz_min_epu64(__M,__A,__B); 
 }
 __m256i test_mm256_min_epu64(__m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_min_epu64
-  // CHECK:       [[CMP:%.*]] = icmp ult <4 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   return _mm256_min_epu64(__A,__B); 
 }
 __m256i test_mm256_mask_min_epu64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_min_epu64
-  // CHECK:       [[CMP:%.*]] = icmp ult <4 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   // CHECK:       select <4 x i1> {{.*}}, <4 x i64> [[RES]], <4 x i64> {{.*}}
   return _mm256_mask_min_epu64(__W,__M,__A,__B); 
 }
 __m256i test_mm256_maskz_min_epu64(__mmask8 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_min_epu64
-  // CHECK:       [[CMP:%.*]] = icmp ult <4 x i64> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[X]], <4 x i64> [[Y]]
+  // CHECK: [[RES:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   // CHECK:       select <4 x i1> {{.*}}, <4 x i64> [[RES]], <4 x i64> {{.*}}
   return _mm256_maskz_min_epu64(__M,__A,__B); 
 }
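
Note: every min/max CHECK update above tracks the same codegen change — Clang now emits the dedicated llvm.{s,u}{min,max} vector intrinsics instead of the compare-and-select idiom. For one representative case, the two IR shapes side by side (a sketch matching the patterns in the hunks above; value names are illustrative):

  ; old lowering: compare, then select the greater operand
  %cmp = icmp ugt <2 x i64> %a, %b
  %res = select <2 x i1> %cmp, <2 x i64> %a, <2 x i64> %b

  ; new lowering: one intrinsic call with identical element-wise semantics
  %res = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %a, <2 x i64> %b)
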
diff --git a/clang/test/CodeGen/avx512vlbf16-builtins.c b/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
similarity index 100%
rename from clang/test/CodeGen/avx512vlbf16-builtins.c
rename to clang/test/CodeGen/X86/avx512vlbf16-builtins.c
diff --git a/clang/test/CodeGen/avx512vlbitalg-builtins.c b/clang/test/CodeGen/X86/avx512vlbitalg-builtins.c
similarity index 100%
rename from clang/test/CodeGen/avx512vlbitalg-builtins.c
rename to clang/test/CodeGen/X86/avx512vlbitalg-builtins.c
diff --git a/clang/test/CodeGen/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
similarity index 96%
rename from clang/test/CodeGen/avx512vlbw-builtins.c
rename to clang/test/CodeGen/X86/avx512vlbw-builtins.c
index df2adfdb97be6..36feafd29437b 100644
--- a/clang/test/CodeGen/avx512vlbw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
@@ -1226,8 +1226,7 @@ __m256i test_mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
 }
 __m128i test_mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_max_epi8
-  // CHECK:       [[CMP:%.*]] = icmp sgt <16 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
+  // CHECK: [[RES:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i8>]] [[RES]] to [[DSTTY:<2 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}}
@@ -1235,8 +1234,7 @@ __m128i test_mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B) {
 }
 __m128i test_mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_max_epi8
-  // CHECK:       [[CMP:%.*]] = icmp sgt <16 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
+  // CHECK: [[RES:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i8>]] [[RES]] to [[DSTTY:<2 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}}
@@ -1244,8 +1242,7 @@ __m128i test_mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i _
 }
 __m256i test_mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_max_epi8
-  // CHECK:       [[CMP:%.*]] = icmp sgt <32 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  // CHECK: [[RES:%.*]] = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<32 x i8>]] [[RES]] to [[DSTTY:<4 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}}
@@ -1253,8 +1250,7 @@ __m256i test_mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B) {
 }
 __m256i test_mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_max_epi8
-  // CHECK:       [[CMP:%.*]] = icmp sgt <32 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  // CHECK: [[RES:%.*]] = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<32 x i8>]] [[RES]] to [[DSTTY:<4 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}}
@@ -1262,8 +1258,7 @@ __m256i test_mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256
 }
 __m128i test_mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_max_epi16
-  // CHECK:       [[CMP:%.*]] = icmp sgt <8 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
+  // CHECK: [[RES:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<8 x i16>]] [[RES]] to [[DSTTY:<2 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}}
@@ -1271,8 +1266,7 @@ __m128i test_mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B) {
 }
 __m128i test_mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_max_epi16
-  // CHECK:       [[CMP:%.*]] = icmp sgt <8 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
+  // CHECK: [[RES:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<8 x i16>]] [[RES]] to [[DSTTY:<2 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}}
@@ -1280,8 +1274,7 @@ __m128i test_mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i _
 }
 __m256i test_mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_max_epi16
-  // CHECK:       [[CMP:%.*]] = icmp sgt <16 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  // CHECK: [[RES:%.*]] = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i16>]] [[RES]] to [[DSTTY:<4 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}}
@@ -1289,8 +1282,7 @@ __m256i test_mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B) {
 }
 __m256i test_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_max_epi16
-  // CHECK:       [[CMP:%.*]] = icmp sgt <16 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  // CHECK: [[RES:%.*]] = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i16>]] [[RES]] to [[DSTTY:<4 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}}
@@ -1298,8 +1290,7 @@ __m256i test_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m25
 }
 __m128i test_mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_max_epu8
-  // CHECK:       [[CMP:%.*]] = icmp ugt <16 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
+  // CHECK: [[RES:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i8>]] [[RES]] to [[DSTTY:<2 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}}
@@ -1307,8 +1298,7 @@ __m128i test_mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B) {
 }
 __m128i test_mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_max_epu8
-  // CHECK:       [[CMP:%.*]] = icmp ugt <16 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
+  // CHECK: [[RES:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i8>]] [[RES]] to [[DSTTY:<2 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}}
@@ -1316,8 +1306,7 @@ __m128i test_mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i _
 }
 __m256i test_mm256_maskz_max_epu8(__mmask32 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_max_epu8
-  // CHECK:       [[CMP:%.*]] = icmp ugt <32 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  // CHECK: [[RES:%.*]] = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<32 x i8>]] [[RES]] to [[DSTTY:<4 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}}
@@ -1325,8 +1314,7 @@ __m256i test_mm256_maskz_max_epu8(__mmask32 __M, __m256i __A, __m256i __B) {
 }
 __m256i test_mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_max_epu8
-  // CHECK:       [[CMP:%.*]] = icmp ugt <32 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  // CHECK: [[RES:%.*]] = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<32 x i8>]] [[RES]] to [[DSTTY:<4 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}}
@@ -1334,8 +1322,7 @@ __m256i test_mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256
 }
 __m128i test_mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_max_epu16
-  // CHECK:       [[CMP:%.*]] = icmp ugt <8 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
+  // CHECK: [[RES:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<8 x i16>]] [[RES]] to [[DSTTY:<2 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}}
@@ -1343,8 +1330,7 @@ __m128i test_mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B) {
 }
 __m128i test_mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_max_epu16
-  // CHECK:       [[CMP:%.*]] = icmp ugt <8 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
+  // CHECK: [[RES:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<8 x i16>]] [[RES]] to [[DSTTY:<2 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}}
@@ -1352,8 +1338,7 @@ __m128i test_mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i _
 }
 __m256i test_mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_max_epu16
-  // CHECK:       [[CMP:%.*]] = icmp ugt <16 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  // CHECK: [[RES:%.*]] = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i16>]] [[RES]] to [[DSTTY:<4 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}}
@@ -1361,8 +1346,7 @@ __m256i test_mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B) {
 }
 __m256i test_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_max_epu16
-  // CHECK:       [[CMP:%.*]] = icmp ugt <16 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  // CHECK: [[RES:%.*]] = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i16>]] [[RES]] to [[DSTTY:<4 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}}
@@ -1370,8 +1354,7 @@ __m256i test_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m25
 }
 __m128i test_mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_min_epi8
-  // CHECK:       [[CMP:%.*]] = icmp slt <16 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
+  // CHECK: [[RES:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i8>]] [[RES]] to [[DSTTY:<2 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}}
@@ -1379,8 +1362,7 @@ __m128i test_mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B) {
 }
 __m128i test_mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_min_epi8
-  // CHECK:       [[CMP:%.*]] = icmp slt <16 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
+  // CHECK: [[RES:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i8>]] [[RES]] to [[DSTTY:<2 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}}
@@ -1388,8 +1370,7 @@ __m128i test_mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i _
 }
 __m256i test_mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_min_epi8
-  // CHECK:       [[CMP:%.*]] = icmp slt <32 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  // CHECK: [[RES:%.*]] = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<32 x i8>]] [[RES]] to [[DSTTY:<4 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}}
@@ -1397,8 +1378,7 @@ __m256i test_mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B) {
 }
 __m256i test_mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_min_epi8
-  // CHECK:       [[CMP:%.*]] = icmp slt <32 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  // CHECK: [[RES:%.*]] = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<32 x i8>]] [[RES]] to [[DSTTY:<4 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}}
@@ -1406,8 +1386,7 @@ __m256i test_mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256
 }
 __m128i test_mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_min_epi16
-  // CHECK:       [[CMP:%.*]] = icmp slt <8 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
+  // CHECK: [[RES:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<8 x i16>]] [[RES]] to [[DSTTY:<2 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}}
@@ -1415,8 +1394,7 @@ __m128i test_mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B) {
 }
 __m128i test_mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_min_epi16
-  // CHECK:       [[CMP:%.*]] = icmp slt <8 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
+  // CHECK: [[RES:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<8 x i16>]] [[RES]] to [[DSTTY:<2 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}}
@@ -1424,8 +1402,7 @@ __m128i test_mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i _
 }
 __m256i test_mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_min_epi16
-  // CHECK:       [[CMP:%.*]] = icmp slt <16 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  // CHECK: [[RES:%.*]] = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i16>]] [[RES]] to [[DSTTY:<4 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}}
@@ -1433,8 +1410,7 @@ __m256i test_mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B) {
 }
 __m256i test_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_min_epi16
-  // CHECK:       [[CMP:%.*]] = icmp slt <16 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  // CHECK: [[RES:%.*]] = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i16>]] [[RES]] to [[DSTTY:<4 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}}
@@ -1442,8 +1418,7 @@ __m256i test_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m25
 }
 __m128i test_mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_min_epu8
-  // CHECK:       [[CMP:%.*]] = icmp ult <16 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
+  // CHECK: [[RES:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i8>]] [[RES]] to [[DSTTY:<2 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}}
@@ -1451,8 +1426,7 @@ __m128i test_mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B) {
 }
 __m128i test_mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_min_epu8
-  // CHECK:       [[CMP:%.*]] = icmp ult <16 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
+  // CHECK: [[RES:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i8>]] [[RES]] to [[DSTTY:<2 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}}
@@ -1460,8 +1434,7 @@ __m128i test_mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i _
 }
 __m256i test_mm256_maskz_min_epu8(__mmask32 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_min_epu8
-  // CHECK:       [[CMP:%.*]] = icmp ult <32 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  // CHECK: [[RES:%.*]] = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<32 x i8>]] [[RES]] to [[DSTTY:<4 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}}
@@ -1469,8 +1442,7 @@ __m256i test_mm256_maskz_min_epu8(__mmask32 __M, __m256i __A, __m256i __B) {
 }
 __m256i test_mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_min_epu8
-  // CHECK:       [[CMP:%.*]] = icmp ult <32 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  // CHECK: [[RES:%.*]] = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<32 x i8>]] [[RES]] to [[DSTTY:<4 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}}
@@ -1478,8 +1450,7 @@ __m256i test_mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256
 }
 __m128i test_mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_min_epu16
-  // CHECK:       [[CMP:%.*]] = icmp ult <8 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
+  // CHECK: [[RES:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<8 x i16>]] [[RES]] to [[DSTTY:<2 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}}
@@ -1487,8 +1458,7 @@ __m128i test_mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B) {
 }
 __m128i test_mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_min_epu16
-  // CHECK:       [[CMP:%.*]] = icmp ult <8 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
+  // CHECK: [[RES:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<8 x i16>]] [[RES]] to [[DSTTY:<2 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}}
@@ -1496,8 +1466,7 @@ __m128i test_mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i _
 }
 __m256i test_mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_min_epu16
-  // CHECK:       [[CMP:%.*]] = icmp ult <16 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  // CHECK: [[RES:%.*]] = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i16>]] [[RES]] to [[DSTTY:<4 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}}
@@ -1505,8 +1474,7 @@ __m256i test_mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B) {
 }
 __m256i test_mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_min_epu16
-  // CHECK:       [[CMP:%.*]] = icmp ult <16 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  // CHECK: [[RES:%.*]] = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i16>]] [[RES]] to [[DSTTY:<4 x i64>]]
   // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]]
   // CHECK:       select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}}
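
Note: in the masked byte/word tests above, the result is additionally round-tripped through the <N x i64> storage type before the mask select, and the FileCheck variables [[SRCTY:...]]/[[DSTTY:...]] capture those vector types on first match and require the identical strings on reuse. Spelled out for one case (illustrative IR mirroring the checks, not taken from the patch):

  %res  = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %a, <16 x i8> %b)
  %tmp  = bitcast <16 x i8> %res to <2 x i64>   ; SRCTY -> DSTTY
  %res2 = bitcast <2 x i64> %tmp to <16 x i8>   ; DSTTY -> SRCTY
  %out  = select <16 x i1> %mask, <16 x i8> %res2, <16 x i8> %w
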
diff --git a/clang/test/CodeGen/avx512vlcd-builtins.c b/clang/test/CodeGen/X86/avx512vlcd-builtins.c
similarity index 100%
rename from clang/test/CodeGen/avx512vlcd-builtins.c
rename to clang/test/CodeGen/X86/avx512vlcd-builtins.c
diff --git a/clang/test/CodeGen/avx512vldq-builtins.c b/clang/test/CodeGen/X86/avx512vldq-builtins.c
similarity index 100%
rename from clang/test/CodeGen/avx512vldq-builtins.c
rename to clang/test/CodeGen/X86/avx512vldq-builtins.c
diff --git a/clang/test/CodeGen/avx512vlvbmi2-builtins.c b/clang/test/CodeGen/X86/avx512vlvbmi2-builtins.c
similarity index 100%
rename from clang/test/CodeGen/avx512vlvbmi2-builtins.c
rename to clang/test/CodeGen/X86/avx512vlvbmi2-builtins.c
diff --git a/clang/test/CodeGen/avx512vlvnni-builtins.c b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c
similarity index 100%
rename from clang/test/CodeGen/avx512vlvnni-builtins.c
rename to clang/test/CodeGen/X86/avx512vlvnni-builtins.c
diff --git a/clang/test/CodeGen/avx512vnni-builtins.c b/clang/test/CodeGen/X86/avx512vnni-builtins.c
similarity index 100%
rename from clang/test/CodeGen/avx512vnni-builtins.c
rename to clang/test/CodeGen/X86/avx512vnni-builtins.c
diff --git a/clang/test/CodeGen/avx512vpopcntdqintrin.c b/clang/test/CodeGen/X86/avx512vpopcntdqintrin.c
similarity index 100%
rename from clang/test/CodeGen/avx512vpopcntdqintrin.c
rename to clang/test/CodeGen/X86/avx512vpopcntdqintrin.c
diff --git a/clang/test/CodeGen/avx512vpopcntdqvlintrin.c b/clang/test/CodeGen/X86/avx512vpopcntdqvlintrin.c
similarity index 100%
rename from clang/test/CodeGen/avx512vpopcntdqvlintrin.c
rename to clang/test/CodeGen/X86/avx512vpopcntdqvlintrin.c
diff --git a/clang/test/CodeGen/bitscan-builtins.c b/clang/test/CodeGen/X86/bitscan-builtins.c
similarity index 100%
rename from clang/test/CodeGen/bitscan-builtins.c
rename to clang/test/CodeGen/X86/bitscan-builtins.c
diff --git a/clang/test/CodeGen/bmi-builtins.c b/clang/test/CodeGen/X86/bmi-builtins.c
similarity index 100%
rename from clang/test/CodeGen/bmi-builtins.c
rename to clang/test/CodeGen/X86/bmi-builtins.c
diff --git a/clang/test/CodeGen/bmi2-builtins.c b/clang/test/CodeGen/X86/bmi2-builtins.c
similarity index 100%
rename from clang/test/CodeGen/bmi2-builtins.c
rename to clang/test/CodeGen/X86/bmi2-builtins.c
diff --git a/clang/test/CodeGen/builtin-clflushopt.c b/clang/test/CodeGen/X86/builtin-clflushopt.c
similarity index 100%
rename from clang/test/CodeGen/builtin-clflushopt.c
rename to clang/test/CodeGen/X86/builtin-clflushopt.c
diff --git a/clang/test/CodeGen/builtin-clwb.c b/clang/test/CodeGen/X86/builtin-clwb.c
similarity index 100%
rename from clang/test/CodeGen/builtin-clwb.c
rename to clang/test/CodeGen/X86/builtin-clwb.c
diff --git a/clang/test/CodeGen/builtin-clzero.c b/clang/test/CodeGen/X86/builtin-clzero.c
similarity index 100%
rename from clang/test/CodeGen/builtin-clzero.c
rename to clang/test/CodeGen/X86/builtin-clzero.c
diff --git a/clang/test/CodeGen/builtin-movdir.c b/clang/test/CodeGen/X86/builtin-movdir.c
similarity index 100%
rename from clang/test/CodeGen/builtin-movdir.c
rename to clang/test/CodeGen/X86/builtin-movdir.c
diff --git a/clang/test/CodeGen/builtin-wbinvd.c b/clang/test/CodeGen/X86/builtin-wbinvd.c
similarity index 100%
rename from clang/test/CodeGen/builtin-wbinvd.c
rename to clang/test/CodeGen/X86/builtin-wbinvd.c
diff --git a/clang/test/CodeGen/builtin-wbnoinvd.c b/clang/test/CodeGen/X86/builtin-wbnoinvd.c
similarity index 100%
rename from clang/test/CodeGen/builtin-wbnoinvd.c
rename to clang/test/CodeGen/X86/builtin-wbnoinvd.c
diff --git a/clang/test/CodeGen/cetintrin.c b/clang/test/CodeGen/X86/cetintrin.c
similarity index 100%
rename from clang/test/CodeGen/cetintrin.c
rename to clang/test/CodeGen/X86/cetintrin.c
diff --git a/clang/test/CodeGen/cldemote.c b/clang/test/CodeGen/X86/cldemote.c
similarity index 100%
rename from clang/test/CodeGen/cldemote.c
rename to clang/test/CodeGen/X86/cldemote.c
diff --git a/clang/test/CodeGen/f16c-builtins-constrained.c b/clang/test/CodeGen/X86/f16c-builtins-constrained.c
similarity index 100%
rename from clang/test/CodeGen/f16c-builtins-constrained.c
rename to clang/test/CodeGen/X86/f16c-builtins-constrained.c
diff --git a/clang/test/CodeGen/f16c-builtins.c b/clang/test/CodeGen/X86/f16c-builtins.c
similarity index 100%
rename from clang/test/CodeGen/f16c-builtins.c
rename to clang/test/CodeGen/X86/f16c-builtins.c
diff --git a/clang/test/CodeGen/fma-builtins-constrained.c b/clang/test/CodeGen/X86/fma-builtins-constrained.c
similarity index 100%
rename from clang/test/CodeGen/fma-builtins-constrained.c
rename to clang/test/CodeGen/X86/fma-builtins-constrained.c
diff --git a/clang/test/CodeGen/fma-builtins.c b/clang/test/CodeGen/X86/fma-builtins.c
similarity index 100%
rename from clang/test/CodeGen/fma-builtins.c
rename to clang/test/CodeGen/X86/fma-builtins.c
diff --git a/clang/test/CodeGen/fma4-builtins.c b/clang/test/CodeGen/X86/fma4-builtins.c
similarity index 100%
rename from clang/test/CodeGen/fma4-builtins.c
rename to clang/test/CodeGen/X86/fma4-builtins.c
diff --git a/clang/test/CodeGen/fsgsbase-builtins.c b/clang/test/CodeGen/X86/fsgsbase-builtins.c
similarity index 100%
rename from clang/test/CodeGen/fsgsbase-builtins.c
rename to clang/test/CodeGen/X86/fsgsbase-builtins.c
diff --git a/clang/test/CodeGen/gfni-builtins.c b/clang/test/CodeGen/X86/gfni-builtins.c
similarity index 100%
rename from clang/test/CodeGen/gfni-builtins.c
rename to clang/test/CodeGen/X86/gfni-builtins.c
diff --git a/clang/test/CodeGen/intel-avx512vlvp2intersect.c b/clang/test/CodeGen/X86/intel-avx512vlvp2intersect.c
similarity index 100%
rename from clang/test/CodeGen/intel-avx512vlvp2intersect.c
rename to clang/test/CodeGen/X86/intel-avx512vlvp2intersect.c
diff --git a/clang/test/CodeGen/intel-avx512vp2intersect.c b/clang/test/CodeGen/X86/intel-avx512vp2intersect.c
similarity index 100%
rename from clang/test/CodeGen/intel-avx512vp2intersect.c
rename to clang/test/CodeGen/X86/intel-avx512vp2intersect.c
diff --git a/clang/test/CodeGen/invpcid.c b/clang/test/CodeGen/X86/invpcid.c
similarity index 100%
rename from clang/test/CodeGen/invpcid.c
rename to clang/test/CodeGen/X86/invpcid.c
diff --git a/clang/test/CodeGen/lwp-builtins.c b/clang/test/CodeGen/X86/lwp-builtins.c
similarity index 100%
rename from clang/test/CodeGen/lwp-builtins.c
rename to clang/test/CodeGen/X86/lwp-builtins.c
diff --git a/clang/test/CodeGen/lzcnt-builtins.c b/clang/test/CodeGen/X86/lzcnt-builtins.c
similarity index 100%
rename from clang/test/CodeGen/lzcnt-builtins.c
rename to clang/test/CodeGen/X86/lzcnt-builtins.c
diff --git a/clang/test/CodeGen/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
similarity index 100%
rename from clang/test/CodeGen/mmx-builtins.c
rename to clang/test/CodeGen/X86/mmx-builtins.c
diff --git a/clang/test/CodeGen/mmx-inline-asm-error.c b/clang/test/CodeGen/X86/mmx-inline-asm-error.c
similarity index 100%
rename from clang/test/CodeGen/mmx-inline-asm-error.c
rename to clang/test/CodeGen/X86/mmx-inline-asm-error.c
diff --git a/clang/test/CodeGen/mmx-inline-asm.c b/clang/test/CodeGen/X86/mmx-inline-asm.c
similarity index 100%
rename from clang/test/CodeGen/mmx-inline-asm.c
rename to clang/test/CodeGen/X86/mmx-inline-asm.c
diff --git a/clang/test/CodeGen/mmx-shift-with-immediate.c b/clang/test/CodeGen/X86/mmx-shift-with-immediate.c
similarity index 100%
rename from clang/test/CodeGen/mmx-shift-with-immediate.c
rename to clang/test/CodeGen/X86/mmx-shift-with-immediate.c
diff --git a/clang/test/CodeGen/movbe-builtins.c b/clang/test/CodeGen/X86/movbe-builtins.c
similarity index 100%
rename from clang/test/CodeGen/movbe-builtins.c
rename to clang/test/CodeGen/X86/movbe-builtins.c
diff --git a/clang/test/CodeGen/pause.c b/clang/test/CodeGen/X86/pause.c
similarity index 100%
rename from clang/test/CodeGen/pause.c
rename to clang/test/CodeGen/X86/pause.c
diff --git a/clang/test/CodeGen/pclmul-builtins.c b/clang/test/CodeGen/X86/pclmul-builtins.c
similarity index 100%
rename from clang/test/CodeGen/pclmul-builtins.c
rename to clang/test/CodeGen/X86/pclmul-builtins.c
diff --git a/clang/test/CodeGen/pku.c b/clang/test/CodeGen/X86/pku.c
similarity index 100%
rename from clang/test/CodeGen/pku.c
rename to clang/test/CodeGen/X86/pku.c
diff --git a/clang/test/CodeGen/popcnt-builtins.c b/clang/test/CodeGen/X86/popcnt-builtins.c
similarity index 100%
rename from clang/test/CodeGen/popcnt-builtins.c
rename to clang/test/CodeGen/X86/popcnt-builtins.c
diff --git a/clang/test/CodeGen/prefetchw-builtins.c b/clang/test/CodeGen/X86/prefetchw-builtins.c
similarity index 100%
rename from clang/test/CodeGen/prefetchw-builtins.c
rename to clang/test/CodeGen/X86/prefetchw-builtins.c
diff --git a/clang/test/CodeGen/ptwrite.c b/clang/test/CodeGen/X86/ptwrite.c
similarity index 100%
rename from clang/test/CodeGen/ptwrite.c
rename to clang/test/CodeGen/X86/ptwrite.c
diff --git a/clang/test/CodeGen/rd-builtins.c b/clang/test/CodeGen/X86/rd-builtins.c
similarity index 100%
rename from clang/test/CodeGen/rd-builtins.c
rename to clang/test/CodeGen/X86/rd-builtins.c
diff --git a/clang/test/CodeGen/rdpid-builtins.c b/clang/test/CodeGen/X86/rdpid-builtins.c
similarity index 100%
rename from clang/test/CodeGen/rdpid-builtins.c
rename to clang/test/CodeGen/X86/rdpid-builtins.c
diff --git a/clang/test/CodeGen/rdrand-builtins.c b/clang/test/CodeGen/X86/rdrand-builtins.c
similarity index 100%
rename from clang/test/CodeGen/rdrand-builtins.c
rename to clang/test/CodeGen/X86/rdrand-builtins.c
diff --git a/clang/test/CodeGen/rot-intrinsics.c b/clang/test/CodeGen/X86/rot-intrinsics.c
similarity index 100%
rename from clang/test/CodeGen/rot-intrinsics.c
rename to clang/test/CodeGen/X86/rot-intrinsics.c
diff --git a/clang/test/CodeGen/rtm-builtins.c b/clang/test/CodeGen/X86/rtm-builtins.c
similarity index 100%
rename from clang/test/CodeGen/rtm-builtins.c
rename to clang/test/CodeGen/X86/rtm-builtins.c
diff --git a/clang/test/CodeGen/sha-builtins.c b/clang/test/CodeGen/X86/sha-builtins.c
similarity index 100%
rename from clang/test/CodeGen/sha-builtins.c
rename to clang/test/CodeGen/X86/sha-builtins.c
diff --git a/clang/test/CodeGen/sse-builtins-constrained-cmp.c b/clang/test/CodeGen/X86/sse-builtins-constrained-cmp.c
similarity index 100%
rename from clang/test/CodeGen/sse-builtins-constrained-cmp.c
rename to clang/test/CodeGen/X86/sse-builtins-constrained-cmp.c
diff --git a/clang/test/CodeGen/sse-builtins-constrained.c b/clang/test/CodeGen/X86/sse-builtins-constrained.c
similarity index 100%
rename from clang/test/CodeGen/sse-builtins-constrained.c
rename to clang/test/CodeGen/X86/sse-builtins-constrained.c
diff --git a/clang/test/CodeGen/sse-builtins-dbg.c b/clang/test/CodeGen/X86/sse-builtins-dbg.c
similarity index 100%
rename from clang/test/CodeGen/sse-builtins-dbg.c
rename to clang/test/CodeGen/X86/sse-builtins-dbg.c
diff --git a/clang/test/CodeGen/sse-builtins.c b/clang/test/CodeGen/X86/sse-builtins.c
similarity index 100%
rename from clang/test/CodeGen/sse-builtins.c
rename to clang/test/CodeGen/X86/sse-builtins.c
diff --git a/clang/test/CodeGen/sse.c b/clang/test/CodeGen/X86/sse.c
similarity index 100%
rename from clang/test/CodeGen/sse.c
rename to clang/test/CodeGen/X86/sse.c
diff --git a/clang/test/CodeGen/sse2-builtins-constrained-cmp.c b/clang/test/CodeGen/X86/sse2-builtins-constrained-cmp.c
similarity index 100%
rename from clang/test/CodeGen/sse2-builtins-constrained-cmp.c
rename to clang/test/CodeGen/X86/sse2-builtins-constrained-cmp.c
diff --git a/clang/test/CodeGen/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
similarity index 99%
rename from clang/test/CodeGen/sse2-builtins.c
rename to clang/test/CodeGen/X86/sse2-builtins.c
index 34e3baef84c32..180677de03314 100644
--- a/clang/test/CodeGen/sse2-builtins.c
+++ b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -752,15 +752,13 @@ void test_mm_maskmoveu_si128(__m128i A, __m128i B, char* C) {
 
 __m128i test_mm_max_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_max_epi16
-  // CHECK:       [[CMP:%.*]] = icmp sgt <8 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
+  // CHECK: call <8 x i16> @llvm.smax.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_max_epi16(A, B);
 }
 
 __m128i test_mm_max_epu8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_max_epu8
-  // CHECK:       [[CMP:%.*]] = icmp ugt <16 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
+  // CHECK: call <16 x i8> @llvm.umax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_max_epu8(A, B);
 }
 
@@ -784,15 +782,13 @@ void test_mm_mfence() {
 
 __m128i test_mm_min_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_min_epi16
-  // CHECK:       [[CMP:%.*]] = icmp slt <8 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
+  // CHECK: call <8 x i16> @llvm.smin.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_min_epi16(A, B);
 }
 
 __m128i test_mm_min_epu8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_min_epu8
-  // CHECK:       [[CMP:%.*]] = icmp ult <16 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
+  // CHECK: call <16 x i8> @llvm.umin.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_min_epu8(A, B);
 }
 
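Note: the unmasked tests reduce to a label plus a single check line. CHECK-LABEL pins matching to one function body, and %{{.*}} is an inline regex placeholder for any SSA value, which keeps the checks robust against register renumbering. A minimal test in the same style (illustrative only; test_example is not a function from the patch):

  __m128i test_example(__m128i A, __m128i B) {
    // CHECK-LABEL: test_example
    // CHECK: call <8 x i16> @llvm.smax.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
    return _mm_max_epi16(A, B);
  }
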
diff --git a/clang/test/CodeGen/sse3-builtins.c b/clang/test/CodeGen/X86/sse3-builtins.c
similarity index 100%
rename from clang/test/CodeGen/sse3-builtins.c
rename to clang/test/CodeGen/X86/sse3-builtins.c
diff --git a/clang/test/CodeGen/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
similarity index 91%
rename from clang/test/CodeGen/sse41-builtins.c
rename to clang/test/CodeGen/X86/sse41-builtins.c
index 5f623ce9c38fd..1e38e3c3355a9 100644
--- a/clang/test/CodeGen/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -248,57 +248,49 @@ __m128 test_mm_insert_ps(__m128 x, __m128 y) {
 
 __m128i test_mm_max_epi8(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_max_epi8
-  // CHECK:       [[CMP:%.*]] = icmp sgt <16 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
+  // CHECK: call <16 x i8> @llvm.smax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_max_epi8(x, y);
 }
 
 __m128i test_mm_max_epi32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_max_epi32
-  // CHECK:       [[CMP:%.*]] = icmp sgt <4 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
+  // CHECK: call <4 x i32> @llvm.smax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_max_epi32(x, y);
 }
 
 __m128i test_mm_max_epu16(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_max_epu16
-  // CHECK:       [[CMP:%.*]] = icmp ugt <8 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
+  // CHECK: call <8 x i16> @llvm.umax.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_max_epu16(x, y);
 }
 
 __m128i test_mm_max_epu32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_max_epu32
-  // CHECK:       [[CMP:%.*]] = icmp ugt <4 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
+  // CHECK: call <4 x i32> @llvm.umax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_max_epu32(x, y);
 }
 
 __m128i test_mm_min_epi8(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_min_epi8
-  // CHECK:       [[CMP:%.*]] = icmp slt <16 x i8> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
+  // CHECK: call <16 x i8> @llvm.smin.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_min_epi8(x, y);
 }
 
 __m128i test_mm_min_epi32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_min_epi32
-  // CHECK:       [[CMP:%.*]] = icmp slt <4 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
+  // CHECK: call <4 x i32> @llvm.smin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_min_epi32(x, y);
 }
 
 __m128i test_mm_min_epu16(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_min_epu16
-  // CHECK:       [[CMP:%.*]] = icmp ult <8 x i16> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
+  // CHECK: call <8 x i16> @llvm.umin.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_min_epu16(x, y);
 }
 
 __m128i test_mm_min_epu32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_min_epu32
-  // CHECK:       [[CMP:%.*]] = icmp ult <4 x i32> [[X:%.*]], [[Y:%.*]]
-  // CHECK-NEXT:  select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
+  // CHECK: call <4 x i32> @llvm.umin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_min_epu32(x, y);
 }
 
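Note: the sse41 hunk covers all four sign/width combinations, and the intrinsic name in each check is fully determined by the builtin: epi* selects @llvm.smin/@llvm.smax, epu* selects @llvm.umin/@llvm.umax, and the vNiM suffix encodes element count and width. Two instances of the mapping (sketch matching the checks above; value names are illustrative):

  ; _mm_min_epu32 -> unsigned min over four 32-bit lanes
  %r0 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %x, <4 x i32> %y)
  ; _mm_max_epi8  -> signed max over sixteen 8-bit lanes
  %r1 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %x, <16 x i8> %y)
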
diff --git a/clang/test/CodeGen/sse42-builtins.c b/clang/test/CodeGen/X86/sse42-builtins.c
similarity index 100%
rename from clang/test/CodeGen/sse42-builtins.c
rename to clang/test/CodeGen/X86/sse42-builtins.c
diff --git a/clang/test/CodeGen/sse4a-builtins.c b/clang/test/CodeGen/X86/sse4a-builtins.c
similarity index 100%
rename from clang/test/CodeGen/sse4a-builtins.c
rename to clang/test/CodeGen/X86/sse4a-builtins.c
diff --git a/clang/test/CodeGen/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c
similarity index 100%
rename from clang/test/CodeGen/ssse3-builtins.c
rename to clang/test/CodeGen/X86/ssse3-builtins.c
diff --git a/clang/test/CodeGen/tbm-builtins.c b/clang/test/CodeGen/X86/tbm-builtins.c
similarity index 100%
rename from clang/test/CodeGen/tbm-builtins.c
rename to clang/test/CodeGen/X86/tbm-builtins.c
diff --git a/clang/test/CodeGen/vaes-builtins.c b/clang/test/CodeGen/X86/vaes-builtins.c
similarity index 100%
rename from clang/test/CodeGen/vaes-builtins.c
rename to clang/test/CodeGen/X86/vaes-builtins.c
diff --git a/clang/test/CodeGen/vpclmulqdq-builtins.c b/clang/test/CodeGen/X86/vpclmulqdq-builtins.c
similarity index 100%
rename from clang/test/CodeGen/vpclmulqdq-builtins.c
rename to clang/test/CodeGen/X86/vpclmulqdq-builtins.c
diff --git a/clang/test/CodeGen/waitpkg.c b/clang/test/CodeGen/X86/waitpkg.c
similarity index 100%
rename from clang/test/CodeGen/waitpkg.c
rename to clang/test/CodeGen/X86/waitpkg.c
diff --git a/clang/test/CodeGen/x86-64-inline-asm.c b/clang/test/CodeGen/X86/x86-64-inline-asm.c
similarity index 100%
rename from clang/test/CodeGen/x86-64-inline-asm.c
rename to clang/test/CodeGen/X86/x86-64-inline-asm.c
diff --git a/clang/test/CodeGen/x86-GCC-inline-asm-Y-constraints.c b/clang/test/CodeGen/X86/x86-GCC-inline-asm-Y-constraints.c
similarity index 100%
rename from clang/test/CodeGen/x86-GCC-inline-asm-Y-constraints.c
rename to clang/test/CodeGen/X86/x86-GCC-inline-asm-Y-constraints.c
diff --git a/clang/test/CodeGen/x86-atomic-long_double.c b/clang/test/CodeGen/X86/x86-atomic-long_double.c
similarity index 100%
rename from clang/test/CodeGen/x86-atomic-long_double.c
rename to clang/test/CodeGen/X86/x86-atomic-long_double.c
diff --git a/clang/test/CodeGen/x86-bswap.c b/clang/test/CodeGen/X86/x86-bswap.c
similarity index 100%
rename from clang/test/CodeGen/x86-bswap.c
rename to clang/test/CodeGen/X86/x86-bswap.c
diff --git a/clang/test/CodeGen/x86-builtins-vector-width.c b/clang/test/CodeGen/X86/x86-builtins-vector-width.c
similarity index 100%
rename from clang/test/CodeGen/x86-builtins-vector-width.c
rename to clang/test/CodeGen/X86/x86-builtins-vector-width.c
diff --git a/clang/test/CodeGen/x86-builtins.c b/clang/test/CodeGen/X86/x86-builtins.c
similarity index 100%
rename from clang/test/CodeGen/x86-builtins.c
rename to clang/test/CodeGen/X86/x86-builtins.c
diff --git a/clang/test/CodeGen/x86-cf-protection.c b/clang/test/CodeGen/X86/x86-cf-protection.c
similarity index 100%
rename from clang/test/CodeGen/x86-cf-protection.c
rename to clang/test/CodeGen/X86/x86-cf-protection.c
diff --git a/clang/test/CodeGen/x86-crc-builtins.c b/clang/test/CodeGen/X86/x86-crc-builtins.c
similarity index 100%
rename from clang/test/CodeGen/x86-crc-builtins.c
rename to clang/test/CodeGen/X86/x86-crc-builtins.c
diff --git a/clang/test/CodeGen/x86-enqcmd-builtins.c b/clang/test/CodeGen/X86/x86-enqcmd-builtins.c
similarity index 100%
rename from clang/test/CodeGen/x86-enqcmd-builtins.c
rename to clang/test/CodeGen/X86/x86-enqcmd-builtins.c
diff --git a/clang/test/CodeGen/x86-inline-asm-min-vector-width.c b/clang/test/CodeGen/X86/x86-inline-asm-min-vector-width.c
similarity index 100%
rename from clang/test/CodeGen/x86-inline-asm-min-vector-width.c
rename to clang/test/CodeGen/X86/x86-inline-asm-min-vector-width.c
diff --git a/clang/test/CodeGen/x86-inline-asm-v-constraint.c b/clang/test/CodeGen/X86/x86-inline-asm-v-constraint.c
similarity index 100%
rename from clang/test/CodeGen/x86-inline-asm-v-constraint.c
rename to clang/test/CodeGen/X86/x86-inline-asm-v-constraint.c
diff --git a/clang/test/CodeGen/x86-long-double.cpp b/clang/test/CodeGen/X86/x86-long-double.cpp
similarity index 100%
rename from clang/test/CodeGen/x86-long-double.cpp
rename to clang/test/CodeGen/X86/x86-long-double.cpp
diff --git a/clang/test/CodeGen/x86-nontemporal.c b/clang/test/CodeGen/X86/x86-nontemporal.c
similarity index 100%
rename from clang/test/CodeGen/x86-nontemporal.c
rename to clang/test/CodeGen/X86/x86-nontemporal.c
diff --git a/clang/test/CodeGen/x86-serialize-intrin.c b/clang/test/CodeGen/X86/x86-serialize-intrin.c
similarity index 100%
rename from clang/test/CodeGen/x86-serialize-intrin.c
rename to clang/test/CodeGen/X86/x86-serialize-intrin.c
diff --git a/clang/test/CodeGen/x86-soft-float.c b/clang/test/CodeGen/X86/x86-soft-float.c
similarity index 100%
rename from clang/test/CodeGen/x86-soft-float.c
rename to clang/test/CodeGen/X86/x86-soft-float.c
diff --git a/clang/test/CodeGen/x86-tsxldtrk-builtins.c b/clang/test/CodeGen/X86/x86-tsxldtrk-builtins.c
similarity index 100%
rename from clang/test/CodeGen/x86-tsxldtrk-builtins.c
rename to clang/test/CodeGen/X86/x86-tsxldtrk-builtins.c
diff --git a/clang/test/CodeGen/x86-vec-i128.c b/clang/test/CodeGen/X86/x86-vec-i128.c
similarity index 100%
rename from clang/test/CodeGen/x86-vec-i128.c
rename to clang/test/CodeGen/X86/x86-vec-i128.c
diff --git a/clang/test/CodeGen/x86-vec-struct-packing.c b/clang/test/CodeGen/X86/x86-vec-struct-packing.c
similarity index 100%
rename from clang/test/CodeGen/x86-vec-struct-packing.c
rename to clang/test/CodeGen/X86/x86-vec-struct-packing.c
diff --git a/clang/test/CodeGen/x86-vector-width.c b/clang/test/CodeGen/X86/x86-vector-width.c
similarity index 100%
rename from clang/test/CodeGen/x86-vector-width.c
rename to clang/test/CodeGen/X86/x86-vector-width.c
diff --git a/clang/test/CodeGen/x86.c b/clang/test/CodeGen/X86/x86.c
similarity index 100%
rename from clang/test/CodeGen/x86.c
rename to clang/test/CodeGen/X86/x86.c
diff --git a/clang/test/CodeGen/x86_32-arguments-darwin.c b/clang/test/CodeGen/X86/x86_32-arguments-darwin.c
similarity index 100%
rename from clang/test/CodeGen/x86_32-arguments-darwin.c
rename to clang/test/CodeGen/X86/x86_32-arguments-darwin.c
diff --git a/clang/test/CodeGen/x86_32-arguments-iamcu.c b/clang/test/CodeGen/X86/x86_32-arguments-iamcu.c
similarity index 100%
rename from clang/test/CodeGen/x86_32-arguments-iamcu.c
rename to clang/test/CodeGen/X86/x86_32-arguments-iamcu.c
diff --git a/clang/test/CodeGen/x86_32-arguments-linux.c b/clang/test/CodeGen/X86/x86_32-arguments-linux.c
similarity index 100%
rename from clang/test/CodeGen/x86_32-arguments-linux.c
rename to clang/test/CodeGen/X86/x86_32-arguments-linux.c
diff --git a/clang/test/CodeGen/x86_32-arguments-nommx.c b/clang/test/CodeGen/X86/x86_32-arguments-nommx.c
similarity index 100%
rename from clang/test/CodeGen/x86_32-arguments-nommx.c
rename to clang/test/CodeGen/X86/x86_32-arguments-nommx.c
diff --git a/clang/test/CodeGen/x86_32-arguments-realign.c b/clang/test/CodeGen/X86/x86_32-arguments-realign.c
similarity index 100%
rename from clang/test/CodeGen/x86_32-arguments-realign.c
rename to clang/test/CodeGen/X86/x86_32-arguments-realign.c
diff --git a/clang/test/CodeGen/x86_32-arguments-win32.c b/clang/test/CodeGen/X86/x86_32-arguments-win32.c
similarity index 100%
rename from clang/test/CodeGen/x86_32-arguments-win32.c
rename to clang/test/CodeGen/X86/x86_32-arguments-win32.c
diff --git a/clang/test/CodeGen/x86_32-fpcc-struct-return.c b/clang/test/CodeGen/X86/x86_32-fpcc-struct-return.c
similarity index 100%
rename from clang/test/CodeGen/x86_32-fpcc-struct-return.c
rename to clang/test/CodeGen/X86/x86_32-fpcc-struct-return.c
diff --git a/clang/test/CodeGen/x86_32-inline-asm.c b/clang/test/CodeGen/X86/x86_32-inline-asm.c
similarity index 100%
rename from clang/test/CodeGen/x86_32-inline-asm.c
rename to clang/test/CodeGen/X86/x86_32-inline-asm.c
diff --git a/clang/test/CodeGen/x86_32-xsave.c b/clang/test/CodeGen/X86/x86_32-xsave.c
similarity index 100%
rename from clang/test/CodeGen/x86_32-xsave.c
rename to clang/test/CodeGen/X86/x86_32-xsave.c
diff --git a/clang/test/CodeGen/x86_64-PR42672.c b/clang/test/CodeGen/X86/x86_64-PR42672.c
similarity index 100%
rename from clang/test/CodeGen/x86_64-PR42672.c
rename to clang/test/CodeGen/X86/x86_64-PR42672.c
diff --git a/clang/test/CodeGen/x86_64-arguments-darwin.c b/clang/test/CodeGen/X86/x86_64-arguments-darwin.c
similarity index 100%
rename from clang/test/CodeGen/x86_64-arguments-darwin.c
rename to clang/test/CodeGen/X86/x86_64-arguments-darwin.c
diff --git a/clang/test/CodeGen/x86_64-arguments-nacl.c b/clang/test/CodeGen/X86/x86_64-arguments-nacl.c
similarity index 100%
rename from clang/test/CodeGen/x86_64-arguments-nacl.c
rename to clang/test/CodeGen/X86/x86_64-arguments-nacl.c
diff --git a/clang/test/CodeGen/x86_64-arguments-win32.c b/clang/test/CodeGen/X86/x86_64-arguments-win32.c
similarity index 100%
rename from clang/test/CodeGen/x86_64-arguments-win32.c
rename to clang/test/CodeGen/X86/x86_64-arguments-win32.c
diff --git a/clang/test/CodeGen/x86_64-arguments.c b/clang/test/CodeGen/X86/x86_64-arguments.c
similarity index 100%
rename from clang/test/CodeGen/x86_64-arguments.c
rename to clang/test/CodeGen/X86/x86_64-arguments.c
diff --git a/clang/test/CodeGen/x86_64-atomic-128.c b/clang/test/CodeGen/X86/x86_64-atomic-128.c
similarity index 100%
rename from clang/test/CodeGen/x86_64-atomic-128.c
rename to clang/test/CodeGen/X86/x86_64-atomic-128.c
diff --git a/clang/test/CodeGen/x86_64-floatvectors.c b/clang/test/CodeGen/X86/x86_64-floatvectors.c
similarity index 100%
rename from clang/test/CodeGen/x86_64-floatvectors.c
rename to clang/test/CodeGen/X86/x86_64-floatvectors.c
diff --git a/clang/test/CodeGen/x86_64-instrument-functions.c b/clang/test/CodeGen/X86/x86_64-instrument-functions.c
similarity index 100%
rename from clang/test/CodeGen/x86_64-instrument-functions.c
rename to clang/test/CodeGen/X86/x86_64-instrument-functions.c
diff --git a/clang/test/CodeGen/x86_64-longdouble.c b/clang/test/CodeGen/X86/x86_64-longdouble.c
similarity index 100%
rename from clang/test/CodeGen/x86_64-longdouble.c
rename to clang/test/CodeGen/X86/x86_64-longdouble.c
diff --git a/clang/test/CodeGen/x86_64-mno-sse.c b/clang/test/CodeGen/X86/x86_64-mno-sse.c
similarity index 100%
rename from clang/test/CodeGen/x86_64-mno-sse.c
rename to clang/test/CodeGen/X86/x86_64-mno-sse.c
diff --git a/clang/test/CodeGen/x86_64-mno-sse2.c b/clang/test/CodeGen/X86/x86_64-mno-sse2.c
similarity index 100%
rename from clang/test/CodeGen/x86_64-mno-sse2.c
rename to clang/test/CodeGen/X86/x86_64-mno-sse2.c
diff --git a/clang/test/CodeGen/x86_64-profiling-keep-fp.c b/clang/test/CodeGen/X86/x86_64-profiling-keep-fp.c
similarity index 100%
rename from clang/test/CodeGen/x86_64-profiling-keep-fp.c
rename to clang/test/CodeGen/X86/x86_64-profiling-keep-fp.c
diff --git a/clang/test/CodeGen/x86_64-xsave.c b/clang/test/CodeGen/X86/x86_64-xsave.c
similarity index 100%
rename from clang/test/CodeGen/x86_64-xsave.c
rename to clang/test/CodeGen/X86/x86_64-xsave.c
diff --git a/clang/test/CodeGen/x86_inlineasm_curly_bracket_escape.c b/clang/test/CodeGen/X86/x86_inlineasm_curly_bracket_escape.c
similarity index 100%
rename from clang/test/CodeGen/x86_inlineasm_curly_bracket_escape.c
rename to clang/test/CodeGen/X86/x86_inlineasm_curly_bracket_escape.c
diff --git a/clang/test/CodeGen/xop-builtins-cmp.c b/clang/test/CodeGen/X86/xop-builtins-cmp.c
similarity index 100%
rename from clang/test/CodeGen/xop-builtins-cmp.c
rename to clang/test/CodeGen/X86/xop-builtins-cmp.c
diff --git a/clang/test/CodeGen/xop-builtins.c b/clang/test/CodeGen/X86/xop-builtins.c
similarity index 100%
rename from clang/test/CodeGen/xop-builtins.c
rename to clang/test/CodeGen/X86/xop-builtins.c
diff --git a/clang/test/CodeGen/align_value.cpp b/clang/test/CodeGen/align_value.cpp
index acbfbaf2ba5c7..a18cb651fe4c0 100644
--- a/clang/test/CodeGen/align_value.cpp
+++ b/clang/test/CodeGen/align_value.cpp
@@ -29,10 +29,7 @@ struct ad_struct {
 // CHECK-NEXT:    [[TMP0:%.*]] = load %struct.ad_struct*, %struct.ad_struct** [[X_ADDR]], align 8
 // CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_AD_STRUCT:%.*]], %struct.ad_struct* [[TMP0]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP1:%.*]] = load double*, double** [[A]], align 8
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint double* [[TMP1]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 63
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(double* [[TMP1]], i64 64) ]
 // CHECK-NEXT:    ret double* [[TMP1]]
 //
 double *foo(ad_struct& x) {
@@ -48,10 +45,7 @@ double *foo(ad_struct& x) {
 // CHECK-NEXT:    [[TMP0:%.*]] = load %struct.ad_struct*, %struct.ad_struct** [[X_ADDR]], align 8
 // CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_AD_STRUCT:%.*]], %struct.ad_struct* [[TMP0]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP1:%.*]] = load double*, double** [[A]], align 8
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint double* [[TMP1]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 63
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(double* [[TMP1]], i64 64) ]
 // CHECK-NEXT:    ret double* [[TMP1]]
 //
 double *goo(ad_struct *x) {
@@ -66,10 +60,7 @@ double *goo(ad_struct *x) {
 // CHECK-NEXT:    store double** [[X]], double*** [[X_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load double**, double*** [[X_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load double*, double** [[TMP0]], align 8
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint double* [[TMP1]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 63
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(double* [[TMP1]], i64 64) ]
 // CHECK-NEXT:    ret double* [[TMP1]]
 //
 double *bar(aligned_double *x) {
@@ -84,10 +75,7 @@ double *bar(aligned_double *x) {
 // CHECK-NEXT:    store double** [[X]], double*** [[X_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load double**, double*** [[X_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load double*, double** [[TMP0]], align 8
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint double* [[TMP1]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 63
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(double* [[TMP1]], i64 64) ]
 // CHECK-NEXT:    ret double* [[TMP1]]
 //
 double *car(aligned_double &x) {
@@ -103,10 +91,7 @@ double *car(aligned_double &x) {
 // CHECK-NEXT:    [[TMP0:%.*]] = load double**, double*** [[X_ADDR]], align 8
 // CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double*, double** [[TMP0]], i64 5
 // CHECK-NEXT:    [[TMP1:%.*]] = load double*, double** [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint double* [[TMP1]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 63
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(double* [[TMP1]], i64 64) ]
 // CHECK-NEXT:    ret double* [[TMP1]]
 //
 double *dar(aligned_double *x) {
@@ -118,10 +103,7 @@ aligned_double eep();
 // CHECK-LABEL: define {{[^@]+}}@_Z3retv() #0
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CALL:%.*]] = call double* @_Z3eepv()
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint double* [[CALL]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 63
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(double* [[CALL]], i64 64) ]
 // CHECK-NEXT:    ret double* [[CALL]]
 //
 double *ret() {
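
The align_value hunks above all make the same substitution: instead of proving alignment with explicit pointer arithmetic, clang now emits a single llvm.assume carrying an "align" operand bundle. A minimal hand-written IR sketch of the before/after forms (illustrative only, not taken verbatim from these tests):

; Before: the alignment fact is spelled out as integer math on the pointer.
define void @assume_old(double* %p) {
  %ptrint = ptrtoint double* %p to i64
  %maskedptr = and i64 %ptrint, 63        ; align 64 => mask 63
  %maskcond = icmp eq i64 %maskedptr, 0
  call void @llvm.assume(i1 %maskcond)
  ret void
}

; After: the same fact as an operand bundle, which passes can consume or drop
; without pattern-matching the arithmetic.
define void @assume_new(double* %p) {
  call void @llvm.assume(i1 true) [ "align"(double* %p, i64 64) ]
  ret void
}

declare void @llvm.assume(i1)
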
diff --git a/clang/test/CodeGen/alloc-align-attr.c b/clang/test/CodeGen/alloc-align-attr.c
index 9517c50dbb1db..44a57291b47c8 100644
--- a/clang/test/CodeGen/alloc-align-attr.c
+++ b/clang/test/CodeGen/alloc-align-attr.c
@@ -11,12 +11,8 @@ __INT32_TYPE__*m1(__INT32_TYPE__ i) __attribute__((alloc_align(1)));
 // CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4
 // CHECK-NEXT:    [[CALL:%.*]] = call i32* @m1(i32 [[TMP0]])
-// CHECK-NEXT:    [[ALIGNMENTCAST:%.*]] = zext i32 [[TMP0]] to i64
-// CHECK-NEXT:    [[MASK:%.*]] = sub i64 [[ALIGNMENTCAST]], 1
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i32* [[CALL]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK]]
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    [[CASTED_ALIGN:%.*]] = zext i32 [[TMP0]] to i64
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(i32* [[CALL]], i64 [[CASTED_ALIGN]]) ]
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[CALL]], align 4
 // CHECK-NEXT:    ret i32 [[TMP1]]
 //
@@ -32,12 +28,8 @@ __INT32_TYPE__ test1(__INT32_TYPE__ a) {
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[A_ADDR]], align 8
 // CHECK-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
 // CHECK-NEXT:    [[CALL:%.*]] = call i32* @m1(i32 [[CONV]])
-// CHECK-NEXT:    [[ALIGNMENTCAST:%.*]] = zext i32 [[CONV]] to i64
-// CHECK-NEXT:    [[MASK:%.*]] = sub i64 [[ALIGNMENTCAST]], 1
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i32* [[CALL]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK]]
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    [[CASTED_ALIGN:%.*]] = zext i32 [[CONV]] to i64
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(i32* [[CALL]], i64 [[CASTED_ALIGN]]) ]
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[CALL]], align 4
 // CHECK-NEXT:    ret i32 [[TMP1]]
 //
@@ -55,11 +47,7 @@ __INT32_TYPE__ *m2(__SIZE_TYPE__ i) __attribute__((alloc_align(1)));
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4
 // CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[TMP0]] to i64
 // CHECK-NEXT:    [[CALL:%.*]] = call i32* @m2(i64 [[CONV]])
-// CHECK-NEXT:    [[MASK:%.*]] = sub i64 [[CONV]], 1
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i32* [[CALL]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK]]
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(i32* [[CALL]], i64 [[CONV]]) ]
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[CALL]], align 4
 // CHECK-NEXT:    ret i32 [[TMP1]]
 //
@@ -75,11 +63,7 @@ __INT32_TYPE__ test3(__INT32_TYPE__ a) {
 // CHECK-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[A_ADDR]], align 8
 // CHECK-NEXT:    [[CALL:%.*]] = call i32* @m2(i64 [[TMP0]])
-// CHECK-NEXT:    [[MASK:%.*]] = sub i64 [[TMP0]], 1
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i32* [[CALL]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK]]
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(i32* [[CALL]], i64 [[TMP0]]) ]
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[CALL]], align 4
 // CHECK-NEXT:    ret i32 [[TMP1]]
 //
@@ -115,12 +99,8 @@ __INT32_TYPE__ *m3(struct Empty s, __int128_t i) __attribute__((alloc_align(2)))
 // CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[TMP4]], i32 0, i32 1
 // CHECK-NEXT:    [[TMP8:%.*]] = load i64, i64* [[TMP7]], align 8
 // CHECK-NEXT:    [[CALL:%.*]] = call i32* @m3(i64 [[TMP6]], i64 [[TMP8]])
-// CHECK-NEXT:    [[ALIGNMENTCAST:%.*]] = trunc i128 [[TMP3]] to i64
-// CHECK-NEXT:    [[MASK:%.*]] = sub i64 [[ALIGNMENTCAST]], 1
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i32* [[CALL]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK]]
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP3]] to i64
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(i32* [[CALL]], i64 [[CASTED_ALIGN]]) ]
 // CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[CALL]], align 4
 // CHECK-NEXT:    ret i32 [[TMP9]]
 //
@@ -157,12 +137,8 @@ __INT32_TYPE__ *m4(struct MultiArgs s, __int128_t i) __attribute__((alloc_align(
 // CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[TMP9]], i32 0, i32 1
 // CHECK-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8
 // CHECK-NEXT:    [[CALL:%.*]] = call i32* @m4(i64 [[TMP6]], i64 [[TMP8]], i64 [[TMP11]], i64 [[TMP13]])
-// CHECK-NEXT:    [[ALIGNMENTCAST:%.*]] = trunc i128 [[TMP3]] to i64
-// CHECK-NEXT:    [[MASK:%.*]] = sub i64 [[ALIGNMENTCAST]], 1
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i32* [[CALL]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK]]
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP3]] to i64
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(i32* [[CALL]], i64 [[CASTED_ALIGN]]) ]
 // CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[CALL]], align 4
 // CHECK-NEXT:    ret i32 [[TMP14]]
 //
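
In the alloc_align hunks the alignment is a runtime value, so the new form first widens or truncates it to i64 and passes it in the bundle, replacing the whole sub/ptrtoint/and/icmp sequence. A sketch under the same caveat (hand-written, mirroring the [[CASTED_ALIGN]] pattern above):

define i32 @dynamic_align(i32* %call, i32 %a) {
  ; zext mirrors [[CASTED_ALIGN]]; the bundle carries the runtime alignment.
  %casted.align = zext i32 %a to i64
  call void @llvm.assume(i1 true) [ "align"(i32* %call, i64 %casted.align) ]
  %v = load i32, i32* %call, align 4
  ret i32 %v
}

declare void @llvm.assume(i1)
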
diff --git a/clang/test/CodeGen/assume-aligned-and-alloc-align-attributes.c b/clang/test/CodeGen/assume-aligned-and-alloc-align-attributes.c
index fa4ee8db12e7f..cd8a6f19b4f49 100644
--- a/clang/test/CodeGen/assume-aligned-and-alloc-align-attributes.c
+++ b/clang/test/CodeGen/assume-aligned-and-alloc-align-attributes.c
@@ -36,12 +36,8 @@ void *t2_immediate2() {
 // CHECK-NEXT:    store i32 [[ALIGNMENT:%.*]], i32* [[ALIGNMENT_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ALIGNMENT_ADDR]], align 4
 // CHECK-NEXT:    [[CALL:%.*]] = call align 32 i8* @my_aligned_alloc(i32 320, i32 [[TMP0]])
-// CHECK-NEXT:    [[ALIGNMENTCAST:%.*]] = zext i32 [[TMP0]] to i64
-// CHECK-NEXT:    [[MASK:%.*]] = sub i64 [[ALIGNMENTCAST]], 1
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i8* [[CALL]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK]]
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(i8* [[CALL]], i64 [[TMP1]]) ]
 // CHECK-NEXT:    ret i8* [[CALL]]
 //
 void *t3_variable(int alignment) {
diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c
index cab424c3dbe17..84559e9edb9a3 100644
--- a/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c
+++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c
@@ -31,21 +31,21 @@ DEFINE_STRUCT(bool)
 // CHECK-128-NEXT:  entry:
 // CHECK-128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64:%.*]], %struct.struct_int64* [[S:%.*]], i64 0, i32 1, i64 0
 // CHECK-128-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64>* [[ARRAYIDX]] to <vscale x 2 x i64>*
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP0]], align 16, !tbaa !2
+// CHECK-128-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP0]], align 16, [[TBAA2:!tbaa !.*]]
 // CHECK-128-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
 // CHECK-256-LABEL: @read_int64(
 // CHECK-256-NEXT:  entry:
 // CHECK-256-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64:%.*]], %struct.struct_int64* [[S:%.*]], i64 0, i32 1, i64 0
 // CHECK-256-NEXT:    [[TMP0:%.*]] = bitcast <4 x i64>* [[ARRAYIDX]] to <vscale x 2 x i64>*
-// CHECK-256-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP0]], align 16, !tbaa !2
+// CHECK-256-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP0]], align 16, [[TBAA2:!tbaa !.*]]
 // CHECK-256-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
 // CHECK-512-LABEL: @read_int64(
 // CHECK-512-NEXT:  entry:
 // CHECK-512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64:%.*]], %struct.struct_int64* [[S:%.*]], i64 0, i32 1, i64 0
 // CHECK-512-NEXT:    [[TMP0:%.*]] = bitcast <8 x i64>* [[ARRAYIDX]] to <vscale x 2 x i64>*
-// CHECK-512-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP0]], align 16, !tbaa !2
+// CHECK-512-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP0]], align 16, [[TBAA2:!tbaa !.*]]
 // CHECK-512-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
 svint64_t read_int64(struct struct_int64 *s) {
@@ -55,31 +55,31 @@ svint64_t read_int64(struct struct_int64 *s) {
 // CHECK-128-LABEL: @write_int64(
 // CHECK-128-NEXT:  entry:
 // CHECK-128-NEXT:    [[X_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
-// CHECK-128-NEXT:    store <vscale x 2 x i64> [[X:%.*]], <vscale x 2 x i64>* [[X_ADDR]], align 16, !tbaa !5
+// CHECK-128-NEXT:    store <vscale x 2 x i64> [[X:%.*]], <vscale x 2 x i64>* [[X_ADDR]], align 16, [[TBAA5:!tbaa !.*]]
 // CHECK-128-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64>* [[X_ADDR]] to <2 x i64>*
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 16, !tbaa !2
+// CHECK-128-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64:%.*]], %struct.struct_int64* [[S:%.*]], i64 0, i32 1, i64 0
-// CHECK-128-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[ARRAYIDX]], align 16, !tbaa !2
+// CHECK-128-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[ARRAYIDX]], align 16, [[TBAA2]]
 // CHECK-128-NEXT:    ret void
 //
 // CHECK-256-LABEL: @write_int64(
 // CHECK-256-NEXT:  entry:
 // CHECK-256-NEXT:    [[X_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
-// CHECK-256-NEXT:    store <vscale x 2 x i64> [[X:%.*]], <vscale x 2 x i64>* [[X_ADDR]], align 16, !tbaa !5
+// CHECK-256-NEXT:    store <vscale x 2 x i64> [[X:%.*]], <vscale x 2 x i64>* [[X_ADDR]], align 16, [[TBAA5:!tbaa !.*]]
 // CHECK-256-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64>* [[X_ADDR]] to <4 x i64>*
-// CHECK-256-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 16, !tbaa !2
+// CHECK-256-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-256-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64:%.*]], %struct.struct_int64* [[S:%.*]], i64 0, i32 1, i64 0
-// CHECK-256-NEXT:    store <4 x i64> [[TMP1]], <4 x i64>* [[ARRAYIDX]], align 16, !tbaa !2
+// CHECK-256-NEXT:    store <4 x i64> [[TMP1]], <4 x i64>* [[ARRAYIDX]], align 16, [[TBAA2]]
 // CHECK-256-NEXT:    ret void
 //
 // CHECK-512-LABEL: @write_int64(
 // CHECK-512-NEXT:  entry:
 // CHECK-512-NEXT:    [[X_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
-// CHECK-512-NEXT:    store <vscale x 2 x i64> [[X:%.*]], <vscale x 2 x i64>* [[X_ADDR]], align 16, !tbaa !5
+// CHECK-512-NEXT:    store <vscale x 2 x i64> [[X:%.*]], <vscale x 2 x i64>* [[X_ADDR]], align 16, [[TBAA5:!tbaa !.*]]
 // CHECK-512-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64>* [[X_ADDR]] to <8 x i64>*
-// CHECK-512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[TMP0]], align 16, !tbaa !2
+// CHECK-512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64:%.*]], %struct.struct_int64* [[S:%.*]], i64 0, i32 1, i64 0
-// CHECK-512-NEXT:    store <8 x i64> [[TMP1]], <8 x i64>* [[ARRAYIDX]], align 16, !tbaa !2
+// CHECK-512-NEXT:    store <8 x i64> [[TMP1]], <8 x i64>* [[ARRAYIDX]], align 16, [[TBAA2]]
 // CHECK-512-NEXT:    ret void
 //
 void write_int64(struct struct_int64 *s, svint64_t x) {
@@ -94,21 +94,21 @@ void write_int64(struct struct_int64 *s, svint64_t x) {
 // CHECK-128-NEXT:  entry:
 // CHECK-128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64:%.*]], %struct.struct_float64* [[S:%.*]], i64 0, i32 1, i64 0
 // CHECK-128-NEXT:    [[TMP0:%.*]] = bitcast <2 x double>* [[ARRAYIDX]] to <vscale x 2 x double>*
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP0]], align 16, !tbaa !2
+// CHECK-128-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-128-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
 // CHECK-256-LABEL: @read_float64(
 // CHECK-256-NEXT:  entry:
 // CHECK-256-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64:%.*]], %struct.struct_float64* [[S:%.*]], i64 0, i32 1, i64 0
 // CHECK-256-NEXT:    [[TMP0:%.*]] = bitcast <4 x double>* [[ARRAYIDX]] to <vscale x 2 x double>*
-// CHECK-256-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP0]], align 16, !tbaa !2
+// CHECK-256-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-256-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
 // CHECK-512-LABEL: @read_float64(
 // CHECK-512-NEXT:  entry:
 // CHECK-512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64:%.*]], %struct.struct_float64* [[S:%.*]], i64 0, i32 1, i64 0
 // CHECK-512-NEXT:    [[TMP0:%.*]] = bitcast <8 x double>* [[ARRAYIDX]] to <vscale x 2 x double>*
-// CHECK-512-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP0]], align 16, !tbaa !2
+// CHECK-512-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-512-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
 svfloat64_t read_float64(struct struct_float64 *s) {
@@ -118,31 +118,31 @@ svfloat64_t read_float64(struct struct_float64 *s) {
 // CHECK-128-LABEL: @write_float64(
 // CHECK-128-NEXT:  entry:
 // CHECK-128-NEXT:    [[X_ADDR:%.*]] = alloca <vscale x 2 x double>, align 16
-// CHECK-128-NEXT:    store <vscale x 2 x double> [[X:%.*]], <vscale x 2 x double>* [[X_ADDR]], align 16, !tbaa !7
+// CHECK-128-NEXT:    store <vscale x 2 x double> [[X:%.*]], <vscale x 2 x double>* [[X_ADDR]], align 16, [[TBAA7:!tbaa !.*]]
 // CHECK-128-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double>* [[X_ADDR]] to <2 x double>*
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 16, !tbaa !2
+// CHECK-128-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64:%.*]], %struct.struct_float64* [[S:%.*]], i64 0, i32 1, i64 0
-// CHECK-128-NEXT:    store <2 x double> [[TMP1]], <2 x double>* [[ARRAYIDX]], align 16, !tbaa !2
+// CHECK-128-NEXT:    store <2 x double> [[TMP1]], <2 x double>* [[ARRAYIDX]], align 16, [[TBAA2]]
 // CHECK-128-NEXT:    ret void
 //
 // CHECK-256-LABEL: @write_float64(
 // CHECK-256-NEXT:  entry:
 // CHECK-256-NEXT:    [[X_ADDR:%.*]] = alloca <vscale x 2 x double>, align 16
-// CHECK-256-NEXT:    store <vscale x 2 x double> [[X:%.*]], <vscale x 2 x double>* [[X_ADDR]], align 16, !tbaa !7
+// CHECK-256-NEXT:    store <vscale x 2 x double> [[X:%.*]], <vscale x 2 x double>* [[X_ADDR]], align 16, [[TBAA7:!tbaa !.*]]
 // CHECK-256-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double>* [[X_ADDR]] to <4 x double>*
-// CHECK-256-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[TMP0]], align 16, !tbaa !2
+// CHECK-256-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-256-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64:%.*]], %struct.struct_float64* [[S:%.*]], i64 0, i32 1, i64 0
-// CHECK-256-NEXT:    store <4 x double> [[TMP1]], <4 x double>* [[ARRAYIDX]], align 16, !tbaa !2
+// CHECK-256-NEXT:    store <4 x double> [[TMP1]], <4 x double>* [[ARRAYIDX]], align 16, [[TBAA2]]
 // CHECK-256-NEXT:    ret void
 //
 // CHECK-512-LABEL: @write_float64(
 // CHECK-512-NEXT:  entry:
 // CHECK-512-NEXT:    [[X_ADDR:%.*]] = alloca <vscale x 2 x double>, align 16
-// CHECK-512-NEXT:    store <vscale x 2 x double> [[X:%.*]], <vscale x 2 x double>* [[X_ADDR]], align 16, !tbaa !7
+// CHECK-512-NEXT:    store <vscale x 2 x double> [[X:%.*]], <vscale x 2 x double>* [[X_ADDR]], align 16, [[TBAA7:!tbaa !.*]]
 // CHECK-512-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double>* [[X_ADDR]] to <8 x double>*
-// CHECK-512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[TMP0]], align 16, !tbaa !2
+// CHECK-512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64:%.*]], %struct.struct_float64* [[S:%.*]], i64 0, i32 1, i64 0
-// CHECK-512-NEXT:    store <8 x double> [[TMP1]], <8 x double>* [[ARRAYIDX]], align 16, !tbaa !2
+// CHECK-512-NEXT:    store <8 x double> [[TMP1]], <8 x double>* [[ARRAYIDX]], align 16, [[TBAA2]]
 // CHECK-512-NEXT:    ret void
 //
 void write_float64(struct struct_float64 *s, svfloat64_t x) {
@@ -157,21 +157,21 @@ void write_float64(struct struct_float64 *s, svfloat64_t x) {
 // CHECK-128-NEXT:  entry:
 // CHECK-128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BFLOAT16:%.*]], %struct.struct_bfloat16* [[S:%.*]], i64 0, i32 1, i64 0
 // CHECK-128-NEXT:    [[TMP0:%.*]] = bitcast <8 x bfloat>* [[ARRAYIDX]] to <vscale x 8 x bfloat>*
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* [[TMP0]], align 16, !tbaa !2
+// CHECK-128-NEXT:    [[TMP1:%.*]] = load <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-128-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
 // CHECK-256-LABEL: @read_bfloat16(
 // CHECK-256-NEXT:  entry:
 // CHECK-256-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BFLOAT16:%.*]], %struct.struct_bfloat16* [[S:%.*]], i64 0, i32 1, i64 0
 // CHECK-256-NEXT:    [[TMP0:%.*]] = bitcast <16 x bfloat>* [[ARRAYIDX]] to <vscale x 8 x bfloat>*
-// CHECK-256-NEXT:    [[TMP1:%.*]] = load <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* [[TMP0]], align 16, !tbaa !2
+// CHECK-256-NEXT:    [[TMP1:%.*]] = load <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-256-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
 // CHECK-512-LABEL: @read_bfloat16(
 // CHECK-512-NEXT:  entry:
 // CHECK-512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BFLOAT16:%.*]], %struct.struct_bfloat16* [[S:%.*]], i64 0, i32 1, i64 0
 // CHECK-512-NEXT:    [[TMP0:%.*]] = bitcast <32 x bfloat>* [[ARRAYIDX]] to <vscale x 8 x bfloat>*
-// CHECK-512-NEXT:    [[TMP1:%.*]] = load <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* [[TMP0]], align 16, !tbaa !2
+// CHECK-512-NEXT:    [[TMP1:%.*]] = load <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-512-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
 svbfloat16_t read_bfloat16(struct struct_bfloat16 *s) {
@@ -181,31 +181,31 @@ svbfloat16_t read_bfloat16(struct struct_bfloat16 *s) {
 // CHECK-128-LABEL: @write_bfloat16(
 // CHECK-128-NEXT:  entry:
 // CHECK-128-NEXT:    [[X_ADDR:%.*]] = alloca <vscale x 8 x bfloat>, align 16
-// CHECK-128-NEXT:    store <vscale x 8 x bfloat> [[X:%.*]], <vscale x 8 x bfloat>* [[X_ADDR]], align 16, !tbaa !9
+// CHECK-128-NEXT:    store <vscale x 8 x bfloat> [[X:%.*]], <vscale x 8 x bfloat>* [[X_ADDR]], align 16, [[TBAA9:!tbaa !.*]]
 // CHECK-128-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat>* [[X_ADDR]] to <8 x bfloat>*
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <8 x bfloat>, <8 x bfloat>* [[TMP0]], align 16, !tbaa !2
+// CHECK-128-NEXT:    [[TMP1:%.*]] = load <8 x bfloat>, <8 x bfloat>* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BFLOAT16:%.*]], %struct.struct_bfloat16* [[S:%.*]], i64 0, i32 1, i64 0
-// CHECK-128-NEXT:    store <8 x bfloat> [[TMP1]], <8 x bfloat>* [[ARRAYIDX]], align 16, !tbaa !2
+// CHECK-128-NEXT:    store <8 x bfloat> [[TMP1]], <8 x bfloat>* [[ARRAYIDX]], align 16, [[TBAA2]]
 // CHECK-128-NEXT:    ret void
 //
 // CHECK-256-LABEL: @write_bfloat16(
 // CHECK-256-NEXT:  entry:
 // CHECK-256-NEXT:    [[X_ADDR:%.*]] = alloca <vscale x 8 x bfloat>, align 16
-// CHECK-256-NEXT:    store <vscale x 8 x bfloat> [[X:%.*]], <vscale x 8 x bfloat>* [[X_ADDR]], align 16, !tbaa !9
+// CHECK-256-NEXT:    store <vscale x 8 x bfloat> [[X:%.*]], <vscale x 8 x bfloat>* [[X_ADDR]], align 16, [[TBAA9:!tbaa !.*]]
 // CHECK-256-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat>* [[X_ADDR]] to <16 x bfloat>*
-// CHECK-256-NEXT:    [[TMP1:%.*]] = load <16 x bfloat>, <16 x bfloat>* [[TMP0]], align 16, !tbaa !2
+// CHECK-256-NEXT:    [[TMP1:%.*]] = load <16 x bfloat>, <16 x bfloat>* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-256-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BFLOAT16:%.*]], %struct.struct_bfloat16* [[S:%.*]], i64 0, i32 1, i64 0
-// CHECK-256-NEXT:    store <16 x bfloat> [[TMP1]], <16 x bfloat>* [[ARRAYIDX]], align 16, !tbaa !2
+// CHECK-256-NEXT:    store <16 x bfloat> [[TMP1]], <16 x bfloat>* [[ARRAYIDX]], align 16, [[TBAA2]]
 // CHECK-256-NEXT:    ret void
 //
 // CHECK-512-LABEL: @write_bfloat16(
 // CHECK-512-NEXT:  entry:
 // CHECK-512-NEXT:    [[X_ADDR:%.*]] = alloca <vscale x 8 x bfloat>, align 16
-// CHECK-512-NEXT:    store <vscale x 8 x bfloat> [[X:%.*]], <vscale x 8 x bfloat>* [[X_ADDR]], align 16, !tbaa !9
+// CHECK-512-NEXT:    store <vscale x 8 x bfloat> [[X:%.*]], <vscale x 8 x bfloat>* [[X_ADDR]], align 16, [[TBAA9:!tbaa !.*]]
 // CHECK-512-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat>* [[X_ADDR]] to <32 x bfloat>*
-// CHECK-512-NEXT:    [[TMP1:%.*]] = load <32 x bfloat>, <32 x bfloat>* [[TMP0]], align 16, !tbaa !2
+// CHECK-512-NEXT:    [[TMP1:%.*]] = load <32 x bfloat>, <32 x bfloat>* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BFLOAT16:%.*]], %struct.struct_bfloat16* [[S:%.*]], i64 0, i32 1, i64 0
-// CHECK-512-NEXT:    store <32 x bfloat> [[TMP1]], <32 x bfloat>* [[ARRAYIDX]], align 16, !tbaa !2
+// CHECK-512-NEXT:    store <32 x bfloat> [[TMP1]], <32 x bfloat>* [[ARRAYIDX]], align 16, [[TBAA2]]
 // CHECK-512-NEXT:    ret void
 //
 void write_bfloat16(struct struct_bfloat16 *s, svbfloat16_t x) {
@@ -220,21 +220,21 @@ void write_bfloat16(struct struct_bfloat16 *s, svbfloat16_t x) {
 // CHECK-128-NEXT:  entry:
 // CHECK-128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], %struct.struct_bool* [[S:%.*]], i64 0, i32 1, i64 0
 // CHECK-128-NEXT:    [[TMP0:%.*]] = bitcast <2 x i8>* [[ARRAYIDX]] to <vscale x 16 x i1>*
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP0]], align 2, !tbaa !2
+// CHECK-128-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP0]], align 2, [[TBAA2]]
 // CHECK-128-NEXT:    ret <vscale x 16 x i1> [[TMP1]]
 //
 // CHECK-256-LABEL: @read_bool(
 // CHECK-256-NEXT:  entry:
 // CHECK-256-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], %struct.struct_bool* [[S:%.*]], i64 0, i32 1, i64 0
 // CHECK-256-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8>* [[ARRAYIDX]] to <vscale x 16 x i1>*
-// CHECK-256-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP0]], align 2, !tbaa !2
+// CHECK-256-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP0]], align 2, [[TBAA2]]
 // CHECK-256-NEXT:    ret <vscale x 16 x i1> [[TMP1]]
 //
 // CHECK-512-LABEL: @read_bool(
 // CHECK-512-NEXT:  entry:
 // CHECK-512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], %struct.struct_bool* [[S:%.*]], i64 0, i32 1, i64 0
 // CHECK-512-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8>* [[ARRAYIDX]] to <vscale x 16 x i1>*
-// CHECK-512-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP0]], align 2, !tbaa !2
+// CHECK-512-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP0]], align 2, [[TBAA2]]
 // CHECK-512-NEXT:    ret <vscale x 16 x i1> [[TMP1]]
 //
 svbool_t read_bool(struct struct_bool *s) {
@@ -244,33 +244,33 @@ svbool_t read_bool(struct struct_bool *s) {
 // CHECK-128-LABEL: @write_bool(
 // CHECK-128-NEXT:  entry:
 // CHECK-128-NEXT:    [[X_ADDR:%.*]] = alloca <vscale x 16 x i1>, align 16
-// CHECK-128-NEXT:    store <vscale x 16 x i1> [[X:%.*]], <vscale x 16 x i1>* [[X_ADDR]], align 16, !tbaa !11
+// CHECK-128-NEXT:    store <vscale x 16 x i1> [[X:%.*]], <vscale x 16 x i1>* [[X_ADDR]], align 16, [[TBAA11:!tbaa !.*]]
 // CHECK-128-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i1>* [[X_ADDR]] to <2 x i8>*
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 16, !tbaa !2
+// CHECK-128-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], %struct.struct_bool* [[S:%.*]], i64 0, i32 1, i64 0
-// CHECK-128-NEXT:    store <2 x i8> [[TMP1]], <2 x i8>* [[ARRAYIDX]], align 2, !tbaa !2
+// CHECK-128-NEXT:    store <2 x i8> [[TMP1]], <2 x i8>* [[ARRAYIDX]], align 2, [[TBAA2]]
 // CHECK-128-NEXT:    ret void
 //
 // CHECK-256-LABEL: @write_bool(
 // CHECK-256-NEXT:  entry:
 // CHECK-256-NEXT:    [[X_ADDR:%.*]] = alloca <vscale x 16 x i1>, align 16
-// CHECK-256-NEXT:    store <vscale x 16 x i1> [[X:%.*]], <vscale x 16 x i1>* [[X_ADDR]], align 16, !tbaa !11
+// CHECK-256-NEXT:    store <vscale x 16 x i1> [[X:%.*]], <vscale x 16 x i1>* [[X_ADDR]], align 16, [[TBAA11:!tbaa !.*]]
 // CHECK-256-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i1>* [[X_ADDR]] to i32*
-// CHECK-256-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 16, !tbaa !2
+// CHECK-256-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-256-NEXT:    [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], %struct.struct_bool* [[S:%.*]], i64 0, i32 1
 // CHECK-256-NEXT:    [[TMP2:%.*]] = bitcast [3 x <4 x i8>]* [[Y]] to i32*
-// CHECK-256-NEXT:    store i32 [[TMP1]], i32* [[TMP2]], align 2, !tbaa !2
+// CHECK-256-NEXT:    store i32 [[TMP1]], i32* [[TMP2]], align 2, [[TBAA2]]
 // CHECK-256-NEXT:    ret void
 //
 // CHECK-512-LABEL: @write_bool(
 // CHECK-512-NEXT:  entry:
 // CHECK-512-NEXT:    [[X_ADDR:%.*]] = alloca <vscale x 16 x i1>, align 16
-// CHECK-512-NEXT:    store <vscale x 16 x i1> [[X:%.*]], <vscale x 16 x i1>* [[X_ADDR]], align 16, !tbaa !11
+// CHECK-512-NEXT:    store <vscale x 16 x i1> [[X:%.*]], <vscale x 16 x i1>* [[X_ADDR]], align 16, [[TBAA11:!tbaa !.*]]
 // CHECK-512-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i1>* [[X_ADDR]] to i64*
-// CHECK-512-NEXT:    [[TMP1:%.*]] = load i64, i64* [[TMP0]], align 16, !tbaa !2
+// CHECK-512-NEXT:    [[TMP1:%.*]] = load i64, i64* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-512-NEXT:    [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], %struct.struct_bool* [[S:%.*]], i64 0, i32 1
 // CHECK-512-NEXT:    [[TMP2:%.*]] = bitcast [3 x <8 x i8>]* [[Y]] to i64*
-// CHECK-512-NEXT:    store i64 [[TMP1]], i64* [[TMP2]], align 2, !tbaa !2
+// CHECK-512-NEXT:    store i64 [[TMP1]], i64* [[TMP2]], align 2, [[TBAA2]]
 // CHECK-512-NEXT:    ret void
 //
 void write_bool(struct struct_bool *s, svbool_t x) {
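
The other rewrite running through these SVE tests swaps hard-coded metadata numbers (!tbaa !2, !tbaa !5, ...) for FileCheck pattern captures, so the checks keep passing when unrelated changes renumber the module's metadata. The idiom, sketched standalone (these CHECK lines are illustrative, not from this patch):

// CHECK: load <2 x i64>, <2 x i64>* %p, align 16, [[TBAA:!tbaa !.*]]
// CHECK: store <2 x i64> %v, <2 x i64>* %q, align 16, [[TBAA]]

The first use binds [[TBAA]] to whatever text the regex matched; later bare uses must match that captured text exactly, which pins "same metadata node" without pinning its number.
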
diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-call.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-call.c
index 490ec92dfdeb5..1c08e46681fbc 100644
--- a/clang/test/CodeGen/attr-arm-sve-vector-bits-call.c
+++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-call.c
@@ -30,13 +30,13 @@ svint32_t sizeless_callee(svint32_t x) {
 // CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i32>* [[X]] to <vscale x 4 x i32>*
 // CHECK-NEXT:    store <vscale x 4 x i32> [[X_COERCE:%.*]], <vscale x 4 x i32>* [[TMP0]], align 16
-// CHECK-NEXT:    [[X1:%.*]] = load <16 x i32>, <16 x i32>* [[X]], align 16, !tbaa !2
-// CHECK-NEXT:    store <16 x i32> [[X1]], <16 x i32>* [[X_ADDR]], align 16, !tbaa !2
+// CHECK-NEXT:    [[X1:%.*]] = load <16 x i32>, <16 x i32>* [[X]], align 16, [[TBAA2:!tbaa !.*]]
+// CHECK-NEXT:    store <16 x i32> [[X1]], <16 x i32>* [[X_ADDR]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[X_ADDR]] to <vscale x 4 x i32>*
-// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP1]], align 16, !tbaa !2
-// CHECK-NEXT:    store <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32>* [[SAVED_CALL_RVALUE]], align 16, !tbaa !5
+// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP1]], align 16, [[TBAA2]]
+// CHECK-NEXT:    store <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32>* [[SAVED_CALL_RVALUE]], align 16, [[TBAA5:!tbaa !.*]]
 // CHECK-NEXT:    [[CASTFIXEDSVE:%.*]] = bitcast <vscale x 4 x i32>* [[SAVED_CALL_RVALUE]] to <16 x i32>*
-// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, <16 x i32>* [[CASTFIXEDSVE]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, <16 x i32>* [[CASTFIXEDSVE]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to <16 x i32>*
 // CHECK-NEXT:    store <16 x i32> [[TMP3]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16
 // CHECK-NEXT:    [[TMP4:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
@@ -52,7 +52,7 @@ fixed_int32_t fixed_caller(fixed_int32_t x) {
 // CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i32>* [[X]] to <vscale x 4 x i32>*
 // CHECK-NEXT:    store <vscale x 4 x i32> [[X_COERCE:%.*]], <vscale x 4 x i32>* [[TMP0]], align 16
-// CHECK-NEXT:    [[X1:%.*]] = load <16 x i32>, <16 x i32>* [[X]], align 16, !tbaa !2
+// CHECK-NEXT:    [[X1:%.*]] = load <16 x i32>, <16 x i32>* [[X]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to <16 x i32>*
 // CHECK-NEXT:    store <16 x i32> [[X1]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16
 // CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
@@ -68,19 +68,19 @@ fixed_int32_t fixed_callee(fixed_int32_t x) {
 // CHECK-NEXT:    [[COERCE_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:    [[COERCE1:%.*]] = alloca <16 x i32>, align 16
 // CHECK-NEXT:    [[SAVED_CALL_RVALUE:%.*]] = alloca <16 x i32>, align 64
-// CHECK-NEXT:    store <vscale x 4 x i32> [[X:%.*]], <vscale x 4 x i32>* [[X_ADDR]], align 16, !tbaa !5
+// CHECK-NEXT:    store <vscale x 4 x i32> [[X:%.*]], <vscale x 4 x i32>* [[X_ADDR]], align 16, [[TBAA5]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32>* [[X_ADDR]] to <16 x i32>*
-// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP0]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[COERCE_0__SROA_CAST:%.*]] = bitcast <vscale x 4 x i32>* [[COERCE_COERCE]] to <16 x i32>*
 // CHECK-NEXT:    store <16 x i32> [[TMP1]], <16 x i32>* [[COERCE_0__SROA_CAST]], align 16
 // CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[COERCE_COERCE]], align 16
 // CHECK-NEXT:    [[CALL:%.*]] = call <vscale x 4 x i32> @fixed_callee(<vscale x 4 x i32> [[TMP2]])
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32>* [[COERCE1]] to <vscale x 4 x i32>*
 // CHECK-NEXT:    store <vscale x 4 x i32> [[CALL]], <vscale x 4 x i32>* [[TMP3]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, <16 x i32>* [[COERCE1]], align 16, !tbaa !2
-// CHECK-NEXT:    store <16 x i32> [[TMP4]], <16 x i32>* [[SAVED_CALL_RVALUE]], align 64, !tbaa !2
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, <16 x i32>* [[COERCE1]], align 16, [[TBAA2]]
+// CHECK-NEXT:    store <16 x i32> [[TMP4]], <16 x i32>* [[SAVED_CALL_RVALUE]], align 64, [[TBAA2]]
 // CHECK-NEXT:    [[CASTFIXEDSVE:%.*]] = bitcast <16 x i32>* [[SAVED_CALL_RVALUE]] to <vscale x 4 x i32>*
-// CHECK-NEXT:    [[TMP5:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[CASTFIXEDSVE]], align 64, !tbaa !2
+// CHECK-NEXT:    [[TMP5:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[CASTFIXEDSVE]], align 64, [[TBAA2]]
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
 //
 svint32_t sizeless_caller(svint32_t x) {
@@ -101,21 +101,21 @@ svint32_t sizeless_caller(svint32_t x) {
 // CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i32>* [[OP1]] to <vscale x 4 x i32>*
 // CHECK-NEXT:    store <vscale x 4 x i32> [[OP1_COERCE:%.*]], <vscale x 4 x i32>* [[TMP0]], align 16
-// CHECK-NEXT:    [[OP11:%.*]] = load <16 x i32>, <16 x i32>* [[OP1]], align 16, !tbaa !2
+// CHECK-NEXT:    [[OP11:%.*]] = load <16 x i32>, <16 x i32>* [[OP1]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[OP2]] to <vscale x 4 x i32>*
 // CHECK-NEXT:    store <vscale x 4 x i32> [[OP2_COERCE:%.*]], <vscale x 4 x i32>* [[TMP1]], align 16
-// CHECK-NEXT:    [[OP22:%.*]] = load <16 x i32>, <16 x i32>* [[OP2]], align 16, !tbaa !2
-// CHECK-NEXT:    store <16 x i32> [[OP11]], <16 x i32>* [[OP1_ADDR]], align 16, !tbaa !2
-// CHECK-NEXT:    store <16 x i32> [[OP22]], <16 x i32>* [[OP2_ADDR]], align 16, !tbaa !2
+// CHECK-NEXT:    [[OP22:%.*]] = load <16 x i32>, <16 x i32>* [[OP2]], align 16, [[TBAA2]]
+// CHECK-NEXT:    store <16 x i32> [[OP11]], <16 x i32>* [[OP1_ADDR]], align 16, [[TBAA2]]
+// CHECK-NEXT:    store <16 x i32> [[OP22]], <16 x i32>* [[OP2_ADDR]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i32>* [[OP1_ADDR]] to <vscale x 4 x i32>*
-// CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP2]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP2]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32>* [[OP2_ADDR]] to <vscale x 4 x i32>*
-// CHECK-NEXT:    [[TMP5:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP4]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP5:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP4]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sel.nxv4i32(<vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP5]])
-// CHECK-NEXT:    store <vscale x 4 x i32> [[TMP7]], <vscale x 4 x i32>* [[SAVED_CALL_RVALUE]], align 16, !tbaa !5
+// CHECK-NEXT:    store <vscale x 4 x i32> [[TMP7]], <vscale x 4 x i32>* [[SAVED_CALL_RVALUE]], align 16, [[TBAA5]]
 // CHECK-NEXT:    [[CASTFIXEDSVE:%.*]] = bitcast <vscale x 4 x i32>* [[SAVED_CALL_RVALUE]] to <16 x i32>*
-// CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i32>, <16 x i32>* [[CASTFIXEDSVE]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i32>, <16 x i32>* [[CASTFIXEDSVE]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to <16 x i32>*
 // CHECK-NEXT:    store <16 x i32> [[TMP8]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16
 // CHECK-NEXT:    [[TMP9:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
@@ -135,21 +135,21 @@ fixed_int32_t call_int32_ff(svbool_t pg, fixed_int32_t op1, fixed_int32_t op2) {
 // CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x double>, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x double>* [[OP1]] to <vscale x 2 x double>*
 // CHECK-NEXT:    store <vscale x 2 x double> [[OP1_COERCE:%.*]], <vscale x 2 x double>* [[TMP0]], align 16
-// CHECK-NEXT:    [[OP11:%.*]] = load <8 x double>, <8 x double>* [[OP1]], align 16, !tbaa !2
+// CHECK-NEXT:    [[OP11:%.*]] = load <8 x double>, <8 x double>* [[OP1]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x double>* [[OP2]] to <vscale x 2 x double>*
 // CHECK-NEXT:    store <vscale x 2 x double> [[OP2_COERCE:%.*]], <vscale x 2 x double>* [[TMP1]], align 16
-// CHECK-NEXT:    [[OP22:%.*]] = load <8 x double>, <8 x double>* [[OP2]], align 16, !tbaa !2
-// CHECK-NEXT:    store <8 x double> [[OP11]], <8 x double>* [[OP1_ADDR]], align 16, !tbaa !2
-// CHECK-NEXT:    store <8 x double> [[OP22]], <8 x double>* [[OP2_ADDR]], align 16, !tbaa !2
+// CHECK-NEXT:    [[OP22:%.*]] = load <8 x double>, <8 x double>* [[OP2]], align 16, [[TBAA2]]
+// CHECK-NEXT:    store <8 x double> [[OP11]], <8 x double>* [[OP1_ADDR]], align 16, [[TBAA2]]
+// CHECK-NEXT:    store <8 x double> [[OP22]], <8 x double>* [[OP2_ADDR]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x double>* [[OP1_ADDR]] to <vscale x 2 x double>*
-// CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP2]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP2]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x double>* [[OP2_ADDR]] to <vscale x 2 x double>*
-// CHECK-NEXT:    [[TMP5:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP4]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP5:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP4]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.sel.nxv2f64(<vscale x 2 x i1> [[TMP6]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP5]])
-// CHECK-NEXT:    store <vscale x 2 x double> [[TMP7]], <vscale x 2 x double>* [[SAVED_CALL_RVALUE]], align 16, !tbaa !7
+// CHECK-NEXT:    store <vscale x 2 x double> [[TMP7]], <vscale x 2 x double>* [[SAVED_CALL_RVALUE]], align 16, [[TBAA7:!tbaa !.*]]
 // CHECK-NEXT:    [[CASTFIXEDSVE:%.*]] = bitcast <vscale x 2 x double>* [[SAVED_CALL_RVALUE]] to <8 x double>*
-// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x double>, <8 x double>* [[CASTFIXEDSVE]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x double>, <8 x double>* [[CASTFIXEDSVE]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 2 x double>* [[RETVAL_COERCE]] to <8 x double>*
 // CHECK-NEXT:    store <8 x double> [[TMP8]], <8 x double>* [[RETVAL_0__SROA_CAST]], align 16
 // CHECK-NEXT:    [[TMP9:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[RETVAL_COERCE]], align 16
@@ -170,23 +170,23 @@ fixed_float64_t call_float64_ff(svbool_t pg, fixed_float64_t op1, fixed_float64_
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8>* [[OP1]] to <vscale x 16 x i1>*
 // CHECK-NEXT:    store <vscale x 16 x i1> [[OP1_COERCE:%.*]], <vscale x 16 x i1>* [[TMP0]], align 16
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8>* [[OP1]] to i64*
-// CHECK-NEXT:    [[OP113:%.*]] = load i64, i64* [[TMP1]], align 16, !tbaa !2
+// CHECK-NEXT:    [[OP113:%.*]] = load i64, i64* [[TMP1]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8>* [[OP2]] to <vscale x 16 x i1>*
 // CHECK-NEXT:    store <vscale x 16 x i1> [[OP2_COERCE:%.*]], <vscale x 16 x i1>* [[TMP2]], align 16
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8>* [[OP2]] to i64*
-// CHECK-NEXT:    [[OP224:%.*]] = load i64, i64* [[TMP3]], align 16, !tbaa !2
+// CHECK-NEXT:    [[OP224:%.*]] = load i64, i64* [[TMP3]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8>* [[OP1_ADDR]] to i64*
-// CHECK-NEXT:    store i64 [[OP113]], i64* [[TMP4]], align 16, !tbaa !2
+// CHECK-NEXT:    store i64 [[OP113]], i64* [[TMP4]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8>* [[OP2_ADDR]] to i64*
-// CHECK-NEXT:    store i64 [[OP224]], i64* [[TMP5]], align 16, !tbaa !2
+// CHECK-NEXT:    store i64 [[OP224]], i64* [[TMP5]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8>* [[OP1_ADDR]] to <vscale x 16 x i1>*
-// CHECK-NEXT:    [[TMP7:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP6]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP6]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8>* [[OP2_ADDR]] to <vscale x 16 x i1>*
-// CHECK-NEXT:    [[TMP9:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP8]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP9:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP8]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.sel.nxv16i1(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP9]])
-// CHECK-NEXT:    store <vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1>* [[SAVED_CALL_RVALUE]], align 16, !tbaa !9
+// CHECK-NEXT:    store <vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1>* [[SAVED_CALL_RVALUE]], align 16, [[TBAA9:!tbaa !.*]]
 // CHECK-NEXT:    [[TMP11:%.*]] = bitcast <vscale x 16 x i1>* [[SAVED_CALL_RVALUE]] to i64*
-// CHECK-NEXT:    [[TMP12:%.*]] = load i64, i64* [[TMP11]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP12:%.*]] = load i64, i64* [[TMP11]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP13:%.*]] = bitcast <vscale x 16 x i1>* [[RETVAL_COERCE]] to i64*
 // CHECK-NEXT:    store i64 [[TMP12]], i64* [[TMP13]], align 16
 // CHECK-NEXT:    [[TMP14:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[RETVAL_COERCE]], align 16
@@ -208,15 +208,15 @@ fixed_bool_t call_bool_ff(svbool_t pg, fixed_bool_t op1, fixed_bool_t op2) {
 // CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i32>* [[OP1]] to <vscale x 4 x i32>*
 // CHECK-NEXT:    store <vscale x 4 x i32> [[OP1_COERCE:%.*]], <vscale x 4 x i32>* [[TMP0]], align 16
-// CHECK-NEXT:    [[OP11:%.*]] = load <16 x i32>, <16 x i32>* [[OP1]], align 16, !tbaa !2
-// CHECK-NEXT:    store <16 x i32> [[OP11]], <16 x i32>* [[OP1_ADDR]], align 16, !tbaa !2
+// CHECK-NEXT:    [[OP11:%.*]] = load <16 x i32>, <16 x i32>* [[OP1]], align 16, [[TBAA2]]
+// CHECK-NEXT:    store <16 x i32> [[OP11]], <16 x i32>* [[OP1_ADDR]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[OP1_ADDR]] to <vscale x 4 x i32>*
-// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP1]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP1]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sel.nxv4i32(<vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[OP2:%.*]])
-// CHECK-NEXT:    store <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32>* [[SAVED_CALL_RVALUE]], align 16, !tbaa !5
+// CHECK-NEXT:    store <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32>* [[SAVED_CALL_RVALUE]], align 16, [[TBAA5]]
 // CHECK-NEXT:    [[CASTFIXEDSVE:%.*]] = bitcast <vscale x 4 x i32>* [[SAVED_CALL_RVALUE]] to <16 x i32>*
-// CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i32>, <16 x i32>* [[CASTFIXEDSVE]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i32>, <16 x i32>* [[CASTFIXEDSVE]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to <16 x i32>*
 // CHECK-NEXT:    store <16 x i32> [[TMP5]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16
 // CHECK-NEXT:    [[TMP6:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
@@ -234,15 +234,15 @@ fixed_int32_t call_int32_fs(svbool_t pg, fixed_int32_t op1, svint32_t op2) {
 // CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x double>, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x double>* [[OP1]] to <vscale x 2 x double>*
 // CHECK-NEXT:    store <vscale x 2 x double> [[OP1_COERCE:%.*]], <vscale x 2 x double>* [[TMP0]], align 16
-// CHECK-NEXT:    [[OP11:%.*]] = load <8 x double>, <8 x double>* [[OP1]], align 16, !tbaa !2
-// CHECK-NEXT:    store <8 x double> [[OP11]], <8 x double>* [[OP1_ADDR]], align 16, !tbaa !2
+// CHECK-NEXT:    [[OP11:%.*]] = load <8 x double>, <8 x double>* [[OP1]], align 16, [[TBAA2]]
+// CHECK-NEXT:    store <8 x double> [[OP11]], <8 x double>* [[OP1_ADDR]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x double>* [[OP1_ADDR]] to <vscale x 2 x double>*
-// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP1]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP1]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.sel.nxv2f64(<vscale x 2 x i1> [[TMP3]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[OP2:%.*]])
-// CHECK-NEXT:    store <vscale x 2 x double> [[TMP4]], <vscale x 2 x double>* [[SAVED_CALL_RVALUE]], align 16, !tbaa !7
+// CHECK-NEXT:    store <vscale x 2 x double> [[TMP4]], <vscale x 2 x double>* [[SAVED_CALL_RVALUE]], align 16, [[TBAA7]]
 // CHECK-NEXT:    [[CASTFIXEDSVE:%.*]] = bitcast <vscale x 2 x double>* [[SAVED_CALL_RVALUE]] to <8 x double>*
-// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x double>, <8 x double>* [[CASTFIXEDSVE]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x double>, <8 x double>* [[CASTFIXEDSVE]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 2 x double>* [[RETVAL_COERCE]] to <8 x double>*
 // CHECK-NEXT:    store <8 x double> [[TMP5]], <8 x double>* [[RETVAL_0__SROA_CAST]], align 16
 // CHECK-NEXT:    [[TMP6:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[RETVAL_COERCE]], align 16
@@ -261,15 +261,15 @@ fixed_float64_t call_float64_fs(svbool_t pg, fixed_float64_t op1, svfloat64_t op
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8>* [[OP1]] to <vscale x 16 x i1>*
 // CHECK-NEXT:    store <vscale x 16 x i1> [[OP1_COERCE:%.*]], <vscale x 16 x i1>* [[TMP0]], align 16
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8>* [[OP1]] to i64*
-// CHECK-NEXT:    [[OP112:%.*]] = load i64, i64* [[TMP1]], align 16, !tbaa !2
+// CHECK-NEXT:    [[OP112:%.*]] = load i64, i64* [[TMP1]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8>* [[OP1_ADDR]] to i64*
-// CHECK-NEXT:    store i64 [[OP112]], i64* [[TMP2]], align 16, !tbaa !2
+// CHECK-NEXT:    store i64 [[OP112]], i64* [[TMP2]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8>* [[OP1_ADDR]] to <vscale x 16 x i1>*
-// CHECK-NEXT:    [[TMP4:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP3]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP4:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP3]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.sel.nxv16i1(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[OP2:%.*]])
-// CHECK-NEXT:    store <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1>* [[SAVED_CALL_RVALUE]], align 16, !tbaa !9
+// CHECK-NEXT:    store <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1>* [[SAVED_CALL_RVALUE]], align 16, [[TBAA9]]
 // CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 16 x i1>* [[SAVED_CALL_RVALUE]] to i64*
-// CHECK-NEXT:    [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP8:%.*]] = bitcast <vscale x 16 x i1>* [[RETVAL_COERCE]] to i64*
 // CHECK-NEXT:    store i64 [[TMP7]], i64* [[TMP8]], align 16
 // CHECK-NEXT:    [[TMP9:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[RETVAL_COERCE]], align 16
@@ -289,9 +289,9 @@ fixed_bool_t call_bool_fs(svbool_t pg, fixed_bool_t op1, svbool_t op2) {
 // CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sel.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[OP1:%.*]], <vscale x 4 x i32> [[OP2:%.*]])
-// CHECK-NEXT:    store <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32>* [[SAVED_CALL_RVALUE]], align 16, !tbaa !5
+// CHECK-NEXT:    store <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32>* [[SAVED_CALL_RVALUE]], align 16, [[TBAA5]]
 // CHECK-NEXT:    [[CASTFIXEDSVE:%.*]] = bitcast <vscale x 4 x i32>* [[SAVED_CALL_RVALUE]] to <16 x i32>*
-// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* [[CASTFIXEDSVE]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* [[CASTFIXEDSVE]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to <16 x i32>*
 // CHECK-NEXT:    store <16 x i32> [[TMP2]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16
 // CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
@@ -307,9 +307,9 @@ fixed_int32_t call_int32_ss(svbool_t pg, svint32_t op1, svint32_t op2) {
 // CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x double>, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.sel.nxv2f64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[OP1:%.*]], <vscale x 2 x double> [[OP2:%.*]])
-// CHECK-NEXT:    store <vscale x 2 x double> [[TMP1]], <vscale x 2 x double>* [[SAVED_CALL_RVALUE]], align 16, !tbaa !7
+// CHECK-NEXT:    store <vscale x 2 x double> [[TMP1]], <vscale x 2 x double>* [[SAVED_CALL_RVALUE]], align 16, [[TBAA7]]
 // CHECK-NEXT:    [[CASTFIXEDSVE:%.*]] = bitcast <vscale x 2 x double>* [[SAVED_CALL_RVALUE]] to <8 x double>*
-// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x double>, <8 x double>* [[CASTFIXEDSVE]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x double>, <8 x double>* [[CASTFIXEDSVE]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 2 x double>* [[RETVAL_COERCE]] to <8 x double>*
 // CHECK-NEXT:    store <8 x double> [[TMP2]], <8 x double>* [[RETVAL_0__SROA_CAST]], align 16
 // CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[RETVAL_COERCE]], align 16
@@ -324,9 +324,9 @@ fixed_float64_t call_float64_ss(svbool_t pg, svfloat64_t op1, svfloat64_t op2) {
 // CHECK-NEXT:    [[SAVED_CALL_RVALUE:%.*]] = alloca <vscale x 16 x i1>, align 16
 // CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 16 x i1>, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.sel.nxv16i1(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i1> [[OP1:%.*]], <vscale x 16 x i1> [[OP2:%.*]])
-// CHECK-NEXT:    store <vscale x 16 x i1> [[TMP0]], <vscale x 16 x i1>* [[SAVED_CALL_RVALUE]], align 16, !tbaa !9
+// CHECK-NEXT:    store <vscale x 16 x i1> [[TMP0]], <vscale x 16 x i1>* [[SAVED_CALL_RVALUE]], align 16, [[TBAA9]]
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 16 x i1>* [[SAVED_CALL_RVALUE]] to i64*
-// CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <vscale x 16 x i1>* [[RETVAL_COERCE]] to i64*
 // CHECK-NEXT:    store i64 [[TMP2]], i64* [[TMP3]], align 16
 // CHECK-NEXT:    [[TMP4:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[RETVAL_COERCE]], align 16
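
A pattern worth noting in the call-test hunks above: fixed vectors (<16 x i32>) and sizeless SVE vectors (<vscale x 4 x i32>) are bridged through memory rather than a direct value bitcast, because the two types have different value representations. A reduced hand-written sketch, assuming the -msve-vector-bits=512 configuration these tests use (so both views are 64 bytes):

define <vscale x 4 x i32> @fixed_to_sizeless(<16 x i32> %v) {
  ; Store through the fixed-width view of the slot, reload through the
  ; scalable view; only the pointer is bitcast, never the vector value.
  %mem = alloca <vscale x 4 x i32>, align 16
  %cast = bitcast <vscale x 4 x i32>* %mem to <16 x i32>*
  store <16 x i32> %v, <16 x i32>* %cast, align 16
  %res = load <vscale x 4 x i32>, <vscale x 4 x i32>* %mem, align 16
  ret <vscale x 4 x i32> %res
}
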
diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c
index 13d8f14f991a8..e65537cead104 100644
--- a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c
+++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c
@@ -9,6 +9,7 @@
 typedef svint32_t fixed_int32_t __attribute__((arm_sve_vector_bits(N)));
 typedef svfloat64_t fixed_float64_t __attribute__((arm_sve_vector_bits(N)));
 typedef svbool_t fixed_bool_t __attribute__((arm_sve_vector_bits(N)));
+typedef int32_t gnu_int32_t __attribute__((vector_size(N / 8)));
 
 // CHECK-LABEL: @to_svint32_t(
 // CHECK-NEXT:  entry:
@@ -16,10 +17,10 @@ typedef svbool_t fixed_bool_t __attribute__((arm_sve_vector_bits(N)));
 // CHECK-NEXT:    [[TYPE_ADDR:%.*]] = alloca <16 x i32>, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i32>* [[TYPE]] to <vscale x 4 x i32>*
 // CHECK-NEXT:    store <vscale x 4 x i32> [[TYPE_COERCE:%.*]], <vscale x 4 x i32>* [[TMP0]], align 16
-// CHECK-NEXT:    [[TYPE1:%.*]] = load <16 x i32>, <16 x i32>* [[TYPE]], align 16, !tbaa !2
-// CHECK-NEXT:    store <16 x i32> [[TYPE1]], <16 x i32>* [[TYPE_ADDR]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TYPE1:%.*]] = load <16 x i32>, <16 x i32>* [[TYPE]], align 16, [[TBAA2:!tbaa !.*]]
+// CHECK-NEXT:    store <16 x i32> [[TYPE1]], <16 x i32>* [[TYPE_ADDR]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[TYPE_ADDR]] to <vscale x 4 x i32>*
-// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP1]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP1]], align 16, [[TBAA2]]
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
 //
 svint32_t to_svint32_t(fixed_int32_t type) {
@@ -30,9 +31,9 @@ svint32_t to_svint32_t(fixed_int32_t type) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TYPE_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
-// CHECK-NEXT:    store <vscale x 4 x i32> [[TYPE:%.*]], <vscale x 4 x i32>* [[TYPE_ADDR]], align 16, !tbaa !5
+// CHECK-NEXT:    store <vscale x 4 x i32> [[TYPE:%.*]], <vscale x 4 x i32>* [[TYPE_ADDR]], align 16, [[TBAA5:!tbaa !.*]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32>* [[TYPE_ADDR]] to <16 x i32>*
-// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP0]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to <16 x i32>*
 // CHECK-NEXT:    store <16 x i32> [[TMP1]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16
 // CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
@@ -48,10 +49,10 @@ fixed_int32_t from_svint32_t(svint32_t type) {
 // CHECK-NEXT:    [[TYPE_ADDR:%.*]] = alloca <8 x double>, align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x double>* [[TYPE]] to <vscale x 2 x double>*
 // CHECK-NEXT:    store <vscale x 2 x double> [[TYPE_COERCE:%.*]], <vscale x 2 x double>* [[TMP0]], align 16
-// CHECK-NEXT:    [[TYPE1:%.*]] = load <8 x double>, <8 x double>* [[TYPE]], align 16, !tbaa !2
-// CHECK-NEXT:    store <8 x double> [[TYPE1]], <8 x double>* [[TYPE_ADDR]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TYPE1:%.*]] = load <8 x double>, <8 x double>* [[TYPE]], align 16, [[TBAA2]]
+// CHECK-NEXT:    store <8 x double> [[TYPE1]], <8 x double>* [[TYPE_ADDR]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x double>* [[TYPE_ADDR]] to <vscale x 2 x double>*
-// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP1]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP1]], align 16, [[TBAA2]]
 // CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
 //
 svfloat64_t to_svfloat64_t(fixed_float64_t type) {
@@ -62,9 +63,9 @@ svfloat64_t to_svfloat64_t(fixed_float64_t type) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TYPE_ADDR:%.*]] = alloca <vscale x 2 x double>, align 16
 // CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x double>, align 16
-// CHECK-NEXT:    store <vscale x 2 x double> [[TYPE:%.*]], <vscale x 2 x double>* [[TYPE_ADDR]], align 16, !tbaa !7
+// CHECK-NEXT:    store <vscale x 2 x double> [[TYPE:%.*]], <vscale x 2 x double>* [[TYPE_ADDR]], align 16, [[TBAA7:!tbaa !.*]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double>* [[TYPE_ADDR]] to <8 x double>*
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[TMP0]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 2 x double>* [[RETVAL_COERCE]] to <8 x double>*
 // CHECK-NEXT:    store <8 x double> [[TMP1]], <8 x double>* [[RETVAL_0__SROA_CAST]], align 16
 // CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[RETVAL_COERCE]], align 16
@@ -81,11 +82,11 @@ fixed_float64_t from_svfloat64_t(svfloat64_t type) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8>* [[TYPE]] to <vscale x 16 x i1>*
 // CHECK-NEXT:    store <vscale x 16 x i1> [[TYPE_COERCE:%.*]], <vscale x 16 x i1>* [[TMP0]], align 16
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8>* [[TYPE]] to i64*
-// CHECK-NEXT:    [[TYPE12:%.*]] = load i64, i64* [[TMP1]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TYPE12:%.*]] = load i64, i64* [[TMP1]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8>* [[TYPE_ADDR]] to i64*
-// CHECK-NEXT:    store i64 [[TYPE12]], i64* [[TMP2]], align 16, !tbaa !2
+// CHECK-NEXT:    store i64 [[TYPE12]], i64* [[TMP2]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8>* [[TYPE_ADDR]] to <vscale x 16 x i1>*
-// CHECK-NEXT:    [[TMP4:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP3]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP4:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP3]], align 16, [[TBAA2]]
 // CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP4]]
 //
 svbool_t to_svbool_t(fixed_bool_t type) {
@@ -96,9 +97,9 @@ svbool_t to_svbool_t(fixed_bool_t type) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TYPE_ADDR:%.*]] = alloca <vscale x 16 x i1>, align 16
 // CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 16 x i1>, align 16
-// CHECK-NEXT:    store <vscale x 16 x i1> [[TYPE:%.*]], <vscale x 16 x i1>* [[TYPE_ADDR]], align 16, !tbaa !9
+// CHECK-NEXT:    store <vscale x 16 x i1> [[TYPE:%.*]], <vscale x 16 x i1>* [[TYPE_ADDR]], align 16, [[TBAA9:!tbaa !.*]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i1>* [[TYPE_ADDR]] to i64*
-// CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[TMP0]], align 16, !tbaa !2
+// CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[TMP0]], align 16, [[TBAA2]]
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 16 x i1>* [[RETVAL_COERCE]] to i64*
 // CHECK-NEXT:    store i64 [[TMP1]], i64* [[TMP2]], align 16
 // CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[RETVAL_COERCE]], align 16
@@ -107,3 +108,55 @@ svbool_t to_svbool_t(fixed_bool_t type) {
 fixed_bool_t from_svbool_t(svbool_t type) {
   return type;
 }
+
+// CHECK-LABEL: @to_svint32_t__from_gnu_int32_t(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TYPE_ADDR:%.*]] = alloca <16 x i32>, align 16
+// CHECK-NEXT:    [[TYPE:%.*]] = load <16 x i32>, <16 x i32>* [[TMP0:%.*]], align 16, [[TBAA2]]
+// CHECK-NEXT:    store <16 x i32> [[TYPE]], <16 x i32>* [[TYPE_ADDR]], align 16, [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[TYPE_ADDR]] to <vscale x 4 x i32>*
+// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP1]], align 16, [[TBAA2]]
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+//
+svint32_t to_svint32_t__from_gnu_int32_t(gnu_int32_t type) {
+  return type;
+}
+
+// CHECK-LABEL: @from_svint32_t__to_gnu_int32_t(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TYPE_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT:    store <vscale x 4 x i32> [[TYPE:%.*]], <vscale x 4 x i32>* [[TYPE_ADDR]], align 16, [[TBAA5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32>* [[TYPE_ADDR]] to <16 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP0]], align 16, [[TBAA2]]
+// CHECK-NEXT:    store <16 x i32> [[TMP1]], <16 x i32>* [[AGG_RESULT:%.*]], align 16, [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+gnu_int32_t from_svint32_t__to_gnu_int32_t(svint32_t type) {
+  return type;
+}
+
+// CHECK-LABEL: @to_fixed_int32_t__from_gnu_int32_t(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT:    [[TYPE:%.*]] = load <16 x i32>, <16 x i32>* [[TMP0:%.*]], align 16, [[TBAA2]]
+// CHECK-NEXT:    [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to <16 x i32>*
+// CHECK-NEXT:    store <16 x i32> [[TYPE]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
+//
+fixed_int32_t to_fixed_int32_t__from_gnu_int32_t(gnu_int32_t type) {
+  return type;
+}
+
+// CHECK-LABEL: @from_fixed_int32_t__to_gnu_int32_t(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TYPE:%.*]] = alloca <16 x i32>, align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i32>* [[TYPE]] to <vscale x 4 x i32>*
+// CHECK-NEXT:    store <vscale x 4 x i32> [[TYPE_COERCE:%.*]], <vscale x 4 x i32>* [[TMP0]], align 16
+// CHECK-NEXT:    [[TYPE1:%.*]] = load <16 x i32>, <16 x i32>* [[TYPE]], align 16, [[TBAA2]]
+// CHECK-NEXT:    store <16 x i32> [[TYPE1]], <16 x i32>* [[AGG_RESULT:%.*]], align 16, [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+gnu_int32_t from_fixed_int32_t__to_gnu_int32_t(fixed_int32_t type) {
+  return type;
+}
diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c
index d567c718000c8..28464ed4af2b7 100644
--- a/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c
+++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c
@@ -22,19 +22,19 @@ fixed_bool_t global_bool;
 // CHECK-128-LABEL: @write_global_i64(
 // CHECK-128-NEXT:  entry:
 // CHECK-128-NEXT:    [[V_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
-// CHECK-128-NEXT:    store <vscale x 2 x i64> [[V:%.*]], <vscale x 2 x i64>* [[V_ADDR]], align 16, !tbaa !2
+// CHECK-128-NEXT:    store <vscale x 2 x i64> [[V:%.*]], <vscale x 2 x i64>* [[V_ADDR]], align 16, [[TBAA2:!tbaa !.*]]
 // CHECK-128-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64>* [[V_ADDR]] to <2 x i64>*
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 16, !tbaa !6
-// CHECK-128-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* @global_i64, align 16, !tbaa !6
+// CHECK-128-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 16, [[TBAA6:!tbaa !.*]]
+// CHECK-128-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* @global_i64, align 16, [[TBAA6]]
 // CHECK-128-NEXT:    ret void
 //
 // CHECK-512-LABEL: @write_global_i64(
 // CHECK-512-NEXT:  entry:
 // CHECK-512-NEXT:    [[V_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
-// CHECK-512-NEXT:    store <vscale x 2 x i64> [[V:%.*]], <vscale x 2 x i64>* [[V_ADDR]], align 16, !tbaa !2
+// CHECK-512-NEXT:    store <vscale x 2 x i64> [[V:%.*]], <vscale x 2 x i64>* [[V_ADDR]], align 16, [[TBAA2:!tbaa !.*]]
 // CHECK-512-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64>* [[V_ADDR]] to <8 x i64>*
-// CHECK-512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[TMP0]], align 16, !tbaa !6
-// CHECK-512-NEXT:    store <8 x i64> [[TMP1]], <8 x i64>* @global_i64, align 16, !tbaa !6
+// CHECK-512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[TMP0]], align 16, [[TBAA6:!tbaa !.*]]
+// CHECK-512-NEXT:    store <8 x i64> [[TMP1]], <8 x i64>* @global_i64, align 16, [[TBAA6]]
 // CHECK-512-NEXT:    ret void
 //
 void write_global_i64(svint64_t v) { global_i64 = v; }
@@ -42,19 +42,19 @@ void write_global_i64(svint64_t v) { global_i64 = v; }
 // CHECK-128-LABEL: @write_global_bf16(
 // CHECK-128-NEXT:  entry:
 // CHECK-128-NEXT:    [[V_ADDR:%.*]] = alloca <vscale x 8 x bfloat>, align 16
-// CHECK-128-NEXT:    store <vscale x 8 x bfloat> [[V:%.*]], <vscale x 8 x bfloat>* [[V_ADDR]], align 16, !tbaa !7
+// CHECK-128-NEXT:    store <vscale x 8 x bfloat> [[V:%.*]], <vscale x 8 x bfloat>* [[V_ADDR]], align 16, [[TBAA7:!tbaa !.*]]
 // CHECK-128-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat>* [[V_ADDR]] to <8 x bfloat>*
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <8 x bfloat>, <8 x bfloat>* [[TMP0]], align 16, !tbaa !6
-// CHECK-128-NEXT:    store <8 x bfloat> [[TMP1]], <8 x bfloat>* @global_bf16, align 16, !tbaa !6
+// CHECK-128-NEXT:    [[TMP1:%.*]] = load <8 x bfloat>, <8 x bfloat>* [[TMP0]], align 16, [[TBAA6]]
+// CHECK-128-NEXT:    store <8 x bfloat> [[TMP1]], <8 x bfloat>* @global_bf16, align 16, [[TBAA6]]
 // CHECK-128-NEXT:    ret void
 //
 // CHECK-512-LABEL: @write_global_bf16(
 // CHECK-512-NEXT:  entry:
 // CHECK-512-NEXT:    [[V_ADDR:%.*]] = alloca <vscale x 8 x bfloat>, align 16
-// CHECK-512-NEXT:    store <vscale x 8 x bfloat> [[V:%.*]], <vscale x 8 x bfloat>* [[V_ADDR]], align 16, !tbaa !7
+// CHECK-512-NEXT:    store <vscale x 8 x bfloat> [[V:%.*]], <vscale x 8 x bfloat>* [[V_ADDR]], align 16, [[TBAA7:!tbaa !.*]]
 // CHECK-512-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat>* [[V_ADDR]] to <32 x bfloat>*
-// CHECK-512-NEXT:    [[TMP1:%.*]] = load <32 x bfloat>, <32 x bfloat>* [[TMP0]], align 16, !tbaa !6
-// CHECK-512-NEXT:    store <32 x bfloat> [[TMP1]], <32 x bfloat>* @global_bf16, align 16, !tbaa !6
+// CHECK-512-NEXT:    [[TMP1:%.*]] = load <32 x bfloat>, <32 x bfloat>* [[TMP0]], align 16, [[TBAA6]]
+// CHECK-512-NEXT:    store <32 x bfloat> [[TMP1]], <32 x bfloat>* @global_bf16, align 16, [[TBAA6]]
 // CHECK-512-NEXT:    ret void
 //
 void write_global_bf16(svbfloat16_t v) { global_bf16 = v; }
@@ -62,19 +62,19 @@ void write_global_bf16(svbfloat16_t v) { global_bf16 = v; }
 // CHECK-128-LABEL: @write_global_bool(
 // CHECK-128-NEXT:  entry:
 // CHECK-128-NEXT:    [[V_ADDR:%.*]] = alloca <vscale x 16 x i1>, align 16
-// CHECK-128-NEXT:    store <vscale x 16 x i1> [[V:%.*]], <vscale x 16 x i1>* [[V_ADDR]], align 16, !tbaa !9
+// CHECK-128-NEXT:    store <vscale x 16 x i1> [[V:%.*]], <vscale x 16 x i1>* [[V_ADDR]], align 16, [[TBAA9:!tbaa !.*]]
 // CHECK-128-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i1>* [[V_ADDR]] to <2 x i8>*
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 16, !tbaa !6
-// CHECK-128-NEXT:    store <2 x i8> [[TMP1]], <2 x i8>* @global_bool, align 2, !tbaa !6
+// CHECK-128-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 16, [[TBAA6]]
+// CHECK-128-NEXT:    store <2 x i8> [[TMP1]], <2 x i8>* @global_bool, align 2, [[TBAA6]]
 // CHECK-128-NEXT:    ret void
 //
 // CHECK-512-LABEL: @write_global_bool(
 // CHECK-512-NEXT:  entry:
 // CHECK-512-NEXT:    [[V_ADDR:%.*]] = alloca <vscale x 16 x i1>, align 16
-// CHECK-512-NEXT:    store <vscale x 16 x i1> [[V:%.*]], <vscale x 16 x i1>* [[V_ADDR]], align 16, !tbaa !9
+// CHECK-512-NEXT:    store <vscale x 16 x i1> [[V:%.*]], <vscale x 16 x i1>* [[V_ADDR]], align 16, [[TBAA9:!tbaa !.*]]
 // CHECK-512-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i1>* [[V_ADDR]] to i64*
-// CHECK-512-NEXT:    [[TMP1:%.*]] = load i64, i64* [[TMP0]], align 16, !tbaa !6
-// CHECK-512-NEXT:    store i64 [[TMP1]], i64* bitcast (<8 x i8>* @global_bool to i64*), align 2, !tbaa !6
+// CHECK-512-NEXT:    [[TMP1:%.*]] = load i64, i64* [[TMP0]], align 16, [[TBAA6]]
+// CHECK-512-NEXT:    store i64 [[TMP1]], i64* bitcast (<8 x i8>* @global_bool to i64*), align 2, [[TBAA6]]
 // CHECK-512-NEXT:    ret void
 //
 void write_global_bool(svbool_t v) { global_bool = v; }
@@ -85,36 +85,36 @@ void write_global_bool(svbool_t v) { global_bool = v; }
 
 // CHECK-128-LABEL: @read_global_i64(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[TMP0:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* bitcast (<2 x i64>* @global_i64 to <vscale x 2 x i64>*), align 16, !tbaa !6
+// CHECK-128-NEXT:    [[TMP0:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* bitcast (<2 x i64>* @global_i64 to <vscale x 2 x i64>*), align 16, [[TBAA6]]
 // CHECK-128-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
 // CHECK-512-LABEL: @read_global_i64(
 // CHECK-512-NEXT:  entry:
-// CHECK-512-NEXT:    [[TMP0:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* bitcast (<8 x i64>* @global_i64 to <vscale x 2 x i64>*), align 16, !tbaa !6
+// CHECK-512-NEXT:    [[TMP0:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* bitcast (<8 x i64>* @global_i64 to <vscale x 2 x i64>*), align 16, [[TBAA6]]
 // CHECK-512-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
 svint64_t read_global_i64() { return global_i64; }
 
 // CHECK-128-LABEL: @read_global_bf16(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[TMP0:%.*]] = load <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* bitcast (<8 x bfloat>* @global_bf16 to <vscale x 8 x bfloat>*), align 16, !tbaa !6
+// CHECK-128-NEXT:    [[TMP0:%.*]] = load <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* bitcast (<8 x bfloat>* @global_bf16 to <vscale x 8 x bfloat>*), align 16, [[TBAA6]]
 // CHECK-128-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
 // CHECK-512-LABEL: @read_global_bf16(
 // CHECK-512-NEXT:  entry:
-// CHECK-512-NEXT:    [[TMP0:%.*]] = load <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* bitcast (<32 x bfloat>* @global_bf16 to <vscale x 8 x bfloat>*), align 16, !tbaa !6
+// CHECK-512-NEXT:    [[TMP0:%.*]] = load <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* bitcast (<32 x bfloat>* @global_bf16 to <vscale x 8 x bfloat>*), align 16, [[TBAA6]]
 // CHECK-512-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
 svbfloat16_t read_global_bf16() { return global_bf16; }
 
 // CHECK-128-LABEL: @read_global_bool(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[TMP0:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* bitcast (<2 x i8>* @global_bool to <vscale x 16 x i1>*), align 2, !tbaa !6
+// CHECK-128-NEXT:    [[TMP0:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* bitcast (<2 x i8>* @global_bool to <vscale x 16 x i1>*), align 2, [[TBAA6]]
 // CHECK-128-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
 //
 // CHECK-512-LABEL: @read_global_bool(
 // CHECK-512-NEXT:  entry:
-// CHECK-512-NEXT:    [[TMP0:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* bitcast (<8 x i8>* @global_bool to <vscale x 16 x i1>*), align 2, !tbaa !6
+// CHECK-512-NEXT:    [[TMP0:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* bitcast (<8 x i8>* @global_bool to <vscale x 16 x i1>*), align 2, [[TBAA6]]
 // CHECK-512-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
 //
 svbool_t read_global_bool() { return global_bool; }
diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-types.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-types.c
index a1cfc514081ea..27366dea3d34d 100644
--- a/clang/test/CodeGen/attr-arm-sve-vector-bits-types.c
+++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-types.c
@@ -4,6 +4,7 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -msve-vector-bits=512 -fallow-half-arguments-and-returns -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-512
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -msve-vector-bits=1024 -fallow-half-arguments-and-returns -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-1024
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -msve-vector-bits=2048 -fallow-half-arguments-and-returns -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-2048
+// RUN: %clang_cc1 -triple aarch64_32-unknown-darwin -target-feature +sve -target-feature +bf16 -msve-vector-bits=512 -fallow-half-arguments-and-returns -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ILP32
 
 #include <arm_sve.h>
 
@@ -579,3 +580,11 @@ void f() {
 // CHECK-2048-NEXT:  %local_arr_f64 = alloca [3 x <32 x double>], align 16
 // CHECK-2048-NEXT:  %local_arr_bf16 = alloca [3 x <128 x bfloat>], align 16
 // CHECK-2048-NEXT:  %local_arr_bool = alloca [3 x <32 x i8>], align 2
+
+//===----------------------------------------------------------------------===//
+// ILP32 ABI
+//===----------------------------------------------------------------------===//
+// CHECK-ILP32: @global_i32 = global <16 x i32> zeroinitializer, align 16
+// CHECK-ILP32: @global_i64 = global <8 x i64> zeroinitializer, align 16
+// CHECK-ILP32: @global_u32 = global <16 x i32> zeroinitializer, align 16
+// CHECK-ILP32: @global_u64 = global <8 x i64> zeroinitializer, align 16
diff --git a/clang/test/CodeGen/avx512-reduceMinMaxIntrin.c b/clang/test/CodeGen/avx512-reduceMinMaxIntrin.c
deleted file mode 100644
index c1eebb6f3bc93..0000000000000
--- a/clang/test/CodeGen/avx512-reduceMinMaxIntrin.c
+++ /dev/null
@@ -1,2537 +0,0 @@
-// RUN: %clang_cc1 -fexperimental-new-pass-manager -ffreestanding %s -O0 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s
-
-#include <immintrin.h>
-
-// CHECK-LABEL: define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__A_ADDR_I7_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I8_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I5_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I6_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__T1_I]], align 64
-// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[__T1_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP3]], <8 x i64>* [[__A_ADDR_I7_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP4]], <8 x i64>* [[__B_ADDR_I8_I]], align 64
-// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I7_I]], align 64
-// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I8_I]], align 64
-// CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt <8 x i64> [[TMP5]], [[TMP6]]
-// CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[TMP6]]
-// CHECK-NEXT:    store <8 x i64> [[TMP8]], <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE1_I:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> [[TMP10]], <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE1_I]], <8 x i64>* [[__T3_I]], align 64
-// CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__T3_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP11]], <8 x i64>* [[__A_ADDR_I5_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP12]], <8 x i64>* [[__B_ADDR_I6_I]], align 64
-// CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I5_I]], align 64
-// CHECK-NEXT:    [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I6_I]], align 64
-// CHECK-NEXT:    [[TMP15:%.*]] = icmp sgt <8 x i64> [[TMP13]], [[TMP14]]
-// CHECK-NEXT:    [[TMP16:%.*]] = select <8 x i1> [[TMP15]], <8 x i64> [[TMP13]], <8 x i64> [[TMP14]]
-// CHECK-NEXT:    store <8 x i64> [[TMP16]], <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP17:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP17]], <8 x i64> [[TMP18]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__T5_I]], align 64
-// CHECK-NEXT:    [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__T5_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP19]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP20]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP23:%.*]] = icmp sgt <8 x i64> [[TMP21]], [[TMP22]]
-// CHECK-NEXT:    [[TMP24:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[TMP21]], <8 x i64> [[TMP22]]
-// CHECK-NEXT:    store <8 x i64> [[TMP24]], <8 x i64>* [[__T6_I]], align 64
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__T6_I]], align 64
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP25]], i32 0
-// CHECK-NEXT:    ret i64 [[VECEXT_I]]
-long long test_mm512_reduce_max_epi64(__m512i __W){
-  return _mm512_reduce_max_epi64(__W);
-}
-
-// CHECK-LABEL: define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__A_ADDR_I7_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I8_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I5_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I6_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__T1_I]], align 64
-// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[__T1_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP3]], <8 x i64>* [[__A_ADDR_I7_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP4]], <8 x i64>* [[__B_ADDR_I8_I]], align 64
-// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I7_I]], align 64
-// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I8_I]], align 64
-// CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt <8 x i64> [[TMP5]], [[TMP6]]
-// CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[TMP6]]
-// CHECK-NEXT:    store <8 x i64> [[TMP8]], <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE1_I:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> [[TMP10]], <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE1_I]], <8 x i64>* [[__T3_I]], align 64
-// CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__T3_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP11]], <8 x i64>* [[__A_ADDR_I5_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP12]], <8 x i64>* [[__B_ADDR_I6_I]], align 64
-// CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I5_I]], align 64
-// CHECK-NEXT:    [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I6_I]], align 64
-// CHECK-NEXT:    [[TMP15:%.*]] = icmp ugt <8 x i64> [[TMP13]], [[TMP14]]
-// CHECK-NEXT:    [[TMP16:%.*]] = select <8 x i1> [[TMP15]], <8 x i64> [[TMP13]], <8 x i64> [[TMP14]]
-// CHECK-NEXT:    store <8 x i64> [[TMP16]], <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP17:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP17]], <8 x i64> [[TMP18]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__T5_I]], align 64
-// CHECK-NEXT:    [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__T5_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP19]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP20]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP23:%.*]] = icmp ugt <8 x i64> [[TMP21]], [[TMP22]]
-// CHECK-NEXT:    [[TMP24:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[TMP21]], <8 x i64> [[TMP22]]
-// CHECK-NEXT:    store <8 x i64> [[TMP24]], <8 x i64>* [[__T6_I]], align 64
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__T6_I]], align 64
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP25]], i32 0
-// CHECK-NEXT:    ret i64 [[VECEXT_I]]
-unsigned long long test_mm512_reduce_max_epu64(__m512i __W){
-  return _mm512_reduce_max_epu64(__W); 
-}
-
-// CHECK-LABEL: define double @test_mm512_reduce_max_pd(<8 x double> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__A_ADDR_I8_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I9_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I10_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__B_ADDR_I11_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x double>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__T7_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__T8_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x double>, align 64
-// CHECK-NEXT:    store <8 x double> [[__W:%.*]], <8 x double>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x double>, <8 x double>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store <8 x double> [[TMP0]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <4 x double> [[EXTRACT_I]], <4 x double>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT2_I:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    store <4 x double> [[EXTRACT2_I]], <4 x double>* [[__T2_I]], align 32
-// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* [[__T2_I]], align 32
-// CHECK-NEXT:    store <4 x double> [[TMP3]], <4 x double>* [[__A_ADDR_I10_I]], align 32
-// CHECK-NEXT:    store <4 x double> [[TMP4]], <4 x double>* [[__B_ADDR_I11_I]], align 32
-// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x double>, <4 x double>* [[__A_ADDR_I10_I]], align 32
-// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x double>, <4 x double>* [[__B_ADDR_I11_I]], align 32
-// CHECK-NEXT:    [[TMP7:%.*]] = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> [[TMP5]], <4 x double> [[TMP6]]) #2
-// CHECK-NEXT:    store <4 x double> [[TMP7]], <4 x double>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x double>, <4 x double>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT4_I:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> undef, <2 x i32> <i32 0, i32 1>
-// CHECK-NEXT:    store <2 x double> [[EXTRACT4_I]], <2 x double>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP9:%.*]] = load <4 x double>, <4 x double>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT5_I:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> undef, <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    store <2 x double> [[EXTRACT5_I]], <2 x double>* [[__T5_I]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP11:%.*]] = load <2 x double>, <2 x double>* [[__T5_I]], align 16
-// CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[__A_ADDR_I8_I]], align 16
-// CHECK-NEXT:    store <2 x double> [[TMP11]], <2 x double>* [[__B_ADDR_I9_I]], align 16
-// CHECK-NEXT:    [[TMP12:%.*]] = load <2 x double>, <2 x double>* [[__A_ADDR_I8_I]], align 16
-// CHECK-NEXT:    [[TMP13:%.*]] = load <2 x double>, <2 x double>* [[__B_ADDR_I9_I]], align 16
-// CHECK-NEXT:    [[TMP14:%.*]] = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> [[TMP12]], <2 x double> [[TMP13]]) #2
-// CHECK-NEXT:    store <2 x double> [[TMP14]], <2 x double>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> [[TMP16]], <2 x i32> <i32 1, i32 0>
-// CHECK-NEXT:    store <2 x double> [[SHUFFLE_I]], <2 x double>* [[__T7_I]], align 16
-// CHECK-NEXT:    [[TMP17:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x double>, <2 x double>* [[__T7_I]], align 16
-// CHECK-NEXT:    store <2 x double> [[TMP17]], <2 x double>* [[__A_ADDR_I_I]], align 16
-// CHECK-NEXT:    store <2 x double> [[TMP18]], <2 x double>* [[__B_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = load <2 x double>, <2 x double>* [[__A_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP20:%.*]] = load <2 x double>, <2 x double>* [[__B_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> [[TMP19]], <2 x double> [[TMP20]]) #2
-// CHECK-NEXT:    store <2 x double> [[TMP21]], <2 x double>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = load <2 x double>, <2 x double>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP22]], i32 0
-// CHECK-NEXT:    ret double [[VECEXT_I]]
-double test_mm512_reduce_max_pd(__m512d __W){
-  return _mm512_reduce_max_pd(__W); 
-}
-
-// CHECK-LABEL: define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__A_ADDR_I7_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I8_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I5_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I6_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__T1_I]], align 64
-// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[__T1_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP3]], <8 x i64>* [[__A_ADDR_I7_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP4]], <8 x i64>* [[__B_ADDR_I8_I]], align 64
-// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I7_I]], align 64
-// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I8_I]], align 64
-// CHECK-NEXT:    [[TMP7:%.*]] = icmp slt <8 x i64> [[TMP5]], [[TMP6]]
-// CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[TMP6]]
-// CHECK-NEXT:    store <8 x i64> [[TMP8]], <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE1_I:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> [[TMP10]], <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE1_I]], <8 x i64>* [[__T3_I]], align 64
-// CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__T3_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP11]], <8 x i64>* [[__A_ADDR_I5_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP12]], <8 x i64>* [[__B_ADDR_I6_I]], align 64
-// CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I5_I]], align 64
-// CHECK-NEXT:    [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I6_I]], align 64
-// CHECK-NEXT:    [[TMP15:%.*]] = icmp slt <8 x i64> [[TMP13]], [[TMP14]]
-// CHECK-NEXT:    [[TMP16:%.*]] = select <8 x i1> [[TMP15]], <8 x i64> [[TMP13]], <8 x i64> [[TMP14]]
-// CHECK-NEXT:    store <8 x i64> [[TMP16]], <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP17:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP17]], <8 x i64> [[TMP18]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__T5_I]], align 64
-// CHECK-NEXT:    [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__T5_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP19]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP20]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP23:%.*]] = icmp slt <8 x i64> [[TMP21]], [[TMP22]]
-// CHECK-NEXT:    [[TMP24:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[TMP21]], <8 x i64> [[TMP22]]
-// CHECK-NEXT:    store <8 x i64> [[TMP24]], <8 x i64>* [[__T6_I]], align 64
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__T6_I]], align 64
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP25]], i32 0
-// CHECK-NEXT:    ret i64 [[VECEXT_I]]
-long long test_mm512_reduce_min_epi64(__m512i __W){
-  return _mm512_reduce_min_epi64(__W);
-}
-
-// CHECK-LABEL: define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__A_ADDR_I7_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I8_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I5_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I6_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__T1_I]], align 64
-// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[__T1_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP3]], <8 x i64>* [[__A_ADDR_I7_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP4]], <8 x i64>* [[__B_ADDR_I8_I]], align 64
-// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I7_I]], align 64
-// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I8_I]], align 64
-// CHECK-NEXT:    [[TMP7:%.*]] = icmp ult <8 x i64> [[TMP5]], [[TMP6]]
-// CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[TMP6]]
-// CHECK-NEXT:    store <8 x i64> [[TMP8]], <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE1_I:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> [[TMP10]], <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE1_I]], <8 x i64>* [[__T3_I]], align 64
-// CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__T3_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP11]], <8 x i64>* [[__A_ADDR_I5_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP12]], <8 x i64>* [[__B_ADDR_I6_I]], align 64
-// CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I5_I]], align 64
-// CHECK-NEXT:    [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I6_I]], align 64
-// CHECK-NEXT:    [[TMP15:%.*]] = icmp ult <8 x i64> [[TMP13]], [[TMP14]]
-// CHECK-NEXT:    [[TMP16:%.*]] = select <8 x i1> [[TMP15]], <8 x i64> [[TMP13]], <8 x i64> [[TMP14]]
-// CHECK-NEXT:    store <8 x i64> [[TMP16]], <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP17:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP17]], <8 x i64> [[TMP18]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__T5_I]], align 64
-// CHECK-NEXT:    [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__T5_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP19]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP20]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP23:%.*]] = icmp ult <8 x i64> [[TMP21]], [[TMP22]]
-// CHECK-NEXT:    [[TMP24:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[TMP21]], <8 x i64> [[TMP22]]
-// CHECK-NEXT:    store <8 x i64> [[TMP24]], <8 x i64>* [[__T6_I]], align 64
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__T6_I]], align 64
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP25]], i32 0
-// CHECK-NEXT:    ret i64 [[VECEXT_I]]
-unsigned long long test_mm512_reduce_min_epu64(__m512i __W){
-  return _mm512_reduce_min_epu64(__W);
-}
-
-// CHECK-LABEL: define double @test_mm512_reduce_min_pd(<8 x double> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__A_ADDR_I8_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I9_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I10_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__B_ADDR_I11_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x double>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__T7_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__T8_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x double>, align 64
-// CHECK-NEXT:    store <8 x double> [[__W:%.*]], <8 x double>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x double>, <8 x double>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store <8 x double> [[TMP0]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <4 x double> [[EXTRACT_I]], <4 x double>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT2_I:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    store <4 x double> [[EXTRACT2_I]], <4 x double>* [[__T2_I]], align 32
-// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* [[__T2_I]], align 32
-// CHECK-NEXT:    store <4 x double> [[TMP3]], <4 x double>* [[__A_ADDR_I10_I]], align 32
-// CHECK-NEXT:    store <4 x double> [[TMP4]], <4 x double>* [[__B_ADDR_I11_I]], align 32
-// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x double>, <4 x double>* [[__A_ADDR_I10_I]], align 32
-// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x double>, <4 x double>* [[__B_ADDR_I11_I]], align 32
-// CHECK-NEXT:    [[TMP7:%.*]] = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> [[TMP5]], <4 x double> [[TMP6]]) #2
-// CHECK-NEXT:    store <4 x double> [[TMP7]], <4 x double>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x double>, <4 x double>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT4_I:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> undef, <2 x i32> <i32 0, i32 1>
-// CHECK-NEXT:    store <2 x double> [[EXTRACT4_I]], <2 x double>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP9:%.*]] = load <4 x double>, <4 x double>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT5_I:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> undef, <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    store <2 x double> [[EXTRACT5_I]], <2 x double>* [[__T5_I]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP11:%.*]] = load <2 x double>, <2 x double>* [[__T5_I]], align 16
-// CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[__A_ADDR_I8_I]], align 16
-// CHECK-NEXT:    store <2 x double> [[TMP11]], <2 x double>* [[__B_ADDR_I9_I]], align 16
-// CHECK-NEXT:    [[TMP12:%.*]] = load <2 x double>, <2 x double>* [[__A_ADDR_I8_I]], align 16
-// CHECK-NEXT:    [[TMP13:%.*]] = load <2 x double>, <2 x double>* [[__B_ADDR_I9_I]], align 16
-// CHECK-NEXT:    [[TMP14:%.*]] = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> [[TMP12]], <2 x double> [[TMP13]]) #2
-// CHECK-NEXT:    store <2 x double> [[TMP14]], <2 x double>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> [[TMP16]], <2 x i32> <i32 1, i32 0>
-// CHECK-NEXT:    store <2 x double> [[SHUFFLE_I]], <2 x double>* [[__T7_I]], align 16
-// CHECK-NEXT:    [[TMP17:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x double>, <2 x double>* [[__T7_I]], align 16
-// CHECK-NEXT:    store <2 x double> [[TMP17]], <2 x double>* [[__A_ADDR_I_I]], align 16
-// CHECK-NEXT:    store <2 x double> [[TMP18]], <2 x double>* [[__B_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = load <2 x double>, <2 x double>* [[__A_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP20:%.*]] = load <2 x double>, <2 x double>* [[__B_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> [[TMP19]], <2 x double> [[TMP20]]) #2
-// CHECK-NEXT:    store <2 x double> [[TMP21]], <2 x double>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = load <2 x double>, <2 x double>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP22]], i32 0
-// CHECK-NEXT:    ret double [[VECEXT_I]]
-double test_mm512_reduce_min_pd(__m512d __W){
-  return _mm512_reduce_min_pd(__W); 
-}
-
-// CHECK-LABEL: define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__D_ADDR_I_I:%.*]] = alloca i64, align 8
-// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__W_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__U_ADDR_I_I:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[__A_ADDR_I11_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I9_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I10_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I7_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I8_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__M_ADDR_I:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__M_ADDR:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    store i8 [[__M:%.*]], i8* [[__M_ADDR]], align 1
-// CHECK-NEXT:    store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[__M_ADDR]], align 1
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store i8 [[TMP0]], i8* [[__M_ADDR_I]], align 1
-// CHECK-NEXT:    store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store i64 -9223372036854775808, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <8 x i64> undef, i64 [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <8 x i64> [[VECINIT_I_I]], i64 [[TMP3]], i32 1
-// CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <8 x i64> [[VECINIT1_I_I]], i64 [[TMP4]], i32 2
-// CHECK-NEXT:    [[TMP5:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <8 x i64> [[VECINIT2_I_I]], i64 [[TMP5]], i32 3
-// CHECK-NEXT:    [[TMP6:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <8 x i64> [[VECINIT3_I_I]], i64 [[TMP6]], i32 4
-// CHECK-NEXT:    [[TMP7:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT5_I_I:%.*]] = insertelement <8 x i64> [[VECINIT4_I_I]], i64 [[TMP7]], i32 5
-// CHECK-NEXT:    [[TMP8:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT6_I_I:%.*]] = insertelement <8 x i64> [[VECINIT5_I_I]], i64 [[TMP8]], i32 6
-// CHECK-NEXT:    [[TMP9:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT7_I_I:%.*]] = insertelement <8 x i64> [[VECINIT6_I_I]], i64 [[TMP9]], i32 7
-// CHECK-NEXT:    store <8 x i64> [[VECINIT7_I_I]], <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP11:%.*]] = load i8, i8* [[__M_ADDR_I]], align 1
-// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP10]], <8 x i64>* [[__W_ADDR_I_I]], align 64
-// CHECK-NEXT:    store i8 [[TMP11]], i8* [[__U_ADDR_I_I]], align 1
-// CHECK-NEXT:    store <8 x i64> [[TMP12]], <8 x i64>* [[__A_ADDR_I11_I]], align 64
-// CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[__U_ADDR_I_I]], align 1
-// CHECK-NEXT:    [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I11_I]], align 64
-// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
-// CHECK-NEXT:    [[TMP17:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP14]], <8 x i64> [[TMP15]]
-// CHECK-NEXT:    store <8 x i64> [[TMP17]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP18]], <8 x i64> [[TMP19]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__T1_I]], align 64
-// CHECK-NEXT:    [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__T1_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP20]], <8 x i64>* [[__A_ADDR_I9_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP21]], <8 x i64>* [[__B_ADDR_I10_I]], align 64
-// CHECK-NEXT:    [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I9_I]], align 64
-// CHECK-NEXT:    [[TMP23:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I10_I]], align 64
-// CHECK-NEXT:    [[TMP24:%.*]] = icmp sgt <8 x i64> [[TMP22]], [[TMP23]]
-// CHECK-NEXT:    [[TMP25:%.*]] = select <8 x i1> [[TMP24]], <8 x i64> [[TMP22]], <8 x i64> [[TMP23]]
-// CHECK-NEXT:    store <8 x i64> [[TMP25]], <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP26:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP26]], <8 x i64> [[TMP27]], <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__T3_I]], align 64
-// CHECK-NEXT:    [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i64>, <8 x i64>* [[__T3_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP28]], <8 x i64>* [[__A_ADDR_I7_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP29]], <8 x i64>* [[__B_ADDR_I8_I]], align 64
-// CHECK-NEXT:    [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I7_I]], align 64
-// CHECK-NEXT:    [[TMP31:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I8_I]], align 64
-// CHECK-NEXT:    [[TMP32:%.*]] = icmp sgt <8 x i64> [[TMP30]], [[TMP31]]
-// CHECK-NEXT:    [[TMP33:%.*]] = select <8 x i1> [[TMP32]], <8 x i64> [[TMP30]], <8 x i64> [[TMP31]]
-// CHECK-NEXT:    store <8 x i64> [[TMP33]], <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP34:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP35:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE5_I:%.*]] = shufflevector <8 x i64> [[TMP34]], <8 x i64> [[TMP35]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE5_I]], <8 x i64>* [[__T5_I]], align 64
-// CHECK-NEXT:    [[TMP36:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP37:%.*]] = load <8 x i64>, <8 x i64>* [[__T5_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP36]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP37]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP38:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP39:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP40:%.*]] = icmp sgt <8 x i64> [[TMP38]], [[TMP39]]
-// CHECK-NEXT:    [[TMP41:%.*]] = select <8 x i1> [[TMP40]], <8 x i64> [[TMP38]], <8 x i64> [[TMP39]]
-// CHECK-NEXT:    store <8 x i64> [[TMP41]], <8 x i64>* [[__T6_I]], align 64
-// CHECK-NEXT:    [[TMP42:%.*]] = load <8 x i64>, <8 x i64>* [[__T6_I]], align 64
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP42]], i32 0
-// CHECK-NEXT:    ret i64 [[VECEXT_I]]
-long long test_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __W){
-  return _mm512_mask_reduce_max_epi64(__M, __W); 
-}
-
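Editor's note for orientation: the deleted checks in this hunk pin down the -O0 expansion of clang's AVX-512 reduction helpers. For the 8 x i64 forms above, the helper folds the vector in log2(8) = 3 steps, pairing lanes 4, 2, and then 1 apart with a shufflevector and keeping the winner with an icmp+select (which is all _mm512_max_epi64 lowers to here). A compilable paraphrase of that ladder, assuming the __builtin_shufflevector pattern the headers of this era used (needs -mavx512f); a sketch, not the verbatim header:

#include <immintrin.h>

// Three-step shuffle/compare ladder; the scalar result lands in lane 0.
static long long reduce_max_epi64_sketch(__m512i __V) {
  __m512i __t1 = (__m512i)__builtin_shufflevector(
      (__v8di)__V, (__v8di)__V, 4, 5, 6, 7, 0, 1, 2, 3);   // swap 256-bit halves
  __m512i __t2 = _mm512_max_epi64(__V, __t1);              // icmp sgt + select
  __m512i __t3 = (__m512i)__builtin_shufflevector(
      (__v8di)__t2, (__v8di)__t2, 2, 3, 0, 1, 6, 7, 4, 5); // swap 128-bit pairs
  __m512i __t4 = _mm512_max_epi64(__t2, __t3);
  __m512i __t5 = (__m512i)__builtin_shufflevector(
      (__v8di)__t4, (__v8di)__t4, 1, 0, 3, 2, 5, 4, 7, 6); // swap adjacent lanes
  __m512i __t6 = _mm512_max_epi64(__t4, __t5);
  return __t6[0];                                          // extractelement ... i32 0
}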
-// CHECK-LABEL: define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__U_ADDR_I_I:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[__A_ADDR_I9_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I7_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I10_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I6_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I8_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__M_ADDR_I:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__M_ADDR:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    store i8 [[__M:%.*]], i8* [[__M_ADDR]], align 1
-// CHECK-NEXT:    store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[__M_ADDR]], align 1
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store i8 [[TMP0]], i8* [[__M_ADDR_I]], align 1
-// CHECK-NEXT:    store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP2:%.*]] = load i8, i8* [[__M_ADDR_I]], align 1
-// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store i8 [[TMP2]], i8* [[__U_ADDR_I_I]], align 1
-// CHECK-NEXT:    store <8 x i64> [[TMP3]], <8 x i64>* [[__A_ADDR_I9_I]], align 64
-// CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[__U_ADDR_I_I]], align 1
-// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I9_I]], align 64
-// CHECK-NEXT:    store <8 x i64> zeroinitializer, <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I_I]], align 64
-// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i64>, <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I_I]], align 64
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-// CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[TMP6]]
-// CHECK-NEXT:    store <8 x i64> [[TMP8]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> [[TMP10]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__T1_I]], align 64
-// CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__T1_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP11]], <8 x i64>* [[__A_ADDR_I7_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP12]], <8 x i64>* [[__B_ADDR_I10_I]], align 64
-// CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I7_I]], align 64
-// CHECK-NEXT:    [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I10_I]], align 64
-// CHECK-NEXT:    [[TMP15:%.*]] = icmp ugt <8 x i64> [[TMP13]], [[TMP14]]
-// CHECK-NEXT:    [[TMP16:%.*]] = select <8 x i1> [[TMP15]], <8 x i64> [[TMP13]], <8 x i64> [[TMP14]]
-// CHECK-NEXT:    store <8 x i64> [[TMP16]], <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP17:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE2_I:%.*]] = shufflevector <8 x i64> [[TMP17]], <8 x i64> [[TMP18]], <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE2_I]], <8 x i64>* [[__T3_I]], align 64
-// CHECK-NEXT:    [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__T3_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP19]], <8 x i64>* [[__A_ADDR_I6_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP20]], <8 x i64>* [[__B_ADDR_I8_I]], align 64
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I6_I]], align 64
-// CHECK-NEXT:    [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I8_I]], align 64
-// CHECK-NEXT:    [[TMP23:%.*]] = icmp ugt <8 x i64> [[TMP21]], [[TMP22]]
-// CHECK-NEXT:    [[TMP24:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[TMP21]], <8 x i64> [[TMP22]]
-// CHECK-NEXT:    store <8 x i64> [[TMP24]], <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP26:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE4_I:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE4_I]], <8 x i64>* [[__T5_I]], align 64
-// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__T5_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP27]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP28]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP31:%.*]] = icmp ugt <8 x i64> [[TMP29]], [[TMP30]]
-// CHECK-NEXT:    [[TMP32:%.*]] = select <8 x i1> [[TMP31]], <8 x i64> [[TMP29]], <8 x i64> [[TMP30]]
-// CHECK-NEXT:    store <8 x i64> [[TMP32]], <8 x i64>* [[__T6_I]], align 64
-// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x i64>, <8 x i64>* [[__T6_I]], align 64
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP33]], i32 0
-// CHECK-NEXT:    ret i64 [[VECEXT_I]]
-unsigned long test_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __W){
-  return _mm512_mask_reduce_max_epu64(__M, __W); 
-}
-
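The unsigned variant just checked differs from the signed one in exactly the two ways the IR makes visible: the compares are icmp ugt, and the masked prologue selects against zeroinitializer, i.e. the i8 mask is bitcast to <8 x i1> and inactive lanes are forced to 0, the identity of unsigned max, so they can never win a compare. That prologue is just a zero-masked move; a one-line sketch (the helper name is mine, the header inlines this directly):

#include <immintrin.h>

// Masked prologue only: inactive lanes -> 0, then the same three
// shuffle/max steps as in the signed sketch above follow unchanged.
static __m512i mask_epu64_max_prologue(__mmask8 __M, __m512i __V) {
  return _mm512_maskz_mov_epi64(__M, __V);  // bitcast i8 -> <8 x i1>; select vs. zero
}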
-// CHECK-LABEL: define double @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__W_ADDR_I_I:%.*]] = alloca double, align 8
-// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK-NEXT:    [[__W2_ADDR_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK-NEXT:    [[__U_ADDR_I_I:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I10_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I11_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__A2_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I12_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__B_ADDR_I13_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__M_ADDR_I:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x double>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__T7_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__T8_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__M_ADDR:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x double>, align 64
-// CHECK-NEXT:    store i8 [[__M:%.*]], i8* [[__M_ADDR]], align 1
-// CHECK-NEXT:    store <8 x double> [[__W:%.*]], <8 x double>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[__M_ADDR]], align 1
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store i8 [[TMP0]], i8* [[__M_ADDR_I]], align 1
-// CHECK-NEXT:    store <8 x double> [[TMP1]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store double 0xFFF0000000000000, double* [[__W_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <8 x double> [[VECINIT_I_I]], double [[TMP3]], i32 1
-// CHECK-NEXT:    [[TMP4:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <8 x double> [[VECINIT1_I_I]], double [[TMP4]], i32 2
-// CHECK-NEXT:    [[TMP5:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <8 x double> [[VECINIT2_I_I]], double [[TMP5]], i32 3
-// CHECK-NEXT:    [[TMP6:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <8 x double> [[VECINIT3_I_I]], double [[TMP6]], i32 4
-// CHECK-NEXT:    [[TMP7:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT5_I_I:%.*]] = insertelement <8 x double> [[VECINIT4_I_I]], double [[TMP7]], i32 5
-// CHECK-NEXT:    [[TMP8:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT6_I_I:%.*]] = insertelement <8 x double> [[VECINIT5_I_I]], double [[TMP8]], i32 6
-// CHECK-NEXT:    [[TMP9:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT7_I_I:%.*]] = insertelement <8 x double> [[VECINIT6_I_I]], double [[TMP9]], i32 7
-// CHECK-NEXT:    store <8 x double> [[VECINIT7_I_I]], <8 x double>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x double>, <8 x double>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP11:%.*]] = load i8, i8* [[__M_ADDR_I]], align 1
-// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store <8 x double> [[TMP10]], <8 x double>* [[__W2_ADDR_I_I]], align 64
-// CHECK-NEXT:    store i8 [[TMP11]], i8* [[__U_ADDR_I_I]], align 1
-// CHECK-NEXT:    store <8 x double> [[TMP12]], <8 x double>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[__U_ADDR_I_I]], align 1
-// CHECK-NEXT:    [[TMP14:%.*]] = load <8 x double>, <8 x double>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x double>, <8 x double>* [[__W2_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
-// CHECK-NEXT:    [[TMP17:%.*]] = select <8 x i1> [[TMP16]], <8 x double> [[TMP14]], <8 x double> [[TMP15]]
-// CHECK-NEXT:    store <8 x double> [[TMP17]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x double> [[TMP18]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <4 x double> [[EXTRACT_I]], <4 x double>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP19:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT4_I:%.*]] = shufflevector <8 x double> [[TMP19]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    store <4 x double> [[EXTRACT4_I]], <4 x double>* [[__T2_I]], align 32
-// CHECK-NEXT:    [[TMP20:%.*]] = load <4 x double>, <4 x double>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x double>, <4 x double>* [[__T2_I]], align 32
-// CHECK-NEXT:    store <4 x double> [[TMP20]], <4 x double>* [[__A_ADDR_I12_I]], align 32
-// CHECK-NEXT:    store <4 x double> [[TMP21]], <4 x double>* [[__B_ADDR_I13_I]], align 32
-// CHECK-NEXT:    [[TMP22:%.*]] = load <4 x double>, <4 x double>* [[__A_ADDR_I12_I]], align 32
-// CHECK-NEXT:    [[TMP23:%.*]] = load <4 x double>, <4 x double>* [[__B_ADDR_I13_I]], align 32
-// CHECK-NEXT:    [[TMP24:%.*]] = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> [[TMP22]], <4 x double> [[TMP23]]) #2
-// CHECK-NEXT:    store <4 x double> [[TMP24]], <4 x double>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x double>, <4 x double>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT6_I:%.*]] = shufflevector <4 x double> [[TMP25]], <4 x double> undef, <2 x i32> <i32 0, i32 1>
-// CHECK-NEXT:    store <2 x double> [[EXTRACT6_I]], <2 x double>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = load <4 x double>, <4 x double>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT7_I:%.*]] = shufflevector <4 x double> [[TMP26]], <4 x double> undef, <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    store <2 x double> [[EXTRACT7_I]], <2 x double>* [[__T5_I]], align 16
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x double>, <2 x double>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = load <2 x double>, <2 x double>* [[__T5_I]], align 16
-// CHECK-NEXT:    store <2 x double> [[TMP27]], <2 x double>* [[__A_ADDR_I10_I]], align 16
-// CHECK-NEXT:    store <2 x double> [[TMP28]], <2 x double>* [[__B_ADDR_I11_I]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = load <2 x double>, <2 x double>* [[__A_ADDR_I10_I]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = load <2 x double>, <2 x double>* [[__B_ADDR_I11_I]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> [[TMP29]], <2 x double> [[TMP30]]) #2
-// CHECK-NEXT:    store <2 x double> [[TMP31]], <2 x double>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP32]], <2 x double> [[TMP33]], <2 x i32> <i32 1, i32 0>
-// CHECK-NEXT:    store <2 x double> [[SHUFFLE_I]], <2 x double>* [[__T7_I]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP35:%.*]] = load <2 x double>, <2 x double>* [[__T7_I]], align 16
-// CHECK-NEXT:    store <2 x double> [[TMP34]], <2 x double>* [[__A2_ADDR_I_I]], align 16
-// CHECK-NEXT:    store <2 x double> [[TMP35]], <2 x double>* [[__B_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP36:%.*]] = load <2 x double>, <2 x double>* [[__A2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = load <2 x double>, <2 x double>* [[__B_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP38:%.*]] = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> [[TMP36]], <2 x double> [[TMP37]]) #2
-// CHECK-NEXT:    store <2 x double> [[TMP38]], <2 x double>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x double>, <2 x double>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP39]], i32 0
-// CHECK-NEXT:    ret double [[VECEXT_I]]
-double test_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __W){
-  return _mm512_mask_reduce_max_pd(__M, __W); 
-}
-
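The double-precision reduction just checked takes a different shape: instead of shuffling a 512-bit register in place, it narrows 512 -> 256 -> 128 bits with subvector extracts, applies vmaxpd at each width, and finishes by swapping the last two lanes. A paraphrase of the unmasked core (the -inf identity prologue is omitted; the intrinsic spelling is my assumption, the header uses __builtin_shufflevector directly):

#include <immintrin.h>

// Narrowing reduction for doubles; each max maps to one of the
// intrinsic calls named in the checks above.
static double reduce_max_pd_sketch(__m512d __V) {
  __m256d __t1 = _mm512_extractf64x4_pd(__V, 0);
  __m256d __t2 = _mm512_extractf64x4_pd(__V, 1);
  __m256d __t3 = _mm256_max_pd(__t1, __t2);      // llvm.x86.avx.max.pd.256
  __m128d __t4 = _mm256_extractf128_pd(__t3, 0);
  __m128d __t5 = _mm256_extractf128_pd(__t3, 1);
  __m128d __t6 = _mm_max_pd(__t4, __t5);         // llvm.x86.sse2.max.pd
  __m128d __t7 = _mm_shuffle_pd(__t6, __t6, 1);  // swap the two remaining lanes
  __m128d __t8 = _mm_max_pd(__t6, __t7);
  return __t8[0];                                // extractelement, lane 0
}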
-// CHECK-LABEL: define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__D_ADDR_I_I:%.*]] = alloca i64, align 8
-// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__W_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__U_ADDR_I_I:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[__A_ADDR_I11_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I9_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I10_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I7_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I8_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__M_ADDR_I:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__M_ADDR:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    store i8 [[__M:%.*]], i8* [[__M_ADDR]], align 1
-// CHECK-NEXT:    store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[__M_ADDR]], align 1
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store i8 [[TMP0]], i8* [[__M_ADDR_I]], align 1
-// CHECK-NEXT:    store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store i64 9223372036854775807, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <8 x i64> undef, i64 [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <8 x i64> [[VECINIT_I_I]], i64 [[TMP3]], i32 1
-// CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <8 x i64> [[VECINIT1_I_I]], i64 [[TMP4]], i32 2
-// CHECK-NEXT:    [[TMP5:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <8 x i64> [[VECINIT2_I_I]], i64 [[TMP5]], i32 3
-// CHECK-NEXT:    [[TMP6:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <8 x i64> [[VECINIT3_I_I]], i64 [[TMP6]], i32 4
-// CHECK-NEXT:    [[TMP7:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT5_I_I:%.*]] = insertelement <8 x i64> [[VECINIT4_I_I]], i64 [[TMP7]], i32 5
-// CHECK-NEXT:    [[TMP8:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT6_I_I:%.*]] = insertelement <8 x i64> [[VECINIT5_I_I]], i64 [[TMP8]], i32 6
-// CHECK-NEXT:    [[TMP9:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT7_I_I:%.*]] = insertelement <8 x i64> [[VECINIT6_I_I]], i64 [[TMP9]], i32 7
-// CHECK-NEXT:    store <8 x i64> [[VECINIT7_I_I]], <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP11:%.*]] = load i8, i8* [[__M_ADDR_I]], align 1
-// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP10]], <8 x i64>* [[__W_ADDR_I_I]], align 64
-// CHECK-NEXT:    store i8 [[TMP11]], i8* [[__U_ADDR_I_I]], align 1
-// CHECK-NEXT:    store <8 x i64> [[TMP12]], <8 x i64>* [[__A_ADDR_I11_I]], align 64
-// CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[__U_ADDR_I_I]], align 1
-// CHECK-NEXT:    [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I11_I]], align 64
-// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
-// CHECK-NEXT:    [[TMP17:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP14]], <8 x i64> [[TMP15]]
-// CHECK-NEXT:    store <8 x i64> [[TMP17]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP18]], <8 x i64> [[TMP19]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__T1_I]], align 64
-// CHECK-NEXT:    [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__T1_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP20]], <8 x i64>* [[__A_ADDR_I9_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP21]], <8 x i64>* [[__B_ADDR_I10_I]], align 64
-// CHECK-NEXT:    [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I9_I]], align 64
-// CHECK-NEXT:    [[TMP23:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I10_I]], align 64
-// CHECK-NEXT:    [[TMP24:%.*]] = icmp slt <8 x i64> [[TMP22]], [[TMP23]]
-// CHECK-NEXT:    [[TMP25:%.*]] = select <8 x i1> [[TMP24]], <8 x i64> [[TMP22]], <8 x i64> [[TMP23]]
-// CHECK-NEXT:    store <8 x i64> [[TMP25]], <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP26:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP26]], <8 x i64> [[TMP27]], <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__T3_I]], align 64
-// CHECK-NEXT:    [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i64>, <8 x i64>* [[__T3_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP28]], <8 x i64>* [[__A_ADDR_I7_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP29]], <8 x i64>* [[__B_ADDR_I8_I]], align 64
-// CHECK-NEXT:    [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I7_I]], align 64
-// CHECK-NEXT:    [[TMP31:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I8_I]], align 64
-// CHECK-NEXT:    [[TMP32:%.*]] = icmp slt <8 x i64> [[TMP30]], [[TMP31]]
-// CHECK-NEXT:    [[TMP33:%.*]] = select <8 x i1> [[TMP32]], <8 x i64> [[TMP30]], <8 x i64> [[TMP31]]
-// CHECK-NEXT:    store <8 x i64> [[TMP33]], <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP34:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP35:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE5_I:%.*]] = shufflevector <8 x i64> [[TMP34]], <8 x i64> [[TMP35]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE5_I]], <8 x i64>* [[__T5_I]], align 64
-// CHECK-NEXT:    [[TMP36:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP37:%.*]] = load <8 x i64>, <8 x i64>* [[__T5_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP36]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP37]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP38:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP39:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP40:%.*]] = icmp slt <8 x i64> [[TMP38]], [[TMP39]]
-// CHECK-NEXT:    [[TMP41:%.*]] = select <8 x i1> [[TMP40]], <8 x i64> [[TMP38]], <8 x i64> [[TMP39]]
-// CHECK-NEXT:    store <8 x i64> [[TMP41]], <8 x i64>* [[__T6_I]], align 64
-// CHECK-NEXT:    [[TMP42:%.*]] = load <8 x i64>, <8 x i64>* [[__T6_I]], align 64
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP42]], i32 0
-// CHECK-NEXT:    ret i64 [[VECEXT_I]]
-long long test_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __W){
-  return _mm512_mask_reduce_min_epi64(__M, __W); 
-}
-
-// CHECK-LABEL: define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__D_ADDR_I_I:%.*]] = alloca i64, align 8
-// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__W_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__U_ADDR_I_I:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[__A_ADDR_I11_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I9_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I10_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I7_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I8_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__M_ADDR_I:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__M_ADDR:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    store i8 [[__M:%.*]], i8* [[__M_ADDR]], align 1
-// CHECK-NEXT:    store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[__M_ADDR]], align 1
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store i8 [[TMP0]], i8* [[__M_ADDR_I]], align 1
-// CHECK-NEXT:    store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store i64 -1, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <8 x i64> undef, i64 [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <8 x i64> [[VECINIT_I_I]], i64 [[TMP3]], i32 1
-// CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <8 x i64> [[VECINIT1_I_I]], i64 [[TMP4]], i32 2
-// CHECK-NEXT:    [[TMP5:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <8 x i64> [[VECINIT2_I_I]], i64 [[TMP5]], i32 3
-// CHECK-NEXT:    [[TMP6:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <8 x i64> [[VECINIT3_I_I]], i64 [[TMP6]], i32 4
-// CHECK-NEXT:    [[TMP7:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT5_I_I:%.*]] = insertelement <8 x i64> [[VECINIT4_I_I]], i64 [[TMP7]], i32 5
-// CHECK-NEXT:    [[TMP8:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT6_I_I:%.*]] = insertelement <8 x i64> [[VECINIT5_I_I]], i64 [[TMP8]], i32 6
-// CHECK-NEXT:    [[TMP9:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT7_I_I:%.*]] = insertelement <8 x i64> [[VECINIT6_I_I]], i64 [[TMP9]], i32 7
-// CHECK-NEXT:    store <8 x i64> [[VECINIT7_I_I]], <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP11:%.*]] = load i8, i8* [[__M_ADDR_I]], align 1
-// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP10]], <8 x i64>* [[__W_ADDR_I_I]], align 64
-// CHECK-NEXT:    store i8 [[TMP11]], i8* [[__U_ADDR_I_I]], align 1
-// CHECK-NEXT:    store <8 x i64> [[TMP12]], <8 x i64>* [[__A_ADDR_I11_I]], align 64
-// CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[__U_ADDR_I_I]], align 1
-// CHECK-NEXT:    [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I11_I]], align 64
-// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
-// CHECK-NEXT:    [[TMP17:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP14]], <8 x i64> [[TMP15]]
-// CHECK-NEXT:    store <8 x i64> [[TMP17]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP18]], <8 x i64> [[TMP19]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__T1_I]], align 64
-// CHECK-NEXT:    [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__T1_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP20]], <8 x i64>* [[__A_ADDR_I9_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP21]], <8 x i64>* [[__B_ADDR_I10_I]], align 64
-// CHECK-NEXT:    [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I9_I]], align 64
-// CHECK-NEXT:    [[TMP23:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I10_I]], align 64
-// CHECK-NEXT:    [[TMP24:%.*]] = icmp ult <8 x i64> [[TMP22]], [[TMP23]]
-// CHECK-NEXT:    [[TMP25:%.*]] = select <8 x i1> [[TMP24]], <8 x i64> [[TMP22]], <8 x i64> [[TMP23]]
-// CHECK-NEXT:    store <8 x i64> [[TMP25]], <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP26:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP26]], <8 x i64> [[TMP27]], <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__T3_I]], align 64
-// CHECK-NEXT:    [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
-// CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i64>, <8 x i64>* [[__T3_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP28]], <8 x i64>* [[__A_ADDR_I7_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP29]], <8 x i64>* [[__B_ADDR_I8_I]], align 64
-// CHECK-NEXT:    [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I7_I]], align 64
-// CHECK-NEXT:    [[TMP31:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I8_I]], align 64
-// CHECK-NEXT:    [[TMP32:%.*]] = icmp ult <8 x i64> [[TMP30]], [[TMP31]]
-// CHECK-NEXT:    [[TMP33:%.*]] = select <8 x i1> [[TMP32]], <8 x i64> [[TMP30]], <8 x i64> [[TMP31]]
-// CHECK-NEXT:    store <8 x i64> [[TMP33]], <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP34:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP35:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[SHUFFLE5_I:%.*]] = shufflevector <8 x i64> [[TMP34]], <8 x i64> [[TMP35]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK-NEXT:    store <8 x i64> [[SHUFFLE5_I]], <8 x i64>* [[__T5_I]], align 64
-// CHECK-NEXT:    [[TMP36:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
-// CHECK-NEXT:    [[TMP37:%.*]] = load <8 x i64>, <8 x i64>* [[__T5_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP36]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP37]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP38:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP39:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP40:%.*]] = icmp ult <8 x i64> [[TMP38]], [[TMP39]]
-// CHECK-NEXT:    [[TMP41:%.*]] = select <8 x i1> [[TMP40]], <8 x i64> [[TMP38]], <8 x i64> [[TMP39]]
-// CHECK-NEXT:    store <8 x i64> [[TMP41]], <8 x i64>* [[__T6_I]], align 64
-// CHECK-NEXT:    [[TMP42:%.*]] = load <8 x i64>, <8 x i64>* [[__T6_I]], align 64
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP42]], i32 0
-// CHECK-NEXT:    ret i64 [[VECEXT_I]]
-long long test_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __W){
-  return _mm512_mask_reduce_min_epu64(__M, __W);
-}
-
-// CHECK-LABEL: define double @test_mm512_mask_reduce_min_pd(i8 zeroext %__M, <8 x double> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__W_ADDR_I_I:%.*]] = alloca double, align 8
-// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK-NEXT:    [[__W2_ADDR_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK-NEXT:    [[__U_ADDR_I_I:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I10_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I11_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__A2_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I12_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__B_ADDR_I13_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__M_ADDR_I:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x double>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <4 x double>, align 32
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__T7_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__T8_I:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[__M_ADDR:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x double>, align 64
-// CHECK-NEXT:    store i8 [[__M:%.*]], i8* [[__M_ADDR]], align 1
-// CHECK-NEXT:    store <8 x double> [[__W:%.*]], <8 x double>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[__M_ADDR]], align 1
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store i8 [[TMP0]], i8* [[__M_ADDR_I]], align 1
-// CHECK-NEXT:    store <8 x double> [[TMP1]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store double 0x7FF0000000000000, double* [[__W_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <8 x double> [[VECINIT_I_I]], double [[TMP3]], i32 1
-// CHECK-NEXT:    [[TMP4:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <8 x double> [[VECINIT1_I_I]], double [[TMP4]], i32 2
-// CHECK-NEXT:    [[TMP5:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <8 x double> [[VECINIT2_I_I]], double [[TMP5]], i32 3
-// CHECK-NEXT:    [[TMP6:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <8 x double> [[VECINIT3_I_I]], double [[TMP6]], i32 4
-// CHECK-NEXT:    [[TMP7:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT5_I_I:%.*]] = insertelement <8 x double> [[VECINIT4_I_I]], double [[TMP7]], i32 5
-// CHECK-NEXT:    [[TMP8:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT6_I_I:%.*]] = insertelement <8 x double> [[VECINIT5_I_I]], double [[TMP8]], i32 6
-// CHECK-NEXT:    [[TMP9:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK-NEXT:    [[VECINIT7_I_I:%.*]] = insertelement <8 x double> [[VECINIT6_I_I]], double [[TMP9]], i32 7
-// CHECK-NEXT:    store <8 x double> [[VECINIT7_I_I]], <8 x double>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x double>, <8 x double>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP11:%.*]] = load i8, i8* [[__M_ADDR_I]], align 1
-// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store <8 x double> [[TMP10]], <8 x double>* [[__W2_ADDR_I_I]], align 64
-// CHECK-NEXT:    store i8 [[TMP11]], i8* [[__U_ADDR_I_I]], align 1
-// CHECK-NEXT:    store <8 x double> [[TMP12]], <8 x double>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[__U_ADDR_I_I]], align 1
-// CHECK-NEXT:    [[TMP14:%.*]] = load <8 x double>, <8 x double>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x double>, <8 x double>* [[__W2_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
-// CHECK-NEXT:    [[TMP17:%.*]] = select <8 x i1> [[TMP16]], <8 x double> [[TMP14]], <8 x double> [[TMP15]]
-// CHECK-NEXT:    store <8 x double> [[TMP17]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x double> [[TMP18]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <4 x double> [[EXTRACT_I]], <4 x double>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP19:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT4_I:%.*]] = shufflevector <8 x double> [[TMP19]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    store <4 x double> [[EXTRACT4_I]], <4 x double>* [[__T2_I]], align 32
-// CHECK-NEXT:    [[TMP20:%.*]] = load <4 x double>, <4 x double>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x double>, <4 x double>* [[__T2_I]], align 32
-// CHECK-NEXT:    store <4 x double> [[TMP20]], <4 x double>* [[__A_ADDR_I12_I]], align 32
-// CHECK-NEXT:    store <4 x double> [[TMP21]], <4 x double>* [[__B_ADDR_I13_I]], align 32
-// CHECK-NEXT:    [[TMP22:%.*]] = load <4 x double>, <4 x double>* [[__A_ADDR_I12_I]], align 32
-// CHECK-NEXT:    [[TMP23:%.*]] = load <4 x double>, <4 x double>* [[__B_ADDR_I13_I]], align 32
-// CHECK-NEXT:    [[TMP24:%.*]] = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> [[TMP22]], <4 x double> [[TMP23]]) #2
-// CHECK-NEXT:    store <4 x double> [[TMP24]], <4 x double>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x double>, <4 x double>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT6_I:%.*]] = shufflevector <4 x double> [[TMP25]], <4 x double> undef, <2 x i32> <i32 0, i32 1>
-// CHECK-NEXT:    store <2 x double> [[EXTRACT6_I]], <2 x double>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = load <4 x double>, <4 x double>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT7_I:%.*]] = shufflevector <4 x double> [[TMP26]], <4 x double> undef, <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    store <2 x double> [[EXTRACT7_I]], <2 x double>* [[__T5_I]], align 16
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x double>, <2 x double>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = load <2 x double>, <2 x double>* [[__T5_I]], align 16
-// CHECK-NEXT:    store <2 x double> [[TMP27]], <2 x double>* [[__A_ADDR_I10_I]], align 16
-// CHECK-NEXT:    store <2 x double> [[TMP28]], <2 x double>* [[__B_ADDR_I11_I]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = load <2 x double>, <2 x double>* [[__A_ADDR_I10_I]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = load <2 x double>, <2 x double>* [[__B_ADDR_I11_I]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> [[TMP29]], <2 x double> [[TMP30]]) #2
-// CHECK-NEXT:    store <2 x double> [[TMP31]], <2 x double>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP32]], <2 x double> [[TMP33]], <2 x i32> <i32 1, i32 0>
-// CHECK-NEXT:    store <2 x double> [[SHUFFLE_I]], <2 x double>* [[__T7_I]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP35:%.*]] = load <2 x double>, <2 x double>* [[__T7_I]], align 16
-// CHECK-NEXT:    store <2 x double> [[TMP34]], <2 x double>* [[__A2_ADDR_I_I]], align 16
-// CHECK-NEXT:    store <2 x double> [[TMP35]], <2 x double>* [[__B_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP36:%.*]] = load <2 x double>, <2 x double>* [[__A2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = load <2 x double>, <2 x double>* [[__B_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP38:%.*]] = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> [[TMP36]], <2 x double> [[TMP37]]) #2
-// CHECK-NEXT:    store <2 x double> [[TMP38]], <2 x double>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x double>, <2 x double>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP39]], i32 0
-// CHECK-NEXT:    ret double [[VECEXT_I]]
-double test_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __W){
-  return _mm512_mask_reduce_min_pd(__M, __W); 
-}
-
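Taken together, the masked variants above also pin down the identity value each reduction splats into inactive lanes before the ladder runs. The constants below are copied from the stores in the checks:

// Identity per masked reduction, as stored in the IR above:
//   max_epu64 : 0                     (zeroinitializer)
//   max_pd    : -infinity             (double 0xFFF0000000000000)
//   min_epi64 : 9223372036854775807   (LLONG_MAX)
//   min_epu64 : all-ones              (i64 -1, i.e. ULLONG_MAX)
//   min_pd    : +infinity             (double 0x7FF0000000000000)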
-// CHECK-LABEL: define i32 @test_mm512_reduce_max_epi32(<8 x i64> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__V1_ADDR_I12_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I13_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V1_ADDR_I10_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I11_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V1_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T7_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T8_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T9_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T10_I:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <4 x i64> [[EXTRACT_I]], <4 x i64>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT2_I:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    store <4 x i64> [[EXTRACT2_I]], <4 x i64>* [[__T2_I]], align 32
-// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[__T2_I]], align 32
-// CHECK-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* [[__A_ADDR_I_I]], align 32
-// CHECK-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* [[__B_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* [[__A_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP5]] to <8 x i32>
-// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, <4 x i64>* [[__B_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP7]] to <8 x i32>
-// CHECK-NEXT:    [[TMP9:%.*]] = icmp sgt <8 x i32> [[TMP6]], [[TMP8]]
-// CHECK-NEXT:    [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i32> [[TMP6]], <8 x i32> [[TMP8]]
-// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i32> [[TMP10]] to <4 x i64>
-// CHECK-NEXT:    store <4 x i64> [[TMP11]], <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT4_I:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> undef, <2 x i32> <i32 0, i32 1>
-// CHECK-NEXT:    store <2 x i64> [[EXTRACT4_I]], <2 x i64>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT5_I:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> undef, <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    store <2 x i64> [[EXTRACT5_I]], <2 x i64>* [[__T5_I]], align 16
-// CHECK-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i64>, <2 x i64>* [[__T5_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* [[__V1_ADDR_I12_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP15]], <2 x i64>* [[__V2_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I12_I]], align 16
-// CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i64> [[TMP16]] to <4 x i32>
-// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = bitcast <2 x i64> [[TMP18]] to <4 x i32>
-// CHECK-NEXT:    [[TMP20:%.*]] = icmp sgt <4 x i32> [[TMP17]], [[TMP19]]
-// CHECK-NEXT:    [[TMP21:%.*]] = select <4 x i1> [[TMP20]], <4 x i32> [[TMP17]], <4 x i32> [[TMP19]]
-// CHECK-NEXT:    [[TMP22:%.*]] = bitcast <4 x i32> [[TMP21]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP22]], <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <2 x i64> [[TMP23]] to <4 x i32>
-// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP25]] to <4 x i32>
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[TMP24]], <4 x i32> [[TMP26]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-// CHECK-NEXT:    [[TMP27:%.*]] = bitcast <4 x i32> [[SHUFFLE_I]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[__T7_I]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = load <2 x i64>, <2 x i64>* [[__T7_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP28]], <2 x i64>* [[__V1_ADDR_I10_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[__V2_ADDR_I11_I]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I10_I]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[TMP30]] to <4 x i32>
-// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I11_I]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <2 x i64> [[TMP32]] to <4 x i32>
-// CHECK-NEXT:    [[TMP34:%.*]] = icmp sgt <4 x i32> [[TMP31]], [[TMP33]]
-// CHECK-NEXT:    [[TMP35:%.*]] = select <4 x i1> [[TMP34]], <4 x i32> [[TMP31]], <4 x i32> [[TMP33]]
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i32> [[TMP35]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP36]], <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP37]] to <4 x i32>
-// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <2 x i64> [[TMP39]] to <4 x i32>
-// CHECK-NEXT:    [[SHUFFLE8_I:%.*]] = shufflevector <4 x i32> [[TMP38]], <4 x i32> [[TMP40]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <4 x i32> [[SHUFFLE8_I]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP41]], <2 x i64>* [[__T9_I]], align 16
-// CHECK-NEXT:    [[TMP42:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = load <2 x i64>, <2 x i64>* [[__T9_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP42]], <2 x i64>* [[__V1_ADDR_I_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP43]], <2 x i64>* [[__V2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP44:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i64> [[TMP44]] to <4 x i32>
-// CHECK-NEXT:    [[TMP46:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i64> [[TMP46]] to <4 x i32>
-// CHECK-NEXT:    [[TMP48:%.*]] = icmp sgt <4 x i32> [[TMP45]], [[TMP47]]
-// CHECK-NEXT:    [[TMP49:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[TMP45]], <4 x i32> [[TMP47]]
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <4 x i32> [[TMP49]] to <2 x i64>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <2 x i64> [[TMP50]] to <4 x i32>
-// CHECK-NEXT:    store <4 x i32> [[TMP51]], <4 x i32>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[TMP52:%.*]] = load <4 x i32>, <4 x i32>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <4 x i32> [[TMP52]], i32 0
-// CHECK-NEXT:    ret i32 [[VECEXT_I]]
-int test_mm512_reduce_max_epi32(__m512i __W){
-  return _mm512_reduce_max_epi32(__W);
-}
-
-// CHECK-LABEL: define i32 @test_mm512_reduce_max_epu32(<8 x i64> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__V1_ADDR_I12_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I13_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V1_ADDR_I10_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I11_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V1_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T7_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T8_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T9_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T10_I:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <4 x i64> [[EXTRACT_I]], <4 x i64>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT2_I:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    store <4 x i64> [[EXTRACT2_I]], <4 x i64>* [[__T2_I]], align 32
-// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[__T2_I]], align 32
-// CHECK-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* [[__A_ADDR_I_I]], align 32
-// CHECK-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* [[__B_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* [[__A_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP5]] to <8 x i32>
-// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, <4 x i64>* [[__B_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP7]] to <8 x i32>
-// CHECK-NEXT:    [[TMP9:%.*]] = icmp ugt <8 x i32> [[TMP6]], [[TMP8]]
-// CHECK-NEXT:    [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i32> [[TMP6]], <8 x i32> [[TMP8]]
-// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i32> [[TMP10]] to <4 x i64>
-// CHECK-NEXT:    store <4 x i64> [[TMP11]], <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT4_I:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> undef, <2 x i32> <i32 0, i32 1>
-// CHECK-NEXT:    store <2 x i64> [[EXTRACT4_I]], <2 x i64>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT5_I:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> undef, <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    store <2 x i64> [[EXTRACT5_I]], <2 x i64>* [[__T5_I]], align 16
-// CHECK-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i64>, <2 x i64>* [[__T5_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* [[__V1_ADDR_I12_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP15]], <2 x i64>* [[__V2_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I12_I]], align 16
-// CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i64> [[TMP16]] to <4 x i32>
-// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = bitcast <2 x i64> [[TMP18]] to <4 x i32>
-// CHECK-NEXT:    [[TMP20:%.*]] = icmp ugt <4 x i32> [[TMP17]], [[TMP19]]
-// CHECK-NEXT:    [[TMP21:%.*]] = select <4 x i1> [[TMP20]], <4 x i32> [[TMP17]], <4 x i32> [[TMP19]]
-// CHECK-NEXT:    [[TMP22:%.*]] = bitcast <4 x i32> [[TMP21]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP22]], <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <2 x i64> [[TMP23]] to <4 x i32>
-// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP25]] to <4 x i32>
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[TMP24]], <4 x i32> [[TMP26]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-// CHECK-NEXT:    [[TMP27:%.*]] = bitcast <4 x i32> [[SHUFFLE_I]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[__T7_I]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = load <2 x i64>, <2 x i64>* [[__T7_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP28]], <2 x i64>* [[__V1_ADDR_I10_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[__V2_ADDR_I11_I]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I10_I]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[TMP30]] to <4 x i32>
-// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I11_I]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <2 x i64> [[TMP32]] to <4 x i32>
-// CHECK-NEXT:    [[TMP34:%.*]] = icmp ugt <4 x i32> [[TMP31]], [[TMP33]]
-// CHECK-NEXT:    [[TMP35:%.*]] = select <4 x i1> [[TMP34]], <4 x i32> [[TMP31]], <4 x i32> [[TMP33]]
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i32> [[TMP35]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP36]], <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP37]] to <4 x i32>
-// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <2 x i64> [[TMP39]] to <4 x i32>
-// CHECK-NEXT:    [[SHUFFLE8_I:%.*]] = shufflevector <4 x i32> [[TMP38]], <4 x i32> [[TMP40]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <4 x i32> [[SHUFFLE8_I]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP41]], <2 x i64>* [[__T9_I]], align 16
-// CHECK-NEXT:    [[TMP42:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = load <2 x i64>, <2 x i64>* [[__T9_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP42]], <2 x i64>* [[__V1_ADDR_I_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP43]], <2 x i64>* [[__V2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP44:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i64> [[TMP44]] to <4 x i32>
-// CHECK-NEXT:    [[TMP46:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i64> [[TMP46]] to <4 x i32>
-// CHECK-NEXT:    [[TMP48:%.*]] = icmp ugt <4 x i32> [[TMP45]], [[TMP47]]
-// CHECK-NEXT:    [[TMP49:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[TMP45]], <4 x i32> [[TMP47]]
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <4 x i32> [[TMP49]] to <2 x i64>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <2 x i64> [[TMP50]] to <4 x i32>
-// CHECK-NEXT:    store <4 x i32> [[TMP51]], <4 x i32>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[TMP52:%.*]] = load <4 x i32>, <4 x i32>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <4 x i32> [[TMP52]], i32 0
-// CHECK-NEXT:    ret i32 [[VECEXT_I]]
-unsigned int test_mm512_reduce_max_epu32(__m512i __W){
-  return _mm512_reduce_max_epu32(__W); 
-}
-
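The CHECK lines above trace the unmasked unsigned-max reduction as a halve-and-combine tree: the 512-bit vector is split into 256-bit halves, the halves are merged with an element-wise icmp ugt + select, and the same step repeats at 128 bits and then across the four remaining lanes via shuffles until element 0 holds the result. A minimal scalar sketch of that pattern (illustrative only; names like reduce_max_u32 are hypothetical, and the real intrinsic operates on __m512i/__m256i/__m128i values):

#include <stdint.h>

/* Scalar model of the halve-and-max tree the IR above checks for. */
static uint32_t reduce_max_u32(const uint32_t v[16]) {
  uint32_t t[16];
  for (int i = 0; i < 16; ++i)
    t[i] = v[i];
  /* Fold the upper half into the lower half, mirroring the
     512 -> 256 -> 128 -> cross-lane shuffle steps. */
  for (int width = 8; width >= 1; width /= 2)
    for (int i = 0; i < width; ++i)
      t[i] = t[i] > t[i + width] ? t[i] : t[i + width]; /* icmp ugt + select */
  return t[0];
}

The signed and min variants later in this file differ only in the comparison predicate (icmp sgt, icmp slt, icmp ult).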
-// CHECK-LABEL: define float @test_mm512_reduce_max_ps(<16 x float> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__A_ADDR_I12_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I13_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I10_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I11_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I14_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__B_ADDR_I15_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <16 x float>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T7_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T8_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T9_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T10_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <16 x float>, align 64
-// CHECK-NEXT:    store <16 x float> [[__W:%.*]], <16 x float>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, <16 x float>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store <16 x float> [[TMP0]], <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x float> [[TMP1]] to <8 x double>
-// CHECK-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x double> [[EXTRACT_I]] to <8 x float>
-// CHECK-NEXT:    store <8 x float> [[TMP3]], <8 x float>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x float> [[TMP4]] to <8 x double>
-// CHECK-NEXT:    [[EXTRACT2_I:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x double> [[EXTRACT2_I]] to <8 x float>
-// CHECK-NEXT:    store <8 x float> [[TMP6]], <8 x float>* [[__T2_I]], align 32
-// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x float>, <8 x float>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x float>, <8 x float>* [[__T2_I]], align 32
-// CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[__A_ADDR_I14_I]], align 32
-// CHECK-NEXT:    store <8 x float> [[TMP8]], <8 x float>* [[__B_ADDR_I15_I]], align 32
-// CHECK-NEXT:    [[TMP9:%.*]] = load <8 x float>, <8 x float>* [[__A_ADDR_I14_I]], align 32
-// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x float>, <8 x float>* [[__B_ADDR_I15_I]], align 32
-// CHECK-NEXT:    [[TMP11:%.*]] = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> [[TMP9]], <8 x float> [[TMP10]]) #2
-// CHECK-NEXT:    store <8 x float> [[TMP11]], <8 x float>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x float>, <8 x float>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT4_I:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <4 x float> [[EXTRACT4_I]], <4 x float>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP13:%.*]] = load <8 x float>, <8 x float>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT5_I:%.*]] = shufflevector <8 x float> [[TMP13]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    store <4 x float> [[EXTRACT5_I]], <4 x float>* [[__T5_I]], align 16
-// CHECK-NEXT:    [[TMP14:%.*]] = load <4 x float>, <4 x float>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP15:%.*]] = load <4 x float>, <4 x float>* [[__T5_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP14]], <4 x float>* [[__A_ADDR_I12_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP15]], <4 x float>* [[__B_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I12_I]], align 16
-// CHECK-NEXT:    [[TMP17:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP18:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[TMP16]], <4 x float> [[TMP17]]) #2
-// CHECK-NEXT:    store <4 x float> [[TMP18]], <4 x float>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP20:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-// CHECK-NEXT:    store <4 x float> [[SHUFFLE_I]], <4 x float>* [[__T7_I]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = load <4 x float>, <4 x float>* [[__T7_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP21]], <4 x float>* [[__A_ADDR_I10_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP22]], <4 x float>* [[__B_ADDR_I11_I]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I10_I]], align 16
-// CHECK-NEXT:    [[TMP24:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I11_I]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[TMP23]], <4 x float> [[TMP24]]) #2
-// CHECK-NEXT:    store <4 x float> [[TMP25]], <4 x float>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[SHUFFLE8_I:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> [[TMP27]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK-NEXT:    store <4 x float> [[SHUFFLE8_I]], <4 x float>* [[__T9_I]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = load <4 x float>, <4 x float>* [[__T9_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP28]], <4 x float>* [[__A_ADDR_I_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP29]], <4 x float>* [[__B_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP32:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[TMP30]], <4 x float> [[TMP31]]) #2
-// CHECK-NEXT:    store <4 x float> [[TMP32]], <4 x float>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x float>, <4 x float>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <4 x float> [[TMP33]], i32 0
-// CHECK-NEXT:    ret float [[VECEXT_I]]
-float test_mm512_reduce_max_ps(__m512 __W){
-  return _mm512_reduce_max_ps(__W); 
-}
-
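Unlike the integer reductions, the float combine steps are not open-coded as compare-plus-select; each one lowers to a target intrinsic call (@llvm.x86.avx.max.ps.256 at 256 bits, @llvm.x86.sse.max.ps at 128 bits). These map to the MAXPS family, whose semantics are not a plain IEEE maximum: when either operand is NaN, or when comparing +0.0 against -0.0, the second operand is returned. A one-line scalar model of a single lane that reproduces that rule (a sketch with a hypothetical name, not clang's implementation):

/* Models one MAXPS lane: unordered (NaN) and equal-zero compares
   fall through to the second operand, matching the x86 definition. */
static float maxps_lane(float a, float b) {
  return a > b ? a : b;
}

This is why operand order matters in the CHECK lines above, and why the reduced result can differ from fmaxf on NaN inputs.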
-// CHECK-LABEL: define i32 @test_mm512_reduce_min_epi32(<8 x i64> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__V1_ADDR_I12_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I13_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V1_ADDR_I10_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I11_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V1_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T7_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T8_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T9_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T10_I:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <4 x i64> [[EXTRACT_I]], <4 x i64>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT2_I:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    store <4 x i64> [[EXTRACT2_I]], <4 x i64>* [[__T2_I]], align 32
-// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[__T2_I]], align 32
-// CHECK-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* [[__A_ADDR_I_I]], align 32
-// CHECK-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* [[__B_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* [[__A_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP5]] to <8 x i32>
-// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, <4 x i64>* [[__B_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP7]] to <8 x i32>
-// CHECK-NEXT:    [[TMP9:%.*]] = icmp slt <8 x i32> [[TMP6]], [[TMP8]]
-// CHECK-NEXT:    [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i32> [[TMP6]], <8 x i32> [[TMP8]]
-// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i32> [[TMP10]] to <4 x i64>
-// CHECK-NEXT:    store <4 x i64> [[TMP11]], <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT4_I:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> undef, <2 x i32> <i32 0, i32 1>
-// CHECK-NEXT:    store <2 x i64> [[EXTRACT4_I]], <2 x i64>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT5_I:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> undef, <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    store <2 x i64> [[EXTRACT5_I]], <2 x i64>* [[__T5_I]], align 16
-// CHECK-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i64>, <2 x i64>* [[__T5_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* [[__V1_ADDR_I12_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP15]], <2 x i64>* [[__V2_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I12_I]], align 16
-// CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i64> [[TMP16]] to <4 x i32>
-// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = bitcast <2 x i64> [[TMP18]] to <4 x i32>
-// CHECK-NEXT:    [[TMP20:%.*]] = icmp slt <4 x i32> [[TMP17]], [[TMP19]]
-// CHECK-NEXT:    [[TMP21:%.*]] = select <4 x i1> [[TMP20]], <4 x i32> [[TMP17]], <4 x i32> [[TMP19]]
-// CHECK-NEXT:    [[TMP22:%.*]] = bitcast <4 x i32> [[TMP21]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP22]], <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <2 x i64> [[TMP23]] to <4 x i32>
-// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP25]] to <4 x i32>
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[TMP24]], <4 x i32> [[TMP26]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-// CHECK-NEXT:    [[TMP27:%.*]] = bitcast <4 x i32> [[SHUFFLE_I]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[__T7_I]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = load <2 x i64>, <2 x i64>* [[__T7_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP28]], <2 x i64>* [[__V1_ADDR_I10_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[__V2_ADDR_I11_I]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I10_I]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[TMP30]] to <4 x i32>
-// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I11_I]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <2 x i64> [[TMP32]] to <4 x i32>
-// CHECK-NEXT:    [[TMP34:%.*]] = icmp slt <4 x i32> [[TMP31]], [[TMP33]]
-// CHECK-NEXT:    [[TMP35:%.*]] = select <4 x i1> [[TMP34]], <4 x i32> [[TMP31]], <4 x i32> [[TMP33]]
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i32> [[TMP35]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP36]], <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP37]] to <4 x i32>
-// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <2 x i64> [[TMP39]] to <4 x i32>
-// CHECK-NEXT:    [[SHUFFLE8_I:%.*]] = shufflevector <4 x i32> [[TMP38]], <4 x i32> [[TMP40]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <4 x i32> [[SHUFFLE8_I]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP41]], <2 x i64>* [[__T9_I]], align 16
-// CHECK-NEXT:    [[TMP42:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = load <2 x i64>, <2 x i64>* [[__T9_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP42]], <2 x i64>* [[__V1_ADDR_I_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP43]], <2 x i64>* [[__V2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP44:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i64> [[TMP44]] to <4 x i32>
-// CHECK-NEXT:    [[TMP46:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i64> [[TMP46]] to <4 x i32>
-// CHECK-NEXT:    [[TMP48:%.*]] = icmp slt <4 x i32> [[TMP45]], [[TMP47]]
-// CHECK-NEXT:    [[TMP49:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[TMP45]], <4 x i32> [[TMP47]]
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <4 x i32> [[TMP49]] to <2 x i64>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <2 x i64> [[TMP50]] to <4 x i32>
-// CHECK-NEXT:    store <4 x i32> [[TMP51]], <4 x i32>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[TMP52:%.*]] = load <4 x i32>, <4 x i32>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <4 x i32> [[TMP52]], i32 0
-// CHECK-NEXT:    ret i32 [[VECEXT_I]]
-int test_mm512_reduce_min_epi32(__m512i __W){
-  return _mm512_reduce_min_epi32(__W);
-}
-
-// CHECK-LABEL: define i32 @test_mm512_reduce_min_epu32(<8 x i64> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__V1_ADDR_I12_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I13_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V1_ADDR_I10_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I11_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V1_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T7_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T8_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T9_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T10_I:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <4 x i64> [[EXTRACT_I]], <4 x i64>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT2_I:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    store <4 x i64> [[EXTRACT2_I]], <4 x i64>* [[__T2_I]], align 32
-// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[__T2_I]], align 32
-// CHECK-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* [[__A_ADDR_I_I]], align 32
-// CHECK-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* [[__B_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* [[__A_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP5]] to <8 x i32>
-// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, <4 x i64>* [[__B_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP7]] to <8 x i32>
-// CHECK-NEXT:    [[TMP9:%.*]] = icmp ult <8 x i32> [[TMP6]], [[TMP8]]
-// CHECK-NEXT:    [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i32> [[TMP6]], <8 x i32> [[TMP8]]
-// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i32> [[TMP10]] to <4 x i64>
-// CHECK-NEXT:    store <4 x i64> [[TMP11]], <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT4_I:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> undef, <2 x i32> <i32 0, i32 1>
-// CHECK-NEXT:    store <2 x i64> [[EXTRACT4_I]], <2 x i64>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT5_I:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> undef, <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    store <2 x i64> [[EXTRACT5_I]], <2 x i64>* [[__T5_I]], align 16
-// CHECK-NEXT:    [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i64>, <2 x i64>* [[__T5_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* [[__V1_ADDR_I12_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP15]], <2 x i64>* [[__V2_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I12_I]], align 16
-// CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i64> [[TMP16]] to <4 x i32>
-// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = bitcast <2 x i64> [[TMP18]] to <4 x i32>
-// CHECK-NEXT:    [[TMP20:%.*]] = icmp ult <4 x i32> [[TMP17]], [[TMP19]]
-// CHECK-NEXT:    [[TMP21:%.*]] = select <4 x i1> [[TMP20]], <4 x i32> [[TMP17]], <4 x i32> [[TMP19]]
-// CHECK-NEXT:    [[TMP22:%.*]] = bitcast <4 x i32> [[TMP21]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP22]], <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <2 x i64> [[TMP23]] to <4 x i32>
-// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP25]] to <4 x i32>
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[TMP24]], <4 x i32> [[TMP26]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-// CHECK-NEXT:    [[TMP27:%.*]] = bitcast <4 x i32> [[SHUFFLE_I]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP27]], <2 x i64>* [[__T7_I]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = load <2 x i64>, <2 x i64>* [[__T7_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP28]], <2 x i64>* [[__V1_ADDR_I10_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP29]], <2 x i64>* [[__V2_ADDR_I11_I]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I10_I]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[TMP30]] to <4 x i32>
-// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I11_I]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <2 x i64> [[TMP32]] to <4 x i32>
-// CHECK-NEXT:    [[TMP34:%.*]] = icmp ult <4 x i32> [[TMP31]], [[TMP33]]
-// CHECK-NEXT:    [[TMP35:%.*]] = select <4 x i1> [[TMP34]], <4 x i32> [[TMP31]], <4 x i32> [[TMP33]]
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i32> [[TMP35]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP36]], <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP37]] to <4 x i32>
-// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <2 x i64> [[TMP39]] to <4 x i32>
-// CHECK-NEXT:    [[SHUFFLE8_I:%.*]] = shufflevector <4 x i32> [[TMP38]], <4 x i32> [[TMP40]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <4 x i32> [[SHUFFLE8_I]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP41]], <2 x i64>* [[__T9_I]], align 16
-// CHECK-NEXT:    [[TMP42:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = load <2 x i64>, <2 x i64>* [[__T9_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP42]], <2 x i64>* [[__V1_ADDR_I_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP43]], <2 x i64>* [[__V2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP44:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i64> [[TMP44]] to <4 x i32>
-// CHECK-NEXT:    [[TMP46:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i64> [[TMP46]] to <4 x i32>
-// CHECK-NEXT:    [[TMP48:%.*]] = icmp ult <4 x i32> [[TMP45]], [[TMP47]]
-// CHECK-NEXT:    [[TMP49:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[TMP45]], <4 x i32> [[TMP47]]
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <4 x i32> [[TMP49]] to <2 x i64>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <2 x i64> [[TMP50]] to <4 x i32>
-// CHECK-NEXT:    store <4 x i32> [[TMP51]], <4 x i32>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[TMP52:%.*]] = load <4 x i32>, <4 x i32>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <4 x i32> [[TMP52]], i32 0
-// CHECK-NEXT:    ret i32 [[VECEXT_I]]
-unsigned int test_mm512_reduce_min_epu32(__m512i __W){
-  return _mm512_reduce_min_epu32(__W); 
-}
-
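For reference, a small usage example of the reduction intrinsics exercised by these tests (hypothetical values; requires an AVX-512 target, e.g. -mavx512f):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m512i v = _mm512_set1_epi32(7);            /* sixteen lanes of 7 */
  printf("%d\n", _mm512_reduce_min_epi32(v));  /* prints 7 */
  printf("%u\n", _mm512_reduce_min_epu32(v));  /* prints 7 */
  return 0;
}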
-// CHECK-LABEL: define float @test_mm512_reduce_min_ps(<16 x float> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__A_ADDR_I12_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I13_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I10_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I11_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I14_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__B_ADDR_I15_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <16 x float>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T7_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T8_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T9_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T10_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <16 x float>, align 64
-// CHECK-NEXT:    store <16 x float> [[__W:%.*]], <16 x float>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, <16 x float>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store <16 x float> [[TMP0]], <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x float> [[TMP1]] to <8 x double>
-// CHECK-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x double> [[EXTRACT_I]] to <8 x float>
-// CHECK-NEXT:    store <8 x float> [[TMP3]], <8 x float>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x float> [[TMP4]] to <8 x double>
-// CHECK-NEXT:    [[EXTRACT2_I:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x double> [[EXTRACT2_I]] to <8 x float>
-// CHECK-NEXT:    store <8 x float> [[TMP6]], <8 x float>* [[__T2_I]], align 32
-// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x float>, <8 x float>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x float>, <8 x float>* [[__T2_I]], align 32
-// CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[__A_ADDR_I14_I]], align 32
-// CHECK-NEXT:    store <8 x float> [[TMP8]], <8 x float>* [[__B_ADDR_I15_I]], align 32
-// CHECK-NEXT:    [[TMP9:%.*]] = load <8 x float>, <8 x float>* [[__A_ADDR_I14_I]], align 32
-// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x float>, <8 x float>* [[__B_ADDR_I15_I]], align 32
-// CHECK-NEXT:    [[TMP11:%.*]] = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> [[TMP9]], <8 x float> [[TMP10]]) #2
-// CHECK-NEXT:    store <8 x float> [[TMP11]], <8 x float>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x float>, <8 x float>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT4_I:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <4 x float> [[EXTRACT4_I]], <4 x float>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP13:%.*]] = load <8 x float>, <8 x float>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT5_I:%.*]] = shufflevector <8 x float> [[TMP13]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    store <4 x float> [[EXTRACT5_I]], <4 x float>* [[__T5_I]], align 16
-// CHECK-NEXT:    [[TMP14:%.*]] = load <4 x float>, <4 x float>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP15:%.*]] = load <4 x float>, <4 x float>* [[__T5_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP14]], <4 x float>* [[__A_ADDR_I12_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP15]], <4 x float>* [[__B_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I12_I]], align 16
-// CHECK-NEXT:    [[TMP17:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP18:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[TMP16]], <4 x float> [[TMP17]]) #2
-// CHECK-NEXT:    store <4 x float> [[TMP18]], <4 x float>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP20:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-// CHECK-NEXT:    store <4 x float> [[SHUFFLE_I]], <4 x float>* [[__T7_I]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = load <4 x float>, <4 x float>* [[__T7_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP21]], <4 x float>* [[__A_ADDR_I10_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP22]], <4 x float>* [[__B_ADDR_I11_I]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I10_I]], align 16
-// CHECK-NEXT:    [[TMP24:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I11_I]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[TMP23]], <4 x float> [[TMP24]]) #2
-// CHECK-NEXT:    store <4 x float> [[TMP25]], <4 x float>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[SHUFFLE8_I:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> [[TMP27]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK-NEXT:    store <4 x float> [[SHUFFLE8_I]], <4 x float>* [[__T9_I]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = load <4 x float>, <4 x float>* [[__T9_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP28]], <4 x float>* [[__A_ADDR_I_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP29]], <4 x float>* [[__B_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP32:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[TMP30]], <4 x float> [[TMP31]]) #2
-// CHECK-NEXT:    store <4 x float> [[TMP32]], <4 x float>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x float>, <4 x float>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <4 x float> [[TMP33]], i32 0
-// CHECK-NEXT:    ret float [[VECEXT_I]]
-float test_mm512_reduce_min_ps(__m512 __W){
-  return _mm512_reduce_min_ps(__W); 
-}
-
-// CHECK-LABEL: define i32 @test_mm512_mask_reduce_max_epi32(i16 zeroext %__M, <8 x i64> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__S_ADDR_I_I:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x i32>, align 64
-// CHECK-NEXT:    [[__W_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__U_ADDR_I_I:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__A2_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__V1_ADDR_I14_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I15_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V1_ADDR_I12_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I13_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V1_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__M_ADDR_I:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T7_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T8_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T9_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T10_I:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT:    [[__M_ADDR:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    store i16 [[__M:%.*]], i16* [[__M_ADDR]], align 2
-// CHECK-NEXT:    store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load i16, i16* [[__M_ADDR]], align 2
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store i16 [[TMP0]], i16* [[__M_ADDR_I]], align 2
-// CHECK-NEXT:    store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store i32 -2147483648, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <16 x i32> undef, i32 [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <16 x i32> [[VECINIT_I_I]], i32 [[TMP3]], i32 1
-// CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <16 x i32> [[VECINIT1_I_I]], i32 [[TMP4]], i32 2
-// CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <16 x i32> [[VECINIT2_I_I]], i32 [[TMP5]], i32 3
-// CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <16 x i32> [[VECINIT3_I_I]], i32 [[TMP6]], i32 4
-// CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT5_I_I:%.*]] = insertelement <16 x i32> [[VECINIT4_I_I]], i32 [[TMP7]], i32 5
-// CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT6_I_I:%.*]] = insertelement <16 x i32> [[VECINIT5_I_I]], i32 [[TMP8]], i32 6
-// CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT7_I_I:%.*]] = insertelement <16 x i32> [[VECINIT6_I_I]], i32 [[TMP9]], i32 7
-// CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT8_I_I:%.*]] = insertelement <16 x i32> [[VECINIT7_I_I]], i32 [[TMP10]], i32 8
-// CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT9_I_I:%.*]] = insertelement <16 x i32> [[VECINIT8_I_I]], i32 [[TMP11]], i32 9
-// CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT10_I_I:%.*]] = insertelement <16 x i32> [[VECINIT9_I_I]], i32 [[TMP12]], i32 10
-// CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT11_I_I:%.*]] = insertelement <16 x i32> [[VECINIT10_I_I]], i32 [[TMP13]], i32 11
-// CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT12_I_I:%.*]] = insertelement <16 x i32> [[VECINIT11_I_I]], i32 [[TMP14]], i32 12
-// CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT13_I_I:%.*]] = insertelement <16 x i32> [[VECINIT12_I_I]], i32 [[TMP15]], i32 13
-// CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT14_I_I:%.*]] = insertelement <16 x i32> [[VECINIT13_I_I]], i32 [[TMP16]], i32 14
-// CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT15_I_I:%.*]] = insertelement <16 x i32> [[VECINIT14_I_I]], i32 [[TMP17]], i32 15
-// CHECK-NEXT:    store <16 x i32> [[VECINIT15_I_I]], <16 x i32>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP18:%.*]] = load <16 x i32>, <16 x i32>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP19:%.*]] = bitcast <16 x i32> [[TMP18]] to <8 x i64>
-// CHECK-NEXT:    [[TMP20:%.*]] = load i16, i16* [[__M_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP19]], <8 x i64>* [[__W_ADDR_I_I]], align 64
-// CHECK-NEXT:    store i16 [[TMP20]], i16* [[__U_ADDR_I_I]], align 2
-// CHECK-NEXT:    store <8 x i64> [[TMP21]], <8 x i64>* [[__A2_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP22:%.*]] = load i16, i16* [[__U_ADDR_I_I]], align 2
-// CHECK-NEXT:    [[TMP23:%.*]] = load <8 x i64>, <8 x i64>* [[__A2_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x i64> [[TMP23]] to <16 x i32>
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i64> [[TMP25]] to <16 x i32>
-// CHECK-NEXT:    [[TMP27:%.*]] = bitcast i16 [[TMP22]] to <16 x i1>
-// CHECK-NEXT:    [[TMP28:%.*]] = select <16 x i1> [[TMP27]], <16 x i32> [[TMP24]], <16 x i32> [[TMP26]]
-// CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i32> [[TMP28]] to <8 x i64>
-// CHECK-NEXT:    store <8 x i64> [[TMP29]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x i64> [[TMP30]], <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <4 x i64> [[EXTRACT_I]], <4 x i64>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP31:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT4_I:%.*]] = shufflevector <8 x i64> [[TMP31]], <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    store <4 x i64> [[EXTRACT4_I]], <4 x i64>* [[__T2_I]], align 32
-// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i64>, <4 x i64>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i64>, <4 x i64>* [[__T2_I]], align 32
-// CHECK-NEXT:    store <4 x i64> [[TMP32]], <4 x i64>* [[__A_ADDR_I_I]], align 32
-// CHECK-NEXT:    store <4 x i64> [[TMP33]], <4 x i64>* [[__B_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP34:%.*]] = load <4 x i64>, <4 x i64>* [[__A_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <4 x i64> [[TMP34]] to <8 x i32>
-// CHECK-NEXT:    [[TMP36:%.*]] = load <4 x i64>, <4 x i64>* [[__B_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i64> [[TMP36]] to <8 x i32>
-// CHECK-NEXT:    [[TMP38:%.*]] = icmp sgt <8 x i32> [[TMP35]], [[TMP37]]
-// CHECK-NEXT:    [[TMP39:%.*]] = select <8 x i1> [[TMP38]], <8 x i32> [[TMP35]], <8 x i32> [[TMP37]]
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i32> [[TMP39]] to <4 x i64>
-// CHECK-NEXT:    store <4 x i64> [[TMP40]], <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[TMP41:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT6_I:%.*]] = shufflevector <4 x i64> [[TMP41]], <4 x i64> undef, <2 x i32> <i32 0, i32 1>
-// CHECK-NEXT:    store <2 x i64> [[EXTRACT6_I]], <2 x i64>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP42:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT7_I:%.*]] = shufflevector <4 x i64> [[TMP42]], <4 x i64> undef, <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    store <2 x i64> [[EXTRACT7_I]], <2 x i64>* [[__T5_I]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = load <2 x i64>, <2 x i64>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP44:%.*]] = load <2 x i64>, <2 x i64>* [[__T5_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP43]], <2 x i64>* [[__V1_ADDR_I14_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP44]], <2 x i64>* [[__V2_ADDR_I15_I]], align 16
-// CHECK-NEXT:    [[TMP45:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I14_I]], align 16
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <2 x i64> [[TMP45]] to <4 x i32>
-// CHECK-NEXT:    [[TMP47:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I15_I]], align 16
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <2 x i64> [[TMP47]] to <4 x i32>
-// CHECK-NEXT:    [[TMP49:%.*]] = icmp sgt <4 x i32> [[TMP46]], [[TMP48]]
-// CHECK-NEXT:    [[TMP50:%.*]] = select <4 x i1> [[TMP49]], <4 x i32> [[TMP46]], <4 x i32> [[TMP48]]
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <4 x i32> [[TMP50]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP51]], <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP52:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <2 x i64> [[TMP52]] to <4 x i32>
-// CHECK-NEXT:    [[TMP54:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <2 x i64> [[TMP54]] to <4 x i32>
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> [[TMP55]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <4 x i32> [[SHUFFLE_I]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP56]], <2 x i64>* [[__T7_I]], align 16
-// CHECK-NEXT:    [[TMP57:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP58:%.*]] = load <2 x i64>, <2 x i64>* [[__T7_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP57]], <2 x i64>* [[__V1_ADDR_I12_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP58]], <2 x i64>* [[__V2_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP59:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I12_I]], align 16
-// CHECK-NEXT:    [[TMP60:%.*]] = bitcast <2 x i64> [[TMP59]] to <4 x i32>
-// CHECK-NEXT:    [[TMP61:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP62:%.*]] = bitcast <2 x i64> [[TMP61]] to <4 x i32>
-// CHECK-NEXT:    [[TMP63:%.*]] = icmp sgt <4 x i32> [[TMP60]], [[TMP62]]
-// CHECK-NEXT:    [[TMP64:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP60]], <4 x i32> [[TMP62]]
-// CHECK-NEXT:    [[TMP65:%.*]] = bitcast <4 x i32> [[TMP64]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP65]], <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP66:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP67:%.*]] = bitcast <2 x i64> [[TMP66]] to <4 x i32>
-// CHECK-NEXT:    [[TMP68:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP69:%.*]] = bitcast <2 x i64> [[TMP68]] to <4 x i32>
-// CHECK-NEXT:    [[SHUFFLE10_I:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> [[TMP69]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK-NEXT:    [[TMP70:%.*]] = bitcast <4 x i32> [[SHUFFLE10_I]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP70]], <2 x i64>* [[__T9_I]], align 16
-// CHECK-NEXT:    [[TMP71:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP72:%.*]] = load <2 x i64>, <2 x i64>* [[__T9_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP71]], <2 x i64>* [[__V1_ADDR_I_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP72]], <2 x i64>* [[__V2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP73:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP74:%.*]] = bitcast <2 x i64> [[TMP73]] to <4 x i32>
-// CHECK-NEXT:    [[TMP75:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP76:%.*]] = bitcast <2 x i64> [[TMP75]] to <4 x i32>
-// CHECK-NEXT:    [[TMP77:%.*]] = icmp sgt <4 x i32> [[TMP74]], [[TMP76]]
-// CHECK-NEXT:    [[TMP78:%.*]] = select <4 x i1> [[TMP77]], <4 x i32> [[TMP74]], <4 x i32> [[TMP76]]
-// CHECK-NEXT:    [[TMP79:%.*]] = bitcast <4 x i32> [[TMP78]] to <2 x i64>
-// CHECK-NEXT:    [[TMP80:%.*]] = bitcast <2 x i64> [[TMP79]] to <4 x i32>
-// CHECK-NEXT:    store <4 x i32> [[TMP80]], <4 x i32>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[TMP81:%.*]] = load <4 x i32>, <4 x i32>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <4 x i32> [[TMP81]], i32 0
-// CHECK-NEXT:    ret i32 [[VECEXT_I]]
-int test_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __W){
-  return _mm512_mask_reduce_max_epi32(__M, __W); 
-}
-
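The masked form adds one step before the tree reduction: inactive lanes are replaced with the identity element for the operation. Here the long insertelement chain splats INT_MIN (-2147483648) into a <16 x i32> vector, the i16 mask is bitcast to <16 x i1>, and a select keeps source lanes where the mask bit is set. A sketch of just that masking step (the helper name is illustrative):

#include <stdint.h>
#include <limits.h>

/* Replace inactive lanes with INT_MIN so they can never win a
   signed-max reduction; active lanes pass through unchanged. */
static void mask_to_identity_smax(int32_t v[16], uint16_t mask) {
  for (int i = 0; i < 16; ++i)
    if (!((mask >> i) & 1))
      v[i] = INT_MIN; /* identity element for signed max */
}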
-// CHECK-LABEL: define i32 @test_mm512_mask_reduce_max_epu32(i16 zeroext %__M, <8 x i64> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__U_ADDR_I_I:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__V1_ADDR_I13_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I14_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V1_ADDR_I11_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I12_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V1_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__A2_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__M_ADDR_I:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T7_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T8_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T9_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T10_I:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT:    [[__M_ADDR:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    store i16 [[__M:%.*]], i16* [[__M_ADDR]], align 2
-// CHECK-NEXT:    store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load i16, i16* [[__M_ADDR]], align 2
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store i16 [[TMP0]], i16* [[__M_ADDR_I]], align 2
-// CHECK-NEXT:    store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP2:%.*]] = load i16, i16* [[__M_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store i16 [[TMP2]], i16* [[__U_ADDR_I_I]], align 2
-// CHECK-NEXT:    store <8 x i64> [[TMP3]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP4:%.*]] = load i16, i16* [[__U_ADDR_I_I]], align 2
-// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP5]] to <16 x i32>
-// CHECK-NEXT:    store <8 x i64> zeroinitializer, <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I_I]], align 64
-// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i64>, <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I_I]], align 64
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i64> [[TMP7]] to <16 x i32>
-// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
-// CHECK-NEXT:    [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP6]], <16 x i32> [[TMP8]]
-// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64>
-// CHECK-NEXT:    store <8 x i64> [[TMP11]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x i64> [[TMP12]], <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <4 x i64> [[EXTRACT_I]], <4 x i64>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT3_I:%.*]] = shufflevector <8 x i64> [[TMP13]], <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    store <4 x i64> [[EXTRACT3_I]], <4 x i64>* [[__T2_I]], align 32
-// CHECK-NEXT:    [[TMP14:%.*]] = load <4 x i64>, <4 x i64>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i64>, <4 x i64>* [[__T2_I]], align 32
-// CHECK-NEXT:    store <4 x i64> [[TMP14]], <4 x i64>* [[__A2_ADDR_I_I]], align 32
-// CHECK-NEXT:    store <4 x i64> [[TMP15]], <4 x i64>* [[__B_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP16:%.*]] = load <4 x i64>, <4 x i64>* [[__A2_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x i64> [[TMP16]] to <8 x i32>
-// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i64>, <4 x i64>* [[__B_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP19:%.*]] = bitcast <4 x i64> [[TMP18]] to <8 x i32>
-// CHECK-NEXT:    [[TMP20:%.*]] = icmp ugt <8 x i32> [[TMP17]], [[TMP19]]
-// CHECK-NEXT:    [[TMP21:%.*]] = select <8 x i1> [[TMP20]], <8 x i32> [[TMP17]], <8 x i32> [[TMP19]]
-// CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i32> [[TMP21]] to <4 x i64>
-// CHECK-NEXT:    store <4 x i64> [[TMP22]], <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[TMP23:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT5_I:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> undef, <2 x i32> <i32 0, i32 1>
-// CHECK-NEXT:    store <2 x i64> [[EXTRACT5_I]], <2 x i64>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP24:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT6_I:%.*]] = shufflevector <4 x i64> [[TMP24]], <4 x i64> undef, <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    store <2 x i64> [[EXTRACT6_I]], <2 x i64>* [[__T5_I]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x i64>, <2 x i64>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = load <2 x i64>, <2 x i64>* [[__T5_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP25]], <2 x i64>* [[__V1_ADDR_I13_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP26]], <2 x i64>* [[__V2_ADDR_I14_I]], align 16
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <2 x i64> [[TMP27]] to <4 x i32>
-// CHECK-NEXT:    [[TMP29:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I14_I]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <2 x i64> [[TMP29]] to <4 x i32>
-// CHECK-NEXT:    [[TMP31:%.*]] = icmp ugt <4 x i32> [[TMP28]], [[TMP30]]
-// CHECK-NEXT:    [[TMP32:%.*]] = select <4 x i1> [[TMP31]], <4 x i32> [[TMP28]], <4 x i32> [[TMP30]]
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <4 x i32> [[TMP32]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP33]], <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <2 x i64> [[TMP34]] to <4 x i32>
-// CHECK-NEXT:    [[TMP36:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[TMP36]] to <4 x i32>
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[TMP35]], <4 x i32> [[TMP37]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[SHUFFLE_I]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP38]], <2 x i64>* [[__T7_I]], align 16
-// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = load <2 x i64>, <2 x i64>* [[__T7_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP39]], <2 x i64>* [[__V1_ADDR_I11_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP40]], <2 x i64>* [[__V2_ADDR_I12_I]], align 16
-// CHECK-NEXT:    [[TMP41:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I11_I]], align 16
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <2 x i64> [[TMP41]] to <4 x i32>
-// CHECK-NEXT:    [[TMP43:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I12_I]], align 16
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <2 x i64> [[TMP43]] to <4 x i32>
-// CHECK-NEXT:    [[TMP45:%.*]] = icmp ugt <4 x i32> [[TMP42]], [[TMP44]]
-// CHECK-NEXT:    [[TMP46:%.*]] = select <4 x i1> [[TMP45]], <4 x i32> [[TMP42]], <4 x i32> [[TMP44]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <4 x i32> [[TMP46]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP47]], <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP48:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <2 x i64> [[TMP48]] to <4 x i32>
-// CHECK-NEXT:    [[TMP50:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <2 x i64> [[TMP50]] to <4 x i32>
-// CHECK-NEXT:    [[SHUFFLE9_I:%.*]] = shufflevector <4 x i32> [[TMP49]], <4 x i32> [[TMP51]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <4 x i32> [[SHUFFLE9_I]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP52]], <2 x i64>* [[__T9_I]], align 16
-// CHECK-NEXT:    [[TMP53:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP54:%.*]] = load <2 x i64>, <2 x i64>* [[__T9_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP53]], <2 x i64>* [[__V1_ADDR_I_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP54]], <2 x i64>* [[__V2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP55:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <2 x i64> [[TMP55]] to <4 x i32>
-// CHECK-NEXT:    [[TMP57:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP58:%.*]] = bitcast <2 x i64> [[TMP57]] to <4 x i32>
-// CHECK-NEXT:    [[TMP59:%.*]] = icmp ugt <4 x i32> [[TMP56]], [[TMP58]]
-// CHECK-NEXT:    [[TMP60:%.*]] = select <4 x i1> [[TMP59]], <4 x i32> [[TMP56]], <4 x i32> [[TMP58]]
-// CHECK-NEXT:    [[TMP61:%.*]] = bitcast <4 x i32> [[TMP60]] to <2 x i64>
-// CHECK-NEXT:    [[TMP62:%.*]] = bitcast <2 x i64> [[TMP61]] to <4 x i32>
-// CHECK-NEXT:    store <4 x i32> [[TMP62]], <4 x i32>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[TMP63:%.*]] = load <4 x i32>, <4 x i32>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <4 x i32> [[TMP63]], i32 0
-// CHECK-NEXT:    ret i32 [[VECEXT_I]]
-unsigned int test_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __W){
-  return _mm512_mask_reduce_max_epu32(__M, __W); 
-}
-
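The unsigned variant above needs no splatted constant: 0 is the identity for unsigned max, so the select runs against zeroinitializer and simply clears the inactive lanes. Sketched the same way (again with an illustrative helper name):

#include <stdint.h>

/* For unsigned max the identity is 0, matching the zeroinitializer
   select in the IR above. */
static void mask_to_identity_umax(uint32_t v[16], uint16_t mask) {
  for (int i = 0; i < 16; ++i)
    if (!((mask >> i) & 1))
      v[i] = 0u;
}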
-// CHECK-LABEL: define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__W_ADDR_I_I:%.*]] = alloca float, align 4
-// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK-NEXT:    [[__W2_ADDR_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK-NEXT:    [[__U_ADDR_I_I:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I14_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I15_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I12_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I13_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__A2_ADDR_I_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I16_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__B_ADDR_I17_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__M_ADDR_I:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <16 x float>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T7_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T8_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T9_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T10_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__M_ADDR:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <16 x float>, align 64
-// CHECK-NEXT:    store i16 [[__M:%.*]], i16* [[__M_ADDR]], align 2
-// CHECK-NEXT:    store <16 x float> [[__W:%.*]], <16 x float>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load i16, i16* [[__M_ADDR]], align 2
-// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store i16 [[TMP0]], i16* [[__M_ADDR_I]], align 2
-// CHECK-NEXT:    store <16 x float> [[TMP1]], <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store float 0xFFF0000000000000, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <16 x float> [[VECINIT_I_I]], float [[TMP3]], i32 1
-// CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <16 x float> [[VECINIT1_I_I]], float [[TMP4]], i32 2
-// CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <16 x float> [[VECINIT2_I_I]], float [[TMP5]], i32 3
-// CHECK-NEXT:    [[TMP6:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <16 x float> [[VECINIT3_I_I]], float [[TMP6]], i32 4
-// CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT5_I_I:%.*]] = insertelement <16 x float> [[VECINIT4_I_I]], float [[TMP7]], i32 5
-// CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT6_I_I:%.*]] = insertelement <16 x float> [[VECINIT5_I_I]], float [[TMP8]], i32 6
-// CHECK-NEXT:    [[TMP9:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT7_I_I:%.*]] = insertelement <16 x float> [[VECINIT6_I_I]], float [[TMP9]], i32 7
-// CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT8_I_I:%.*]] = insertelement <16 x float> [[VECINIT7_I_I]], float [[TMP10]], i32 8
-// CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT9_I_I:%.*]] = insertelement <16 x float> [[VECINIT8_I_I]], float [[TMP11]], i32 9
-// CHECK-NEXT:    [[TMP12:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT10_I_I:%.*]] = insertelement <16 x float> [[VECINIT9_I_I]], float [[TMP12]], i32 10
-// CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT11_I_I:%.*]] = insertelement <16 x float> [[VECINIT10_I_I]], float [[TMP13]], i32 11
-// CHECK-NEXT:    [[TMP14:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT12_I_I:%.*]] = insertelement <16 x float> [[VECINIT11_I_I]], float [[TMP14]], i32 12
-// CHECK-NEXT:    [[TMP15:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT13_I_I:%.*]] = insertelement <16 x float> [[VECINIT12_I_I]], float [[TMP15]], i32 13
-// CHECK-NEXT:    [[TMP16:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT14_I_I:%.*]] = insertelement <16 x float> [[VECINIT13_I_I]], float [[TMP16]], i32 14
-// CHECK-NEXT:    [[TMP17:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT15_I_I:%.*]] = insertelement <16 x float> [[VECINIT14_I_I]], float [[TMP17]], i32 15
-// CHECK-NEXT:    store <16 x float> [[VECINIT15_I_I]], <16 x float>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP18:%.*]] = load <16 x float>, <16 x float>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP19:%.*]] = load i16, i16* [[__M_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP20:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store <16 x float> [[TMP18]], <16 x float>* [[__W2_ADDR_I_I]], align 64
-// CHECK-NEXT:    store i16 [[TMP19]], i16* [[__U_ADDR_I_I]], align 2
-// CHECK-NEXT:    store <16 x float> [[TMP20]], <16 x float>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP21:%.*]] = load i16, i16* [[__U_ADDR_I_I]], align 2
-// CHECK-NEXT:    [[TMP22:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP23:%.*]] = load <16 x float>, <16 x float>* [[__W2_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP24:%.*]] = bitcast i16 [[TMP21]] to <16 x i1>
-// CHECK-NEXT:    [[TMP25:%.*]] = select <16 x i1> [[TMP24]], <16 x float> [[TMP22]], <16 x float> [[TMP23]]
-// CHECK-NEXT:    store <16 x float> [[TMP25]], <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP26:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP27:%.*]] = bitcast <16 x float> [[TMP26]] to <8 x double>
-// CHECK-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x double> [[TMP27]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <4 x double> [[EXTRACT_I]] to <8 x float>
-// CHECK-NEXT:    store <8 x float> [[TMP28]], <8 x float>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP29:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x float> [[TMP29]] to <8 x double>
-// CHECK-NEXT:    [[EXTRACT4_I:%.*]] = shufflevector <8 x double> [[TMP30]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x double> [[EXTRACT4_I]] to <8 x float>
-// CHECK-NEXT:    store <8 x float> [[TMP31]], <8 x float>* [[__T2_I]], align 32
-// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x float>, <8 x float>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x float>, <8 x float>* [[__T2_I]], align 32
-// CHECK-NEXT:    store <8 x float> [[TMP32]], <8 x float>* [[__A_ADDR_I16_I]], align 32
-// CHECK-NEXT:    store <8 x float> [[TMP33]], <8 x float>* [[__B_ADDR_I17_I]], align 32
-// CHECK-NEXT:    [[TMP34:%.*]] = load <8 x float>, <8 x float>* [[__A_ADDR_I16_I]], align 32
-// CHECK-NEXT:    [[TMP35:%.*]] = load <8 x float>, <8 x float>* [[__B_ADDR_I17_I]], align 32
-// CHECK-NEXT:    [[TMP36:%.*]] = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> [[TMP34]], <8 x float> [[TMP35]]) #2
-// CHECK-NEXT:    store <8 x float> [[TMP36]], <8 x float>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[TMP37:%.*]] = load <8 x float>, <8 x float>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT6_I:%.*]] = shufflevector <8 x float> [[TMP37]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <4 x float> [[EXTRACT6_I]], <4 x float>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP38:%.*]] = load <8 x float>, <8 x float>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT7_I:%.*]] = shufflevector <8 x float> [[TMP38]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    store <4 x float> [[EXTRACT7_I]], <4 x float>* [[__T5_I]], align 16
-// CHECK-NEXT:    [[TMP39:%.*]] = load <4 x float>, <4 x float>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = load <4 x float>, <4 x float>* [[__T5_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP39]], <4 x float>* [[__A_ADDR_I14_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP40]], <4 x float>* [[__B_ADDR_I15_I]], align 16
-// CHECK-NEXT:    [[TMP41:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I14_I]], align 16
-// CHECK-NEXT:    [[TMP42:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I15_I]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[TMP41]], <4 x float> [[TMP42]]) #2
-// CHECK-NEXT:    store <4 x float> [[TMP43]], <4 x float>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP44:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP45:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP44]], <4 x float> [[TMP45]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-// CHECK-NEXT:    store <4 x float> [[SHUFFLE_I]], <4 x float>* [[__T7_I]], align 16
-// CHECK-NEXT:    [[TMP46:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP47:%.*]] = load <4 x float>, <4 x float>* [[__T7_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP46]], <4 x float>* [[__A_ADDR_I12_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP47]], <4 x float>* [[__B_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP48:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I12_I]], align 16
-// CHECK-NEXT:    [[TMP49:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP50:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[TMP48]], <4 x float> [[TMP49]]) #2
-// CHECK-NEXT:    store <4 x float> [[TMP50]], <4 x float>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP51:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP52:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[SHUFFLE10_I:%.*]] = shufflevector <4 x float> [[TMP51]], <4 x float> [[TMP52]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK-NEXT:    store <4 x float> [[SHUFFLE10_I]], <4 x float>* [[__T9_I]], align 16
-// CHECK-NEXT:    [[TMP53:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP54:%.*]] = load <4 x float>, <4 x float>* [[__T9_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP53]], <4 x float>* [[__A2_ADDR_I_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP54]], <4 x float>* [[__B_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP55:%.*]] = load <4 x float>, <4 x float>* [[__A2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP56:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP57:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[TMP55]], <4 x float> [[TMP56]]) #2
-// CHECK-NEXT:    store <4 x float> [[TMP57]], <4 x float>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[TMP58:%.*]] = load <4 x float>, <4 x float>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <4 x float> [[TMP58]], i32 0
-// CHECK-NEXT:    ret float [[VECEXT_I]]
-float test_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __W){
-  return _mm512_mask_reduce_max_ps(__M, __W); 
-}
-
-// CHECK-LABEL: define i32 @test_mm512_mask_reduce_min_epi32(i16 zeroext %__M, <8 x i64> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__S_ADDR_I_I:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x i32>, align 64
-// CHECK-NEXT:    [[__W_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__U_ADDR_I_I:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__V1_ADDR_I14_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I15_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V1_ADDR_I12_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I13_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V1_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__A2_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__M_ADDR_I:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T7_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T8_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T9_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T10_I:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT:    [[__M_ADDR:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    store i16 [[__M:%.*]], i16* [[__M_ADDR]], align 2
-// CHECK-NEXT:    store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load i16, i16* [[__M_ADDR]], align 2
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store i16 [[TMP0]], i16* [[__M_ADDR_I]], align 2
-// CHECK-NEXT:    store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store i32 2147483647, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <16 x i32> undef, i32 [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <16 x i32> [[VECINIT_I_I]], i32 [[TMP3]], i32 1
-// CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <16 x i32> [[VECINIT1_I_I]], i32 [[TMP4]], i32 2
-// CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <16 x i32> [[VECINIT2_I_I]], i32 [[TMP5]], i32 3
-// CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <16 x i32> [[VECINIT3_I_I]], i32 [[TMP6]], i32 4
-// CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT5_I_I:%.*]] = insertelement <16 x i32> [[VECINIT4_I_I]], i32 [[TMP7]], i32 5
-// CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT6_I_I:%.*]] = insertelement <16 x i32> [[VECINIT5_I_I]], i32 [[TMP8]], i32 6
-// CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT7_I_I:%.*]] = insertelement <16 x i32> [[VECINIT6_I_I]], i32 [[TMP9]], i32 7
-// CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT8_I_I:%.*]] = insertelement <16 x i32> [[VECINIT7_I_I]], i32 [[TMP10]], i32 8
-// CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT9_I_I:%.*]] = insertelement <16 x i32> [[VECINIT8_I_I]], i32 [[TMP11]], i32 9
-// CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT10_I_I:%.*]] = insertelement <16 x i32> [[VECINIT9_I_I]], i32 [[TMP12]], i32 10
-// CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT11_I_I:%.*]] = insertelement <16 x i32> [[VECINIT10_I_I]], i32 [[TMP13]], i32 11
-// CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT12_I_I:%.*]] = insertelement <16 x i32> [[VECINIT11_I_I]], i32 [[TMP14]], i32 12
-// CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT13_I_I:%.*]] = insertelement <16 x i32> [[VECINIT12_I_I]], i32 [[TMP15]], i32 13
-// CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT14_I_I:%.*]] = insertelement <16 x i32> [[VECINIT13_I_I]], i32 [[TMP16]], i32 14
-// CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT15_I_I:%.*]] = insertelement <16 x i32> [[VECINIT14_I_I]], i32 [[TMP17]], i32 15
-// CHECK-NEXT:    store <16 x i32> [[VECINIT15_I_I]], <16 x i32>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP18:%.*]] = load <16 x i32>, <16 x i32>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP19:%.*]] = bitcast <16 x i32> [[TMP18]] to <8 x i64>
-// CHECK-NEXT:    [[TMP20:%.*]] = load i16, i16* [[__M_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP19]], <8 x i64>* [[__W_ADDR_I_I]], align 64
-// CHECK-NEXT:    store i16 [[TMP20]], i16* [[__U_ADDR_I_I]], align 2
-// CHECK-NEXT:    store <8 x i64> [[TMP21]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP22:%.*]] = load i16, i16* [[__U_ADDR_I_I]], align 2
-// CHECK-NEXT:    [[TMP23:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x i64> [[TMP23]] to <16 x i32>
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i64> [[TMP25]] to <16 x i32>
-// CHECK-NEXT:    [[TMP27:%.*]] = bitcast i16 [[TMP22]] to <16 x i1>
-// CHECK-NEXT:    [[TMP28:%.*]] = select <16 x i1> [[TMP27]], <16 x i32> [[TMP24]], <16 x i32> [[TMP26]]
-// CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i32> [[TMP28]] to <8 x i64>
-// CHECK-NEXT:    store <8 x i64> [[TMP29]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x i64> [[TMP30]], <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <4 x i64> [[EXTRACT_I]], <4 x i64>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP31:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT4_I:%.*]] = shufflevector <8 x i64> [[TMP31]], <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    store <4 x i64> [[EXTRACT4_I]], <4 x i64>* [[__T2_I]], align 32
-// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i64>, <4 x i64>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i64>, <4 x i64>* [[__T2_I]], align 32
-// CHECK-NEXT:    store <4 x i64> [[TMP32]], <4 x i64>* [[__A2_ADDR_I_I]], align 32
-// CHECK-NEXT:    store <4 x i64> [[TMP33]], <4 x i64>* [[__B_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP34:%.*]] = load <4 x i64>, <4 x i64>* [[__A2_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <4 x i64> [[TMP34]] to <8 x i32>
-// CHECK-NEXT:    [[TMP36:%.*]] = load <4 x i64>, <4 x i64>* [[__B_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i64> [[TMP36]] to <8 x i32>
-// CHECK-NEXT:    [[TMP38:%.*]] = icmp slt <8 x i32> [[TMP35]], [[TMP37]]
-// CHECK-NEXT:    [[TMP39:%.*]] = select <8 x i1> [[TMP38]], <8 x i32> [[TMP35]], <8 x i32> [[TMP37]]
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i32> [[TMP39]] to <4 x i64>
-// CHECK-NEXT:    store <4 x i64> [[TMP40]], <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[TMP41:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT6_I:%.*]] = shufflevector <4 x i64> [[TMP41]], <4 x i64> undef, <2 x i32> <i32 0, i32 1>
-// CHECK-NEXT:    store <2 x i64> [[EXTRACT6_I]], <2 x i64>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP42:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT7_I:%.*]] = shufflevector <4 x i64> [[TMP42]], <4 x i64> undef, <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    store <2 x i64> [[EXTRACT7_I]], <2 x i64>* [[__T5_I]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = load <2 x i64>, <2 x i64>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP44:%.*]] = load <2 x i64>, <2 x i64>* [[__T5_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP43]], <2 x i64>* [[__V1_ADDR_I14_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP44]], <2 x i64>* [[__V2_ADDR_I15_I]], align 16
-// CHECK-NEXT:    [[TMP45:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I14_I]], align 16
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <2 x i64> [[TMP45]] to <4 x i32>
-// CHECK-NEXT:    [[TMP47:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I15_I]], align 16
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <2 x i64> [[TMP47]] to <4 x i32>
-// CHECK-NEXT:    [[TMP49:%.*]] = icmp slt <4 x i32> [[TMP46]], [[TMP48]]
-// CHECK-NEXT:    [[TMP50:%.*]] = select <4 x i1> [[TMP49]], <4 x i32> [[TMP46]], <4 x i32> [[TMP48]]
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <4 x i32> [[TMP50]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP51]], <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP52:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <2 x i64> [[TMP52]] to <4 x i32>
-// CHECK-NEXT:    [[TMP54:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <2 x i64> [[TMP54]] to <4 x i32>
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> [[TMP55]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <4 x i32> [[SHUFFLE_I]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP56]], <2 x i64>* [[__T7_I]], align 16
-// CHECK-NEXT:    [[TMP57:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP58:%.*]] = load <2 x i64>, <2 x i64>* [[__T7_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP57]], <2 x i64>* [[__V1_ADDR_I12_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP58]], <2 x i64>* [[__V2_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP59:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I12_I]], align 16
-// CHECK-NEXT:    [[TMP60:%.*]] = bitcast <2 x i64> [[TMP59]] to <4 x i32>
-// CHECK-NEXT:    [[TMP61:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP62:%.*]] = bitcast <2 x i64> [[TMP61]] to <4 x i32>
-// CHECK-NEXT:    [[TMP63:%.*]] = icmp slt <4 x i32> [[TMP60]], [[TMP62]]
-// CHECK-NEXT:    [[TMP64:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP60]], <4 x i32> [[TMP62]]
-// CHECK-NEXT:    [[TMP65:%.*]] = bitcast <4 x i32> [[TMP64]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP65]], <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP66:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP67:%.*]] = bitcast <2 x i64> [[TMP66]] to <4 x i32>
-// CHECK-NEXT:    [[TMP68:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP69:%.*]] = bitcast <2 x i64> [[TMP68]] to <4 x i32>
-// CHECK-NEXT:    [[SHUFFLE10_I:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> [[TMP69]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK-NEXT:    [[TMP70:%.*]] = bitcast <4 x i32> [[SHUFFLE10_I]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP70]], <2 x i64>* [[__T9_I]], align 16
-// CHECK-NEXT:    [[TMP71:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP72:%.*]] = load <2 x i64>, <2 x i64>* [[__T9_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP71]], <2 x i64>* [[__V1_ADDR_I_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP72]], <2 x i64>* [[__V2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP73:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP74:%.*]] = bitcast <2 x i64> [[TMP73]] to <4 x i32>
-// CHECK-NEXT:    [[TMP75:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP76:%.*]] = bitcast <2 x i64> [[TMP75]] to <4 x i32>
-// CHECK-NEXT:    [[TMP77:%.*]] = icmp slt <4 x i32> [[TMP74]], [[TMP76]]
-// CHECK-NEXT:    [[TMP78:%.*]] = select <4 x i1> [[TMP77]], <4 x i32> [[TMP74]], <4 x i32> [[TMP76]]
-// CHECK-NEXT:    [[TMP79:%.*]] = bitcast <4 x i32> [[TMP78]] to <2 x i64>
-// CHECK-NEXT:    [[TMP80:%.*]] = bitcast <2 x i64> [[TMP79]] to <4 x i32>
-// CHECK-NEXT:    store <4 x i32> [[TMP80]], <4 x i32>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[TMP81:%.*]] = load <4 x i32>, <4 x i32>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <4 x i32> [[TMP81]], i32 0
-// CHECK-NEXT:    ret i32 [[VECEXT_I]]
-int test_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __W){
-  return _mm512_mask_reduce_min_epi32(__M, __W); 
-}
-
-// CHECK-LABEL: define i32 @test_mm512_mask_reduce_min_epu32(i16 zeroext %__M, <8 x i64> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__S_ADDR_I_I:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x i32>, align 64
-// CHECK-NEXT:    [[__W_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__U_ADDR_I_I:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__V1_ADDR_I14_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I15_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V1_ADDR_I12_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I13_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V1_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__V2_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__A2_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__M_ADDR_I:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <4 x i64>, align 32
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T7_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T8_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T9_I:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[__T10_I:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT:    [[__M_ADDR:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK-NEXT:    store i16 [[__M:%.*]], i16* [[__M_ADDR]], align 2
-// CHECK-NEXT:    store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load i16, i16* [[__M_ADDR]], align 2
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store i16 [[TMP0]], i16* [[__M_ADDR_I]], align 2
-// CHECK-NEXT:    store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store i32 -1, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <16 x i32> undef, i32 [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <16 x i32> [[VECINIT_I_I]], i32 [[TMP3]], i32 1
-// CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <16 x i32> [[VECINIT1_I_I]], i32 [[TMP4]], i32 2
-// CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <16 x i32> [[VECINIT2_I_I]], i32 [[TMP5]], i32 3
-// CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <16 x i32> [[VECINIT3_I_I]], i32 [[TMP6]], i32 4
-// CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT5_I_I:%.*]] = insertelement <16 x i32> [[VECINIT4_I_I]], i32 [[TMP7]], i32 5
-// CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT6_I_I:%.*]] = insertelement <16 x i32> [[VECINIT5_I_I]], i32 [[TMP8]], i32 6
-// CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT7_I_I:%.*]] = insertelement <16 x i32> [[VECINIT6_I_I]], i32 [[TMP9]], i32 7
-// CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT8_I_I:%.*]] = insertelement <16 x i32> [[VECINIT7_I_I]], i32 [[TMP10]], i32 8
-// CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT9_I_I:%.*]] = insertelement <16 x i32> [[VECINIT8_I_I]], i32 [[TMP11]], i32 9
-// CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT10_I_I:%.*]] = insertelement <16 x i32> [[VECINIT9_I_I]], i32 [[TMP12]], i32 10
-// CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT11_I_I:%.*]] = insertelement <16 x i32> [[VECINIT10_I_I]], i32 [[TMP13]], i32 11
-// CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT12_I_I:%.*]] = insertelement <16 x i32> [[VECINIT11_I_I]], i32 [[TMP14]], i32 12
-// CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT13_I_I:%.*]] = insertelement <16 x i32> [[VECINIT12_I_I]], i32 [[TMP15]], i32 13
-// CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT14_I_I:%.*]] = insertelement <16 x i32> [[VECINIT13_I_I]], i32 [[TMP16]], i32 14
-// CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT15_I_I:%.*]] = insertelement <16 x i32> [[VECINIT14_I_I]], i32 [[TMP17]], i32 15
-// CHECK-NEXT:    store <16 x i32> [[VECINIT15_I_I]], <16 x i32>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP18:%.*]] = load <16 x i32>, <16 x i32>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP19:%.*]] = bitcast <16 x i32> [[TMP18]] to <8 x i64>
-// CHECK-NEXT:    [[TMP20:%.*]] = load i16, i16* [[__M_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store <8 x i64> [[TMP19]], <8 x i64>* [[__W_ADDR_I_I]], align 64
-// CHECK-NEXT:    store i16 [[TMP20]], i16* [[__U_ADDR_I_I]], align 2
-// CHECK-NEXT:    store <8 x i64> [[TMP21]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP22:%.*]] = load i16, i16* [[__U_ADDR_I_I]], align 2
-// CHECK-NEXT:    [[TMP23:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x i64> [[TMP23]] to <16 x i32>
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i64> [[TMP25]] to <16 x i32>
-// CHECK-NEXT:    [[TMP27:%.*]] = bitcast i16 [[TMP22]] to <16 x i1>
-// CHECK-NEXT:    [[TMP28:%.*]] = select <16 x i1> [[TMP27]], <16 x i32> [[TMP24]], <16 x i32> [[TMP26]]
-// CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i32> [[TMP28]] to <8 x i64>
-// CHECK-NEXT:    store <8 x i64> [[TMP29]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x i64> [[TMP30]], <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <4 x i64> [[EXTRACT_I]], <4 x i64>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP31:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[EXTRACT4_I:%.*]] = shufflevector <8 x i64> [[TMP31]], <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    store <4 x i64> [[EXTRACT4_I]], <4 x i64>* [[__T2_I]], align 32
-// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i64>, <4 x i64>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i64>, <4 x i64>* [[__T2_I]], align 32
-// CHECK-NEXT:    store <4 x i64> [[TMP32]], <4 x i64>* [[__A2_ADDR_I_I]], align 32
-// CHECK-NEXT:    store <4 x i64> [[TMP33]], <4 x i64>* [[__B_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP34:%.*]] = load <4 x i64>, <4 x i64>* [[__A2_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <4 x i64> [[TMP34]] to <8 x i32>
-// CHECK-NEXT:    [[TMP36:%.*]] = load <4 x i64>, <4 x i64>* [[__B_ADDR_I_I]], align 32
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i64> [[TMP36]] to <8 x i32>
-// CHECK-NEXT:    [[TMP38:%.*]] = icmp ult <8 x i32> [[TMP35]], [[TMP37]]
-// CHECK-NEXT:    [[TMP39:%.*]] = select <8 x i1> [[TMP38]], <8 x i32> [[TMP35]], <8 x i32> [[TMP37]]
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i32> [[TMP39]] to <4 x i64>
-// CHECK-NEXT:    store <4 x i64> [[TMP40]], <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[TMP41:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT6_I:%.*]] = shufflevector <4 x i64> [[TMP41]], <4 x i64> undef, <2 x i32> <i32 0, i32 1>
-// CHECK-NEXT:    store <2 x i64> [[EXTRACT6_I]], <2 x i64>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP42:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT7_I:%.*]] = shufflevector <4 x i64> [[TMP42]], <4 x i64> undef, <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    store <2 x i64> [[EXTRACT7_I]], <2 x i64>* [[__T5_I]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = load <2 x i64>, <2 x i64>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP44:%.*]] = load <2 x i64>, <2 x i64>* [[__T5_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP43]], <2 x i64>* [[__V1_ADDR_I14_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP44]], <2 x i64>* [[__V2_ADDR_I15_I]], align 16
-// CHECK-NEXT:    [[TMP45:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I14_I]], align 16
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <2 x i64> [[TMP45]] to <4 x i32>
-// CHECK-NEXT:    [[TMP47:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I15_I]], align 16
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <2 x i64> [[TMP47]] to <4 x i32>
-// CHECK-NEXT:    [[TMP49:%.*]] = icmp ult <4 x i32> [[TMP46]], [[TMP48]]
-// CHECK-NEXT:    [[TMP50:%.*]] = select <4 x i1> [[TMP49]], <4 x i32> [[TMP46]], <4 x i32> [[TMP48]]
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <4 x i32> [[TMP50]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP51]], <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP52:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <2 x i64> [[TMP52]] to <4 x i32>
-// CHECK-NEXT:    [[TMP54:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <2 x i64> [[TMP54]] to <4 x i32>
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> [[TMP55]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <4 x i32> [[SHUFFLE_I]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP56]], <2 x i64>* [[__T7_I]], align 16
-// CHECK-NEXT:    [[TMP57:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP58:%.*]] = load <2 x i64>, <2 x i64>* [[__T7_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP57]], <2 x i64>* [[__V1_ADDR_I12_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP58]], <2 x i64>* [[__V2_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP59:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I12_I]], align 16
-// CHECK-NEXT:    [[TMP60:%.*]] = bitcast <2 x i64> [[TMP59]] to <4 x i32>
-// CHECK-NEXT:    [[TMP61:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP62:%.*]] = bitcast <2 x i64> [[TMP61]] to <4 x i32>
-// CHECK-NEXT:    [[TMP63:%.*]] = icmp ult <4 x i32> [[TMP60]], [[TMP62]]
-// CHECK-NEXT:    [[TMP64:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP60]], <4 x i32> [[TMP62]]
-// CHECK-NEXT:    [[TMP65:%.*]] = bitcast <4 x i32> [[TMP64]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP65]], <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP66:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP67:%.*]] = bitcast <2 x i64> [[TMP66]] to <4 x i32>
-// CHECK-NEXT:    [[TMP68:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP69:%.*]] = bitcast <2 x i64> [[TMP68]] to <4 x i32>
-// CHECK-NEXT:    [[SHUFFLE10_I:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> [[TMP69]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK-NEXT:    [[TMP70:%.*]] = bitcast <4 x i32> [[SHUFFLE10_I]] to <2 x i64>
-// CHECK-NEXT:    store <2 x i64> [[TMP70]], <2 x i64>* [[__T9_I]], align 16
-// CHECK-NEXT:    [[TMP71:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP72:%.*]] = load <2 x i64>, <2 x i64>* [[__T9_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP71]], <2 x i64>* [[__V1_ADDR_I_I]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP72]], <2 x i64>* [[__V2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP73:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP74:%.*]] = bitcast <2 x i64> [[TMP73]] to <4 x i32>
-// CHECK-NEXT:    [[TMP75:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP76:%.*]] = bitcast <2 x i64> [[TMP75]] to <4 x i32>
-// CHECK-NEXT:    [[TMP77:%.*]] = icmp ult <4 x i32> [[TMP74]], [[TMP76]]
-// CHECK-NEXT:    [[TMP78:%.*]] = select <4 x i1> [[TMP77]], <4 x i32> [[TMP74]], <4 x i32> [[TMP76]]
-// CHECK-NEXT:    [[TMP79:%.*]] = bitcast <4 x i32> [[TMP78]] to <2 x i64>
-// CHECK-NEXT:    [[TMP80:%.*]] = bitcast <2 x i64> [[TMP79]] to <4 x i32>
-// CHECK-NEXT:    store <4 x i32> [[TMP80]], <4 x i32>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[TMP81:%.*]] = load <4 x i32>, <4 x i32>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <4 x i32> [[TMP81]], i32 0
-// CHECK-NEXT:    ret i32 [[VECEXT_I]]
-unsigned int test_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __W){
-  return _mm512_mask_reduce_min_epu32(__M, __W); 
-}
-
-// CHECK-LABEL: define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W) #0 {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__W_ADDR_I_I:%.*]] = alloca float, align 4
-// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK-NEXT:    [[__W2_ADDR_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK-NEXT:    [[__U_ADDR_I_I:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK-NEXT:    [[__A_ADDR_I14_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I15_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I12_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I13_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__A2_ADDR_I_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__A_ADDR_I16_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__B_ADDR_I17_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__M_ADDR_I:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <16 x float>, align 64
-// CHECK-NEXT:    [[__T1_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__T2_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__T3_I:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:    [[__T4_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T5_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T6_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T7_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T8_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T9_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__T10_I:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[__M_ADDR:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <16 x float>, align 64
-// CHECK-NEXT:    store i16 [[__M:%.*]], i16* [[__M_ADDR]], align 2
-// CHECK-NEXT:    store <16 x float> [[__W:%.*]], <16 x float>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    [[TMP0:%.*]] = load i16, i16* [[__M_ADDR]], align 2
-// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[__W_ADDR]], align 64
-// CHECK-NEXT:    store i16 [[TMP0]], i16* [[__M_ADDR_I]], align 2
-// CHECK-NEXT:    store <16 x float> [[TMP1]], <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store float 0x7FF0000000000000, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <16 x float> [[VECINIT_I_I]], float [[TMP3]], i32 1
-// CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <16 x float> [[VECINIT1_I_I]], float [[TMP4]], i32 2
-// CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <16 x float> [[VECINIT2_I_I]], float [[TMP5]], i32 3
-// CHECK-NEXT:    [[TMP6:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <16 x float> [[VECINIT3_I_I]], float [[TMP6]], i32 4
-// CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT5_I_I:%.*]] = insertelement <16 x float> [[VECINIT4_I_I]], float [[TMP7]], i32 5
-// CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT6_I_I:%.*]] = insertelement <16 x float> [[VECINIT5_I_I]], float [[TMP8]], i32 6
-// CHECK-NEXT:    [[TMP9:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT7_I_I:%.*]] = insertelement <16 x float> [[VECINIT6_I_I]], float [[TMP9]], i32 7
-// CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT8_I_I:%.*]] = insertelement <16 x float> [[VECINIT7_I_I]], float [[TMP10]], i32 8
-// CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT9_I_I:%.*]] = insertelement <16 x float> [[VECINIT8_I_I]], float [[TMP11]], i32 9
-// CHECK-NEXT:    [[TMP12:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT10_I_I:%.*]] = insertelement <16 x float> [[VECINIT9_I_I]], float [[TMP12]], i32 10
-// CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT11_I_I:%.*]] = insertelement <16 x float> [[VECINIT10_I_I]], float [[TMP13]], i32 11
-// CHECK-NEXT:    [[TMP14:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT12_I_I:%.*]] = insertelement <16 x float> [[VECINIT11_I_I]], float [[TMP14]], i32 12
-// CHECK-NEXT:    [[TMP15:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT13_I_I:%.*]] = insertelement <16 x float> [[VECINIT12_I_I]], float [[TMP15]], i32 13
-// CHECK-NEXT:    [[TMP16:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT14_I_I:%.*]] = insertelement <16 x float> [[VECINIT13_I_I]], float [[TMP16]], i32 14
-// CHECK-NEXT:    [[TMP17:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK-NEXT:    [[VECINIT15_I_I:%.*]] = insertelement <16 x float> [[VECINIT14_I_I]], float [[TMP17]], i32 15
-// CHECK-NEXT:    store <16 x float> [[VECINIT15_I_I]], <16 x float>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP18:%.*]] = load <16 x float>, <16 x float>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
-// CHECK-NEXT:    [[TMP19:%.*]] = load i16, i16* [[__M_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP20:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    store <16 x float> [[TMP18]], <16 x float>* [[__W2_ADDR_I_I]], align 64
-// CHECK-NEXT:    store i16 [[TMP19]], i16* [[__U_ADDR_I_I]], align 2
-// CHECK-NEXT:    store <16 x float> [[TMP20]], <16 x float>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP21:%.*]] = load i16, i16* [[__U_ADDR_I_I]], align 2
-// CHECK-NEXT:    [[TMP22:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP23:%.*]] = load <16 x float>, <16 x float>* [[__W2_ADDR_I_I]], align 64
-// CHECK-NEXT:    [[TMP24:%.*]] = bitcast i16 [[TMP21]] to <16 x i1>
-// CHECK-NEXT:    [[TMP25:%.*]] = select <16 x i1> [[TMP24]], <16 x float> [[TMP22]], <16 x float> [[TMP23]]
-// CHECK-NEXT:    store <16 x float> [[TMP25]], <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP26:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP27:%.*]] = bitcast <16 x float> [[TMP26]] to <8 x double>
-// CHECK-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x double> [[TMP27]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <4 x double> [[EXTRACT_I]] to <8 x float>
-// CHECK-NEXT:    store <8 x float> [[TMP28]], <8 x float>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP29:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x float> [[TMP29]] to <8 x double>
-// CHECK-NEXT:    [[EXTRACT4_I:%.*]] = shufflevector <8 x double> [[TMP30]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x double> [[EXTRACT4_I]] to <8 x float>
-// CHECK-NEXT:    store <8 x float> [[TMP31]], <8 x float>* [[__T2_I]], align 32
-// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x float>, <8 x float>* [[__T1_I]], align 32
-// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x float>, <8 x float>* [[__T2_I]], align 32
-// CHECK-NEXT:    store <8 x float> [[TMP32]], <8 x float>* [[__A_ADDR_I16_I]], align 32
-// CHECK-NEXT:    store <8 x float> [[TMP33]], <8 x float>* [[__B_ADDR_I17_I]], align 32
-// CHECK-NEXT:    [[TMP34:%.*]] = load <8 x float>, <8 x float>* [[__A_ADDR_I16_I]], align 32
-// CHECK-NEXT:    [[TMP35:%.*]] = load <8 x float>, <8 x float>* [[__B_ADDR_I17_I]], align 32
-// CHECK-NEXT:    [[TMP36:%.*]] = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> [[TMP34]], <8 x float> [[TMP35]]) #2
-// CHECK-NEXT:    store <8 x float> [[TMP36]], <8 x float>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[TMP37:%.*]] = load <8 x float>, <8 x float>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT6_I:%.*]] = shufflevector <8 x float> [[TMP37]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    store <4 x float> [[EXTRACT6_I]], <4 x float>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP38:%.*]] = load <8 x float>, <8 x float>* [[__T3_I]], align 32
-// CHECK-NEXT:    [[EXTRACT7_I:%.*]] = shufflevector <8 x float> [[TMP38]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    store <4 x float> [[EXTRACT7_I]], <4 x float>* [[__T5_I]], align 16
-// CHECK-NEXT:    [[TMP39:%.*]] = load <4 x float>, <4 x float>* [[__T4_I]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = load <4 x float>, <4 x float>* [[__T5_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP39]], <4 x float>* [[__A_ADDR_I14_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP40]], <4 x float>* [[__B_ADDR_I15_I]], align 16
-// CHECK-NEXT:    [[TMP41:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I14_I]], align 16
-// CHECK-NEXT:    [[TMP42:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I15_I]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[TMP41]], <4 x float> [[TMP42]]) #2
-// CHECK-NEXT:    store <4 x float> [[TMP43]], <4 x float>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP44:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP45:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP44]], <4 x float> [[TMP45]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-// CHECK-NEXT:    store <4 x float> [[SHUFFLE_I]], <4 x float>* [[__T7_I]], align 16
-// CHECK-NEXT:    [[TMP46:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
-// CHECK-NEXT:    [[TMP47:%.*]] = load <4 x float>, <4 x float>* [[__T7_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP46]], <4 x float>* [[__A_ADDR_I12_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP47]], <4 x float>* [[__B_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP48:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I12_I]], align 16
-// CHECK-NEXT:    [[TMP49:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I13_I]], align 16
-// CHECK-NEXT:    [[TMP50:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[TMP48]], <4 x float> [[TMP49]]) #2
-// CHECK-NEXT:    store <4 x float> [[TMP50]], <4 x float>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP51:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP52:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[SHUFFLE10_I:%.*]] = shufflevector <4 x float> [[TMP51]], <4 x float> [[TMP52]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK-NEXT:    store <4 x float> [[SHUFFLE10_I]], <4 x float>* [[__T9_I]], align 16
-// CHECK-NEXT:    [[TMP53:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
-// CHECK-NEXT:    [[TMP54:%.*]] = load <4 x float>, <4 x float>* [[__T9_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP53]], <4 x float>* [[__A2_ADDR_I_I]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP54]], <4 x float>* [[__B_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP55:%.*]] = load <4 x float>, <4 x float>* [[__A2_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP56:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I_I]], align 16
-// CHECK-NEXT:    [[TMP57:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[TMP55]], <4 x float> [[TMP56]]) #2
-// CHECK-NEXT:    store <4 x float> [[TMP57]], <4 x float>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[TMP58:%.*]] = load <4 x float>, <4 x float>* [[__T10_I]], align 16
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <4 x float> [[TMP58]], i32 0
-// CHECK-NEXT:    ret float [[VECEXT_I]]
-float test_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __W){
-  return _mm512_mask_reduce_min_ps(__M, __W); 
-}
-
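The deleted checks above all expand to the same reduction shape: masked-off lanes are first replaced by the operation's identity (visible as the stored constants 2147483647 for signed min, -1 for unsigned min, and 0x7FF0000000000000 / 0xFFF0000000000000 for the +infinity / -infinity float cases), and the 512-bit vector is then halved repeatedly via extracts and shuffles until lane 0 holds the result. A minimal scalar sketch of that semantics follows; the helper name and loop form are illustrative only, not the header's actual implementation.

#include <limits.h>

/* Scalar model of _mm512_mask_reduce_min_epi32: lanes cleared in the
 * 16-bit mask contribute the identity INT_MAX (the 2147483647 stored
 * by the checks above), so they can never win the minimum. */
static int reduce_min_epi32_model(unsigned short mask, const int lanes[16]) {
  int acc = INT_MAX;
  for (int i = 0; i < 16; ++i) {
    int v = ((mask >> i) & 1) ? lanes[i] : INT_MAX;
    if (v < acc)
      acc = v;
  }
  return acc;
}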
diff --git a/clang/test/CodeGen/basic-block-sections.c b/clang/test/CodeGen/basic-block-sections.c
index 6cdea79f0fa7b..dc414d70ba5f9 100644
--- a/clang/test/CodeGen/basic-block-sections.c
+++ b/clang/test/CodeGen/basic-block-sections.c
@@ -1,12 +1,11 @@
 // REQUIRES: x86-registered-target
 
-// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -S -o - < %s | FileCheck %s --check-prefix=PLAIN
-// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -S -fbasic-block-sections=all -fbasic-block-sections=none -o - < %s | FileCheck %s --check-prefix=PLAIN
+// RUN: %clang_cc1 -triple x86_64 -S -o - < %s | FileCheck %s --check-prefix=PLAIN
+// RUN: %clang_cc1 -triple x86_64 -S -fbasic-block-sections=all -fbasic-block-sections=none -o - < %s | FileCheck %s --check-prefix=PLAIN
 
-// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -S -fbasic-block-sections=labels -o - < %s | FileCheck %s --check-prefix=BB_LABELS
-// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -S -fbasic-block-sections=all -o - < %s | FileCheck %s --check-prefix=BB_WORLD --check-prefix=BB_ALL
-// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -S -fbasic-block-sections=list=%S/Inputs/basic-block-sections.funcnames -o - < %s | FileCheck %s --check-prefix=BB_WORLD --check-prefix=BB_LIST
-// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -S -fbasic-block-sections=all -funique-basic-block-section-names -o - < %s | FileCheck %s --check-prefix=UNIQUE
+// RUN: %clang_cc1 -triple x86_64 -S -fbasic-block-sections=all -o - < %s | FileCheck %s --check-prefix=BB_WORLD --check-prefix=BB_ALL
+// RUN: %clang_cc1 -triple x86_64 -S -fbasic-block-sections=list=%S/Inputs/basic-block-sections.funcnames -o - < %s | FileCheck %s --check-prefix=BB_WORLD --check-prefix=BB_LIST
+// RUN: %clang_cc1 -triple x86_64 -S -fbasic-block-sections=all -funique-basic-block-section-names -o - < %s | FileCheck %s --check-prefix=UNIQUE
 
 int world(int a) {
   if (a > 10)
@@ -26,12 +25,6 @@ int another(int a) {
 // PLAIN-NOT: section
 // PLAIN: world:
 //
-// BB_LABELS-NOT: section
-// BB_LABELS: world:
-// BB_LABELS: a.BB.world:
-// BB_LABELS: aa.BB.world:
-// BB_LABELS: a.BB.another:
-//
 // BB_WORLD: .section .text.world,"ax",@progbits{{$}}
 // BB_WORLD: world:
 // BB_WORLD: .section .text.world,"ax",@progbits,unique
diff --git a/clang/test/CodeGen/builtin-align-array.c b/clang/test/CodeGen/builtin-align-array.c
index 97235c33b7fbe..31f7b42b56170 100644
--- a/clang/test/CodeGen/builtin-align-array.c
+++ b/clang/test/CodeGen/builtin-align-array.c
@@ -4,7 +4,7 @@
 
 extern int func(char *c);
 
-// CHECK-LABEL: define {{[^@]+}}@test_array() #0
+// CHECK-LABEL: @test_array(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[BUF:%.*]] = alloca [1024 x i8], align 16
 // CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BUF]], i64 0, i64 44
@@ -12,10 +12,7 @@ extern int func(char *c);
 // CHECK-NEXT:    [[ALIGNED_INTPTR:%.*]] = and i64 [[INTPTR]], -16
 // CHECK-NEXT:    [[DIFF:%.*]] = sub i64 [[ALIGNED_INTPTR]], [[INTPTR]]
 // CHECK-NEXT:    [[ALIGNED_RESULT:%.*]] = getelementptr inbounds i8, i8* [[ARRAYIDX]], i64 [[DIFF]]
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i8* [[ALIGNED_RESULT]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 15
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(i8* [[ALIGNED_RESULT]], i64 16) ]
 // CHECK-NEXT:    [[CALL:%.*]] = call i32 @func(i8* [[ALIGNED_RESULT]])
 // CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BUF]], i64 0, i64 22
 // CHECK-NEXT:    [[INTPTR2:%.*]] = ptrtoint i8* [[ARRAYIDX1]] to i64
@@ -23,13 +20,10 @@ extern int func(char *c);
 // CHECK-NEXT:    [[ALIGNED_INTPTR4:%.*]] = and i64 [[OVER_BOUNDARY]], -32
 // CHECK-NEXT:    [[DIFF5:%.*]] = sub i64 [[ALIGNED_INTPTR4]], [[INTPTR2]]
 // CHECK-NEXT:    [[ALIGNED_RESULT6:%.*]] = getelementptr inbounds i8, i8* [[ARRAYIDX1]], i64 [[DIFF5]]
-// CHECK-NEXT:    [[PTRINT7:%.*]] = ptrtoint i8* [[ALIGNED_RESULT6]] to i64
-// CHECK-NEXT:    [[MASKEDPTR8:%.*]] = and i64 [[PTRINT7]], 31
-// CHECK-NEXT:    [[MASKCOND9:%.*]] = icmp eq i64 [[MASKEDPTR8]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND9]])
-// CHECK-NEXT:    [[CALL10:%.*]] = call i32 @func(i8* [[ALIGNED_RESULT6]])
-// CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BUF]], i64 0, i64 16
-// CHECK-NEXT:    [[SRC_ADDR:%.*]] = ptrtoint i8* [[ARRAYIDX11]] to i64
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(i8* [[ALIGNED_RESULT6]], i64 32) ]
+// CHECK-NEXT:    [[CALL7:%.*]] = call i32 @func(i8* [[ALIGNED_RESULT6]])
+// CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BUF]], i64 0, i64 16
+// CHECK-NEXT:    [[SRC_ADDR:%.*]] = ptrtoint i8* [[ARRAYIDX8]] to i64
 // CHECK-NEXT:    [[SET_BITS:%.*]] = and i64 [[SRC_ADDR]], 63
 // CHECK-NEXT:    [[IS_ALIGNED:%.*]] = icmp eq i64 [[SET_BITS]], 0
 // CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[IS_ALIGNED]] to i32
@@ -42,7 +36,7 @@ int test_array(void) {
   return __builtin_is_aligned(&buf[16], 64);
 }
 
-// CHECK-LABEL: define {{[^@]+}}@test_array_should_not_mask() #0
+// CHECK-LABEL: @test_array_should_not_mask(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[BUF:%.*]] = alloca [1024 x i8], align 32
 // CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BUF]], i64 0, i64 64
@@ -50,10 +44,7 @@ int test_array(void) {
 // CHECK-NEXT:    [[ALIGNED_INTPTR:%.*]] = and i64 [[INTPTR]], -16
 // CHECK-NEXT:    [[DIFF:%.*]] = sub i64 [[ALIGNED_INTPTR]], [[INTPTR]]
 // CHECK-NEXT:    [[ALIGNED_RESULT:%.*]] = getelementptr inbounds i8, i8* [[ARRAYIDX]], i64 [[DIFF]]
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i8* [[ALIGNED_RESULT]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 15
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(i8* [[ALIGNED_RESULT]], i64 16) ]
 // CHECK-NEXT:    [[CALL:%.*]] = call i32 @func(i8* [[ALIGNED_RESULT]])
 // CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BUF]], i64 0, i64 32
 // CHECK-NEXT:    [[INTPTR2:%.*]] = ptrtoint i8* [[ARRAYIDX1]] to i64
@@ -61,11 +52,8 @@ int test_array(void) {
 // CHECK-NEXT:    [[ALIGNED_INTPTR4:%.*]] = and i64 [[OVER_BOUNDARY]], -32
 // CHECK-NEXT:    [[DIFF5:%.*]] = sub i64 [[ALIGNED_INTPTR4]], [[INTPTR2]]
 // CHECK-NEXT:    [[ALIGNED_RESULT6:%.*]] = getelementptr inbounds i8, i8* [[ARRAYIDX1]], i64 [[DIFF5]]
-// CHECK-NEXT:    [[PTRINT7:%.*]] = ptrtoint i8* [[ALIGNED_RESULT6]] to i64
-// CHECK-NEXT:    [[MASKEDPTR8:%.*]] = and i64 [[PTRINT7]], 31
-// CHECK-NEXT:    [[MASKCOND9:%.*]] = icmp eq i64 [[MASKEDPTR8]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND9]])
-// CHECK-NEXT:    [[CALL10:%.*]] = call i32 @func(i8* [[ALIGNED_RESULT6]])
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(i8* [[ALIGNED_RESULT6]], i64 32) ]
+// CHECK-NEXT:    [[CALL7:%.*]] = call i32 @func(i8* [[ALIGNED_RESULT6]])
 // CHECK-NEXT:    ret i32 1
 //
 int test_array_should_not_mask(void) {
diff --git a/clang/test/CodeGen/builtin-align.c b/clang/test/CodeGen/builtin-align.c
index 7e66e2b5c0b9b..60f7fc99c1d4d 100644
--- a/clang/test/CodeGen/builtin-align.c
+++ b/clang/test/CodeGen/builtin-align.c
@@ -122,11 +122,7 @@ _Bool is_aligned(TYPE ptr, unsigned align) {
 // CHECK-VOID_PTR-NEXT:    [[ALIGNED_INTPTR:%.*]] = and i64 [[OVER_BOUNDARY]], [[INVERTED_MASK]]
 // CHECK-VOID_PTR-NEXT:    [[DIFF:%.*]] = sub i64 [[ALIGNED_INTPTR]], [[INTPTR]]
 // CHECK-VOID_PTR-NEXT:    [[ALIGNED_RESULT:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[DIFF]]
-// CHECK-VOID_PTR-NEXT:    [[MASK1:%.*]] = sub i64 [[ALIGNMENT]], 1
-// CHECK-VOID_PTR-NEXT:    [[PTRINT:%.*]] = ptrtoint i8* [[ALIGNED_RESULT]] to i64
-// CHECK-VOID_PTR-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK1]]
-// CHECK-VOID_PTR-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-VOID_PTR-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-VOID_PTR-NEXT:    call void @llvm.assume(i1 true) [ "align"(i8* [[ALIGNED_RESULT]], i64 [[ALIGNMENT]]) ]
 // CHECK-VOID_PTR-NEXT:    ret i8* [[ALIGNED_RESULT]]
 //
 // CHECK-FLOAT_PTR-LABEL: define {{[^@]+}}@align_up
@@ -142,11 +138,7 @@ _Bool is_aligned(TYPE ptr, unsigned align) {
 // CHECK-FLOAT_PTR-NEXT:    [[TMP0:%.*]] = bitcast float* [[PTR]] to i8*
 // CHECK-FLOAT_PTR-NEXT:    [[ALIGNED_RESULT:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 [[DIFF]]
 // CHECK-FLOAT_PTR-NEXT:    [[TMP1:%.*]] = bitcast i8* [[ALIGNED_RESULT]] to float*
-// CHECK-FLOAT_PTR-NEXT:    [[MASK1:%.*]] = sub i64 [[ALIGNMENT]], 1
-// CHECK-FLOAT_PTR-NEXT:    [[PTRINT:%.*]] = ptrtoint float* [[TMP1]] to i64
-// CHECK-FLOAT_PTR-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK1]]
-// CHECK-FLOAT_PTR-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-FLOAT_PTR-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-FLOAT_PTR-NEXT:    call void @llvm.assume(i1 true) [ "align"(float* [[TMP1]], i64 [[ALIGNMENT]]) ]
 // CHECK-FLOAT_PTR-NEXT:    ret float* [[TMP1]]
 //
 // CHECK-LONG-LABEL: define {{[^@]+}}@align_up
@@ -184,11 +176,7 @@ TYPE align_up(TYPE ptr, unsigned align) {
 // CHECK-VOID_PTR-NEXT:    [[ALIGNED_INTPTR:%.*]] = and i64 [[INTPTR]], [[INVERTED_MASK]]
 // CHECK-VOID_PTR-NEXT:    [[DIFF:%.*]] = sub i64 [[ALIGNED_INTPTR]], [[INTPTR]]
 // CHECK-VOID_PTR-NEXT:    [[ALIGNED_RESULT:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[DIFF]]
-// CHECK-VOID_PTR-NEXT:    [[MASK1:%.*]] = sub i64 [[ALIGNMENT]], 1
-// CHECK-VOID_PTR-NEXT:    [[PTRINT:%.*]] = ptrtoint i8* [[ALIGNED_RESULT]] to i64
-// CHECK-VOID_PTR-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK1]]
-// CHECK-VOID_PTR-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-VOID_PTR-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-VOID_PTR-NEXT:    call void @llvm.assume(i1 true) [ "align"(i8* [[ALIGNED_RESULT]], i64 [[ALIGNMENT]]) ]
 // CHECK-VOID_PTR-NEXT:    ret i8* [[ALIGNED_RESULT]]
 //
 // CHECK-FLOAT_PTR-LABEL: define {{[^@]+}}@align_down
@@ -203,11 +191,7 @@ TYPE align_up(TYPE ptr, unsigned align) {
 // CHECK-FLOAT_PTR-NEXT:    [[TMP0:%.*]] = bitcast float* [[PTR]] to i8*
 // CHECK-FLOAT_PTR-NEXT:    [[ALIGNED_RESULT:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 [[DIFF]]
 // CHECK-FLOAT_PTR-NEXT:    [[TMP1:%.*]] = bitcast i8* [[ALIGNED_RESULT]] to float*
-// CHECK-FLOAT_PTR-NEXT:    [[MASK1:%.*]] = sub i64 [[ALIGNMENT]], 1
-// CHECK-FLOAT_PTR-NEXT:    [[PTRINT:%.*]] = ptrtoint float* [[TMP1]] to i64
-// CHECK-FLOAT_PTR-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK1]]
-// CHECK-FLOAT_PTR-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-FLOAT_PTR-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-FLOAT_PTR-NEXT:    call void @llvm.assume(i1 true) [ "align"(float* [[TMP1]], i64 [[ALIGNMENT]]) ]
 // CHECK-FLOAT_PTR-NEXT:    ret float* [[TMP1]]
 //
 // CHECK-LONG-LABEL: define {{[^@]+}}@align_down
diff --git a/clang/test/CodeGen/builtin-assume-aligned.c b/clang/test/CodeGen/builtin-assume-aligned.c
index 90693cc215200..b9f1ebfbdcf58 100644
--- a/clang/test/CodeGen/builtin-assume-aligned.c
+++ b/clang/test/CodeGen/builtin-assume-aligned.c
@@ -8,10 +8,7 @@
 // CHECK-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8*
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i8* [[TMP1]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(i8* [[TMP1]], i64 32, i64 0) ]
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
 // CHECK-NEXT:    store i32* [[TMP2]], i32** [[A_ADDR]], align 8
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32*, i32** [[A_ADDR]], align 8
@@ -31,10 +28,7 @@ int test1(int *a) {
 // CHECK-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8*
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i8* [[TMP1]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(i8* [[TMP1]], i64 32, i64 0) ]
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
 // CHECK-NEXT:    store i32* [[TMP2]], i32** [[A_ADDR]], align 8
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32*, i32** [[A_ADDR]], align 8
@@ -54,10 +48,7 @@ int test2(int *a) {
 // CHECK-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8*
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i8* [[TMP1]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(i8* [[TMP1]], i64 32) ]
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
 // CHECK-NEXT:    store i32* [[TMP2]], i32** [[A_ADDR]], align 8
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32*, i32** [[A_ADDR]], align 8
@@ -81,11 +72,7 @@ int test3(int *a) {
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8*
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[B_ADDR]], align 4
 // CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[TMP2]] to i64
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i8* [[TMP1]] to i64
-// CHECK-NEXT:    [[OFFSETPTR:%.*]] = sub i64 [[PTRINT]], [[CONV]]
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[OFFSETPTR]], 31
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(i8* [[TMP1]], i64 32, i64 [[CONV]]) ]
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP1]] to i32*
 // CHECK-NEXT:    store i32* [[TMP3]], i32** [[A_ADDR]], align 8
 // CHECK-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[A_ADDR]], align 8
@@ -115,11 +102,7 @@ int *m2() __attribute__((assume_aligned(64, 12)));
 // CHECK-LABEL: define {{[^@]+}}@test6() #0
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CALL:%.*]] = call i32* (...) @m2()
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i32* [[CALL]] to i64
-// CHECK-NEXT:    [[OFFSETPTR:%.*]] = sub i64 [[PTRINT]], 12
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[OFFSETPTR]], 63
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(i32* [[CALL]], i64 64, i64 12) ]
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[CALL]], align 4
 // CHECK-NEXT:    ret i32 [[TMP0]]
 //
@@ -134,10 +117,7 @@ int test6() {
 // CHECK-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8*
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i8* [[TMP1]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 536870911
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(i8* [[TMP1]], i64 536870912) ]
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
 // CHECK-NEXT:    store i32* [[TMP2]], i32** [[A_ADDR]], align 8
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32*, i32** [[A_ADDR]], align 8
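Context for the alignment-assumption CHECK updates above: the open-coded ptrtoint/and/icmp sequence feeding llvm.assume is replaced by an "align" operand bundle on an always-true assume. A minimal sketch of the source forms these tests exercise (function and variable names here are illustrative, not from the tests):

  void *sketch(void *p, long offset) {
    // Two-argument form; emits:
    //   call void @llvm.assume(i1 true) [ "align"(i8* %p, i64 32) ]
    void *q = __builtin_assume_aligned(p, 32);
    // Three-argument form carries the offset as a third bundle operand:
    //   call void @llvm.assume(i1 true) [ "align"(i8* %q, i64 32, i64 %offset) ]
    return __builtin_assume_aligned(q, 32, offset);
  }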
diff --git a/clang/test/CodeGen/builtin-redeclaration.c b/clang/test/CodeGen/builtin-redeclaration.c
new file mode 100644
index 0000000000000..582907184ea53
--- /dev/null
+++ b/clang/test/CodeGen/builtin-redeclaration.c
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -emit-llvm-only %s
+
+// PR45410
+// Ensure we mark local extern redeclarations with a different type as non-builtin.
+void non_builtin() {
+  extern float exp();
+  exp(); // Will crash due to wrong number of arguments if this calls the builtin.
+}
+
+// PR45410
+// We mark exp() builtin as const with -fno-math-errno (default).
+// We mustn't do that for extern redeclarations of builtins where the type differs.
+float attribute() {
+  extern float exp();
+  return exp(1);
+}
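Why the builtin call would crash, as the comments above hint: Builtins.def declares exp as double(double), while the local `extern float exp();` is unprototyped and returns float. If the redeclaration were still treated as the builtin, IRGen would emit the call against the builtin's signature despite the mismatched argument list; marking it non-builtin keeps it an ordinary external call, and also keeps the const attribute (added under the default -fno-math-errno) off the mismatched redeclaration.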
diff --git a/clang/test/CodeGen/builtins-ppc-fpconstrained.c b/clang/test/CodeGen/builtins-ppc-fpconstrained.c
index c8b08c3fb5d4a..880c0c339ef33 100644
--- a/clang/test/CodeGen/builtins-ppc-fpconstrained.c
+++ b/clang/test/CodeGen/builtins-ppc-fpconstrained.c
@@ -2,14 +2,12 @@
 // RUN: %clang_cc1 -triple powerpc64le-gnu-linux -target-feature +vsx \
 // RUN: -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-UNCONSTRAINED %s
 // RUN: %clang_cc1 -triple powerpc64le-gnu-linux -target-feature +vsx \
-// RUN: -fexperimental-strict-floating-point \
 // RUN:  -ffp-exception-behavior=strict -emit-llvm %s -o - | FileCheck \
 // RUN: --check-prefix=CHECK-CONSTRAINED -vv %s
 // RUN: %clang_cc1 -triple powerpc64le-gnu-linux -target-feature +vsx \
 // RUN: -fallow-half-arguments-and-returns -S -o - %s | \
 // RUN: FileCheck --check-prefix=CHECK-ASM --check-prefix=NOT-FIXME-CHECK  %s
 // RUN: %clang_cc1 -triple powerpc64le-gnu-linux -target-feature +vsx \
-// RUN: -fexperimental-strict-floating-point \
 // RUN: -fallow-half-arguments-and-returns -S -ffp-exception-behavior=strict \
 // RUN: -o - %s | FileCheck --check-prefix=CHECK-ASM \
 // RUN: --check-prefix=FIXME-CHECK  %s
@@ -59,14 +57,14 @@ void test_float(void) {
 
   vf = __builtin_vsx_xvrspic(vf);
   // CHECK-LABEL: try-xvrspic
-  // CHECK-UNCONSTRAINED: @llvm.nearbyint.v4f32(<4 x float> %{{.*}})
-  // CHECK-CONSTRAINED: @llvm.experimental.constrained.nearbyint.v4f32(<4 x float> %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  // CHECK-UNCONSTRAINED: @llvm.rint.v4f32(<4 x float> %{{.*}})
+  // CHECK-CONSTRAINED: @llvm.experimental.constrained.rint.v4f32(<4 x float> %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
   // CHECK-ASM: xvrspic
 
   vd = __builtin_vsx_xvrdpic(vd);
   // CHECK-LABEL: try-xvrdpic
-  // CHECK-UNCONSTRAINED: @llvm.nearbyint.v2f64(<2 x double> %{{.*}})
-  // CHECK-CONSTRAINED: @llvm.experimental.constrained.nearbyint.v2f64(<2 x double> %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  // CHECK-UNCONSTRAINED: @llvm.rint.v2f64(<2 x double> %{{.*}})
+  // CHECK-CONSTRAINED: @llvm.experimental.constrained.rint.v2f64(<2 x double> %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
   // CHECK-ASM: xvrdpic
 
   vf = __builtin_vsx_xvrspip(vf);
diff --git a/clang/test/CodeGen/builtins-ppc-p10vector.c b/clang/test/CodeGen/builtins-ppc-p10vector.c
index ad63d646196c3..0f72c5b0146ed 100644
--- a/clang/test/CodeGen/builtins-ppc-p10vector.c
+++ b/clang/test/CodeGen/builtins-ppc-p10vector.c
@@ -17,6 +17,7 @@ vector signed int vsia, vsib;
 vector unsigned int vuia, vuib, vuic;
 vector signed long long vslla, vsllb;
 vector unsigned long long vulla, vullb, vullc;
+vector signed __int128 vsi128a, vsi128b;
 vector unsigned __int128 vui128a, vui128b, vui128c;
 vector float vfa, vfb;
 vector double vda, vdb;
@@ -69,6 +70,18 @@ vector unsigned long long test_vec_div_ull(void) {
   return vec_div(vulla, vullb);
 }
 
+vector unsigned __int128 test_vec_div_u128(void) {
+  // CHECK: udiv <1 x i128>
+  // CHECK-NEXT: ret <1 x i128>
+  return vec_div(vui128a, vui128b);
+}
+
+vector signed __int128 test_vec_div_s128(void) {
+  // CHECK: sdiv <1 x i128>
+  // CHECK-NEXT: ret <1 x i128>
+  return vec_div(vsi128a, vsi128b);
+}
+
 vector signed int test_vec_dive_si(void) {
   // CHECK: @llvm.ppc.altivec.vdivesw(<4 x i32> %{{.+}}, <4 x i32> %{{.+}})
   // CHECK-NEXT: ret <4 x i32>
@@ -231,6 +244,30 @@ vector unsigned __int128 test_vec_expandm_u128(void) {
   return vec_expandm(vui128a);
 }
 
+unsigned long long test_vec_cntm_uc(void) {
+  // CHECK: @llvm.ppc.altivec.vcntmbb(<16 x i8> %{{.+}}, i32
+  // CHECK-NEXT: ret i64
+  return vec_cntm(vuca, 1);
+}
+
+unsigned long long test_vec_cntm_us(void) {
+  // CHECK: @llvm.ppc.altivec.vcntmbh(<8 x i16> %{{.+}}, i32
+  // CHECK-NEXT: ret i64
+  return vec_cntm(vusa, 0);
+}
+
+unsigned long long test_vec_cntm_ui(void) {
+  // CHECK: @llvm.ppc.altivec.vcntmbw(<4 x i32> %{{.+}}, i32
+  // CHECK-NEXT: ret i64
+  return vec_cntm(vuia, 1);
+}
+
+unsigned long long test_vec_cntm_ull(void) {
+  // CHECK: @llvm.ppc.altivec.vcntmbd(<2 x i64> %{{.+}}, i32
+  // CHECK-NEXT: ret i64
+  return vec_cntm(vulla, 0);
+}
+
 unsigned long long test_vgnb_1(void) {
   // CHECK: @llvm.ppc.altivec.vgnb(<1 x i128> %{{.+}}, i32 2)
   // CHECK-NEXT: ret i64
diff --git a/clang/test/CodeGen/builtins-ppc-vsx.c b/clang/test/CodeGen/builtins-ppc-vsx.c
index 0d07247262754..2542b30590bf8 100644
--- a/clang/test/CodeGen/builtins-ppc-vsx.c
+++ b/clang/test/CodeGen/builtins-ppc-vsx.c
@@ -863,12 +863,12 @@ void test1() {
 // CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double>
 
   res_vf = vec_rint(vf);
-// CHECK: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}})
-// CHECK-LE: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}})
+// CHECK: call <4 x float> @llvm.rint.v4f32(<4 x float> %{{[0-9]+}})
+// CHECK-LE: call <4 x float> @llvm.rint.v4f32(<4 x float> %{{[0-9]+}})
 
   res_vd = vec_rint(vd);
-// CHECK: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}})
-// CHECK-LE: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}})
+// CHECK: call <2 x double> @llvm.rint.v2f64(<2 x double> %{{[0-9]+}})
+// CHECK-LE: call <2 x double> @llvm.rint.v2f64(<2 x double> %{{[0-9]+}})
 
   res_vf = vec_rsqrte(vf);
 // CHECK: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}})
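The nearbyint-to-rint switch above changes only floating-point exception semantics: both round to integral in the current rounding mode, but rint may raise the inexact exception while nearbyint must not, and rint is the closer match for these VSX round-to-current-mode instructions. A standalone illustration using the standard <cmath>/<cfenv> functions (not part of the patch):

  #include <cfenv>
  #include <cmath>
  #include <cstdio>

  int main() {
    std::feclearexcept(FE_INEXACT);
    (void)std::nearbyint(1.5);  // rounds, but must not raise FE_INEXACT
    int after_nearbyint = std::fetestexcept(FE_INEXACT); // expected: 0
    (void)std::rint(1.5);       // rounds, may raise FE_INEXACT
    int after_rint = std::fetestexcept(FE_INEXACT);      // typically nonzero
    std::printf("nearbyint: %d, rint: %d\n", after_nearbyint, after_rint);
    return 0;
  }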
diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c
index 01e9273e0fb63..67b1586cb7c78 100644
--- a/clang/test/CodeGen/builtins-wasm.c
+++ b/clang/test/CodeGen/builtins-wasm.c
@@ -328,26 +328,20 @@ u8x16 sub_saturate_u_i8x16(u8x16 x, u8x16 y) {
 
 i8x16 abs_i8x16(i8x16 v) {
   return __builtin_wasm_abs_i8x16(v);
-  // WEBASSEMBLY: %neg = sub <16 x i8> zeroinitializer, %v
-  // WEBASSEMBLY: %abscond = icmp slt <16 x i8> %v, zeroinitializer
-  // WEBASSEMBLY: %abs = select <16 x i1> %abscond, <16 x i8> %neg, <16 x i8> %v
-  // WEBASSEMBLY: ret <16 x i8> %abs
+  // WEBASSEMBLY: call <16 x i8> @llvm.abs.v16i8(<16 x i8> %v, i1 false)
+  // WEBASSEMBLY-NEXT: ret
 }
 
 i16x8 abs_i16x8(i16x8 v) {
   return __builtin_wasm_abs_i16x8(v);
-  // WEBASSEMBLY: %neg = sub <8 x i16> zeroinitializer, %v
-  // WEBASSEMBLY: %abscond = icmp slt <8 x i16> %v, zeroinitializer
-  // WEBASSEMBLY: %abs = select <8 x i1> %abscond, <8 x i16> %neg, <8 x i16> %v
-  // WEBASSEMBLY: ret <8 x i16> %abs
+  // WEBASSEMBLY: call <8 x i16> @llvm.abs.v8i16(<8 x i16> %v, i1 false)
+  // WEBASSEMBLY-NEXT: ret
 }
 
 i32x4 abs_i32x4(i32x4 v) {
   return __builtin_wasm_abs_i32x4(v);
-  // WEBASSEMBLY: %neg = sub <4 x i32> zeroinitializer, %v
-  // WEBASSEMBLY: %abscond = icmp slt <4 x i32> %v, zeroinitializer
-  // WEBASSEMBLY: %abs = select <4 x i1> %abscond, <4 x i32> %neg, <4 x i32> %v
-  // WEBASSEMBLY: ret <4 x i32> %abs
+  // WEBASSEMBLY: call <4 x i32> @llvm.abs.v4i32(<4 x i32> %v, i1 false)
+  // WEBASSEMBLY-NEXT: ret
 }
 
 i8x16 min_s_i8x16(i8x16 x, i8x16 y) {
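The retired WEBASSEMBLY lines matched an open-coded integer abs; the new ones match a single @llvm.abs intrinsic call. A scalar sketch of the equivalence (illustrative, not from the test):

  // What the old sub/icmp/select sequence computed, element-wise:
  int abs_open_coded(int v) {
    int neg = 0 - v;         // %neg
    bool is_neg = v < 0;     // %abscond
    return is_neg ? neg : v; // %abs
  }
  // The trailing `i1 false` on @llvm.abs means an INT_MIN input is not
  // poison: the result simply wraps, matching the idiom above.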
diff --git a/clang/test/CodeGen/callback_pthread_create.c b/clang/test/CodeGen/callback_pthread_create.c
index d1b01b91eac3f..80457cb3ade3b 100644
--- a/clang/test/CodeGen/callback_pthread_create.c
+++ b/clang/test/CodeGen/callback_pthread_create.c
@@ -1,3 +1,7 @@
+// FIXME: pthread_create() definition in Builtins.def doesn't match the real one, so it doesn't get recognized as a builtin and attributes aren't added.
+// RUN: false
+// XFAIL: *
+
 // RUN: %clang_cc1 %s -S -emit-llvm -o - -disable-llvm-optzns | FileCheck %s
 
 // CHECK: declare !callback ![[cid:[0-9]+]] {{.*}}i32 @pthread_create
diff --git a/clang/test/CodeGen/catch-alignment-assumption-attribute-align_value-on-lvalue.cpp b/clang/test/CodeGen/catch-alignment-assumption-attribute-align_value-on-lvalue.cpp
index 96d264190bec7..fb2b1a76116e9 100644
--- a/clang/test/CodeGen/catch-alignment-assumption-attribute-align_value-on-lvalue.cpp
+++ b/clang/test/CodeGen/catch-alignment-assumption-attribute-align_value-on-lvalue.cpp
@@ -21,9 +21,9 @@ char **load_from_ac_struct(struct ac_struct *x) {
   // CHECK-NEXT:                        %[[X_RELOADED:.*]] = load %[[STRUCT_AC_STRUCT]]*, %[[STRUCT_AC_STRUCT]]** %[[STRUCT_AC_STRUCT_ADDR]], align 8
   // CHECK:                             %[[A_ADDR:.*]] = getelementptr inbounds %[[STRUCT_AC_STRUCT]], %[[STRUCT_AC_STRUCT]]* %[[X_RELOADED]], i32 0, i32 0
   // CHECK:                             %[[A:.*]] = load i8**, i8*** %[[A_ADDR]], align 8
-  // CHECK-NEXT:                        %[[PTRINT:.*]] = ptrtoint i8** %[[A]] to i64
-  // CHECK-NEXT:                        %[[MASKEDPTR:.*]] = and i64 %[[PTRINT]], 2147483647
-  // CHECK-NEXT:                        %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0
+  // CHECK-SANITIZE-NEXT:               %[[PTRINT:.*]] = ptrtoint i8** %[[A]] to i64
+  // CHECK-SANITIZE-NEXT:               %[[MASKEDPTR:.*]] = and i64 %[[PTRINT]], 2147483647
+  // CHECK-SANITIZE-NEXT:               %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0
   // CHECK-SANITIZE-NEXT:               %[[PTRINT_DUP:.*]] = ptrtoint i8** %[[A]] to i64, !nosanitize
   // CHECK-SANITIZE-NEXT:               br i1 %[[MASKCOND]], label %[[CONT:.*]], label %[[HANDLER_ALIGNMENT_ASSUMPTION:[^,]+]],{{.*}} !nosanitize
   // CHECK-SANITIZE:                  [[HANDLER_ALIGNMENT_ASSUMPTION]]:
@@ -32,7 +32,7 @@ char **load_from_ac_struct(struct ac_struct *x) {
   // CHECK-SANITIZE-TRAP-NEXT:          call void @llvm.trap(){{.*}}, !nosanitize
   // CHECK-SANITIZE-UNREACHABLE-NEXT:   unreachable, !nosanitize
   // CHECK-SANITIZE:                  [[CONT]]:
-  // CHECK-NEXT:                        call void @llvm.assume(i1 %[[MASKCOND]])
+  // CHECK-NEXT:                        call void @llvm.assume(i1 true) [ "align"(i8** %[[A]], i64 2147483648) ]
   // CHECK-NEXT:                        ret i8** %[[A]]
   // CHECK-NEXT:                      }
 #line 100
diff --git a/clang/test/CodeGen/catch-alignment-assumption-attribute-align_value-on-paramvar.cpp b/clang/test/CodeGen/catch-alignment-assumption-attribute-align_value-on-paramvar.cpp
index 0e3fa750c66c3..46f7d09ae2aa5 100644
--- a/clang/test/CodeGen/catch-alignment-assumption-attribute-align_value-on-paramvar.cpp
+++ b/clang/test/CodeGen/catch-alignment-assumption-attribute-align_value-on-paramvar.cpp
@@ -24,7 +24,7 @@ char **passthrough(__attribute__((align_value(0x80000000))) char **x) {
   // CHECK-SANITIZE-TRAP-NEXT:          call void @llvm.trap(){{.*}}, !nosanitize
   // CHECK-SANITIZE-UNREACHABLE-NEXT:   unreachable, !nosanitize
   // CHECK-SANITIZE:                  [[CONT]]:
-  // CHECK-SANITIZE-NEXT:               call void @llvm.assume(i1 %[[MASKCOND]])
+  // CHECK-SANITIZE-NEXT:               call void @llvm.assume(i1 true) [ "align"(i8** %[[X_RELOADED]], i64 2147483648) ]
   // CHECK-NEXT:                        ret i8** %[[X_RELOADED]]
   // CHECK-NEXT:                      }
 #line 100
diff --git a/clang/test/CodeGen/catch-alignment-assumption-attribute-alloc_align-on-function-variable.cpp b/clang/test/CodeGen/catch-alignment-assumption-attribute-alloc_align-on-function-variable.cpp
index 591eaa0e13131..40abbc3871996 100644
--- a/clang/test/CodeGen/catch-alignment-assumption-attribute-alloc_align-on-function-variable.cpp
+++ b/clang/test/CodeGen/catch-alignment-assumption-attribute-alloc_align-on-function-variable.cpp
@@ -30,10 +30,10 @@ char **caller(char **x, unsigned long alignment) {
   // CHECK-NEXT:                        %[[X_RELOADED:.*]] = load i8**, i8*** %[[X_ADDR]], align 8
   // CHECK-NEXT:                        %[[ALIGNMENT_RELOADED:.*]] = load i64, i64* %[[ALIGNMENT_ADDR]], align 8
   // CHECK-NEXT:                        %[[X_RETURNED:.*]] = call i8** @[[PASSTHROUGH]](i8** %[[X_RELOADED]], i64 %[[ALIGNMENT_RELOADED]])
-  // CHECK-NEXT:                        %[[MASK:.*]] = sub i64 %[[ALIGNMENT_RELOADED]], 1
-  // CHECK-NEXT:                        %[[PTRINT:.*]] = ptrtoint i8** %[[X_RETURNED]] to i64
-  // CHECK-NEXT:                        %[[MASKEDPTR:.*]] = and i64 %[[PTRINT]], %[[MASK]]
-  // CHECK-NEXT:                        %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0
+  // CHECK-SANITIZE-NEXT:               %[[PTRINT:.*]] = ptrtoint i8** %[[X_RETURNED]] to i64
+  // CHECK-SANITIZE-NEXT:               %[[MASK:.*]] = sub i64 %[[ALIGNMENT_RELOADED]], 1
+  // CHECK-SANITIZE-NEXT:               %[[MASKEDPTR:.*]] = and i64 %[[PTRINT]], %[[MASK]]
+  // CHECK-SANITIZE-NEXT:               %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0
   // CHECK-SANITIZE-NEXT:               %[[PTRINT_DUP:.*]] = ptrtoint i8** %[[X_RETURNED]] to i64, !nosanitize
   // CHECK-SANITIZE-NEXT:               br i1 %[[MASKCOND]], label %[[CONT:.*]], label %[[HANDLER_ALIGNMENT_ASSUMPTION:[^,]+]],{{.*}} !nosanitize
   // CHECK-SANITIZE:                  [[HANDLER_ALIGNMENT_ASSUMPTION]]:
@@ -42,7 +42,7 @@ char **caller(char **x, unsigned long alignment) {
   // CHECK-SANITIZE-TRAP-NEXT:          call void @llvm.trap(){{.*}}, !nosanitize
   // CHECK-SANITIZE-UNREACHABLE-NEXT:   unreachable, !nosanitize
   // CHECK-SANITIZE:                  [[CONT]]:
-  // CHECK-NEXT:                        call void @llvm.assume(i1 %[[MASKCOND]])
+  // CHECK-NEXT:                        call void @llvm.assume(i1 true) [ "align"(i8** %[[X_RETURNED]], i64 %1) ]
   // CHECK-NEXT:                        ret i8** %[[X_RETURNED]]
   // CHECK-NEXT:                      }
 #line 100
diff --git a/clang/test/CodeGen/catch-alignment-assumption-attribute-alloc_align-on-function.cpp b/clang/test/CodeGen/catch-alignment-assumption-attribute-alloc_align-on-function.cpp
index a41357933f918..87d903c69716c 100644
--- a/clang/test/CodeGen/catch-alignment-assumption-attribute-alloc_align-on-function.cpp
+++ b/clang/test/CodeGen/catch-alignment-assumption-attribute-alloc_align-on-function.cpp
@@ -39,7 +39,7 @@ char **caller(char **x) {
   // CHECK-SANITIZE-TRAP-NEXT:          call void @llvm.trap(){{.*}}, !nosanitize
   // CHECK-SANITIZE-UNREACHABLE-NEXT:   unreachable, !nosanitize
   // CHECK-SANITIZE:                  [[CONT]]:
-  // CHECK-SANITIZE-NEXT:               call void @llvm.assume(i1 %[[MASKCOND]])
+  // CHECK-SANITIZE-NEXT:               call void @llvm.assume(i1 true) [ "align"(i8** %[[X_RETURNED]], i64 128) ]
   // CHECK-NEXT:                        ret i8** %[[X_RETURNED]]
   // CHECK-NEXT:                      }
 #line 100
diff --git a/clang/test/CodeGen/catch-alignment-assumption-attribute-assume_aligned-on-function-two-params.cpp b/clang/test/CodeGen/catch-alignment-assumption-attribute-assume_aligned-on-function-two-params.cpp
index e78667ce16e06..ecc96bcf6a53b 100644
--- a/clang/test/CodeGen/catch-alignment-assumption-attribute-assume_aligned-on-function-two-params.cpp
+++ b/clang/test/CodeGen/catch-alignment-assumption-attribute-assume_aligned-on-function-two-params.cpp
@@ -24,10 +24,10 @@ char **caller(char **x) {
   // CHECK-NEXT:                        store i8** %[[X]], i8*** %[[X_ADDR]], align 8
   // CHECK-NEXT:                        %[[X_RELOADED:.*]] = load i8**, i8*** %[[X_ADDR]], align 8
   // CHECK-NEXT:                        %[[X_RETURNED:.*]] = call i8** @[[PASSTHROUGH]](i8** %[[X_RELOADED]])
-  // CHECK-NEXT:                        %[[PTRINT:.*]] = ptrtoint i8** %[[X_RETURNED]] to i64
-  // CHECK-NEXT:                        %[[OFFSETPTR:.*]] = sub i64 %[[PTRINT]], 42
-  // CHECK-NEXT:                        %[[MASKEDPTR:.*]] = and i64 %[[OFFSETPTR]], 2147483647
-  // CHECK-NEXT:                        %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0
+  // CHECK-SANITIZE-NEXT:               %[[PTRINT:.*]] = ptrtoint i8** %[[X_RETURNED]] to i64
+  // CHECK-SANITIZE-NEXT:               %[[OFFSETPTR:.*]] = sub i64 %[[PTRINT]], 42
+  // CHECK-SANITIZE-NEXT:               %[[MASKEDPTR:.*]] = and i64 %[[OFFSETPTR]], 2147483647
+  // CHECK-SANITIZE-NEXT:               %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0
   // CHECK-SANITIZE-NEXT:               %[[PTRINT_DUP:.*]] = ptrtoint i8** %[[X_RETURNED]] to i64, !nosanitize
   // CHECK-SANITIZE-NEXT:               br i1 %[[MASKCOND]], label %[[CONT:.*]], label %[[HANDLER_ALIGNMENT_ASSUMPTION:[^,]+]],{{.*}} !nosanitize
   // CHECK-SANITIZE:                  [[HANDLER_ALIGNMENT_ASSUMPTION]]:
@@ -36,7 +36,7 @@ char **caller(char **x) {
   // CHECK-SANITIZE-TRAP-NEXT:          call void @llvm.trap(){{.*}}, !nosanitize
   // CHECK-SANITIZE-UNREACHABLE-NEXT:   unreachable, !nosanitize
   // CHECK-SANITIZE:                  [[CONT]]:
-  // CHECK-NEXT:                        call void @llvm.assume(i1 %[[MASKCOND]])
+  // CHECK-NEXT:                        call void @llvm.assume(i1 true) [ "align"(i8** %[[X_RETURNED]], i64 2147483648, i64 42) ]
   // CHECK-NEXT:                        ret i8** %[[X_RETURNED]]
   // CHECK-NEXT:                      }
 #line 100
diff --git a/clang/test/CodeGen/catch-alignment-assumption-attribute-assume_aligned-on-function.cpp b/clang/test/CodeGen/catch-alignment-assumption-attribute-assume_aligned-on-function.cpp
index f750bbd77d42f..5bbc5843b89f8 100644
--- a/clang/test/CodeGen/catch-alignment-assumption-attribute-assume_aligned-on-function.cpp
+++ b/clang/test/CodeGen/catch-alignment-assumption-attribute-assume_aligned-on-function.cpp
@@ -36,7 +36,7 @@ char **caller(char **x) {
   // CHECK-SANITIZE-TRAP-NEXT:          call void @llvm.trap(){{.*}}, !nosanitize
   // CHECK-SANITIZE-UNREACHABLE-NEXT:   unreachable, !nosanitize
   // CHECK-SANITIZE:                  [[CONT]]:
-  // CHECK-SANITIZE-NEXT:               call void @llvm.assume(i1 %[[MASKCOND]])
+  // CHECK-SANITIZE-NEXT:               call void @llvm.assume(i1 true) [ "align"(i8** %[[X_RETURNED]], i64 128) ]
   // CHECK-NEXT:                        ret i8** %[[X_RETURNED]]
   // CHECK-NEXT:                      }
 #line 100
diff --git a/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-three-params-variable.cpp b/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-three-params-variable.cpp
index 4306e322f5fb6..9c8944ba280b4 100644
--- a/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-three-params-variable.cpp
+++ b/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-three-params-variable.cpp
@@ -16,10 +16,10 @@ void *caller(char **x, unsigned long offset) {
   // CHECK-NEXT:                        %[[X_RELOADED:.*]] = load i8**, i8*** %[[X_ADDR]], align 8
   // CHECK-NEXT:                        %[[BITCAST:.*]] = bitcast i8** %[[X_RELOADED]] to i8*
   // CHECK-NEXT:                        %[[OFFSET_RELOADED:.*]] = load i64, i64* %[[OFFSET_ADDR]], align 8
-  // CHECK-NEXT:                        %[[PTRINT:.*]] = ptrtoint i8* %[[BITCAST]] to i64
-  // CHECK-NEXT:                        %[[OFFSETPTR:.*]] = sub i64 %[[PTRINT]], %[[OFFSET_RELOADED]]
-  // CHECK-NEXT:                        %[[MASKEDPTR:.*]] = and i64 %[[OFFSETPTR]], 536870911
-  // CHECK-NEXT:                        %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0
+  // CHECK-SANITIZE-NEXT:               %[[PTRINT:.*]] = ptrtoint i8* %[[BITCAST]] to i64
+  // CHECK-SANITIZE-NEXT:               %[[OFFSETPTR:.*]] = sub i64 %[[PTRINT]], %[[OFFSET_RELOADED]]
+  // CHECK-SANITIZE-NEXT:               %[[MASKEDPTR:.*]] = and i64 %[[OFFSETPTR]], 536870911
+  // CHECK-SANITIZE-NEXT:               %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0
   // CHECK-SANITIZE-NEXT:               %[[PTRINT_DUP:.*]] = ptrtoint i8* %[[BITCAST]] to i64, !nosanitize
   // CHECK-SANITIZE-NEXT:               br i1 %[[MASKCOND]], label %[[CONT:.*]], label %[[HANDLER_ALIGNMENT_ASSUMPTION:[^,]+]],{{.*}} !nosanitize
   // CHECK-SANITIZE:                  [[HANDLER_ALIGNMENT_ASSUMPTION]]:
@@ -28,7 +28,7 @@ void *caller(char **x, unsigned long offset) {
   // CHECK-SANITIZE-TRAP-NEXT:          call void @llvm.trap(){{.*}}, !nosanitize
   // CHECK-SANITIZE-UNREACHABLE-NEXT:   unreachable, !nosanitize
   // CHECK-SANITIZE:                  [[CONT]]:
-  // CHECK-NEXT:                        call void @llvm.assume(i1 %[[MASKCOND]])
+  // CHECK-NEXT:                        call void @llvm.assume(i1 true) [ "align"(i8* %[[BITCAST]], i64 536870912, i64 %[[OFFSET_RELOADED]]) ]
   // CHECK-NEXT:                        ret i8* %[[BITCAST]]
   // CHECK-NEXT:                      }
 #line 100
diff --git a/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-three-params.cpp b/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-three-params.cpp
index 27f53e92bed89..9f61e08106a01 100644
--- a/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-three-params.cpp
+++ b/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-three-params.cpp
@@ -13,10 +13,10 @@ void *caller(char **x) {
   // CHECK-NEXT:                        store i8** %[[X]], i8*** %[[X_ADDR]], align 8
   // CHECK-NEXT:                        %[[X_RELOADED:.*]] = load i8**, i8*** %[[X_ADDR]], align 8
   // CHECK-NEXT:                        %[[BITCAST:.*]] = bitcast i8** %[[X_RELOADED]] to i8*
-  // CHECK-NEXT:                        %[[PTRINT:.*]] = ptrtoint i8* %[[BITCAST]] to i64
-  // CHECK-NEXT:                        %[[OFFSETPTR:.*]] = sub i64 %[[PTRINT]], 42
-  // CHECK-NEXT:                        %[[MASKEDPTR:.*]] = and i64 %[[OFFSETPTR]], 536870911
-  // CHECK-NEXT:                        %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0
+  // CHECK-SANITIZE-NEXT:               %[[PTRINT:.*]] = ptrtoint i8* %[[BITCAST]] to i64
+  // CHECK-SANITIZE-NEXT:               %[[OFFSETPTR:.*]] = sub i64 %[[PTRINT]], 42
+  // CHECK-SANITIZE-NEXT:               %[[MASKEDPTR:.*]] = and i64 %[[OFFSETPTR]], 536870911
+  // CHECK-SANITIZE-NEXT:               %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0
   // CHECK-SANITIZE-NEXT:               %[[PTRINT_DUP:.*]] = ptrtoint i8* %[[BITCAST]] to i64, !nosanitize
   // CHECK-SANITIZE-NEXT:               br i1 %[[MASKCOND]], label %[[CONT:.*]], label %[[HANDLER_ALIGNMENT_ASSUMPTION:[^,]+]],{{.*}} !nosanitize
   // CHECK-SANITIZE:                  [[HANDLER_ALIGNMENT_ASSUMPTION]]:
@@ -25,7 +25,7 @@ void *caller(char **x) {
   // CHECK-SANITIZE-TRAP-NEXT:          call void @llvm.trap(){{.*}}, !nosanitize
   // CHECK-SANITIZE-UNREACHABLE-NEXT:   unreachable, !nosanitize
   // CHECK-SANITIZE:                  [[CONT]]:
-  // CHECK-NEXT:                        call void @llvm.assume(i1 %[[MASKCOND]])
+  // CHECK-NEXT:                        call void @llvm.assume(i1 true) [ "align"(i8* %[[BITCAST]], i64 536870912, i64 42) ]
   // CHECK-NEXT:                        ret i8* %[[BITCAST]]
   // CHECK-NEXT:                      }
 #line 100
diff --git a/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-two-params.cpp b/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-two-params.cpp
index 5412270f37619..20bed646ff951 100644
--- a/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-two-params.cpp
+++ b/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-two-params.cpp
@@ -13,9 +13,9 @@ void *caller(char **x) {
   // CHECK-NEXT:                        store i8** %[[X]], i8*** %[[X_ADDR]], align 8
   // CHECK-NEXT:                        %[[X_RELOADED:.*]] = load i8**, i8*** %[[X_ADDR]], align 8
   // CHECK-NEXT:                        %[[BITCAST:.*]] = bitcast i8** %[[X_RELOADED]] to i8*
-  // CHECK-NEXT:                        %[[PTRINT:.*]] = ptrtoint i8* %[[BITCAST]] to i64
-  // CHECK-NEXT:                        %[[MASKEDPTR:.*]] = and i64 %[[PTRINT]], 536870911
-  // CHECK-NEXT:                        %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0
+  // CHECK-SANITIZE-NEXT:               %[[PTRINT:.*]] = ptrtoint i8* %[[BITCAST]] to i64
+  // CHECK-SANITIZE-NEXT:               %[[MASKEDPTR:.*]] = and i64 %[[PTRINT]], 536870911
+  // CHECK-SANITIZE-NEXT:               %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0
   // CHECK-SANITIZE-NEXT:               %[[PTRINT_DUP:.*]] = ptrtoint i8* %[[BITCAST]] to i64, !nosanitize
   // CHECK-SANITIZE-NEXT:               br i1 %[[MASKCOND]], label %[[CONT:.*]], label %[[HANDLER_ALIGNMENT_ASSUMPTION:[^,]+]],{{.*}} !nosanitize
   // CHECK-SANITIZE:                  [[HANDLER_ALIGNMENT_ASSUMPTION]]:
@@ -24,7 +24,7 @@ void *caller(char **x) {
   // CHECK-SANITIZE-TRAP-NEXT:          call void @llvm.trap(){{.*}}, !nosanitize
   // CHECK-SANITIZE-UNREACHABLE-NEXT:   unreachable, !nosanitize
   // CHECK-SANITIZE:                  [[CONT]]:
-  // CHECK-NEXT:                        call void @llvm.assume(i1 %[[MASKCOND]])
+  // CHECK-NEXT:                        call void @llvm.assume(i1 true) [ "align"(i8* %[[BITCAST]], i64 536870912) ]
   // CHECK-NEXT:                        ret i8* %[[BITCAST]]
   // CHECK-NEXT:                      }
 #line 100
diff --git a/clang/test/CodeGen/catch-alignment-assumption-openmp.cpp b/clang/test/CodeGen/catch-alignment-assumption-openmp.cpp
index 6d75ee0858dac..353f2fd7f17bd 100644
--- a/clang/test/CodeGen/catch-alignment-assumption-openmp.cpp
+++ b/clang/test/CodeGen/catch-alignment-assumption-openmp.cpp
@@ -12,9 +12,9 @@ void func(char *data) {
   // CHECK-NEXT:   %[[DATA_ADDR:.*]] = alloca i8*, align 8
   // CHECK:   store i8* %[[DATA]], i8** %[[DATA_ADDR]], align 8
   // CHECK:   %[[DATA_RELOADED:.*]] = load i8*, i8** %[[DATA_ADDR]], align 8
-  // CHECK-NEXT:   %[[PTRINT:.*]] = ptrtoint i8* %[[DATA_RELOADED]] to i64
-  // CHECK-NEXT:   %[[MASKEDPTR:.*]] = and i64 %[[PTRINT]], 1073741823
-  // CHECK-NEXT:   %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0
+  // CHECK-SANITIZE-NEXT:   %[[PTRINT:.*]] = ptrtoint i8* %[[DATA_RELOADED]] to i64
+  // CHECK-SANITIZE-NEXT:   %[[MASKEDPTR:.*]] = and i64 %[[PTRINT]], 1073741823
+  // CHECK-SANITIZE-NEXT:   %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0
   // CHECK-SANITIZE-NEXT:               %[[PTRINT_DUP:.*]] = ptrtoint i8* %[[DATA_RELOADED]] to i64, !nosanitize
   // CHECK-SANITIZE-NEXT:               br i1 %[[MASKCOND]], label %[[CONT:.*]], label %[[HANDLER_ALIGNMENT_ASSUMPTION:[^,]+]],{{.*}} !nosanitize
   // CHECK-SANITIZE:                  [[HANDLER_ALIGNMENT_ASSUMPTION]]:
@@ -23,7 +23,7 @@ void func(char *data) {
   // CHECK-SANITIZE-TRAP-NEXT:          call void @llvm.trap(){{.*}}, !nosanitize
   // CHECK-SANITIZE-UNREACHABLE-NEXT:   unreachable, !nosanitize
   // CHECK-SANITIZE:                  [[CONT]]:
-  // CHECK-NEXT:                        call void @llvm.assume(i1 %[[MASKCOND]])
+  // CHECK-NEXT:                        call void @llvm.assume(i1 true) [ "align"(i8* %[[DATA_RELOADED]], i64 1073741824) ]
 
 #line 100
 #pragma omp for simd aligned(data : 0x40000000)
diff --git a/clang/test/CodeGen/code-coverage-tsan.c b/clang/test/CodeGen/code-coverage-tsan.c
index 023a99598075f..17f6596aa83df 100644
--- a/clang/test/CodeGen/code-coverage-tsan.c
+++ b/clang/test/CodeGen/code-coverage-tsan.c
@@ -5,7 +5,6 @@
 // CHECK-LABEL: void @foo()
 /// Two counters are incremented by __tsan_atomic64_fetch_add.
 // CHECK:         call i64 @__tsan_atomic64_fetch_add
-// CHECK-NEXT:    call i64 @__tsan_atomic64_fetch_add
 // CHECK-NEXT:    call i32 @__tsan_atomic32_fetch_sub
 
 _Atomic(int) cnt;
diff --git a/clang/test/CodeGen/code-coverage.c b/clang/test/CodeGen/code-coverage.c
index 5a663135e2f03..39c4556b9ff4b 100644
--- a/clang/test/CodeGen/code-coverage.c
+++ b/clang/test/CodeGen/code-coverage.c
@@ -37,10 +37,10 @@ int test2(int b) {
 }
 
 
-// CHECK: @__llvm_internal_gcov_emit_function_args.0 = internal unnamed_addr constant [2 x %0]
-// CHECK-SAME: [%0 zeroinitializer, %0 { i32 1, i32 0, i32 0 }]
+// CHECK: @__llvm_internal_gcov_emit_function_args.0 = internal unnamed_addr constant [2 x %emit_function_args_ty]
+// CHECK-SAME: [%emit_function_args_ty { i32 0, i32 {{[-0-9]+}}, i32 {{[-0-9]+}} }, %emit_function_args_ty { i32 1, i32 {{[-0-9]+}}, i32 {{[-0-9]+}} }]
 
-// CHECK: @__llvm_internal_gcov_emit_file_info = internal unnamed_addr constant [1 x %2]
+// CHECK: @__llvm_internal_gcov_emit_file_info = internal unnamed_addr constant [1 x %file_info]
 /// 0x3330342a '3' '0' '4' '*'
 // 304-SAME: i32 858797098
 /// 0x3430372a '4' '0' '7' '*'
diff --git a/clang/test/CodeGen/non-power-of-2-alignment-assumptions.c b/clang/test/CodeGen/non-power-of-2-alignment-assumptions.c
index 9467f6228dfc4..b8ce1699f7ed0 100644
--- a/clang/test/CodeGen/non-power-of-2-alignment-assumptions.c
+++ b/clang/test/CodeGen/non-power-of-2-alignment-assumptions.c
@@ -9,12 +9,8 @@ void *__attribute__((alloc_align(1))) alloc(int align);
 // CHECK-NEXT:    store i32 [[ALIGN:%.*]], i32* [[ALIGN_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ALIGN_ADDR]], align 4
 // CHECK-NEXT:    [[CALL:%.*]] = call i8* @alloc(i32 [[TMP0]])
-// CHECK-NEXT:    [[ALIGNMENTCAST:%.*]] = zext i32 [[TMP0]] to i64
-// CHECK-NEXT:    [[MASK:%.*]] = sub i64 [[ALIGNMENTCAST]], 1
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i8* [[CALL]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK]]
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(i8* [[CALL]], i64 [[TMP1]]) ]
 // CHECK-NEXT:    ret void
 //
 void t0(int align) {
@@ -25,10 +21,7 @@ void t0(int align) {
 // CHECK-NEXT:    [[ALIGN_ADDR:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    store i32 [[ALIGN:%.*]], i32* [[ALIGN_ADDR]], align 4
 // CHECK-NEXT:    [[CALL:%.*]] = call i8* @alloc(i32 7)
-// CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i8* [[CALL]] to i64
-// CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 6
-// CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[MASKCOND]])
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(i8* [[CALL]], i64 7) ]
 // CHECK-NEXT:    ret void
 //
 void t1(int align) {
diff --git a/clang/test/CodeGen/pass-by-value-noalias.c b/clang/test/CodeGen/pass-by-value-noalias.c
new file mode 100644
index 0000000000000..f77ce2b1e35bb
--- /dev/null
+++ b/clang/test/CodeGen/pass-by-value-noalias.c
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -fpass-by-value-is-noalias -triple arm64-apple-iphoneos -emit-llvm -disable-llvm-optzns %s -o - 2>&1 | FileCheck --check-prefix=WITH_NOALIAS %s
+// RUN: %clang_cc1 -triple arm64-apple-iphoneos -emit-llvm -disable-llvm-optzns %s -o - 2>&1 | FileCheck --check-prefix=NO_NOALIAS %s
+
+// A struct large enough so it is not passed in registers on ARM64.
+struct Foo {
+  int a;
+  int b;
+  int c;
+  int d;
+  int e;
+  int f;
+};
+
+// WITH_NOALIAS: define void @take(%struct.Foo* noalias %arg)
+// NO_NOALIAS: define void @take(%struct.Foo* %arg)
+void take(struct Foo arg) {}
diff --git a/clang/test/CodeGen/shadowcallstack-attr.c b/clang/test/CodeGen/shadowcallstack-attr.c
index 45e710d875627..da68251bf26aa 100644
--- a/clang/test/CodeGen/shadowcallstack-attr.c
+++ b/clang/test/CodeGen/shadowcallstack-attr.c
@@ -1,9 +1,23 @@
-// RUN: %clang_cc1 -triple x86_64-linux-unknown -emit-llvm -o - %s -fsanitize=shadow-call-stack | FileCheck -check-prefix=UNBLACKLISTED %s
+// RUN: %clang_cc1 -triple x86_64-linux-unknown -emit-llvm -o - %s -fsanitize=shadow-call-stack | FileCheck -check-prefix=UNBLOCKLISTED %s
 
-// RUN: %clang_cc1 -D ATTR -triple x86_64-linux-unknown -emit-llvm -o - %s -fsanitize=shadow-call-stack | FileCheck -check-prefix=BLACKLISTED %s
+// RUN: %clang_cc1 -D ATTR -triple x86_64-linux-unknown -emit-llvm -o - %s -fsanitize=shadow-call-stack | FileCheck -check-prefix=BLOCKLISTED %s
 
 // RUN: echo -e "[shadow-call-stack]\nfun:foo" > %t
-// RUN: %clang_cc1 -fsanitize-blacklist=%t -triple x86_64-linux-unknown -emit-llvm -o - %s -fsanitize=shadow-call-stack | FileCheck -check-prefix=BLACKLISTED %s
+// RUN: %clang_cc1 -fsanitize-blacklist=%t -triple x86_64-linux-unknown -emit-llvm -o - %s -fsanitize=shadow-call-stack | FileCheck -check-prefix=BLOCKLISTED %s
+
+// RUN: %clang_cc1 -triple riscv32-linux-gnu -emit-llvm -o - %s -fsanitize=shadow-call-stack | FileCheck -check-prefix=UNBLOCKLISTED %s
+
+// RUN: %clang_cc1 -D ATTR -triple riscv32-linux-gnu -emit-llvm -o - %s -fsanitize=shadow-call-stack | FileCheck -check-prefix=BLOCKLISTED %s
+
+// RUN: echo -e "[shadow-call-stack]\nfun:foo" > %t
+// RUN: %clang_cc1 -fsanitize-blacklist=%t -triple riscv32-linux-gnu -emit-llvm -o - %s -fsanitize=shadow-call-stack | FileCheck -check-prefix=BLOCKLISTED %s
+
+// RUN: %clang_cc1 -triple riscv64-linux-gnu -emit-llvm -o - %s -fsanitize=shadow-call-stack | FileCheck -check-prefix=UNBLOCKLISTED %s
+
+// RUN: %clang_cc1 -D ATTR -triple riscv64-linux-gnu -emit-llvm -o - %s -fsanitize=shadow-call-stack | FileCheck -check-prefix=BLOCKLISTED %s
+
+// RUN: echo -e "[shadow-call-stack]\nfun:foo" > %t
+// RUN: %clang_cc1 -fsanitize-blacklist=%t -triple riscv64-linux-gnu -emit-llvm -o - %s -fsanitize=shadow-call-stack | FileCheck -check-prefix=BLOCKLISTED %s
 
 #ifdef ATTR
 __attribute__((no_sanitize("shadow-call-stack")))
@@ -12,5 +26,5 @@ int foo(int *a) { return *a; }
 
 // CHECK: define i32 @foo(i32* %a)
 
-// BLACKLISTED-NOT: attributes {{.*}}shadowcallstack{{.*}}
-// UNBLACKLISTED: attributes {{.*}}shadowcallstack{{.*}}
+// BLOCKLISTED-NOT: attributes {{.*}}shadowcallstack{{.*}}
+// UNBLOCKLISTED: attributes {{.*}}shadowcallstack{{.*}}
diff --git a/clang/test/CodeGen/thinlto_embed_bitcode.ll b/clang/test/CodeGen/thinlto_embed_bitcode.ll
new file mode 100644
index 0000000000000..2d60e16e54e1e
--- /dev/null
+++ b/clang/test/CodeGen/thinlto_embed_bitcode.ll
@@ -0,0 +1,30 @@
+; REQUIRES: x86-registered-target
+
+; Check that -lto-embed-bitcode=post-merge-pre-opt does not perform optimizations:
+; we expect 't1' - i.e. start-lib1.ll's products - to have both foo and bar defined,
+; but the bar call is still made from foo.
+; RUN: opt -module-summary %p/Inputs/start-lib1.ll -o %t1.bc
+; RUN: opt -module-summary %p/Inputs/start-lib2.ll -o %t2.bc
+; RUN: llvm-lto -thinlto -o %t.o %t1.bc %t2.bc
+
+; RUN: %clang -target x86_64-unknown-linux-gnu -O2 -o %t.o -x ir %t1.bc -c -fthinlto-index=%t.o.thinlto.bc -mllvm -lto-embed-bitcode=post-merge-pre-opt
+; RUN: llvm-readelf -S %t.o | FileCheck %s --check-prefixes=CHECK-ELF,CHECK-CMD
+; RUN: llvm-objcopy --dump-section=.llvmbc=%t-embedded.bc %t.o /dev/null
+; RUN: llvm-dis %t-embedded.bc -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOOPT
+
+; For the optimized case, we expect the inlining of foo into bar to happen.
+; RUN: %clang -target x86_64-unknown-linux-gnu -O2 -o %t.o -x ir %t1.bc -c -fthinlto-index=%t.o.thinlto.bc -mllvm -lto-embed-bitcode=optimized
+; RUN: llvm-readelf -S %t.o | FileCheck %s --check-prefixes=CHECK-ELF,CHECK-NO-CMD
+; RUN: llvm-objcopy --dump-section=.llvmbc=%t-embedded.bc %t.o /dev/null
+; RUN: llvm-dis %t-embedded.bc -o - | FileCheck %s --check-prefixes=CHECK,CHECK-OPT
+
+; CHECK-ELF:      .text   PROGBITS 0000000000000000 [[#%x,OFF:]] [[#%x,SIZE:]] 00 AX 0
+; CHECK-ELF-NEXT: .llvmbc PROGBITS 0000000000000000 [[#%x,OFF:]] [[#%x,SIZE:]] 00    0
+; CHECK-CMD:  .llvmcmd
+; CHECK-NO-CMD-NOT: .llvmcmd
+
+; CHECK:          define void @foo() 
+; CHECK-OPT-NEXT:   ret void
+; CHECK-NOOPT-NEXT: call void @bar()
+; CHECK-NOOPT: define available_externally void @bar()
+; CHECK-NOOPT-NEXT: ret void
diff --git a/clang/test/CodeGenCUDA/Inputs/device-lib-code.ll b/clang/test/CodeGenCUDA/Inputs/device-lib-code.ll
new file mode 100644
index 0000000000000..43ec911fb02cc
--- /dev/null
+++ b/clang/test/CodeGenCUDA/Inputs/device-lib-code.ll
@@ -0,0 +1,5 @@
+define linkonce_odr protected float @__ocml_fma_f32(float %0, float %1, float %2) local_unnamed_addr {
+  %4 = tail call float @llvm.fma.f32(float %0, float %1, float %2)
+  ret float %4
+}
+declare float @llvm.fma.f32(float, float, float)
diff --git a/clang/test/CodeGenCUDA/dft-func-attr-skip-intrinsic.hip b/clang/test/CodeGenCUDA/dft-func-attr-skip-intrinsic.hip
new file mode 100644
index 0000000000000..9e3e436200fc3
--- /dev/null
+++ b/clang/test/CodeGenCUDA/dft-func-attr-skip-intrinsic.hip
@@ -0,0 +1,18 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -x ir -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm-bc -disable-llvm-passes -o %t.bc %S/Inputs/device-lib-code.ll
+// RUN: %clang_cc1 -x hip -fcuda-is-device -triple amdgcn-amd-amdhsa -mlink-builtin-bitcode %t.bc -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+extern "C" __device__ float __ocml_fma_f32(float x, float y, float z);
+
+__device__ float foo(float x) {
+  return __ocml_fma_f32(x, x, x);
+}
+
+// CHECK: {{^}}define{{.*}} @__ocml_fma_f32{{.*}} [[ATTR1:#[0-9]+]]
+// CHECK: {{^}}declare{{.*}} @llvm.fma.f32{{.*}} [[ATTR2:#[0-9]+]]
+// CHECK: attributes [[ATTR1]] = { convergent
+// CHECK: attributes [[ATTR2]] = {
+// CHECK-NOT: convergent
+// CHECK: }
diff --git a/clang/test/CodeGenCXX/attr-likelihood-if-branch-weights.cpp b/clang/test/CodeGenCXX/attr-likelihood-if-branch-weights.cpp
new file mode 100644
index 0000000000000..6327396a92852
--- /dev/null
+++ b/clang/test/CodeGenCXX/attr-likelihood-if-branch-weights.cpp
@@ -0,0 +1,146 @@
+// RUN: %clang_cc1 -O1 -emit-llvm %s -o - -triple=x86_64-linux-gnu | FileCheck -DLIKELY=2000 -DUNLIKELY=1 %s
+// RUN: %clang_cc1 -O1 -emit-llvm %s -triple=x86_64-linux-gnu -mllvm -likely-branch-weight=99 -mllvm -unlikely-branch-weight=42 -o - | FileCheck -DLIKELY=99 -DUNLIKELY=42 %s
+
+extern volatile bool b;
+extern volatile int i;
+extern bool A();
+extern bool B();
+
+bool f() {
+  // CHECK-LABEL: define zeroext i1 @_Z1fv
+  // CHECK: br {{.*}} !prof !7
+  if (b)
+    [[likely]] {
+      return A();
+    }
+  return B();
+}
+
+bool g() {
+  // CHECK-LABEL: define zeroext i1 @_Z1gv
+  // CHECK: br {{.*}} !prof !8
+  if (b)
+    [[unlikely]] {
+      return A();
+    }
+
+  return B();
+}
+
+bool h() {
+  // CHECK-LABEL: define zeroext i1 @_Z1hv
+  // CHECK: br {{.*}} !prof !8
+  if (b)
+    [[unlikely]] return A();
+
+  return B();
+}
+
+void NullStmt() {
+  // CHECK-LABEL: define{{.*}}NullStmt
+  // CHECK: br {{.*}} !prof !8
+  if (b)
+    [[unlikely]];
+  else {
+    // Make sure the branches aren't optimized away.
+    b = true;
+  }
+}
+
+void IfStmt() {
+  // CHECK-LABEL: define{{.*}}IfStmt
+  // CHECK: br {{.*}} !prof !8
+  if (b)
+    [[unlikely]] if (B()) {}
+
+  // CHECK-NOT: br {{.*}} !prof
+  // CHECK: br {{.*}} !prof
+  if (b) {
+    if (B())
+      [[unlikely]] { b = false; }
+  }
+}
+
+void WhileStmt() {
+  // CHECK-LABEL: define{{.*}}WhileStmt
+  // CHECK: br {{.*}} !prof !8
+  if (b)
+    [[unlikely]] while (B()) {}
+
+  // CHECK-NOT: br {{.*}} %if.end{{.*}} !prof
+  if (b)
+    while (B())
+      [[unlikely]] { b = false; }
+}
+
+void DoStmt() {
+  // CHECK-LABEL: define{{.*}}DoStmt
+  // CHECK: br {{.*}} !prof !8
+  if (b)
+    [[unlikely]] do {}
+    while (B())
+      ;
+
+  // CHECK-NOT: br {{.*}} %if.end{{.*}} !prof
+  if (b)
+    do
+      [[unlikely]] {}
+    while (B());
+}
+
+void ForStmt() {
+  // CHECK-LABEL: define{{.*}}ForStmt
+  // CHECK: br {{.*}} !prof !8
+  if (b)
+    [[unlikely]] for (; B();) {}
+
+  // CHECK-NOT: br {{.*}} %if.end{{.*}} !prof
+  if (b)
+    for (; B();)
+      [[unlikely]] {}
+}
+
+void GotoStmt() {
+  // CHECK-LABEL: define{{.*}}GotoStmt
+  // CHECK: br {{.*}} !prof !8
+  if (b)
+    [[unlikely]] goto end;
+  else {
+    // Make sure the branches aren't optimized away.
+    b = true;
+  }
+end:;
+}
+
+void ReturnStmt() {
+  // CHECK-LABEL: define{{.*}}ReturnStmt
+  // CHECK: br {{.*}} !prof !8
+  if (b)
+    [[unlikely]] return;
+  else {
+    // Make sure the branches aren't optimized away.
+    b = true;
+  }
+}
+
+void SwitchStmt() {
+  // CHECK-LABEL: define{{.*}}SwitchStmt
+  // CHECK: br {{.*}} !prof !8
+  if (b)
+    [[unlikely]] switch (i) {}
+  else {
+    // Make sure the branches aren't optimized away.
+    b = true;
+  }
+  // CHECK-NOT: br {{.*}} %if.end{{.*}} !prof
+  if (b)
+    switch (i)
+      [[unlikely]] {}
+  else {
+    // Make sure the branches aren't optimized away.
+    b = true;
+  }
+}
+
+// CHECK: !7 = !{!"branch_weights", i32 [[UNLIKELY]], i32 [[LIKELY]]}
+// CHECK: !8 = !{!"branch_weights", i32 [[LIKELY]], i32 [[UNLIKELY]]}
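For reference, a minimal use of the C++20 likelihood attributes this new test covers (illustrative, not part of the test). The !7/!8 weights come from -mllvm -likely-branch-weight / -unlikely-branch-weight (2000 and 1 by default, per the RUN lines), and the branch_weights operands follow the br instruction's successor order:

  bool fast_path(bool c) {
    if (c) [[likely]]  // hint: this edge is expected to be hot
      return true;
    return false;
  }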
diff --git a/clang/test/CodeGenCXX/builtins.cpp b/clang/test/CodeGenCXX/builtins.cpp
index 242cba7bc14aa..b0378322f97e8 100644
--- a/clang/test/CodeGenCXX/builtins.cpp
+++ b/clang/test/CodeGenCXX/builtins.cpp
@@ -1,5 +1,19 @@
 // RUN: %clang_cc1 -triple=x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s
 
+// Builtins inside a namespace inside an extern "C" must be considered builtins.
+extern "C" {
+namespace X {
+double __builtin_fabs(double);
+float __builtin_fabsf(float) noexcept;
+} // namespace X
+}
+
+int o = X::__builtin_fabs(-2.0);
+// CHECK: @o = global i32 2, align 4
+
+long p = X::__builtin_fabsf(-3.0f);
+// CHECK: @p = global i64 3, align 8
+
 // PR8839
 extern "C" char memmove();
 
diff --git a/clang/test/CodeGenCXX/mangle-ms-auto-templates.cpp b/clang/test/CodeGenCXX/mangle-ms-auto-templates.cpp
new file mode 100644
index 0000000000000..c17f5f5e4477f
--- /dev/null
+++ b/clang/test/CodeGenCXX/mangle-ms-auto-templates.cpp
@@ -0,0 +1,47 @@
+// RUN: %clang_cc1 -std=c++17 -fms-compatibility-version=19.20 -emit-llvm %s -o - -fms-extensions -fdelayed-template-parsing -triple=x86_64-pc-windows-msvc | FileCheck --check-prefix=AFTER %s
+// RUN: %clang_cc1 -std=c++17 -fms-compatibility-version=19.14 -emit-llvm %s -o - -fms-extensions -fdelayed-template-parsing -triple=x86_64-pc-windows-msvc | FileCheck --check-prefix=BEFORE %s
+
+template <auto a>
+class AutoParmTemplate {
+public:
+  AutoParmTemplate() {}
+};
+
+template <auto... a>
+class AutoParmsTemplate {
+public:
+  AutoParmsTemplate() {}
+};
+
+template <auto a>
+auto AutoFunc() {
+  return a;
+}
+
+void template_mangling() {
+  AutoFunc<1>();
+  // AFTER: call {{.*}} @"??$AutoFunc@$MH00@@YA?A?@@XZ"
+  // BEFORE: call {{.*}} @"??$AutoFunc@$00@@YA?A?@@XZ"
+  AutoParmTemplate<0> auto_int;
+  // AFTER: call {{.*}} @"??0?$AutoParmTemplate@$MH0A@@@QEAA@XZ"
+  // BEFORE: call {{.*}} @"??0?$AutoParmTemplate@$0A@@@QEAA@XZ"
+  AutoParmTemplate<'a'> auto_char;
+  // AFTER: call {{.*}} @"??0?$AutoParmTemplate@$MD0GB@@@QEAA@XZ"
+  // BEFORE: call {{.*}} @"??0?$AutoParmTemplate@$0GB@@@QEAA@XZ"
+  AutoParmTemplate<9223372036854775807LL> int64_max;
+  // AFTER: call {{.*}} @"??0?$AutoParmTemplate@$M_J0HPPPPPPPPPPPPPPP@@@QEAA@XZ"
+  // BEFORE: call {{.*}} @"??0?$AutoParmTemplate@$0HPPPPPPPPPPPPPPP@@@QEAA@XZ"
+  AutoParmTemplate<-9223372036854775807LL - 1LL> int64_min;
+  // AFTER: call {{.*}} @"??0?$AutoParmTemplate@$M_J0?IAAAAAAAAAAAAAAA@@@QEAA@XZ"
+  // BEFORE: call {{.*}} @"??0?$AutoParmTemplate@$0?IAAAAAAAAAAAAAAA@@@QEAA@XZ"
+  AutoParmTemplate<(unsigned long long)-1> uint64_neg_1;
+  // AFTER: call {{.*}} @"??0?$AutoParmTemplate@$M_K0?0@@QEAA@XZ"
+  // BEFORE: call {{.*}} @"??0?$AutoParmTemplate@$0?0@@QEAA@XZ"
+
+  AutoParmsTemplate<0, false, 'a'> c1;
+  // AFTER: call {{.*}} @"??0?$AutoParmsTemplate@$MH0A@$M_N0A@$MD0GB@@@QEAA@XZ"
+  // BEFORE: call {{.*}} @"??0?$AutoParmsTemplate@$0A@$0A@$0GB@@@QEAA@XZ"
+  AutoParmsTemplate<(unsigned long)1, 9223372036854775807LL> c2;
+  // AFTER: call {{.*}} @"??0?$AutoParmsTemplate@$MK00$M_J0HPPPPPPPPPPPPPPP@@@QEAA@XZ"
+  // BEFORE: call {{.*}} @"??0?$AutoParmsTemplate@$00$0HPPPPPPPPPPPPPPP@@@QEAA@XZ"
+}
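A note on reading the AFTER/BEFORE pairs above: starting with MSVC 19.20 (hence -fms-compatibility-version=19.20), an `auto` non-type template argument is mangled with a `$M` marker plus the deduced type's code ahead of the value: `$MH00` is int 1, `$MD0GB@` is char 'a', `$M_J` prefixes __int64, `$M_K` unsigned __int64, `$MK` unsigned long, and `$M_N0A@` is bool false. The 19.14 scheme (BEFORE) encodes only the value.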
diff --git a/clang/test/CodeGenCXX/mangle-neon-vectors.cpp b/clang/test/CodeGenCXX/mangle-neon-vectors.cpp
index 6faf6226efd2e..cb5e40be6a6df 100644
--- a/clang/test/CodeGenCXX/mangle-neon-vectors.cpp
+++ b/clang/test/CodeGenCXX/mangle-neon-vectors.cpp
@@ -1,6 +1,7 @@
 // RUN: %clang_cc1 -triple armv7-apple-ios -target-feature +neon  %s -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 -triple arm64-apple-ios -target-feature +neon %s -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 -triple arm64-linux-gnu -target-feature +neon %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-AARCH64
+// RUN: %clang_cc1 -triple arm64-linux-gnu -target-feature +neon -target-feature +bf16 %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-AARCH64-BF16
 
 typedef float float32_t;
 typedef double float64_t;
@@ -14,6 +15,10 @@ typedef short poly16_t;
 #endif
 typedef unsigned __INT64_TYPE__ uint64_t;
 
+#if defined(__ARM_FEATURE_BF16)
+typedef __bf16 bfloat16_t;
+#endif
+
 typedef __attribute__((neon_vector_type(2))) int int32x2_t;
 typedef __attribute__((neon_vector_type(4))) int int32x4_t;
 typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
@@ -28,6 +33,10 @@ typedef __attribute__((neon_vector_type(2))) float64_t float64x2_t;
 typedef __attribute__((neon_polyvector_type(16))) poly8_t  poly8x16_t;
 typedef __attribute__((neon_polyvector_type(8)))  poly16_t poly16x8_t;
 
+#if defined(__ARM_FEATURE_BF16)
+typedef __attribute__((neon_vector_type(4))) __bf16 bfloat16x4_t;
+#endif
+
 // CHECK: 16__simd64_int32_t
 // CHECK-AARCH64: 11__Int32x2_t
 void f1(int32x2_t v) { }
@@ -72,3 +81,8 @@ void f10(poly16x8_t v) {}
 // CHECK-AARCH64: 13__Float64x2_t
 void f11(float64x2_t v) { }
 #endif
+
+#if defined(__ARM_FEATURE_BF16)
+// CHECK-AARCH64-BF16: 14__Bfloat16x4_t
+void f12(bfloat16x4_t v) {}
+#endif
diff --git a/clang/test/CodeGenCXX/microsoft-abi-typeid.cpp b/clang/test/CodeGenCXX/microsoft-abi-typeid.cpp
index f3bd7e6fd6c80..8598396f06441 100644
--- a/clang/test/CodeGenCXX/microsoft-abi-typeid.cpp
+++ b/clang/test/CodeGenCXX/microsoft-abi-typeid.cpp
@@ -46,9 +46,11 @@ const std::type_info* test4_typeid() { return &typeid(b); }
 
 const std::type_info* test5_typeid() { return &typeid(v); }
 // CHECK: define dso_local %struct.type_info* @"?test5_typeid@@YAPBUtype_info@@XZ"()
-// CHECK:        [[RT:%.*]] = call i8* @__RTtypeid(i8* bitcast (%struct.V* @"?v@@3UV@@A" to i8*))
-// CHECK-NEXT:   [[RET:%.*]] = bitcast i8* [[RT]] to %struct.type_info*
-// CHECK-NEXT:   ret %struct.type_info* [[RET]]
+// CHECK:   ret %struct.type_info* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUV@@@8" to %struct.type_info*)
+
+const std::type_info *test6_typeid() { return &typeid((V &)v); }
+// CHECK: define dso_local %struct.type_info* @"?test6_typeid@@YAPBUtype_info@@XZ"()
+// CHECK:   ret %struct.type_info* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUV@@@8" to %struct.type_info*)
 
 namespace PR26329 {
 struct Polymorphic {
diff --git a/clang/test/CodeGenCXX/pass-by-value-noalias.cpp b/clang/test/CodeGenCXX/pass-by-value-noalias.cpp
new file mode 100644
index 0000000000000..fd96a36d3d6e5
--- /dev/null
+++ b/clang/test/CodeGenCXX/pass-by-value-noalias.cpp
@@ -0,0 +1,73 @@
+// RUN: %clang_cc1 -fpass-by-value-is-noalias -triple arm64-apple-iphoneos -emit-llvm -disable-llvm-optzns %s -o - 2>&1 | FileCheck --check-prefix=WITH_NOALIAS %s
+// RUN: %clang_cc1 -triple arm64-apple-iphoneos -emit-llvm -disable-llvm-optzns %s -o - 2>&1 | FileCheck --check-prefix=NO_NOALIAS %s
+
+// A trivial struct large enough so it is not passed in registers on ARM64.
+struct Foo {
+  int a;
+  int b;
+  int c;
+  int d;
+  int e;
+  int f;
+};
+
+// Make sure noalias is added to indirect arguments with trivially copyable types
+// if -fpass-by-value-is-noalias is provided.
+
+// WITH_NOALIAS: define void @_Z4take3Foo(%struct.Foo* noalias %arg)
+// NO_NOALIAS: define void @_Z4take3Foo(%struct.Foo* %arg)
+void take(Foo arg) {}
+
+int G;
+
+// NonTrivial is not trivially copyable because it has a non-trivial copy
+// constructor.
+struct NonTrivial {
+  int a;
+  int b;
+  int c;
+  int d;
+  int e;
+  int f;
+
+  NonTrivial(const NonTrivial &Other) {
+    a = G + 10 + Other.a;
+  }
+};
+
+// Make sure noalias is not added to indirect arguments that are not trivially
+// copyable even if -fpass-by-value-is-noalias is provided.
+
+// WITH_NOALIAS: define void @_Z4take10NonTrivial(%struct.NonTrivial* %arg)
+// NO_NOALIAS:   define void @_Z4take10NonTrivial(%struct.NonTrivial* %arg)
+void take(NonTrivial arg) {}
+
+// Escape examples. Pointers to the objects passed to take() may escape, depending on whether a temporary copy is created or not (e.g. due to NRVO).
+struct A {
+  A(A **where) : data{"hello world 1"} {
+    *where = this; //Escaped pointer 1 (proposed UB?)
+  }
+
+  A() : data{"hello world 2"} {}
+
+  char data[32];
+};
+A *p;
+
+// WITH_NOALIAS: define void @_Z4take1A(%struct.A* noalias %arg)
+// NO_NOALIAS: define void @_Z4take1A(%struct.A* %arg)
+void take(A arg) {}
+
+// WITH_NOALIAS: define void @_Z7CreateAPP1A(%struct.A* noalias sret align 1 %agg.result, %struct.A** %where)
+// NO_NOALIAS: define void @_Z7CreateAPP1A(%struct.A* noalias sret align 1 %agg.result, %struct.A** %where)
+A CreateA(A **where) {
+  A justlikethis;
+  *where = &justlikethis; //Escaped pointer 2 (should also be UB, then)
+  return justlikethis;
+}
+
+// elsewhere, perhaps compiled by a smarter compiler that doesn't make a copy here
+void test() {
+  take({&p});        // 1
+  take(CreateA(&p)); // 2
+}
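A minimal sketch (illustrative, not part of the test) of what the callee gains from the attribute: a trivially copyable by-value argument is a fresh copy whose address the caller cannot hold, so `noalias` lets the optimizer assume stores through other pointers leave it untouched. A user-provided copy constructor (like NonTrivial's, or A's, which publishes `this`) can expose the copy's address, so the attribute must be withheld there. Names below are hypothetical:

    // Hypothetical callee compiled with -fpass-by-value-is-noalias: since
    // %arg is noalias, the store through 'other' cannot modify 'arg', and
    // the load of arg.a may be reordered or folded across it.
    struct Big { int a, b, c, d, e, f; };
    int sum(Big arg, Big *other) {
      other->a = 0;
      return arg.a;
    }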
diff --git a/clang/test/CodeGenCoroutines/Inputs/coroutine.h b/clang/test/CodeGenCoroutines/Inputs/coroutine.h
index 5cc78a4904aad..2dd1ce7e97351 100644
--- a/clang/test/CodeGenCoroutines/Inputs/coroutine.h
+++ b/clang/test/CodeGenCoroutines/Inputs/coroutine.h
@@ -15,7 +15,7 @@ template <> struct coroutine_handle<void> {
     return me;
   }
   void operator()() { resume(); }
-  void *address() const { return ptr; }
+  void *address() const noexcept { return ptr; }
   void resume() const { __builtin_coro_resume(ptr); }
   void destroy() const { __builtin_coro_destroy(ptr); }
   bool done() const { return __builtin_coro_done(ptr); }
diff --git a/clang/test/CodeGenCoroutines/coro-semmetric-transfer.cpp b/clang/test/CodeGenCoroutines/coro-semmetric-transfer.cpp
new file mode 100644
index 0000000000000..09205799c3f7f
--- /dev/null
+++ b/clang/test/CodeGenCoroutines/coro-semmetric-transfer.cpp
@@ -0,0 +1,53 @@
+// RUN: %clang -std=c++14 -fcoroutines-ts -emit-llvm -S -O1 %s -o - | FileCheck %s
+
+#include "Inputs/coroutine.h"
+
+namespace coro = std::experimental::coroutines_v1;
+
+struct detached_task {
+  struct promise_type {
+    detached_task get_return_object() noexcept {
+      return detached_task{coro::coroutine_handle<promise_type>::from_promise(*this)};
+    }
+
+    void return_void() noexcept {}
+
+    struct final_awaiter {
+      bool await_ready() noexcept { return false; }
+      coro::coroutine_handle<> await_suspend(coro::coroutine_handle<promise_type> h) noexcept {
+        h.destroy();
+        return {};
+      }
+      void await_resume() noexcept {}
+    };
+
+    void unhandled_exception() noexcept {}
+
+    final_awaiter final_suspend() noexcept { return {}; }
+
+    coro::suspend_always initial_suspend() noexcept { return {}; }
+  };
+
+  ~detached_task() {
+    if (coro_) {
+      coro_.destroy();
+      coro_ = {};
+    }
+  }
+
+  void start() && {
+    auto tmp = coro_;
+    coro_ = {};
+    tmp.resume();
+  }
+
+  coro::coroutine_handle<promise_type> coro_;
+};
+
+detached_task foo() {
+  co_return;
+}
+
+// Check that the lifetime of the coroutine handle used to obtain the address ends right away.
+// CHECK:       %{{.*}} = call i8* @{{.*address.*}}(%"struct.std::experimental::coroutines_v1::coroutine_handle.0"* nonnull %{{.*}})
+// CHECK-NEXT:  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %{{.*}})
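A hedged usage sketch for the task type above (assuming the semantics shown: start() hands the frame off and clears the stored handle, and final_awaiter destroys the frame at the final suspend point):

    // Hypothetical driver: after start() the task no longer owns the frame,
    // so ~detached_task() sees a null handle and does not double-destroy it.
    int main() {
      foo().start();
      return 0;
    }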
diff --git a/clang/test/CodeGenObjC/pass-by-value-noalias.m b/clang/test/CodeGenObjC/pass-by-value-noalias.m
new file mode 100644
index 0000000000000..08252800dba2f
--- /dev/null
+++ b/clang/test/CodeGenObjC/pass-by-value-noalias.m
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -fpass-by-value-is-noalias -triple arm64-apple-iphoneos -emit-llvm -disable-llvm-optzns -fobjc-runtime-has-weak -fobjc-arc -fobjc-dispatch-method=mixed %s -o - 2>&1 | FileCheck --check-prefix=WITH_NOALIAS %s
+// RUN: %clang_cc1 -triple arm64-apple-iphoneos -emit-llvm -disable-llvm-optzns -fobjc-runtime-has-weak -fobjc-arc -fobjc-dispatch-method=mixed %s -o - 2>&1 | FileCheck --check-prefix=NO_NOALIAS %s
+
+@interface Bar
+@property char value;
+@end
+
+// A struct large enough so it is not passed in registers on ARM64, but with a
+// weak reference, so noalias should not be added even with
+// -fpass-by-value-is-noalias.
+struct Foo {
+  int a;
+  int b;
+  int c;
+  int d;
+  int e;
+  Bar *__weak f;
+};
+
+// WITH_NOALIAS: define void @take(%struct.Foo* %arg)
+// NO_NOALIAS: define void @take(%struct.Foo* %arg)
+void take(struct Foo arg) {}
diff --git a/clang/test/Driver/amdgcn-gz-options.cl b/clang/test/Driver/amdgcn-gz-options.cl
new file mode 100644
index 0000000000000..40fe9cfcc50df
--- /dev/null
+++ b/clang/test/Driver/amdgcn-gz-options.cl
@@ -0,0 +1,16 @@
+// REQUIRES: zlib, amdgpu-registered-target
+
+// RUN: %clang -### -target amdgcn-amd-amdhsa -gz=none -x assembler %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_NONE %s
+// RUN: %clang -### -target amdgcn-amd-amdhsa -gz=none %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_NONE %s
+// CHECK-OPT_GZ_EQ_NONE: {{.* "-cc1(as)?".* "--compress-debug-sections=none"}}
+// CHECK-OPT_GZ_EQ_NONE: "--compress-debug-sections=none"
+
+// RUN: %clang -### -target amdgcn-amd-amdhsa -gz=zlib -x assembler %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_ZLIB %s
+// RUN: %clang -### -target amdgcn-amd-amdhsa -gz=zlib %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_ZLIB %s
+// CHECK-OPT_GZ_EQ_ZLIB: {{.* "-cc1(as)?".* "--compress-debug-sections=zlib"}}
+// CHECK-OPT_GZ_EQ_ZLIB: "--compress-debug-sections=zlib"
+
+// RUN: %clang -### -target amdgcn-amd-amdhsa -gz=zlib-gnu -x assembler %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_ZLIB_GNU %s
+// RUN: %clang -### -target amdgcn-amd-amdhsa -gz=zlib-gnu %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_ZLIB_GNU %s
+// CHECK-OPT_GZ_EQ_ZLIB_GNU: {{.* "-cc1(as)?".* "--compress-debug-sections=zlib-gnu"}}
+// CHECK-OPT_GZ_EQ_ZLIB_GNU: "--compress-debug-sections=zlib-gnu"
diff --git a/clang/test/Driver/arm64-markbti.S b/clang/test/Driver/arm64-markbti.S
new file mode 100644
index 0000000000000..8eeed74810d27
--- /dev/null
+++ b/clang/test/Driver/arm64-markbti.S
@@ -0,0 +1,26 @@
+// REQUIRES: aarch64-registered-target
+
+// When -mmark-bti-property is passed, the generated object file gets the BTI marking.
+// RUN: %clang -target arm64-linux-none -mmark-bti-property -c -o - %s | llvm-readobj -n - | FileCheck -check-prefix=CHECK  -check-prefix=CHECK_GEN %s
+// RUN: %clang -target arm64-linux-none -DNOTE_PRESENT -c %s -o - | llvm-readobj -n - | FileCheck -check-prefix=CHECK  -check-prefix=CHECK_PRESET %s
+// RUN: %clang -target arm64-linux-none -mmark-bti-property -DNOTE_PRESENT -c %s -o - | llvm-readobj -n - | FileCheck -check-prefix=CHECK  -check-prefix=CHECK_PRESET %s
+// RUN: %clang -target arm64-linux-none -mmark-bti-property -DNOTE_PRESENT -c %s -o - 2>&1 |  FileCheck -check-prefix=CHECK_WARNING %s
+//
+// CHECK_WARNING: The .note.gnu.property is not emitted because it is already present.
+// CHECK: Name: .note.gnu.property
+// CHECK: Type: NT_GNU_PROPERTY_TYPE_0
+// CHECK_GEN: aarch64 feature: BTI
+// CHECK_PRESET: aarch64 feature: BTI, PAC
+
+#ifdef NOTE_PRESENT
+  .section .note.gnu.property, "a";
+  .balign 8;
+  .long 4;
+  .long 0x10;
+  .long 0x5
+  .asciz "GNU"
+  .long 0xc0000000
+  .long 4
+  .long 3
+  .long 0
+#endif
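For reference, a hedged decoding of the hand-written note payload above as a C struct (field names are illustrative; the constants follow the ELF program-property layout):

    struct gnu_property_note {
      unsigned namesz;    // 4: size of "GNU" including the NUL
      unsigned descsz;    // 0x10: 16-byte descriptor
      unsigned type;      // 5: NT_GNU_PROPERTY_TYPE_0
      char     name[4];   // "GNU"
      unsigned pr_type;   // 0xc0000000: GNU_PROPERTY_AARCH64_FEATURE_1_AND
      unsigned pr_datasz; // 4: one 32-bit feature word
      unsigned pr_data;   // 3: BTI | PAC, hence CHECK_PRESET sees "BTI, PAC"
      unsigned pr_pad;    // 0: pads the descriptor to an 8-byte multiple
    };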
diff --git a/clang/test/Driver/compress.c b/clang/test/Driver/compress.c
index 1a16c6385c66e..f2cc187278f41 100644
--- a/clang/test/Driver/compress.c
+++ b/clang/test/Driver/compress.c
@@ -18,19 +18,21 @@
 // RUN: %clang -### -fintegrated-as -gz -c %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ %s
 // CHECK-OPT_GZ: "--compress-debug-sections"
 
-// RUN: %clang -### -fintegrated-as -gz=none -x assembler -c %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_NONE %s
-// RUN: %clang -### -fintegrated-as -gz=none -c %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_NONE %s
+// RUN: %clang -### -target x86_64-unknown-linux-gnu -gz=none -x assembler %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_NONE %s
+// RUN: %clang -### -target x86_64-unknown-linux-gnu -gz=none %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_NONE %s
+// CHECK-OPT_GZ_EQ_NONE: {{.* "-cc1(as)?".* "--compress-debug-sections=none"}}
 // CHECK-OPT_GZ_EQ_NONE: "--compress-debug-sections=none"
 
-// RUN: %clang -### -fintegrated-as -gz=zlib -x assembler -c %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_ZLIB %s
-// RUN: %clang -### -fintegrated-as -gz=zlib -c %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_ZLIB %s
+// RUN: %clang -### -target x86_64-unknown-linux-gnu -gz=zlib -x assembler %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_ZLIB %s
+// RUN: %clang -### -target x86_64-unknown-linux-gnu -gz=zlib %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_ZLIB %s
+// CHECK-OPT_GZ_EQ_ZLIB: {{.* "-cc1(as)?".* "--compress-debug-sections=zlib"}}
 // CHECK-OPT_GZ_EQ_ZLIB: "--compress-debug-sections=zlib"
 
-// RUN: %clang -### -fintegrated-as -gz=zlib-gnu -x assembler -c %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_ZLIB_GNU %s
-// RUN: %clang -### -fintegrated-as -gz=zlib-gnu -c %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_ZLIB_GNU %s
+// RUN: %clang -### -target x86_64-unknown-linux-gnu -gz=zlib-gnu -x assembler %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_ZLIB_GNU %s
+// RUN: %clang -### -target x86_64-unknown-linux-gnu -gz=zlib-gnu %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_ZLIB_GNU %s
+// CHECK-OPT_GZ_EQ_ZLIB_GNU: {{.* "-cc1(as)?".* "--compress-debug-sections=zlib-gnu"}}
 // CHECK-OPT_GZ_EQ_ZLIB_GNU: "--compress-debug-sections=zlib-gnu"
 
 // RUN: %clang -### -fintegrated-as -gz=invalid -x assembler -c %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_INVALID %s
 // RUN: %clang -### -fintegrated-as -gz=invalid -c %s 2>&1 | FileCheck -check-prefix CHECK-OPT_GZ_EQ_INVALID %s
 // CHECK-OPT_GZ_EQ_INVALID: error: unsupported argument 'invalid' to option 'gz='
-
diff --git a/clang/test/Driver/darwin-infer-simulator-sdkroot.c b/clang/test/Driver/darwin-infer-simulator-sdkroot.c
index a084bf6346b62..7d4d4070b81a1 100644
--- a/clang/test/Driver/darwin-infer-simulator-sdkroot.c
+++ b/clang/test/Driver/darwin-infer-simulator-sdkroot.c
@@ -17,7 +17,7 @@
 //
 // RUN: rm -rf %t/SDKs/iPhoneSimulator8.0.sdk
 // RUN: mkdir -p %t/SDKs/iPhoneSimulator8.0.sdk
-// RUN: env SDKROOT=%t/SDKs/iPhoneSimulator8.0.sdk %clang %s -mlinker-version=400 -### 2>&1 \
+// RUN: env SDKROOT=%t/SDKs/iPhoneSimulator8.0.sdk %clang -arch x86_64 %s -mlinker-version=400 -### 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-SIMULATOR %s
 //
 // CHECK-SIMULATOR: clang
@@ -27,6 +27,18 @@
 // CHECK-SIMULATOR: "-ios_simulator_version_min" "8.0.0"
 //
 //
+// RUN: rm -rf %t/SDKs/iPhoneSimulator14.0.sdk
+// RUN: mkdir -p %t/SDKs/iPhoneSimulator14.0.sdk
+// RUN: env SDKROOT=%t/SDKs/iPhoneSimulator14.0.sdk %clang -arch arm64 %s -mlinker-version=400 -### 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-SIMULATOR-ARM64 %s
+//
+// CHECK-SIMULATOR-ARM64: clang
+// CHECK-SIMULATOR-ARM64: "-cc1"
+// CHECK-SIMULATOR-ARM64: -apple-ios14.0.0-simulator"
+// CHECK-SIMULATOR-ARM64: ld
+// CHECK-SIMULATOR-ARM64: "-ios_simulator_version_min" "14.0.0"
+//
+//
 // RUN: rm -rf %t/SDKs/WatchOS3.0.sdk
 // RUN: mkdir -p %t/SDKs/WatchOS3.0.sdk
 // RUN: env SDKROOT=%t/SDKs/WatchOS3.0.sdk %clang %s -mlinker-version=400 -### 2>&1 \
@@ -43,7 +55,7 @@
 //
 // RUN: rm -rf %t/SDKs/WatchSimulator3.0.sdk
 // RUN: mkdir -p %t/SDKs/WatchSimulator3.0.sdk
-// RUN: env SDKROOT=%t/SDKs/WatchSimulator3.0.sdk %clang %s -mlinker-version=400 -### 2>&1 \
+// RUN: env SDKROOT=%t/SDKs/WatchSimulator3.0.sdk %clang -arch x86_64 %s -mlinker-version=400 -### 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-WATCH-SIMULATOR %s
 //
 // CHECK-WATCH-SIMULATOR: clang
@@ -53,6 +65,18 @@
 // CHECK-WATCH-SIMULATOR: "-watchos_simulator_version_min" "3.0.0"
 //
 //
+// RUN: rm -rf %t/SDKs/WatchSimulator7.0.sdk
+// RUN: mkdir -p %t/SDKs/WatchSimulator7.0.sdk
+// RUN: env SDKROOT=%t/SDKs/WatchSimulator7.0.sdk %clang -arch arm64 %s -mlinker-version=400 -### 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-WATCH-SIMULATOR-ARM64 %s
+//
+// CHECK-WATCH-SIMULATOR-ARM64: clang
+// CHECK-WATCH-SIMULATOR-ARM64: "-cc1"
+// CHECK-WATCH-SIMULATOR-ARM64: -apple-watchos7.0.0-simulator"
+// CHECK-WATCH-SIMULATOR-ARM64: ld
+// CHECK-WATCH-SIMULATOR-ARM64: "-watchos_simulator_version_min" "7.0.0"
+//
+//
 // RUN: rm -rf %t/SDKs/AppleTVOS10.0.sdk
 // RUN: mkdir -p %t/SDKs/AppleTVOS10.0.sdk
 // RUN: env SDKROOT=%t/SDKs/AppleTVOS10.0.sdk %clang %s -mlinker-version=400 -### 2>&1 \
@@ -67,7 +91,7 @@
 //
 // RUN: rm -rf %t/SDKs/AppleTVSimulator10.0.sdk
 // RUN: mkdir -p %t/SDKs/AppleTVSimulator10.0.sdk
-// RUN: env SDKROOT=%t/SDKs/AppleTVSimulator10.0.sdk %clang %s -mlinker-version=400 -### 2>&1 \
+// RUN: env SDKROOT=%t/SDKs/AppleTVSimulator10.0.sdk %clang -arch x86_64 %s -mlinker-version=400 -### 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-TV-SIMULATOR %s
 //
 // CHECK-TV-SIMULATOR: clang
@@ -75,3 +99,16 @@
 // CHECK-TV-SIMULATOR: -apple-tvos10.0.0-simulator"
 // CHECK-TV-SIMULATOR: ld
 // CHECK-TV-SIMULATOR: "-tvos_simulator_version_min" "10.0.0"
+//
+//
+// RUN: rm -rf %t/SDKs/AppleTVSimulator14.0.sdk
+// RUN: mkdir -p %t/SDKs/AppleTVSimulator14.0.sdk
+// RUN: env SDKROOT=%t/SDKs/AppleTVSimulator14.0.sdk %clang -arch arm64 %s -mlinker-version=400 -### 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-TV-SIMULATOR-ARM64 %s
+//
+// CHECK-TV-SIMULATOR-ARM64: clang
+// CHECK-TV-SIMULATOR-ARM64: "-cc1"
+// CHECK-TV-SIMULATOR-ARM64: -apple-tvos14.0.0-simulator"
+// CHECK-TV-SIMULATOR-ARM64: ld
+// CHECK-TV-SIMULATOR-ARM64: "-tvos_simulator_version_min" "14.0.0"
+
diff --git a/clang/test/Driver/fbasic-block-sections.c b/clang/test/Driver/fbasic-block-sections.c
index 2ff98c94222b2..93c7fe9fc0699 100644
--- a/clang/test/Driver/fbasic-block-sections.c
+++ b/clang/test/Driver/fbasic-block-sections.c
@@ -1,9 +1,12 @@
-// RUN: %clang -### -fbasic-block-sections=none %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-NONE %s
-// RUN: %clang -### -fbasic-block-sections=all %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-ALL %s
-// RUN: %clang -### -fbasic-block-sections=list=%s %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-LIST %s
-// RUN: %clang -### -fbasic-block-sections=labels %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-LABELS %s
+// RUN: %clang -### -target x86_64 -fbasic-block-sections=none %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-NONE %s
+// RUN: %clang -### -target x86_64 -fbasic-block-sections=all %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-ALL %s
+// RUN: %clang -### -target x86_64 -fbasic-block-sections=list=%s %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-LIST %s
+// RUN: %clang -### -target x86_64 -fbasic-block-sections=labels %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-LABELS %s
+// RUN: not %clang -c -target arm-unknown-linux -fbasic-block-sections=all %s -S 2>&1 | FileCheck -check-prefix=CHECK-TRIPLE %s
+// RUN: not %clang -c -target x86_64-apple-darwin10 -fbasic-block-sections=all %s -S 2>&1 | FileCheck -check-prefix=CHECK-TRIPLE %s
 //
-// CHECK-OPT-NONE: "-fbasic-block-sections=none"
-// CHECK-OPT-ALL: "-fbasic-block-sections=all"
-// CHECK-OPT-LIST: "-fbasic-block-sections={{[^ ]*}}fbasic-block-sections.c"
+// CHECK-OPT-NONE:   "-fbasic-block-sections=none"
+// CHECK-OPT-ALL:    "-fbasic-block-sections=all"
+// CHECK-OPT-LIST:   "-fbasic-block-sections={{[^ ]*}}fbasic-block-sections.c"
 // CHECK-OPT-LABELS: "-fbasic-block-sections=labels"
+// CHECK-TRIPLE:     error: unsupported option '-fbasic-block-sections=all' for target
diff --git a/clang/test/Driver/flang/flang.f90 b/clang/test/Driver/flang/flang.f90
index a68be31343f9c..e4629d527d183 100644
--- a/clang/test/Driver/flang/flang.f90
+++ b/clang/test/Driver/flang/flang.f90
@@ -13,7 +13,7 @@
 ! * (no type specified, resulting in an object file)
 
 ! All invocations should begin with flang -fc1, consume up to here.
-! ALL-LABEL: "{{[^"]*}}flang" "-fc1"
+! ALL-LABEL: "{{[^"]*}}flang-new" "-fc1"
 
 ! Check that f90 files are not treated as "previously preprocessed"
 ! ... in --driver-mode=flang.
diff --git a/clang/test/Driver/flang/flang_ucase.F90 b/clang/test/Driver/flang/flang_ucase.F90
index dd1e20088191f..4da09e138b59d 100644
--- a/clang/test/Driver/flang/flang_ucase.F90
+++ b/clang/test/Driver/flang/flang_ucase.F90
@@ -13,7 +13,7 @@
 ! * (no type specified, resulting in an object file)
 
 ! All invocations should begin with flang -fc1, consume up to here.
-! ALL-LABEL: "{{[^"]*}}flang" "-fc1"
+! ALL-LABEL: "{{[^"]*}}flang-new" "-fc1"
 
 ! Check that f90 files are not treated as "previously preprocessed"
 ! ... in --driver-mode=flang.
diff --git a/clang/test/Driver/flang/multiple-inputs-mixed.f90 b/clang/test/Driver/flang/multiple-inputs-mixed.f90
index 98d8cab00bdfd..2395dbecf1fe9 100644
--- a/clang/test/Driver/flang/multiple-inputs-mixed.f90
+++ b/clang/test/Driver/flang/multiple-inputs-mixed.f90
@@ -1,7 +1,7 @@
 ! Check that flang can handle mixed C and fortran inputs.
 
 ! RUN: %clang --driver-mode=flang -### -fsyntax-only %S/Inputs/one.f90 %S/Inputs/other.c 2>&1 | FileCheck --check-prefixes=CHECK-SYNTAX-ONLY %s
-! CHECK-SYNTAX-ONLY-LABEL: "{{[^"]*}}flang{{[^"/]*}}" "-fc1"
+! CHECK-SYNTAX-ONLY-LABEL: "{{[^"]*}}flang-new{{[^"/]*}}" "-fc1"
 ! CHECK-SYNTAX-ONLY: "{{[^"]*}}/Inputs/one.f90"
 ! CHECK-SYNTAX-ONLY-LABEL: "{{[^"]*}}clang{{[^"/]*}}" "-cc1"
 ! CHECK-SYNTAX-ONLY: "{{[^"]*}}/Inputs/other.c"
diff --git a/clang/test/Driver/flang/multiple-inputs.f90 b/clang/test/Driver/flang/multiple-inputs.f90
index 34592a3dc3a39..f6ee60e48fef3 100644
--- a/clang/test/Driver/flang/multiple-inputs.f90
+++ b/clang/test/Driver/flang/multiple-inputs.f90
@@ -1,7 +1,7 @@
 ! Check that flang driver can handle multiple inputs at once.
 
 ! RUN: %clang --driver-mode=flang -### -fsyntax-only %S/Inputs/one.f90 %S/Inputs/two.f90 2>&1 | FileCheck --check-prefixes=CHECK-SYNTAX-ONLY %s
-! CHECK-SYNTAX-ONLY-LABEL: "{{[^"]*}}flang" "-fc1"
+! CHECK-SYNTAX-ONLY-LABEL: "{{[^"]*}}flang-new" "-fc1"
 ! CHECK-SYNTAX-ONLY: "{{[^"]*}}/Inputs/one.f90"
-! CHECK-SYNTAX-ONLY-LABEL: "{{[^"]*}}flang" "-fc1"
+! CHECK-SYNTAX-ONLY-LABEL: "{{[^"]*}}flang-new" "-fc1"
 ! CHECK-SYNTAX-ONLY: "{{[^"]*}}/Inputs/two.f90"
diff --git a/clang/test/Driver/fmemprof.cpp b/clang/test/Driver/fmemprof.cpp
index a2b740e1e6e5e..69686442d4103 100644
--- a/clang/test/Driver/fmemprof.cpp
+++ b/clang/test/Driver/fmemprof.cpp
@@ -1,6 +1,6 @@
 // RUN: %clangxx -target x86_64-linux-gnu -fmemory-profile %s -### 2>&1 | FileCheck %s
 // RUN: %clangxx -target x86_64-linux-gnu -fmemory-profile -fno-memory-profile %s -### 2>&1 | FileCheck %s --check-prefix=OFF
 // CHECK: "-cc1" {{.*}} "-fmemory-profile"
-// CHECK: ld{{.*}}libclang_rt.heapprof{{.*}}libclang_rt.heapprof_cxx
+// CHECK: ld{{.*}}libclang_rt.memprof{{.*}}libclang_rt.memprof_cxx
 // OFF-NOT: "-fmemory-profile"
-// OFF-NOT: libclang_rt.heapprof
+// OFF-NOT: libclang_rt.memprof
diff --git a/clang/test/Driver/fsplit-machine-functions.c b/clang/test/Driver/fsplit-machine-functions.c
new file mode 100644
index 0000000000000..e126e4d41edbf
--- /dev/null
+++ b/clang/test/Driver/fsplit-machine-functions.c
@@ -0,0 +1,9 @@
+// RUN: %clang -### -target x86_64 -fprofile-use=default.profdata -fsplit-machine-functions %s -c 2>&1 | FileCheck -check-prefix=CHECK-OPT %s
+// RUN: %clang -### -target x86_64 -fprofile-use=default.profdata -fsplit-machine-functions -fno-split-machine-functions %s -c 2>&1 | FileCheck -check-prefix=CHECK-NOOPT %s
+// RUN: %clang -### -target x86_64 -fsplit-machine-functions %s 2>&1 | FileCheck -check-prefix=CHECK-WARN %s
+// RUN: not %clang -c -target arm-unknown-linux -fsplit-machine-functions %s 2>&1 | FileCheck -check-prefix=CHECK-TRIPLE %s
+
+// CHECK-OPT:       "-fsplit-machine-functions"
+// CHECK-NOOPT-NOT: "-fsplit-machine-functions"
+// CHECK-WARN:      warning: argument '-fsplit-machine-functions' requires profile-guided optimization information
+// CHECK-TRIPLE:    error: unsupported option '-fsplit-machine-functions' for target
diff --git a/clang/test/Driver/hip-gz-options.hip b/clang/test/Driver/hip-gz-options.hip
new file mode 100644
index 0000000000000..705c1be7b94ef
--- /dev/null
+++ b/clang/test/Driver/hip-gz-options.hip
@@ -0,0 +1,14 @@
+// REQUIRES: zlib, clang-driver, amdgpu-registered-target
+
+// RUN: %clang -### -target x86_64-unknown-linux-gnu \
+// RUN:    --offload-arch=gfx906 %s -nogpulib -nogpuinc \
+// RUN:   -ggdb -gz=zlib 2>&1 | FileCheck %s
+
+// RUN: %clang -### -target x86_64-unknown-linux-gnu \
+// RUN:   -fgpu-rdc --offload-arch=gfx906 %s -nogpulib -nogpuinc \
+// RUN:   -ggdb -gz=zlib 2>&1 | FileCheck %s
+
+// CHECK-DAG: {{".*clang.*" .* "--compress-debug-sections=zlib"}}
+// CHECK-DAG: {{".*lld" .* "--compress-debug-sections=zlib"}}
+// CHECK-DAG: {{".*clang.*" .* "--compress-debug-sections=zlib"}}
+// CHECK: "--compress-debug-sections=zlib"
diff --git a/clang/test/Driver/hip-offload-arch.hip b/clang/test/Driver/hip-offload-arch.hip
new file mode 100644
index 0000000000000..4cd37b5815f73
--- /dev/null
+++ b/clang/test/Driver/hip-offload-arch.hip
@@ -0,0 +1,10 @@
+// REQUIRES: clang-driver, x86-registered-target, amdgpu-registered-target
+
+// RUN: %clang -### -target x86_64-linux-gnu \
+// RUN:   --offload-arch=gfx1030 \
+// RUN:   --offload-arch=gfx1031 \
+// RUN:   -nogpuinc -nogpulib \
+// RUN:   %s 2>&1 | FileCheck %s
+
+// CHECK: {{"[^"]*clang[^"]*".* "-target-cpu" "gfx1030"}}
+// CHECK: {{"[^"]*clang[^"]*".* "-target-cpu" "gfx1031"}}
diff --git a/clang/test/Driver/hip-sanitize-options.hip b/clang/test/Driver/hip-sanitize-options.hip
new file mode 100644
index 0000000000000..908e02136cada
--- /dev/null
+++ b/clang/test/Driver/hip-sanitize-options.hip
@@ -0,0 +1,9 @@
+// REQUIRES: clang-driver, x86-registered-target, amdgpu-registered-target
+
+// RUN: %clang -### -target x86_64-unknown-linux-gnu --offload-arch=gfx906 \
+// RUN:   -fsanitize=address \
+// RUN:   -nogpuinc -nogpulib \
+// RUN:   %s 2>&1 | FileCheck %s
+
+// CHECK-NOT: {{"[^"]*clang[^"]*".* "-fcuda-is-device".* "-fsanitize=address"}}
+// CHECK: {{"[^"]*clang[^"]*".* "-triple" "x86_64-unknown-linux-gnu".* "-fsanitize=address"}}
diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c
index a3070d26d16cc..9a300256d08ea 100644
--- a/clang/test/Driver/sanitizer-ld.c
+++ b/clang/test/Driver/sanitizer-ld.c
@@ -614,6 +614,16 @@
 // RUN:   | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64 %s
 // CHECK-SHADOWCALLSTACK-LINUX-AARCH64: '-fsanitize=shadow-call-stack' only allowed with '-ffixed-x18'
 
+// RUN: %clang -fsanitize=shadow-call-stack %s -### -o %t.o 2>&1 \
+// RUN:     -target riscv32-unknown-elf -fuse-ld=ld \
+// RUN:   | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-ELF-RISCV32 %s
+// CHECK-SHADOWCALLSTACK-ELF-RISCV32: '-fsanitize=shadow-call-stack' only allowed with '-ffixed-x18'
+
+// RUN: %clang -fsanitize=shadow-call-stack %s -### -o %t.o 2>&1 \
+// RUN:     -target riscv64-unknown-linux -fuse-ld=ld \
+// RUN:   | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-RISCV64 %s
+// CHECK-SHADOWCALLSTACK-LINUX-RISCV64: '-fsanitize=shadow-call-stack' only allowed with '-ffixed-x18'
+
 // RUN: %clang -fsanitize=shadow-call-stack %s -### -o %t.o 2>&1 \
 // RUN:     -target aarch64-unknown-linux -fuse-ld=ld -ffixed-x18 \
 // RUN:   | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18 %s
diff --git a/clang/test/Driver/split-debug.c b/clang/test/Driver/split-debug.c
index d40207d5ae3b6..b6ebbaa2036e2 100644
--- a/clang/test/Driver/split-debug.c
+++ b/clang/test/Driver/split-debug.c
@@ -10,6 +10,11 @@
 // RUN: %clang -target x86_64-unknown-linux-gnu -gsplit-dwarf=split -c -### %s 2> %t
 // RUN: FileCheck -check-prefix=CHECK-ACTIONS < %t %s
 
+// RUN: %clang -target wasm32-unknown-unknown -gsplit-dwarf -c -### %s 2> %t
+// RUN: FileCheck -check-prefix=CHECK-ACTIONS < %t %s
+// RUN: %clang -target wasm32-unknown-unknown -gsplit-dwarf=split -c -### %s 2> %t
+// RUN: FileCheck -check-prefix=CHECK-ACTIONS < %t %s
+
 // RUN: %clang -target x86_64-unknown-linux-gnu -gsplit-dwarf=single -c -### %s 2> %t
 // RUN: FileCheck -check-prefix=CHECK-ACTIONS-SINGLE-SPLIT < %t %s
 //
diff --git a/clang/test/Driver/unavailable_aligned_allocation.cpp b/clang/test/Driver/unavailable_aligned_allocation.cpp
index 131bc116be10c..7f5d8e2cc7d4b 100644
--- a/clang/test/Driver/unavailable_aligned_allocation.cpp
+++ b/clang/test/Driver/unavailable_aligned_allocation.cpp
@@ -22,6 +22,9 @@
 // RUN: -c -### %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=UNAVAILABLE
 //
+// RUN: %clang -target s390x-none-zos -c -### %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=UNAVAILABLE
+
 // UNAVAILABLE: "-faligned-alloc-unavailable"
 
 // RUN: %clang -target x86_64-apple-macosx10.14 -c -### %s 2>&1 \
@@ -59,5 +62,11 @@
 //
 // RUN: %clang -target x86_64-apple-macosx10.13 -fno-aligned-allocation -c -### %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=AVAILABLE
+//
+// RUN: %clang -target s390x-none-zos -faligned-allocation -c -### %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=AVAILABLE
+//
+// RUN: %clang -target s390x-none-zos -fno-aligned-allocation -c -### %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=AVAILABLE
 
 // AVAILABLE-NOT: "-faligned-alloc-unavailable"
diff --git a/clang/test/Driver/wasm-toolchain.c b/clang/test/Driver/wasm-toolchain.c
index ad8b000ad2250..3c2eb66f9e199 100644
--- a/clang/test/Driver/wasm-toolchain.c
+++ b/clang/test/Driver/wasm-toolchain.c
@@ -119,3 +119,14 @@
 // RUN:   | FileCheck -check-prefix=CHECK-REACTOR %s
 // CHECK-REACTOR: clang{{.*}}" "-cc1" {{.*}} "-o" "[[temp:[^"]*]]"
 // CHECK-REACTOR: wasm-ld{{.*}}" "crt1-reactor.o" "--entry" "_initialize" "[[temp]]" "-lc" "{{.*[/\\]}}libclang_rt.builtins-wasm32.a" "-o" "a.out"
+
+// -fPIC implies +mutable-globals
+
+// RUN: %clang %s -### -no-canonical-prefixes -target wasm32-unknown-unknown --sysroot=%s/no-sysroot-there -fPIC 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-PIC %s
+// CHECK-PIC: clang{{.*}}" "-cc1" {{.*}} "-target-feature" "+mutable-globals"
+
+// '-mno-mutable-globals' is not allowed with '-fPIC'
+// RUN: %clang %s -### -no-canonical-prefixes -target wasm32-unknown-unknown --sysroot=%s/no-sysroot-there -fPIC -mno-mutable-globals 2>&1 \
+// RUN:   | FileCheck -check-prefix=PIC_NO_MUTABLE_GLOBALS %s
+// PIC_NO_MUTABLE_GLOBALS: error: invalid argument '-fPIC' not allowed with '-mno-mutable-globals'
diff --git a/clang/test/Headers/Inputs/include/cmath b/clang/test/Headers/Inputs/include/cmath
index 5e4e8b67514f0..20e34898b5535 100644
--- a/clang/test/Headers/Inputs/include/cmath
+++ b/clang/test/Headers/Inputs/include/cmath
@@ -82,8 +82,13 @@ bool isless(float, float);
 bool islessgreater(double, double);
 bool islessgreater(float, float);
 bool isnan(long double);
+#ifdef USE_ISNAN_WITH_INT_RETURN
+int isnan(double);
+int isnan(float);
+#else
 bool isnan(double);
 bool isnan(float);
+#endif
 bool isnormal(double);
 bool isnormal(float);
 bool isunordered(double, double);
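The int/bool split matters because code can observe the exact type of `isnan(x)`; a small illustrative sketch (not part of the header) that tracks whichever declaration is in effect:

    // decltype yields int under -DUSE_ISNAN_WITH_INT_RETURN and bool
    // otherwise, so device-side overloads must mirror the host's choice.
    template <class T> auto isnan_flag(T x) -> decltype(isnan(x)) {
      return isnan(x);
    }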
diff --git a/clang/test/Headers/Inputs/include/complex b/clang/test/Headers/Inputs/include/complex
index f3aefab7954be..bd43cd952d7cd 100644
--- a/clang/test/Headers/Inputs/include/complex
+++ b/clang/test/Headers/Inputs/include/complex
@@ -3,6 +3,7 @@
 #include <cmath>
 
 #define INFINITY (__builtin_inff())
+#define NAN (__builtin_nanf (""))
 
 namespace std {
 
@@ -298,4 +299,114 @@ operator!=(const _Tp &__x, const complex<_Tp> &__y) {
   return !(__x == __y);
 }
 
+template <class _Tp> _Tp abs(const std::complex<_Tp> &__c);
+
+// arg
+
+template <class _Tp> _Tp arg(const std::complex<_Tp> &__c);
+
+// norm
+
+template <class _Tp> _Tp norm(const std::complex<_Tp> &__c);
+
+// conj
+
+template <class _Tp> std::complex<_Tp> conj(const std::complex<_Tp> &__c);
+
+// proj
+
+template <class _Tp> std::complex<_Tp> proj(const std::complex<_Tp> &__c);
+
+// polar
+
+template <class _Tp>
+complex<_Tp> polar(const _Tp &__rho, const _Tp &__theta = _Tp());
+
+// log
+
+template <class _Tp> std::complex<_Tp> log(const std::complex<_Tp> &__x);
+
+// log10
+
+template <class _Tp> std::complex<_Tp> log10(const std::complex<_Tp> &__x);
+
+// sqrt
+
+template <class _Tp>
+std::complex<_Tp> sqrt(const std::complex<_Tp> &__x);
+
+// exp
+
+template <class _Tp>
+std::complex<_Tp> exp(const std::complex<_Tp> &__x);
+
+// pow
+
+template <class _Tp>
+std::complex<_Tp> pow(const std::complex<_Tp> &__x,
+                      const std::complex<_Tp> &__y);
+
+// __sqr, computes pow(x, 2)
+
+template <class _Tp> std::complex<_Tp> __sqr(const std::complex<_Tp> &__x);
+
+// asinh
+
+template <class _Tp>
+std::complex<_Tp> asinh(const std::complex<_Tp> &__x);
+
+// acosh
+
+template <class _Tp>
+std::complex<_Tp> acosh(const std::complex<_Tp> &__x);
+
+// atanh
+
+template <class _Tp>
+std::complex<_Tp> atanh(const std::complex<_Tp> &__x);
+
+// sinh
+
+template <class _Tp>
+std::complex<_Tp> sinh(const std::complex<_Tp> &__x);
+
+// cosh
+
+template <class _Tp>
+std::complex<_Tp> cosh(const std::complex<_Tp> &__x);
+
+// tanh
+
+template <class _Tp>
+std::complex<_Tp> tanh(const std::complex<_Tp> &__x);
+
+// asin
+
+template <class _Tp>
+std::complex<_Tp> asin(const std::complex<_Tp> &__x);
+
+// acos
+
+template <class _Tp>
+std::complex<_Tp> acos(const std::complex<_Tp> &__x);
+
+// atan
+
+template <class _Tp>
+std::complex<_Tp> atan(const std::complex<_Tp> &__x);
+
+// sin
+
+template <class _Tp>
+std::complex<_Tp> sin(const std::complex<_Tp> &__x);
+
+// cos
+
+template <class _Tp> std::complex<_Tp> cos(const std::complex<_Tp> &__x);
+
+// tan
+
+template <class _Tp>
+std::complex<_Tp> tan(const std::complex<_Tp> &__x);
+
 } // namespace std
diff --git a/clang/test/Headers/Inputs/include/type_traits b/clang/test/Headers/Inputs/include/type_traits
new file mode 100644
index 0000000000000..9fd02d51eff13
--- /dev/null
+++ b/clang/test/Headers/Inputs/include/type_traits
@@ -0,0 +1,43 @@
+/// Copied from libcxx type_traits and simplified
+
+#pragma once
+
+namespace std {
+
+template <class _Tp, _Tp __v>
+struct integral_constant {
+  static const _Tp value = __v;
+  typedef _Tp value_type;
+  typedef integral_constant type;
+};
+
+typedef integral_constant<bool, true> true_type;
+typedef integral_constant<bool, false> false_type;
+
+// is_same, functional
+template <class _Tp, class _Up> struct is_same : public false_type {};
+template <class _Tp> struct is_same<_Tp, _Tp> : public true_type {};
+
+// is_integral, for some types.
+template <class _Tp> struct is_integral
+    : public integral_constant<bool, false> {};
+template <> struct is_integral<bool>
+    : public integral_constant<bool, true> {};
+template <> struct is_integral<char>
+    : public integral_constant<bool, true> {};
+template <> struct is_integral<short>
+    : public integral_constant<bool, true> {};
+template <> struct is_integral<int>
+    : public integral_constant<bool, true> {};
+template <> struct is_integral<long>
+    : public integral_constant<bool, true> {};
+template <> struct is_integral<long long>
+    : public integral_constant<bool, true> {};
+
+// enable_if, functional
+template <bool _Bp, class _Tp = void> struct enable_if{};
+template <class _Tp> struct enable_if<true, _Tp>{
+  using type = _Tp;
+};
+
+}
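A short usage sketch (hypothetical, not part of the test inputs) exercising the simplified traits via SFINAE:

    // Participates in overload resolution only for the integral types
    // specialized above (bool, char, short, int, long, long long).
    template <class T>
    typename std::enable_if<std::is_integral<T>::value, T>::type twice(T v) {
      return v + v;
    }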
diff --git a/clang/test/Headers/nvptx_device_math_complex.cpp b/clang/test/Headers/nvptx_device_math_complex.cpp
index e4b78deb05d7b..688fd5d101eab 100644
--- a/clang/test/Headers/nvptx_device_math_complex.cpp
+++ b/clang/test/Headers/nvptx_device_math_complex.cpp
@@ -3,6 +3,7 @@
 // RUN: %clang_cc1 -verify -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -aux-triple powerpc64le-unknown-unknown -o - | FileCheck %s
 // expected-no-diagnostics
 
+#include <cmath>
 #include <complex>
 
 // CHECK: define weak {{.*}} @__muldc3
@@ -33,6 +34,12 @@
 // CHECK-DAG: call float @__nv_fabsf(
 // CHECK-DAG: call float @__nv_logbf(
 
+// We actually check that there are no declarations of non-OpenMP functions.
+// That is, as long as we don't call an unknown function with a name that
+// doesn't start with '__', we are good :)
+
+// CHECK-NOT: declare.*@[^_]
+
 void test_scmplx(std::complex<float> a) {
 #pragma omp target
   {
@@ -46,3 +53,35 @@ void test_dcmplx(std::complex<double> a) {
     (void)(a * (a / a));
   }
 }
+
+template <typename T>
+std::complex<T> test_template_math_calls(std::complex<T> a) {
+  decltype(a) r = a;
+#pragma omp target
+  {
+    r = std::sin(r);
+    r = std::cos(r);
+    r = std::exp(r);
+    r = std::atan(r);
+    r = std::acos(r);
+  }
+  return r;
+}
+
+std::complex<float> test_scall(std::complex<float> a) {
+  decltype(a) r;
+#pragma omp target
+  {
+    r = std::sin(a);
+  }
+  return test_template_math_calls(r);
+}
+
+std::complex<double> test_dcall(std::complex<double> a) {
+  decltype(a) r;
+#pragma omp target
+  {
+    r = std::exp(a);
+  }
+  return test_template_math_calls(r);
+}
diff --git a/clang/test/Headers/openmp_device_math_isnan.cpp b/clang/test/Headers/openmp_device_math_isnan.cpp
new file mode 100644
index 0000000000000..35443dbdebea6
--- /dev/null
+++ b/clang/test/Headers/openmp_device_math_isnan.cpp
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=BOOL_RETURN
+// RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -ffast-math -ffp-contract=fast
+// RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -ffast-math -ffp-contract=fast | FileCheck %s --check-prefix=BOOL_RETURN
+// RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -DUSE_ISNAN_WITH_INT_RETURN
+// RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -DUSE_ISNAN_WITH_INT_RETURN | FileCheck %s --check-prefix=INT_RETURN
+// RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -ffast-math -ffp-contract=fast -DUSE_ISNAN_WITH_INT_RETURN
+// RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -ffast-math -ffp-contract=fast -DUSE_ISNAN_WITH_INT_RETURN | FileCheck %s --check-prefix=INT_RETURN
+// expected-no-diagnostics
+
+#include <cmath>
+
+double math(float f, double d) {
+  double r = 0;
+  // INT_RETURN: call i32 @__nv_isnanf(float
+  // BOOL_RETURN: call i32 @__nv_isnanf(float
+  r += std::isnan(f);
+  // INT_RETURN: call i32 @__nv_isnand(double
+  // BOOL_RETURN: call i32 @__nv_isnand(double
+  r += std::isnan(d);
+  return r;
+}
+
+long double foo(float f, double d, long double ld) {
+  double r = ld;
+  r += math(f, d);
+#pragma omp target map(r)
+  { r += math(f, d); }
+  return r;
+}
diff --git a/clang/test/Lexer/aligned-allocation.cpp b/clang/test/Lexer/aligned-allocation.cpp
index eef5d980a37b8..d92bb73ba1f9a 100644
--- a/clang/test/Lexer/aligned-allocation.cpp
+++ b/clang/test/Lexer/aligned-allocation.cpp
@@ -6,10 +6,19 @@
 //
 // RUN: %clang_cc1 -triple x86_64-apple-macosx10.12.0 -fexceptions -std=c++17 -verify %s \
 // RUN:   -faligned-allocation -faligned-alloc-unavailable
+//
+// RUN: %clang_cc1 -triple s390x-none-zos -fexceptions -std=c++17 -verify %s \
+// RUN:   -DEXPECT_DEFINED
+//
+// RUN: %clang_cc1 -triple s390x-none-zos -fexceptions -std=c++17 -verify %s \
+// RUN:   -faligned-alloc-unavailable
+//
+// RUN: %clang_cc1 -triple s390x-none-zos -fexceptions -std=c++17 -verify %s \
+// RUN:   -faligned-allocation -faligned-alloc-unavailable
 
 // Test that __cpp_aligned_new is not defined when CC1 is passed
-// -faligned-alloc-unavailable by the Darwin driver, even when aligned
-// allocation is actually enabled.
+// -faligned-alloc-unavailable by the Darwin and z/OS drivers, even when
+// aligned allocation is actually enabled.
 
 // expected-no-diagnostics
 #ifdef EXPECT_DEFINED
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index cdc58c7b8aeae..9dd237a24ec2e 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -145,9 +145,12 @@
 // CHECK-NEXT: Section (SubjectMatchRule_function, SubjectMatchRule_variable_is_global, SubjectMatchRule_objc_method, SubjectMatchRule_objc_property)
 // CHECK-NEXT: SetTypestate (SubjectMatchRule_function_is_member)
 // CHECK-NEXT: SpeculativeLoadHardening (SubjectMatchRule_function, SubjectMatchRule_objc_method)
+// CHECK-NEXT: SwiftBridgedTypedef (SubjectMatchRule_type_alias)
 // CHECK-NEXT: SwiftContext (SubjectMatchRule_variable_is_parameter)
+// CHECK-NEXT: SwiftError (SubjectMatchRule_function, SubjectMatchRule_objc_method)
 // CHECK-NEXT: SwiftErrorResult (SubjectMatchRule_variable_is_parameter)
 // CHECK-NEXT: SwiftIndirectResult (SubjectMatchRule_variable_is_parameter)
+// CHECK-NEXT: SwiftObjCMembers (SubjectMatchRule_objc_interface)
 // CHECK-NEXT: TLSModel (SubjectMatchRule_variable_is_thread_local)
 // CHECK-NEXT: Target (SubjectMatchRule_function)
 // CHECK-NEXT: TestTypestate (SubjectMatchRule_function_is_member)
diff --git a/clang/test/OpenMP/allocate_codegen.cpp b/clang/test/OpenMP/allocate_codegen.cpp
index c068589041af3..068e307697a0c 100644
--- a/clang/test/OpenMP/allocate_codegen.cpp
+++ b/clang/test/OpenMP/allocate_codegen.cpp
@@ -85,6 +85,7 @@ int main () {
 // CHECK-NOT:  {{__kmpc_alloc|__kmpc_free}}
 // CHECK:      store i32 %{{.+}}, i32* [[V_ADDR]],
 // CHECK-NEXT: [[V_VAL:%.+]] = load i32, i32* [[V_ADDR]],
+// CHECK-NEXT: [[V_VOID_ADDR:%.+]] = bitcast i32* [[V_ADDR]] to i8*
 // CHECK-NEXT: call void @__kmpc_free(i32 [[GTID]], i8* [[V_VOID_ADDR]], i8* inttoptr (i64 6 to i8*))
 // CHECK-NOT:  {{__kmpc_alloc|__kmpc_free}}
 // CHECK:      ret i32 [[V_VAL]]
@@ -101,7 +102,9 @@ void bar(int a, float &z) {
 // CHECK: [[Z_ADDR:%.+]] = bitcast i8* [[Z_VOID_PTR]] to float**
 // CHECK: store float* %{{.+}}, float** [[Z_ADDR]],
 #pragma omp allocate(a,z) allocator(omp_default_mem_alloc)
+// CHECK-NEXT: [[Z_VOID_PTR:%.+]] = bitcast float** [[Z_ADDR]] to i8*
 // CHECK: call void @__kmpc_free(i32 [[GTID]], i8* [[Z_VOID_PTR]], i8* inttoptr (i64 1 to i8*))
+// CHECK-NEXT: [[A_VOID_PTR:%.+]] = bitcast i32* [[A_ADDR]] to i8*
 // CHECK: call void @__kmpc_free(i32 [[GTID]], i8* [[A_VOID_PTR]], i8* inttoptr (i64 1 to i8*))
 // CHECK: ret void
 }
diff --git a/clang/test/OpenMP/declare_variant_messages.c b/clang/test/OpenMP/declare_variant_messages.c
index 84a56c5fd4094..2c63ca206fbbc 100644
--- a/clang/test/OpenMP/declare_variant_messages.c
+++ b/clang/test/OpenMP/declare_variant_messages.c
@@ -153,3 +153,17 @@ void caller() {
 #pragma omp declare variant // expected-error {{function declaration is expected after 'declare variant' directive}}
 
 #pragma omp declare variant // expected-error {{function declaration is expected after 'declare variant' directive}}
+
+// FIXME: If the scores are equivalent we should detect that and allow it.
+#pragma omp begin declare variant match(implementation = {vendor(score(2) \
+                                                                 : llvm)})
+#pragma omp declare variant(foo) match(implementation = {vendor(score(2) \
+                                                                : llvm)}) // expected-error@-1 {{nested OpenMP context selector contains duplicated trait 'llvm' in selector 'vendor' and set 'implementation' with different score}}
+int conflicting_nested_score(void);
+#pragma omp end declare variant
+
+// FIXME: We should build the conjunction of different conditions, see also the score FIXME above.
+#pragma omp begin declare variant match(user = {condition(1)})
+#pragma omp declare variant(foo) match(user = {condition(1)}) // expected-error {{nested user conditions in OpenMP context selector not supported (yet)}}
+int conflicting_nested_condition(void);
+#pragma omp end declare variant
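For contrast, a sketch (hypothetical, not from the test) of the nesting these diagnostics leave alone: the outer and inner contexts use disjoint trait sets, so no trait is duplicated with a conflicting score and no user condition is nested:

    #pragma omp begin declare variant match(implementation = {vendor(llvm)})
    #pragma omp declare variant(foo) match(device = {kind(cpu)})
    int composed(void);
    #pragma omp end declare variant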
diff --git a/clang/test/OpenMP/for_lastprivate_codegen.cpp b/clang/test/OpenMP/for_lastprivate_codegen.cpp
index 4fc7b2061ae21..87f109e70e6e9 100644
--- a/clang/test/OpenMP/for_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/for_lastprivate_codegen.cpp
@@ -654,7 +654,8 @@ int main() {
 // CHECK-NEXT: br label %[[LAST_DONE]]
 // CHECK: [[LAST_DONE]]
 
-// CHECK:      call void @__kmpc_free(i32 [[GTID]], i8* [[F_VOID_PTR]], i8* inttoptr (i64 3 to i8*))
+// CHECK: [[F_VOID_PTR:%.+]] = bitcast float* [[F_PRIV]] to i8*
+// CHECK-NEXT:      call void @__kmpc_free(i32 [[GTID]], i8* [[F_VOID_PTR]], i8* inttoptr (i64 3 to i8*))
 // CHECK-NEXT: call void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
 // CHECK-NEXT: ret void
 
diff --git a/clang/test/OpenMP/for_linear_codegen.cpp b/clang/test/OpenMP/for_linear_codegen.cpp
index fd9d89c38dcb7..548ded3f8644f 100644
--- a/clang/test/OpenMP/for_linear_codegen.cpp
+++ b/clang/test/OpenMP/for_linear_codegen.cpp
@@ -414,6 +414,7 @@ int main() {
 // CHECK: [[ADD:%.+]] = add nsw i64 [[LVAR_VAL]], 3
 // CHECK: store i64 [[ADD]], i64* [[LVAR_PRIV]],
 // CHECK: call void @__kmpc_for_static_fini(%{{.+}}* @{{.+}}, i32 %{{.+}})
+// CHECK: [[LVAR_VOID_PTR:%.+]] = bitcast i64* [[LVAR_PRIV]] to i8*
 // CHECK: call void @__kmpc_free(i32 [[GTID]], i8* [[LVAR_VOID_PTR]], i8* inttoptr (i64 5 to i8*))
 // CHECK: call void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
 // CHECK: ret void
diff --git a/clang/test/OpenMP/for_reduction_codegen_UDR.cpp b/clang/test/OpenMP/for_reduction_codegen_UDR.cpp
index 5a20fa187e9c3..ff6ce7847da1a 100644
--- a/clang/test/OpenMP/for_reduction_codegen_UDR.cpp
+++ b/clang/test/OpenMP/for_reduction_codegen_UDR.cpp
@@ -876,6 +876,7 @@ int main() {
 // CHECK: getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* %{{.+}}, i64 4
 
 // CHECK: store [4 x [[S_FLOAT_TY]]]* [[VAR3_PRIV]], [4 x [[S_FLOAT_TY]]]** %
+// CHECK: [[VAR3_VOID_PTR:%.+]] = bitcast [4 x [[S_FLOAT_TY]]]* [[VAR3_PRIV]] to i8*
 // CHECK: call void @__kmpc_free(i32 [[GTID]], i8* [[VAR3_VOID_PTR]], i8* inttoptr (i64 6 to i8*))
 // CHECK: ret void
 
diff --git a/clang/test/OpenMP/ordered_messages.cpp b/clang/test/OpenMP/ordered_messages.cpp
index f6b9dbd6d27fa..8a3a86443eb8c 100644
--- a/clang/test/OpenMP/ordered_messages.cpp
+++ b/clang/test/OpenMP/ordered_messages.cpp
@@ -16,6 +16,9 @@ void xxx(int argc) {
 }
 
 int foo();
+#if __cplusplus >= 201103L
+// expected-note@-2 {{declared here}}
+#endif
 
 template <class T>
 T foo() {
@@ -176,7 +179,7 @@ T foo() {
 
 int foo() {
 #if __cplusplus >= 201103L
-// expected-note@-2 2 {{declared here}}
+// expected-note@-2 {{declared here}}
 #endif
 int k;
   #pragma omp for ordered
diff --git a/clang/test/OpenMP/parallel_firstprivate_codegen.cpp b/clang/test/OpenMP/parallel_firstprivate_codegen.cpp
index 04af45badaea1..97024e0ace1ff 100644
--- a/clang/test/OpenMP/parallel_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/parallel_firstprivate_codegen.cpp
@@ -423,6 +423,7 @@ int main() {
 // CHECK-64: [[T_VAR_VAL:%.+]] = load i32, i32* [[BC]],
 // CHECK:    store i32 [[T_VAR_VAL]], i32* [[T_VAR_PRIV]],
 // CHECK:    store i32 0, i32* [[T_VAR_PRIV]],
+// CHECK:    [[T_VAR_VOID_PTR:%.+]] = bitcast i32* [[T_VAR_PRIV]] to i8*
 // CHECK:    call void @__kmpc_free(i32 [[GTID]], i8* [[T_VAR_VOID_PTR]], i8* inttoptr ([[iz]] 1 to i8*))
 // CHECK:    ret void
 
@@ -584,6 +585,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // ARRAY: [[SIZE:%.+]] = mul nuw i64 %{{.+}}, 8
 // ARRAY: [[BC:%.+]] = bitcast double* [[VLA2_PTR]] to i8*
 // ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 128 [[BC]], i8* align 128 %{{.+}}, i64 [[SIZE]], i1 false)
+// ARRAY: [[VLA2_VOID_PTR:%.+]] = bitcast double* [[VLA2_PTR]] to i8*
 // ARRAY: call void @__kmpc_free(i32 [[GTID]], i8* [[VLA2_VOID_PTR]], i8* inttoptr (i64 8 to i8*))
 // ARRAY-NEXT: ret void
 #endif
diff --git a/clang/test/OpenMP/parallel_private_codegen.cpp b/clang/test/OpenMP/parallel_private_codegen.cpp
index ceceaf95d49ab..eb575c53f913b 100644
--- a/clang/test/OpenMP/parallel_private_codegen.cpp
+++ b/clang/test/OpenMP/parallel_private_codegen.cpp
@@ -361,12 +361,13 @@ int main() {
 // CHECK: [[GTID_ADDR:%.+]] = load i32*, i32** [[GTID_ADDR_PTR]],
 // CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_ADDR]],
 // CHECK: [[A_VOID_PTR:%.+]] = call i8* @__kmpc_alloc(i32 [[GTID]], i64 4, i8* inttoptr (i64 2 to i8*))
-// CHECK: [[A_PRIV:%.+]] = bitcast i8* [[A_VOID_PTR]] to i32*
-// CHECK: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REF:%.+]],
+// CHECK: [[A_PRIV_ADDR:%.+]] = bitcast i8* [[A_VOID_PTR]] to i32*
+// CHECK: store i{{[0-9]+}}* [[A_PRIV_ADDR]], i{{[0-9]+}}** [[REF:%.+]],
 // CHECK-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REF]],
 // CHECK-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
 // CHECK-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
 // CHECK-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[A_VOID_PTR:%.+]] = bitcast i32* [[A_PRIV_ADDR]] to i8*
 // CHECK-NEXT: call void @__kmpc_free(i32 [[GTID]], i8* [[A_VOID_PTR]], i8* inttoptr (i64 2 to i8*))
 // CHECK-NEXT: ret void
 
diff --git a/clang/test/OpenMP/parallel_reduction_messages.cpp b/clang/test/OpenMP/parallel_reduction_messages.cpp
index b464bf5b96437..12b34a4de07ba 100644
--- a/clang/test/OpenMP/parallel_reduction_messages.cpp
+++ b/clang/test/OpenMP/parallel_reduction_messages.cpp
@@ -92,6 +92,8 @@ class S6 { // expected-note 3 {{candidate function (the implicit copy assignment
 
 S3 h, k;
 #pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+int *gptr;
+#pragma omp threadprivate(gptr) // expected-note {{defined as threadprivate or thread local}}
 
 template <class T> // expected-note {{declared here}}
 T tmain(T argc) {
@@ -277,6 +279,8 @@ int main(int argc, char **argv) {
   m++;
 #pragma omp parallel reduction(task, + : m) // omp45-error 2 {{expected expression}} omp45-warning {{missing ':' after reduction identifier - ignoring}}
   m++;
+#pragma omp parallel reduction(+:gptr[:argc]) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+  ;
 
   return tmain(argc) + tmain(fl); // expected-note {{in instantiation of function template specialization 'tmain<int>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<float>' requested here}}
 }
diff --git a/clang/test/OpenMP/simd_codegen.cpp b/clang/test/OpenMP/simd_codegen.cpp
index 8ba87dce82fcb..335dfd78cacea 100644
--- a/clang/test/OpenMP/simd_codegen.cpp
+++ b/clang/test/OpenMP/simd_codegen.cpp
@@ -817,25 +817,9 @@ void parallel_simd(float *a) {
 // TERM_DEBUG: !{{[0-9]+}} = !DILocation(line: [[@LINE-11]],
 
 // CHECK-LABEL: S8
-// CHECK-DAG: ptrtoint [[SS_TY]]* %{{.+}} to i64
-// CHECK-DAG: ptrtoint [[SS_TY]]* %{{.+}} to i64
-// CHECK-DAG: ptrtoint [[SS_TY]]* %{{.+}} to i64
-// CHECK-DAG: ptrtoint [[SS_TY]]* %{{.+}} to i64
-
-// CHECK-DAG: and i64 %{{.+}}, 15
-// CHECK-DAG: icmp eq i64 %{{.+}}, 0
 // CHECK-DAG: call void @llvm.assume(i1
-
-// CHECK-DAG: and i64 %{{.+}}, 7
-// CHECK-DAG: icmp eq i64 %{{.+}}, 0
 // CHECK-DAG: call void @llvm.assume(i1
-
-// CHECK-DAG: and i64 %{{.+}}, 15
-// CHECK-DAG: icmp eq i64 %{{.+}}, 0
 // CHECK-DAG: call void @llvm.assume(i1
-
-// CHECK-DAG: and i64 %{{.+}}, 3
-// CHECK-DAG: icmp eq i64 %{{.+}}, 0
 // CHECK-DAG: call void @llvm.assume(i1
 struct SS {
   SS(): a(0) {}
diff --git a/clang/test/OpenMP/simd_metadata.c b/clang/test/OpenMP/simd_metadata.c
index f0ae0200dd08e..18133e3b6c2e7 100644
--- a/clang/test/OpenMP/simd_metadata.c
+++ b/clang/test/OpenMP/simd_metadata.c
@@ -21,30 +21,21 @@ void h1(float *c, float *a, double b[], int size)
 // CHECK-LABEL: define void @h1
   int t = 0;
 #pragma omp simd safelen(16) linear(t) aligned(c:32) aligned(a,b)
-// CHECK:         [[C_PTRINT:%.+]] = ptrtoint
-// CHECK-NEXT:    [[C_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[C_PTRINT]], 31
-// CHECK-NEXT:    [[C_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[C_MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[C_MASKCOND]])
-// CHECK:         [[A_PTRINT:%.+]] = ptrtoint
-
-// X86-NEXT:     [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15
-// X86-AVX-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 31
-// X86-AVX512-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 63
-// PPC-NEXT:     [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15
-// PPC-QPX-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15
-
-// CHECK-NEXT:    [[A_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[A_MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[A_MASKCOND]])
-// CHECK:         [[B_PTRINT:%.+]] = ptrtoint
-
-// X86-NEXT:      [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 15
-// X86-AVX-NEXT:  [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 31
-// X86-AVX512-NEXT: [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 63
-// PPC-NEXT:      [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 15
-// PPC-QPX-NEXT:  [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 31
-
-// CHECK-NEXT:    [[B_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[B_MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[B_MASKCOND]])
+  // CHECK:         call void @llvm.assume(i1 true) [ "align"(float* [[PTR4:%.*]], {{i64|i32}} 32) ]
+  // CHECK-NEXT:    load
+
+  // X86-NEXT:       call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 16) ]
+  // X86-AVX-NEXT:   call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 32) ]
+  // X86-AVX512-NEXT:call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 64) ]
+  // PPC-NEXT:       call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 16) ]
+  // PPC-QPX-NEXT:   call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 16) ]
+  // CHECK-NEXT:     load
+
+  // X86-NEXT:       call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 16) ]
+  // X86-AVX-NEXT:   call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 32) ]
+  // X86-AVX512-NEXT:call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 64) ]
+  // PPC-NEXT:       call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 16) ]
+  // PPC-QPX-NEXT:   call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 32) ]
   for (int i = 0; i < size; ++i) {
     c[i] = a[i] * a[i] + b[i] * b[t];
     ++t;
@@ -52,30 +43,21 @@ void h1(float *c, float *a, double b[], int size)
 // do not emit llvm.access.group metadata due to usage of safelen clause.
 // CHECK-NOT: store float {{.+}}, float* {{.+}}, align {{.+}}, !llvm.access.group {{![0-9]+}}
 #pragma omp simd safelen(16) linear(t) aligned(c:32) aligned(a,b) simdlen(8)
-// CHECK:         [[C_PTRINT:%.+]] = ptrtoint
-// CHECK-NEXT:    [[C_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[C_PTRINT]], 31
-// CHECK-NEXT:    [[C_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[C_MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[C_MASKCOND]])
-// CHECK:         [[A_PTRINT:%.+]] = ptrtoint
-
-// X86-NEXT:     [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15
-// X86-AVX-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 31
-// X86-AVX512-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 63
-// PPC-NEXT:     [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15
-// PPC-QPX-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15
-
-// CHECK-NEXT:    [[A_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[A_MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[A_MASKCOND]])
-// CHECK:         [[B_PTRINT:%.+]] = ptrtoint
-
-// X86-NEXT:      [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 15
-// X86-AVX-NEXT:  [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 31
-// X86-AVX512-NEXT: [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 63
-// PPC-NEXT:      [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 15
-// PPC-QPX-NEXT:  [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 31
-
-// CHECK-NEXT:    [[B_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[B_MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[B_MASKCOND]])
+  // CHECK:         call void @llvm.assume(i1 true) [ "align"(float* [[PTR4:%.*]], {{i64|i32}} 32) ]
+  // CHECK-NEXT:    load
+
+  // X86-NEXT:       call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 16) ]
+  // X86-AVX-NEXT:   call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 32) ]
+  // X86-AVX512-NEXT:call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 64) ]
+  // PPC-NEXT:       call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 16) ]
+  // PPC-QPX-NEXT:   call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 16) ]
+  // CHECK-NEXT:     load
+
+  // X86-NEXT:       call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 16) ]
+  // X86-AVX-NEXT:   call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 32) ]
+  // X86-AVX512-NEXT:call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 64) ]
+  // PPC-NEXT:       call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 16) ]
+  // PPC-QPX-NEXT:   call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 32) ]
   for (int i = 0; i < size; ++i) {
     c[i] = a[i] * a[i] + b[i] * b[t];
     ++t;
@@ -83,30 +65,21 @@ void h1(float *c, float *a, double b[], int size)
 // do not emit llvm.access.group metadata due to usage of safelen clause.
 // CHECK-NOT: store float {{.+}}, float* {{.+}}, align {{.+}}, !llvm.access.group {{![0-9]+}}
 #pragma omp simd linear(t) aligned(c:32) aligned(a,b) simdlen(8)
-// CHECK:         [[C_PTRINT:%.+]] = ptrtoint
-// CHECK-NEXT:    [[C_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[C_PTRINT]], 31
-// CHECK-NEXT:    [[C_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[C_MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[C_MASKCOND]])
-// CHECK:         [[A_PTRINT:%.+]] = ptrtoint
-
-// X86-NEXT:     [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15
-// X86-AVX-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 31
-// X86-AVX512-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 63
-// PPC-NEXT:     [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15
-// PPC-QPX-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15
-
-// CHECK-NEXT:    [[A_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[A_MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[A_MASKCOND]])
-// CHECK:         [[B_PTRINT:%.+]] = ptrtoint
-
-// X86-NEXT:      [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 15
-// X86-AVX-NEXT:  [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 31
-// X86-AVX512-NEXT: [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 63
-// PPC-NEXT:      [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 15
-// PPC-QPX-NEXT:  [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 31
-
-// CHECK-NEXT:    [[B_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[B_MASKEDPTR]], 0
-// CHECK-NEXT:    call void @llvm.assume(i1 [[B_MASKCOND]])
+  // CHECK:         call void @llvm.assume(i1 true) [ "align"(float* [[PTR4:%.*]], {{i64|i32}} 32) ]
+  // CHECK-NEXT:    load
+
+  // X86-NEXT:       call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 16) ]
+  // X86-AVX-NEXT:   call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 32) ]
+  // X86-AVX512-NEXT:call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 64) ]
+  // PPC-NEXT:       call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 16) ]
+  // PPC-QPX-NEXT:   call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 16) ]
+  // CHECK-NEXT:     load
+
+  // X86-NEXT:       call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 16) ]
+  // X86-AVX-NEXT:   call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 32) ]
+  // X86-AVX512-NEXT:call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 64) ]
+  // PPC-NEXT:       call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 16) ]
+  // PPC-QPX-NEXT:   call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 32) ]
   for (int i = 0; i < size; ++i) {
     c[i] = a[i] * a[i] + b[i] * b[t];
     ++t;
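
Both hunks in this file make the same substitution, so the shape of the change is worth spelling out once. A minimal sketch of the construct and the lowering, with illustrative names (scale, a, n are not from the test):

// Compiled with -fopenmp; the aligned clause promises 32-byte alignment.
void scale(float *a, int n) {
#pragma omp simd aligned(a : 32)
  for (int i = 0; i < n; ++i)
    a[i] *= 2.0f;
}
// The old codegen materialized the assumption as four instructions:
//   %ptrint    = ptrtoint float* %a to i64
//   %maskedptr = and i64 %ptrint, 31
//   %maskcond  = icmp eq i64 %maskedptr, 0
//   call void @llvm.assume(i1 %maskcond)
// The new expectations match a single assume carrying an operand bundle:
//   call void @llvm.assume(i1 true) [ "align"(float* %a, i64 32) ]
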
diff --git a/clang/test/OpenMP/target_depend_codegen.cpp b/clang/test/OpenMP/target_depend_codegen.cpp
index 9b1f6c9582ae4..178940243a7e8 100644
--- a/clang/test/OpenMP/target_depend_codegen.cpp
+++ b/clang/test/OpenMP/target_depend_codegen.cpp
@@ -43,8 +43,8 @@
 
 // TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i{{32|64}}, i32, i32 }
 
-// CHECK-DAG: [[SIZET:@.+]] = private unnamed_addr constant [2 x i64] [i64 0, i64 4]
-// CHECK-DAG: [[MAPT:@.+]] = private unnamed_addr constant [2 x i64] [i64 544, i64 800]
+// CHECK-DAG: [[SIZET:@.+]] = private unnamed_addr constant [3 x i64] [i64 0, i64 4, i64 {{16|12}}]
+// CHECK-DAG: [[MAPT:@.+]] = private unnamed_addr constant [3 x i64] [i64 544, i64 800, i64 3]
 // CHECK-DAG: @{{.*}} = weak constant i8 0
 
 // TCHECK: @{{.+}} = weak constant [[ENTTY]]
@@ -61,6 +61,9 @@ struct TT{
   ty Y;
 };
 
+#pragma omp declare mapper(id                     \
+                           : TT <long long, char> \
+                               s) map(s.X, s.Y)
 int global;
 extern int global;
 
@@ -102,29 +105,75 @@ int foo(int n) {
   // CHECK:       [[BOOL:%.+]] = icmp ne i32 %{{.+}}, 0
   // CHECK:       br i1 [[BOOL]], label %[[THEN:.+]], label %[[ELSE:.+]]
   // CHECK:       [[THEN]]:
-  // CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP:%.+]], i32 0, i32 0
-  // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BP:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[P:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[MADDR0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[M:%.+]], i[[SZ]] 0, i[[SZ]] 0
   // CHECK-DAG:   [[CBPADDR0:%.+]] = bitcast i8** [[BPADDR0]] to i[[SZ]]**
   // CHECK-DAG:   [[CPADDR0:%.+]] = bitcast i8** [[PADDR0]] to i[[SZ]]**
   // CHECK-DAG:   store i[[SZ]]* [[BP0:%[^,]+]], i[[SZ]]** [[CBPADDR0]]
   // CHECK-DAG:   store i[[SZ]]* [[BP0]], i[[SZ]]** [[CPADDR0]]
+  // CHECK-DAG:   store i8* null, i8** [[MADDR0]],
 
-  // CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 1
-  // CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 1
+  // CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BP]], i32 0, i32 1
+  // CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[P]], i32 0, i32 1
+  // CHECK-DAG:   [[MADDR1:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[M]], i[[SZ]] 0, i[[SZ]] 1
   // CHECK-DAG:   [[CBPADDR1:%.+]] = bitcast i8** [[BPADDR1]] to i[[SZ]]*
   // CHECK-DAG:   [[CPADDR1:%.+]] = bitcast i8** [[PADDR1]] to i[[SZ]]*
   // CHECK-DAG:   store i[[SZ]] [[BP1:%[^,]+]], i[[SZ]]* [[CBPADDR1]]
   // CHECK-DAG:   store i[[SZ]] [[BP1]], i[[SZ]]* [[CPADDR1]]
-  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 0
-  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 0
+  // CHECK-DAG:   store i8* null, i8** [[MADDR1]],
+
+  // CHECK-DAG:   [[BPADDR2:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BP]], i32 0, i32 2
+  // CHECK-DAG:   [[PADDR2:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[P]], i32 0, i32 2
+  // CHECK-DAG:   [[MADDR2:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[M]], i[[SZ]] 0, i[[SZ]] 2
+  // CHECK-DAG:   [[CBPADDR2:%.+]] = bitcast i8** [[BPADDR2]] to [[STRUCT_TT:%.+]]**
+  // CHECK-DAG:   [[CPADDR2:%.+]] = bitcast i8** [[PADDR2]] to [[STRUCT_TT]]**
+  // CHECK-DAG:   store [[STRUCT_TT]]* [[D_ADDR:%.+]], [[STRUCT_TT]]** [[CBPADDR2]]
+  // CHECK-DAG:   store [[STRUCT_TT]]* [[D_ADDR]], [[STRUCT_TT]]** [[CPADDR2]]
+  // CHECK-DAG:   store i8* bitcast (void (i8*, i8*, i8*, i64, i64)* [[MAPPER_ID:@.+]] to i8*), i8** [[MADDR2]],
+
+  // CHECK-DAG:   [[BP_START:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BP]], i32 0, i32 0
+  // CHECK-DAG:   [[P_START:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[P]], i32 0, i32 0
+  // CHECK-DAG:   [[M_START:%.+]] = bitcast [3 x i8*]* [[M]] to i8**
   // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
   // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
   // CHECK:       store i32 [[DEV]], i32* [[GEP]],
   // CHECK:       [[DEV1:%.+]] = load i32, i32* [[DEVICE_CAP]],
   // CHECK:       [[DEV2:%.+]] = sext i32 [[DEV1]] to i64
 
-  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_target_task_alloc(%struct.ident_t* @{{.*}}, i32 [[GTID]], i32 1, i[[SZ]] {{120|68}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*), i64 [[DEV2]])
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_target_task_alloc(%struct.ident_t* @{{.*}}, i32 [[GTID]], i32 1, i[[SZ]] {{152|88}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*), i64 [[DEV2]])
   // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1_:%.+]]*
+  // CHECK:       [[BASE:%.+]] = getelementptr inbounds [[TASK_TY1_]], [[TASK_TY1_]]* [[BC_TASK]], i32 0, i32 1
+  // CHECK-64:    [[BP_BASE:%.+]] = getelementptr inbounds [[PRIVS_TY:%.+]], [[PRIVS_TY:%.+]]* [[BASE]], i32 0, i32 1
+  // CHECK-64:    [[BP_CAST:%.+]] = bitcast [3 x i8*]* [[BP_BASE]] to i8*
+  // CHECK-64:    [[BP_SRC:%.+]] = bitcast i8** [[BP_START]] to i8*
+  // CHECK-64:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[BP_CAST]], i8* align 8 [[BP_SRC]], i64 24, i1 false)
+  // CHECK-64:    [[P_BASE:%.+]] = getelementptr inbounds [[PRIVS_TY]], [[PRIVS_TY]]* [[BASE]], i32 0, i32 2
+  // CHECK-64:    [[P_CAST:%.+]] = bitcast [3 x i8*]* [[P_BASE]] to i8*
+  // CHECK-64:    [[P_SRC:%.+]] = bitcast i8** [[P_START]] to i8*
+  // CHECK-64:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[P_CAST]], i8* align 8 [[P_SRC]], i64 24, i1 false)
+  // CHECK-64:    [[SZ_BASE:%.+]] = getelementptr inbounds [[PRIVS_TY]], [[PRIVS_TY]]* [[BASE]], i32 0, i32 3
+  // CHECK-64:    [[SZ_CAST:%.+]] = bitcast [3 x i64]* [[SZ_BASE]] to i8*
+  // CHECK-64:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SZ_CAST]], i8* align 8 bitcast ([3 x i64]* [[SIZET]] to i8*), i64 24, i1 false)
+  // CHECK-64:    [[M_BASE:%.+]] = getelementptr inbounds [[PRIVS_TY]], [[PRIVS_TY]]* [[BASE]], i32 0, i32 4
+  // CHECK-64:    [[M_CAST:%.+]] = bitcast [3 x i8*]* [[M_BASE]] to i8*
+  // CHECK-64:    [[M_SRC:%.+]] = bitcast i8** [[M_START]] to i8*
+  // CHECK-64:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[M_CAST]], i8* align 8 [[M_SRC]], i64 24, i1 false)
+  // CHECK-32:    [[SZ_BASE:%.+]] = getelementptr inbounds [[PRIVS_TY:%.+]], [[PRIVS_TY:%.+]]* [[BASE]], i32 0, i32 0
+  // CHECK-32:    [[SZ_CAST:%.+]] = bitcast [3 x i64]* [[SZ_BASE]] to i8*
+  // CHECK-32:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[SZ_CAST]], i8* align 4 bitcast ([3 x i64]* [[SIZET]] to i8*), i32 24, i1 false)
+  // CHECK-32:    [[BP_BASE:%.+]] = getelementptr inbounds [[PRIVS_TY]], [[PRIVS_TY]]* [[BASE]], i32 0, i32 3
+  // CHECK-32:    [[BP_CAST:%.+]] = bitcast [3 x i8*]* [[BP_BASE]] to i8*
+  // CHECK-32:    [[BP_SRC:%.+]] = bitcast i8** [[BP_START]] to i8*
+  // CHECK-32:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[BP_CAST]], i8* align 4 [[BP_SRC]], i32 12, i1 false)
+  // CHECK-32:    [[P_BASE:%.+]] = getelementptr inbounds [[PRIVS_TY]], [[PRIVS_TY]]* [[BASE]], i32 0, i32 4
+  // CHECK-32:    [[P_CAST:%.+]] = bitcast [3 x i8*]* [[P_BASE]] to i8*
+  // CHECK-32:    [[P_SRC:%.+]] = bitcast i8** [[P_START]] to i8*
+  // CHECK-32:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[P_CAST]], i8* align 4 [[P_SRC]], i32 12, i1 false)
+  // CHECK-32:    [[M_BASE:%.+]] = getelementptr inbounds [[PRIVS_TY]], [[PRIVS_TY]]* [[BASE]], i32 0, i32 5
+  // CHECK-32:    [[M_CAST:%.+]] = bitcast [3 x i8*]* [[M_BASE]] to i8*
+  // CHECK-32:    [[M_SRC:%.+]] = bitcast i8** [[M_START]] to i8*
+  // CHECK-32:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[M_CAST]], i8* align 4 [[M_SRC]], i32 12, i1 false)
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP_START:%.+]], i[[SZ]] 1
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP_START]], i[[SZ]] 2
   // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
@@ -148,8 +197,9 @@ int foo(int n) {
   // CHECK:       br label %[[EXIT:.+]]
   // CHECK:       [[EXIT]]:
 
-#pragma omp target device(global + a) nowait depend(inout \
-                                                    : global, a, bn) if (a)
+#pragma omp target device(global + a) nowait depend(inout                                          \
+                                                    : global, a, bn) if (a) map(mapper(id), tofrom \
+                                                                                : d)
   {
     static int local1;
     *plocal = global;
@@ -193,13 +243,22 @@ int foo(int n) {
 
 // CHECK:       define internal void [[HVT1:@.+]](i[[SZ]]* %{{.+}}, i[[SZ]] %{{.+}})
 
-// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1_]](i32{{.*}}, [[TASK_TY1_]]* noalias %1)
-// CHECK:       call void (i8*, ...) %
-// CHECK:       [[SZT:%.+]] = getelementptr inbounds [2 x i64], [2 x i64]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+// CHECK:       define internal void [[MAPPER_ID]](i8* %{{.+}}, i8* %{{.+}}, i8* %{{.+}}, i64 %{{.+}}, i64 %{{.+}})
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1_]](i32{{.*}}, [[TASK_TY1_]]* noalias %{{.+}})
+// CHECK:       call void (i8*, ...) %{{.+}}(i8* %{{.+}}, i[[SZ]]*** %{{.+}}, i32** %{{.+}}, [3 x i8*]** [[BPTR_ADDR:%.+]], [3 x i8*]** [[PTR_ADDR:%.+]], [3 x i64]** [[SZ_ADDR:%.+]], [3 x i8*]** [[M_ADDR:%.+]])
+// CHECK:       [[BPTR_REF:%.+]] = load [3 x i8*]*, [3 x i8*]** [[BPTR_ADDR]],
+// CHECK:       [[PTR_REF:%.+]] = load [3 x i8*]*, [3 x i8*]** [[PTR_ADDR]],
+// CHECK:       [[SZ_REF:%.+]] = load [3 x i64]*, [3 x i64]** [[SZ_ADDR]],
+// CHECK:       [[M_REF:%.+]] = load [3 x i8*]*, [3 x i8*]** [[M_ADDR]],
+// CHECK:       [[BPR:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BPTR_REF]], i[[SZ]] 0, i[[SZ]] 0
+// CHECK:       [[PR:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[PTR_REF]], i[[SZ]] 0, i[[SZ]] 0
+// CHECK:       [[SZT:%.+]] = getelementptr inbounds [3 x i64], [3 x i64]* [[SZ_REF]], i[[SZ]] 0, i[[SZ]] 0
+// CHECK:       [[M:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[M_REF]], i[[SZ]] 0, i[[SZ]] 0
 // CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
 // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
 // CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
-// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_nowait_mapper(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 2, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i64* [[SZT]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* [[MAPT]], i32 0, i32 0), i8** [[M:%[^,]+]])
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_nowait_mapper(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 3, i8** [[BPR]], i8** [[PR]], i64* [[SZT]], i64* getelementptr inbounds ([3 x i64], [3 x i64]* [[MAPT]], i32 0, i32 0), i8** [[M]])
 
 // CHECK:       [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
 // CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
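
For orientation, the expanded expectations above correspond to attaching a user-defined mapper to a map clause on a deferred target region. A simplified, non-template sketch of the construct (the real test maps a TT<long long, char> object named d):

struct TT { long long X; char Y; };
// Named mapper describing how a TT object is mapped piecewise.
#pragma omp declare mapper(id : TT s) map(s.X, s.Y)

void launch(TT &d, int dev) {
#pragma omp target device(dev) nowait map(mapper(id), tofrom : d)
  d.X += d.Y;
}

Per the CHECK lines, every map entry now gets a slot in a third, mappers array: a null pointer for default-mapped arguments and the emitted mapper function (bitcast to i8*) for d. That array is copied into the task's privates alongside the base pointers, pointers, and sizes, and reappears as the final i8** argument of __tgt_target_nowait_mapper.
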
diff --git a/clang/test/OpenMP/target_is_device_ptr_codegen.cpp b/clang/test/OpenMP/target_is_device_ptr_codegen.cpp
index 7c2eef577f9f3..a7c585751161e 100644
--- a/clang/test/OpenMP/target_is_device_ptr_codegen.cpp
+++ b/clang/test/OpenMP/target_is_device_ptr_codegen.cpp
@@ -285,4 +285,41 @@ void bar(double *arg){
   ++arg;
 }
 #endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+// RUN: %clang_cc1 -DCK3 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -DCK3 -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp-simd -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -DCK3 -fopenmp-simd -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// SIMD-ONLY1-NOT: {{__kmpc|__tgt}}
+#ifdef CK3
+
+// CK3-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[SZ:64|32]]] [i{{64|32}} {{8|4}}]
+// OMP_MAP_TARGET_PARAM = 0x20 | OMP_MAP_TO = 0x1 = 0x21
+// CK3-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i64] [i64 [[#0x21]]]
+void bar() {
+  __attribute__((aligned(64))) double *ptr;
+  // CK3-DAG: call i32 @__tgt_target_mapper(i64 {{.+}}, i8* {{.+}}, i32 1, i8** [[BPGEP:%[0-9]+]], i8** [[PGEP:%[0-9]+]], {{.+}}[[SIZES]]{{.+}}, {{.+}}[[TYPES]]{{.+}}, i8** null)
+  // CK3-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0
+  // CK3-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[PS:%[^,]+]], i32 0, i32 0
+  // CK3-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 0
+  // CK3-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 0
+  // CK3-DAG: [[CBP1:%.+]] = bitcast i8** [[BP1]] to double***
+  // CK3-DAG: [[CP1:%.+]] = bitcast i8** [[P1]] to double***
+  // CK3-DAG: store double** [[PTR:%.+]], double*** [[CBP1]]
+  // CK3-DAG: store double** [[PTR]], double*** [[CP1]]
+
+  // CK3: call void [[KERNEL:@.+]](double** [[PTR]])
+#pragma omp target is_device_ptr(ptr)
+  *ptr = 0;
+}
+#endif
 #endif
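
The CK3 block added above reduces to the following pattern. Per its CHECK lines the pointer is forwarded by value: a single map entry whose type word is 0x21 (OMP_MAP_TARGET_PARAM | OMP_MAP_TO) and whose size is the pointer size, with no mapper array (i8** null).

// Sketch only; assume dev_ptr already refers to device memory.
void zero(double *dev_ptr) {
#pragma omp target is_device_ptr(dev_ptr)
  *dev_ptr = 0;
}
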
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp
index d2031d6d214b1..7dff11951d9f8 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp
@@ -101,10 +101,7 @@ int target_teams_fun(int *g){
 
   // CK1: define internal void @[[OUTL1]]({{.+}})
   // CK1: [[ARRDECAY:%.+]] = getelementptr inbounds [1000 x i32], [1000 x i32]* %{{.+}}, i{{32|64}} 0, i{{32|64}} 0
-  // CK1: [[ARR_CAST:%.+]] = ptrtoint i32* [[ARRDECAY]] to i{{32|64}}
-  // CK1: [[MASKED_PTR:%.+]] = and i{{32|64}} [[ARR_CAST]], 7
-  // CK1: [[COND:%.+]] = icmp eq i{{32|64}} [[MASKED_PTR]], 0
-  // CK1: call void @llvm.assume(i1 [[COND]])
+  // CK1: call void @llvm.assume(i1 true) [ "align"(i32* [[ARRDECAY]], {{i64|i32}} 8) ]
   // CK1: call void @__kmpc_for_static_init_4(
   // CK1: call void {{.+}} @__kmpc_fork_call(
   // CK1: call void @__kmpc_for_static_fini(
diff --git a/clang/test/OpenMP/task_codegen.cpp b/clang/test/OpenMP/task_codegen.cpp
index 3c92ca75b1016..f54499ca38f06 100644
--- a/clang/test/OpenMP/task_codegen.cpp
+++ b/clang/test/OpenMP/task_codegen.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix UNTIEDRT
-// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - -DUNTIEDRT | FileCheck %s --check-prefix CHECK --check-prefix UNTIEDRT
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -DUNTIEDRT
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix UNTIEDRT
 //
 // RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fopenmp-enable-irbuilder -x c++ -emit-llvm %s -o - | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s
@@ -14,6 +14,19 @@
 #ifndef HEADER
 #define HEADER
 
+enum omp_allocator_handle_t {
+  omp_null_allocator = 0,
+  omp_default_mem_alloc = 1,
+  omp_large_cap_mem_alloc = 2,
+  omp_const_mem_alloc = 3,
+  omp_high_bw_mem_alloc = 4,
+  omp_low_lat_mem_alloc = 5,
+  omp_cgroup_mem_alloc = 6,
+  omp_pteam_mem_alloc = 7,
+  omp_thread_mem_alloc = 8,
+  KMP_ALLOCATOR_MAX_HANDLE = __UINTPTR_MAX__
+};
+
 // CHECK-DAG: [[IDENT_T:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STRUCT_SHAREDS:%.+]] = type { i8*, [2 x [[STRUCT_S:%.+]]]* }
 // CHECK-DAG: [[STRUCT_SHAREDS1:%.+]] = type { [2 x [[STRUCT_S:%.+]]]* }
@@ -258,21 +271,26 @@ int main() {
     a = 4;
     c = 5;
   }
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i32 0, i64 48, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY6:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i32 0, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY6:@.+]] to i32 (i32, i8*)*))
 // CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i8* [[ORIG_TASK_PTR]])
-#pragma omp task untied
+#pragma omp task untied firstprivate(c) allocate(omp_pteam_mem_alloc:c)
   {
-    S s1;
+    S s1, s2;
+#ifdef UNTIEDRT
+#pragma omp allocate(s2) allocator(omp_pteam_mem_alloc)
+#endif
+    s2.a = 0;
 #pragma omp task
-    a = 4;
+    a = c = 4;
 #pragma omp taskyield
     s1 = S();
+    s2.a = 10;
 #pragma omp taskwait
   }
   return a;
 }
 // CHECK: define internal i32 [[TASK_ENTRY1]](i32 %0, [[KMP_TASK_T]]{{.*}}* noalias %1)
-// CHECK: store i32 15, i32* [[A_PTR:@.+]]
+// CHECK: store i32 15, i32* [[A_PTR:@.+]],
 // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A_PTR]]
 // CHECK: [[A_VAL_I8:%.+]] = trunc i32 [[A_VAL]] to i8
 // CHECK: store i8 [[A_VAL_I8]], i8* %{{.+}}
@@ -294,10 +312,13 @@ int main() {
 // CHECK: define internal i32
 // CHECK: store i32 4, i32* [[A_PTR]]
 
-// CHECK: define internal i32 [[TASK_ENTRY6]](i32 %0, [[KMP_TASK_T]]{{.*}}* noalias %1)
+// CHECK: define internal i32 [[TASK_ENTRY6]](i32 %0, [[KMP_TASK_T]]{{.*}}* noalias %{{.+}})
 // UNTIEDRT: [[S1_ADDR_PTR:%.+]] = alloca %struct.S*,
-// UNTIEDRT: call void (i8*, ...) %{{.+}}(i8* %{{.+}}, %struct.S** [[S1_ADDR_PTR]])
-// UNTIEDRT: [[S1_ADDR:%.+]] = load %struct.S*, %struct.S** [[S1_ADDR_PTR]],
+// UNTIEDRT: [[S2_ADDR_PTR_REF:%.+]] = alloca %struct.S**,
+// UNTIEDRT: call void (i8*, ...) %{{.+}}(i8* %{{.+}}, %struct.S** [[S1_ADDR_PTR]], %struct.S*** [[S2_ADDR_PTR_REF]])
+// UNTIEDRT-DAG: [[S1_ADDR:%.+]] = load %struct.S*, %struct.S** [[S1_ADDR_PTR]],
+// UNTIEDRT-DAG: [[S2_ADDR_PTR:%.+]] = load %struct.S**, %struct.S*** [[S2_ADDR_PTR_REF]],
+// UNTIEDRT-DAG: [[S2_ADDR:%.+]] = load %struct.S*, %struct.S** [[S2_ADDR_PTR]],
 // CHECK: switch i32 %{{.+}}, label %[[DONE:.+]] [
 
 // CHECK: [[DONE]]:
@@ -309,16 +330,25 @@ int main() {
 // UNTIEDRT: br label %[[EXIT:[^,]+]]
 
 // UNTIEDRT: call void [[CONSTR:@.+]](%struct.S* [[S1_ADDR]])
+// UNTIEDRT: [[S2_VOID_PTR:%.+]] = call i8* @__kmpc_alloc(i32 %{{.+}}, i64 4, i8* inttoptr (i64 7 to i8*))
+// UNTIEDRT: [[S2_PTR:%.+]] = bitcast i8* [[S2_VOID_PTR]] to %struct.S*
+// UNTIEDRT: store %struct.S* [[S2_PTR]], %struct.S** [[S2_ADDR_PTR]],
+// UNTIEDRT: load i32*, i32** %
+// UNTIEDRT: store i32 2, i32* %
+// UNTIEDRT: call i32 @__kmpc_omp_task(%
+// UNTIEDRT: br label %[[EXIT]]
+
+// UNTIEDRT: call void [[CONSTR]](%struct.S* [[S2_ADDR]])
 // CHECK: call i8* @__kmpc_omp_task_alloc(
 // CHECK: call i32 @__kmpc_omp_task(%
 // CHECK: load i32*, i32** %
-// CHECK: store i32 2, i32* %
+// CHECK: store i32 {{2|3}}, i32* %
 // CHECK: call i32 @__kmpc_omp_task(%
 // UNTIEDRT: br label %[[EXIT]]
 
 // CHECK: call i32 @__kmpc_omp_taskyield(%
 // CHECK: load i32*, i32** %
-// CHECK: store i32 3, i32* %
+// CHECK: store i32 {{3|4}}, i32* %
 // CHECK: call i32 @__kmpc_omp_task(%
 // UNTIEDRT: br label %[[EXIT]]
 
@@ -331,10 +361,13 @@ int main() {
 
 // CHECK: call i32 @__kmpc_omp_taskwait(%
 // CHECK: load i32*, i32** %
-// CHECK: store i32 4, i32* %
+// CHECK: store i32 {{4|5}}, i32* %
 // CHECK: call i32 @__kmpc_omp_task(%
 // UNTIEDRT: br label %[[EXIT]]
 
+// UNTIEDRT: call void [[DESTR]](%struct.S* [[S2_ADDR]])
+// UNTIEDRT: [[S2_VOID_PTR:%.+]] = bitcast %struct.S* [[S2_ADDR]] to i8*
+// UNTIEDRT: call void @__kmpc_free(i32 %{{.+}}, i8* [[S2_VOID_PTR]], i8* inttoptr (i64 7 to i8*))
 // UNTIEDRT: call void [[DESTR]](%struct.S* [[S1_ADDR]])
 // CHECK: br label %[[CLEANUP]]
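
Stripped of the surrounding checks, the UNTIEDRT additions track a task-local object whose storage comes from an OpenMP allocator. A minimal sketch (the one-enumerator enum is a trimmed stand-in for the enum the test defines; real code would get it from omp.h):

typedef enum omp_allocator_handle_t {
  omp_pteam_mem_alloc = 7 // matches the handle value used in the test
} omp_allocator_handle_t;

struct S { int a; S(); ~S(); };

void work() {
#pragma omp task untied
  {
    S s2;
#pragma omp allocate(s2) allocator(omp_pteam_mem_alloc)
    s2.a = 10;
  } // s2 is destroyed here, then its storage is released
}

Per the checks, the allocator handle travels through both runtime calls as inttoptr (i64 7 to i8*): __kmpc_alloc provides the 4 bytes for s2 and __kmpc_free releases them after the destructor runs.
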
 
diff --git a/clang/test/Preprocessor/has_attribute.cpp b/clang/test/Preprocessor/has_attribute.cpp
index e7303c7c5b4dd..a66624ac4147a 100644
--- a/clang/test/Preprocessor/has_attribute.cpp
+++ b/clang/test/Preprocessor/has_attribute.cpp
@@ -62,13 +62,13 @@ CXX11(unlikely)
 // FIXME(201806L) CHECK: ensures: 0
 // FIXME(201806L) CHECK: expects: 0
 // CHECK: fallthrough: 201603L
-// FIXME(201803L) CHECK: likely: 0
+// FIXME(201803L) CHECK: likely: 2L
 // CHECK: maybe_unused: 201603L
 // ITANIUM: no_unique_address: 201803L
 // WINDOWS: no_unique_address: 0
 // CHECK: nodiscard: 201907L
 // CHECK: noreturn: 200809L
-// FIXME(201803L) CHECK: unlikely: 0
+// FIXME(201803L) CHECK: unlikely: 2L
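
The two FIXME updates above record that __has_cpp_attribute(likely) and __has_cpp_attribute(unlikely) currently report 2L rather than the standard 201803L, so a portable feature test should only rely on the value being nonzero; for example:

#if defined(__has_cpp_attribute) && __has_cpp_attribute(likely)
#define IF_LIKELY(c) if (c) [[likely]] // recognized even while the value is 2L
#else
#define IF_LIKELY(c) if (c)
#endif
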
 
 // Test for Microsoft __declspec attributes
 
diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c
index 5326596fee93c..287a7c58cddab 100644
--- a/clang/test/Preprocessor/predefined-arch-macros.c
+++ b/clang/test/Preprocessor/predefined-arch-macros.c
@@ -2525,6 +2525,7 @@
 // CHECK_AMDFAM10_M32: #define __SSE4A__ 1
 // CHECK_AMDFAM10_M32: #define __SSE_MATH__ 1
 // CHECK_AMDFAM10_M32: #define __SSE__ 1
+// CHECK_AMDFAM10_M32-NOT: #define __SSSE3__ 1
 // CHECK_AMDFAM10_M32: #define __amdfam10 1
 // CHECK_AMDFAM10_M32: #define __amdfam10__ 1
 // CHECK_AMDFAM10_M32: #define __i386 1
@@ -2547,6 +2548,7 @@
 // CHECK_AMDFAM10_M64: #define __SSE4A__ 1
 // CHECK_AMDFAM10_M64: #define __SSE_MATH__ 1
 // CHECK_AMDFAM10_M64: #define __SSE__ 1
+// CHECK_AMDFAM10_M64-NOT: #define __SSSE3__ 1
 // CHECK_AMDFAM10_M64: #define __amd64 1
 // CHECK_AMDFAM10_M64: #define __amd64__ 1
 // CHECK_AMDFAM10_M64: #define __amdfam10 1
@@ -3233,9 +3235,26 @@
 // RUN:     -target sparc-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SPARC-V9
 // CHECK_SPARC-V9-NOT: #define __sparcv8 1
+// CHECK_SPARC-V9-NOT: #define __sparcv8__ 1
 // CHECK_SPARC-V9: #define __sparc_v9__ 1
 // CHECK_SPARC-V9: #define __sparcv9 1
-// CHECK_SPARC-V9-NOT: #define __sparcv8 1
+// CHECK_SPARC-V9: #define __sparcv9__ 1
+
+// RUN: %clang -E -dM %s -o - 2>&1 \
+// RUN:     -target sparc-sun-solaris \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SPARC_SOLARIS_GCC_ATOMICS
+// CHECK_SPARC_SOLARIS_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+// CHECK_SPARC_SOLARIS_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
+// CHECK_SPARC_SOLARIS_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
+// CHECK_SPARC_SOLARIS_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1
+
+// RUN: %clang -mcpu=v8 -E -dM %s -o - 2>&1 \
+// RUN:     -target sparc-sun-solaris \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SPARC_SOLARIS_GCC_ATOMICS-V8
+// CHECK_SPARC_SOLARIS_GCC_ATOMICS-V8-NOT: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+// CHECK_SPARC_SOLARIS_GCC_ATOMICS-V8-NOT: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
+// CHECK_SPARC_SOLARIS_GCC_ATOMICS-V8-NOT: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
+// CHECK_SPARC_SOLARIS_GCC_ATOMICS-V8-NOT: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1
 
 // RUN: %clang -E -dM %s -o - 2>&1 \
 // RUN:     -target sparcel-unknown-linux \
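
The new CHECK_SPARC_SOLARIS_GCC_ATOMICS blocks above matter to code that selects an atomics strategy from these predefines, which on sparc-sun-solaris are present by default but absent with -mcpu=v8. A sketch of the pattern they gate:

int increment(int *p) {
#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
  return __sync_add_and_fetch(p, 1); // lock-free path, available on V9
#else
  return ++*p; // V8 fallback; real code would take a lock here
#endif
}
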
diff --git a/clang/test/Sema/attr-arm-sve-vector-bits.c b/clang/test/Sema/attr-arm-sve-vector-bits.c
index 1bcbfa360c976..7cc2d4f4e0b5e 100644
--- a/clang/test/Sema/attr-arm-sve-vector-bits.c
+++ b/clang/test/Sema/attr-arm-sve-vector-bits.c
@@ -1,11 +1,16 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fsyntax-only -verify -msve-vector-bits=128 -fallow-half-arguments-and-returns %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fsyntax-only -verify -msve-vector-bits=256 -fallow-half-arguments-and-returns %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fsyntax-only -verify -msve-vector-bits=512 -fallow-half-arguments-and-returns %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fsyntax-only -verify -msve-vector-bits=1024 -fallow-half-arguments-and-returns %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fsyntax-only -verify -msve-vector-bits=2048 -fallow-half-arguments-and-returns %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -ffreestanding -fsyntax-only -verify -msve-vector-bits=128 -fallow-half-arguments-and-returns %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -ffreestanding -fsyntax-only -verify -msve-vector-bits=256 -fallow-half-arguments-and-returns %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -ffreestanding -fsyntax-only -verify -msve-vector-bits=512 -fallow-half-arguments-and-returns %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -ffreestanding -fsyntax-only -verify -msve-vector-bits=1024 -fallow-half-arguments-and-returns %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -ffreestanding -fsyntax-only -verify -msve-vector-bits=2048 -fallow-half-arguments-and-returns %s
+
+#include <stdint.h>
 
 #define N __ARM_FEATURE_SVE_BITS
 
+typedef __fp16 float16_t;
+typedef float float32_t;
+typedef double float64_t;
 typedef __SVInt8_t svint8_t;
 typedef __SVInt16_t svint16_t;
 typedef __SVInt32_t svint32_t;
@@ -19,6 +24,7 @@ typedef __SVFloat32_t svfloat32_t;
 typedef __SVFloat64_t svfloat64_t;
 
 #if defined(__ARM_FEATURE_SVE_BF16)
+typedef __bf16 bfloat16_t;
 typedef __SVBFloat16_t svbfloat16_t;
 #endif
 
@@ -43,6 +49,23 @@ typedef svbfloat16_t fixed_bfloat16_t __attribute__((arm_sve_vector_bits(N)));
 
 typedef svbool_t fixed_bool_t __attribute__((arm_sve_vector_bits(N)));
 
+// GNU vector types
+typedef int8_t gnu_int8_t __attribute__((vector_size(N / 8)));
+typedef int16_t gnu_int16_t __attribute__((vector_size(N / 8)));
+typedef int32_t gnu_int32_t __attribute__((vector_size(N / 8)));
+typedef int64_t gnu_int64_t __attribute__((vector_size(N / 8)));
+
+typedef uint8_t gnu_uint8_t __attribute__((vector_size(N / 8)));
+typedef uint16_t gnu_uint16_t __attribute__((vector_size(N / 8)));
+typedef uint32_t gnu_uint32_t __attribute__((vector_size(N / 8)));
+typedef uint64_t gnu_uint64_t __attribute__((vector_size(N / 8)));
+
+typedef float16_t gnu_float16_t __attribute__((vector_size(N / 8)));
+typedef float32_t gnu_float32_t __attribute__((vector_size(N / 8)));
+typedef float64_t gnu_float64_t __attribute__((vector_size(N / 8)));
+
+typedef bfloat16_t gnu_bfloat16_t __attribute__((vector_size(N / 8)));
+
 // Attribute must have a single argument
 typedef svint8_t no_argument __attribute__((arm_sve_vector_bits));         // expected-error {{'arm_sve_vector_bits' attribute takes one argument}}
 typedef svint8_t two_arguments __attribute__((arm_sve_vector_bits(2, 4))); // expected-error {{'arm_sve_vector_bits' attribute takes one argument}}
@@ -176,38 +199,51 @@ union union_bool { fixed_bool_t x, y[5]; };
 // --------------------------------------------------------------------------//
 // Implicit casts
 
-#define TEST_CAST(TYPE)                                          \
-  sv##TYPE##_t to_sv##TYPE##_t(fixed_##TYPE##_t x) { return x; } \
-  fixed_##TYPE##_t from_sv##TYPE##_t(sv##TYPE##_t x) { return x; }
-
-TEST_CAST(int8)
-TEST_CAST(int16)
-TEST_CAST(int32)
-TEST_CAST(int64)
-TEST_CAST(uint8)
-TEST_CAST(uint16)
-TEST_CAST(uint32)
-TEST_CAST(uint64)
-TEST_CAST(float16)
-TEST_CAST(float32)
-TEST_CAST(float64)
-TEST_CAST(bfloat16)
-TEST_CAST(bool)
+#define TEST_CAST_COMMON(TYPE)                                              \
+  sv##TYPE##_t to_sv##TYPE##_t_from_fixed(fixed_##TYPE##_t x) { return x; } \
+  fixed_##TYPE##_t from_sv##TYPE##_t_to_fixed(sv##TYPE##_t x) { return x; }
+
+#define TEST_CAST_GNU(PREFIX, TYPE)                                                          \
+  gnu_##TYPE##_t to_gnu_##TYPE##_t_from_##PREFIX##TYPE##_t(PREFIX##TYPE##_t x) { return x; } \
+  PREFIX##TYPE##_t from_gnu_##TYPE##_t_to_##PREFIX##TYPE##_t(gnu_##TYPE##_t x) { return x; }
+
+#define TEST_CAST_VECTOR(TYPE) \
+  TEST_CAST_COMMON(TYPE)       \
+  TEST_CAST_GNU(sv, TYPE)      \
+  TEST_CAST_GNU(fixed_, TYPE)
+
+TEST_CAST_VECTOR(int8)
+TEST_CAST_VECTOR(int16)
+TEST_CAST_VECTOR(int32)
+TEST_CAST_VECTOR(int64)
+TEST_CAST_VECTOR(uint8)
+TEST_CAST_VECTOR(uint16)
+TEST_CAST_VECTOR(uint32)
+TEST_CAST_VECTOR(uint64)
+TEST_CAST_VECTOR(float16)
+TEST_CAST_VECTOR(float32)
+TEST_CAST_VECTOR(float64)
+TEST_CAST_VECTOR(bfloat16)
+TEST_CAST_COMMON(bool)
 
 // Test the implicit conversion only applies to valid types
 fixed_int8_t to_fixed_int8_t__from_svuint8_t(svuint8_t x) { return x; } // expected-error-re {{returning 'svuint8_t' (aka '__SVUint8_t') from a function with incompatible result type 'fixed_int8_t' (vector of {{[0-9]+}} 'signed char' values)}}
 fixed_bool_t to_fixed_bool_t__from_svint32_t(svint32_t x) { return x; } // expected-error-re {{returning 'svint32_t' (aka '__SVInt32_t') from a function with incompatible result type 'fixed_bool_t' (vector of {{[0-9]+}} 'unsigned char' values)}}
 
+svint64_t to_svint64_t__from_gnu_int32_t(gnu_int32_t x) { return x; } // expected-error-re {{returning 'gnu_int32_t' (vector of {{[0-9]+}} 'int32_t' values) from a function with incompatible result type 'svint64_t' (aka '__SVInt64_t')}}
+gnu_int32_t from_svint64_t__to_gnu_int32_t(svint64_t x) { return x; } // expected-error-re {{returning 'svint64_t' (aka '__SVInt64_t') from a function with incompatible result type 'gnu_int32_t' (vector of {{[0-9]+}} 'int32_t' values)}}
+
+// Test implicit conversion between SVE and GNU vector is invalid when
+// __ARM_FEATURE_SVE_BITS != N
+#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 512
+typedef int32_t int4 __attribute__((vector_size(16)));
+svint32_t badcast(int4 x) { return x; } // expected-error {{returning 'int4' (vector of 4 'int32_t' values) from a function with incompatible result type 'svint32_t' (aka '__SVInt32_t')}}
+#endif
+
 // Test conversion between predicate and uint8 is invalid, both have the same
 // memory representation.
 fixed_bool_t to_fixed_bool_t__from_svuint8_t(svuint8_t x) { return x; } // expected-error-re {{returning 'svuint8_t' (aka '__SVUint8_t') from a function with incompatible result type 'fixed_bool_t' (vector of {{[0-9]+}} 'unsigned char' values)}}
 
-// Test the implicit conversion only applies to fixed-length types
-typedef signed int vSInt32 __attribute__((__vector_size__(16)));
-svint32_t to_svint32_t_from_gnut(vSInt32 x) { return x; } // expected-error-re {{returning 'vSInt32' (vector of {{[0-9]+}} 'int' values) from a function with incompatible result type 'svint32_t' (aka '__SVInt32_t')}}
-
-vSInt32 to_gnut_from_svint32_t(svint32_t x) { return x; } // expected-error-re {{returning 'svint32_t' (aka '__SVInt32_t') from a function with incompatible result type 'vSInt32' (vector of {{[0-9]+}} 'int' values)}}
-
 // --------------------------------------------------------------------------//
 // Test the scalable and fixed-length types can be used interchangeably
 
diff --git a/clang/test/Sema/attr-likelihood.c b/clang/test/Sema/attr-likelihood.c
new file mode 100644
index 0000000000000..66aabd6b64052
--- /dev/null
+++ b/clang/test/Sema/attr-likelihood.c
@@ -0,0 +1,51 @@
+// RUN: %clang_cc1 %s -fsyntax-only -fdouble-square-bracket-attributes -verify
+
+void g() {
+  if (1)
+    [[clang::likely]] {}
+}
+void m() {
+  [[clang::likely]] int x = 42; // expected-error {{'likely' attribute cannot be applied to a declaration}}
+
+  if (x)
+    [[clang::unlikely]] {}
+  if (x) {
+    [[clang::unlikely]];
+  }
+  switch (x) {
+  case 1:
+    [[clang::likely]] {}
+    break;
+    [[clang::likely]] case 2 : case 3 : {}
+    break;
+  }
+
+  do {
+    [[clang::unlikely]];
+  } while (x);
+  do
+    [[clang::unlikely]] {}
+  while (x);
+  do { // expected-note {{to match this 'do'}}
+  }
+  [[clang::unlikely]] while (x); // expected-error {{expected 'while' in do/while loop}}
+  for (;;)
+    [[clang::unlikely]] {}
+  for (;;) {
+    [[clang::unlikely]];
+  }
+  while (x)
+    [[clang::unlikely]] {}
+  while (x) {
+    [[clang::unlikely]];
+  }
+
+  if (x)
+    goto lbl;
+
+  // FIXME: allow the attribute on the label
+  [[clang::unlikely]] lbl : // expected-error {{'unlikely' attribute cannot be applied to a declaration}}
+  [[clang::likely]] x = x + 1;
+
+  [[clang::likely]]++ x;
+}
diff --git a/clang/test/Sema/implicit-builtin-decl.c b/clang/test/Sema/implicit-builtin-decl.c
index 3a3dfa935ac16..b25e86bc03a33 100644
--- a/clang/test/Sema/implicit-builtin-decl.c
+++ b/clang/test/Sema/implicit-builtin-decl.c
@@ -1,5 +1,4 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
-// RUN: not %clang_cc1 -fsyntax-only -ast-dump %s | FileCheck %s
 
 void f() {
   int *ptr = malloc(sizeof(int) * 10); // expected-warning{{implicitly declaring library function 'malloc' with type}} \
@@ -63,9 +62,5 @@ extern float fmaxf(float, float);
 struct __jmp_buf_tag {};
 void sigsetjmp(struct __jmp_buf_tag[1], int); // expected-warning{{declaration of built-in function 'sigsetjmp' requires the declaration of the 'jmp_buf' type, commonly provided in the header <setjmp.h>.}}
 
-// CHECK:     FunctionDecl {{.*}}  col:6 sigsetjmp '
-// CHECK-NOT: FunctionDecl
-// CHECK:     ReturnsTwiceAttr {{.*}} <{{.*}}> Implicit
-
 // PR40692
 void pthread_create(); // no warning expected
diff --git a/clang/test/Sema/warn-fortify-source.c b/clang/test/Sema/warn-fortify-source.c
index 0f93a687f007d..5ad2979bc29c6 100644
--- a/clang/test/Sema/warn-fortify-source.c
+++ b/clang/test/Sema/warn-fortify-source.c
@@ -1,8 +1,6 @@
 // RUN: %clang_cc1 -triple x86_64-apple-macosx10.14.0 %s -verify
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.14.0 %s -verify -DUSE_PASS_OBJECT_SIZE
 // RUN: %clang_cc1 -triple x86_64-apple-macosx10.14.0 %s -verify -DUSE_BUILTINS
 // RUN: %clang_cc1 -xc++ -triple x86_64-apple-macosx10.14.0 %s -verify
-// RUN: %clang_cc1 -xc++ -triple x86_64-apple-macosx10.14.0 %s -verify -DUSE_PASS_OBJECT_SIZE
 // RUN: %clang_cc1 -xc++ -triple x86_64-apple-macosx10.14.0 %s -verify -DUSE_BUILTINS
 
 typedef unsigned long size_t;
@@ -13,13 +11,7 @@ extern "C" {
 
 extern int sprintf(char *str, const char *format, ...);
 
-#if defined(USE_PASS_OBJECT_SIZE)
-void *memcpy(void *dst, const void *src, size_t c);
-static void *memcpy(void *dst __attribute__((pass_object_size(1))), const void *src, size_t c) __attribute__((overloadable)) __asm__("merp");
-static void *memcpy(void *const dst __attribute__((pass_object_size(1))), const void *src, size_t c) __attribute__((overloadable)) {
-  return 0;
-}
-#elif defined(USE_BUILTINS)
+#if defined(USE_BUILTINS)
 #define memcpy(x,y,z) __builtin_memcpy(x,y,z)
 #else
 void *memcpy(void *dst, const void *src, size_t c);
@@ -45,14 +37,7 @@ void call_memcpy_type() {
   };
   struct pair p;
   char buf[20];
-  memcpy(&p.first, buf, 20);
-#ifdef USE_PASS_OBJECT_SIZE
-  // Use the more strict checking mode on the pass_object_size attribute:
-  // expected-warning@-3 {{memcpy' will always overflow; destination buffer has size 4, but size argument is 20}}
-#else
-  // Or just fallback to type 0:
-  // expected-warning@-6 {{memcpy' will always overflow; destination buffer has size 8, but size argument is 20}}
-#endif
+  memcpy(&p.first, buf, 20); // expected-warning {{memcpy' will always overflow; destination buffer has size 8, but size argument is 20}}
 }
 
 void call_strncat() {
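
With the pass_object_size overloads gone, the single expectation left in call_memcpy_type above reflects the type-0 __builtin_object_size fallback, which measures to the end of the enclosing object rather than of the member. Reduced:

struct pair { int first; int second; };

void f(const char *buf) {
  struct pair p;
  // Type-0 object size of &p.first is 8 (the whole struct), not 4 (the
  // member), hence "destination buffer has size 8" in the diagnostic.
  __builtin_memcpy(&p.first, buf, 20);
}
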
diff --git a/clang/test/SemaCXX/attr-arm-sve-vector-bits.cpp b/clang/test/SemaCXX/attr-arm-sve-vector-bits.cpp
index ea7c4778db0ea..5e796b7c8995f 100644
--- a/clang/test/SemaCXX/attr-arm-sve-vector-bits.cpp
+++ b/clang/test/SemaCXX/attr-arm-sve-vector-bits.cpp
@@ -1,14 +1,26 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fsyntax-only -verify -std=c++11 -msve-vector-bits=512 -fallow-half-arguments-and-returns %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -ffreestanding -fsyntax-only -verify -std=c++11 -msve-vector-bits=512 -fallow-half-arguments-and-returns %s
 // expected-no-diagnostics
 
+#include <stdint.h>
+
 #define N __ARM_FEATURE_SVE_BITS
 
 typedef __SVInt8_t svint8_t;
 typedef svint8_t fixed_int8_t __attribute__((arm_sve_vector_bits(N)));
+typedef int8_t gnu_int8_t __attribute__((vector_size(N / 8)));
 
 template <typename T> struct S { T var; };
 
 S<svint8_t> s;
 
+// Test implicit casts between VLA and VLS vectors
 svint8_t to_svint8_t(fixed_int8_t x) { return x; }
 fixed_int8_t from_svint8_t(svint8_t x) { return x; }
+
+// Test implicit casts between GNU and VLA vectors
+svint8_t to_svint8_t__from_gnu_int8_t(gnu_int8_t x) { return x; }
+gnu_int8_t from_svint8_t__to_gnu_int8_t(svint8_t x) { return x; }
+
+// Test implicit casts between GNU and VLS vectors
+fixed_int8_t to_fixed_int8_t__from_gnu_int8_t(gnu_int8_t x) { return x; }
+gnu_int8_t from_fixed_int8_t__to_gnu_int8_t(fixed_int8_t x) { return x; }
diff --git a/clang/test/SemaCXX/attr-likelihood.cpp b/clang/test/SemaCXX/attr-likelihood.cpp
new file mode 100644
index 0000000000000..c8be00bfcc32c
--- /dev/null
+++ b/clang/test/SemaCXX/attr-likelihood.cpp
@@ -0,0 +1,132 @@
+// RUN: %clang_cc1 %s -fsyntax-only -verify
+// RUN: %clang_cc1 %s -DPEDANTIC -pedantic -fsyntax-only -verify
+
+#if PEDANTIC
+void g() {
+  if (true)
+    [[likely]] {} // expected-warning {{use of the 'likely' attribute is a C++20 extension}}
+  else
+    [[unlikely]] {} // expected-warning {{use of the 'unlikely' attribute is a C++20 extension}}
+}
+#else
+void a() {
+  if (true)
+    [[likely]]; // expected-warning {{conflicting attributes 'likely' are ignored}}
+  else
+    [[likely]]; // expected-note {{conflicting attribute is here}}
+}
+
+void b() {
+  if (true)
+    [[unlikely]]; // expected-warning {{conflicting attributes 'unlikely' are ignored}}
+  else
+    [[unlikely]]; // expected-note {{conflicting attribute is here}}
+}
+
+void c() {
+  if (true)
+    [[likely]];
+}
+
+void d() {
+  if (true)
+    [[unlikely]];
+}
+
+void g() {
+  if (true)
+    [[likely]] {}
+  else
+    [[unlikely]] {}
+}
+
+void h() {
+  if (true)
+    [[likely]] {}
+  else {
+  }
+}
+
+void i() {
+  if (true)
+    [[unlikely]] {}
+  else {
+  }
+}
+
+void j() {
+  if (true) {
+  } else
+    [[likely]] {}
+}
+
+void k() {
+  if (true) {
+  } else
+    [[likely]] {}
+}
+
+void l() {
+  if (true)
+    [[likely]] {}
+  else
+    [[unlikely]] if (false) [[likely]] {}
+}
+
+void m() {
+  [[likely]] int x = 42; // expected-error {{'likely' attribute cannot be applied to a declaration}}
+
+  if (x)
+    [[unlikely]] {}
+  if (x) {
+    [[unlikely]];
+  }
+  switch (x) {
+  case 1:
+    [[likely]] {}
+    break;
+    [[likely]] case 2 : case 3 : {}
+    break;
+  }
+
+  do {
+    [[unlikely]];
+  } while (x);
+  do
+    [[unlikely]] {}
+  while (x);
+  do { // expected-note {{to match this 'do'}}
+  }
+  [[unlikely]] while (x); // expected-error {{expected 'while' in do/while loop}}
+  for (;;)
+    [[unlikely]] {}
+  for (;;) {
+    [[unlikely]];
+  }
+  while (x)
+    [[unlikely]] {}
+  while (x) {
+    [[unlikely]];
+  }
+
+  switch (x)
+    [[unlikely]] {}
+
+  if (x)
+    goto lbl;
+
+  // FIXME: allow the attribute on the label
+  [[unlikely]] lbl : // expected-error {{'unlikely' attribute cannot be applied to a declaration}}
+                     [[likely]] x = x + 1;
+
+  [[likely]]++ x;
+}
+
+void n() [[likely]] // expected-error {{'likely' attribute cannot be applied to types}}
+{
+  try
+    [[likely]] {} // expected-error {{expected '{'}}
+  catch (...) [[likely]] { // expected-error {{expected expression}}
+  }
+}
+#endif
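
The conflicting-attribute warning exercised by a() and b() fires only when both branches carry the same likelihood; one-sided or mixed annotations, as in g() through l(), are accepted:

int pick(bool c) {
  if (c) [[likely]] {   // fine on its own
    return 1;
  } else [[unlikely]] { // fine: the two annotations do not conflict
    return 2;
  }
}
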
diff --git a/clang/test/SemaCXX/cxx11-compat.cpp b/clang/test/SemaCXX/cxx11-compat.cpp
index 07cd6b1fcf93b..f17c900201f76 100644
--- a/clang/test/SemaCXX/cxx11-compat.cpp
+++ b/clang/test/SemaCXX/cxx11-compat.cpp
@@ -31,7 +31,7 @@ struct S {
 s = { n }, // expected-warning {{non-constant-expression cannot be narrowed from type 'int' to 'char' in initializer list in C++11}} expected-note {{explicit cast}}
 t = { 1234 }; // expected-warning {{constant expression evaluates to 1234 which cannot be narrowed to type 'char' in C++11}} expected-warning {{changes value}} expected-note {{explicit cast}}
 
-#define PRIuS "uS"
+#define PRIuS "zu"
 int printf(const char *, ...);
 typedef __typeof(sizeof(int)) size_t;
 void h(size_t foo, size_t bar) {
diff --git a/clang/test/SemaCXX/cxx11-inheriting-ctors.cpp b/clang/test/SemaCXX/cxx11-inheriting-ctors.cpp
index 7d6f4f09f09c4..5be428401fa01 100644
--- a/clang/test/SemaCXX/cxx11-inheriting-ctors.cpp
+++ b/clang/test/SemaCXX/cxx11-inheriting-ctors.cpp
@@ -133,3 +133,12 @@ namespace implicit_member_srcloc {
     S0 s0;
   }
 }
+
+namespace PR47555 {
+  struct A { constexpr A(int) {} };
+  struct B : A { using A::A; };
+  template void f() {
+    constexpr B b = 0;
+  };
+  template void f();
+}
diff --git a/clang/test/SemaCXX/fold_expr_expansion_limit.cpp b/clang/test/SemaCXX/fold_expr_expansion_limit.cpp
new file mode 100644
index 0000000000000..600278da78287
--- /dev/null
+++ b/clang/test/SemaCXX/fold_expr_expansion_limit.cpp
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -fsyntax-only -fbracket-depth 2 -verify -std=c++17 %s
+
+template <class T, T... V> struct seq {
+  constexpr bool zero() { return (true && ... && (V == 0)); }; // expected-error {{instantiating fold expression with 3 arguments exceeded expression nesting limit of 2}} \
+                                                                  expected-note {{use -fbracket-depth}}
+};
+constexpr unsigned N = 3;
+auto x = __make_integer_seq<seq, unsigned, N>{};
+static_assert(!x.zero(), ""); // expected-note {{in instantiation of member function}}
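
Why a limit of 2 is exceeded at three pack elements: a left fold expands one parenthesized level per element, so nesting grows linearly with the pack. Conceptually:

// (true && ... && (V == 0)) with V = 0, 1, 2 instantiates roughly as
//   ((true && (0 == 0)) && (1 == 0)) && (2 == 0)
// which is what -fbracket-depth caps during instantiation.
template <unsigned... V> constexpr bool all_zero = (true && ... && (V == 0));
static_assert(!all_zero<0, 1, 2>, "1 and 2 are nonzero");
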
diff --git a/clang/test/SemaCXX/ms-no-rtti-data.cpp b/clang/test/SemaCXX/ms-no-rtti-data.cpp
new file mode 100644
index 0000000000000..aef167d8a3736
--- /dev/null
+++ b/clang/test/SemaCXX/ms-no-rtti-data.cpp
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 %s -triple x86_64-windows-msvc -fdiagnostics-format msvc -fno-rtti-data -fsyntax-only -verify
+
+namespace std {
+struct type_info {};
+} // namespace std
+class B {
+public:
+  virtual ~B() = default;
+};
+
+class D1 : public B {
+public:
+  ~D1() = default;
+};
+
+void f() {
+  B *b = new D1();
+  auto d = dynamic_cast<D1 *>(b);    // expected-warning{{dynamic_cast will not work since RTTI data is disabled by /GR-}}
+  void *v = dynamic_cast<void *>(b); // expected-warning{{dynamic_cast will not work since RTTI data is disabled by /GR-}}
+
+  (void)typeid(int);
+  (void)typeid(b);
+  (void)typeid(*b); // expected-warning{{typeid will not work since RTTI data is disabled by /GR-}}
+  B b2 = *b;
+  (void)typeid(b2);
+  (void)typeid(*&b2); // expected-warning{{typeid will not work since RTTI data is disabled by /GR-}}
+  (void)typeid((B &)b2);
+
+  B &br = b2;
+  (void)typeid(br); // expected-warning{{typeid will not work since RTTI data is disabled by /GR-}}
+  (void)typeid(&br);
+}
\ No newline at end of file
diff --git a/clang/test/SemaCXX/no-rtti-data.cpp b/clang/test/SemaCXX/no-rtti-data.cpp
new file mode 100644
index 0000000000000..af0dc7c11bb81
--- /dev/null
+++ b/clang/test/SemaCXX/no-rtti-data.cpp
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 %s -triple x86_64-unknown-linux -fno-rtti-data -fsyntax-only -verify
+
+namespace std {
+struct type_info {};
+} // namespace std
+class B {
+public:
+  virtual ~B() = default;
+};
+
+class D1 : public B {
+public:
+  ~D1() = default;
+};
+
+void f() {
+  B *b = new D1();
+  auto d = dynamic_cast<D1 *>(b); // expected-warning{{dynamic_cast will not work since RTTI data is disabled by -fno-rtti-data}}
+  void *v = dynamic_cast<void *>(b);
+
+  (void)typeid(int);
+  (void)typeid(b);
+  (void)typeid(*b); // expected-warning{{typeid will not work since RTTI data is disabled by -fno-rtti-data}}
+  B b2 = *b;
+  (void)typeid(b2);
+  (void)typeid(*&b2); // expected-warning{{typeid will not work since RTTI data is disabled by -fno-rtti-data}}
+  (void)typeid((B &)b2);
+
+  B &br = b2;
+  (void)typeid(br); // expected-warning{{typeid will not work since RTTI data is disabled by -fno-rtti-data}}
+  (void)typeid(&br);
+}
\ No newline at end of file
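
Read together with the MSVC variant above, the two new files pin down a platform difference: with -fno-rtti-data under the Itanium ABI, dynamic_cast<void *> stays silent (presumably because it needs only the vtable's offset-to-top, not the RTTI pointer), whereas under /GR- both forms warn. In short:

struct Base { virtual ~Base() = default; };
struct Derived : Base {};

void probe(Base *b) {
  void *addr = dynamic_cast<void *>(b); // no warning with -fno-rtti-data
  auto *d = dynamic_cast<Derived *>(b); // warns: RTTI data is disabled
}
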
diff --git a/clang/test/SemaCXX/no-rtti.cpp b/clang/test/SemaCXX/no-rtti.cpp
index e0b57153c24c9..8082da219d5ad 100644
--- a/clang/test/SemaCXX/no-rtti.cpp
+++ b/clang/test/SemaCXX/no-rtti.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -fno-rtti %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsyntax-only -verify -fno-rtti %s
 
 namespace std {
   class type_info;
diff --git a/clang/test/SemaCXX/unavailable_aligned_allocation.cpp b/clang/test/SemaCXX/unavailable_aligned_allocation.cpp
index 2f0f8fe7a4b50..d4ac966be2dfc 100644
--- a/clang/test/SemaCXX/unavailable_aligned_allocation.cpp
+++ b/clang/test/SemaCXX/unavailable_aligned_allocation.cpp
@@ -1,12 +1,15 @@
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.13.0 -fexceptions -faligned-alloc-unavailable -std=c++1z -verify %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.13.0 -fexceptions -faligned-alloc-unavailable -std=c++1z -verify -DMACOS %s
 // RUN: %clang_cc1 -triple x86_64-apple-macosx10.13.0 -fexceptions -std=c++1z -verify -DNO_ERRORS %s
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.13.0 -fexceptions -faligned-allocation -faligned-alloc-unavailable -std=c++14 -verify %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.13.0 -fexceptions -faligned-allocation -faligned-alloc-unavailable -std=c++14 -verify -DMACOS %s
 // RUN: %clang_cc1 -triple arm64-apple-ios10.0.0 -fexceptions -faligned-alloc-unavailable -std=c++1z -verify -DIOS %s
 // RUN: %clang_cc1 -triple arm64-apple-ios10.0.0 -fexceptions -std=c++1z -verify -DNO_ERRORS %s
 // RUN: %clang_cc1 -triple arm64-apple-tvos10.0.0 -fexceptions -faligned-alloc-unavailable -std=c++1z -verify -DTVOS %s
 // RUN: %clang_cc1 -triple arm64-apple-tvos10.0.0 -fexceptions -std=c++1z -verify -DNO_ERRORS %s
 // RUN: %clang_cc1 -triple armv7k-apple-watchos3.0.0 -fexceptions -faligned-alloc-unavailable -std=c++1z -verify -DWATCHOS %s
 // RUN: %clang_cc1 -triple armv7k-apple-watchos3.0.0 -fexceptions -std=c++1z -verify -DNO_ERRORS %s
+// RUN: %clang_cc1 -triple s390x-none-zos -fexceptions -faligned-alloc-unavailable -std=c++1z -verify -DZOS %s
+// RUN: %clang_cc1 -triple s390x-none-zos -fexceptions -std=c++1z -verify -DNO_ERRORS %s
+// RUN: %clang_cc1 -triple s390x-none-zos -fexceptions -faligned-allocation -faligned-alloc-unavailable -std=c++14 -verify -DZOS %s
 
 namespace std {
   typedef decltype(sizeof(0)) size_t;
@@ -62,40 +65,40 @@ void testOveraligned() {
 #ifdef NO_ERRORS
 // expected-no-diagnostics
 #else
-// expected-error@-16 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on}}
+// expected-error-re@-16 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}}
 // expected-note@-17 {{if you supply your own aligned allocation functions}}
-// expected-error@-18 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on}}
+// expected-error-re@-18 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}}
 // expected-note@-19 {{if you supply your own aligned allocation functions}}
 
-// expected-error@-20 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on}}
+// expected-error-re@-20 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}}
 // expected-note@-21 {{if you supply your own aligned allocation functions}}
-// expected-error@-22 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on}}
+// expected-error-re@-22 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}}
 // expected-note@-23 {{if you supply your own aligned allocation functions}}
 
-// expected-error@-24 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on}}
+// expected-error-re@-24 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}}
 // expected-note@-25 {{if you supply your own aligned allocation functions}}
 
-// expected-error@-26 {{aligned allocation function of type 'void *(std::size_t, std::align_val_t, const std::nothrow_t &) noexcept' is only available on}}
+// expected-error-re@-26 {{aligned allocation function of type 'void *(std::size_t, std::align_val_t, const std::nothrow_t &) noexcept' is {{only|not}} available on}}
 // expected-note@-27 {{if you supply your own aligned allocation functions}}
-// expected-error@-28 {{aligned deallocation function of type 'void (void *, std::align_val_t, const std::nothrow_t &) noexcept' is only available on}}
+// expected-error-re@-28 {{aligned deallocation function of type 'void (void *, std::align_val_t, const std::nothrow_t &) noexcept' is {{only|not}} available on}}
 // expected-note@-29 {{if you supply your own aligned allocation functions}}
 
-// expected-error@-29 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on}}
+// expected-error-re@-29 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}}
 // expected-note@-30 {{if you supply your own aligned allocation functions}}
-// expected-error@-31 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on}}
+// expected-error-re@-31 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}}
 // expected-note@-32 {{if you supply your own aligned allocation functions}}
 
-// expected-error@-33 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on}}
+// expected-error-re@-33 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}}
 // expected-note@-34 {{if you supply your own aligned allocation functions}}
-// expected-error@-35 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on}}
+// expected-error-re@-35 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}}
 // expected-note@-36 {{if you supply your own aligned allocation functions}}
 
-// expected-error@-37 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on}}
+// expected-error-re@-37 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}}
 // expected-note@-38 {{if you supply your own aligned allocation functions}}
 
-// expected-error@-39 {{aligned allocation function of type 'void *(std::size_t, std::align_val_t, const std::nothrow_t &) noexcept' is only available on}}
+// expected-error-re@-39 {{aligned allocation function of type 'void *(std::size_t, std::align_val_t, const std::nothrow_t &) noexcept' is {{only|not}} available on}}
 // expected-note@-40 {{if you supply your own aligned allocation functions}}
-// expected-error@-41 {{aligned deallocation function of type 'void (void *, std::align_val_t, const std::nothrow_t &) noexcept' is only available on}}
+// expected-error-re@-41 {{aligned deallocation function of type 'void (void *, std::align_val_t, const std::nothrow_t &) noexcept' is {{only|not}} available on}}
 // expected-note@-42 {{if you supply your own aligned allocation functions}}
 
 #endif
@@ -116,12 +119,15 @@ void testOveralignedCheckOS() {
 #elif defined(WATCHOS)
 // expected-error@-13 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on watchOS 4 or newer}}}
 // expected-error@-14 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on watchOS 4 or newer}}}
-#else
+#elif defined(MACOS)
 // expected-error@-16 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on macOS 10.14 or newer}}}
 // expected-error@-17 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on macOS 10.14 or newer}}}
+#elif defined(ZOS)
+// expected-error@-19 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is not available on z/OS}}}
+// expected-error@-20 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is not available on z/OS}}}
 #endif
 
-// expected-note@-20 2 {{if you supply your own aligned allocation functions}}
+// expected-note@-23 2 {{if you supply your own aligned allocation functions}}
 #endif
 
 // Test that diagnostics are produced when an unavailable aligned deallocation
@@ -145,9 +151,12 @@ OveralignedS2::~OveralignedS2() {}
 #elif defined(WATCHOS)
 // expected-error@-12 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on watchOS 4 or newer}}}
 // expected-note@-13 {{if you supply your own aligned allocation functions}}
-#else
+#elif defined(MACOS)
 // expected-error@-15 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on macOS 10.14 or newer}}
 // expected-note@-16 {{if you supply your own aligned allocation functions}}
+#elif defined(ZOS)
+// expected-error@-18 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is not available on z/OS}}
+// expected-note@-19 {{if you supply your own aligned allocation functions}}
 #endif
 #endif
 
@@ -172,22 +181,22 @@ void testExplicitOperatorNewDeleteOveraligned() {
 #ifdef NO_ERRORS
 // expected-no-diagnostics
 #else
-// expected-error@-11 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on}}
+// expected-error-re@-11 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}}
 // expected-note@-12 {{if you supply your own aligned allocation functions}}
 
-// expected-error@-13 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on}}
+// expected-error-re@-13 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}}
 // expected-note@-14 {{if you supply your own aligned allocation functions}}
 
-// expected-error@-15 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on}}
+// expected-error-re@-15 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}}
 // expected-note@-16 {{if you supply your own aligned allocation functions}}
 
-// expected-error@-17 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on}}
+// expected-error-re@-17 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}}
 // expected-note@-18 {{if you supply your own aligned allocation functions}}
 
-// expected-error@-19 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on}}
+// expected-error-re@-19 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}}
 // expected-note@-20 {{if you supply your own aligned allocation functions}}
 
-// expected-error@-21 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on}}
+// expected-error-re@-21 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}}
 // expected-note@-22 {{if you supply your own aligned allocation functions}}
 #endif
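The directives above switch from `expected-error` to `expected-error-re` so that a single annotation covers both wordings of the availability diagnostic: inside an `-re` directive, each inner `{{...}}` block is matched as a regular expression, so `{{only|not}}` accepts both "only available on" (Darwin targets) and "not available on" (z/OS). A minimal, hypothetical test showing the mechanism:

```cpp
// RUN: %clang_cc1 -fsyntax-only -Wunused-variable -verify %s
void demo(void) {
  // The inner {{ab?c}} is a regex block; a plain expected-warning directive
  // would have to spell the variable name literally.
  int abc = 0; // expected-warning-re {{unused variable '{{ab?c}}'}}
}
```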
 
diff --git a/clang/test/SemaCXX/warn-thread-safety-analysis.cpp b/clang/test/SemaCXX/warn-thread-safety-analysis.cpp
index d1520b1decbd3..91bd15def577d 100644
--- a/clang/test/SemaCXX/warn-thread-safety-analysis.cpp
+++ b/clang/test/SemaCXX/warn-thread-safety-analysis.cpp
@@ -5036,8 +5036,7 @@ void spawn_fake_flight_control_thread(void) {
 }
 
 extern const char *deque_log_msg(void) __attribute__((requires_capability(Logger)));
-void logger_entry(void) __attribute__((requires_capability(Logger)))
-                        __attribute__((requires_capability(!FlightControl))) {
+void logger_entry(void) __attribute__((requires_capability(Logger))) {
   const char *msg;
 
   while ((msg = deque_log_msg())) {
@@ -5045,13 +5044,13 @@ void logger_entry(void) __attribute__((requires_capability(Logger)))
   }
 }
 
-void spawn_fake_logger_thread(void) __attribute__((requires_capability(!FlightControl))) {
+void spawn_fake_logger_thread(void) {
   acquire(Logger);
   logger_entry();
   release(Logger);
 }
 
-int main(void) __attribute__((requires_capability(!FlightControl))) {
+int main(void) {
   spawn_fake_flight_control_thread();
   spawn_fake_logger_thread();
 
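These hunks drop the `requires_capability(!FlightControl)` negative-capability requirements from the logger path, leaving only the `Logger` capability on `logger_entry`. For context, a hedged sketch (hypothetical `Mutex`/`mu` names) of what a negative requirement asserts under `-Wthread-safety`:

```cpp
struct __attribute__((capability("mutex"))) Mutex {
  void lock() __attribute__((acquire_capability()));
  void unlock() __attribute__((release_capability()));
};

Mutex mu;

// requires_capability(!mu): callers must provably NOT hold 'mu', which rules
// out double-acquisition when the function locks unconditionally.
void init() __attribute__((requires_capability(!mu))) {
  mu.lock();
  mu.unlock();
}
```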
diff --git a/clang/test/SemaCXX/warn-thread-safety-negative.cpp b/clang/test/SemaCXX/warn-thread-safety-negative.cpp
index 68e30f4a3225b..456fe16e6574e 100644
--- a/clang/test/SemaCXX/warn-thread-safety-negative.cpp
+++ b/clang/test/SemaCXX/warn-thread-safety-negative.cpp
@@ -81,35 +81,6 @@ class Foo {
 
 }  // end namespace SimpleTest
 
-Mutex globalMutex;
-
-namespace ScopeTest {
-
-void f() EXCLUSIVE_LOCKS_REQUIRED(!globalMutex);
-void fq() EXCLUSIVE_LOCKS_REQUIRED(!::globalMutex);
-
-namespace ns {
-  Mutex globalMutex;
-  void f() EXCLUSIVE_LOCKS_REQUIRED(!globalMutex);
-  void fq() EXCLUSIVE_LOCKS_REQUIRED(!ns::globalMutex);
-}
-
-void testGlobals() EXCLUSIVE_LOCKS_REQUIRED(!ns::globalMutex) {
-  f();     // expected-warning {{calling function 'f' requires negative capability '!globalMutex'}}
-  fq();    // expected-warning {{calling function 'fq' requires negative capability '!globalMutex'}}
-  ns::f();
-  ns::fq();
-}
-
-void testNamespaceGlobals() EXCLUSIVE_LOCKS_REQUIRED(!globalMutex) {
-  f();
-  fq();
-  ns::f();  // expected-warning {{calling function 'f' requires negative capability '!globalMutex'}}
-  ns::fq(); // expected-warning {{calling function 'fq' requires negative capability '!globalMutex'}}
-}
-
-}  // end namespace ScopeTest
-
 namespace DoubleAttribute {
 
 struct Foo {
diff --git a/clang/test/SemaCXX/warn-unused-local-typedef.cpp b/clang/test/SemaCXX/warn-unused-local-typedef.cpp
index 7e893ba506a5f..554ea37eeb282 100644
--- a/clang/test/SemaCXX/warn-unused-local-typedef.cpp
+++ b/clang/test/SemaCXX/warn-unused-local-typedef.cpp
@@ -67,10 +67,10 @@ int printf(char const *, ...);
 
 void test() {
   typedef signed long int superint; // no diag
-  printf("%f", (superint) 42);
+  printf("%ld", (superint)42);
 
   typedef signed long int superint2; // no diag
-  printf("%f", static_cast(42));
+  printf("%ld", static_cast(42));
 
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunused-local-typedef"
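The `%f` to `%ld` fix matters because the typedefs resolve to `signed long int`: under `-Wformat` (on by default), passing a `long` where `%f` expects a `double` draws a warning, which would add noise to this unused-typedef test. A standalone illustration:

```cpp
#include <cstdio>

void demo() {
  typedef signed long int superint; // mirrors the typedef in the test
  // std::printf("%f", (superint)42);  // -Wformat: 'long' passed for '%f'
  std::printf("%ld", (superint)42); // length modifier matches, no warning
}
```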
diff --git a/clang/test/SemaObjC/attr-swift-error.m b/clang/test/SemaObjC/attr-swift-error.m
new file mode 100644
index 0000000000000..0132a8b200f5f
--- /dev/null
+++ b/clang/test/SemaObjC/attr-swift-error.m
@@ -0,0 +1,93 @@
+// RUN: %clang_cc1 -verify -fsyntax-only -fobjc-arc -fblocks %s
+
+@class NSError;
+
+#if __SIZEOF_POINTER__ == 4
+typedef unsigned char BOOL;
+#else
+typedef _Bool BOOL;
+#endif
+
+typedef struct __attribute__((__objc_bridge__(NSError))) __CFError *CFErrorRef;
+
+extern int f0(void) __attribute__((__swift_error__));
+// expected-error@-1 {{'__swift_error__' attribute takes one argument}}
+extern int f1(void) __attribute__((__swift_error__(invalid)));
+// expected-warning@-1 {{'__swift_error__' attribute argument not supported: 'invalid'}}
+extern int f2(void) __attribute__((__swift_error__(none,zero_result)));
+// expected-error@-1 {{use of undeclared identifier 'zero_result'}}
+
+@interface Erroneous
+- (BOOL)m0:(NSError **)error __attribute__((__swift_error__(none)));
+- (BOOL)m1:(NSError **)error __attribute__((__swift_error__(nonnull_error)));
+- (BOOL)m2:(NSError **)error __attribute__((__swift_error__(null_result)));
+// expected-error@-1 {{'__swift_error__' attribute with 'null_result' convention can only be applied to a method returning a pointer}}
+- (BOOL)m3:(NSError **)error __attribute__((__swift_error__(nonzero_result)));
+- (BOOL)m4:(NSError **)error __attribute__((__swift_error__(zero_result)));
+
+- (Undeclared)n0:(NSError **)error __attribute__((__swift_error__(none)));
+// expected-error@-1 {{expected a type}}
+- (Undeclared)n1:(NSError **)error __attribute__((__swift_error__(nonnull_error)));
+// expected-error@-1 {{expected a type}}
+- (Undeclared)n2:(NSError **)error __attribute__((__swift_error__(null_result)));
+// expected-error@-1 {{expected a type}}
+- (Undeclared)n3:(NSError **)error __attribute__((__swift_error__(nonzero_result)));
+// expected-error@-1 {{expected a type}}
+// FIXME: the follow-on warning should really be suppressed, but apparently
+// having an ill-formed return type doesn't mark anything as invalid.
+// expected-error@-4 {{can only be applied}}
+- (Undeclared)n4:(NSError **)error __attribute__((__swift_error__(zero_result)));
+// expected-error@-1 {{expected a type}}
+// FIXME: the follow-on warning should really be suppressed, but apparently
+// having an ill-formed return type doesn't mark anything as invalid.
+// expected-error@-4 {{can only be applied}}
+
+- (instancetype)o0 __attribute__((__swift_error__(none)));
+- (instancetype)o1 __attribute__((__swift_error__(nonnull_error)));
+// expected-error@-1 {{'__swift_error__' attribute can only be applied to a method with an error parameter}}
+- (instancetype)o2 __attribute__((__swift_error__(null_result)));
+// expected-error@-1 {{'__swift_error__' attribute can only be applied to a method with an error parameter}}
+- (instancetype)o3 __attribute__((__swift_error__(nonzero_result)));
+// expected-error@-1 {{'__swift_error__' attribute can only be applied to a method with an error parameter}}
+- (instancetype)o4 __attribute__((__swift_error__(zero_result)));
+// expected-error@-1 {{'__swift_error__' attribute can only be applied to a method with an error parameter}}
+@end
+
+extern BOOL m0(CFErrorRef *) __attribute__((__swift_error__(none)));
+extern BOOL m1(CFErrorRef *) __attribute__((__swift_error__(nonnull_error)));
+extern BOOL m2(CFErrorRef *) __attribute__((__swift_error__(null_result)));
+// expected-error@-1 {{'__swift_error__' attribute with 'null_result' convention can only be applied to a function returning a pointer}}
+extern BOOL m3(CFErrorRef *) __attribute__((__swift_error__(nonzero_result)));
+extern BOOL m4(CFErrorRef *) __attribute__((__swift_error__(zero_result)));
+
+extern Undeclared n0(CFErrorRef *) __attribute__((__swift_error__(none)));
+// expected-error@-1 {{unknown type name 'Undeclared'}}
+extern Undeclared n1(CFErrorRef *) __attribute__((__swift_error__(nonnull_error)));
+// expected-error@-1 {{unknown type name 'Undeclared'}}
+extern Undeclared n2(CFErrorRef *) __attribute__((__swift_error__(null_result)));
+// expected-error@-1 {{unknown type name 'Undeclared'}}
+extern Undeclared n3(CFErrorRef *) __attribute__((__swift_error__(nonzero_result)));
+// expected-error@-1 {{unknown type name 'Undeclared'}}
+extern Undeclared n4(CFErrorRef *) __attribute__((__swift_error__(zero_result)));
+// expected-error@-1 {{unknown type name 'Undeclared'}}
+
+extern void *o0(CFErrorRef *) __attribute__((__swift_error__(none)));
+extern void *o1(CFErrorRef *) __attribute__((__swift_error__(nonnull_error)));
+extern void *o2(CFErrorRef *) __attribute__((__swift_error__(null_result)));
+extern void *o3(CFErrorRef *) __attribute__((__swift_error__(nonzero_result)));
+// expected-error@-1 {{'__swift_error__' attribute with 'nonzero_result' convention can only be applied to a function returning an integral type}}
+extern void *o4(CFErrorRef *) __attribute__((__swift_error__(zero_result)));
+// expected-error@-1 {{'__swift_error__' attribute with 'zero_result' convention can only be applied to a function returning an integral type}}
+
+extern void *p0(void) __attribute__((__swift_error__(none)));
+extern void *p1(void) __attribute__((__swift_error__(nonnull_error)));
+// expected-error@-1 {{'__swift_error__' attribute can only be applied to a function with an error parameter}}
+extern void *p2(void) __attribute__((__swift_error__(null_result)));
+// expected-error@-1 {{'__swift_error__' attribute can only be applied to a function with an error parameter}}
+extern void *p3(void) __attribute__((__swift_error__(nonzero_result)));
+// expected-error@-1 {{'__swift_error__' attribute can only be applied to a function with an error parameter}}
+extern void *p4(void) __attribute__((__swift_error__(zero_result)));
+// expected-error@-1 {{'__swift_error__' attribute can only be applied to a function with an error parameter}}
+
+extern BOOL b __attribute__((__swift_error__(none)));
+// expected-error@-1 {{attribute only applies to functions and Objective-C methods}}
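Taken together, the cases above pin down the `swift_error` rules: `zero_result`/`nonzero_result` require an integral return, `null_result` requires a pointer return, and every convention except `none` requires a trailing error out-parameter (`NSError **` or `CFErrorRef *`). A condensed, hypothetical summary of the accepted shapes (the bridge attribute mirrors the test's `CFErrorRef` setup):

```cpp
typedef struct __attribute__((objc_bridge(NSError))) __CFError *CFErrorRef;

// nonnull_error: failure is signaled solely by writing a non-null error.
extern int save(CFErrorRef *error) __attribute__((swift_error(nonnull_error)));

// zero_result: returning 0 means failure, so the return type must be integral.
extern int count(CFErrorRef *error) __attribute__((swift_error(zero_result)));

// null_result: returning null means failure, so the return must be a pointer.
extern void *load(CFErrorRef *error) __attribute__((swift_error(null_result)));
```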
diff --git a/clang/test/SemaObjC/attr-swift_bridge.m b/clang/test/SemaObjC/attr-swift_bridge.m
new file mode 100644
index 0000000000000..1c8259a6a2e7f
--- /dev/null
+++ b/clang/test/SemaObjC/attr-swift_bridge.m
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -verify -fsyntax-only %s
+
+// expected-error@+1 {{'__swift_bridge__' attribute takes one argument}}
+__attribute__((__swift_bridge__))
+@interface I
+@end
+
+// expected-error@+1 {{'__swift_bridge__' attribute requires a string}}
+__attribute__((__swift_bridge__(1)))
+@interface J
+@end
+
+// expected-error@+1 {{'__swift_bridge__' attribute takes one argument}}
+__attribute__((__swift_bridge__("K", 1)))
+@interface K
+@end
+
+@interface L
+// expected-error@+1 {{'__swift_bridge__' attribute only applies to tag types, typedefs, Objective-C interfaces, and Objective-C protocols}}
+- (void)method __attribute__((__swift_bridge__("method")));
+@end
+
+__attribute__((__swift_bridge__("Array")))
+@interface NSArray
+@end
+
+__attribute__((__swift_bridge__("ProtocolP")))
+@protocol P
+@end
+
+typedef NSArray *NSArrayAlias __attribute__((__swift_bridge__("ArrayAlias")));
+
+struct __attribute__((__swift_bridge__("StructT"))) T {};
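As the accepted cases show, `swift_bridge` simply records the Swift-side name for a tag type, typedef, ObjC interface, or protocol; Sema validates only the subject kind and the single string argument. A minimal sketch with hypothetical names:

```cpp
// The bridged name is an opaque string to clang; it matters to the Swift
// importer, not to C/C++ semantics.
struct __attribute__((swift_bridge("DataBuffer"))) Buffer {
  int size;
};
typedef struct Buffer *BufferRef __attribute__((swift_bridge("DataBufferRef")));
```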
diff --git a/clang/test/SemaObjC/attr-swift_bridged_typedef.m b/clang/test/SemaObjC/attr-swift_bridged_typedef.m
new file mode 100644
index 0000000000000..2836b886a903d
--- /dev/null
+++ b/clang/test/SemaObjC/attr-swift_bridged_typedef.m
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -verify -fsyntax-only %s
+
+@interface NSString
+@end
+
+typedef NSString *NSStringAlias __attribute__((__swift_bridged_typedef__));
+
+typedef int IntAlias __attribute__((__swift_bridged_typedef__));
+
+struct __attribute__((swift_bridged_typedef)) S {};
+// expected-error@-1 {{'swift_bridged_typedef' attribute only applies to typedefs}}
+
+typedef unsigned char UnsignedChar __attribute__((__swift_bridged_typedef__("UnsignedChar")));
+// expected-error@-1 {{'__swift_bridged_typedef__' attribute takes no arguments}}
diff --git a/clang/test/SemaObjC/attr-swift_objc_members.m b/clang/test/SemaObjC/attr-swift_objc_members.m
new file mode 100644
index 0000000000000..81328b6245947
--- /dev/null
+++ b/clang/test/SemaObjC/attr-swift_objc_members.m
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 -verify -fsyntax-only %s
+
+#if !__has_attribute(swift_objc_members)
+#error cannot verify presence of swift_objc_members attribute
+#endif
+
+__attribute__((__swift_objc_members__))
+__attribute__((__objc_root_class__))
+@interface I
+@end
+
+__attribute__((swift_objc_members))
+@protocol P
+@end
+// expected-error@-3 {{'swift_objc_members' attribute only applies to Objective-C interfaces}}
+
+__attribute__((swift_objc_members))
+extern void f(void);
+// expected-error@-2 {{'swift_objc_members' attribute only applies to Objective-C interfaces}}
+
+// expected-error@+1 {{'__swift_objc_members__' attribute takes no arguments}}
+__attribute__((__swift_objc_members__("J")))
+@interface J
+@end
diff --git a/clang/tools/CMakeLists.txt b/clang/tools/CMakeLists.txt
index e46c3669a2c2b..85a85812a8d41 100644
--- a/clang/tools/CMakeLists.txt
+++ b/clang/tools/CMakeLists.txt
@@ -15,7 +15,7 @@ add_clang_subdirectory(c-index-test)
 
 add_clang_subdirectory(clang-rename)
 add_clang_subdirectory(clang-refactor)
-if(UNIX)
+if(UNIX OR MINGW)
   add_clang_subdirectory(clang-shlib)
 endif()
 
diff --git a/clang/tools/libclang/CMakeLists.txt b/clang/tools/libclang/CMakeLists.txt
index c3b9ab6ffb9b0..15f7ff94dfead 100644
--- a/clang/tools/libclang/CMakeLists.txt
+++ b/clang/tools/libclang/CMakeLists.txt
@@ -97,11 +97,11 @@ if(NOT LLVM_ENABLE_PIC OR LIBCLANG_BUILD_STATIC)
   set(ENABLE_STATIC STATIC)
 endif()
 
-if (WIN32 AND ENABLE_SHARED AND ENABLE_STATIC)
+if (MSVC AND ENABLE_SHARED AND ENABLE_STATIC)
   unset(ENABLE_STATIC)
 endif()
 
-if(WIN32)
+if(MSVC)
   set(output_name "libclang")
 else()
   set(output_name "clang")
diff --git a/clang/unittests/AST/StructuralEquivalenceTest.cpp b/clang/unittests/AST/StructuralEquivalenceTest.cpp
index 2b5ce0fed51d6..d71c65fa3b61a 100644
--- a/clang/unittests/AST/StructuralEquivalenceTest.cpp
+++ b/clang/unittests/AST/StructuralEquivalenceTest.cpp
@@ -19,14 +19,10 @@ struct StructuralEquivalenceTest : ::testing::Test {
   std::unique_ptr<ASTUnit> AST0, AST1;
   std::string Code0, Code1; // Buffers for SourceManager
 
-  // Get a pair of node pointers into the synthesized AST from the given code
-  // snippets. To determine the returned node, a separate matcher is specified
-  // for both snippets. The first matching node is returned.
-  template <typename NodeType, typename MatcherType>
-  std::tuple<NodeType *, NodeType *>
-  makeDecls(const std::string &SrcCode0, const std::string &SrcCode1,
-            TestLanguage Lang, const MatcherType &Matcher0,
-            const MatcherType &Matcher1) {
+  // Parses the source code in the specified language and sets the ASTs of
+  // the current test instance to the parse result.
+  void makeASTUnits(const std::string &SrcCode0, const std::string &SrcCode1,
+                    TestLanguage Lang) {
     this->Code0 = SrcCode0;
     this->Code1 = SrcCode1;
     std::vector<std::string> Args = getCommandLineArgsForTesting(Lang);
@@ -35,6 +31,17 @@ struct StructuralEquivalenceTest : ::testing::Test {
 
     AST0 = tooling::buildASTFromCodeWithArgs(Code0, Args, InputFileName);
     AST1 = tooling::buildASTFromCodeWithArgs(Code1, Args, InputFileName);
+  }
+
+  // Get a pair of node pointers into the synthesized AST from the given code
+  // snippets. To determine the returned node, a separate matcher is specified
+  // for both snippets. The first matching node is returned.
+  template <typename NodeType, typename MatcherType>
+  std::tuple<NodeType *, NodeType *>
+  makeDecls(const std::string &SrcCode0, const std::string &SrcCode1,
+            TestLanguage Lang, const MatcherType &Matcher0,
+            const MatcherType &Matcher1) {
+    makeASTUnits(SrcCode0, SrcCode1, Lang);
 
     NodeType *D0 = FirstDeclMatcher<NodeType>().match(
         AST0->getASTContext().getTranslationUnitDecl(), Matcher0);
@@ -47,14 +54,7 @@ struct StructuralEquivalenceTest : ::testing::Test {
   std::tuple<TranslationUnitDecl *, TranslationUnitDecl *>
   makeTuDecls(const std::string &SrcCode0, const std::string &SrcCode1,
               TestLanguage Lang) {
-    this->Code0 = SrcCode0;
-    this->Code1 = SrcCode1;
-    std::vector<std::string> Args = getCommandLineArgsForTesting(Lang);
-
-    const char *const InputFileName = "input.cc";
-
-    AST0 = tooling::buildASTFromCodeWithArgs(Code0, Args, InputFileName);
-    AST1 = tooling::buildASTFromCodeWithArgs(Code1, Args, InputFileName);
+    makeASTUnits(SrcCode0, SrcCode1, Lang);
 
     return std::make_tuple(AST0->getASTContext().getTranslationUnitDecl(),
                            AST1->getASTContext().getTranslationUnitDecl());
@@ -80,6 +80,56 @@ struct StructuralEquivalenceTest : ::testing::Test {
     return makeDecls<NodeType>(SrcCode0, SrcCode1, Lang, Matcher);
   }
 
+  // Wraps a Stmt and the ASTContext that contains it.
+  struct StmtWithASTContext {
+    Stmt *S;
+    ASTContext *Context;
+    explicit StmtWithASTContext(Stmt &S, ASTContext &Context)
+        : S(&S), Context(&Context) {}
+    explicit StmtWithASTContext(FunctionDecl *FD)
+        : S(FD->getBody()), Context(&FD->getASTContext()) {}
+  };
+
+  // Get a pair of node pointers into the synthesized AST from the given code
+  // snippets. To determine the returned node, a separate matcher is specified
+  // for both snippets. The first matching node is returned.
+  template <typename MatcherType>
+  std::tuple<StmtWithASTContext, StmtWithASTContext>
+  makeStmts(const std::string &SrcCode0, const std::string &SrcCode1,
+            TestLanguage Lang, const MatcherType &Matcher0,
+            const MatcherType &Matcher1) {
+    makeASTUnits(SrcCode0, SrcCode1, Lang);
+
+    Stmt *S0 = FirstDeclMatcher<Stmt>().match(
+        AST0->getASTContext().getTranslationUnitDecl(), Matcher0);
+    Stmt *S1 = FirstDeclMatcher<Stmt>().match(
+        AST1->getASTContext().getTranslationUnitDecl(), Matcher1);
+
+    return std::make_tuple(StmtWithASTContext(*S0, AST0->getASTContext()),
+                           StmtWithASTContext(*S1, AST1->getASTContext()));
+  }
+
+  // Get a pair of node pointers into the synthesized AST from the given code
+  // snippets. The same matcher is used for both snippets.
+  template <typename MatcherType>
+  std::tuple<StmtWithASTContext, StmtWithASTContext>
+  makeStmts(const std::string &SrcCode0, const std::string &SrcCode1,
+            TestLanguage Lang, const MatcherType &AMatcher) {
+    return makeStmts(SrcCode0, SrcCode1, Lang, AMatcher, AMatcher);
+  }
+
+  // Convenience function for makeStmts that wraps the code inside a function
+  // body.
+  template <typename MatcherType>
+  std::tuple<StmtWithASTContext, StmtWithASTContext>
+  makeWrappedStmts(const std::string &SrcCode0, const std::string &SrcCode1,
+                   TestLanguage Lang, const MatcherType &AMatcher) {
+    auto Wrap = [](const std::string &Src) {
+      return "void wrapped() {" + Src + ";}";
+    };
+    return makeStmts(Wrap(SrcCode0), Wrap(SrcCode1), Lang, AMatcher);
+  }
+
   bool testStructuralMatch(Decl *D0, Decl *D1) {
     llvm::DenseSet<std::pair<Decl *, Decl *>> NonEquivalentDecls01;
     llvm::DenseSet<std::pair<Decl *, Decl *>> NonEquivalentDecls10;
@@ -95,6 +145,26 @@ struct StructuralEquivalenceTest : ::testing::Test {
     return Eq01;
   }
 
+  bool testStructuralMatch(StmtWithASTContext S0, StmtWithASTContext S1) {
+    llvm::DenseSet<std::pair<Decl *, Decl *>> NonEquivalentDecls01;
+    llvm::DenseSet<std::pair<Decl *, Decl *>> NonEquivalentDecls10;
+    StructuralEquivalenceContext Ctx01(
+        *S0.Context, *S1.Context, NonEquivalentDecls01,
+        StructuralEquivalenceKind::Default, false, false);
+    StructuralEquivalenceContext Ctx10(
+        *S1.Context, *S0.Context, NonEquivalentDecls10,
+        StructuralEquivalenceKind::Default, false, false);
+    bool Eq01 = Ctx01.IsEquivalent(S0.S, S1.S);
+    bool Eq10 = Ctx10.IsEquivalent(S1.S, S0.S);
+    EXPECT_EQ(Eq01, Eq10);
+    return Eq01;
+  }
+
+  bool
+  testStructuralMatch(std::tuple<StmtWithASTContext, StmtWithASTContext> t) {
+    return testStructuralMatch(get<0>(t), get<1>(t));
+  }
+
   bool testStructuralMatch(std::tuple<Decl *, Decl *> t) {
     return testStructuralMatch(get<0>(t), get<1>(t));
   }
@@ -1375,5 +1445,225 @@ TEST_F(StructuralEquivalenceCacheTest, Cycle) {
       findDeclPair(TU, functionDecl(hasName("x")))));
 }
 
+struct StructuralEquivalenceStmtTest : StructuralEquivalenceTest {};
+
+/// Fallback matcher to be used only when there is no specific matcher for a
+/// Expr subclass. Remove this once all Expr subclasses have their own matcher.
+static auto &fallbackExprMatcher = expr;
+
+TEST_F(StructuralEquivalenceStmtTest, AddrLabelExpr) {
+  auto t = makeWrappedStmts("lbl: &&lbl;", "lbl: &&lbl;", Lang_CXX03,
+                            addrLabelExpr());
+  EXPECT_TRUE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, AddrLabelExprDifferentLabel) {
+  auto t = makeWrappedStmts("lbl1: lbl2: &&lbl1;", "lbl1: lbl2: &&lbl2;",
+                            Lang_CXX03, addrLabelExpr());
+  // FIXME: Should be false. LabelDecl are incorrectly matched.
+  EXPECT_TRUE(testStructuralMatch(t));
+}
+
+static const std::string MemoryOrderSrc = R"(
+enum memory_order {
+  memory_order_relaxed,
+  memory_order_consume,
+  memory_order_acquire,
+  memory_order_release,
+  memory_order_acq_rel,
+  memory_order_seq_cst
+};
+)";
+
+TEST_F(StructuralEquivalenceStmtTest, AtomicExpr) {
+  std::string Prefix = "char a, b; " + MemoryOrderSrc;
+  auto t = makeStmts(
+      Prefix +
+          "void wrapped() { __atomic_load(&a, &b, memory_order_seq_cst); }",
+      Prefix +
+          "void wrapped() { __atomic_load(&a, &b, memory_order_seq_cst); }",
+      Lang_CXX03, atomicExpr());
+  EXPECT_TRUE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, AtomicExprDifferentOp) {
+  std::string Prefix = "char a, b; " + MemoryOrderSrc;
+  auto t = makeStmts(
+      Prefix +
+          "void wrapped() { __atomic_load(&a, &b, memory_order_seq_cst); }",
+      Prefix +
+          "void wrapped() { __atomic_store(&a, &b, memory_order_seq_cst); }",
+      Lang_CXX03, atomicExpr());
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, BinaryOperator) {
+  auto t = makeWrappedStmts("1 + 1", "1 + 1", Lang_CXX03, binaryOperator());
+  EXPECT_TRUE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, BinaryOperatorDifferentOps) {
+  auto t = makeWrappedStmts("1 + 1", "1 - 1", Lang_CXX03, binaryOperator());
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, CallExpr) {
+  std::string Src = "int call(); int wrapped() { call(); }";
+  auto t = makeStmts(Src, Src, Lang_CXX03, callExpr());
+  EXPECT_TRUE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, CallExprDifferentCallee) {
+  std::string FunctionSrc = "int func1(); int func2();\n";
+  auto t = makeStmts(FunctionSrc + "void wrapper() { func1(); }",
+                     FunctionSrc + "void wrapper() { func2(); }", Lang_CXX03,
+                     callExpr());
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, CharacterLiteral) {
+  auto t = makeWrappedStmts("'a'", "'a'", Lang_CXX03, characterLiteral());
+  EXPECT_TRUE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, CharacterLiteralDifferentValues) {
+  auto t = makeWrappedStmts("'a'", "'b'", Lang_CXX03, characterLiteral());
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, ExpressionTraitExpr) {
+  auto t = makeWrappedStmts("__is_lvalue_expr(1)", "__is_lvalue_expr(1)",
+                            Lang_CXX03, fallbackExprMatcher());
+  EXPECT_TRUE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, ExpressionTraitExprDifferentKind) {
+  auto t = makeWrappedStmts("__is_lvalue_expr(1)", "__is_rvalue_expr(1)",
+                            Lang_CXX03, fallbackExprMatcher());
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, FloatingLiteral) {
+  auto t = makeWrappedStmts("1.0", "1.0", Lang_CXX03, fallbackExprMatcher());
+  EXPECT_TRUE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, FloatingLiteralDifferentSpelling) {
+  auto t = makeWrappedStmts("0x10.1p0", "16.0625", Lang_CXX17,
+                            fallbackExprMatcher());
+  // Same value but with different spelling is equivalent.
+  EXPECT_TRUE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, FloatingLiteralDifferentType) {
+  auto t = makeWrappedStmts("1.0", "1.0f", Lang_CXX03, fallbackExprMatcher());
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, FloatingLiteralDifferentValue) {
+  auto t = makeWrappedStmts("1.01", "1.0", Lang_CXX03, fallbackExprMatcher());
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, IntegerLiteral) {
+  auto t = makeWrappedStmts("1", "1", Lang_CXX03, integerLiteral());
+  EXPECT_TRUE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, IntegerLiteralDifferentSpelling) {
+  auto t = makeWrappedStmts("1", "0x1", Lang_CXX03, integerLiteral());
+  // Same value but with different spelling is equivalent.
+  EXPECT_TRUE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, IntegerLiteralDifferentValue) {
+  auto t = makeWrappedStmts("1", "2", Lang_CXX03, integerLiteral());
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, IntegerLiteralDifferentTypes) {
+  auto t = makeWrappedStmts("1", "1L", Lang_CXX03, integerLiteral());
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, ObjCStringLiteral) {
+  auto t =
+      makeWrappedStmts("@\"a\"", "@\"a\"", Lang_OBJCXX, fallbackExprMatcher());
+  EXPECT_TRUE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, ObjCStringLiteralDifferentContent) {
+  auto t =
+      makeWrappedStmts("@\"a\"", "@\"b\"", Lang_OBJCXX, fallbackExprMatcher());
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, StringLiteral) {
+  auto t = makeWrappedStmts("\"a\"", "\"a\"", Lang_CXX03, stringLiteral());
+  EXPECT_TRUE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, StringLiteralDifferentContent) {
+  auto t = makeWrappedStmts("\"a\"", "\"b\"", Lang_CXX03, stringLiteral());
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, StringLiteralDifferentLength) {
+  auto t = makeWrappedStmts("\"a\"", "\"aa\"", Lang_CXX03, stringLiteral());
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, TypeTraitExpr) {
+  auto t = makeWrappedStmts("__is_pod(int)", "__is_pod(int)", Lang_CXX03,
+                            fallbackExprMatcher());
+  EXPECT_TRUE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, TypeTraitExprDifferentType) {
+  auto t = makeWrappedStmts("__is_pod(int)", "__is_pod(long)", Lang_CXX03,
+                            fallbackExprMatcher());
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, TypeTraitExprDifferentTrait) {
+  auto t = makeWrappedStmts(
+      "__is_pod(int)", "__is_trivially_constructible(int)", Lang_CXX03, expr());
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, TypeTraitExprDifferentTraits) {
+  auto t = makeWrappedStmts("__is_constructible(int)",
+                            "__is_constructible(int, int)", Lang_CXX03, expr());
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, UnaryExprOrTypeTraitExpr) {
+  auto t = makeWrappedStmts("sizeof(int)", "sizeof(int)", Lang_CXX03,
+                            unaryExprOrTypeTraitExpr());
+  EXPECT_TRUE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, UnaryExprOrTypeTraitExprDifferentKind) {
+  auto t = makeWrappedStmts("sizeof(int)", "alignof(long)", Lang_CXX11,
+                            unaryExprOrTypeTraitExpr());
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, UnaryExprOrTypeTraitExprDifferentType) {
+  auto t = makeWrappedStmts("sizeof(int)", "sizeof(long)", Lang_CXX03,
+                            unaryExprOrTypeTraitExpr());
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, UnaryOperator) {
+  auto t = makeWrappedStmts("+1", "+1", Lang_CXX03, unaryOperator());
+  EXPECT_TRUE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceStmtTest, UnaryOperatorDifferentOps) {
+  auto t = makeWrappedStmts("+1", "-1", Lang_CXX03, unaryOperator());
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
 } // end namespace ast_matchers
 } // end namespace clang
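Extending this suite is mechanical: wrap two snippets with `makeWrappedStmts`, select the node with the narrowest available matcher, and assert two-way equivalence with `testStructuralMatch`. A hypothetical additional case (it would live inside the same namespace as the tests above; not part of this patch):

```cpp
TEST_F(StructuralEquivalenceStmtTest, CompoundAssignOperator) {
  // CompoundAssignOperator derives from BinaryOperator, so binaryOperator()
  // selects the 'x += 1' inside the wrapped function body.
  auto t = makeWrappedStmts("int x = 0; x += 1", "int x = 0; x += 1",
                            Lang_CXX03, binaryOperator());
  EXPECT_TRUE(testStructuralMatch(t));
}
```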
diff --git a/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
index c7db52b37a506..39222fbe42491 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
@@ -741,6 +741,164 @@ TEST(ForEachArgumentWithParam, HandlesBoundNodesForNonMatches) {
     std::make_unique>("v", 4)));
 }
 
+TEST(ForEachArgumentWithParamType, ReportsNoFalsePositives) {
+  StatementMatcher ArgumentY =
+      declRefExpr(to(varDecl(hasName("y")))).bind("arg");
+  TypeMatcher IntType = qualType(isInteger()).bind("type");
+  StatementMatcher CallExpr =
+      callExpr(forEachArgumentWithParamType(ArgumentY, IntType));
+
+  // IntParam does not match.
+  EXPECT_TRUE(notMatches("void f(int* i) { int* y; f(y); }", CallExpr));
+  // ArgumentY does not match.
+  EXPECT_TRUE(notMatches("void f(int i) { int x; f(x); }", CallExpr));
+}
+
+TEST(ForEachArgumentWithParamType, MatchesCXXMemberCallExpr) {
+  StatementMatcher ArgumentY =
+      declRefExpr(to(varDecl(hasName("y")))).bind("arg");
+  TypeMatcher IntType = qualType(isInteger()).bind("type");
+  StatementMatcher CallExpr =
+      callExpr(forEachArgumentWithParamType(ArgumentY, IntType));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "struct S {"
+      "  const S& operator[](int i) { return *this; }"
+      "};"
+      "void f(S S1) {"
+      "  int y = 1;"
+      "  S1[y];"
+      "}",
+      CallExpr, std::make_unique>("type", 1)));
+
+  StatementMatcher CallExpr2 =
+      callExpr(forEachArgumentWithParamType(ArgumentY, IntType));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "struct S {"
+      "  static void g(int i);"
+      "};"
+      "void f() {"
+      "  int y = 1;"
+      "  S::g(y);"
+      "}",
+      CallExpr2, std::make_unique<VerifyIdIsBoundTo<QualType>>("type", 1)));
+}
+
+TEST(ForEachArgumentWithParamType, MatchesCallExpr) {
+  StatementMatcher ArgumentY =
+      declRefExpr(to(varDecl(hasName("y")))).bind("arg");
+  TypeMatcher IntType = qualType(isInteger()).bind("type");
+  StatementMatcher CallExpr =
+      callExpr(forEachArgumentWithParamType(ArgumentY, IntType));
+
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "void f(int i) { int y; f(y); }", CallExpr,
+      std::make_unique>("type")));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "void f(int i) { int y; f(y); }", CallExpr,
+      std::make_unique>("arg")));
+
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "void f(int i, int j) { int y; f(y, y); }", CallExpr,
+      std::make_unique>("type", 2)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "void f(int i, int j) { int y; f(y, y); }", CallExpr,
+      std::make_unique>("arg", 2)));
+}
+
+TEST(ForEachArgumentWithParamType, MatchesConstructExpr) {
+  StatementMatcher ArgumentY =
+      declRefExpr(to(varDecl(hasName("y")))).bind("arg");
+  TypeMatcher IntType = qualType(isInteger()).bind("type");
+  StatementMatcher ConstructExpr =
+      cxxConstructExpr(forEachArgumentWithParamType(ArgumentY, IntType));
+
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "struct C {"
+      "  C(int i) {}"
+      "};"
+      "int y = 0;"
+      "C Obj(y);",
+      ConstructExpr, std::make_unique>("type")));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "struct C {"
+      "  C(int i) {}"
+      "};"
+      "int y = 0;"
+      "C Obj(y);",
+      ConstructExpr, std::make_unique>("arg")));
+}
+
+TEST(ForEachArgumentWithParamType, HandlesKandRFunctions) {
+  StatementMatcher ArgumentY =
+      declRefExpr(to(varDecl(hasName("y")))).bind("arg");
+  TypeMatcher IntType = qualType(isInteger()).bind("type");
+  StatementMatcher CallExpr =
+      callExpr(forEachArgumentWithParamType(ArgumentY, IntType));
+
+  EXPECT_TRUE(matchesC("void f();\n"
+                       "void call_it(void) { int x, y; f(x, y); }\n"
+                       "void f(a, b) int a, b; {}\n"
+                       "void call_it2(void) { int x, y; f(x, y); }",
+                       CallExpr));
+}
+
+TEST(ForEachArgumentWithParamType, HandlesBoundNodesForNonMatches) {
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "void g(int i, int j) {"
+      "  int a;"
+      "  int b;"
+      "  int c;"
+      "  g(a, 0);"
+      "  g(a, b);"
+      "  g(0, b);"
+      "}",
+      functionDecl(
+          forEachDescendant(varDecl().bind("v")),
+          forEachDescendant(callExpr(forEachArgumentWithParamType(
+              declRefExpr(to(decl(equalsBoundNode("v")))), qualType())))),
+      std::make_unique>("v", 4)));
+}
+
+TEST(ForEachArgumentWithParamType, MatchesFunctionPtrCalls) {
+  StatementMatcher ArgumentY =
+      declRefExpr(to(varDecl(hasName("y")))).bind("arg");
+  TypeMatcher IntType = qualType(builtinType()).bind("type");
+  StatementMatcher CallExpr =
+      callExpr(forEachArgumentWithParamType(ArgumentY, IntType));
+
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "void f(int i) {"
+      "void (*f_ptr)(int) = f; int y; f_ptr(y); }",
+      CallExpr, std::make_unique>("type")));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "void f(int i) {"
+      "void (*f_ptr)(int) = f; int y; f_ptr(y); }",
+      CallExpr, std::make_unique>("arg")));
+}
+
+TEST(ForEachArgumentWithParamType, MatchesMemberFunctionPtrCalls) {
+  StatementMatcher ArgumentY =
+      declRefExpr(to(varDecl(hasName("y")))).bind("arg");
+  TypeMatcher IntType = qualType(builtinType()).bind("type");
+  StatementMatcher CallExpr =
+      callExpr(forEachArgumentWithParamType(ArgumentY, IntType));
+
+  StringRef S = "struct A {\n"
+                "  int f(int i) { return i + 1; }\n"
+                "  int (A::*x)(int);\n"
+                "};\n"
+                "void f() {\n"
+                "  int y = 42;\n"
+                "  A a;\n"
+                "  a.x = &A::f;\n"
+                "  (a.*(a.x))(y);\n"
+                "}";
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      S, CallExpr, std::make_unique>("type")));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      S, CallExpr, std::make_unique>("arg")));
+}
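The point of `forEachArgumentWithParamType`, exercised above, is that it pairs each argument with the *type* of its corresponding parameter rather than with a `ParmVarDecl`, so it still fires for function pointers, K&R definitions, and member-pointer calls where no parameter declaration is reachable. A hedged usage sketch outside the test harness (binding names are illustrative):

```cpp
// Bind every call argument whose matching parameter type is 'bool';
// run through a MatchFinder as usual.
auto BoolArgs = callExpr(forEachArgumentWithParamType(
    expr().bind("arg"), qualType(asString("bool")).bind("param")));
```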
+
 TEST(QualType, hasCanonicalType) {
   EXPECT_TRUE(notMatches("typedef int &int_ref;"
                            "int a;"
@@ -1454,10 +1612,49 @@ TEST(HasBody, FindsBodyOfForWhileDoLoops) {
                       doStmt(hasBody(compoundStmt()))));
   EXPECT_TRUE(matches("void f() { int p[2]; for (auto x : p) {} }",
                       cxxForRangeStmt(hasBody(compoundStmt()))));
+}
+
+TEST(HasBody, FindsBodyOfFunctions) {
   EXPECT_TRUE(matches("void f() {}", functionDecl(hasBody(compoundStmt()))));
   EXPECT_TRUE(notMatches("void f();", functionDecl(hasBody(compoundStmt()))));
-  EXPECT_TRUE(matches("void f(); void f() {}",
-                      functionDecl(hasBody(compoundStmt()))));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "void f(); void f() {}",
+      functionDecl(hasBody(compoundStmt())).bind("func"),
+      std::make_unique>("func", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "class C { void f(); }; void C::f() {}",
+      cxxMethodDecl(hasBody(compoundStmt())).bind("met"),
+      std::make_unique>("met", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "class C { C(); }; C::C() {}",
+      cxxConstructorDecl(hasBody(compoundStmt())).bind("ctr"),
+      std::make_unique>("ctr", 1)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "class C { ~C(); }; C::~C() {}",
+      cxxDestructorDecl(hasBody(compoundStmt())).bind("dtr"),
+      std::make_unique>("dtr", 1)));
+}
+
+TEST(HasAnyBody, FindsAnyBodyOfFunctions) {
+  EXPECT_TRUE(matches("void f() {}", functionDecl(hasAnyBody(compoundStmt()))));
+  EXPECT_TRUE(notMatches("void f();",
+                         functionDecl(hasAnyBody(compoundStmt()))));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "void f(); void f() {}",
+      functionDecl(hasAnyBody(compoundStmt())).bind("func"),
+      std::make_unique>("func", 2)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "class C { void f(); }; void C::f() {}",
+      cxxMethodDecl(hasAnyBody(compoundStmt())).bind("met"),
+      std::make_unique>("met", 2)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "class C { C(); }; C::C() {}",
+      cxxConstructorDecl(hasAnyBody(compoundStmt())).bind("ctr"),
+      std::make_unique>("ctr", 2)));
+  EXPECT_TRUE(matchAndVerifyResultTrue(
+      "class C { ~C(); }; C::~C() {}",
+      cxxDestructorDecl(hasAnyBody(compoundStmt())).bind("dtr"),
+      std::make_unique>("dtr", 2)));
 }
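The bound-match counts encode the contract being split out here: `hasBody` matches only the declaration that itself owns the body, while `hasAnyBody` also matches redeclarations whose body lives elsewhere. In sketch form:

```cpp
void f();   // matched by functionDecl(hasAnyBody(...)) only
void f() {} // matched by both hasBody(...) and hasAnyBody(...)
```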
 
 TEST(HasAnySubstatement, MatchesForTopLevelCompoundStatement) {
diff --git a/clang/unittests/Driver/SanitizerArgsTest.cpp b/clang/unittests/Driver/SanitizerArgsTest.cpp
index dac1caddc055e..84bd568523459 100644
--- a/clang/unittests/Driver/SanitizerArgsTest.cpp
+++ b/clang/unittests/Driver/SanitizerArgsTest.cpp
@@ -57,7 +57,7 @@ class SanitizerArgsTest : public ::testing::Test {
         new DiagnosticIDs, Opts,
         new TextDiagnosticPrinter(llvm::errs(), Opts.get()));
     DriverInstance.emplace(ClangBinary, "x86_64-unknown-linux-gnu", Diags,
-                           prepareFS(ExtraFiles));
+                           "clang LLVM compiler", prepareFS(ExtraFiles));
 
     std::vector<const char *> Args = {ClangBinary};
     for (const auto &A : ExtraArgs)
diff --git a/clang/unittests/Driver/ToolChainTest.cpp b/clang/unittests/Driver/ToolChainTest.cpp
index f84e508b6cbdb..67bf545b14e4b 100644
--- a/clang/unittests/Driver/ToolChainTest.cpp
+++ b/clang/unittests/Driver/ToolChainTest.cpp
@@ -35,7 +35,7 @@ TEST(ToolChainTest, VFSGCCInstallation) {
   IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> InMemoryFileSystem(
       new llvm::vfs::InMemoryFileSystem);
   Driver TheDriver("/bin/clang", "arm-linux-gnueabihf", Diags,
-                   InMemoryFileSystem);
+                   "clang LLVM compiler", InMemoryFileSystem);
 
   const char *EmptyFiles[] = {
       "foo.cpp",
@@ -89,7 +89,7 @@ TEST(ToolChainTest, VFSGCCInstallationRelativeDir) {
   IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> InMemoryFileSystem(
       new llvm::vfs::InMemoryFileSystem);
   Driver TheDriver("/home/test/bin/clang", "arm-linux-gnueabi", Diags,
-                   InMemoryFileSystem);
+                   "clang LLVM compiler", InMemoryFileSystem);
 
   const char *EmptyFiles[] = {
       "foo.cpp", "/home/test/lib/gcc/arm-linux-gnueabi/4.6.1/crtbegin.o",
@@ -130,13 +130,13 @@ TEST(ToolChainTest, DefaultDriverMode) {
       new llvm::vfs::InMemoryFileSystem);
 
   Driver CCDriver("/home/test/bin/clang", "arm-linux-gnueabi", Diags,
-                  InMemoryFileSystem);
+                  "clang LLVM compiler", InMemoryFileSystem);
   CCDriver.setCheckInputsExist(false);
   Driver CXXDriver("/home/test/bin/clang++", "arm-linux-gnueabi", Diags,
-                   InMemoryFileSystem);
+                   "clang LLVM compiler", InMemoryFileSystem);
   CXXDriver.setCheckInputsExist(false);
   Driver CLDriver("/home/test/bin/clang-cl", "arm-linux-gnueabi", Diags,
-                  InMemoryFileSystem);
+                  "clang LLVM compiler", InMemoryFileSystem);
   CLDriver.setCheckInputsExist(false);
 
   std::unique_ptr<Compilation> CC(CCDriver.BuildCompilation(
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index b198efa4af9ec..eae7b24fae7cd 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -2743,6 +2743,43 @@ TEST_F(FormatTest, FormatTryAsAVariable) {
   verifyFormat("int catch, size;");
   verifyFormat("catch = foo();");
   verifyFormat("if (catch < size) {\n  return true;\n}");
+
+  FormatStyle Style = getLLVMStyle();
+  Style.BreakBeforeBraces = FormatStyle::BS_Custom;
+  Style.BraceWrapping.AfterFunction = true;
+  Style.BraceWrapping.BeforeCatch = true;
+  verifyFormat("try {\n"
+               "  int bar = 1;\n"
+               "}\n"
+               "catch (...) {\n"
+               "  int bar = 1;\n"
+               "}",
+               Style);
+  verifyFormat("#if NO_EX\n"
+               "try\n"
+               "#endif\n"
+               "{\n"
+               "}\n"
+               "#if NO_EX\n"
+               "catch (...) {\n"
+               "}",
+               Style);
+  verifyFormat("try /* abc */ {\n"
+               "  int bar = 1;\n"
+               "}\n"
+               "catch (...) {\n"
+               "  int bar = 1;\n"
+               "}",
+               Style);
+  verifyFormat("try\n"
+               "// abc\n"
+               "{\n"
+               "  int bar = 1;\n"
+               "}\n"
+               "catch (...) {\n"
+               "  int bar = 1;\n"
+               "}",
+               Style);
 }
 
 TEST_F(FormatTest, FormatSEHTryCatch) {
@@ -7565,6 +7602,21 @@ TEST_F(FormatTest, UnderstandsTemplateParameters) {
   verifyFormat("static_assert(is_convertible::value, \"AAA\");");
   verifyFormat("Constructor(A... a) : a_(X{std::forward(a)}...) {}");
   verifyFormat("< < < < < < < < < < < < < < < < < < < < < < < < < < < < < <");
+  verifyFormat("some_templated_type");
+}
+
+TEST_F(FormatTest, UnderstandsShiftOperators) {
+  verifyFormat("if (i < x >> 1)");
+  verifyFormat("while (i < x >> 1)");
+  verifyFormat("for (unsigned i = 0; i < i; ++i, v = v >> 1)");
+  verifyFormat("for (unsigned i = 0; i < x >> 1; ++i, v = v >> 1)");
+  verifyFormat(
+      "for (std::vector::iterator i = 0; i < x >> 1; ++i, v = v >> 1)");
+  verifyFormat("Foo.call>()");
+  verifyFormat("if (Foo.call>() == 0)");
+  verifyFormat("for (std::vector>::iterator i = 0; i < x >> 1; "
+               "++i, v = v >> 1)");
+  verifyFormat("if (w>, 1>::t)");
 }
 
 TEST_F(FormatTest, BitshiftOperatorWidth) {
diff --git a/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp b/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp
index aab20008a4974..95ebeb2c59403 100644
--- a/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp
+++ b/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp
@@ -17,7 +17,72 @@ using namespace clang::syntax;
 
 namespace {
 
-TEST_P(SyntaxTreeTest, Simple) {
+class BuildSyntaxTreeTest : public SyntaxTreeTest {
+protected:
+  ::testing::AssertionResult treeDumpEqual(StringRef Code, StringRef Tree) {
+    SCOPED_TRACE(llvm::join(GetParam().getCommandLineArgs(), " "));
+
+    auto *Root = buildTree(Code, GetParam());
+    if (Diags->getClient()->getNumErrors() != 0) {
+      return ::testing::AssertionFailure()
+             << "Source file has syntax errors, they were printed to the test "
+                "log";
+    }
+    auto Actual = StringRef(Root->dump(Arena->getSourceManager())).trim().str();
+    // EXPECT_EQ shows the diff between the two strings if they are different.
+    EXPECT_EQ(Tree.trim().str(), Actual);
+    if (Actual != Tree.trim().str()) {
+      return ::testing::AssertionFailure();
+    }
+    return ::testing::AssertionSuccess();
+  }
+
+  ::testing::AssertionResult
+  treeDumpEqualOnAnnotations(StringRef CodeWithAnnotations,
+                             ArrayRef<StringRef> TreeDumps) {
+    SCOPED_TRACE(llvm::join(GetParam().getCommandLineArgs(), " "));
+
+    auto AnnotatedCode = llvm::Annotations(CodeWithAnnotations);
+    auto *Root = buildTree(AnnotatedCode.code(), GetParam());
+
+    if (Diags->getClient()->getNumErrors() != 0) {
+      return ::testing::AssertionFailure()
+             << "Source file has syntax errors, they were printed to the test "
+                "log";
+    }
+
+    auto AnnotatedRanges = AnnotatedCode.ranges();
+    if (AnnotatedRanges.size() != TreeDumps.size()) {
+      return ::testing::AssertionFailure()
+             << "The number of annotated ranges in the source code is "
+                "different "
+                "to the number of their corresponding tree dumps.";
+    }
+    bool Failed = false;
+    for (unsigned i = 0; i < AnnotatedRanges.size(); i++) {
+      auto *AnnotatedNode = nodeByRange(AnnotatedRanges[i], Root);
+      assert(AnnotatedNode);
+      auto AnnotatedNodeDump =
+          StringRef(AnnotatedNode->dump(Arena->getSourceManager()))
+              .trim()
+              .str();
+      // EXPECT_EQ shows the diff between the two strings if they are different.
+      EXPECT_EQ(TreeDumps[i].trim().str(), AnnotatedNodeDump)
+          << "Dumps diverged for the code:\n"
+          << AnnotatedCode.code().slice(AnnotatedRanges[i].Begin,
+                                        AnnotatedRanges[i].End);
+      if (AnnotatedNodeDump != TreeDumps[i].trim().str())
+        Failed = true;
+    }
+    return Failed ? ::testing::AssertionFailure()
+                  : ::testing::AssertionSuccess();
+  }
+};
+
+INSTANTIATE_TEST_CASE_P(SyntaxTreeTests, BuildSyntaxTreeTest,
+                        testing::ValuesIn(allTestClangConfigs()), );
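With the dump helpers moved onto the dedicated `BuildSyntaxTreeTest` fixture, a new case needs only annotated source plus one expected dump per `[[...]]` range. A hypothetical example of the shape (the dump text is illustrative, not taken from this patch):

```cpp
TEST_P(BuildSyntaxTreeTest, ReturnSketch) {
  EXPECT_TRUE(treeDumpEqualOnAnnotations(
      R"cpp(
int f() { [[return 0;]] }
)cpp",
      {R"txt(
ReturnStatement Statement
|-'return' IntroducerKeyword
|-IntegerLiteralExpression ReturnValue
| `-'0' LiteralToken
`-';'
)txt"}));
}
```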
+
+TEST_P(BuildSyntaxTreeTest, Simple) {
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
 int main() {}
@@ -48,7 +113,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, SimpleVariable) {
+TEST_P(BuildSyntaxTreeTest, SimpleVariable) {
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
 int a;
@@ -72,7 +137,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, SimpleFunction) {
+TEST_P(BuildSyntaxTreeTest, SimpleFunction) {
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
 void foo(int a, int b) {}
@@ -102,7 +167,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, If) {
+TEST_P(BuildSyntaxTreeTest, If) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 void test() {
@@ -144,7 +209,7 @@ IfStatement Statement
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, For) {
+TEST_P(BuildSyntaxTreeTest, For) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 void test() {
@@ -164,7 +229,7 @@ ForStatement Statement
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, RangeBasedFor) {
+TEST_P(BuildSyntaxTreeTest, RangeBasedFor) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -194,7 +259,7 @@ RangeBasedForStatement Statement
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, DeclarationStatement) {
+TEST_P(BuildSyntaxTreeTest, DeclarationStatement) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 void test() {
@@ -214,7 +279,7 @@ DeclarationStatement Statement
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, Switch) {
+TEST_P(BuildSyntaxTreeTest, Switch) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 void test() {
@@ -247,7 +312,7 @@ SwitchStatement Statement
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, While) {
+TEST_P(BuildSyntaxTreeTest, While) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 void test() {
@@ -273,7 +338,7 @@ WhileStatement Statement
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, UnhandledStatement) {
+TEST_P(BuildSyntaxTreeTest, UnhandledStatement) {
   // Unhandled statements should end up as 'unknown statement'.
   // This example uses a 'label statement', which does not yet have a syntax
   // counterpart.
@@ -295,7 +360,7 @@ UnknownStatement Statement
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, Expressions) {
+TEST_P(BuildSyntaxTreeTest, Expressions) {
   // expressions should be wrapped in 'ExpressionStatement' when they appear
   // in a statement position.
   EXPECT_TRUE(treeDumpEqual(
@@ -351,7 +416,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, UnqualifiedId_Identifier) {
+TEST_P(BuildSyntaxTreeTest, UnqualifiedId_Identifier) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 void test(int a) {
@@ -365,7 +430,7 @@ IdExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, UnqualifiedId_OperatorFunctionId) {
+TEST_P(BuildSyntaxTreeTest, UnqualifiedId_OperatorFunctionId) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -397,7 +462,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, UnqualifiedId_ConversionFunctionId) {
+TEST_P(BuildSyntaxTreeTest, UnqualifiedId_ConversionFunctionId) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -426,7 +491,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, UnqualifiedId_LiteralOperatorId) {
+TEST_P(BuildSyntaxTreeTest, UnqualifiedId_LiteralOperatorId) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -452,7 +517,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, UnqualifiedId_Destructor) {
+TEST_P(BuildSyntaxTreeTest, UnqualifiedId_Destructor) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -479,7 +544,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, UnqualifiedId_DecltypeDestructor) {
+TEST_P(BuildSyntaxTreeTest, UnqualifiedId_DecltypeDestructor) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -513,7 +578,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, UnqualifiedId_TemplateId) {
+TEST_P(BuildSyntaxTreeTest, UnqualifiedId_TemplateId) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -538,7 +603,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, QualifiedId_NamespaceSpecifier) {
+TEST_P(BuildSyntaxTreeTest, QualifiedId_NamespaceSpecifier) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -548,9 +613,6 @@ namespace n {
   struct S { };
 }
 void test() {
-  // FIXME: Remove the `UnknownExpression` wrapping `s1` and `s2`. This
-  // `UnknownExpression` comes from a leaf `CXXConstructExpr` in the
-  // ClangAST. We need to ignore leaf implicit nodes.
   [[::n::S s1]];
   [[n::S s2]];
 }
@@ -564,8 +626,7 @@ SimpleDeclaration
 | `-'::' ListDelimiter
 |-'S'
 `-SimpleDeclarator Declarator
-  `-UnknownExpression
-    `-'s1'
+  `-'s1'
 )txt",
        R"txt(
 SimpleDeclaration
@@ -575,12 +636,11 @@ SimpleDeclaration
 | `-'::' ListDelimiter
 |-'S'
 `-SimpleDeclarator Declarator
-  `-UnknownExpression
-    `-'s2'
+  `-'s2'
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, QualifiedId_TemplateSpecifier) {
+TEST_P(BuildSyntaxTreeTest, QualifiedId_TemplateSpecifier) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -608,8 +668,7 @@ SimpleDeclaration
 | `-'::' ListDelimiter
 |-'S'
 `-SimpleDeclarator Declarator
-  `-UnknownExpression
-    `-'s1'
+  `-'s1'
 )txt",
        R"txt(
 SimpleDeclaration
@@ -623,12 +682,11 @@ SimpleDeclaration
 | `-'::' ListDelimiter
 |-'S'
 `-SimpleDeclarator Declarator
-  `-UnknownExpression
-    `-'s2'
+  `-'s2'
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, QualifiedId_DecltypeSpecifier) {
+TEST_P(BuildSyntaxTreeTest, QualifiedId_DecltypeSpecifier) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -660,7 +718,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, QualifiedId_OptionalTemplateKw) {
+TEST_P(BuildSyntaxTreeTest, QualifiedId_OptionalTemplateKw) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -708,7 +766,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, QualifiedId_Complex) {
+TEST_P(BuildSyntaxTreeTest, QualifiedId_Complex) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -751,7 +809,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, QualifiedId_DependentType) {
+TEST_P(BuildSyntaxTreeTest, QualifiedId_DependentType) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -822,7 +880,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, This_Simple) {
+TEST_P(BuildSyntaxTreeTest, This_Simple) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -840,7 +898,7 @@ ThisExpression ReturnValue
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, This_ExplicitMemberAccess) {
+TEST_P(BuildSyntaxTreeTest, This_ExplicitMemberAccess) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -864,7 +922,7 @@ MemberExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, This_ImplicitMemberAccess) {
+TEST_P(BuildSyntaxTreeTest, This_ImplicitMemberAccess) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -884,7 +942,7 @@ IdExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, ParenExpr) {
+TEST_P(BuildSyntaxTreeTest, ParenExpr) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 void test() {
@@ -926,7 +984,7 @@ ParenExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, UserDefinedLiteral_Char) {
+TEST_P(BuildSyntaxTreeTest, UserDefinedLiteral_Char) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -943,7 +1001,7 @@ CharUserDefinedLiteralExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, UserDefinedLiteral_String) {
+TEST_P(BuildSyntaxTreeTest, UserDefinedLiteral_String) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -963,7 +1021,7 @@ StringUserDefinedLiteralExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, UserDefinedLiteral_Integer) {
+TEST_P(BuildSyntaxTreeTest, UserDefinedLiteral_Integer) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -994,7 +1052,7 @@ IntegerUserDefinedLiteralExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, UserDefinedLiteral_Float) {
+TEST_P(BuildSyntaxTreeTest, UserDefinedLiteral_Float) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -1025,7 +1083,7 @@ FloatUserDefinedLiteralExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, IntegerLiteral_LongLong) {
+TEST_P(BuildSyntaxTreeTest, IntegerLiteral_LongLong) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -1046,7 +1104,7 @@ IntegerLiteralExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, IntegerLiteral_Binary) {
+TEST_P(BuildSyntaxTreeTest, IntegerLiteral_Binary) {
   if (!GetParam().isCXX14OrLater()) {
     return;
   }
@@ -1062,7 +1120,7 @@ IntegerLiteralExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, IntegerLiteral_WithDigitSeparators) {
+TEST_P(BuildSyntaxTreeTest, IntegerLiteral_WithDigitSeparators) {
   if (!GetParam().isCXX14OrLater()) {
     return;
   }
@@ -1078,7 +1136,7 @@ IntegerLiteralExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, CharacterLiteral) {
+TEST_P(BuildSyntaxTreeTest, CharacterLiteral) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 void test() {
@@ -1116,7 +1174,7 @@ CharacterLiteralExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, CharacterLiteral_Utf) {
+TEST_P(BuildSyntaxTreeTest, CharacterLiteral_Utf) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -1147,7 +1205,7 @@ CharacterLiteralExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, CharacterLiteral_Utf8) {
+TEST_P(BuildSyntaxTreeTest, CharacterLiteral_Utf8) {
   if (!GetParam().isCXX17OrLater()) {
     return;
   }
@@ -1168,7 +1226,7 @@ CharacterLiteralExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, FloatingLiteral) {
+TEST_P(BuildSyntaxTreeTest, FloatingLiteral) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 void test() {
@@ -1196,7 +1254,7 @@ FloatingLiteralExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, FloatingLiteral_Hexadecimal) {
+TEST_P(BuildSyntaxTreeTest, FloatingLiteral_Hexadecimal) {
   if (!GetParam().isCXX17OrLater()) {
     return;
   }
@@ -1227,7 +1285,7 @@ FloatingLiteralExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, StringLiteral) {
+TEST_P(BuildSyntaxTreeTest, StringLiteral) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 void test() {
@@ -1245,7 +1303,7 @@ StringLiteralExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, StringLiteral_Utf) {
+TEST_P(BuildSyntaxTreeTest, StringLiteral_Utf) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -1271,7 +1329,7 @@ StringLiteralExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, StringLiteral_Raw) {
+TEST_P(BuildSyntaxTreeTest, StringLiteral_Raw) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -1304,7 +1362,7 @@ TEST_P(SyntaxTreeTest, StringLiteral_Raw) {
       "    `-'}' CloseParen\n"));
 }
 
-TEST_P(SyntaxTreeTest, BoolLiteral) {
+TEST_P(BuildSyntaxTreeTest, BoolLiteral) {
   if (GetParam().isC()) {
     return;
   }
@@ -1325,7 +1383,7 @@ BoolLiteralExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, CxxNullPtrLiteral) {
+TEST_P(BuildSyntaxTreeTest, CxxNullPtrLiteral) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -1341,7 +1399,7 @@ CxxNullPtrExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, PostfixUnaryOperator) {
+TEST_P(BuildSyntaxTreeTest, PostfixUnaryOperator) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 void test(int a) {
@@ -1365,7 +1423,7 @@ PostfixUnaryOperatorExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, PrefixUnaryOperator) {
+TEST_P(BuildSyntaxTreeTest, PrefixUnaryOperator) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 void test(int a, int *ap) {
@@ -1451,7 +1509,7 @@ PrefixUnaryOperatorExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, PrefixUnaryOperatorCxx) {
+TEST_P(BuildSyntaxTreeTest, PrefixUnaryOperatorCxx) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -1478,7 +1536,7 @@ PrefixUnaryOperatorExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, BinaryOperator) {
+TEST_P(BuildSyntaxTreeTest, BinaryOperator) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 void test(int a) {
@@ -1552,7 +1610,7 @@ BinaryOperatorExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, BinaryOperatorCxx) {
+TEST_P(BuildSyntaxTreeTest, BinaryOperatorCxx) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -1600,7 +1658,7 @@ BinaryOperatorExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, BinaryOperator_NestedWithParenthesis) {
+TEST_P(BuildSyntaxTreeTest, BinaryOperator_NestedWithParenthesis) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 void test() {
@@ -1631,7 +1689,7 @@ BinaryOperatorExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, BinaryOperator_Associativity) {
+TEST_P(BuildSyntaxTreeTest, BinaryOperator_Associativity) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 void test(int a, int b) {
@@ -1669,7 +1727,7 @@ BinaryOperatorExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, BinaryOperator_Precedence) {
+TEST_P(BuildSyntaxTreeTest, BinaryOperator_Precedence) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 void test() {
@@ -1711,7 +1769,7 @@ BinaryOperatorExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, OverloadedOperator_Assignment) {
+TEST_P(BuildSyntaxTreeTest, OverloadedOperator_Assignment) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -1736,7 +1794,7 @@ BinaryOperatorExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, OverloadedOperator_Plus) {
+TEST_P(BuildSyntaxTreeTest, OverloadedOperator_Plus) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -1745,19 +1803,15 @@ TEST_P(SyntaxTreeTest, OverloadedOperator_Plus) {
 struct X {
   friend X operator+(X, const X&);
 };
-// FIXME: Remove additional `UnknownExpression` wrapping `x`. For that, ignore
-// implicit copy constructor called on `x`. This should've been ignored already,
-// as we `IgnoreImplicit` when traversing an `Stmt`.
 void test(X x, X y) {
   [[x + y]];
 }
 )cpp",
       {R"txt(
 BinaryOperatorExpression Expression
-|-UnknownExpression LeftHandSide
-| `-IdExpression
-|   `-UnqualifiedId UnqualifiedId
-|     `-'x'
+|-IdExpression LeftHandSide
+| `-UnqualifiedId UnqualifiedId
+|   `-'x'
 |-'+' OperatorToken
 `-IdExpression RightHandSide
   `-UnqualifiedId UnqualifiedId
@@ -1765,7 +1819,7 @@ BinaryOperatorExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, OverloadedOperator_Less) {
+TEST_P(BuildSyntaxTreeTest, OverloadedOperator_Less) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -1790,7 +1844,7 @@ BinaryOperatorExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, OverloadedOperator_LeftShift) {
+TEST_P(BuildSyntaxTreeTest, OverloadedOperator_LeftShift) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -1815,7 +1869,7 @@ BinaryOperatorExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, OverloadedOperator_Comma) {
+TEST_P(BuildSyntaxTreeTest, OverloadedOperator_Comma) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -1840,7 +1894,7 @@ BinaryOperatorExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, OverloadedOperator_PointerToMember) {
+TEST_P(BuildSyntaxTreeTest, OverloadedOperator_PointerToMember) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -1865,7 +1919,7 @@ BinaryOperatorExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, OverloadedOperator_Negation) {
+TEST_P(BuildSyntaxTreeTest, OverloadedOperator_Negation) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -1887,7 +1941,7 @@ PrefixUnaryOperatorExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, OverloadedOperator_AddressOf) {
+TEST_P(BuildSyntaxTreeTest, OverloadedOperator_AddressOf) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -1909,7 +1963,7 @@ PrefixUnaryOperatorExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, OverloadedOperator_PrefixIncrement) {
+TEST_P(BuildSyntaxTreeTest, OverloadedOperator_PrefixIncrement) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -1931,7 +1985,7 @@ PrefixUnaryOperatorExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, OverloadedOperator_PostfixIncrement) {
+TEST_P(BuildSyntaxTreeTest, OverloadedOperator_PostfixIncrement) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -1953,7 +2007,7 @@ PostfixUnaryOperatorExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, MemberExpression_SimpleWithDot) {
+TEST_P(BuildSyntaxTreeTest, MemberExpression_SimpleWithDot) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 struct S {
@@ -1975,7 +2029,7 @@ MemberExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, MemberExpression_StaticDataMember) {
+TEST_P(BuildSyntaxTreeTest, MemberExpression_StaticDataMember) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2000,7 +2054,7 @@ MemberExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, MemberExpression_SimpleWithArrow) {
+TEST_P(BuildSyntaxTreeTest, MemberExpression_SimpleWithArrow) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 struct S {
@@ -2022,7 +2076,7 @@ MemberExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, MemberExpression_Chaining) {
+TEST_P(BuildSyntaxTreeTest, MemberExpression_Chaining) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 struct S {
@@ -2049,7 +2103,7 @@ MemberExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, MemberExpression_OperatorFunction) {
+TEST_P(BuildSyntaxTreeTest, MemberExpression_OperatorFunction) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2078,7 +2132,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, MemberExpression_VariableTemplate) {
+TEST_P(BuildSyntaxTreeTest, MemberExpression_VariableTemplate) {
   if (!GetParam().isCXX14OrLater()) {
     return;
   }
@@ -2114,7 +2168,7 @@ CompoundStatement
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, MemberExpression_FunctionTemplate) {
+TEST_P(BuildSyntaxTreeTest, MemberExpression_FunctionTemplate) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2146,7 +2200,8 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, MemberExpression_FunctionTemplateWithTemplateKeyword) {
+TEST_P(BuildSyntaxTreeTest,
+       MemberExpression_FunctionTemplateWithTemplateKeyword) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2179,7 +2234,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, MemberExpression_WithQualifier) {
+TEST_P(BuildSyntaxTreeTest, MemberExpression_WithQualifier) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2232,7 +2287,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, MemberExpression_Complex) {
+TEST_P(BuildSyntaxTreeTest, MemberExpression_Complex) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2290,7 +2345,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, CallExpression_Callee_Member) {
+TEST_P(BuildSyntaxTreeTest, CallExpression_Callee_Member) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2318,7 +2373,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, CallExpression_Callee_OperatorParens) {
+TEST_P(BuildSyntaxTreeTest, CallExpression_Callee_OperatorParens) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2341,7 +2396,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, CallExpression_Callee_OperatorParensChaining) {
+TEST_P(BuildSyntaxTreeTest, CallExpression_Callee_OperatorParensChaining) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2367,7 +2422,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, CallExpression_Callee_MemberWithThis) {
+TEST_P(BuildSyntaxTreeTest, CallExpression_Callee_MemberWithThis) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2423,7 +2478,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, CallExpression_Callee_FunctionPointer) {
+TEST_P(BuildSyntaxTreeTest, CallExpression_Callee_FunctionPointer) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2458,7 +2513,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, CallExpression_Callee_MemberFunctionPointer) {
+TEST_P(BuildSyntaxTreeTest, CallExpression_Callee_MemberFunctionPointer) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2491,7 +2546,7 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, CallExpression_Arguments_Zero) {
+TEST_P(BuildSyntaxTreeTest, CallExpression_Arguments_Zero) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2514,7 +2569,7 @@ ExpressionStatement Statement
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, CallExpression_Arguments_One) {
+TEST_P(BuildSyntaxTreeTest, CallExpression_Arguments_One) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2540,7 +2595,7 @@ ExpressionStatement Statement
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, CallExpression_Arguments_Multiple) {
+TEST_P(BuildSyntaxTreeTest, CallExpression_Arguments_Multiple) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2572,7 +2627,7 @@ ExpressionStatement Statement
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, CallExpression_Arguments_Assignment) {
+TEST_P(BuildSyntaxTreeTest, CallExpression_Arguments_Assignment) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2603,7 +2658,7 @@ ExpressionStatement Statement
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, CallExpression_Arguments_BracedInitList_Empty) {
+TEST_P(BuildSyntaxTreeTest, CallExpression_Arguments_BracedInitList_Empty) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -2631,7 +2686,7 @@ ExpressionStatement Statement
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, CallExpression_Arguments_BracedInitList_Simple) {
+TEST_P(BuildSyntaxTreeTest, CallExpression_Arguments_BracedInitList_Simple) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -2671,7 +2726,8 @@ ExpressionStatement Statement
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, CallExpression_Arguments_BracedInitList_Designated) {
+TEST_P(BuildSyntaxTreeTest,
+       CallExpression_Arguments_BracedInitList_Designated) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -2718,7 +2774,7 @@ ExpressionStatement Statement
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, CallExpression_Arguments_ParameterPack) {
+TEST_P(BuildSyntaxTreeTest, CallExpression_Arguments_ParameterPack) {
   if (!GetParam().isCXX11OrLater() || GetParam().hasDelayedTemplateParsing()) {
     return;
   }
@@ -2744,7 +2800,55 @@ CallExpression Expression
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, MultipleDeclaratorsGrouping) {
+TEST_P(BuildSyntaxTreeTest, CallExpression_DefaultArguments) {
+  if (!GetParam().isCXX11OrLater()) {
+    return;
+  }
+  EXPECT_TRUE(treeDumpEqualOnAnnotations(
+      R"cpp(
+void f(int i = 1, char c = '2');
+void test() {
+  [[f()]];
+  [[f(1)]];
+  [[f(1, '2')]];
+}
+)cpp",
+      {R"txt(
+CallExpression Expression
+|-IdExpression Callee
+| `-UnqualifiedId UnqualifiedId
+|   `-'f'
+|-'(' OpenParen
+`-')' CloseParen
+      )txt",
+       R"txt(
+CallExpression Expression
+|-IdExpression Callee
+| `-UnqualifiedId UnqualifiedId
+|   `-'f'
+|-'(' OpenParen
+|-CallArguments Arguments
+| `-IntegerLiteralExpression ListElement
+|   `-'1' LiteralToken
+`-')' CloseParen
+      )txt",
+       R"txt(
+CallExpression Expression
+|-IdExpression Callee
+| `-UnqualifiedId UnqualifiedId
+|   `-'f'
+|-'(' OpenParen
+|-CallArguments Arguments
+| |-IntegerLiteralExpression ListElement
+| | `-'1' LiteralToken
+| |-',' ListDelimiter
+| `-CharacterLiteralExpression ListElement
+|   `-''2'' LiteralToken
+`-')' CloseParen
+)txt"}));
+}
+
+TEST_P(BuildSyntaxTreeTest, MultipleDeclaratorsGrouping) {
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
 int *a, b;
@@ -2773,7 +2877,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, MultipleDeclaratorsGroupingTypedef) {
+TEST_P(BuildSyntaxTreeTest, MultipleDeclaratorsGroupingTypedef) {
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
 typedef int *a, b;
@@ -2793,7 +2897,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, MultipleDeclaratorsInsideStatement) {
+TEST_P(BuildSyntaxTreeTest, MultipleDeclaratorsInsideStatement) {
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
 void foo() {
@@ -2837,7 +2941,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, SizeTTypedef) {
+TEST_P(BuildSyntaxTreeTest, SizeTTypedef) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -2864,7 +2968,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, Namespace_Nested) {
+TEST_P(BuildSyntaxTreeTest, Namespace_Nested) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2887,7 +2991,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, Namespace_NestedDefinition) {
+TEST_P(BuildSyntaxTreeTest, Namespace_NestedDefinition) {
   if (!GetParam().isCXX17OrLater()) {
     return;
   }
@@ -2907,7 +3011,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, Namespace_Unnamed) {
+TEST_P(BuildSyntaxTreeTest, Namespace_Unnamed) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2924,7 +3028,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, Namespace_Alias) {
+TEST_P(BuildSyntaxTreeTest, Namespace_Alias) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2943,7 +3047,7 @@ NamespaceAliasDefinition
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, UsingDirective) {
+TEST_P(BuildSyntaxTreeTest, UsingDirective) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2963,7 +3067,7 @@ UsingNamespaceDirective
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, UsingDeclaration_Namespace) {
+TEST_P(BuildSyntaxTreeTest, UsingDeclaration_Namespace) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -2984,7 +3088,7 @@ UsingDeclaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, UsingDeclaration_ClassMember) {
+TEST_P(BuildSyntaxTreeTest, UsingDeclaration_ClassMember) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -3018,7 +3122,7 @@ UsingDeclaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, UsingTypeAlias) {
+TEST_P(BuildSyntaxTreeTest, UsingTypeAlias) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -3037,7 +3141,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, FreeStandingClass_ForwardDeclaration) {
+TEST_P(BuildSyntaxTreeTest, FreeStandingClass_ForwardDeclaration) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 [[struct X;]]
@@ -3060,7 +3164,7 @@ SimpleDeclaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, FreeStandingClasses_Definition) {
+TEST_P(BuildSyntaxTreeTest, FreeStandingClasses_Definition) {
   EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
 [[struct X {};]]
@@ -3098,7 +3202,7 @@ SimpleDeclaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, StaticMemberFunction) {
+TEST_P(BuildSyntaxTreeTest, StaticMemberFunction) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -3123,7 +3227,7 @@ SimpleDeclaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, OutOfLineMemberFunctionDefinition) {
+TEST_P(BuildSyntaxTreeTest, OutOfLineMemberFunctionDefinition) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -3152,7 +3256,7 @@ SimpleDeclaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, ConversionMemberFunction) {
+TEST_P(BuildSyntaxTreeTest, ConversionMemberFunction) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -3174,7 +3278,7 @@ SimpleDeclaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, LiteralOperatorDeclaration) {
+TEST_P(BuildSyntaxTreeTest, LiteralOperatorDeclaration) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -3200,7 +3304,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, NumericLiteralOperatorTemplateDeclaration) {
+TEST_P(BuildSyntaxTreeTest, NumericLiteralOperatorTemplateDeclaration) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -3231,7 +3335,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, OverloadedOperatorDeclaration) {
+TEST_P(BuildSyntaxTreeTest, OverloadedOperatorDeclaration) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -3261,7 +3365,7 @@ SimpleDeclaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, OverloadedOperatorFriendDeclaration) {
+TEST_P(BuildSyntaxTreeTest, OverloadedOperatorFriendDeclaration) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -3295,7 +3399,7 @@ UnknownDeclaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, ClassTemplateDeclaration) {
+TEST_P(BuildSyntaxTreeTest, ClassTemplateDeclaration) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -3322,7 +3426,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, FunctionTemplateDeclaration) {
+TEST_P(BuildSyntaxTreeTest, FunctionTemplateDeclaration) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -3351,7 +3455,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, VariableTemplateDeclaration) {
+TEST_P(BuildSyntaxTreeTest, VariableTemplateDeclaration) {
   if (!GetParam().isCXX14OrLater()) {
     return;
   }
@@ -3379,7 +3483,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, StaticMemberFunctionTemplate) {
+TEST_P(BuildSyntaxTreeTest, StaticMemberFunctionTemplate) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -3410,7 +3514,7 @@ TemplateDeclaration Declaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, NestedTemplates) {
+TEST_P(BuildSyntaxTreeTest, NestedTemplates) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -3455,7 +3559,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, NestedTemplatesInNamespace) {
+TEST_P(BuildSyntaxTreeTest, NestedTemplatesInNamespace) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -3508,7 +3612,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, ClassTemplate_MemberClassDefinition) {
+TEST_P(BuildSyntaxTreeTest, ClassTemplate_MemberClassDefinition) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -3541,7 +3645,7 @@ TemplateDeclaration Declaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, ExplicitClassTemplateInstantation_Definition) {
+TEST_P(BuildSyntaxTreeTest, ExplicitClassTemplateInstantation_Definition) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -3563,7 +3667,7 @@ ExplicitTemplateInstantiation
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, ExplicitClassTemplateInstantation_Declaration) {
+TEST_P(BuildSyntaxTreeTest, ExplicitClassTemplateInstantation_Declaration) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -3586,7 +3690,7 @@ ExplicitTemplateInstantiation
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, ClassTemplateSpecialization_Partial) {
+TEST_P(BuildSyntaxTreeTest, ClassTemplateSpecialization_Partial) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -3616,7 +3720,7 @@ TemplateDeclaration Declaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, ClassTemplateSpecialization_Full) {
+TEST_P(BuildSyntaxTreeTest, ClassTemplateSpecialization_Full) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -3642,7 +3746,7 @@ TemplateDeclaration Declaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, EmptyDeclaration) {
+TEST_P(BuildSyntaxTreeTest, EmptyDeclaration) {
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
 ;
@@ -3654,7 +3758,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, StaticAssert) {
+TEST_P(BuildSyntaxTreeTest, StaticAssert) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -3677,7 +3781,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, StaticAssert_WithoutMessage) {
+TEST_P(BuildSyntaxTreeTest, StaticAssert_WithoutMessage) {
   if (!GetParam().isCXX17OrLater()) {
     return;
   }
@@ -3697,7 +3801,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, ExternC) {
+TEST_P(BuildSyntaxTreeTest, ExternC) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -3734,7 +3838,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, NonModifiableNodes) {
+TEST_P(BuildSyntaxTreeTest, NonModifiableNodes) {
   // Some nodes are non-modifiable, they are marked with 'I:'.
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
@@ -3775,7 +3879,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, ModifiableNodes) {
+TEST_P(BuildSyntaxTreeTest, ModifiableNodes) {
   // All nodes can be mutated.
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
@@ -3821,29 +3925,140 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, InitDeclarator_Brace) {
+TEST_P(BuildSyntaxTreeTest, InitDeclarator_Equal) {
+  if (!GetParam().isCXX()) {
+    return;
+  }
+  EXPECT_TRUE(treeDumpEqualOnAnnotations(
+      R"cpp(
+struct S { S(int);};
+void test() {
+  [[S s = 1]];
+}
+)cpp",
+      {R"txt(
+SimpleDeclaration
+|-'S'
+`-SimpleDeclarator Declarator
+  |-'s'
+  |-'='
+  `-IntegerLiteralExpression
+    `-'1' LiteralToken
+)txt"}));
+}
+
+TEST_P(BuildSyntaxTreeTest, InitDeclarator_Brace) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
-  EXPECT_TRUE(treeDumpEqual(
+  EXPECT_TRUE(treeDumpEqualOnAnnotations(
       R"cpp(
-int a {};
+struct S { 
+  S();
+  S(int);
+  S(int, float);
+};
+void test(){
+  // FIXME: 's...' is a declarator and '{...}' is initializer
+  [[S s0{}]];
+  [[S s1{1}]];
+  [[S s2{1, 2.}]];
+}
 )cpp",
-      R"txt(
-TranslationUnit Detached
-`-SimpleDeclaration
-  |-'int'
-  |-SimpleDeclarator Declarator
-  | |-'a'
-  | `-UnknownExpression
-  |   `-UnknownExpression
-  |     |-'{'
-  |     `-'}'
-  `-';'
-)txt"));
+      {R"txt(
+SimpleDeclaration
+|-'S'
+`-SimpleDeclarator Declarator
+  `-UnknownExpression
+    |-'s0'
+    |-'{'
+    `-'}'
+  )txt",
+       R"txt(
+SimpleDeclaration
+|-'S'
+`-SimpleDeclarator Declarator
+  `-UnknownExpression
+    |-'s1'
+    |-'{'
+    |-IntegerLiteralExpression
+    | `-'1' LiteralToken
+    `-'}'
+  )txt",
+       R"txt(
+SimpleDeclaration
+|-'S'
+`-SimpleDeclarator Declarator
+  `-UnknownExpression
+    |-'s2'
+    |-'{'
+    |-IntegerLiteralExpression
+    | `-'1' LiteralToken
+    |-','
+    |-FloatingLiteralExpression
+    | `-'2.' LiteralToken
+    `-'}'
+)txt"}));
+}
+
+TEST_P(BuildSyntaxTreeTest, InitDeclarator_EqualBrace) {
+  if (!GetParam().isCXX11OrLater()) {
+    return;
+  }
+  EXPECT_TRUE(treeDumpEqualOnAnnotations(
+      R"cpp(
+struct S { 
+  S();
+  S(int);
+  S(int, float);
+};
+void test() {
+  // FIXME: '= {...}' is initializer
+  [[S s0 = {}]];
+  [[S s1 = {1}]];
+  [[S s2 = {1, 2.}]];
+}
+)cpp",
+      {R"txt(
+SimpleDeclaration
+|-'S'
+`-SimpleDeclarator Declarator
+  |-'s0'
+  |-'='
+  `-UnknownExpression
+    |-'{'
+    `-'}'
+  )txt",
+       R"txt(
+SimpleDeclaration
+|-'S'
+`-SimpleDeclarator Declarator
+  |-'s1'
+  |-'='
+  `-UnknownExpression
+    |-'{'
+    |-IntegerLiteralExpression
+    | `-'1' LiteralToken
+    `-'}'
+  )txt",
+       R"txt(
+SimpleDeclaration
+|-'S'
+`-SimpleDeclarator Declarator
+  |-'s2'
+  |-'='
+  `-UnknownExpression
+    |-'{'
+    |-IntegerLiteralExpression
+    | `-'1' LiteralToken
+    |-','
+    |-FloatingLiteralExpression
+    | `-'2.' LiteralToken
+    `-'}'
+)txt"}));
 }
 
-TEST_P(SyntaxTreeTest, InitDeclarator_Paren) {
+TEST_P(BuildSyntaxTreeTest, InitDeclarator_Paren) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -3851,24 +4066,285 @@ TEST_P(SyntaxTreeTest, InitDeclarator_Paren) {
       R"cpp(
 struct S {
   S(int);
+  S(int, float);
 };
-[[S s(1);]]
+// FIXME: 's...' is a declarator and '(...)' is initializer
+[[S s1(1);]]
+[[S s2(1, 2.);]]
 )cpp",
       {R"txt(
 SimpleDeclaration
 |-'S'
 |-SimpleDeclarator Declarator
 | `-UnknownExpression
-|   |-'s'
+|   |-'s1'
 |   |-'('
 |   |-IntegerLiteralExpression
 |   | `-'1' LiteralToken
 |   `-')'
+`-';'
+  )txt",
+       R"txt(
+SimpleDeclaration
+|-'S'
+|-SimpleDeclarator Declarator
+| `-UnknownExpression
+|   |-'s2'
+|   |-'('
+|   |-IntegerLiteralExpression
+|   | `-'1' LiteralToken
+|   |-','
+|   |-FloatingLiteralExpression
+|   | `-'2.' LiteralToken
+|   `-')'
 `-';'
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, ArrayDeclarator_Simple) {
+TEST_P(BuildSyntaxTreeTest, InitDeclarator_Paren_DefaultArguments) {
+  if (!GetParam().isCXX()) {
+    return;
+  }
+  EXPECT_TRUE(treeDumpEqualOnAnnotations(
+      R"cpp(
+struct S {
+  S(int i = 1, float = 2.);
+};
+[[S s0;]]
+// FIXME: 's...' is a declarator and '(...)' is initializer
+[[S s1(1);]]
+[[S s2(1, 2.);]]
+)cpp",
+      {R"txt(
+SimpleDeclaration
+|-'S'
+|-SimpleDeclarator Declarator
+| `-'s0'
+`-';'
+  )txt",
+       R"txt(
+SimpleDeclaration
+|-'S'
+|-SimpleDeclarator Declarator
+| `-UnknownExpression
+|   |-'s1'
+|   |-'('
+|   |-IntegerLiteralExpression
+|   | `-'1' LiteralToken
+|   `-')'
+`-';'
+  )txt",
+       R"txt(
+SimpleDeclaration
+|-'S'
+|-SimpleDeclarator Declarator
+| `-UnknownExpression
+|   |-'s2'
+|   |-'('
+|   |-IntegerLiteralExpression
+|   | `-'1' LiteralToken
+|   |-','
+|   |-FloatingLiteralExpression
+|   | `-'2.' LiteralToken
+|   `-')'
+`-';'
+)txt"}));
+}
+
+TEST_P(BuildSyntaxTreeTest, ImplicitConversion_Argument) {
+  if (!GetParam().isCXX()) {
+    return;
+  }
+  EXPECT_TRUE(treeDumpEqualOnAnnotations(
+      R"cpp(
+struct X {
+  X(int);
+};
+void TakeX(const X&);
+void test() {
+  [[TakeX(1)]];
+}
+)cpp",
+      {R"txt(
+CallExpression Expression
+|-IdExpression Callee
+| `-UnqualifiedId UnqualifiedId
+|   `-'TakeX'
+|-'(' OpenParen
+|-CallArguments Arguments
+| `-IntegerLiteralExpression ListElement
+|   `-'1' LiteralToken
+`-')' CloseParen
+)txt"}));
+}
+
+TEST_P(BuildSyntaxTreeTest, ImplicitConversion_Return) {
+  if (!GetParam().isCXX()) {
+    return;
+  }
+  EXPECT_TRUE(treeDumpEqualOnAnnotations(
+      R"cpp(
+struct X {
+  X(int);
+};
+X CreateX(){
+  [[return 1;]]
+}
+)cpp",
+      {R"txt(
+ReturnStatement Statement
+|-'return' IntroducerKeyword
+|-IntegerLiteralExpression ReturnValue
+| `-'1' LiteralToken
+`-';'
+)txt"}));
+}
+
+TEST_P(BuildSyntaxTreeTest, ConstructorCall_ZeroArguments) {
+  if (!GetParam().isCXX()) {
+    return;
+  }
+  EXPECT_TRUE(treeDumpEqualOnAnnotations(
+      R"cpp(
+struct X {
+  X();
+};
+X test() {
+  [[return X();]]
+}
+)cpp",
+      {R"txt(
+ReturnStatement Statement
+|-'return' IntroducerKeyword
+|-UnknownExpression ReturnValue
+| |-'X'
+| |-'('
+| `-')'
+`-';'
+)txt"}));
+}
+
+TEST_P(BuildSyntaxTreeTest, ConstructorCall_OneArgument) {
+  if (!GetParam().isCXX()) {
+    return;
+  }
+  EXPECT_TRUE(treeDumpEqualOnAnnotations(
+      R"cpp(
+struct X {
+  X(int);
+};
+X test() {
+  [[return X(1);]]
+}
+)cpp",
+      {R"txt(
+ReturnStatement Statement
+|-'return' IntroducerKeyword
+|-UnknownExpression ReturnValue
+| |-'X'
+| |-'('
+| |-IntegerLiteralExpression
+| | `-'1' LiteralToken
+| `-')'
+`-';'
+)txt"}));
+}
+
+TEST_P(BuildSyntaxTreeTest, ConstructorCall_MultipleArguments) {
+  if (!GetParam().isCXX()) {
+    return;
+  }
+  EXPECT_TRUE(treeDumpEqualOnAnnotations(
+      R"cpp(
+struct X {
+  X(int, char);
+};
+X test() {
+  [[return X(1, '2');]]
+}
+)cpp",
+      {R"txt(
+ReturnStatement Statement
+|-'return' IntroducerKeyword
+|-UnknownExpression ReturnValue
+| |-'X'
+| |-'('
+| |-IntegerLiteralExpression
+| | `-'1' LiteralToken
+| |-','
+| |-CharacterLiteralExpression
+| | `-''2'' LiteralToken
+| `-')'
+`-';'
+)txt"}));
+}
+
+TEST_P(BuildSyntaxTreeTest, ConstructorCall_DefaultArguments) {
+  if (!GetParam().isCXX()) {
+    return;
+  }
+  EXPECT_TRUE(treeDumpEqualOnAnnotations(
+      R"cpp(
+struct X {
+  X(int i = 1, char c = '2');
+};
+X test() {
+  auto x0 = [[X()]];
+  auto x1 = [[X(1)]];
+  auto x2 = [[X(1, '2')]];
+}
+)cpp",
+      {R"txt(
+UnknownExpression
+|-'X'
+|-'('
+`-')'
+)txt",
+       R"txt(
+UnknownExpression
+|-'X'
+|-'('
+|-IntegerLiteralExpression
+| `-'1' LiteralToken
+`-')'
+)txt",
+       R"txt(
+UnknownExpression
+|-'X'
+|-'('
+|-IntegerLiteralExpression
+| `-'1' LiteralToken
+|-','
+|-CharacterLiteralExpression
+| `-''2'' LiteralToken
+`-')'
+)txt"}));
+}
+
+TEST_P(BuildSyntaxTreeTest, TypeConversion_FunctionalNotation) {
+  if (!GetParam().isCXX()) {
+    return;
+  }
+  EXPECT_TRUE(treeDumpEqualOnAnnotations(
+      R"cpp(
+float test() {
+  [[return float(1);]]
+}
+)cpp",
+      {R"txt(
+ReturnStatement Statement
+|-'return' IntroducerKeyword
+|-UnknownExpression ReturnValue
+| |-'float'
+| |-'('
+| |-IntegerLiteralExpression
+| | `-'1' LiteralToken
+| `-')'
+`-';'
+)txt"}));
+}
+
+TEST_P(BuildSyntaxTreeTest, ArrayDeclarator_Simple) {
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
 int a[10];
@@ -3888,7 +4364,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, ArrayDeclarator_Multidimensional) {
+TEST_P(BuildSyntaxTreeTest, ArrayDeclarator_Multidimensional) {
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
 int b[1][2][3];
@@ -3918,7 +4394,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, ArrayDeclarator_UnknownBound) {
+TEST_P(BuildSyntaxTreeTest, ArrayDeclarator_UnknownBound) {
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
 int c[] = {1,2,3};
@@ -3949,7 +4425,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, ArrayDeclarator_Static) {
+TEST_P(BuildSyntaxTreeTest, ArrayDeclarator_Static) {
   if (!GetParam().isC99OrLater()) {
     return;
   }
@@ -3981,7 +4457,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, ParametersAndQualifiers_InFreeFunctions_Empty) {
+TEST_P(BuildSyntaxTreeTest, ParametersAndQualifiers_InFreeFunctions_Empty) {
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
 int func();
@@ -3999,7 +4475,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, ParametersAndQualifiers_InFreeFunctions_Named) {
+TEST_P(BuildSyntaxTreeTest, ParametersAndQualifiers_InFreeFunctions_Named) {
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
 int func1(int a);
@@ -4056,7 +4532,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, ParametersAndQualifiers_InFreeFunctions_Unnamed) {
+TEST_P(BuildSyntaxTreeTest, ParametersAndQualifiers_InFreeFunctions_Unnamed) {
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
 int func1(int);
@@ -4106,7 +4582,63 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest,
+TEST_P(BuildSyntaxTreeTest,
+       ParametersAndQualifiers_InFreeFunctions_Default_One) {
+  if (!GetParam().isCXX()) {
+    return;
+  }
+  EXPECT_TRUE(treeDumpEqualOnAnnotations(
+      R"cpp(
+int func1([[int a = 1]]);
+)cpp",
+      {R"txt(
+ParameterDeclarationList Parameters
+`-SimpleDeclaration ListElement
+  |-'int'
+  `-SimpleDeclarator Declarator
+    |-'a'
+    |-'='
+    `-IntegerLiteralExpression
+      `-'1' LiteralToken
+)txt"}));
+}
+
+TEST_P(BuildSyntaxTreeTest,
+       ParametersAndQualifiers_InFreeFunctions_Default_Multiple) {
+  if (!GetParam().isCXX()) {
+    return;
+  }
+  EXPECT_TRUE(treeDumpEqualOnAnnotations(
+      R"cpp(
+int func2([[int *ap, int a = 1, char c = '2']]);
+)cpp",
+      {R"txt(
+ParameterDeclarationList Parameters
+|-SimpleDeclaration ListElement
+| |-'int'
+| `-SimpleDeclarator Declarator
+|   |-'*'
+|   `-'ap'
+|-',' ListDelimiter
+|-SimpleDeclaration ListElement
+| |-'int'
+| `-SimpleDeclarator Declarator
+|   |-'a'
+|   |-'='
+|   `-IntegerLiteralExpression
+|     `-'1' LiteralToken
+|-',' ListDelimiter
+`-SimpleDeclaration ListElement
+  |-'char'
+  `-SimpleDeclarator Declarator
+    |-'c'
+    |-'='
+    `-CharacterLiteralExpression
+      `-''2'' LiteralToken
+)txt"}));
+}
+
+TEST_P(BuildSyntaxTreeTest,
        ParametersAndQualifiers_InVariadicFunctionTemplate_ParameterPack) {
   if (!GetParam().isCXX11OrLater() || GetParam().hasDelayedTemplateParsing()) {
     return;
@@ -4135,7 +4667,7 @@ SimpleDeclaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest,
+TEST_P(BuildSyntaxTreeTest,
        ParametersAndQualifiers_InVariadicFunctionTemplate_NamedParameterPack) {
   if (!GetParam().isCXX11OrLater() || GetParam().hasDelayedTemplateParsing()) {
     return;
@@ -4168,7 +4700,7 @@ SimpleDeclaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest,
+TEST_P(BuildSyntaxTreeTest,
        ParametersAndQualifiers_InFreeFunctions_VariadicArguments) {
   if (!GetParam().isCXX11OrLater()) {
     return;
@@ -4197,7 +4729,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest,
+TEST_P(BuildSyntaxTreeTest,
        ParametersAndQualifiers_InFreeFunctions_Cxx_CvQualifiers) {
   if (!GetParam().isCXX()) {
     return;
@@ -4238,7 +4770,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, ParametersAndQualifiers_InFreeFunctions_Cxx_Ref) {
+TEST_P(BuildSyntaxTreeTest, ParametersAndQualifiers_InFreeFunctions_Cxx_Ref) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -4265,7 +4797,8 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, ParametersAndQualifiers_InFreeFunctions_Cxx11_RefRef) {
+TEST_P(BuildSyntaxTreeTest,
+       ParametersAndQualifiers_InFreeFunctions_Cxx11_RefRef) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -4292,7 +4825,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, ParametersAndQualifiers_InMemberFunctions_Simple) {
+TEST_P(BuildSyntaxTreeTest, ParametersAndQualifiers_InMemberFunctions_Simple) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -4321,7 +4854,8 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, ParametersAndQualifiers_InMemberFunctions_CvQualifiers) {
+TEST_P(BuildSyntaxTreeTest,
+       ParametersAndQualifiers_InMemberFunctions_CvQualifiers) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -4369,7 +4903,7 @@ SimpleDeclaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, ParametersAndQualifiers_InMemberFunctions_Ref) {
+TEST_P(BuildSyntaxTreeTest, ParametersAndQualifiers_InMemberFunctions_Ref) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -4392,7 +4926,7 @@ SimpleDeclaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, ParametersAndQualifiers_InMemberFunctions_RefRef) {
+TEST_P(BuildSyntaxTreeTest, ParametersAndQualifiers_InMemberFunctions_RefRef) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -4415,7 +4949,7 @@ SimpleDeclaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, TrailingReturn) {
+TEST_P(BuildSyntaxTreeTest, TrailingReturn) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -4439,7 +4973,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, DynamicExceptionSpecification) {
+TEST_P(BuildSyntaxTreeTest, DynamicExceptionSpecification) {
   if (!GetParam().supportsCXXDynamicExceptionSpecification()) {
     return;
   }
@@ -4511,7 +5045,7 @@ SimpleDeclaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, NoexceptExceptionSpecification) {
+TEST_P(BuildSyntaxTreeTest, NoexceptExceptionSpecification) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -4547,7 +5081,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, DeclaratorsInParentheses) {
+TEST_P(BuildSyntaxTreeTest, DeclaratorsInParentheses) {
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
 int (a);
@@ -4607,7 +5141,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, Declaration_ConstVolatileQualifiers_SimpleConst) {
+TEST_P(BuildSyntaxTreeTest, Declaration_ConstVolatileQualifiers_SimpleConst) {
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
 const int west = -1;
@@ -4638,7 +5172,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, Declaration_ConstVolatileQualifiers_MultipleConst) {
+TEST_P(BuildSyntaxTreeTest, Declaration_ConstVolatileQualifiers_MultipleConst) {
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
 const int const universal = 0;
@@ -4658,7 +5192,8 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, Declaration_ConstVolatileQualifiers_ConstAndVolatile) {
+TEST_P(BuildSyntaxTreeTest,
+       Declaration_ConstVolatileQualifiers_ConstAndVolatile) {
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
 const int const *const *volatile b;
@@ -4679,7 +5214,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, RangesOfDeclaratorsWithTrailingReturnTypes) {
+TEST_P(BuildSyntaxTreeTest, RangesOfDeclaratorsWithTrailingReturnTypes) {
   if (!GetParam().isCXX11OrLater()) {
     return;
   }
@@ -4719,7 +5254,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, MemberPointers) {
+TEST_P(BuildSyntaxTreeTest, MemberPointers) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -4754,7 +5289,7 @@ SimpleDeclaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, MemberFunctionPointer) {
+TEST_P(BuildSyntaxTreeTest, MemberFunctionPointer) {
   if (!GetParam().isCXX()) {
     return;
   }
@@ -4840,7 +5375,7 @@ SimpleDeclaration
 )txt"}));
 }
 
-TEST_P(SyntaxTreeTest, ComplexDeclarator) {
+TEST_P(BuildSyntaxTreeTest, ComplexDeclarator) {
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
 void x(char a, short (*b)(int));
@@ -4878,7 +5413,7 @@ TranslationUnit Detached
 )txt"));
 }
 
-TEST_P(SyntaxTreeTest, ComplexDeclarator2) {
+TEST_P(BuildSyntaxTreeTest, ComplexDeclarator2) {
   EXPECT_TRUE(treeDumpEqual(
       R"cpp(
 void x(char a, short (*b)(int), long (**c)(long long));
diff --git a/clang/unittests/Tooling/Syntax/CMakeLists.txt b/clang/unittests/Tooling/Syntax/CMakeLists.txt
index 46ff4c9c3e27a..34a480503def6 100644
--- a/clang/unittests/Tooling/Syntax/CMakeLists.txt
+++ b/clang/unittests/Tooling/Syntax/CMakeLists.txt
@@ -6,6 +6,7 @@ add_clang_unittest(SyntaxTests
   TreeTestBase.cpp
   BuildTreeTest.cpp
   MutationsTest.cpp
+  SynthesisTest.cpp
   TokensTest.cpp
 )
 
diff --git a/clang/unittests/Tooling/Syntax/MutationsTest.cpp b/clang/unittests/Tooling/Syntax/MutationsTest.cpp
index 6ef71e3a80900..f63d3dffa4597 100644
--- a/clang/unittests/Tooling/Syntax/MutationsTest.cpp
+++ b/clang/unittests/Tooling/Syntax/MutationsTest.cpp
@@ -19,15 +19,12 @@ using namespace clang::syntax;
 
 namespace {
 
-TEST_P(SyntaxTreeTest, Mutations) {
-  if (!GetParam().isCXX11OrLater()) {
-    return;
-  }
-
-  using Transformation = std::function<void(const llvm::Annotations &, syntax::TranslationUnit *)>;
-  auto CheckTransformation = [this](std::string Input, std::string Expected,
-                                    Transformation Transform) -> void {
+class MutationTest : public SyntaxTreeTest {
+protected:
+  using Transformation = std::function<void(const llvm::Annotations &, TranslationUnit *)>;
+  void CheckTransformation(Transformation Transform, std::string Input,
+                           std::string Expected) {
     llvm::Annotations Source(Input);
     auto *Root = buildTree(Source.code(), GetParam());
 
@@ -46,40 +43,32 @@ TEST_P(SyntaxTreeTest, Mutations) {
 
   // Removes the selected statement. Input should have exactly one selected
   // range and it should correspond to a single statement.
-  auto RemoveStatement = [this](const llvm::Annotations &Input,
-                                syntax::TranslationUnit *TU) {
-    auto *S = cast<syntax::Statement>(nodeByRange(Input.range(), TU));
+  Transformation RemoveStatement = [this](const llvm::Annotations &Input,
+                                          TranslationUnit *Root) {
+    auto *S = cast<syntax::Statement>(nodeByRange(Input.range(), Root));
     ASSERT_TRUE(S->canModify()) << "cannot remove a statement";
     syntax::removeStatement(*Arena, S);
     EXPECT_TRUE(S->isDetached());
     EXPECT_FALSE(S->isOriginal())
         << "node removed from tree cannot be marked as original";
   };
+};
 
-  std::vector<std::pair<std::string, std::string>>
-      Cases = {
-          {"void test() { [[100+100;]] test(); }", "void test() {  test(); }"},
-          {"void test() { if (true) [[{}]] else {} }",
-           "void test() { if (true) ; else {} }"},
-          {"void test() { [[;]] }", "void test() {  }"}};
-  for (const auto &C : Cases)
-    CheckTransformation(C.first, C.second, RemoveStatement);
-}
+INSTANTIATE_TEST_CASE_P(SyntaxTreeTests, MutationTest,
+                        ::testing::ValuesIn(allTestClangConfigs()), );
 
-TEST_P(SyntaxTreeTest, SynthesizedNodes) {
-  buildTree("", GetParam());
+TEST_P(MutationTest, RemoveStatement_InCompound) {
+  CheckTransformation(RemoveStatement, "void test() { [[100+100;]] test(); }",
+                      "void test() {  test(); }");
+}
 
-  auto *C = syntax::createPunctuation(*Arena, tok::comma);
-  ASSERT_NE(C, nullptr);
-  EXPECT_EQ(C->token()->kind(), tok::comma);
-  EXPECT_TRUE(C->canModify());
-  EXPECT_FALSE(C->isOriginal());
-  EXPECT_TRUE(C->isDetached());
+TEST_P(MutationTest, RemoveStatement_InCompound_Empty) {
+  CheckTransformation(RemoveStatement, "void test() { [[;]] }",
+                      "void test() {  }");
+}
 
-  auto *S = syntax::createEmptyStatement(*Arena);
-  ASSERT_NE(S, nullptr);
-  EXPECT_TRUE(S->canModify());
-  EXPECT_FALSE(S->isOriginal());
-  EXPECT_TRUE(S->isDetached());
+TEST_P(MutationTest, RemoveStatement_LeaveEmpty) {
+  CheckTransformation(RemoveStatement, "void test() { if (1) [[{}]] else {} }",
+                      "void test() { if (1) ; else {} }");
 }
 } // namespace
diff --git a/clang/unittests/Tooling/Syntax/SynthesisTest.cpp b/clang/unittests/Tooling/Syntax/SynthesisTest.cpp
new file mode 100644
index 0000000000000..a882714ccf33f
--- /dev/null
+++ b/clang/unittests/Tooling/Syntax/SynthesisTest.cpp
@@ -0,0 +1,149 @@
+//===- SynthesisTest.cpp --------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file tests synthesis API for syntax trees.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TreeTestBase.h"
+#include "clang/Tooling/Syntax/BuildTree.h"
+#include "clang/Tooling/Syntax/Nodes.h"
+#include "gtest/gtest.h"
+
+using namespace clang;
+using namespace clang::syntax;
+
+namespace {
+
+class SynthesisTest : public SyntaxTreeTest {
+protected:
+  ::testing::AssertionResult treeDumpEqual(syntax::Node *Root, StringRef Dump) {
+    if (!Root)
+      return ::testing::AssertionFailure()
+             << "Root was not built successfully.";
+
+    auto Actual = StringRef(Root->dump(Arena->getSourceManager())).trim().str();
+    auto Expected = Dump.trim().str();
+    // EXPECT_EQ shows the diff between the two strings if they are different.
+    EXPECT_EQ(Expected, Actual);
+    if (Actual != Expected) {
+      return ::testing::AssertionFailure();
+    }
+    return ::testing::AssertionSuccess();
+  }
+};
+
+INSTANTIATE_TEST_CASE_P(SynthesisTests, SynthesisTest,
+                        ::testing::ValuesIn(allTestClangConfigs()), );
+
+TEST_P(SynthesisTest, Leaf_Punctuation) {
+  buildTree("", GetParam());
+
+  auto *Leaf = createLeaf(*Arena, tok::comma);
+
+  EXPECT_TRUE(treeDumpEqual(Leaf, R"txt(
+',' Detached synthesized
+  )txt"));
+}
+
+TEST_P(SynthesisTest, Leaf_Keyword) {
+  buildTree("", GetParam());
+
+  auto *Leaf = createLeaf(*Arena, tok::kw_if);
+
+  EXPECT_TRUE(treeDumpEqual(Leaf, R"txt(
+'if' Detached synthesized
+  )txt"));
+}
+
+TEST_P(SynthesisTest, Leaf_Identifier) {
+  buildTree("", GetParam());
+
+  auto *Leaf = createLeaf(*Arena, tok::identifier, "a");
+
+  EXPECT_TRUE(treeDumpEqual(Leaf, R"txt(
+'a' Detached synthesized
+  )txt"));
+}
+
+TEST_P(SynthesisTest, Leaf_Number) {
+  buildTree("", GetParam());
+
+  auto *Leaf = createLeaf(*Arena, tok::numeric_constant, "1");
+
+  EXPECT_TRUE(treeDumpEqual(Leaf, R"txt(
+'1' Detached synthesized
+  )txt"));
+}
+
+TEST_P(SynthesisTest, Tree_Empty) {
+  buildTree("", GetParam());
+
+  auto *Tree = createTree(*Arena, {}, NodeKind::UnknownExpression);
+
+  EXPECT_TRUE(treeDumpEqual(Tree, R"txt(
+UnknownExpression Detached synthesized
+  )txt"));
+}
+
+TEST_P(SynthesisTest, Tree_Flat) {
+  buildTree("", GetParam());
+
+  auto *LeafLParen = createLeaf(*Arena, tok::l_paren);
+  auto *LeafRParen = createLeaf(*Arena, tok::r_paren);
+  auto *TreeParen = createTree(*Arena,
+                               {{LeafLParen, NodeRole::LeftHandSide},
+                                {LeafRParen, NodeRole::RightHandSide}},
+                               NodeKind::ParenExpression);
+
+  EXPECT_TRUE(treeDumpEqual(TreeParen, R"txt(
+ParenExpression Detached synthesized
+|-'(' LeftHandSide synthesized
+`-')' RightHandSide synthesized
+  )txt"));
+}
+
+TEST_P(SynthesisTest, Tree_OfTree) {
+  buildTree("", GetParam());
+
+  auto *Leaf1 = createLeaf(*Arena, tok::numeric_constant, "1");
+  auto *Int1 = createTree(*Arena, {{Leaf1, NodeRole::LiteralToken}},
+                          NodeKind::IntegerLiteralExpression);
+
+  auto *LeafPlus = createLeaf(*Arena, tok::plus);
+
+  auto *Leaf2 = createLeaf(*Arena, tok::numeric_constant, "2");
+  auto *Int2 = createTree(*Arena, {{Leaf2, NodeRole::LiteralToken}},
+                          NodeKind::IntegerLiteralExpression);
+
+  auto *TreeBinaryOperator = createTree(*Arena,
+                                        {{Int1, NodeRole::LeftHandSide},
+                                         {LeafPlus, NodeRole::OperatorToken},
+                                         {Int2, NodeRole::RightHandSide}},
+                                        NodeKind::BinaryOperatorExpression);
+
+  EXPECT_TRUE(treeDumpEqual(TreeBinaryOperator, R"txt(
+BinaryOperatorExpression Detached synthesized
+|-IntegerLiteralExpression LeftHandSide synthesized
+| `-'1' LiteralToken synthesized
+|-'+' OperatorToken synthesized
+`-IntegerLiteralExpression RightHandSide synthesized
+  `-'2' LiteralToken synthesized
+  )txt"));
+}
+
+TEST_P(SynthesisTest, Statement_EmptyStatement) {
+  buildTree("", GetParam());
+
+  auto *S = createEmptyStatement(*Arena);
+  EXPECT_TRUE(treeDumpEqual(S, R"txt(
+EmptyStatement Detached synthesized
+`-';' synthesized
+  )txt"));
+}
+} // namespace
diff --git a/clang/unittests/Tooling/Syntax/TreeTestBase.cpp b/clang/unittests/Tooling/Syntax/TreeTestBase.cpp
index ebee0115cb727..2305b78006b1e 100644
--- a/clang/unittests/Tooling/Syntax/TreeTestBase.cpp
+++ b/clang/unittests/Tooling/Syntax/TreeTestBase.cpp
@@ -38,13 +38,14 @@ namespace {
 ArrayRef<syntax::Token> tokens(syntax::Node *N) {
   assert(N->isOriginal() && "tokens of modified nodes are not well-defined");
   if (auto *L = dyn_cast<syntax::Leaf>(N))
-    return llvm::makeArrayRef(L->token(), 1);
+    return llvm::makeArrayRef(L->getToken(), 1);
   auto *T = cast<syntax::Tree>(N);
-  return llvm::makeArrayRef(T->firstLeaf()->token(),
-                            T->lastLeaf()->token() + 1);
+  return llvm::makeArrayRef(T->findFirstLeaf()->getToken(),
+                            T->findLastLeaf()->getToken() + 1);
 }
+} // namespace
 
-std::vector<TestClangConfig> allTestClangConfigs() {
+std::vector<TestClangConfig> clang::syntax::allTestClangConfigs() {
   std::vector<TestClangConfig> all_configs;
   for (TestLanguage lang : {Lang_C89, Lang_C99, Lang_CXX03, Lang_CXX11,
                             Lang_CXX14, Lang_CXX17, Lang_CXX20}) {
@@ -61,10 +62,6 @@ std::vector allTestClangConfigs() {
   return all_configs;
 }
 
-INSTANTIATE_TEST_CASE_P(SyntaxTreeTests, SyntaxTreeTest,
-                        testing::ValuesIn(allTestClangConfigs()), );
-} // namespace
-
 syntax::TranslationUnit *
 SyntaxTreeTest::buildTree(StringRef Code, const TestClangConfig &ClangConfig) {
   // FIXME: this code is almost the identical to the one in TokensTest. Share
@@ -161,62 +158,6 @@ SyntaxTreeTest::buildTree(StringRef Code, const TestClangConfig &ClangConfig) {
   return Root;
 }
 
-::testing::AssertionResult SyntaxTreeTest::treeDumpEqual(StringRef Code,
-                                                         StringRef Tree) {
-  SCOPED_TRACE(llvm::join(GetParam().getCommandLineArgs(), " "));
-
-  auto *Root = buildTree(Code, GetParam());
-  if (Diags->getClient()->getNumErrors() != 0) {
-    return ::testing::AssertionFailure()
-           << "Source file has syntax errors, they were printed to the test "
-              "log";
-  }
-  auto Actual = StringRef(Root->dump(Arena->sourceManager())).trim().str();
-  // EXPECT_EQ shows the diff between the two strings if they are different.
-  EXPECT_EQ(Tree.trim().str(), Actual);
-  if (Actual != Tree.trim().str()) {
-    return ::testing::AssertionFailure();
-  }
-  return ::testing::AssertionSuccess();
-}
-
-::testing::AssertionResult
-SyntaxTreeTest::treeDumpEqualOnAnnotations(StringRef CodeWithAnnotations,
-                                           ArrayRef<StringRef> TreeDumps) {
-  SCOPED_TRACE(llvm::join(GetParam().getCommandLineArgs(), " "));
-
-  auto AnnotatedCode = llvm::Annotations(CodeWithAnnotations);
-  auto *Root = buildTree(AnnotatedCode.code(), GetParam());
-
-  if (Diags->getClient()->getNumErrors() != 0) {
-    return ::testing::AssertionFailure()
-           << "Source file has syntax errors, they were printed to the test "
-              "log";
-  }
-
-  auto AnnotatedRanges = AnnotatedCode.ranges();
-  if (AnnotatedRanges.size() != TreeDumps.size()) {
-    return ::testing::AssertionFailure()
-           << "The number of annotated ranges in the source code is different "
-              "to the number of their corresponding tree dumps.";
-  }
-  bool Failed = false;
-  for (unsigned i = 0; i < AnnotatedRanges.size(); i++) {
-    auto *AnnotatedNode = nodeByRange(AnnotatedRanges[i], Root);
-    assert(AnnotatedNode);
-    auto AnnotatedNodeDump =
-        StringRef(AnnotatedNode->dump(Arena->sourceManager())).trim().str();
-    // EXPECT_EQ shows the diff between the two strings if they are different.
-    EXPECT_EQ(TreeDumps[i].trim().str(), AnnotatedNodeDump)
-        << "Dumps diverged for the code:\n"
-        << AnnotatedCode.code().slice(AnnotatedRanges[i].Begin,
-                                      AnnotatedRanges[i].End);
-    if (AnnotatedNodeDump != TreeDumps[i].trim().str())
-      Failed = true;
-  }
-  return Failed ? ::testing::AssertionFailure() : ::testing::AssertionSuccess();
-}
-
 syntax::Node *SyntaxTreeTest::nodeByRange(llvm::Annotations::Range R,
                                           syntax::Node *Root) {
   ArrayRef<syntax::Token> Toks = tokens(Root);
@@ -229,7 +170,7 @@ syntax::Node *SyntaxTreeTest::nodeByRange(llvm::Annotations::Range R,
   auto *T = dyn_cast<syntax::Tree>(Root);
   if (!T)
     return nullptr;
-  for (auto *C = T->firstChild(); C != nullptr; C = C->nextSibling()) {
+  for (auto *C = T->getFirstChild(); C != nullptr; C = C->getNextSibling()) {
     if (auto *Result = nodeByRange(R, C))
       return Result;
   }
diff --git a/clang/unittests/Tooling/Syntax/TreeTestBase.h b/clang/unittests/Tooling/Syntax/TreeTestBase.h
index c282bbf45fd39..8b0ca979dec3d 100644
--- a/clang/unittests/Tooling/Syntax/TreeTestBase.h
+++ b/clang/unittests/Tooling/Syntax/TreeTestBase.h
@@ -32,11 +32,6 @@ class SyntaxTreeTest : public ::testing::Test,
   TranslationUnit *buildTree(StringRef Code,
                              const TestClangConfig &ClangConfig);
 
-  ::testing::AssertionResult treeDumpEqual(StringRef Code, StringRef Tree);
-
-  ::testing::AssertionResult
-  treeDumpEqualOnAnnotations(StringRef CodeWithAnnotations,
-                             ArrayRef<StringRef> TreeDumps);
   /// Finds the deepest node in the tree that covers exactly \p R.
   /// FIXME: implement this efficiently and move to public syntax tree API.
   syntax::Node *nodeByRange(llvm::Annotations::Range R, syntax::Node *Root);
@@ -56,6 +51,8 @@ class SyntaxTreeTest : public ::testing::Test,
   std::unique_ptr<syntax::TokenBuffer> TB;
   std::unique_ptr<syntax::Arena> Arena;
 };
+
+std::vector<TestClangConfig> allTestClangConfigs();
 } // namespace syntax
 } // namespace clang
 #endif // LLVM_CLANG_UNITTESTS_TOOLING_SYNTAX_TREETESTBASE_H
diff --git a/clang/unittests/Tooling/ToolingTest.cpp b/clang/unittests/Tooling/ToolingTest.cpp
index cc6f453284d71..691a847d5a715 100644
--- a/clang/unittests/Tooling/ToolingTest.cpp
+++ b/clang/unittests/Tooling/ToolingTest.cpp
@@ -563,6 +563,40 @@ TEST(ClangToolTest, StripDependencyFileAdjusterShowIncludes) {
   EXPECT_TRUE(HasFlag("-c"));
 }
 
+// Check getClangStripDependencyFileAdjuster doesn't strip args when using the
+// MSVC cl.exe driver
+TEST(ClangToolTest, StripDependencyFileAdjusterMsvc) {
+  FixedCompilationDatabase Compilations(
+      "/", {"--driver-mode=cl", "-MD", "-MDd", "-MT", "-O1", "-MTd", "-MP"});
+
+  ClangTool Tool(Compilations, std::vector<std::string>(1, "/a.cc"));
+  Tool.mapVirtualFile("/a.cc", "void a() {}");
+
+  std::unique_ptr<FrontendActionFactory> Action(
+      newFrontendActionFactory<SyntaxOnlyAction>());
+
+  CommandLineArguments FinalArgs;
+  ArgumentsAdjuster CheckFlagsAdjuster =
+      [&FinalArgs](const CommandLineArguments &Args, StringRef /*unused*/) {
+        FinalArgs = Args;
+        return Args;
+      };
+  Tool.clearArgumentsAdjusters();
+  Tool.appendArgumentsAdjuster(getClangStripDependencyFileAdjuster());
+  Tool.appendArgumentsAdjuster(CheckFlagsAdjuster);
+  Tool.run(Action.get());
+
+  auto HasFlag = [&FinalArgs](const std::string &Flag) {
+    return llvm::find(FinalArgs, Flag) != FinalArgs.end();
+  };
+  EXPECT_TRUE(HasFlag("-MD"));
+  EXPECT_TRUE(HasFlag("-MDd"));
+  EXPECT_TRUE(HasFlag("-MT"));
+  EXPECT_TRUE(HasFlag("-O1"));
+  EXPECT_TRUE(HasFlag("-MTd"));
+  EXPECT_TRUE(HasFlag("-MP"));
+}
+
 // Check getClangStripPluginsAdjuster strips plugin related args.
 TEST(ClangToolTest, StripPluginsAdjuster) {
   FixedCompilationDatabase Compilations(
diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html
index e0c2cefcaa3fe..3c546eb409dee 100755
--- a/clang/www/cxx_status.html
+++ b/clang/www/cxx_status.html
@@ -987,7 +987,7 @@ <h2 id="cxx20">C++20 implementation status</h2>
     <tr>
       <td><tt>[[likely]]</tt> and <tt>[[unlikely]]</tt> attributes</td>
       <td><a href="https://wg21.link/p0479r5">P0479R5</a></td>
-      <td class="none" align="center">No</td>
+      <td class="partial" align="center">Clang 12 (partial)</td>
     </tr>
     <tr>
       <td><tt>typename</tt> optional in more contexts</td>
diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index 0a0294f937dba..9967e293749bd 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -81,34 +81,19 @@ if (COMPILER_RT_STANDALONE_BUILD)
     set_target_properties(intrinsics_gen PROPERTIES FOLDER "Compiler-RT Misc")
   endif()
 
-  if(CMAKE_VERSION VERSION_LESS 3.12)
-    # Find Python interpreter.
-    include(FindPythonInterp)
-    if(NOT PYTHONINTERP_FOUND)
-      message(FATAL_ERROR "
-      Unable to find Python interpreter required testing. Please install Python
-      or specify the PYTHON_EXECUTABLE CMake variable.")
+  find_package(Python3 COMPONENTS Interpreter)
+  if(NOT Python3_Interpreter_FOUND)
+    message(WARNING "Python3 not found, using python2 as a fallback")
+    find_package(Python2 COMPONENTS Interpreter REQUIRED)
+    if(Python2_VERSION VERSION_LESS 2.7)
+      message(SEND_ERROR "Python 2.7 or newer is required")
     endif()
 
+    # Treat python2 as python3
     add_executable(Python3::Interpreter IMPORTED)
     set_target_properties(Python3::Interpreter PROPERTIES
-      IMPORTED_LOCATION ${PYTHON_EXECUTABLE})
-    set(Python3_EXECUTABLE ${PYTHON_EXECUTABLE})
-  else()
-    find_package(Python3 COMPONENTS Interpreter)
-    if(NOT Python3_Interpreter_FOUND)
-      message(WARNING "Python3 not found, using python2 as a fallback")
-      find_package(Python2 COMPONENTS Interpreter REQUIRED)
-      if(Python2_VERSION VERSION_LESS 2.7)
-        message(SEND_ERROR "Python 2.7 or newer is required")
-      endif()
-
-      # Treat python2 as python3
-      add_executable(Python3::Interpreter IMPORTED)
-      set_target_properties(Python3::Interpreter PROPERTIES
-        IMPORTED_LOCATION ${Python2_EXECUTABLE})
-      set(Python3_EXECUTABLE ${Python2_EXECUTABLE})
-    endif()
+      IMPORTED_LOCATION ${Python2_EXECUTABLE})
+    set(Python3_EXECUTABLE ${Python2_EXECUTABLE})
   endif()
 
   # Ensure that fat libraries are built correctly on Darwin
diff --git a/compiler-rt/include/sanitizer/netbsd_syscall_hooks.h b/compiler-rt/include/sanitizer/netbsd_syscall_hooks.h
index 370da0ea72ed8..f661152ccbac7 100644
--- a/compiler-rt/include/sanitizer/netbsd_syscall_hooks.h
+++ b/compiler-rt/include/sanitizer/netbsd_syscall_hooks.h
@@ -20,8 +20,8 @@
 // DO NOT EDIT! THIS FILE HAS BEEN GENERATED!
 //
 // Generated with: generate_netbsd_syscalls.awk
-// Generated date: 2019-12-24
-// Generated from: syscalls.master,v 1.296 2019/09/22 22:59:39 christos Exp
+// Generated date: 2020-09-10
+// Generated from: syscalls.master,v 1.306 2020/08/14 00:53:16 riastradh Exp
 //
 //===----------------------------------------------------------------------===//
 #ifndef SANITIZER_NETBSD_SYSCALL_HOOKS_H
@@ -474,7 +474,12 @@
   __sanitizer_syscall_pre_impl_dup2((long long)(from), (long long)(to))
 #define __sanitizer_syscall_post_dup2(res, from, to)                          \
   __sanitizer_syscall_post_impl_dup2(res, (long long)(from), (long long)(to))
-/* syscall 91 has been skipped */
+#define __sanitizer_syscall_pre_getrandom(buf, buflen, flags)                 \
+  __sanitizer_syscall_pre_impl_getrandom(                                     \
+      (long long)(buf), (long long)(buflen), (long long)(flags))
+#define __sanitizer_syscall_post_getrandom(res, buf, buflen, flags)           \
+  __sanitizer_syscall_post_impl_getrandom(                                    \
+      res, (long long)(buf), (long long)(buflen), (long long)(flags))
 #define __sanitizer_syscall_pre_fcntl(fd, cmd, arg)                           \
   __sanitizer_syscall_pre_impl_fcntl((long long)(fd), (long long)(cmd),       \
                                      (long long)(arg))
@@ -849,9 +854,31 @@
 #define __sanitizer_syscall_post_sysarch(res, op, parms)                      \
   __sanitizer_syscall_post_impl_sysarch(res, (long long)(op),                 \
                                         (long long)(parms))
-/* syscall 166 has been skipped */
-/* syscall 167 has been skipped */
-/* syscall 168 has been skipped */
+#define __sanitizer_syscall_pre___futex(uaddr, op, val, timeout, uaddr2, val2, \
+                                        val3)                                 \
+  __sanitizer_syscall_pre_impl___futex((long long)(uaddr), (long long)(op),   \
+                                       (long long)(val), (long long)(timeout), \
+                                       (long long)(uaddr2), (long long)(val2), \
+                                       (long long)(val3))
+#define __sanitizer_syscall_post___futex(res, uaddr, op, val, timeout, uaddr2, \
+                                         val2, val3)                          \
+  __sanitizer_syscall_post_impl___futex(                                      \
+      res, (long long)(uaddr), (long long)(op), (long long)(val),             \
+      (long long)(timeout), (long long)(uaddr2), (long long)(val2),           \
+      (long long)(val3))
+#define __sanitizer_syscall_pre___futex_set_robust_list(head, len)            \
+  __sanitizer_syscall_pre_impl___futex_set_robust_list((long long)(head),     \
+                                                       (long long)(len))
+#define __sanitizer_syscall_post___futex_set_robust_list(res, head, len)      \
+  __sanitizer_syscall_post_impl___futex_set_robust_list(                      \
+      res, (long long)(head), (long long)(len))
+#define __sanitizer_syscall_pre___futex_get_robust_list(lwpid, headp, lenp)   \
+  __sanitizer_syscall_pre_impl___futex_get_robust_list(                       \
+      (long long)(lwpid), (long long)(headp), (long long)(lenp))
+#define __sanitizer_syscall_post___futex_get_robust_list(res, lwpid, headp,   \
+                                                         lenp)                \
+  __sanitizer_syscall_post_impl___futex_get_robust_list(                      \
+      res, (long long)(lwpid), (long long)(headp), (long long)(lenp))
 #if !defined(_LP64)
 #define __sanitizer_syscall_pre_compat_10_osemsys(which, a2, a3, a4, a5)      \
   __sanitizer_syscall_pre_impl_compat_10_osemsys(                             \
@@ -2731,6 +2758,83 @@
   __sanitizer_syscall_post_impl___fhstatvfs190(                               \
       res, (long long)(fhp), (long long)(fh_size), (long long)(buf),          \
       (long long)(flags))
+#define __sanitizer_syscall_pre___acl_get_link(path, type, aclp)              \
+  __sanitizer_syscall_pre_impl___acl_get_link(                                \
+      (long long)(path), (long long)(type), (long long)(aclp))
+#define __sanitizer_syscall_post___acl_get_link(res, path, type, aclp)        \
+  __sanitizer_syscall_post_impl___acl_get_link(                               \
+      res, (long long)(path), (long long)(type), (long long)(aclp))
+#define __sanitizer_syscall_pre___acl_set_link(path, type, aclp)              \
+  __sanitizer_syscall_pre_impl___acl_set_link(                                \
+      (long long)(path), (long long)(type), (long long)(aclp))
+#define __sanitizer_syscall_post___acl_set_link(res, path, type, aclp)        \
+  __sanitizer_syscall_post_impl___acl_set_link(                               \
+      res, (long long)(path), (long long)(type), (long long)(aclp))
+#define __sanitizer_syscall_pre___acl_delete_link(path, type)                 \
+  __sanitizer_syscall_pre_impl___acl_delete_link((long long)(path),           \
+                                                 (long long)(type))
+#define __sanitizer_syscall_post___acl_delete_link(res, path, type)           \
+  __sanitizer_syscall_post_impl___acl_delete_link(res, (long long)(path),     \
+                                                  (long long)(type))
+#define __sanitizer_syscall_pre___acl_aclcheck_link(path, type, aclp)         \
+  __sanitizer_syscall_pre_impl___acl_aclcheck_link(                           \
+      (long long)(path), (long long)(type), (long long)(aclp))
+#define __sanitizer_syscall_post___acl_aclcheck_link(res, path, type, aclp)   \
+  __sanitizer_syscall_post_impl___acl_aclcheck_link(                          \
+      res, (long long)(path), (long long)(type), (long long)(aclp))
+#define __sanitizer_syscall_pre___acl_get_file(path, type, aclp)              \
+  __sanitizer_syscall_pre_impl___acl_get_file(                                \
+      (long long)(path), (long long)(type), (long long)(aclp))
+#define __sanitizer_syscall_post___acl_get_file(res, path, type, aclp)        \
+  __sanitizer_syscall_post_impl___acl_get_file(                               \
+      res, (long long)(path), (long long)(type), (long long)(aclp))
+#define __sanitizer_syscall_pre___acl_set_file(path, type, aclp)              \
+  __sanitizer_syscall_pre_impl___acl_set_file(                                \
+      (long long)(path), (long long)(type), (long long)(aclp))
+#define __sanitizer_syscall_post___acl_set_file(res, path, type, aclp)        \
+  __sanitizer_syscall_post_impl___acl_set_file(                               \
+      res, (long long)(path), (long long)(type), (long long)(aclp))
+#define __sanitizer_syscall_pre___acl_get_fd(filedes, type, aclp)             \
+  __sanitizer_syscall_pre_impl___acl_get_fd(                                  \
+      (long long)(filedes), (long long)(type), (long long)(aclp))
+#define __sanitizer_syscall_post___acl_get_fd(res, filedes, type, aclp)       \
+  __sanitizer_syscall_post_impl___acl_get_fd(                                 \
+      res, (long long)(filedes), (long long)(type), (long long)(aclp))
+#define __sanitizer_syscall_pre___acl_set_fd(filedes, type, aclp)             \
+  __sanitizer_syscall_pre_impl___acl_set_fd(                                  \
+      (long long)(filedes), (long long)(type), (long long)(aclp))
+#define __sanitizer_syscall_post___acl_set_fd(res, filedes, type, aclp)       \
+  __sanitizer_syscall_post_impl___acl_set_fd(                                 \
+      res, (long long)(filedes), (long long)(type), (long long)(aclp))
+#define __sanitizer_syscall_pre___acl_delete_file(path, type)                 \
+  __sanitizer_syscall_pre_impl___acl_delete_file((long long)(path),           \
+                                                 (long long)(type))
+#define __sanitizer_syscall_post___acl_delete_file(res, path, type)           \
+  __sanitizer_syscall_post_impl___acl_delete_file(res, (long long)(path),     \
+                                                  (long long)(type))
+#define __sanitizer_syscall_pre___acl_delete_fd(filedes, type)                \
+  __sanitizer_syscall_pre_impl___acl_delete_fd((long long)(filedes),          \
+                                               (long long)(type))
+#define __sanitizer_syscall_post___acl_delete_fd(res, filedes, type)          \
+  __sanitizer_syscall_post_impl___acl_delete_fd(res, (long long)(filedes),    \
+                                                (long long)(type))
+#define __sanitizer_syscall_pre___acl_aclcheck_file(path, type, aclp)         \
+  __sanitizer_syscall_pre_impl___acl_aclcheck_file(                           \
+      (long long)(path), (long long)(type), (long long)(aclp))
+#define __sanitizer_syscall_post___acl_aclcheck_file(res, path, type, aclp)   \
+  __sanitizer_syscall_post_impl___acl_aclcheck_file(                          \
+      res, (long long)(path), (long long)(type), (long long)(aclp))
+#define __sanitizer_syscall_pre___acl_aclcheck_fd(filedes, type, aclp)        \
+  __sanitizer_syscall_pre_impl___acl_aclcheck_fd(                             \
+ (long long)(filedes), (long long)(type), (long long)(aclp)) +#define __sanitizer_syscall_post___acl_aclcheck_fd(res, filedes, type, aclp) \ + __sanitizer_syscall_post_impl___acl_aclcheck_fd( \ + res, (long long)(filedes), (long long)(type), (long long)(aclp)) +#define __sanitizer_syscall_pre_lpathconf(path, name) \ + __sanitizer_syscall_pre_impl_lpathconf((long long)(path), (long long)(name)) +#define __sanitizer_syscall_post_lpathconf(res, path, name) \ + __sanitizer_syscall_post_impl_lpathconf(res, (long long)(path), \ + (long long)(name)) /* Compat with older releases */ #define __sanitizer_syscall_pre_getvfsstat \ @@ -3088,7 +3192,10 @@ void __sanitizer_syscall_post_impl_compat_43_ogetdtablesize(long long res); void __sanitizer_syscall_pre_impl_dup2(long long from, long long to); void __sanitizer_syscall_post_impl_dup2(long long res, long long from, long long to); -/* syscall 91 has been skipped */ +void __sanitizer_syscall_pre_impl_getrandom(long long buf, long long buflen, + long long flags); +void __sanitizer_syscall_post_impl_getrandom(long long res, long long buf, + long long buflen, long long flags); void __sanitizer_syscall_pre_impl_fcntl(long long fd, long long cmd, long long arg); void __sanitizer_syscall_post_impl_fcntl(long long res, long long fd, @@ -3380,9 +3487,26 @@ void __sanitizer_syscall_post_impl_compat_09_ouname(long long res, void __sanitizer_syscall_pre_impl_sysarch(long long op, long long parms); void __sanitizer_syscall_post_impl_sysarch(long long res, long long op, long long parms); -/* syscall 166 has been skipped */ -/* syscall 167 has been skipped */ -/* syscall 168 has been skipped */ +void __sanitizer_syscall_pre_impl___futex(long long uaddr, long long op, + long long val, long long timeout, + long long uaddr2, long long val2, + long long val3); +void __sanitizer_syscall_post_impl___futex(long long res, long long uaddr, + long long op, long long val, + long long timeout, long long uaddr2, + long long val2, long long val3); +void __sanitizer_syscall_pre_impl___futex_set_robust_list(long long head, + long long len); +void __sanitizer_syscall_post_impl___futex_set_robust_list(long long res, + long long head, + long long len); +void __sanitizer_syscall_pre_impl___futex_get_robust_list(long long lwpid, + long long headp, + long long lenp); +void __sanitizer_syscall_post_impl___futex_get_robust_list(long long res, + long long lwpid, + long long headp, + long long lenp); #if !defined(_LP64) void __sanitizer_syscall_pre_impl_compat_10_osemsys(long long which, long long a2, long long a3, @@ -4802,6 +4926,75 @@ void __sanitizer_syscall_post_impl___fhstatvfs190(long long res, long long fhp, long long fh_size, long long buf, long long flags); +void __sanitizer_syscall_pre_impl___acl_get_link(long long path, long long type, + long long aclp); +void __sanitizer_syscall_post_impl___acl_get_link(long long res, long long path, + long long type, + long long aclp); +void __sanitizer_syscall_pre_impl___acl_set_link(long long path, long long type, + long long aclp); +void __sanitizer_syscall_post_impl___acl_set_link(long long res, long long path, + long long type, + long long aclp); +void __sanitizer_syscall_pre_impl___acl_delete_link(long long path, + long long type); +void __sanitizer_syscall_post_impl___acl_delete_link(long long res, + long long path, + long long type); +void __sanitizer_syscall_pre_impl___acl_aclcheck_link(long long path, + long long type, + long long aclp); +void __sanitizer_syscall_post_impl___acl_aclcheck_link(long long res, + long long path, + long 
long type, + long long aclp); +void __sanitizer_syscall_pre_impl___acl_get_file(long long path, long long type, + long long aclp); +void __sanitizer_syscall_post_impl___acl_get_file(long long res, long long path, + long long type, + long long aclp); +void __sanitizer_syscall_pre_impl___acl_set_file(long long path, long long type, + long long aclp); +void __sanitizer_syscall_post_impl___acl_set_file(long long res, long long path, + long long type, + long long aclp); +void __sanitizer_syscall_pre_impl___acl_get_fd(long long filedes, + long long type, long long aclp); +void __sanitizer_syscall_post_impl___acl_get_fd(long long res, + long long filedes, + long long type, long long aclp); +void __sanitizer_syscall_pre_impl___acl_set_fd(long long filedes, + long long type, long long aclp); +void __sanitizer_syscall_post_impl___acl_set_fd(long long res, + long long filedes, + long long type, long long aclp); +void __sanitizer_syscall_pre_impl___acl_delete_file(long long path, + long long type); +void __sanitizer_syscall_post_impl___acl_delete_file(long long res, + long long path, + long long type); +void __sanitizer_syscall_pre_impl___acl_delete_fd(long long filedes, + long long type); +void __sanitizer_syscall_post_impl___acl_delete_fd(long long res, + long long filedes, + long long type); +void __sanitizer_syscall_pre_impl___acl_aclcheck_file(long long path, + long long type, + long long aclp); +void __sanitizer_syscall_post_impl___acl_aclcheck_file(long long res, + long long path, + long long type, + long long aclp); +void __sanitizer_syscall_pre_impl___acl_aclcheck_fd(long long filedes, + long long type, + long long aclp); +void __sanitizer_syscall_post_impl___acl_aclcheck_fd(long long res, + long long filedes, + long long type, + long long aclp); +void __sanitizer_syscall_pre_impl_lpathconf(long long path, long long name); +void __sanitizer_syscall_post_impl_lpathconf(long long res, long long path, + long long name); #ifdef __cplusplus } // extern "C" diff --git a/compiler-rt/lib/asan/asan_allocator.cpp b/compiler-rt/lib/asan/asan_allocator.cpp index 7334b7200fc4c..58b496a3ca4b1 100644 --- a/compiler-rt/lib/asan/asan_allocator.cpp +++ b/compiler-rt/lib/asan/asan_allocator.cpp @@ -51,6 +51,22 @@ static u32 RZSize2Log(u32 rz_size) { static AsanAllocator &get_allocator(); +static void AtomicContextStore(volatile atomic_uint64_t *atomic_context, + u32 tid, u32 stack) { + u64 context = tid; + context <<= 32; + context += stack; + atomic_store(atomic_context, context, memory_order_relaxed); +} + +static void AtomicContextLoad(const volatile atomic_uint64_t *atomic_context, + u32 &tid, u32 &stack) { + u64 context = atomic_load(atomic_context, memory_order_relaxed); + stack = context; + context >>= 32; + tid = context; +} + // The memory chunk allocated from the underlying allocator looks like this: // L L L L L L H H U U U U U U R R // L -- left redzone words (0 or more bytes) @@ -68,29 +84,59 @@ static AsanAllocator &get_allocator(); // ---------------------| // M -- magic value kAllocBegMagic // B -- address of ChunkHeader pointing to the first 'H' -static const uptr kAllocBegMagic = 0xCC6E96B9; -struct ChunkHeader { +class ChunkHeader { + public: atomic_uint8_t chunk_state; - u8 from_memalign : 1; u8 alloc_type : 2; - u8 rz_log : 3; u8 lsan_tag : 2; - // This field is used for small sizes. For large sizes it is equal to - // SizeClassMap::kMaxSize and the actual size is stored in the - // SecondaryAllocator's metadata. 
- u32 user_requested_size : 29; + // align < 8 -> 0 // else -> log2(min(align, 512)) - 2 - u32 user_requested_alignment_log : 3; - u32 alloc_tid; - atomic_uint32_t alloc_context_id; + u8 user_requested_alignment_log : 3; + + private: + u16 user_requested_size_hi; + u32 user_requested_size_lo; + atomic_uint64_t alloc_context_id; + + public: + uptr UsedSize() const { + uptr R = user_requested_size_lo; + if (sizeof(uptr) > sizeof(user_requested_size_lo)) + R += (uptr)user_requested_size_hi << (8 * sizeof(user_requested_size_lo)); + return R; + } + + void SetUsedSize(uptr size) { + user_requested_size_lo = size; + if (sizeof(uptr) > sizeof(user_requested_size_lo)) { + size >>= (8 * sizeof(user_requested_size_lo)); + user_requested_size_hi = size; + CHECK_EQ(user_requested_size_hi, size); + } + } + + void SetAllocContext(u32 tid, u32 stack) { + AtomicContextStore(&alloc_context_id, tid, stack); + } + + void GetAllocContext(u32 &tid, u32 &stack) const { + AtomicContextLoad(&alloc_context_id, tid, stack); + } }; -struct ChunkBase : ChunkHeader { - // Header2, intersects with user memory. - u32 free_context_id; - u32 free_tid; +class ChunkBase : public ChunkHeader { + atomic_uint64_t free_context_id; + + public: + void SetFreeContext(u32 tid, u32 stack) { + AtomicContextStore(&free_context_id, tid, stack); + } + + void GetFreeContext(u32 &tid, u32 &stack) const { + AtomicContextLoad(&free_context_id, tid, stack); + } }; static const uptr kChunkHeaderSize = sizeof(ChunkHeader); @@ -109,25 +155,39 @@ enum { CHUNK_QUARANTINE = 3, }; -struct AsanChunk: ChunkBase { +class AsanChunk : public ChunkBase { + public: uptr Beg() { return reinterpret_cast(this) + kChunkHeaderSize; } - uptr UsedSize(bool locked_version = false) { - if (user_requested_size != SizeClassMap::kMaxSize) - return user_requested_size; - return *reinterpret_cast( - get_allocator().GetMetaData(AllocBeg(locked_version))); + bool AddrIsInside(uptr addr) { + return (addr >= Beg()) && (addr < Beg() + UsedSize()); } - void *AllocBeg(bool locked_version = false) { - if (from_memalign) { - if (locked_version) - return get_allocator().GetBlockBeginFastLocked( - reinterpret_cast(this)); - return get_allocator().GetBlockBegin(reinterpret_cast(this)); - } - return reinterpret_cast(Beg() - RZLog2Size(rz_log)); +}; + +class LargeChunkHeader { + static constexpr uptr kAllocBegMagic = + FIRST_32_SECOND_64(0xCC6E96B9, 0xCC6E96B9CC6E96B9ULL); + atomic_uintptr_t magic; + AsanChunk *chunk_header; + + public: + AsanChunk *Get() const { + return atomic_load(&magic, memory_order_acquire) == kAllocBegMagic + ? chunk_header + : nullptr; } - bool AddrIsInside(uptr addr, bool locked_version = false) { - return (addr >= Beg()) && (addr < Beg() + UsedSize(locked_version)); + + void Set(AsanChunk *p) { + if (p) { + chunk_header = p; + atomic_store(&magic, kAllocBegMagic, memory_order_release); + return; + } + + uptr old = kAllocBegMagic; + if (!atomic_compare_exchange_strong(&magic, &old, 0, + memory_order_release)) { + CHECK_EQ(old, kAllocBegMagic); + } } }; @@ -138,26 +198,22 @@ struct QuarantineCallback { } void Recycle(AsanChunk *m) { + void *p = get_allocator().GetBlockBegin(m); + if (p != m) { + // Clear the magic value, as allocator internals may overwrite the + // contents of deallocated chunk, confusing GetAsanChunk lookup. 
+ reinterpret_cast(p)->Set(nullptr); + } + u8 old_chunk_state = CHUNK_QUARANTINE; if (!atomic_compare_exchange_strong(&m->chunk_state, &old_chunk_state, CHUNK_INVALID, memory_order_acquire)) { CHECK_EQ(old_chunk_state, CHUNK_QUARANTINE); } - CHECK_NE(m->alloc_tid, kInvalidTid); - CHECK_NE(m->free_tid, kInvalidTid); PoisonShadow(m->Beg(), RoundUpTo(m->UsedSize(), SHADOW_GRANULARITY), kAsanHeapLeftRedzoneMagic); - void *p = reinterpret_cast(m->AllocBeg()); - if (p != m) { - uptr *alloc_magic = reinterpret_cast(p); - CHECK_EQ(alloc_magic[0], kAllocBegMagic); - // Clear the magic value, as allocator internals may overwrite the - // contents of deallocated chunk, confusing GetAsanChunk lookup. - alloc_magic[0] = 0; - CHECK_EQ(alloc_magic[1], reinterpret_cast(m)); - } // Statistics. AsanStats &thread_stats = GetCurrentThreadStats(); @@ -302,11 +358,11 @@ struct Allocator { // This could be a user-facing chunk (with redzones), or some internal // housekeeping chunk, like TransferBatch. Start by assuming the former. AsanChunk *ac = GetAsanChunk((void *)chunk); - uptr allocated_size = allocator.GetActuallyAllocatedSize((void *)ac); - if (atomic_load(&ac->chunk_state, memory_order_acquire) == - CHUNK_ALLOCATED) { + uptr allocated_size = allocator.GetActuallyAllocatedSize((void *)chunk); + if (ac && atomic_load(&ac->chunk_state, memory_order_acquire) == + CHUNK_ALLOCATED) { uptr beg = ac->Beg(); - uptr end = ac->Beg() + ac->UsedSize(true); + uptr end = ac->Beg() + ac->UsedSize(); uptr chunk_end = chunk + allocated_size; if (chunk < beg && beg < end && end <= chunk_end) { // Looks like a valid AsanChunk in use, poison redzones only. @@ -354,17 +410,18 @@ struct Allocator { // -------------------- Helper methods. ------------------------- uptr ComputeRZLog(uptr user_requested_size) { - u32 rz_log = - user_requested_size <= 64 - 16 ? 0 : - user_requested_size <= 128 - 32 ? 1 : - user_requested_size <= 512 - 64 ? 2 : - user_requested_size <= 4096 - 128 ? 3 : - user_requested_size <= (1 << 14) - 256 ? 4 : - user_requested_size <= (1 << 15) - 512 ? 5 : - user_requested_size <= (1 << 16) - 1024 ? 6 : 7; - u32 min_rz = atomic_load(&min_redzone, memory_order_acquire); - u32 max_rz = atomic_load(&max_redzone, memory_order_acquire); - return Min(Max(rz_log, RZSize2Log(min_rz)), RZSize2Log(max_rz)); + u32 rz_log = user_requested_size <= 64 - 16 ? 0 + : user_requested_size <= 128 - 32 ? 1 + : user_requested_size <= 512 - 64 ? 2 + : user_requested_size <= 4096 - 128 ? 3 + : user_requested_size <= (1 << 14) - 256 ? 4 + : user_requested_size <= (1 << 15) - 512 ? 5 + : user_requested_size <= (1 << 16) - 1024 ? 6 + : 7; + u32 hdr_log = RZSize2Log(RoundUpToPowerOfTwo(sizeof(ChunkHeader))); + u32 min_log = RZSize2Log(atomic_load(&min_redzone, memory_order_acquire)); + u32 max_log = RZSize2Log(atomic_load(&max_redzone, memory_order_acquire)); + return Min(Max(rz_log, Max(min_log, hdr_log)), Max(max_log, hdr_log)); } static uptr ComputeUserRequestedAlignmentLog(uptr user_requested_alignment) { @@ -384,6 +441,10 @@ struct Allocator { // We have an address between two chunks, and we want to report just one. AsanChunk *ChooseChunk(uptr addr, AsanChunk *left_chunk, AsanChunk *right_chunk) { + if (!left_chunk) + return right_chunk; + if (!right_chunk) + return left_chunk; // Prefer an allocated chunk over freed chunk and freed chunk // over available chunk. 
u8 left_state = atomic_load(&left_chunk->chunk_state, memory_order_relaxed); @@ -414,8 +475,8 @@ struct Allocator { if (atomic_load(&m->chunk_state, memory_order_acquire) != CHUNK_ALLOCATED) return false; if (m->Beg() != addr) return false; - atomic_store(&m->alloc_context_id, StackDepotPut(*stack), - memory_order_relaxed); + AsanThread *t = GetCurrentThread(); + m->SetAllocContext(t ? t->tid() : 0, StackDepotPut(*stack)); return true; } @@ -452,13 +513,10 @@ struct Allocator { uptr needed_size = rounded_size + rz_size; if (alignment > min_alignment) needed_size += alignment; - bool using_primary_allocator = true; // If we are allocating from the secondary allocator, there will be no // automatic right redzone, so add the right redzone manually. - if (!PrimaryAllocator::CanAllocate(needed_size, alignment)) { + if (!PrimaryAllocator::CanAllocate(needed_size, alignment)) needed_size += rz_size; - using_primary_allocator = false; - } CHECK(IsAligned(needed_size, min_alignment)); if (size > kMaxAllowedMallocSize || needed_size > kMaxAllowedMallocSize || size > max_user_defined_malloc_size) { @@ -500,8 +558,7 @@ struct Allocator { uptr alloc_beg = reinterpret_cast(allocated); uptr alloc_end = alloc_beg + needed_size; - uptr beg_plus_redzone = alloc_beg + rz_size; - uptr user_beg = beg_plus_redzone; + uptr user_beg = alloc_beg + rz_size; if (!IsAligned(user_beg, alignment)) user_beg = RoundUpTo(user_beg, alignment); uptr user_end = user_beg + size; @@ -509,31 +566,11 @@ struct Allocator { uptr chunk_beg = user_beg - kChunkHeaderSize; AsanChunk *m = reinterpret_cast(chunk_beg); m->alloc_type = alloc_type; - m->rz_log = rz_log; - u32 alloc_tid = t ? t->tid() : 0; - m->alloc_tid = alloc_tid; - CHECK_EQ(alloc_tid, m->alloc_tid); // Does alloc_tid fit into the bitfield? - m->from_memalign = user_beg != beg_plus_redzone; - if (alloc_beg != chunk_beg) { - CHECK_LE(alloc_beg + 2 * sizeof(uptr), chunk_beg); - reinterpret_cast(alloc_beg)[0] = kAllocBegMagic; - reinterpret_cast(alloc_beg)[1] = chunk_beg; - } - if (using_primary_allocator) { - CHECK(size); - m->user_requested_size = size; - CHECK(allocator.FromPrimary(allocated)); - } else { - CHECK(!allocator.FromPrimary(allocated)); - m->user_requested_size = SizeClassMap::kMaxSize; - uptr *meta = reinterpret_cast(allocator.GetMetaData(allocated)); - meta[0] = size; - meta[1] = chunk_beg; - } + CHECK(size); + m->SetUsedSize(size); m->user_requested_alignment_log = user_requested_alignment_log; - atomic_store(&m->alloc_context_id, StackDepotPut(*stack), - memory_order_relaxed); + m->SetAllocContext(t ? t->tid() : 0, StackDepotPut(*stack)); uptr size_rounded_down_to_granularity = RoundDownTo(size, SHADOW_GRANULARITY); @@ -567,6 +604,10 @@ struct Allocator { #endif // Must be the last mutation of metadata in this function. atomic_store(&m->chunk_state, CHUNK_ALLOCATED, memory_order_release); + if (alloc_beg != chunk_beg) { + CHECK_LE(alloc_beg + sizeof(LargeChunkHeader), chunk_beg); + reinterpret_cast(alloc_beg)->Set(m); + } ASAN_MALLOC_HOOK(res, size); return res; } @@ -586,8 +627,7 @@ struct Allocator { } CHECK_EQ(CHUNK_ALLOCATED, old_chunk_state); // It was a user data. - m->free_tid = kInvalidTid; - m->free_context_id = 0; + m->SetFreeContext(kInvalidTid, 0); return true; } @@ -597,8 +637,7 @@ struct Allocator { CHECK_EQ(atomic_load(&m->chunk_state, memory_order_relaxed), CHUNK_QUARANTINE); AsanThread *t = GetCurrentThread(); - m->free_tid = t ? t->tid() : 0; - m->free_context_id = StackDepotPut(*stack); + m->SetFreeContext(t ? 
t->tid() : 0, StackDepotPut(*stack)); Flags &fl = *flags(); if (fl.max_free_fill_size > 0) { @@ -730,41 +769,24 @@ struct Allocator { // -------------------------- Chunk lookup ---------------------- // Assumes alloc_beg == allocator.GetBlockBegin(alloc_beg). + // Returns nullptr if AsanChunk is not yet initialized just after + // get_allocator().Allocate(), or is being destroyed just before + // get_allocator().Deallocate(). AsanChunk *GetAsanChunk(void *alloc_beg) { if (!alloc_beg) return nullptr; - if (!allocator.FromPrimary(alloc_beg)) { - uptr *meta = reinterpret_cast(allocator.GetMetaData(alloc_beg)); - AsanChunk *m = reinterpret_cast(meta[1]); - return m; - } - uptr *alloc_magic = reinterpret_cast(alloc_beg); - if (alloc_magic[0] == kAllocBegMagic) - return reinterpret_cast(alloc_magic[1]); - // FIXME: This is either valid small chunk with tiny redzone or invalid - // chunk which is beeing allocated/deallocated. The latter case should - // return nullptr like secondary allocator does. - return reinterpret_cast(alloc_beg); - } - - AsanChunk *GetAsanChunkDebug(void *alloc_beg) { - if (!alloc_beg) - return nullptr; - if (!allocator.FromPrimary(alloc_beg)) { - uptr *meta = reinterpret_cast(allocator.GetMetaData(alloc_beg)); - AsanChunk *m = reinterpret_cast(meta[1]); - Printf("GetAsanChunkDebug1 alloc_beg %p meta %p m %p\n", alloc_beg, meta, - m); - return m; + AsanChunk *p = reinterpret_cast(alloc_beg)->Get(); + if (!p) { + if (!allocator.FromPrimary(alloc_beg)) + return nullptr; + p = reinterpret_cast(alloc_beg); } - uptr *alloc_magic = reinterpret_cast(alloc_beg); - Printf( - "GetAsanChunkDebug2 alloc_beg %p alloc_magic %p alloc_magic[0] %p " - "alloc_magic[1] %p\n", - alloc_beg, alloc_magic, alloc_magic[0], alloc_magic[1]); - if (alloc_magic[0] == kAllocBegMagic) - return reinterpret_cast(alloc_magic[1]); - return reinterpret_cast(alloc_beg); + u8 state = atomic_load(&p->chunk_state, memory_order_relaxed); + // It does not guaranty that Chunk is initialized, but it's + // definitely not for any other value. + if (state == CHUNK_ALLOCATED || state == CHUNK_QUARANTINE) + return p; + return nullptr; } AsanChunk *GetAsanChunkByAddr(uptr p) { @@ -779,14 +801,6 @@ struct Allocator { return GetAsanChunk(alloc_beg); } - AsanChunk *GetAsanChunkByAddrFastLockedDebug(uptr p) { - void *alloc_beg = - allocator.GetBlockBeginFastLockedDebug(reinterpret_cast(p)); - Printf("GetAsanChunkByAddrFastLockedDebug p %p alloc_beg %p\n", p, - alloc_beg); - return GetAsanChunkDebug(alloc_beg); - } - uptr AllocationSize(uptr p) { AsanChunk *m = GetAsanChunkByAddr(p); if (!m) return 0; @@ -798,9 +812,8 @@ struct Allocator { AsanChunkView FindHeapChunkByAddress(uptr addr) { AsanChunk *m1 = GetAsanChunkByAddr(addr); - if (!m1) return AsanChunkView(m1); sptr offset = 0; - if (AsanChunkView(m1).AddrIsAtLeft(addr, 1, &offset)) { + if (!m1 || AsanChunkView(m1).AddrIsAtLeft(addr, 1, &offset)) { // The address is in the chunk's left redzone, so maybe it is actually // a right buffer overflow from the other chunk to the left. // Search a bit to the left to see if there is another chunk. 
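[Editor's note] The ChunkHeader rework in the hunks above hinges on AtomicContextStore()/AtomicContextLoad(): the thread id and the StackDepot stack id are packed into a single u64 (tid in the high 32 bits, stack id in the low 32 bits) so both values are published and read with one relaxed 64-bit atomic access, rather than two separate stores a racing reader could observe half-updated. A self-contained sketch of the same packing in portable C++; the names are illustrative:

```cpp
#include <atomic>
#include <cstdint>

// Publish tid and stack id with a single 64-bit store: tid occupies the
// high 32 bits, the stack id the low 32 bits.
static void StoreContext(std::atomic<uint64_t> &Ctx, uint32_t Tid,
                         uint32_t Stack) {
  uint64_t Packed = (static_cast<uint64_t>(Tid) << 32) | Stack;
  Ctx.store(Packed, std::memory_order_relaxed);
}

// Read both halves back from one 64-bit load.
static void LoadContext(const std::atomic<uint64_t> &Ctx, uint32_t &Tid,
                        uint32_t &Stack) {
  uint64_t Packed = Ctx.load(std::memory_order_relaxed);
  Stack = static_cast<uint32_t>(Packed);      // low half
  Tid = static_cast<uint32_t>(Packed >> 32);  // high half
}
```

This is why the patch can drop the separate alloc_tid/free_tid fields: a consistent (tid, stack) pair is always read together.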
@@ -874,10 +887,23 @@ uptr AsanChunkView::UsedSize() const { return chunk_->UsedSize(); } u32 AsanChunkView::UserRequestedAlignment() const { return Allocator::ComputeUserAlignment(chunk_->user_requested_alignment_log); } -uptr AsanChunkView::AllocTid() const { return chunk_->alloc_tid; } + +uptr AsanChunkView::AllocTid() const { + u32 tid = 0; + u32 stack = 0; + chunk_->GetAllocContext(tid, stack); + return tid; +} + uptr AsanChunkView::FreeTid() const { - return IsQuarantined() ? chunk_->free_tid : kInvalidTid; + if (!IsQuarantined()) + return kInvalidTid; + u32 tid = 0; + u32 stack = 0; + chunk_->GetFreeContext(tid, stack); + return tid; } + AllocType AsanChunkView::GetAllocType() const { return (AllocType)chunk_->alloc_type; } @@ -890,10 +916,19 @@ static StackTrace GetStackTraceFromId(u32 id) { } u32 AsanChunkView::GetAllocStackId() const { - return atomic_load(&chunk_->alloc_context_id, memory_order_relaxed); + u32 tid = 0; + u32 stack = 0; + chunk_->GetAllocContext(tid, stack); + return stack; } + u32 AsanChunkView::GetFreeStackId() const { - return IsQuarantined() ? chunk_->free_context_id : 0; + if (!IsQuarantined()) + return 0; + u32 tid = 0; + u32 stack = 0; + chunk_->GetFreeContext(tid, stack); + return stack; } StackTrace AsanChunkView::GetAllocStack() const { @@ -1075,53 +1110,33 @@ void GetAllocatorGlobalRange(uptr *begin, uptr *end) { *end = *begin + sizeof(__asan::get_allocator()); } -uptr PointsIntoChunk(void* p) { +uptr PointsIntoChunk(void *p) { uptr addr = reinterpret_cast(p); __asan::AsanChunk *m = __asan::instance.GetAsanChunkByAddrFastLocked(addr); if (!m || atomic_load(&m->chunk_state, memory_order_acquire) != __asan::CHUNK_ALLOCATED) return 0; uptr chunk = m->Beg(); - if (m->AddrIsInside(addr, /*locked_version=*/true)) + if (m->AddrIsInside(addr)) return chunk; - if (IsSpecialCaseOfOperatorNew0(chunk, m->UsedSize(/*locked_version*/ true), - addr)) + if (IsSpecialCaseOfOperatorNew0(chunk, m->UsedSize(), addr)) return chunk; return 0; } -// Debug code. Delete once issue #1193 is chased down. -extern "C" SANITIZER_WEAK_ATTRIBUTE const char *__lsan_current_stage; - -void GetUserBeginDebug(uptr chunk) { - Printf("GetUserBeginDebug1 chunk %p\n", chunk); - __asan::AsanChunk *m = - __asan::instance.GetAsanChunkByAddrFastLockedDebug(chunk); - Printf("GetUserBeginDebug2 m %p\n", m); -} - uptr GetUserBegin(uptr chunk) { __asan::AsanChunk *m = __asan::instance.GetAsanChunkByAddrFastLocked(chunk); - if (!m) { - Printf( - "ASAN is about to crash with a CHECK failure.\n" - "The ASAN developers are trying to chase down this bug,\n" - "so if you've encountered this bug please let us know.\n" - "See also: https://github.com/google/sanitizers/issues/1193\n" - "Internal ref b/149237057\n" - "chunk: %p caller %p __lsan_current_stage %s\n", - chunk, GET_CALLER_PC(), __lsan_current_stage); - GetUserBeginDebug(chunk); - } - CHECK(m); - return m->Beg(); + return m ? m->Beg() : 0; } LsanMetadata::LsanMetadata(uptr chunk) { - metadata_ = reinterpret_cast(chunk - __asan::kChunkHeaderSize); + metadata_ = chunk ? 
reinterpret_cast(chunk - __asan::kChunkHeaderSize) + : nullptr; } bool LsanMetadata::allocated() const { + if (!metadata_) + return false; __asan::AsanChunk *m = reinterpret_cast<__asan::AsanChunk *>(metadata_); return atomic_load(&m->chunk_state, memory_order_relaxed) == __asan::CHUNK_ALLOCATED; @@ -1139,12 +1154,15 @@ void LsanMetadata::set_tag(ChunkTag value) { uptr LsanMetadata::requested_size() const { __asan::AsanChunk *m = reinterpret_cast<__asan::AsanChunk *>(metadata_); - return m->UsedSize(/*locked_version=*/true); + return m->UsedSize(); } u32 LsanMetadata::stack_trace_id() const { __asan::AsanChunk *m = reinterpret_cast<__asan::AsanChunk *>(metadata_); - return atomic_load(&m->alloc_context_id, memory_order_relaxed); + u32 tid = 0; + u32 stack = 0; + m->GetAllocContext(tid, stack); + return stack; } void ForEachChunk(ForEachChunkCallback callback, void *arg) { @@ -1154,16 +1172,16 @@ void ForEachChunk(ForEachChunkCallback callback, void *arg) { IgnoreObjectResult IgnoreObjectLocked(const void *p) { uptr addr = reinterpret_cast(p); __asan::AsanChunk *m = __asan::instance.GetAsanChunkByAddr(addr); - if (!m) return kIgnoreObjectInvalid; - if ((atomic_load(&m->chunk_state, memory_order_acquire) == - __asan::CHUNK_ALLOCATED) && - m->AddrIsInside(addr)) { - if (m->lsan_tag == kIgnored) - return kIgnoreObjectAlreadyIgnored; - m->lsan_tag = __lsan::kIgnored; - return kIgnoreObjectSuccess; + if (!m || + (atomic_load(&m->chunk_state, memory_order_acquire) != + __asan::CHUNK_ALLOCATED) || + !m->AddrIsInside(addr)) { + return kIgnoreObjectInvalid; } - return kIgnoreObjectInvalid; + if (m->lsan_tag == kIgnored) + return kIgnoreObjectAlreadyIgnored; + m->lsan_tag = __lsan::kIgnored; + return kIgnoreObjectSuccess; } } // namespace __lsan diff --git a/compiler-rt/lib/asan/asan_allocator.h b/compiler-rt/lib/asan/asan_allocator.h index d60b97500a3c3..612799f90964a 100644 --- a/compiler-rt/lib/asan/asan_allocator.h +++ b/compiler-rt/lib/asan/asan_allocator.h @@ -28,7 +28,7 @@ enum AllocType { FROM_NEW_BR = 3 // Memory block came from operator new [ ] }; -struct AsanChunk; +class AsanChunk; struct AllocatorOptions { u32 quarantine_size_mb; diff --git a/compiler-rt/lib/asan/asan_flags.cpp b/compiler-rt/lib/asan/asan_flags.cpp index c5c70eaed737f..cb6a89fe32ce7 100644 --- a/compiler-rt/lib/asan/asan_flags.cpp +++ b/compiler-rt/lib/asan/asan_flags.cpp @@ -26,10 +26,6 @@ namespace __asan { Flags asan_flags_dont_use_directly; // use via flags(). -static const char *MaybeCallAsanDefaultOptions() { - return (&__asan_default_options) ? __asan_default_options() : ""; -} - static const char *MaybeUseAsanDefaultOptionsCompileDefinition() { #ifdef ASAN_DEFAULT_OPTIONS return SANITIZER_STRINGIFY(ASAN_DEFAULT_OPTIONS); @@ -108,14 +104,14 @@ void InitializeFlags() { asan_parser.ParseString(asan_compile_def); // Override from user-specified string. 
- const char *asan_default_options = MaybeCallAsanDefaultOptions(); + const char *asan_default_options = __asan_default_options(); asan_parser.ParseString(asan_default_options); #if CAN_SANITIZE_UB - const char *ubsan_default_options = __ubsan::MaybeCallUbsanDefaultOptions(); + const char *ubsan_default_options = __ubsan_default_options(); ubsan_parser.ParseString(ubsan_default_options); #endif #if CAN_SANITIZE_LEAKS - const char *lsan_default_options = __lsan::MaybeCallLsanDefaultOptions(); + const char *lsan_default_options = __lsan_default_options(); lsan_parser.ParseString(lsan_default_options); #endif diff --git a/compiler-rt/lib/asan/asan_interface_internal.h b/compiler-rt/lib/asan/asan_interface_internal.h index f14cbbcb76a35..3e6e660288746 100644 --- a/compiler-rt/lib/asan/asan_interface_internal.h +++ b/compiler-rt/lib/asan/asan_interface_internal.h @@ -173,8 +173,8 @@ extern "C" { SANITIZER_INTERFACE_ATTRIBUTE void __asan_print_accumulated_stats(); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE - const char* __asan_default_options(); + SANITIZER_INTERFACE_ATTRIBUTE + const char *__asan_default_options(); SANITIZER_INTERFACE_ATTRIBUTE extern uptr __asan_shadow_memory_dynamic_address; diff --git a/compiler-rt/lib/asan/asan_malloc_linux.cpp b/compiler-rt/lib/asan/asan_malloc_linux.cpp index cb6c0ced0494b..9c3f0a5338ee5 100644 --- a/compiler-rt/lib/asan/asan_malloc_linux.cpp +++ b/compiler-rt/lib/asan/asan_malloc_linux.cpp @@ -34,7 +34,7 @@ static uptr last_dlsym_alloc_size_in_words; static const uptr kDlsymAllocPoolSize = SANITIZER_RTEMS ? 4096 : 1024; static uptr alloc_memory_for_dlsym[kDlsymAllocPoolSize]; -static INLINE bool IsInDlsymAllocPool(const void *ptr) { +static inline bool IsInDlsymAllocPool(const void *ptr) { uptr off = (uptr)ptr - (uptr)alloc_memory_for_dlsym; return off < allocated_for_dlsym * sizeof(alloc_memory_for_dlsym[0]); } @@ -95,12 +95,12 @@ bool IsFromLocalPool(const void *ptr) { } #endif -static INLINE bool MaybeInDlsym() { +static inline bool MaybeInDlsym() { // Fuchsia doesn't use dlsym-based interceptors. 
return !SANITIZER_FUCHSIA && asan_init_is_running; } -static INLINE bool UseLocalPool() { +static inline bool UseLocalPool() { return EarlyMalloc() || MaybeInDlsym(); } @@ -304,4 +304,4 @@ void ReplaceSystemMalloc() { #endif // SANITIZER_ANDROID #endif // SANITIZER_FREEBSD || SANITIZER_FUCHSIA || SANITIZER_LINUX || - // SANITIZER_NETBSD || SANITIZER_SOLARIS \ No newline at end of file + // SANITIZER_NETBSD || SANITIZER_SOLARIS diff --git a/compiler-rt/lib/asan/asan_malloc_local.h b/compiler-rt/lib/asan/asan_malloc_local.h index 3f784b90c739c..e2c9be0379f2f 100644 --- a/compiler-rt/lib/asan/asan_malloc_local.h +++ b/compiler-rt/lib/asan/asan_malloc_local.h @@ -17,7 +17,7 @@ #include "sanitizer_common/sanitizer_platform.h" #include "asan_internal.h" -static INLINE bool EarlyMalloc() { +static inline bool EarlyMalloc() { return SANITIZER_RTEMS && (!__asan::asan_inited || __asan::asan_init_is_running); } diff --git a/compiler-rt/lib/asan/asan_report.cpp b/compiler-rt/lib/asan/asan_report.cpp index 99e8678aa7857..4b4db1db6dc9c 100644 --- a/compiler-rt/lib/asan/asan_report.cpp +++ b/compiler-rt/lib/asan/asan_report.cpp @@ -411,7 +411,7 @@ static bool IsInvalidPointerPair(uptr a1, uptr a2) { return false; } -static INLINE void CheckForInvalidPointerPair(void *p1, void *p2) { +static inline void CheckForInvalidPointerPair(void *p1, void *p2) { switch (flags()->detect_invalid_pointer_pairs) { case 0: return; diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 8dbe15364ab8e..3c50df1797640 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -71,6 +71,7 @@ set(GENERIC_SOURCES divdi3.c divmoddi4.c divmodsi4.c + divmodti4.c divsc3.c divsf3.c divsi3.c diff --git a/compiler-rt/lib/builtins/README.txt b/compiler-rt/lib/builtins/README.txt index f9e1bc805092e..d66d725e7ab59 100644 --- a/compiler-rt/lib/builtins/README.txt +++ b/compiler-rt/lib/builtins/README.txt @@ -87,6 +87,8 @@ du_int __udivmoddi4(du_int a, du_int b, du_int* rem); // a / b, *rem = a % b u tu_int __udivmodti4(tu_int a, tu_int b, tu_int* rem); // a / b, *rem = a % b unsigned su_int __udivmodsi4(su_int a, su_int b, su_int* rem); // a / b, *rem = a % b unsigned si_int __divmodsi4(si_int a, si_int b, si_int* rem); // a / b, *rem = a % b signed +di_int __divmoddi4(di_int a, di_int b, di_int* rem); // a / b, *rem = a % b signed +ti_int __divmodti4(ti_int a, ti_int b, ti_int* rem); // a / b, *rem = a % b signed diff --git a/compiler-rt/lib/builtins/divmoddi4.c b/compiler-rt/lib/builtins/divmoddi4.c index 7f333510c0034..e7cbbb1aaa304 100644 --- a/compiler-rt/lib/builtins/divmoddi4.c +++ b/compiler-rt/lib/builtins/divmoddi4.c @@ -15,7 +15,14 @@ // Returns: a / b, *rem = a % b COMPILER_RT_ABI di_int __divmoddi4(di_int a, di_int b, di_int *rem) { - di_int d = __divdi3(a, b); - *rem = a - (d * b); - return d; + const int bits_in_dword_m1 = (int)(sizeof(di_int) * CHAR_BIT) - 1; + di_int s_a = a >> bits_in_dword_m1; // s_a = a < 0 ? -1 : 0 + di_int s_b = b >> bits_in_dword_m1; // s_b = b < 0 ? 
-1 : 0 + a = (a ^ s_a) - s_a; // negate if s_a == -1 + b = (b ^ s_b) - s_b; // negate if s_b == -1 + s_b ^= s_a; // sign of quotient + du_int r; + di_int q = (__udivmoddi4(a, b, &r) ^ s_b) - s_b; // negate if s_b == -1 + *rem = (r ^ s_a) - s_a; // negate if s_a == -1 + return q; } diff --git a/compiler-rt/lib/builtins/divmodsi4.c b/compiler-rt/lib/builtins/divmodsi4.c index 402eed22fe7a0..a85e2993b4e9b 100644 --- a/compiler-rt/lib/builtins/divmodsi4.c +++ b/compiler-rt/lib/builtins/divmodsi4.c @@ -16,7 +16,14 @@ // Returns: a / b, *rem = a % b COMPILER_RT_ABI si_int __divmodsi4(si_int a, si_int b, si_int *rem) { - si_int d = __divsi3(a, b); - *rem = a - (d * b); - return d; + const int bits_in_word_m1 = (int)(sizeof(si_int) * CHAR_BIT) - 1; + si_int s_a = a >> bits_in_word_m1; // s_a = a < 0 ? -1 : 0 + si_int s_b = b >> bits_in_word_m1; // s_b = b < 0 ? -1 : 0 + a = (a ^ s_a) - s_a; // negate if s_a == -1 + b = (b ^ s_b) - s_b; // negate if s_b == -1 + s_b ^= s_a; // sign of quotient + su_int r; + si_int q = (__udivmodsi4(a, b, &r) ^ s_b) - s_b; // negate if s_b == -1 + *rem = (r ^ s_a) - s_a; // negate if s_a == -1 + return q; } diff --git a/compiler-rt/lib/builtins/divmodti4.c b/compiler-rt/lib/builtins/divmodti4.c new file mode 100644 index 0000000000000..b243ba4ef8537 --- /dev/null +++ b/compiler-rt/lib/builtins/divmodti4.c @@ -0,0 +1,32 @@ +//===-- divmodti4.c - Implement __divmodti4 -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements __divmodti4 for the compiler_rt library. +// +//===----------------------------------------------------------------------===// + +#include "int_lib.h" + +#ifdef CRT_HAS_128BIT + +// Returns: a / b, *rem = a % b + +COMPILER_RT_ABI ti_int __divmodti4(ti_int a, ti_int b, ti_int *rem) { + const int bits_in_tword_m1 = (int)(sizeof(ti_int) * CHAR_BIT) - 1; + ti_int s_a = a >> bits_in_tword_m1; // s_a = a < 0 ? -1 : 0 + ti_int s_b = b >> bits_in_tword_m1; // s_b = b < 0 ? -1 : 0 + a = (a ^ s_a) - s_a; // negate if s_a == -1 + b = (b ^ s_b) - s_b; // negate if s_b == -1 + s_b ^= s_a; // sign of quotient + tu_int r; + ti_int q = (__udivmodti4(a, b, &r) ^ s_b) - s_b; // negate if s_b == -1 + *rem = (r ^ s_a) - s_a; // negate if s_a == -1 + return q; +} + +#endif // CRT_HAS_128BIT diff --git a/compiler-rt/lib/builtins/os_version_check.c b/compiler-rt/lib/builtins/os_version_check.c index 3794b979434cc..fbc68f58caf76 100644 --- a/compiler-rt/lib/builtins/os_version_check.c +++ b/compiler-rt/lib/builtins/os_version_check.c @@ -216,6 +216,44 @@ int32_t __isOSVersionAtLeast(int32_t Major, int32_t Minor, int32_t Subminor) { return Subminor <= GlobalSubminor; } +#elif __ANDROID__ + +#include +#include +#include +#include + +static int SdkVersion; +static int IsPreRelease; + +static void readSystemProperties(void) { + char buf[PROP_VALUE_MAX]; + + if (__system_property_get("ro.build.version.sdk", buf) == 0) { + // When the system property doesn't exist, defaults to future API level. 
+ SdkVersion = __ANDROID_API_FUTURE__; + } else { + SdkVersion = atoi(buf); + } + + if (__system_property_get("ro.build.version.codename", buf) == 0) { + IsPreRelease = 1; + } else { + IsPreRelease = strcmp(buf, "REL") != 0; + } + return; +} + +int32_t __isOSVersionAtLeast(int32_t Major, int32_t Minor, int32_t Subminor) { + (int32_t) Minor; + (int32_t) Subminor; + static pthread_once_t once = PTHREAD_ONCE_INIT; + pthread_once(&once, readSystemProperties); + + return SdkVersion >= Major || + (IsPreRelease && Major == __ANDROID_API_FUTURE__); +} + #else // Silence an empty translation unit warning. diff --git a/compiler-rt/lib/builtins/paritydi2.c b/compiler-rt/lib/builtins/paritydi2.c index 58e85f89e0437..350dceb8cef59 100644 --- a/compiler-rt/lib/builtins/paritydi2.c +++ b/compiler-rt/lib/builtins/paritydi2.c @@ -17,5 +17,9 @@ COMPILER_RT_ABI int __paritydi2(di_int a) { dwords x; x.all = a; - return __paritysi2(x.s.high ^ x.s.low); + su_int x2 = x.s.high ^ x.s.low; + x2 ^= x2 >> 16; + x2 ^= x2 >> 8; + x2 ^= x2 >> 4; + return (0x6996 >> (x2 & 0xF)) & 1; } diff --git a/compiler-rt/lib/builtins/parityti2.c b/compiler-rt/lib/builtins/parityti2.c index 79e920d8a02df..011c8dd455620 100644 --- a/compiler-rt/lib/builtins/parityti2.c +++ b/compiler-rt/lib/builtins/parityti2.c @@ -18,8 +18,14 @@ COMPILER_RT_ABI int __parityti2(ti_int a) { twords x; + dwords x2; x.all = a; - return __paritydi2(x.s.high ^ x.s.low); + x2.all = x.s.high ^ x.s.low; + su_int x3 = x2.s.high ^ x2.s.low; + x3 ^= x3 >> 16; + x3 ^= x3 >> 8; + x3 ^= x3 >> 4; + return (0x6996 >> (x3 & 0xF)) & 1; } #endif // CRT_HAS_128BIT diff --git a/compiler-rt/lib/cfi/cfi.cpp b/compiler-rt/lib/cfi/cfi.cpp index fd48f71643b6f..b75c72b215c27 100644 --- a/compiler-rt/lib/cfi/cfi.cpp +++ b/compiler-rt/lib/cfi/cfi.cpp @@ -379,7 +379,7 @@ void InitializeFlags() { __ubsan::RegisterUbsanFlags(&ubsan_parser, uf); RegisterCommonFlags(&ubsan_parser); - const char *ubsan_default_options = __ubsan::MaybeCallUbsanDefaultOptions(); + const char *ubsan_default_options = __ubsan_default_options(); ubsan_parser.ParseString(ubsan_default_options); ubsan_parser.ParseStringFromEnv("UBSAN_OPTIONS"); #endif diff --git a/compiler-rt/lib/dfsan/dfsan_custom.cpp b/compiler-rt/lib/dfsan/dfsan_custom.cpp index eb26bea188ae8..77b93f81f3495 100644 --- a/compiler-rt/lib/dfsan/dfsan_custom.cpp +++ b/compiler-rt/lib/dfsan/dfsan_custom.cpp @@ -95,18 +95,9 @@ SANITIZER_INTERFACE_ATTRIBUTE char *__dfsw_strchr(const char *s, int c, } } -DECLARE_WEAK_INTERCEPTOR_HOOK(dfsan_weak_hook_memcmp, uptr caller_pc, - const void *s1, const void *s2, size_t n, - dfsan_label s1_label, dfsan_label s2_label, - dfsan_label n_label) - -SANITIZER_INTERFACE_ATTRIBUTE int __dfsw_memcmp(const void *s1, const void *s2, - size_t n, dfsan_label s1_label, - dfsan_label s2_label, - dfsan_label n_label, - dfsan_label *ret_label) { - CALL_WEAK_INTERCEPTOR_HOOK(dfsan_weak_hook_memcmp, GET_CALLER_PC(), s1, s2, n, - s1_label, s2_label, n_label); +static int dfsan_memcmp_bcmp(const void *s1, const void *s2, size_t n, + dfsan_label s1_label, dfsan_label s2_label, + dfsan_label n_label, dfsan_label *ret_label) { const char *cs1 = (const char *) s1, *cs2 = (const char *) s2; for (size_t i = 0; i != n; ++i) { if (cs1[i] != cs2[i]) { @@ -129,6 +120,29 @@ SANITIZER_INTERFACE_ATTRIBUTE int __dfsw_memcmp(const void *s1, const void *s2, return 0; } +DECLARE_WEAK_INTERCEPTOR_HOOK(dfsan_weak_hook_memcmp, uptr caller_pc, + const void *s1, const void *s2, size_t n, + dfsan_label s1_label, dfsan_label s2_label, + dfsan_label 
n_label) + +SANITIZER_INTERFACE_ATTRIBUTE int __dfsw_memcmp(const void *s1, const void *s2, + size_t n, dfsan_label s1_label, + dfsan_label s2_label, + dfsan_label n_label, + dfsan_label *ret_label) { + CALL_WEAK_INTERCEPTOR_HOOK(dfsan_weak_hook_memcmp, GET_CALLER_PC(), s1, s2, n, + s1_label, s2_label, n_label); + return dfsan_memcmp_bcmp(s1, s2, n, s1_label, s2_label, n_label, ret_label); +} + +SANITIZER_INTERFACE_ATTRIBUTE int __dfsw_bcmp(const void *s1, const void *s2, + size_t n, dfsan_label s1_label, + dfsan_label s2_label, + dfsan_label n_label, + dfsan_label *ret_label) { + return dfsan_memcmp_bcmp(s1, s2, n, s1_label, s2_label, n_label, ret_label); +} + DECLARE_WEAK_INTERCEPTOR_HOOK(dfsan_weak_hook_strcmp, uptr caller_pc, const char *s1, const char *s2, dfsan_label s1_label, dfsan_label s2_label) diff --git a/compiler-rt/lib/dfsan/done_abilist.txt b/compiler-rt/lib/dfsan/done_abilist.txt index 52f3ff5ef2395..85255f7c9026a 100644 --- a/compiler-rt/lib/dfsan/done_abilist.txt +++ b/compiler-rt/lib/dfsan/done_abilist.txt @@ -183,6 +183,7 @@ fun:strtoull=custom # Functions that produce an output that is computed from the input, but is not # necessarily data dependent. +fun:bcmp=custom fun:memchr=custom fun:memcmp=custom fun:strcasecmp=custom diff --git a/compiler-rt/lib/fuzzer/FuzzerDriver.cpp b/compiler-rt/lib/fuzzer/FuzzerDriver.cpp index caafd1dbb0a7b..83ef642ceeb6e 100644 --- a/compiler-rt/lib/fuzzer/FuzzerDriver.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerDriver.cpp @@ -755,6 +755,8 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) { Options.FeaturesDir = Flags.features_dir; ValidateDirectoryExists(Options.FeaturesDir, Flags.create_missing_dirs); } + if (Flags.mutation_graph_file) + Options.MutationGraphFile = Flags.mutation_graph_file; if (Flags.collect_data_flow) Options.CollectDataFlow = Flags.collect_data_flow; if (Flags.stop_file) @@ -765,16 +767,12 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) { Options.EntropicNumberOfRarestFeatures = (size_t)Flags.entropic_number_of_rarest_features; Options.EntropicScalePerExecTime = Flags.entropic_scale_per_exec_time; - if (Options.Entropic) { - if (!Options.FocusFunction.empty()) { - Printf("ERROR: The parameters `--entropic` and `--focus_function` cannot " - "be used together.\n"); - exit(1); - } + if (!Options.FocusFunction.empty()) + Options.Entropic = false; // FocusFunction overrides entropic scheduling. + if (Options.Entropic) Printf("INFO: Running with entropic power schedule (0x%X, %d).\n", Options.EntropicFeatureFrequencyThreshold, Options.EntropicNumberOfRarestFeatures); - } struct EntropicOptions Entropic; Entropic.Enabled = Options.Entropic; Entropic.FeatureFrequencyThreshold = diff --git a/compiler-rt/lib/fuzzer/FuzzerFlags.def b/compiler-rt/lib/fuzzer/FuzzerFlags.def index fdb8362cef9d4..4d4841b17ae42 100644 --- a/compiler-rt/lib/fuzzer/FuzzerFlags.def +++ b/compiler-rt/lib/fuzzer/FuzzerFlags.def @@ -88,6 +88,11 @@ FUZZER_FLAG_STRING(features_dir, "internal flag. Used to dump feature sets on di "Every time a new input is added to the corpus, a corresponding file in the features_dir" " is created containing the unique features of that input." " Features are stored in binary format.") +FUZZER_FLAG_STRING(mutation_graph_file, "Saves a graph (in DOT format) to" + " mutation_graph_file. 
The graph contains a vertex for each input that has" + " unique coverage; directed edges are provided between parents and children" + " where the child has unique coverage, and are recorded with the type of" + " mutation that caused the child.") FUZZER_FLAG_INT(use_counters, 1, "Use coverage counters") FUZZER_FLAG_INT(use_memmem, 1, "Use hints from intercepting memmem, strstr, etc") @@ -166,8 +171,9 @@ FUZZER_FLAG_INT(ignore_remaining_args, 0, "If 1, ignore all arguments passed " FUZZER_FLAG_STRING(focus_function, "Experimental. " "Fuzzing will focus on inputs that trigger calls to this function. " "If -focus_function=auto and -data_flow_trace is used, libFuzzer " - "will choose the focus functions automatically.") -FUZZER_FLAG_INT(entropic, 0, "Experimental. Enables entropic power schedule.") + "will choose the focus functions automatically. Disables -entropic when " + "specified.") +FUZZER_FLAG_INT(entropic, 1, "Enables entropic power schedule.") FUZZER_FLAG_INT(entropic_feature_frequency_threshold, 0xFF, "Experimental. If " "entropic is enabled, all features which are observed less often than " "the specified value are considered as rare.") diff --git a/compiler-rt/lib/fuzzer/FuzzerIO.cpp b/compiler-rt/lib/fuzzer/FuzzerIO.cpp index c3330c3425d09..54a7219fc0e0f 100644 --- a/compiler-rt/lib/fuzzer/FuzzerIO.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerIO.cpp @@ -77,6 +77,19 @@ void WriteToFile(const uint8_t *Data, size_t Size, const std::string &Path) { fclose(Out); } +void AppendToFile(const std::string &Data, const std::string &Path) { + AppendToFile(reinterpret_cast(Data.data()), Data.size(), + Path); +} + +void AppendToFile(const uint8_t *Data, size_t Size, const std::string &Path) { + FILE *Out = fopen(Path.c_str(), "a"); + if (!Out) + return; + fwrite(Data, sizeof(Data[0]), Size, Out); + fclose(Out); +} + void ReadDirToVectorOfUnits(const char *Path, Vector *V, long *Epoch, size_t MaxSize, bool ExitOnError) { long E = Epoch ? *Epoch : 0; diff --git a/compiler-rt/lib/fuzzer/FuzzerIO.h b/compiler-rt/lib/fuzzer/FuzzerIO.h index 6e3a0b470c5f6..abd25110d07d4 100644 --- a/compiler-rt/lib/fuzzer/FuzzerIO.h +++ b/compiler-rt/lib/fuzzer/FuzzerIO.h @@ -29,6 +29,9 @@ void WriteToFile(const uint8_t *Data, size_t Size, const std::string &Path); void WriteToFile(const std::string &Data, const std::string &Path); void WriteToFile(const Unit &U, const std::string &Path); +void AppendToFile(const uint8_t *Data, size_t Size, const std::string &Path); +void AppendToFile(const std::string &Data, const std::string &Path); + void ReadDirToVectorOfUnits(const char *Path, Vector *V, long *Epoch, size_t MaxSize, bool ExitOnError); diff --git a/compiler-rt/lib/fuzzer/FuzzerLoop.cpp b/compiler-rt/lib/fuzzer/FuzzerLoop.cpp index f9986dd8eea51..f1895ec2621a4 100644 --- a/compiler-rt/lib/fuzzer/FuzzerLoop.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerLoop.cpp @@ -463,6 +463,37 @@ static void RenameFeatureSetFile(const std::string &FeaturesDir, DirPlusFile(FeaturesDir, NewFile)); } +static void WriteEdgeToMutationGraphFile(const std::string &MutationGraphFile, + const InputInfo *II, + const InputInfo *BaseII, + const std::string &MS) { + if (MutationGraphFile.empty()) + return; + + std::string Sha1 = Sha1ToString(II->Sha1); + + std::string OutputString; + + // Add a new vertex. + OutputString.append("\""); + OutputString.append(Sha1); + OutputString.append("\"\n"); + + // Add a new edge if there is base input. 
+ if (BaseII) { + std::string BaseSha1 = Sha1ToString(BaseII->Sha1); + OutputString.append("\""); + OutputString.append(BaseSha1); + OutputString.append("\" -> \""); + OutputString.append(Sha1); + OutputString.append("\" [label=\""); + OutputString.append(MS); + OutputString.append("\"];\n"); + } + + AppendToFile(OutputString, MutationGraphFile); +} + bool Fuzzer::RunOne(const uint8_t *Data, size_t Size, bool MayDeleteFile, InputInfo *II, bool ForceAddToCorpus, bool *FoundUniqFeatures) { @@ -497,6 +528,8 @@ bool Fuzzer::RunOne(const uint8_t *Data, size_t Size, bool MayDeleteFile, TimeOfUnit, UniqFeatureSetTmp, DFT, II); WriteFeatureSetToFile(Options.FeaturesDir, Sha1ToString(NewII->Sha1), NewII->UniqFeatureSet); + WriteEdgeToMutationGraphFile(Options.MutationGraphFile, NewII, II, + MD.MutationSequence()); return true; } if (II && FoundUniqFeaturesOfII && @@ -603,7 +636,7 @@ void Fuzzer::PrintStatusForNewUnit(const Unit &U, const char *Text) { PrintStats(Text, ""); if (Options.Verbosity) { Printf(" L: %zd/%zd ", U.size(), Corpus.MaxInputSize()); - MD.PrintMutationSequence(); + MD.PrintMutationSequence(Options.Verbosity >= 2); Printf("\n"); } } diff --git a/compiler-rt/lib/fuzzer/FuzzerMutate.cpp b/compiler-rt/lib/fuzzer/FuzzerMutate.cpp index df9ada45bb039..cf34a9fe8e2e1 100644 --- a/compiler-rt/lib/fuzzer/FuzzerMutate.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerMutate.cpp @@ -18,6 +18,7 @@ namespace fuzzer { const size_t Dictionary::kMaxDictSize; +static const size_t kMaxMutationsToPrint = 10; static void PrintASCII(const Word &W, const char *PrintAfter) { PrintASCII(W.data(), W.size(), PrintAfter); @@ -481,19 +482,34 @@ void MutationDispatcher::PrintRecommendedDictionary() { Printf("###### End of recommended dictionary. ######\n"); } -void MutationDispatcher::PrintMutationSequence() { +void MutationDispatcher::PrintMutationSequence(bool Verbose) { Printf("MS: %zd ", CurrentMutatorSequence.size()); - for (auto M : CurrentMutatorSequence) - Printf("%s-", M.Name); + size_t EntriesToPrint = + Verbose ? CurrentMutatorSequence.size() + : std::min(kMaxMutationsToPrint, CurrentMutatorSequence.size()); + for (size_t i = 0; i < EntriesToPrint; i++) + Printf("%s-", CurrentMutatorSequence[i].Name); if (!CurrentDictionaryEntrySequence.empty()) { Printf(" DE: "); - for (auto DE : CurrentDictionaryEntrySequence) { + EntriesToPrint = Verbose ? CurrentDictionaryEntrySequence.size() + : std::min(kMaxMutationsToPrint, + CurrentDictionaryEntrySequence.size()); + for (size_t i = 0; i < EntriesToPrint; i++) { Printf("\""); - PrintASCII(DE->GetW(), "\"-"); + PrintASCII(CurrentDictionaryEntrySequence[i]->GetW(), "\"-"); } } } +std::string MutationDispatcher::MutationSequence() { + std::string MS; + for (auto M : CurrentMutatorSequence) { + MS += M.Name; + MS += "-"; + } + return MS; +} + size_t MutationDispatcher::Mutate(uint8_t *Data, size_t Size, size_t MaxSize) { return MutateImpl(Data, Size, MaxSize, Mutators); } diff --git a/compiler-rt/lib/fuzzer/FuzzerMutate.h b/compiler-rt/lib/fuzzer/FuzzerMutate.h index 6cbce80276248..37fd6100dac33 100644 --- a/compiler-rt/lib/fuzzer/FuzzerMutate.h +++ b/compiler-rt/lib/fuzzer/FuzzerMutate.h @@ -24,8 +24,11 @@ class MutationDispatcher { ~MutationDispatcher() {} /// Indicate that we are about to start a new sequence of mutations. void StartMutationSequence(); - /// Print the current sequence of mutations. - void PrintMutationSequence(); + /// Print the current sequence of mutations. Only prints the full sequence + /// when Verbose is true. 
+ void PrintMutationSequence(bool Verbose = true); + /// Return the current sequence of mutations. + std::string MutationSequence(); /// Indicate that the current sequence of mutations was successful. void RecordSuccessfulMutationSequence(); /// Mutates data by invoking user-provided mutator. diff --git a/compiler-rt/lib/fuzzer/FuzzerOptions.h b/compiler-rt/lib/fuzzer/FuzzerOptions.h index b17a7474d38f0..20b810b2867fb 100644 --- a/compiler-rt/lib/fuzzer/FuzzerOptions.h +++ b/compiler-rt/lib/fuzzer/FuzzerOptions.h @@ -46,7 +46,7 @@ struct FuzzingOptions { size_t MaxNumberOfRuns = -1L; int ReportSlowUnits = 10; bool OnlyASCII = false; - bool Entropic = false; + bool Entropic = true; size_t EntropicFeatureFrequencyThreshold = 0xFF; size_t EntropicNumberOfRarestFeatures = 100; bool EntropicScalePerExecTime = false; @@ -59,6 +59,7 @@ struct FuzzingOptions { std::string DataFlowTrace; std::string CollectDataFlow; std::string FeaturesDir; + std::string MutationGraphFile; std::string StopFile; bool SaveArtifacts = true; bool PrintNEW = true; // Print a status line when new units are found; diff --git a/compiler-rt/lib/hwasan/hwasan.cpp b/compiler-rt/lib/hwasan/hwasan.cpp index 11b4d3891bc2c..c5322110cb662 100644 --- a/compiler-rt/lib/hwasan/hwasan.cpp +++ b/compiler-rt/lib/hwasan/hwasan.cpp @@ -112,7 +112,7 @@ static void InitializeFlags() { if (__hwasan_default_options) parser.ParseString(__hwasan_default_options()); #if HWASAN_CONTAINS_UBSAN - const char *ubsan_default_options = __ubsan::MaybeCallUbsanDefaultOptions(); + const char *ubsan_default_options = __ubsan_default_options(); ubsan_parser.ParseString(ubsan_default_options); #endif diff --git a/compiler-rt/lib/hwasan/hwasan_malloc_bisect.h b/compiler-rt/lib/hwasan/hwasan_malloc_bisect.h index eaf124aab7ddc..7d134e8c4b7fa 100644 --- a/compiler-rt/lib/hwasan/hwasan_malloc_bisect.h +++ b/compiler-rt/lib/hwasan/hwasan_malloc_bisect.h @@ -28,7 +28,7 @@ static u32 malloc_hash(StackTrace *stack, uptr orig_size) { return H.get(); } -static INLINE bool malloc_bisect(StackTrace *stack, uptr orig_size) { +static inline bool malloc_bisect(StackTrace *stack, uptr orig_size) { uptr left = flags()->malloc_bisect_left; uptr right = flags()->malloc_bisect_right; if (LIKELY(left == 0 && right == 0)) diff --git a/compiler-rt/lib/lsan/lsan.cpp b/compiler-rt/lib/lsan/lsan.cpp index 80a6e2fa70169..c8cc045783d45 100644 --- a/compiler-rt/lib/lsan/lsan.cpp +++ b/compiler-rt/lib/lsan/lsan.cpp @@ -73,7 +73,7 @@ static void InitializeFlags() { RegisterCommonFlags(&parser); // Override from user-specified string. - const char *lsan_default_options = MaybeCallLsanDefaultOptions(); + const char *lsan_default_options = __lsan_default_options(); parser.ParseString(lsan_default_options); parser.ParseStringFromEnv("LSAN_OPTIONS"); diff --git a/compiler-rt/lib/lsan/lsan_common.cpp b/compiler-rt/lib/lsan/lsan_common.cpp index 67f85f2f31de4..107d63ac9117c 100644 --- a/compiler-rt/lib/lsan/lsan_common.cpp +++ b/compiler-rt/lib/lsan/lsan_common.cpp @@ -25,8 +25,6 @@ #include "sanitizer_common/sanitizer_thread_registry.h" #include "sanitizer_common/sanitizer_tls_get_addr.h" -extern "C" const char *__lsan_current_stage = "unknown"; - #if CAN_SANITIZE_LEAKS namespace __lsan { @@ -110,10 +108,6 @@ void InitializeRootRegions() { root_regions = new (placeholder) InternalMmapVector(); } -const char *MaybeCallLsanDefaultOptions() { - return (&__lsan_default_options) ? 
__lsan_default_options() : ""; -} - void InitCommonLsan() { InitializeRootRegions(); if (common_flags()->detect_leaks) { @@ -224,10 +218,7 @@ static void ProcessThreads(SuspendedThreadsList const &, Frontier *) {} // Scans thread data (stacks and TLS) for heap pointers. static void ProcessThreads(SuspendedThreadsList const &suspended_threads, Frontier *frontier) { - InternalMmapVector registers(suspended_threads.RegisterCount()); - uptr registers_begin = reinterpret_cast(registers.data()); - uptr registers_end = - reinterpret_cast(registers.data() + registers.size()); + InternalMmapVector registers; for (uptr i = 0; i < suspended_threads.ThreadCount(); i++) { tid_t os_id = static_cast(suspended_threads.GetThreadID(i)); LOG_THREADS("Processing thread %d.\n", os_id); @@ -244,7 +235,7 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads, } uptr sp; PtraceRegistersStatus have_registers = - suspended_threads.GetRegistersAndSP(i, registers.data(), &sp); + suspended_threads.GetRegistersAndSP(i, ®isters, &sp); if (have_registers != REGISTERS_AVAILABLE) { Report("Unable to get registers from thread %d.\n", os_id); // If unable to get SP, consider the entire stack to be reachable unless @@ -253,9 +244,13 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads, sp = stack_begin; } - if (flags()->use_registers && have_registers) + if (flags()->use_registers && have_registers) { + uptr registers_begin = reinterpret_cast(registers.data()); + uptr registers_end = + reinterpret_cast(registers.data() + registers.size()); ScanRangeForPointers(registers_begin, registers_end, frontier, "REGISTERS", kReachable); + } if (flags()->use_stacks) { LOG_THREADS("Stack at %p-%p (SP = %p).\n", stack_begin, stack_end, sp); @@ -366,7 +361,6 @@ static void FloodFillTag(Frontier *frontier, ChunkTag tag) { // ForEachChunk callback. If the chunk is marked as leaked, marks all chunks // which are reachable from it as indirectly leaked. static void MarkIndirectlyLeakedCb(uptr chunk, void *arg) { - __lsan_current_stage = "MarkIndirectlyLeakedCb"; chunk = GetUserBegin(chunk); LsanMetadata m(chunk); if (m.allocated() && m.tag() != kReachable) { @@ -379,7 +373,6 @@ static void MarkIndirectlyLeakedCb(uptr chunk, void *arg) { // frontier. static void CollectIgnoredCb(uptr chunk, void *arg) { CHECK(arg); - __lsan_current_stage = "CollectIgnoredCb"; chunk = GetUserBegin(chunk); LsanMetadata m(chunk); if (m.allocated() && m.tag() == kIgnored) { @@ -409,7 +402,6 @@ struct InvalidPCParam { static void MarkInvalidPCCb(uptr chunk, void *arg) { CHECK(arg); InvalidPCParam *param = reinterpret_cast(arg); - __lsan_current_stage = "MarkInvalidPCCb"; chunk = GetUserBegin(chunk); LsanMetadata m(chunk); if (m.allocated() && m.tag() != kReachable && m.tag() != kIgnored) { @@ -485,7 +477,6 @@ static void ClassifyAllChunks(SuspendedThreadsList const &suspended_threads, // ForEachChunk callback. Resets the tags to pre-leak-check state. 
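
With this change, ProcessThreads no longer sizes the register buffer up front from RegisterCount(): GetRegistersAndSP owns and resizes the buffer per thread, and the scan bounds are derived from the vector only after registers were actually fetched. A sketch of the consuming side of that contract, with std::vector standing in for InternalMmapVector so the example is self-contained:

    #include <cstdint>
    #include <vector>

    using uptr = uintptr_t;

    // Stub for ScanRangeForPointers; the real scanner walks [begin, end)
    // looking for values that point into heap chunks.
    static void ScanRange(uptr begin, uptr end) { (void)begin; (void)end; }

    static void ScanThreadRegisters(const std::vector<uptr> &registers,
                                    bool have_registers, bool use_registers) {
      if (use_registers && have_registers) {
        // Bounds are computed only now: the producer sized the vector, and
        // its size can vary per thread and per architecture.
        uptr begin = reinterpret_cast<uptr>(registers.data());
        uptr end =
            reinterpret_cast<uptr>(registers.data() + registers.size());
        ScanRange(begin, end);
      }
    }
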
static void ResetTagsCb(uptr chunk, void *arg) { (void)arg; - __lsan_current_stage = "ResetTagsCb"; chunk = GetUserBegin(chunk); LsanMetadata m(chunk); if (m.allocated() && m.tag() != kIgnored) @@ -502,7 +493,6 @@ static void PrintStackTraceById(u32 stack_trace_id) { static void CollectLeaksCb(uptr chunk, void *arg) { CHECK(arg); LeakReport *leak_report = reinterpret_cast(arg); - __lsan_current_stage = "CollectLeaksCb"; chunk = GetUserBegin(chunk); LsanMetadata m(chunk); if (!m.allocated()) return; @@ -900,12 +890,11 @@ int __lsan_do_recoverable_leak_check() { return 0; } -#if !SANITIZER_SUPPORTS_WEAK_HOOKS -SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE -const char * __lsan_default_options() { +SANITIZER_INTERFACE_WEAK_DEF(const char *, __lsan_default_options, void) { return ""; } +#if !SANITIZER_SUPPORTS_WEAK_HOOKS SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE int __lsan_is_turned_off() { return 0; diff --git a/compiler-rt/lib/msan/msan.cpp b/compiler-rt/lib/msan/msan.cpp index 3028f79f041c3..d651a376789bd 100644 --- a/compiler-rt/lib/msan/msan.cpp +++ b/compiler-rt/lib/msan/msan.cpp @@ -172,10 +172,9 @@ static void InitializeFlags() { #endif // Override from user-specified string. - if (__msan_default_options) - parser.ParseString(__msan_default_options()); + parser.ParseString(__msan_default_options()); #if MSAN_CONTAINS_UBSAN - const char *ubsan_default_options = __ubsan::MaybeCallUbsanDefaultOptions(); + const char *ubsan_default_options = __ubsan_default_options(); ubsan_parser.ParseString(ubsan_default_options); #endif @@ -726,12 +725,9 @@ void __msan_finish_switch_fiber(const void **bottom_old, uptr *size_old) { } } -#if !SANITIZER_SUPPORTS_WEAK_HOOKS -extern "C" { -SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE -const char* __msan_default_options() { return ""; } -} // extern "C" -#endif +SANITIZER_INTERFACE_WEAK_DEF(const char *, __msan_default_options, void) { + return ""; +} extern "C" { SANITIZER_INTERFACE_ATTRIBUTE diff --git a/compiler-rt/lib/msan/msan_interface_internal.h b/compiler-rt/lib/msan/msan_interface_internal.h index 17922a888b9c9..1edacbc7504f5 100644 --- a/compiler-rt/lib/msan/msan_interface_internal.h +++ b/compiler-rt/lib/msan/msan_interface_internal.h @@ -129,8 +129,8 @@ void __msan_set_keep_going(int keep_going); SANITIZER_INTERFACE_ATTRIBUTE int __msan_set_poison_in_malloc(int do_poison); -SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE -/* OPTIONAL */ const char* __msan_default_options(); +SANITIZER_INTERFACE_ATTRIBUTE +const char *__msan_default_options(); // For testing. 
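
SANITIZER_INTERFACE_WEAK_DEF gives __lsan_default_options (and, below, __msan_default_options) an always-present weak definition returning "", so the runtimes can call the function unconditionally instead of first probing whether the symbol's address is non-null. User code overrides the default simply by linking in a strong definition; a sketch, with illustrative option strings:

    // A strong definition in the instrumented binary wins over the weak
    // default at link time. The flag values here are only illustrative.
    extern "C" const char *__lsan_default_options() {
      return "use_registers=0:verbosity=1";
    }
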
SANITIZER_INTERFACE_ATTRIBUTE diff --git a/compiler-rt/lib/msan/tests/msan_test.cpp b/compiler-rt/lib/msan/tests/msan_test.cpp index 4c98bb4861f20..6306b3dbfb82d 100644 --- a/compiler-rt/lib/msan/tests/msan_test.cpp +++ b/compiler-rt/lib/msan/tests/msan_test.cpp @@ -139,7 +139,7 @@ typedef signed short S2; typedef signed int S4; typedef signed long long S8; #define NOINLINE __attribute__((noinline)) -#define INLINE __attribute__((always_inline)) +#define ALWAYS_INLINE __attribute__((always_inline)) static bool TrackingOrigins() { S8 x; @@ -4312,7 +4312,7 @@ TEST(MemorySanitizerOrigins, InitializedStoreDoesNotChangeOrigin) { } // namespace template -INLINE +ALWAYS_INLINE void BinaryOpOriginTest(BinaryOp op) { U4 ox = rand(); //NOLINT U4 oy = rand(); //NOLINT @@ -4345,12 +4345,12 @@ void BinaryOpOriginTest(BinaryOp op) { EXPECT_ORIGIN(ox, __msan_get_origin(z)); } -template INLINE T XOR(const T &a, const T&b) { return a ^ b; } -template INLINE T ADD(const T &a, const T&b) { return a + b; } -template INLINE T SUB(const T &a, const T&b) { return a - b; } -template INLINE T MUL(const T &a, const T&b) { return a * b; } -template INLINE T AND(const T &a, const T&b) { return a & b; } -template INLINE T OR (const T &a, const T&b) { return a | b; } +template ALWAYS_INLINE T XOR(const T &a, const T&b) { return a ^ b; } +template ALWAYS_INLINE T ADD(const T &a, const T&b) { return a + b; } +template ALWAYS_INLINE T SUB(const T &a, const T&b) { return a - b; } +template ALWAYS_INLINE T MUL(const T &a, const T&b) { return a * b; } +template ALWAYS_INLINE T AND(const T &a, const T&b) { return a & b; } +template ALWAYS_INLINE T OR (const T &a, const T&b) { return a | b; } TEST(MemorySanitizerOrigins, BinaryOp) { if (!TrackingOrigins()) return; @@ -4704,7 +4704,7 @@ static void TestBZHI() { __builtin_ia32_bzhi_di(0xABCDABCDABCDABCD, Poisoned(1, 0xFFFFFFFF00000000ULL))); } -inline U4 bextr_imm(U4 start, U4 len) { +ALWAYS_INLINE U4 bextr_imm(U4 start, U4 len) { start &= 0xFF; len &= 0xFF; return (len << 8) | start; diff --git a/compiler-rt/lib/profile/GCDAProfiling.c b/compiler-rt/lib/profile/GCDAProfiling.c index d57fdbae5371d..4055681872415 100644 --- a/compiler-rt/lib/profile/GCDAProfiling.c +++ b/compiler-rt/lib/profile/GCDAProfiling.c @@ -127,11 +127,6 @@ struct fn_list { */ struct fn_list writeout_fn_list; -/* - * A list of flush functions that our __gcov_flush() function should call, shared between all dynamic objects. - */ -struct fn_list flush_fn_list; - /* * A list of reset functions, shared between all dynamic objects. */ @@ -406,32 +401,6 @@ void llvm_gcda_start_file(const char *orig_filename, uint32_t version, #endif } -/* Given an array of pointers to counters (counters), increment the n-th one, - * where we're also given a pointer to n (predecessor). - */ -COMPILER_RT_VISIBILITY -void llvm_gcda_increment_indirect_counter(uint32_t *predecessor, - uint64_t **counters) { - uint64_t *counter; - uint32_t pred; - - pred = *predecessor; - if (pred == 0xffffffff) - return; - counter = counters[pred]; - - /* Don't crash if the pred# is out of sync. This can happen due to threads, - or because of a TODO in GCOVProfiling.cpp buildEdgeLookupTable(). 
*/ - if (counter) - ++*counter; -#ifdef DEBUG_GCDAPROFILING - else - fprintf(stderr, - "llvmgcda: increment_indirect_counter counters=%08llx, pred=%u\n", - *counter, *predecessor); -#endif -} - COMPILER_RT_VISIBILITY void llvm_gcda_emit_function(uint32_t ident, uint32_t func_checksum, uint32_t cfg_checksum) { diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator.h index 23d589888d3b6..5ec47416fe0c9 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator.h @@ -52,14 +52,14 @@ struct NoOpMapUnmapCallback { // Callback type for iterating over chunks. typedef void (*ForEachChunkCallback)(uptr chunk, void *arg); -INLINE u32 Rand(u32 *state) { // ANSI C linear congruential PRNG. +inline u32 Rand(u32 *state) { // ANSI C linear congruential PRNG. return (*state = *state * 1103515245 + 12345) >> 16; } -INLINE u32 RandN(u32 *state, u32 n) { return Rand(state) % n; } // [0, n) +inline u32 RandN(u32 *state, u32 n) { return Rand(state) % n; } // [0, n) template -INLINE void RandomShuffle(T *a, u32 n, u32 *rand_state) { +inline void RandomShuffle(T *a, u32 n, u32 *rand_state) { if (n <= 1) return; u32 state = *rand_state; for (u32 i = n - 1; i > 0; i--) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_checks.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_checks.h index fc426f0e74f48..1cc3992c4c9fa 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_checks.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_checks.h @@ -27,7 +27,7 @@ namespace __sanitizer { void SetErrnoToENOMEM(); // A common errno setting logic shared by almost all sanitizer allocator APIs. -INLINE void *SetErrnoOnNull(void *ptr) { +inline void *SetErrnoOnNull(void *ptr) { if (UNLIKELY(!ptr)) SetErrnoToENOMEM(); return ptr; @@ -41,7 +41,7 @@ INLINE void *SetErrnoOnNull(void *ptr) { // two and that the size is a multiple of alignment for POSIX implementation, // and a bit relaxed requirement for non-POSIX ones, that the size is a multiple // of alignment. -INLINE bool CheckAlignedAllocAlignmentAndSize(uptr alignment, uptr size) { +inline bool CheckAlignedAllocAlignmentAndSize(uptr alignment, uptr size) { #if SANITIZER_POSIX return alignment != 0 && IsPowerOfTwo(alignment) && (size & (alignment - 1)) == 0; @@ -52,13 +52,13 @@ INLINE bool CheckAlignedAllocAlignmentAndSize(uptr alignment, uptr size) { // Checks posix_memalign() parameters, verifies that alignment is a power of two // and a multiple of sizeof(void *). -INLINE bool CheckPosixMemalignAlignment(uptr alignment) { +inline bool CheckPosixMemalignAlignment(uptr alignment) { return alignment != 0 && IsPowerOfTwo(alignment) && (alignment % sizeof(void *)) == 0; } // Returns true if calloc(size, n) call overflows on size*n calculation. -INLINE bool CheckForCallocOverflow(uptr size, uptr n) { +inline bool CheckForCallocOverflow(uptr size, uptr n) { if (!size) return false; uptr max = (uptr)-1L; @@ -67,7 +67,7 @@ INLINE bool CheckForCallocOverflow(uptr size, uptr n) { // Returns true if the size passed to pvalloc overflows when rounded to the next // multiple of page_size. 
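
CheckForCallocOverflow above guards the size*n multiplication: for a nonzero size, the product overflows exactly when n exceeds the maximum uptr divided by size. A worked standalone equivalent with a couple of sanity checks (the helper name here is my own; only the arithmetic is taken from the patch):

    #include <cassert>
    #include <cstdint>

    using uptr = uintptr_t;

    // Returns true if calloc(size, n) would overflow computing size * n.
    inline bool CallocOverflows(uptr size, uptr n) {
      if (!size) return false;  // calloc(0, n) cannot overflow
      uptr max = static_cast<uptr>(-1);
      return n > max / size;
    }

    int main() {
      assert(!CallocOverflows(0, ~uptr(0)));   // zero size is always fine
      assert(!CallocOverflows(8, 16));         // small product
      assert(CallocOverflows(~uptr(0), 2));    // guaranteed wraparound
    }
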
-INLINE bool CheckForPvallocOverflow(uptr size, uptr page_size) { +inline bool CheckForPvallocOverflow(uptr size, uptr page_size) { return RoundUpTo(size, page_size) < size; } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_combined.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_combined.h index 0cf483da1e5c8..33f89d6d49928 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_combined.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_combined.h @@ -142,12 +142,6 @@ class CombinedAllocator { return secondary_.GetBlockBeginFastLocked(p); } - void *GetBlockBeginFastLockedDebug(void *p) { - if (primary_.PointerIsMine(p)) - return primary_.GetBlockBeginDebug(p); - return secondary_.GetBlockBeginFastLocked(p); - } - uptr GetActuallyAllocatedSize(void *p) { if (primary_.PointerIsMine(p)) return primary_.GetActuallyAllocatedSize(p); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary32.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary32.h index 2c25a687c5f08..b90dabbf77692 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary32.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary32.h @@ -211,7 +211,6 @@ class SizeClassAllocator32 { uptr res = beg + (n * (u32)size); return reinterpret_cast(res); } - void *GetBlockBeginDebug(const void *p) { return GetBlockBegin(p); } uptr GetActuallyAllocatedSize(void *p) { CHECK(PointerIsMine(p)); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h index a6126fc6265eb..0a18b0c58ef79 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h @@ -186,43 +186,19 @@ class SizeClassAllocator64 { void *GetBlockBegin(const void *p) { uptr class_id = GetSizeClass(p); + if (class_id >= kNumClasses) return nullptr; uptr size = ClassIdToSize(class_id); if (!size) return nullptr; uptr chunk_idx = GetChunkIdx((uptr)p, size); uptr reg_beg = GetRegionBegin(p); uptr beg = chunk_idx * size; uptr next_beg = beg + size; - if (class_id >= kNumClasses) return nullptr; const RegionInfo *region = AddressSpaceView::Load(GetRegionInfo(class_id)); if (region->mapped_user >= next_beg) return reinterpret_cast(reg_beg + beg); return nullptr; } - void *GetBlockBeginDebug(const void *p) { - uptr class_id = GetSizeClass(p); - uptr size = ClassIdToSize(class_id); - Printf("GetBlockBeginDebug1 p %p class_id %p size %p\n", p, class_id, size); - if (!size) - return nullptr; - uptr chunk_idx = GetChunkIdx((uptr)p, size); - uptr reg_beg = GetRegionBegin(p); - uptr beg = chunk_idx * size; - uptr next_beg = beg + size; - Printf( - "GetBlockBeginDebug2 chunk_idx %p reg_beg %p beg %p next_beg %p " - "kNumClasses %p\n", - chunk_idx, reg_beg, beg, next_beg, kNumClasses); - if (class_id >= kNumClasses) - return nullptr; - const RegionInfo *region = AddressSpaceView::Load(GetRegionInfo(class_id)); - Printf("GetBlockBeginDebug3 region %p region->mapped_user %p\n", region, - region->mapped_user); - if (region->mapped_user >= next_beg) - return reinterpret_cast(reg_beg + beg); - return nullptr; - } - uptr GetActuallyAllocatedSize(void *p) { CHECK(PointerIsMine(p)); return ClassIdToSize(GetSizeClass(p)); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_secondary.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_secondary.h index 1d128f55de05a..61fb98742373a 100644 --- 
a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_secondary.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_secondary.h @@ -18,8 +18,8 @@ // (currently, 32 bits and internal allocator). class LargeMmapAllocatorPtrArrayStatic { public: - INLINE void *Init() { return &p_[0]; } - INLINE void EnsureSpace(uptr n) { CHECK_LT(n, kMaxNumChunks); } + inline void *Init() { return &p_[0]; } + inline void EnsureSpace(uptr n) { CHECK_LT(n, kMaxNumChunks); } private: static const int kMaxNumChunks = 1 << 15; uptr p_[kMaxNumChunks]; @@ -31,14 +31,14 @@ class LargeMmapAllocatorPtrArrayStatic { // same functionality in Fuchsia case, which does not support MAP_NORESERVE. class LargeMmapAllocatorPtrArrayDynamic { public: - INLINE void *Init() { + inline void *Init() { uptr p = address_range_.Init(kMaxNumChunks * sizeof(uptr), SecondaryAllocatorName); CHECK(p); return reinterpret_cast(p); } - INLINE void EnsureSpace(uptr n) { + inline void EnsureSpace(uptr n) { CHECK_LT(n, kMaxNumChunks); DCHECK(n <= n_reserved_); if (UNLIKELY(n == n_reserved_)) { diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_atomic.h b/compiler-rt/lib/sanitizer_common/sanitizer_atomic.h index a798a0cf25d9c..46f06957228c9 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_atomic.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_atomic.h @@ -72,12 +72,12 @@ namespace __sanitizer { // Clutter-reducing helpers. template -INLINE typename T::Type atomic_load_relaxed(const volatile T *a) { +inline typename T::Type atomic_load_relaxed(const volatile T *a) { return atomic_load(a, memory_order_relaxed); } template -INLINE void atomic_store_relaxed(volatile T *a, typename T::Type v) { +inline void atomic_store_relaxed(volatile T *a, typename T::Type v) { atomic_store(a, v, memory_order_relaxed); } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang.h b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang.h index c40461ebc3bf6..fc13ca52dda74 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang.h @@ -34,16 +34,16 @@ namespace __sanitizer { // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html // for mappings of the memory model to different processors. 
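
The atomic_load_relaxed/atomic_store_relaxed templates touched above are pure clutter-reducers over the memory_order_relaxed overloads. Their behavior corresponds to the standard-library forms, shown here with std::atomic as an analogue since the sanitizer atomic types are not available standalone:

    #include <atomic>

    template <class T>
    inline T load_relaxed(const std::atomic<T> *a) {
      return a->load(std::memory_order_relaxed);
    }

    template <class T>
    inline void store_relaxed(std::atomic<T> *a, T v) {
      a->store(v, std::memory_order_relaxed);
    }

    // Usage mirroring the Verbosity()/SetVerbosity() pair later in this
    // patch: a counter that needs atomicity but no ordering guarantees.
    std::atomic<int> current_verbosity{0};
    inline int Verbosity() { return load_relaxed(&current_verbosity); }
    inline void SetVerbosity(int v) { store_relaxed(&current_verbosity, v); }
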
-INLINE void atomic_signal_fence(memory_order) { +inline void atomic_signal_fence(memory_order) { __asm__ __volatile__("" ::: "memory"); } -INLINE void atomic_thread_fence(memory_order) { +inline void atomic_thread_fence(memory_order) { __sync_synchronize(); } template -INLINE typename T::Type atomic_fetch_add(volatile T *a, +inline typename T::Type atomic_fetch_add(volatile T *a, typename T::Type v, memory_order mo) { (void)mo; DCHECK(!((uptr)a % sizeof(*a))); @@ -51,7 +51,7 @@ INLINE typename T::Type atomic_fetch_add(volatile T *a, } template -INLINE typename T::Type atomic_fetch_sub(volatile T *a, +inline typename T::Type atomic_fetch_sub(volatile T *a, typename T::Type v, memory_order mo) { (void)mo; DCHECK(!((uptr)a % sizeof(*a))); @@ -59,7 +59,7 @@ INLINE typename T::Type atomic_fetch_sub(volatile T *a, } template -INLINE typename T::Type atomic_exchange(volatile T *a, +inline typename T::Type atomic_exchange(volatile T *a, typename T::Type v, memory_order mo) { DCHECK(!((uptr)a % sizeof(*a))); if (mo & (memory_order_release | memory_order_acq_rel | memory_order_seq_cst)) @@ -71,7 +71,7 @@ INLINE typename T::Type atomic_exchange(volatile T *a, } template -INLINE bool atomic_compare_exchange_strong(volatile T *a, typename T::Type *cmp, +inline bool atomic_compare_exchange_strong(volatile T *a, typename T::Type *cmp, typename T::Type xchg, memory_order mo) { typedef typename T::Type Type; @@ -84,7 +84,7 @@ INLINE bool atomic_compare_exchange_strong(volatile T *a, typename T::Type *cmp, } template -INLINE bool atomic_compare_exchange_weak(volatile T *a, +inline bool atomic_compare_exchange_weak(volatile T *a, typename T::Type *cmp, typename T::Type xchg, memory_order mo) { diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_mips.h b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_mips.h index d369aeb9935c6..59155e9883ebe 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_mips.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_mips.h @@ -37,7 +37,7 @@ static struct { } __attribute__((aligned(32))) lock = {0, {0}}; template <> -INLINE atomic_uint64_t::Type atomic_fetch_add(volatile atomic_uint64_t *ptr, +inline atomic_uint64_t::Type atomic_fetch_add(volatile atomic_uint64_t *ptr, atomic_uint64_t::Type val, memory_order mo) { DCHECK(mo & @@ -55,14 +55,14 @@ INLINE atomic_uint64_t::Type atomic_fetch_add(volatile atomic_uint64_t *ptr, } template <> -INLINE atomic_uint64_t::Type atomic_fetch_sub(volatile atomic_uint64_t *ptr, +inline atomic_uint64_t::Type atomic_fetch_sub(volatile atomic_uint64_t *ptr, atomic_uint64_t::Type val, memory_order mo) { return atomic_fetch_add(ptr, -val, mo); } template <> -INLINE bool atomic_compare_exchange_strong(volatile atomic_uint64_t *ptr, +inline bool atomic_compare_exchange_strong(volatile atomic_uint64_t *ptr, atomic_uint64_t::Type *cmp, atomic_uint64_t::Type xchg, memory_order mo) { @@ -87,7 +87,7 @@ INLINE bool atomic_compare_exchange_strong(volatile atomic_uint64_t *ptr, } template <> -INLINE atomic_uint64_t::Type atomic_load(const volatile atomic_uint64_t *ptr, +inline atomic_uint64_t::Type atomic_load(const volatile atomic_uint64_t *ptr, memory_order mo) { DCHECK(mo & (memory_order_relaxed | memory_order_releasae | memory_order_seq_cst)); @@ -100,7 +100,7 @@ INLINE atomic_uint64_t::Type atomic_load(const volatile atomic_uint64_t *ptr, } template <> -INLINE void atomic_store(volatile atomic_uint64_t *ptr, atomic_uint64_t::Type v, +inline void atomic_store(volatile atomic_uint64_t *ptr, 
atomic_uint64_t::Type v, memory_order mo) { DCHECK(mo & (memory_order_relaxed | memory_order_releasae | memory_order_seq_cst)); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_other.h b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_other.h index b8685a8542676..7580ac2dc5889 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_other.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_other.h @@ -17,12 +17,12 @@ namespace __sanitizer { -INLINE void proc_yield(int cnt) { +inline void proc_yield(int cnt) { __asm__ __volatile__("" ::: "memory"); } template -INLINE typename T::Type atomic_load( +inline typename T::Type atomic_load( const volatile T *a, memory_order mo) { DCHECK(mo & (memory_order_relaxed | memory_order_consume | memory_order_acquire | memory_order_seq_cst)); @@ -60,7 +60,7 @@ INLINE typename T::Type atomic_load( } template -INLINE void atomic_store(volatile T *a, typename T::Type v, memory_order mo) { +inline void atomic_store(volatile T *a, typename T::Type v, memory_order mo) { DCHECK(mo & (memory_order_relaxed | memory_order_release | memory_order_seq_cst)); DCHECK(!((uptr)a % sizeof(*a))); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_x86.h b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_x86.h index f2ce553baa7a1..51597b4927412 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_x86.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_x86.h @@ -16,7 +16,7 @@ namespace __sanitizer { -INLINE void proc_yield(int cnt) { +inline void proc_yield(int cnt) { __asm__ __volatile__("" ::: "memory"); for (int i = 0; i < cnt; i++) __asm__ __volatile__("pause"); @@ -24,7 +24,7 @@ INLINE void proc_yield(int cnt) { } template -INLINE typename T::Type atomic_load( +inline typename T::Type atomic_load( const volatile T *a, memory_order mo) { DCHECK(mo & (memory_order_relaxed | memory_order_consume | memory_order_acquire | memory_order_seq_cst)); @@ -70,7 +70,7 @@ INLINE typename T::Type atomic_load( } template -INLINE void atomic_store(volatile T *a, typename T::Type v, memory_order mo) { +inline void atomic_store(volatile T *a, typename T::Type v, memory_order mo) { DCHECK(mo & (memory_order_relaxed | memory_order_release | memory_order_seq_cst)); DCHECK(!((uptr)a % sizeof(*a))); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_msvc.h b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_msvc.h index 6a7c5465dcbbc..31317adcdfc99 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_msvc.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_msvc.h @@ -54,21 +54,21 @@ extern "C" long long _InterlockedExchangeAdd64(long long volatile *Addend, namespace __sanitizer { -INLINE void atomic_signal_fence(memory_order) { +inline void atomic_signal_fence(memory_order) { _ReadWriteBarrier(); } -INLINE void atomic_thread_fence(memory_order) { +inline void atomic_thread_fence(memory_order) { _mm_mfence(); } -INLINE void proc_yield(int cnt) { +inline void proc_yield(int cnt) { for (int i = 0; i < cnt; i++) _mm_pause(); } template -INLINE typename T::Type atomic_load( +inline typename T::Type atomic_load( const volatile T *a, memory_order mo) { DCHECK(mo & (memory_order_relaxed | memory_order_consume | memory_order_acquire | memory_order_seq_cst)); @@ -86,7 +86,7 @@ INLINE typename T::Type atomic_load( } template -INLINE void atomic_store(volatile T *a, typename T::Type v, memory_order mo) { +inline void atomic_store(volatile T *a, typename T::Type v, memory_order 
mo) { DCHECK(mo & (memory_order_relaxed | memory_order_release | memory_order_seq_cst)); DCHECK(!((uptr)a % sizeof(*a))); @@ -102,7 +102,7 @@ INLINE void atomic_store(volatile T *a, typename T::Type v, memory_order mo) { atomic_thread_fence(memory_order_seq_cst); } -INLINE u32 atomic_fetch_add(volatile atomic_uint32_t *a, +inline u32 atomic_fetch_add(volatile atomic_uint32_t *a, u32 v, memory_order mo) { (void)mo; DCHECK(!((uptr)a % sizeof(*a))); @@ -110,7 +110,7 @@ INLINE u32 atomic_fetch_add(volatile atomic_uint32_t *a, (long)v); } -INLINE uptr atomic_fetch_add(volatile atomic_uintptr_t *a, +inline uptr atomic_fetch_add(volatile atomic_uintptr_t *a, uptr v, memory_order mo) { (void)mo; DCHECK(!((uptr)a % sizeof(*a))); @@ -123,7 +123,7 @@ INLINE uptr atomic_fetch_add(volatile atomic_uintptr_t *a, #endif } -INLINE u32 atomic_fetch_sub(volatile atomic_uint32_t *a, +inline u32 atomic_fetch_sub(volatile atomic_uint32_t *a, u32 v, memory_order mo) { (void)mo; DCHECK(!((uptr)a % sizeof(*a))); @@ -131,7 +131,7 @@ INLINE u32 atomic_fetch_sub(volatile atomic_uint32_t *a, -(long)v); } -INLINE uptr atomic_fetch_sub(volatile atomic_uintptr_t *a, +inline uptr atomic_fetch_sub(volatile atomic_uintptr_t *a, uptr v, memory_order mo) { (void)mo; DCHECK(!((uptr)a % sizeof(*a))); @@ -144,28 +144,28 @@ INLINE uptr atomic_fetch_sub(volatile atomic_uintptr_t *a, #endif } -INLINE u8 atomic_exchange(volatile atomic_uint8_t *a, +inline u8 atomic_exchange(volatile atomic_uint8_t *a, u8 v, memory_order mo) { (void)mo; DCHECK(!((uptr)a % sizeof(*a))); return (u8)_InterlockedExchange8((volatile char*)&a->val_dont_use, v); } -INLINE u16 atomic_exchange(volatile atomic_uint16_t *a, +inline u16 atomic_exchange(volatile atomic_uint16_t *a, u16 v, memory_order mo) { (void)mo; DCHECK(!((uptr)a % sizeof(*a))); return (u16)_InterlockedExchange16((volatile short*)&a->val_dont_use, v); } -INLINE u32 atomic_exchange(volatile atomic_uint32_t *a, +inline u32 atomic_exchange(volatile atomic_uint32_t *a, u32 v, memory_order mo) { (void)mo; DCHECK(!((uptr)a % sizeof(*a))); return (u32)_InterlockedExchange((volatile long*)&a->val_dont_use, v); } -INLINE bool atomic_compare_exchange_strong(volatile atomic_uint8_t *a, +inline bool atomic_compare_exchange_strong(volatile atomic_uint8_t *a, u8 *cmp, u8 xchgv, memory_order mo) { @@ -191,7 +191,7 @@ INLINE bool atomic_compare_exchange_strong(volatile atomic_uint8_t *a, return false; } -INLINE bool atomic_compare_exchange_strong(volatile atomic_uintptr_t *a, +inline bool atomic_compare_exchange_strong(volatile atomic_uintptr_t *a, uptr *cmp, uptr xchg, memory_order mo) { @@ -204,7 +204,7 @@ INLINE bool atomic_compare_exchange_strong(volatile atomic_uintptr_t *a, return false; } -INLINE bool atomic_compare_exchange_strong(volatile atomic_uint16_t *a, +inline bool atomic_compare_exchange_strong(volatile atomic_uint16_t *a, u16 *cmp, u16 xchg, memory_order mo) { @@ -217,7 +217,7 @@ INLINE bool atomic_compare_exchange_strong(volatile atomic_uint16_t *a, return false; } -INLINE bool atomic_compare_exchange_strong(volatile atomic_uint32_t *a, +inline bool atomic_compare_exchange_strong(volatile atomic_uint32_t *a, u32 *cmp, u32 xchg, memory_order mo) { @@ -230,7 +230,7 @@ INLINE bool atomic_compare_exchange_strong(volatile atomic_uint32_t *a, return false; } -INLINE bool atomic_compare_exchange_strong(volatile atomic_uint64_t *a, +inline bool atomic_compare_exchange_strong(volatile atomic_uint64_t *a, u64 *cmp, u64 xchg, memory_order mo) { @@ -244,7 +244,7 @@ INLINE bool 
atomic_compare_exchange_strong(volatile atomic_uint64_t *a, } template -INLINE bool atomic_compare_exchange_weak(volatile T *a, +inline bool atomic_compare_exchange_weak(volatile T *a, typename T::Type *cmp, typename T::Type xchg, memory_order mo) { diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common.h b/compiler-rt/lib/sanitizer_common/sanitizer_common.h index 86e19d96e0369..c8575a984c0c3 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common.h @@ -53,25 +53,25 @@ const u64 kExternalPCBit = 1ULL << 60; extern const char *SanitizerToolName; // Can be changed by the tool. extern atomic_uint32_t current_verbosity; -INLINE void SetVerbosity(int verbosity) { +inline void SetVerbosity(int verbosity) { atomic_store(¤t_verbosity, verbosity, memory_order_relaxed); } -INLINE int Verbosity() { +inline int Verbosity() { return atomic_load(¤t_verbosity, memory_order_relaxed); } #if SANITIZER_ANDROID -INLINE uptr GetPageSize() { +inline uptr GetPageSize() { // Android post-M sysconf(_SC_PAGESIZE) crashes if called from .preinit_array. return 4096; } -INLINE uptr GetPageSizeCached() { +inline uptr GetPageSizeCached() { return 4096; } #else uptr GetPageSize(); extern uptr PageSizeCached; -INLINE uptr GetPageSizeCached() { +inline uptr GetPageSizeCached() { if (!PageSizeCached) PageSizeCached = GetPageSize(); return PageSizeCached; @@ -91,7 +91,7 @@ void GetThreadStackAndTls(bool main, uptr *stk_addr, uptr *stk_size, // Memory management void *MmapOrDie(uptr size, const char *mem_type, bool raw_report = false); -INLINE void *MmapOrDieQuietly(uptr size, const char *mem_type) { +inline void *MmapOrDieQuietly(uptr size, const char *mem_type) { return MmapOrDie(size, mem_type, /*raw_report*/ true); } void UnmapOrDie(void *addr, uptr size); @@ -374,7 +374,7 @@ unsigned char _BitScanReverse64(unsigned long *index, unsigned __int64 mask); } #endif -INLINE uptr MostSignificantSetBitIndex(uptr x) { +inline uptr MostSignificantSetBitIndex(uptr x) { CHECK_NE(x, 0U); unsigned long up; #if !SANITIZER_WINDOWS || defined(__clang__) || defined(__GNUC__) @@ -391,7 +391,7 @@ INLINE uptr MostSignificantSetBitIndex(uptr x) { return up; } -INLINE uptr LeastSignificantSetBitIndex(uptr x) { +inline uptr LeastSignificantSetBitIndex(uptr x) { CHECK_NE(x, 0U); unsigned long up; #if !SANITIZER_WINDOWS || defined(__clang__) || defined(__GNUC__) @@ -408,11 +408,11 @@ INLINE uptr LeastSignificantSetBitIndex(uptr x) { return up; } -INLINE bool IsPowerOfTwo(uptr x) { +inline bool IsPowerOfTwo(uptr x) { return (x & (x - 1)) == 0; } -INLINE uptr RoundUpToPowerOfTwo(uptr size) { +inline uptr RoundUpToPowerOfTwo(uptr size) { CHECK(size); if (IsPowerOfTwo(size)) return size; @@ -422,20 +422,20 @@ INLINE uptr RoundUpToPowerOfTwo(uptr size) { return 1ULL << (up + 1); } -INLINE uptr RoundUpTo(uptr size, uptr boundary) { +inline uptr RoundUpTo(uptr size, uptr boundary) { RAW_CHECK(IsPowerOfTwo(boundary)); return (size + boundary - 1) & ~(boundary - 1); } -INLINE uptr RoundDownTo(uptr x, uptr boundary) { +inline uptr RoundDownTo(uptr x, uptr boundary) { return x & ~(boundary - 1); } -INLINE bool IsAligned(uptr a, uptr alignment) { +inline bool IsAligned(uptr a, uptr alignment) { return (a & (alignment - 1)) == 0; } -INLINE uptr Log2(uptr x) { +inline uptr Log2(uptr x) { CHECK(IsPowerOfTwo(x)); return LeastSignificantSetBitIndex(x); } @@ -451,14 +451,14 @@ template void Swap(T& a, T& b) { } // Char handling -INLINE bool IsSpace(int c) { +inline bool IsSpace(int c) { 
return (c == ' ') || (c == '\n') || (c == '\t') || (c == '\f') || (c == '\r') || (c == '\v'); } -INLINE bool IsDigit(int c) { +inline bool IsDigit(int c) { return (c >= '0') && (c <= '9'); } -INLINE int ToLower(int c) { +inline int ToLower(int c) { return (c >= 'A' && c <= 'Z') ? (c + 'a' - 'A') : c; } @@ -840,15 +840,15 @@ void WriteToSyslog(const char *buffer); #if SANITIZER_MAC || SANITIZER_WIN_TRACE void LogFullErrorReport(const char *buffer); #else -INLINE void LogFullErrorReport(const char *buffer) {} +inline void LogFullErrorReport(const char *buffer) {} #endif #if SANITIZER_LINUX || SANITIZER_MAC void WriteOneLineToSyslog(const char *s); void LogMessageOnPrintf(const char *str); #else -INLINE void WriteOneLineToSyslog(const char *s) {} -INLINE void LogMessageOnPrintf(const char *str) {} +inline void WriteOneLineToSyslog(const char *s) {} +inline void LogMessageOnPrintf(const char *str) {} #endif #if SANITIZER_LINUX || SANITIZER_WIN_TRACE @@ -856,21 +856,21 @@ INLINE void LogMessageOnPrintf(const char *str) {} void AndroidLogInit(); void SetAbortMessage(const char *); #else -INLINE void AndroidLogInit() {} +inline void AndroidLogInit() {} // FIXME: MacOS implementation could use CRSetCrashLogMessage. -INLINE void SetAbortMessage(const char *) {} +inline void SetAbortMessage(const char *) {} #endif #if SANITIZER_ANDROID void SanitizerInitializeUnwinder(); AndroidApiLevel AndroidGetApiLevel(); #else -INLINE void AndroidLogWrite(const char *buffer_unused) {} -INLINE void SanitizerInitializeUnwinder() {} -INLINE AndroidApiLevel AndroidGetApiLevel() { return ANDROID_NOT_ANDROID; } +inline void AndroidLogWrite(const char *buffer_unused) {} +inline void SanitizerInitializeUnwinder() {} +inline AndroidApiLevel AndroidGetApiLevel() { return ANDROID_NOT_ANDROID; } #endif -INLINE uptr GetPthreadDestructorIterations() { +inline uptr GetPthreadDestructorIterations() { #if SANITIZER_ANDROID return (AndroidGetApiLevel() == ANDROID_LOLLIPOP_MR1) ? 8 : 4; #elif SANITIZER_POSIX @@ -976,7 +976,7 @@ RunOnDestruction at_scope_exit(Fn fn) { #if SANITIZER_LINUX && SANITIZER_S390_64 void AvoidCVE_2016_2143(); #else -INLINE void AvoidCVE_2016_2143() {} +inline void AvoidCVE_2016_2143() {} #endif struct StackDepotStats { @@ -997,7 +997,7 @@ bool GetRandom(void *buffer, uptr length, bool blocking = true); // Returns the number of logical processors on the system. u32 GetNumberOfCPUs(); extern u32 NumberOfCPUsCached; -INLINE u32 GetNumberOfCPUsCached() { +inline u32 GetNumberOfCPUsCached() { if (!NumberOfCPUsCached) NumberOfCPUsCached = GetNumberOfCPUs(); return NumberOfCPUsCached; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h index 84973eedda60a..a6c5514870528 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h @@ -196,9 +196,6 @@ typedef u64 tid_t; // This header should NOT include any other headers to avoid portability issues. // Common defs. 
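
The RoundUpTo/RoundDownTo helpers converted above depend on the boundary being a power of two: (size + boundary - 1) & ~(boundary - 1) bumps size past the next boundary and then clears the low bits, with no division. A quick self-contained check of the identity (a zero guard is added to IsPowerOfTwo for the standalone version):

    #include <cassert>
    #include <cstdint>

    using uptr = uintptr_t;

    inline bool IsPowerOfTwo(uptr x) { return x && (x & (x - 1)) == 0; }

    inline uptr RoundUpTo(uptr size, uptr boundary) {
      assert(IsPowerOfTwo(boundary));
      return (size + boundary - 1) & ~(boundary - 1);
    }

    inline uptr RoundDownTo(uptr x, uptr boundary) {
      return x & ~(boundary - 1);
    }

    int main() {
      assert(RoundUpTo(17, 16) == 32);
      assert(RoundUpTo(16, 16) == 16);   // already aligned: unchanged
      assert(RoundDownTo(17, 16) == 16);
    }
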
-#ifndef INLINE -#define INLINE inline -#endif #define INTERFACE_ATTRIBUTE SANITIZER_INTERFACE_ATTRIBUTE #define SANITIZER_WEAK_DEFAULT_IMPL \ extern "C" SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE NOINLINE diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.h b/compiler-rt/lib/sanitizer_common/sanitizer_linux.h index c162d1ca5d285..1adc120815d14 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.h @@ -109,7 +109,7 @@ void ForEachMappedRegion(link_map *map, void (*cb)(const void *, uptr)); // Releases memory pages entirely within the [beg, end] address range. // The pages no longer count toward RSS; reads are guaranteed to return 0. // Requires (but does not verify!) that pages are MAP_PRIVATE. -INLINE void ReleaseMemoryPagesToOSAndZeroFill(uptr beg, uptr end) { +inline void ReleaseMemoryPagesToOSAndZeroFill(uptr beg, uptr end) { // man madvise on Linux promises zero-fill for anonymous private pages. // Testing shows the same behaviour for private (but not anonymous) mappings // of shm_open() files, as long as the underlying file is untouched. diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp index 86918a51a2460..28c14f2717be9 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp @@ -772,7 +772,7 @@ void LogMessageOnPrintf(const char *str) { // initialized after the vDSO function pointers, so if it exists, is not null // and is not empty, we can use clock_gettime. extern "C" SANITIZER_WEAK_ATTRIBUTE char *__progname; -INLINE bool CanUseVDSO() { +inline bool CanUseVDSO() { // Bionic is safe, it checks for the vDSO function pointers to be initialized. if (SANITIZER_ANDROID) return true; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.h b/compiler-rt/lib/sanitizer_common/sanitizer_mac.h index f61ebe2566e5f..023071e4f11de 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.h @@ -75,7 +75,7 @@ asm(".desc ___crashreporter_info__, 0x10"); namespace __sanitizer { static BlockingMutex crashreporter_info_mutex(LINKER_INITIALIZED); -INLINE void CRAppendCrashLogMessage(const char *msg) { +inline void CRAppendCrashLogMessage(const char *msg) { BlockingMutexLock l(&crashreporter_info_mutex); internal_strlcat(__crashreporter_info_buff__, msg, sizeof(__crashreporter_info_buff__)); } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp index dcc6c71c07d8a..b1c15be58deaa 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp @@ -81,8 +81,6 @@ #include #undef _KERNEL -#undef INLINE // to avoid clashes with sanitizers' definitions - #undef IOC_DIRMASK // Include these after system headers to avoid name clashes and ambiguities. 
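
Dropping the INLINE macro from sanitizer_internal_defs.h is what allows the FreeBSD workaround above (the #undef to dodge a clash with kernel headers) to go away; plain inline needs no such hygiene. Separately, ReleaseMemoryPagesToOSAndZeroFill documents a contract worth spelling out: on Linux, madvise(MADV_DONTNEED) on an anonymous private mapping both drops the pages from RSS and guarantees that subsequent reads return zeroes. A minimal Linux-only sketch of that contract, error handling omitted:

    #include <cassert>
    #include <cstddef>
    #include <sys/mman.h>

    int main() {
      const size_t kSize = 1 << 20;
      void *m = mmap(nullptr, kSize, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      assert(m != MAP_FAILED);
      char *p = static_cast<char *>(m);
      p[0] = 42;
      // Release the range: pages stop counting toward RSS, and man madvise
      // promises zero-fill for anonymous private pages on the next read.
      madvise(m, kSize, MADV_DONTNEED);
      assert(p[0] == 0);
      munmap(m, kSize);
    }
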
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cpp index 25da334b63f09..c8f2aa5dba4af 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +84,7 @@ #include #include +#include #include #include #include @@ -139,7 +141,158 @@ #include #include #include +#if __has_include() #include +#else +/* Fallback for MKISCSI=no */ + +typedef struct { + uint32_t status; + uint32_t session_id; + uint32_t connection_id; +} iscsi_conn_status_parameters_t; + +typedef struct { + uint32_t status; + uint16_t interface_version; + uint16_t major; + uint16_t minor; + uint8_t version_string[224]; +} iscsi_get_version_parameters_t; + +typedef struct { + uint32_t status; + uint32_t session_id; + uint32_t connection_id; + struct { + unsigned int immediate : 1; + } options; + uint64_t lun; + scsireq_t req; /* from */ +} iscsi_iocommand_parameters_t; + +typedef enum { + ISCSI_AUTH_None = 0, + ISCSI_AUTH_CHAP = 1, + ISCSI_AUTH_KRB5 = 2, + ISCSI_AUTH_SRP = 3 +} iscsi_auth_types_t; + +typedef enum { + ISCSI_LOGINTYPE_DISCOVERY = 0, + ISCSI_LOGINTYPE_NOMAP = 1, + ISCSI_LOGINTYPE_MAP = 2 +} iscsi_login_session_type_t; + +typedef enum { ISCSI_DIGEST_None = 0, ISCSI_DIGEST_CRC32C = 1 } iscsi_digest_t; + +typedef enum { + ISCSI_SESSION_TERMINATED = 1, + ISCSI_CONNECTION_TERMINATED, + ISCSI_RECOVER_CONNECTION, + ISCSI_DRIVER_TERMINATING +} iscsi_event_t; + +typedef struct { + unsigned int mutual_auth : 1; + unsigned int is_secure : 1; + unsigned int auth_number : 4; + iscsi_auth_types_t auth_type[4]; +} iscsi_auth_info_t; + +typedef struct { + uint32_t status; + int socket; + struct { + unsigned int HeaderDigest : 1; + unsigned int DataDigest : 1; + unsigned int MaxConnections : 1; + unsigned int DefaultTime2Wait : 1; + unsigned int DefaultTime2Retain : 1; + unsigned int MaxRecvDataSegmentLength : 1; + unsigned int auth_info : 1; + unsigned int user_name : 1; + unsigned int password : 1; + unsigned int target_password : 1; + unsigned int TargetName : 1; + unsigned int TargetAlias : 1; + unsigned int ErrorRecoveryLevel : 1; + } is_present; + iscsi_auth_info_t auth_info; + iscsi_login_session_type_t login_type; + iscsi_digest_t HeaderDigest; + iscsi_digest_t DataDigest; + uint32_t session_id; + uint32_t connection_id; + uint32_t MaxRecvDataSegmentLength; + uint16_t MaxConnections; + uint16_t DefaultTime2Wait; + uint16_t DefaultTime2Retain; + uint16_t ErrorRecoveryLevel; + void *user_name; + void *password; + void *target_password; + void *TargetName; + void *TargetAlias; +} iscsi_login_parameters_t; + +typedef struct { + uint32_t status; + uint32_t session_id; +} iscsi_logout_parameters_t; + +typedef struct { + uint32_t status; + uint32_t event_id; +} iscsi_register_event_parameters_t; + +typedef struct { + uint32_t status; + uint32_t session_id; + uint32_t connection_id; +} iscsi_remove_parameters_t; + +typedef struct { + uint32_t status; + uint32_t session_id; + void *response_buffer; + uint32_t response_size; + uint32_t response_used; + uint32_t response_total; + uint8_t key[224]; +} iscsi_send_targets_parameters_t; + +typedef struct { + uint32_t status; + uint8_t InitiatorName[224]; + uint8_t InitiatorAlias[224]; + uint8_t ISID[6]; +} iscsi_set_node_name_parameters_t; + +typedef struct { + uint32_t status; + uint32_t event_id; + 
iscsi_event_t event_kind; + uint32_t session_id; + uint32_t connection_id; + uint32_t reason; +} iscsi_wait_event_parameters_t; + +#define ISCSI_GET_VERSION _IOWR(0, 1, iscsi_get_version_parameters_t) +#define ISCSI_LOGIN _IOWR(0, 2, iscsi_login_parameters_t) +#define ISCSI_LOGOUT _IOWR(0, 3, iscsi_logout_parameters_t) +#define ISCSI_ADD_CONNECTION _IOWR(0, 4, iscsi_login_parameters_t) +#define ISCSI_RESTORE_CONNECTION _IOWR(0, 5, iscsi_login_parameters_t) +#define ISCSI_REMOVE_CONNECTION _IOWR(0, 6, iscsi_remove_parameters_t) +#define ISCSI_CONNECTION_STATUS _IOWR(0, 7, iscsi_conn_status_parameters_t) +#define ISCSI_SEND_TARGETS _IOWR(0, 8, iscsi_send_targets_parameters_t) +#define ISCSI_SET_NODE_NAME _IOWR(0, 9, iscsi_set_node_name_parameters_t) +#define ISCSI_IO_COMMAND _IOWR(0, 10, iscsi_iocommand_parameters_t) +#define ISCSI_REGISTER_EVENT _IOWR(0, 11, iscsi_register_event_parameters_t) +#define ISCSI_DEREGISTER_EVENT _IOWR(0, 12, iscsi_register_event_parameters_t) +#define ISCSI_WAIT_EVENT _IOWR(0, 13, iscsi_wait_event_parameters_t) +#define ISCSI_POLL_EVENT _IOWR(0, 14, iscsi_wait_event_parameters_t) +#endif #include #include #include @@ -372,7 +525,7 @@ struct urio_command { #include "sanitizer_platform_limits_netbsd.h" namespace __sanitizer { -void *__sanitizer_get_link_map_by_dlopen_handle(void* handle) { +void *__sanitizer_get_link_map_by_dlopen_handle(void *handle) { void *p = nullptr; return internal_dlinfo(handle, RTLD_DI_LINKMAP, &p) == 0 ? p : nullptr; } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.h index ae54a8cf105ee..9e28dcfef0415 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.h @@ -21,8 +21,8 @@ namespace __sanitizer { void *__sanitizer_get_link_map_by_dlopen_handle(void *handle); -# define GET_LINK_MAP_BY_DLOPEN_HANDLE(handle) \ - (link_map *)__sanitizer_get_link_map_by_dlopen_handle(handle) +#define GET_LINK_MAP_BY_DLOPEN_HANDLE(handle) \ + (link_map *)__sanitizer_get_link_map_by_dlopen_handle(handle) extern unsigned struct_utsname_sz; extern unsigned struct_stat_sz; @@ -129,6 +129,12 @@ struct __sanitizer_shmid_ds { void *_shm_internal; }; +struct __sanitizer_protoent { + char *p_name; + char **p_aliases; + int p_proto; +}; + struct __sanitizer_netent { char *n_name; char **n_aliases; @@ -1018,12 +1024,10 @@ extern unsigned struct_RF_ProgressInfo_sz; extern unsigned struct_nvlist_ref_sz; extern unsigned struct_StringList_sz; - // A special value to mark ioctls that are not present on the target platform, // when it can not be determined without including any system headers. extern const unsigned IOCTL_NOT_PRESENT; - extern unsigned IOCTL_AFM_ADDFMAP; extern unsigned IOCTL_AFM_DELFMAP; extern unsigned IOCTL_AFM_CLEANFMAP; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld.h b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld.h index 4e42400571423..7eb7c7684af5e 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld.h @@ -32,13 +32,11 @@ class SuspendedThreadsList { // Can't declare pure virtual functions in sanitizer runtimes: // __cxa_pure_virtual might be unavailable. Use UNIMPLEMENTED() instead. 
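
The SuspendedThreadsList interface change here is the pivot for the whole register-buffer rework: GetRegistersAndSP now receives a growable buffer rather than a caller-sized array, which lets the Linux implementation append variable-length regsets (NT_PRSTATUS plus NT_X86_XSTATE or NT_FPREGSET) and makes RegisterCount() unnecessary. The shape of an implementation under the new contract, sketched with std::vector and a fabricated register struct standing in for the real types:

    #include <cstdint>
    #include <cstring>
    #include <vector>

    using uptr = uintptr_t;

    struct FakeRegs { uptr sp; uptr gpr[31]; };  // illustrative register file

    enum PtraceRegistersStatus { REGISTERS_AVAILABLE, REGISTERS_UNAVAILABLE };

    // The producer sizes the buffer itself, rounding up to whole uptrs, so
    // callers can scan data()..data()+size() with no separate count query.
    PtraceRegistersStatus GetRegistersAndSP(const FakeRegs &regs,
                                            std::vector<uptr> *buffer,
                                            uptr *sp) {
      buffer->resize((sizeof(regs) + sizeof(uptr) - 1) / sizeof(uptr));
      std::memcpy(buffer->data(), &regs, sizeof(regs));
      *sp = regs.sp;
      return REGISTERS_AVAILABLE;
    }
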
- virtual PtraceRegistersStatus GetRegistersAndSP(uptr index, uptr *buffer, - uptr *sp) const { + virtual PtraceRegistersStatus GetRegistersAndSP( + uptr index, InternalMmapVector *buffer, uptr *sp) const { UNIMPLEMENTED(); } - // The buffer in GetRegistersAndSP should be at least this big. - virtual uptr RegisterCount() const { UNIMPLEMENTED(); } virtual uptr ThreadCount() const { UNIMPLEMENTED(); } virtual tid_t GetThreadID(uptr index) const { UNIMPLEMENTED(); } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp index bd72c0ae00cbe..1e71d6512c1f5 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp @@ -94,9 +94,9 @@ class SuspendedThreadsListLinux : public SuspendedThreadsList { bool ContainsTid(tid_t thread_id) const; void Append(tid_t tid); - PtraceRegistersStatus GetRegistersAndSP(uptr index, uptr *buffer, + PtraceRegistersStatus GetRegistersAndSP(uptr index, + InternalMmapVector *buffer, uptr *sp) const override; - uptr RegisterCount() const override; private: InternalMmapVector thread_ids_; @@ -485,6 +485,9 @@ typedef user_regs_struct regs_struct; #else #define REG_SP rsp #endif +#define ARCH_IOVEC_FOR_GETREGSET +// Compiler may use FP registers to store pointers. +static constexpr uptr kExtraRegs[] = {NT_X86_XSTATE, NT_FPREGSET}; #elif defined(__powerpc__) || defined(__powerpc64__) typedef pt_regs regs_struct; @@ -501,11 +504,13 @@ typedef struct user regs_struct; #elif defined(__aarch64__) typedef struct user_pt_regs regs_struct; #define REG_SP sp +static constexpr uptr kExtraRegs[] = {0}; #define ARCH_IOVEC_FOR_GETREGSET #elif defined(__s390__) typedef _user_regs_struct regs_struct; #define REG_SP gprs[15] +static constexpr uptr kExtraRegs[] = {0}; #define ARCH_IOVEC_FOR_GETREGSET #else @@ -533,24 +538,58 @@ void SuspendedThreadsListLinux::Append(tid_t tid) { } PtraceRegistersStatus SuspendedThreadsListLinux::GetRegistersAndSP( - uptr index, uptr *buffer, uptr *sp) const { + uptr index, InternalMmapVector *buffer, uptr *sp) const { pid_t tid = GetThreadID(index); - regs_struct regs; + constexpr uptr uptr_sz = sizeof(uptr); int pterrno; #ifdef ARCH_IOVEC_FOR_GETREGSET - struct iovec regset_io; - regset_io.iov_base = ®s; - regset_io.iov_len = sizeof(regs_struct); - bool isErr = internal_iserror(internal_ptrace(PTRACE_GETREGSET, tid, - (void*)NT_PRSTATUS, (void*)®set_io), - &pterrno); + auto append = [&](uptr regset) { + uptr size = buffer->size(); + // NT_X86_XSTATE requires 64bit alignment. + uptr size_up = RoundUpTo(size, 8 / uptr_sz); + buffer->reserve(Max(1024, size_up)); + struct iovec regset_io; + for (;; buffer->resize(buffer->capacity() * 2)) { + buffer->resize(buffer->capacity()); + uptr available_bytes = (buffer->size() - size_up) * uptr_sz; + regset_io.iov_base = buffer->data() + size_up; + regset_io.iov_len = available_bytes; + bool fail = + internal_iserror(internal_ptrace(PTRACE_GETREGSET, tid, + (void *)regset, (void *)®set_io), + &pterrno); + if (fail) { + VReport(1, "Could not get regset %p from thread %d (errno %d).\n", + regset, tid, pterrno); + buffer->resize(size); + return false; + } + + // Far enough from the buffer size, no need to resize and repeat. 
+ if (regset_io.iov_len + 64 < available_bytes) + break; + } + buffer->resize(size_up + RoundUpTo(regset_io.iov_len, uptr_sz) / uptr_sz); + return true; + }; + + buffer->clear(); + bool fail = !append(NT_PRSTATUS); + if (!fail) { + // Accept the first available and do not report errors. + for (uptr regs : kExtraRegs) + if (regs && append(regs)) + break; + } #else - bool isErr = internal_iserror(internal_ptrace(PTRACE_GETREGS, tid, nullptr, - ®s), &pterrno); -#endif - if (isErr) { + buffer->resize(RoundUpTo(sizeof(regs_struct), uptr_sz) / uptr_sz); + bool fail = internal_iserror( + internal_ptrace(PTRACE_GETREGS, tid, nullptr, buffer->data()), &pterrno); + if (fail) VReport(1, "Could not get registers from thread %d (errno %d).\n", tid, pterrno); +#endif + if (fail) { // ESRCH means that the given thread is not suspended or already dead. // Therefore it's unsafe to inspect its data (e.g. walk through stack) and // we should notify caller about this. @@ -558,14 +597,10 @@ PtraceRegistersStatus SuspendedThreadsListLinux::GetRegistersAndSP( : REGISTERS_UNAVAILABLE; } - *sp = regs.REG_SP; - internal_memcpy(buffer, ®s, sizeof(regs)); + *sp = reinterpret_cast(buffer->data())[0].REG_SP; return REGISTERS_AVAILABLE; } -uptr SuspendedThreadsListLinux::RegisterCount() const { - return sizeof(regs_struct) / sizeof(uptr); -} } // namespace __sanitizer #endif // SANITIZER_LINUX && (defined(__x86_64__) || defined(__mips__) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_mac.cpp index 7f9529aa35562..a605d5b9ff6bd 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_mac.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_mac.cpp @@ -37,9 +37,9 @@ class SuspendedThreadsListMac : public SuspendedThreadsList { bool ContainsThread(thread_t thread) const; void Append(thread_t thread); - PtraceRegistersStatus GetRegistersAndSP(uptr index, uptr *buffer, + PtraceRegistersStatus GetRegistersAndSP(uptr index, + InternalMmapVector *buffer, uptr *sp) const override; - uptr RegisterCount() const override; private: InternalMmapVector threads_; @@ -142,7 +142,7 @@ void SuspendedThreadsListMac::Append(thread_t thread) { } PtraceRegistersStatus SuspendedThreadsListMac::GetRegistersAndSP( - uptr index, uptr *buffer, uptr *sp) const { + uptr index, InternalMmapVector *buffer, uptr *sp) const { thread_t thread = GetThread(index); regs_struct regs; int err; @@ -159,7 +159,8 @@ PtraceRegistersStatus SuspendedThreadsListMac::GetRegistersAndSP( : REGISTERS_UNAVAILABLE; } - internal_memcpy(buffer, ®s, sizeof(regs)); + buffer->resize(RoundUpTo(sizeof(regs), sizeof(uptr)) / sizeof(uptr)); + internal_memcpy(buffer->data(), ®s, sizeof(regs)); #if defined(__aarch64__) && defined(arm_thread_state64_get_sp) *sp = arm_thread_state64_get_sp(regs); #else @@ -173,9 +174,6 @@ PtraceRegistersStatus SuspendedThreadsListMac::GetRegistersAndSP( return REGISTERS_AVAILABLE; } -uptr SuspendedThreadsListMac::RegisterCount() const { - return MACHINE_THREAD_STATE_COUNT; -} } // namespace __sanitizer #endif // SANITIZER_MAC && (defined(__x86_64__) || defined(__aarch64__)) || diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_netbsd_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_netbsd_libcdep.cpp index 1ed21343254d5..70df31e6351cb 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_netbsd_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_netbsd_libcdep.cpp @@ 
-57,9 +57,9 @@ class SuspendedThreadsListNetBSD : public SuspendedThreadsList { bool ContainsTid(tid_t thread_id) const; void Append(tid_t tid); - PtraceRegistersStatus GetRegistersAndSP(uptr index, uptr *buffer, + PtraceRegistersStatus GetRegistersAndSP(uptr index, + InternalMmapVector *buffer, uptr *sp) const; - uptr RegisterCount() const; private: InternalMmapVector thread_ids_; @@ -131,7 +131,7 @@ bool ThreadSuspender::SuspendAllThreads() { pl.pl_lwpid = 0; int val; - while ((val = ptrace(op, pid_, (void *)&pl, sizeof(pl))) != -1 && + while ((val = internal_ptrace(op, pid_, (void *)&pl, sizeof(pl))) != -1 && pl.pl_lwpid != 0) { suspended_threads_list_.Append(pl.pl_lwpid); VReport(2, "Appended thread %d in process %d.\n", pl.pl_lwpid, pid_); @@ -335,7 +335,7 @@ void SuspendedThreadsListNetBSD::Append(tid_t tid) { } PtraceRegistersStatus SuspendedThreadsListNetBSD::GetRegistersAndSP( - uptr index, uptr *buffer, uptr *sp) const { + uptr index, InternalMmapVector *buffer, uptr *sp) const { lwpid_t tid = GetThreadID(index); pid_t ppid = internal_getppid(); struct reg regs; @@ -351,14 +351,12 @@ PtraceRegistersStatus SuspendedThreadsListNetBSD::GetRegistersAndSP( } *sp = PTRACE_REG_SP(®s); - internal_memcpy(buffer, ®s, sizeof(regs)); + buffer->resize(RoundUpTo(sizeof(regs), sizeof(uptr)) / sizeof(uptr)); + internal_memcpy(buffer->data(), ®s, sizeof(regs)); return REGISTERS_AVAILABLE; } -uptr SuspendedThreadsListNetBSD::RegisterCount() const { - return sizeof(struct reg) / sizeof(uptr); -} } // namespace __sanitizer #endif diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp index c26724ceb7a7d..c8eb781dfc845 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp @@ -47,14 +47,14 @@ bool ReportFile::SupportsColors() { return SupportsColoredOutput(fd); } -static INLINE bool ReportSupportsColors() { +static inline bool ReportSupportsColors() { return report_file.SupportsColors(); } #else // SANITIZER_FUCHSIA // Fuchsia's logs always go through post-processing that handles colorization. -static INLINE bool ReportSupportsColors() { return true; } +static inline bool ReportSupportsColors() { return true; } #endif // !SANITIZER_FUCHSIA diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_syscalls_netbsd.inc b/compiler-rt/lib/sanitizer_common/sanitizer_syscalls_netbsd.inc index 02b7e11b1677f..c4a9d99fe2f01 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_syscalls_netbsd.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_syscalls_netbsd.inc @@ -42,8 +42,8 @@ // DO NOT EDIT! THIS FILE HAS BEEN GENERATED! 
// // Generated with: generate_netbsd_syscalls.awk -// Generated date: 2019-12-24 -// Generated from: syscalls.master,v 1.296 2019/09/22 22:59:39 christos Exp +// Generated date: 2020-09-10 +// Generated from: syscalls.master,v 1.306 2020/08/14 00:53:16 riastradh Exp // //===----------------------------------------------------------------------===// @@ -872,7 +872,13 @@ PRE_SYSCALL(dup2)(long long from_, long long to_) { /* Nothing to do */ } POST_SYSCALL(dup2)(long long res, long long from_, long long to_) { /* Nothing to do */ } -/* syscall 91 has been skipped */ +PRE_SYSCALL(getrandom)(void *buf_, long long buflen_, long long flags_) { + /* TODO */ +} +POST_SYSCALL(getrandom) +(long long res, void *buf_, long long buflen_, long long flags_) { + /* TODO */ +} PRE_SYSCALL(fcntl)(long long fd_, long long cmd_, void *arg_) { /* Nothing to do */ } @@ -1332,9 +1338,29 @@ PRE_SYSCALL(compat_09_ouname)(void *name_) { /* TODO */ } POST_SYSCALL(compat_09_ouname)(long long res, void *name_) { /* TODO */ } PRE_SYSCALL(sysarch)(long long op_, void *parms_) { /* TODO */ } POST_SYSCALL(sysarch)(long long res, long long op_, void *parms_) { /* TODO */ } -/* syscall 166 has been skipped */ -/* syscall 167 has been skipped */ -/* syscall 168 has been skipped */ +PRE_SYSCALL(__futex) +(void *uaddr_, long long op_, long long val_, void *timeout_, void *uaddr2_, + long long val2_, long long val3_) { + /* TODO */ +} +POST_SYSCALL(__futex) +(long long res, void *uaddr_, long long op_, long long val_, void *timeout_, + void *uaddr2_, long long val2_, long long val3_) { + /* TODO */ +} +PRE_SYSCALL(__futex_set_robust_list)(void *head_, long long len_) { /* TODO */ } +POST_SYSCALL(__futex_set_robust_list) +(long long res, void *head_, long long len_) { + /* TODO */ +} +PRE_SYSCALL(__futex_get_robust_list) +(long long lwpid_, void **headp_, void *lenp_) { + /* TODO */ +} +POST_SYSCALL(__futex_get_robust_list) +(long long res, long long lwpid_, void **headp_, void *lenp_) { + /* TODO */ +} #if !defined(_LP64) PRE_SYSCALL(compat_10_osemsys) (long long which_, long long a2_, long long a3_, long long a4_, long long a5_) { @@ -3824,6 +3850,87 @@ PRE_SYSCALL(__fhstatvfs190) } POST_SYSCALL(__fhstatvfs190) (long long res, void *fhp_, long long fh_size_, void *buf_, long long flags_) {} +PRE_SYSCALL(__acl_get_link)(void *path_, long long type_, void *aclp_) { + /* TODO */ +} +POST_SYSCALL(__acl_get_link) +(long long res, void *path_, long long type_, void *aclp_) { + /* TODO */ +} +PRE_SYSCALL(__acl_set_link)(void *path_, long long type_, void *aclp_) { + /* TODO */ +} +POST_SYSCALL(__acl_set_link) +(long long res, void *path_, long long type_, void *aclp_) { + /* TODO */ +} +PRE_SYSCALL(__acl_delete_link)(void *path_, long long type_) { /* TODO */ } +POST_SYSCALL(__acl_delete_link)(long long res, void *path_, long long type_) { + /* TODO */ +} +PRE_SYSCALL(__acl_aclcheck_link)(void *path_, long long type_, void *aclp_) { + /* TODO */ +} +POST_SYSCALL(__acl_aclcheck_link) +(long long res, void *path_, long long type_, void *aclp_) { + /* TODO */ +} +PRE_SYSCALL(__acl_get_file)(void *path_, long long type_, void *aclp_) { + /* TODO */ +} +POST_SYSCALL(__acl_get_file) +(long long res, void *path_, long long type_, void *aclp_) { + /* TODO */ +} +PRE_SYSCALL(__acl_set_file)(void *path_, long long type_, void *aclp_) { + /* TODO */ +} +POST_SYSCALL(__acl_set_file) +(long long res, void *path_, long long type_, void *aclp_) { + /* TODO */ +} +PRE_SYSCALL(__acl_get_fd)(long long filedes_, long long type_, void *aclp_) { + /* 
TODO */ +} +POST_SYSCALL(__acl_get_fd) +(long long res, long long filedes_, long long type_, void *aclp_) { + /* TODO */ +} +PRE_SYSCALL(__acl_set_fd)(long long filedes_, long long type_, void *aclp_) { + /* TODO */ +} +POST_SYSCALL(__acl_set_fd) +(long long res, long long filedes_, long long type_, void *aclp_) { + /* TODO */ +} +PRE_SYSCALL(__acl_delete_file)(void *path_, long long type_) { /* TODO */ } +POST_SYSCALL(__acl_delete_file)(long long res, void *path_, long long type_) { + /* TODO */ +} +PRE_SYSCALL(__acl_delete_fd)(long long filedes_, long long type_) { /* TODO */ } +POST_SYSCALL(__acl_delete_fd) +(long long res, long long filedes_, long long type_) { + /* TODO */ +} +PRE_SYSCALL(__acl_aclcheck_file)(void *path_, long long type_, void *aclp_) { + /* TODO */ +} +POST_SYSCALL(__acl_aclcheck_file) +(long long res, void *path_, long long type_, void *aclp_) { + /* TODO */ +} +PRE_SYSCALL(__acl_aclcheck_fd) +(long long filedes_, long long type_, void *aclp_) { + /* TODO */ +} +POST_SYSCALL(__acl_aclcheck_fd) +(long long res, long long filedes_, long long type_, void *aclp_) { + /* TODO */ +} +PRE_SYSCALL(lpathconf)(void *path_, long long name_) { /* TODO */ } +POST_SYSCALL(lpathconf)(long long res, void *path_, long long name_) { + /* TODO */ +} #undef SYS_MAXSYSARGS } // extern "C" diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_atomic_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_atomic_test.cpp index 9a3078b25d762..3136886854fa5 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_atomic_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_atomic_test.cpp @@ -12,6 +12,18 @@ #include "sanitizer_common/sanitizer_atomic.h" #include "gtest/gtest.h" +#ifndef __has_extension +#define __has_extension(x) 0 +#endif + +#if __has_extension(c_atomic) || __has_extension(cxx_atomic) +#define ATOMIC_LLONG_LOCK_FREE __CLANG_ATOMIC_LLONG_LOCK_FREE +#elif (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)) +#define ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE +#else +#error Unsupported compiler. +#endif + namespace __sanitizer { template @@ -69,11 +81,15 @@ TEST(SanitizerCommon, AtomicStoreLoad) { CheckStoreLoad(); CheckStoreLoad(); + // Avoid fallbacking to software emulated compiler atomics, that are usually + // provided by libatomic, which is not always present. +#if ATOMIC_LLONG_LOCK_FREE == 2 CheckStoreLoad(); CheckStoreLoad(); CheckStoreLoad(); CheckStoreLoad(); CheckStoreLoad(); +#endif CheckStoreLoad (); @@ -119,7 +135,9 @@ TEST(SanitizerCommon, AtomicCompareExchangeTest) { CheckAtomicCompareExchange(); CheckAtomicCompareExchange(); CheckAtomicCompareExchange(); +#if ATOMIC_LLONG_LOCK_FREE == 2 CheckAtomicCompareExchange(); +#endif CheckAtomicCompareExchange(); } #endif //!SANITIZER_ANDROID diff --git a/compiler-rt/lib/scudo/scudo_allocator.cpp b/compiler-rt/lib/scudo/scudo_allocator.cpp index 343f85a4ef88b..c6a3309cb925b 100644 --- a/compiler-rt/lib/scudo/scudo_allocator.cpp +++ b/compiler-rt/lib/scudo/scudo_allocator.cpp @@ -44,7 +44,7 @@ static u32 Cookie; // at compilation or at runtime. static atomic_uint8_t HashAlgorithm = { CRC32Software }; -INLINE u32 computeCRC32(u32 Crc, uptr Value, uptr *Array, uptr ArraySize) { +inline u32 computeCRC32(u32 Crc, uptr Value, uptr *Array, uptr ArraySize) { // If the hardware CRC32 feature is defined here, it was enabled everywhere, // as opposed to only for scudo_crc32.cpp. 
This means that other hardware // specific instructions were likely emitted at other places, and as a @@ -71,31 +71,31 @@ INLINE u32 computeCRC32(u32 Crc, uptr Value, uptr *Array, uptr ArraySize) { static BackendT &getBackend(); namespace Chunk { - static INLINE AtomicPackedHeader *getAtomicHeader(void *Ptr) { + static inline AtomicPackedHeader *getAtomicHeader(void *Ptr) { return reinterpret_cast(reinterpret_cast(Ptr) - getHeaderSize()); } - static INLINE + static inline const AtomicPackedHeader *getConstAtomicHeader(const void *Ptr) { return reinterpret_cast( reinterpret_cast(Ptr) - getHeaderSize()); } - static INLINE bool isAligned(const void *Ptr) { + static inline bool isAligned(const void *Ptr) { return IsAligned(reinterpret_cast(Ptr), MinAlignment); } // We can't use the offset member of the chunk itself, as we would double // fetch it without any warranty that it wouldn't have been tampered. To // prevent this, we work with a local copy of the header. - static INLINE void *getBackendPtr(const void *Ptr, UnpackedHeader *Header) { + static inline void *getBackendPtr(const void *Ptr, UnpackedHeader *Header) { return reinterpret_cast(reinterpret_cast(Ptr) - getHeaderSize() - (Header->Offset << MinAlignmentLog)); } // Returns the usable size for a chunk, meaning the amount of bytes from the // beginning of the user data to the end of the backend allocated chunk. - static INLINE uptr getUsableSize(const void *Ptr, UnpackedHeader *Header) { + static inline uptr getUsableSize(const void *Ptr, UnpackedHeader *Header) { const uptr ClassId = Header->ClassId; if (ClassId) return PrimaryT::ClassIdToSize(ClassId) - getHeaderSize() - @@ -105,7 +105,7 @@ namespace Chunk { } // Returns the size the user requested when allocating the chunk. - static INLINE uptr getSize(const void *Ptr, UnpackedHeader *Header) { + static inline uptr getSize(const void *Ptr, UnpackedHeader *Header) { const uptr SizeOrUnusedBytes = Header->SizeOrUnusedBytes; if (Header->ClassId) return SizeOrUnusedBytes; @@ -114,7 +114,7 @@ namespace Chunk { } // Compute the checksum of the chunk pointer and its header. - static INLINE u16 computeChecksum(const void *Ptr, UnpackedHeader *Header) { + static inline u16 computeChecksum(const void *Ptr, UnpackedHeader *Header) { UnpackedHeader ZeroChecksumHeader = *Header; ZeroChecksumHeader.Checksum = 0; uptr HeaderHolder[sizeof(UnpackedHeader) / sizeof(uptr)]; @@ -126,7 +126,7 @@ namespace Chunk { // Checks the validity of a chunk by verifying its checksum. It doesn't // incur termination in the event of an invalid chunk. - static INLINE bool isValid(const void *Ptr) { + static inline bool isValid(const void *Ptr) { PackedHeader NewPackedHeader = atomic_load_relaxed(getConstAtomicHeader(Ptr)); UnpackedHeader NewUnpackedHeader = @@ -140,7 +140,7 @@ namespace Chunk { COMPILER_CHECK(ChunkAvailable == 0); // Loads and unpacks the header, verifying the checksum in the process. - static INLINE + static inline void loadHeader(const void *Ptr, UnpackedHeader *NewUnpackedHeader) { PackedHeader NewPackedHeader = atomic_load_relaxed(getConstAtomicHeader(Ptr)); @@ -151,7 +151,7 @@ namespace Chunk { } // Packs and stores the header, computing the checksum in the process. 
- static INLINE void storeHeader(void *Ptr, UnpackedHeader *NewUnpackedHeader) { + static inline void storeHeader(void *Ptr, UnpackedHeader *NewUnpackedHeader) { NewUnpackedHeader->Checksum = computeChecksum(Ptr, NewUnpackedHeader); PackedHeader NewPackedHeader = bit_cast(*NewUnpackedHeader); atomic_store_relaxed(getAtomicHeader(Ptr), NewPackedHeader); @@ -160,7 +160,7 @@ namespace Chunk { // Packs and stores the header, computing the checksum in the process. We // compare the current header with the expected provided one to ensure that // we are not being raced by a corruption occurring in another thread. - static INLINE void compareExchangeHeader(void *Ptr, + static inline void compareExchangeHeader(void *Ptr, UnpackedHeader *NewUnpackedHeader, UnpackedHeader *OldUnpackedHeader) { NewUnpackedHeader->Checksum = computeChecksum(Ptr, NewUnpackedHeader); diff --git a/compiler-rt/lib/scudo/scudo_crc32.h b/compiler-rt/lib/scudo/scudo_crc32.h index bad15a929a3e0..ef40595a56d1f 100644 --- a/compiler-rt/lib/scudo/scudo_crc32.h +++ b/compiler-rt/lib/scudo/scudo_crc32.h @@ -85,7 +85,7 @@ static const u32 CRC32Table[] = { 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d }; -INLINE u32 computeSoftwareCRC32(u32 Crc, uptr Data) { +inline u32 computeSoftwareCRC32(u32 Crc, uptr Data) { for (uptr i = 0; i < sizeof(Data); i++) { Crc = CRC32Table[(Crc ^ Data) & 0xff] ^ (Crc >> 8); Data >>= 8; diff --git a/compiler-rt/lib/scudo/scudo_tsd.h b/compiler-rt/lib/scudo/scudo_tsd.h index 1d4e4e6f126e5..ec8dabc1f8a7d 100644 --- a/compiler-rt/lib/scudo/scudo_tsd.h +++ b/compiler-rt/lib/scudo/scudo_tsd.h @@ -29,7 +29,7 @@ struct ALIGNED(SANITIZER_CACHE_LINE_SIZE) ScudoTSD { void init(); void commitBack(); - INLINE bool tryLock() { + inline bool tryLock() { if (Mutex.TryLock()) { atomic_store_relaxed(&Precedence, 0); return true; @@ -40,14 +40,14 @@ struct ALIGNED(SANITIZER_CACHE_LINE_SIZE) ScudoTSD { return false; } - INLINE void lock() { + inline void lock() { atomic_store_relaxed(&Precedence, 0); Mutex.Lock(); } - INLINE void unlock() { Mutex.Unlock(); } + inline void unlock() { Mutex.Unlock(); } - INLINE uptr getPrecedence() { return atomic_load_relaxed(&Precedence); } + inline uptr getPrecedence() { return atomic_load_relaxed(&Precedence); } private: StaticSpinMutex Mutex; diff --git a/compiler-rt/lib/scudo/scudo_utils.cpp b/compiler-rt/lib/scudo/scudo_utils.cpp index f31d68058acbc..b7ce8f9158172 100644 --- a/compiler-rt/lib/scudo/scudo_utils.cpp +++ b/compiler-rt/lib/scudo/scudo_utils.cpp @@ -121,7 +121,7 @@ bool hasHardwareCRC32ARMPosix() { return false; } // initialized after the other globals, so we can check its value to know if // calling getauxval is safe. 
extern "C" SANITIZER_WEAK_ATTRIBUTE char *__progname; -INLINE bool areBionicGlobalsInitialized() { +inline bool areBionicGlobalsInitialized() { return !SANITIZER_ANDROID || (&__progname && __progname); } diff --git a/compiler-rt/lib/scudo/scudo_utils.h b/compiler-rt/lib/scudo/scudo_utils.h index a8dfbdeb3b708..b657c69d9baff 100644 --- a/compiler-rt/lib/scudo/scudo_utils.h +++ b/compiler-rt/lib/scudo/scudo_utils.h @@ -20,7 +20,7 @@ namespace __scudo { template -INLINE Dest bit_cast(const Source& source) { +inline Dest bit_cast(const Source& source) { static_assert(sizeof(Dest) == sizeof(Source), "Sizes are not equal!"); Dest dest; memcpy(&dest, &source, sizeof(dest)); diff --git a/compiler-rt/lib/scudo/standalone/internal_defs.h b/compiler-rt/lib/scudo/standalone/internal_defs.h index a884f1f3a40ed..0babbbe3c11b5 100644 --- a/compiler-rt/lib/scudo/standalone/internal_defs.h +++ b/compiler-rt/lib/scudo/standalone/internal_defs.h @@ -36,7 +36,6 @@ #define FORMAT(F, A) __attribute__((format(printf, F, A))) #define NOINLINE __attribute__((noinline)) #define NORETURN __attribute__((noreturn)) -#define THREADLOCAL __thread #define LIKELY(X) __builtin_expect(!!(X), 1) #define UNLIKELY(X) __builtin_expect(!!(X), 0) #if defined(__i386__) || defined(__x86_64__) diff --git a/compiler-rt/lib/scudo/standalone/linux.h b/compiler-rt/lib/scudo/standalone/linux.h index c8e41484c8515..72acb6da83a76 100644 --- a/compiler-rt/lib/scudo/standalone/linux.h +++ b/compiler-rt/lib/scudo/standalone/linux.h @@ -18,51 +18,6 @@ namespace scudo { // MapPlatformData is unused on Linux, define it as a minimally sized structure. struct MapPlatformData {}; -#if SCUDO_ANDROID - -#if defined(__aarch64__) -#define __get_tls() \ - ({ \ - void **__v; \ - __asm__("mrs %0, tpidr_el0" : "=r"(__v)); \ - __v; \ - }) -#elif defined(__arm__) -#define __get_tls() \ - ({ \ - void **__v; \ - __asm__("mrc p15, 0, %0, c13, c0, 3" : "=r"(__v)); \ - __v; \ - }) -#elif defined(__i386__) -#define __get_tls() \ - ({ \ - void **__v; \ - __asm__("movl %%gs:0, %0" : "=r"(__v)); \ - __v; \ - }) -#elif defined(__x86_64__) -#define __get_tls() \ - ({ \ - void **__v; \ - __asm__("mov %%fs:0, %0" : "=r"(__v)); \ - __v; \ - }) -#else -#error "Unsupported architecture." -#endif - -// The Android Bionic team has allocated a TLS slot for sanitizers starting -// with Q, given that Android currently doesn't support ELF TLS. It is used to -// store sanitizer thread specific data. 
-static const int TLS_SLOT_SANITIZER = 6; - -ALWAYS_INLINE uptr *getAndroidTlsPtr() { - return reinterpret_cast(&__get_tls()[TLS_SLOT_SANITIZER]); -} - -#endif // SCUDO_ANDROID - } // namespace scudo #endif // SCUDO_LINUX diff --git a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp index a7a2b3160611e..605ce44d49739 100644 --- a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp @@ -152,7 +152,7 @@ static std::condition_variable Cv; static bool Ready; template static void performAllocations(Primary *Allocator) { - static THREADLOCAL typename Primary::CacheT Cache; + static thread_local typename Primary::CacheT Cache; Cache.init(nullptr, Allocator); std::vector> V; { diff --git a/compiler-rt/lib/scudo/standalone/tests/scudo_unit_test_main.cpp b/compiler-rt/lib/scudo/standalone/tests/scudo_unit_test_main.cpp index 20deca998d964..9bbf6e75a5cd0 100644 --- a/compiler-rt/lib/scudo/standalone/tests/scudo_unit_test_main.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/scudo_unit_test_main.cpp @@ -29,11 +29,11 @@ __scudo_default_options() { "dealloc_type_mismatch=" DEALLOC_TYPE_MISMATCH; } -int main(int argc, char **argv) { +// The zxtest library provides a default main function that does the same thing +// for Fuchsia builds. #if !SCUDO_FUCHSIA +int main(int argc, char **argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -#else - return RUN_ALL_TESTS(argc, argv); -#endif } +#endif diff --git a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h index ac5a22c970701..9437167d84821 100644 --- a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h +++ b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h @@ -99,16 +99,16 @@ template struct TSDRegistryExT { atomic_u8 Disabled; TSD FallbackTSD; HybridMutex Mutex; - static THREADLOCAL ThreadState State; - static THREADLOCAL TSD ThreadTSD; + static thread_local ThreadState State; + static thread_local TSD ThreadTSD; friend void teardownThread(void *Ptr); }; template -THREADLOCAL TSD TSDRegistryExT::ThreadTSD; +thread_local TSD TSDRegistryExT::ThreadTSD; template -THREADLOCAL ThreadState TSDRegistryExT::State; +thread_local ThreadState TSDRegistryExT::State; template void teardownThread(void *Ptr) { typedef TSDRegistryExT TSDRegistryT; diff --git a/compiler-rt/lib/scudo/standalone/tsd_shared.h b/compiler-rt/lib/scudo/standalone/tsd_shared.h index 25ba191826c3f..041b834c74852 100644 --- a/compiler-rt/lib/scudo/standalone/tsd_shared.h +++ b/compiler-rt/lib/scudo/standalone/tsd_shared.h @@ -9,9 +9,17 @@ #ifndef SCUDO_TSD_SHARED_H_ #define SCUDO_TSD_SHARED_H_ -#include "linux.h" // for getAndroidTlsPtr() #include "tsd.h" +#if SCUDO_HAS_PLATFORM_TLS_SLOT +// This is a platform-provided header that needs to be on the include path when +// Scudo is compiled. It must declare a function with the prototype: +// uintptr_t *getPlatformAllocatorTlsSlot() +// that returns the address of a thread-local word of storage reserved for +// Scudo, that must be zero-initialized in newly created threads. 
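The patch itself does not show a scudo_platform_tls_slot.h, so here is a hedged, editorial sketch of the minimal shape such a header could take. A real platform would return the address of its reserved slot (compare the Android TLS_SLOT_SANITIZER code deleted above); plain thread_local is used below only to keep the sketch self-contained, and C++ zero-initializes such storage in each new thread, which satisfies the stated contract:

```cpp
// scudo_platform_tls_slot.h -- illustrative sketch, not part of this patch.
#ifndef SCUDO_PLATFORM_TLS_SLOT_H_
#define SCUDO_PLATFORM_TLS_SLOT_H_

#include <stdint.h>

inline uintptr_t *getPlatformAllocatorTlsSlot() {
  // A real platform would return the address of its reserved per-thread slot.
  static thread_local uintptr_t TlsSlot; // zero-initialized per thread
  return &TlsSlot;
}

#endif // SCUDO_PLATFORM_TLS_SLOT_H_
```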
+#include "scudo_platform_tls_slot.h" +#endif + namespace scudo { template @@ -80,26 +88,21 @@ struct TSDRegistrySharedT { } private: - ALWAYS_INLINE void setCurrentTSD(TSD *CurrentTSD) { -#if _BIONIC - *getAndroidTlsPtr() = reinterpret_cast(CurrentTSD); -#elif SCUDO_LINUX - ThreadTSD = CurrentTSD; + ALWAYS_INLINE uptr *getTlsPtr() const { +#if SCUDO_HAS_PLATFORM_TLS_SLOT + return reinterpret_cast(getPlatformAllocatorTlsSlot()); #else - CHECK_EQ( - pthread_setspecific(PThreadKey, reinterpret_cast(CurrentTSD)), - 0); + static thread_local uptr ThreadTSD; + return &ThreadTSD; #endif } + ALWAYS_INLINE void setCurrentTSD(TSD *CurrentTSD) { + *getTlsPtr() = reinterpret_cast(CurrentTSD); + } + ALWAYS_INLINE TSD *getCurrentTSD() { -#if _BIONIC - return reinterpret_cast *>(*getAndroidTlsPtr()); -#elif SCUDO_LINUX - return ThreadTSD; -#else - return reinterpret_cast *>(pthread_getspecific(PThreadKey)); -#endif + return reinterpret_cast *>(*getTlsPtr()); } bool setNumberOfTSDs(u32 N) { @@ -195,17 +198,8 @@ struct TSDRegistrySharedT { HybridMutex Mutex; HybridMutex MutexTSDs; TSD TSDs[TSDsArraySize]; -#if SCUDO_LINUX && !_BIONIC - static THREADLOCAL TSD *ThreadTSD; -#endif }; -#if SCUDO_LINUX && !_BIONIC -template -THREADLOCAL TSD - *TSDRegistrySharedT::ThreadTSD; -#endif - } // namespace scudo #endif // SCUDO_TSD_SHARED_H_ diff --git a/compiler-rt/lib/tsan/rtl/tsan_flags.cpp b/compiler-rt/lib/tsan/rtl/tsan_flags.cpp index 44bf325cd35bb..49e4a9c21da9c 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_flags.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_flags.cpp @@ -87,7 +87,7 @@ void InitializeFlags(Flags *f, const char *env, const char *env_option_name) { // Let a frontend override. parser.ParseString(__tsan_default_options()); #if TSAN_CONTAINS_UBSAN - const char *ubsan_default_options = __ubsan::MaybeCallUbsanDefaultOptions(); + const char *ubsan_default_options = __ubsan_default_options(); ubsan_parser.ParseString(ubsan_default_options); #endif // Override from command line. 
diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors.h b/compiler-rt/lib/tsan/rtl/tsan_interceptors.h index 88d1edd775d37..29576ea2d49ad 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interceptors.h +++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors.h @@ -22,7 +22,7 @@ class ScopedInterceptor { LibIgnore *libignore(); #if !SANITIZER_GO -INLINE bool in_symbolizer() { +inline bool in_symbolizer() { cur_thread_init(); return UNLIKELY(cur_thread()->in_symbolizer); } diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp b/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp index 645152a06c399..710e7ec97b703 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp @@ -384,12 +384,16 @@ static uptr UnmangleLongJmpSp(uptr mangled_sp) { #endif } -#ifdef __powerpc__ +#if SANITIZER_NETBSD +# ifdef __x86_64__ +# define LONG_JMP_SP_ENV_SLOT 6 +# else +# error unsupported +# endif +#elif defined(__powerpc__) # define LONG_JMP_SP_ENV_SLOT 0 #elif SANITIZER_FREEBSD # define LONG_JMP_SP_ENV_SLOT 2 -#elif SANITIZER_NETBSD -# define LONG_JMP_SP_ENV_SLOT 6 #elif SANITIZER_LINUX # ifdef __aarch64__ # define LONG_JMP_SP_ENV_SLOT 13 diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.h b/compiler-rt/lib/tsan/rtl/tsan_rtl.h index d3bb61ff87d3f..efdc53a1e9252 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_rtl.h +++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.h @@ -458,22 +458,22 @@ struct ThreadState { ThreadState *cur_thread(); void set_cur_thread(ThreadState *thr); void cur_thread_finalize(); -INLINE void cur_thread_init() { } +inline void cur_thread_init() { } #else __attribute__((tls_model("initial-exec"))) extern THREADLOCAL char cur_thread_placeholder[]; -INLINE ThreadState *cur_thread() { +inline ThreadState *cur_thread() { return reinterpret_cast(cur_thread_placeholder)->current; } -INLINE void cur_thread_init() { +inline void cur_thread_init() { ThreadState *thr = reinterpret_cast(cur_thread_placeholder); if (UNLIKELY(!thr->current)) thr->current = thr; } -INLINE void set_cur_thread(ThreadState *thr) { +inline void set_cur_thread(ThreadState *thr) { reinterpret_cast(cur_thread_placeholder)->current = thr; } -INLINE void cur_thread_finalize() { } +inline void cur_thread_finalize() { } #endif // SANITIZER_MAC || SANITIZER_ANDROID #endif // SANITIZER_GO diff --git a/compiler-rt/lib/ubsan/ubsan_flags.cpp b/compiler-rt/lib/ubsan/ubsan_flags.cpp index 721c2273f133a..25cefd46ce27c 100644 --- a/compiler-rt/lib/ubsan/ubsan_flags.cpp +++ b/compiler-rt/lib/ubsan/ubsan_flags.cpp @@ -21,10 +21,6 @@ namespace __ubsan { -const char *MaybeCallUbsanDefaultOptions() { - return (&__ubsan_default_options) ? __ubsan_default_options() : ""; -} - static const char *GetFlag(const char *flag) { // We cannot call getenv() from inside a preinit array initializer if (SANITIZER_CAN_USE_PREINIT_ARRAY) { @@ -66,7 +62,7 @@ void InitializeFlags() { RegisterUbsanFlags(&parser, f); // Override from user-specified string. - parser.ParseString(MaybeCallUbsanDefaultOptions()); + parser.ParseString(__ubsan_default_options()); // Override from environment variable. 
parser.ParseStringFromEnv("UBSAN_OPTIONS"); InitializeCommonFlags(); diff --git a/compiler-rt/lib/ubsan/ubsan_flags.h b/compiler-rt/lib/ubsan/ubsan_flags.h index daa0d7c701e04..c47009bafe539 100644 --- a/compiler-rt/lib/ubsan/ubsan_flags.h +++ b/compiler-rt/lib/ubsan/ubsan_flags.h @@ -34,8 +34,6 @@ inline Flags *flags() { return &ubsan_flags; } void InitializeFlags(); void RegisterUbsanFlags(FlagParser *parser, Flags *f); -const char *MaybeCallUbsanDefaultOptions(); - } // namespace __ubsan extern "C" { diff --git a/compiler-rt/test/asan/TestCases/Linux/asan_prelink_test.cpp b/compiler-rt/test/asan/TestCases/Linux/asan_prelink_test.cpp index e00c215e92b11..9c70b61291b36 100644 --- a/compiler-rt/test/asan/TestCases/Linux/asan_prelink_test.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/asan_prelink_test.cpp @@ -1,11 +1,12 @@ // Test if asan works with prelink. -// It does not actually use prelink, but relies on ld's flag -Ttext-segment -// or gold's flag -Ttext (we try the first flag first, if that fails we +// It does not actually use prelink, but relies on GNU ld's -Ttext-segment, +// LLD's --image-base, or gold's -Ttext (we try the first flag first, if that fails we // try the second flag). // // RUN: %clangxx_asan -c %s -o %t.o // RUN: %clangxx_asan -DBUILD_SO=1 -fPIC -shared %s -o %t.so -Wl,-Ttext-segment=0x3600000000 ||\ -// RUN: %clangxx_asan -DBUILD_SO=1 -fPIC -shared %s -o %t.so -Wl,-Ttext=0x3600000000 +// RUN: %clangxx_asan -DBUILD_SO=1 -fPIC -shared %s -o %t.so -Wl,--image-base=0x3600000000 ||\ +// RUN: %clangxx_asan -DBUILD_SO=1 -fPIC -shared %s -o %t.so -Wl,-Ttext=0x3600000000 // RUN: %clangxx_asan %t.o %t.so -Wl,-R. -o %t // RUN: %env_asan_opts=verbosity=1 %run %t 2>&1 | FileCheck %s diff --git a/compiler-rt/test/asan/TestCases/asan_update_allocation.cpp b/compiler-rt/test/asan/TestCases/asan_update_allocation.cpp index d703fe024aa05..065f793092f05 100644 --- a/compiler-rt/test/asan/TestCases/asan_update_allocation.cpp +++ b/compiler-rt/test/asan/TestCases/asan_update_allocation.cpp @@ -1,19 +1,32 @@ -// RUN: %clangxx_asan -O0 -DSIZE=10 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-%os --check-prefix=CHECK -// RUN: %clangxx_asan -O0 -DSIZE=10000000 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-%os --check-prefix=CHECK +// RUN: %clangxx_asan -O0 %s -o %t + +// RUN: not %run %t 10 0 2>&1 | FileCheck %s --check-prefix=CHECK-%os --check-prefixes=CHECK,T0 +// RUN: not %run %t 10000000 0 2>&1 | FileCheck %s --check-prefixes=CHECK,T0 + +// RUN: not %run %t 10 1 2>&1 | FileCheck %s --check-prefix=CHECK-%os --check-prefixes=CHECK,T1 +// RUN: not %run %t 10000000 1 2>&1 | FileCheck %s --check-prefixes=CHECK,T1 + // REQUIRES: stable-runtime -#include #include +#include +#include void UPDATE(void *p) { __asan_update_allocation_context(p); } -int main() { - char *x = (char*)malloc(SIZE * sizeof(char)); - UPDATE(x); +int main(int argc, char *argv[]) { + char *x = (char *)malloc(atoi(argv[1]) * sizeof(char)); + if (atoi(argv[2])) + std::thread([&]() { UPDATE(x); }).join(); + else + UPDATE(x); free(x); return x[5]; // CHECK: {{.*ERROR: AddressSanitizer: heap-use-after-free on address}} + // CHECK: READ of size 1 at {{.*}} thread T0 + // T0: allocated by thread T0 here + // T1: allocated by thread T1 here // CHECK: UPDATE } diff --git a/compiler-rt/test/asan/TestCases/leaks.cpp b/compiler-rt/test/asan/TestCases/leaks.cpp new file mode 100644 index 0000000000000..9c076dd894ebf --- /dev/null +++ b/compiler-rt/test/asan/TestCases/leaks.cpp @@ -0,0 +1,29 @@ 
+// Test LeakSanitizer+AddressSanitizer reporting of leaks of different sizes. +// REQUIRES: leak-detection +// +// RUN: %clangxx_asan -O0 %s -o %t +// RUN: not %run %t 0 2>&1 | FileCheck %s +// RUN: not %run %t 1 2>&1 | FileCheck %s +// RUN: not %run %t 1000 2>&1 | FileCheck %s +// RUN: not %run %t 1000000 2>&1 | FileCheck %s +// RUN: not %run %t 10000000 2>&1 | FileCheck %s + +#include +#include +#include +int *t; + +__attribute__((noopt)) void leak(int n) { + // Repeat a few times to make sure that at least one pointer is + // not somewhere on the stack. + for (int i = 0; i < 10; ++i) { + t = new int[n]; + printf("t: %p\n", t); + t = 0; + } +} + +int main(int argc, char **argv) { + leak(atoi(argv[1])); +} +// CHECK: LeakSanitizer: detected memory leaks diff --git a/compiler-rt/test/asan/TestCases/lsan_annotations.cpp b/compiler-rt/test/asan/TestCases/lsan_annotations.cpp index f52b0ff66a8df..158c2fdf9f481 100644 --- a/compiler-rt/test/asan/TestCases/lsan_annotations.cpp +++ b/compiler-rt/test/asan/TestCases/lsan_annotations.cpp @@ -5,12 +5,17 @@ #include #include +int *x, *y; + int main() { - int *x = new int; + x = new int; __lsan_ignore_object(x); + { __lsan::ScopedDisabler disabler; - double *y = new double; + y = new int; } + + x = y = nullptr; return 0; } diff --git a/compiler-rt/test/asan/TestCases/lsan_crash.cpp b/compiler-rt/test/asan/TestCases/lsan_crash.cpp new file mode 100644 index 0000000000000..23c2569a0b73c --- /dev/null +++ b/compiler-rt/test/asan/TestCases/lsan_crash.cpp @@ -0,0 +1,31 @@ +// RUN: %clangxx_asan -O2 %s -o %t && %run %t + +#include <atomic> +#include <memory> +#include <sanitizer/lsan_interface.h> +#include <thread> +#include <vector> + +std::atomic<bool> done; + +void foo() { + std::unique_ptr<char[]> mem; + + while (!done) + mem.reset(new char[1000000]); +} + +int main() { + std::vector<std::thread> threads; + for (int i = 0; i < 10; ++i) + threads.emplace_back(foo); + + for (int i = 0; i < 100; ++i) + __lsan_do_recoverable_leak_check(); + + done = true; + for (auto &t : threads) + t.join(); + + return 0; +} diff --git a/compiler-rt/test/builtins/Unit/divmodti4_test.c b/compiler-rt/test/builtins/Unit/divmodti4_test.c new file mode 100644 index 0000000000000..a9f70dcf1c1eb --- /dev/null +++ b/compiler-rt/test/builtins/Unit/divmodti4_test.c @@ -0,0 +1,91 @@ +// RUN: %clang_builtins %s %librt -o %t && %run %t +// REQUIRES: librt_has_divmodti4 +// REQUIRES: int128 +//===-- divmodti4_test.c - Test __divmodti4 -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file tests __divmodti4 for the compiler_rt library.
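For context on the new builtins test: __divmodti4(a, b, &rem) returns the 128-bit quotient and stores the remainder, following C's truncate-toward-zero division, so the remainder takes the sign of the dividend. Below is a hedged illustration of those semantics using native __int128 arithmetic in place of the builtin (a GCC/Clang extension on 64-bit targets); the values mirror entries from the test table that follows:

```cpp
#include <cassert>

using ti_int = __int128;

// What __divmodti4 is specified to compute, written with native operators.
static ti_int divmod(ti_int a, ti_int b, ti_int *rem) {
  *rem = a % b; // remainder carries the dividend's sign
  return a / b; // quotient truncates toward zero
}

int main() {
  ti_int r;
  assert(divmod(-5, 3, &r) == -1 && r == -2);
  assert(divmod(5, -3, &r) == -1 && r == 2);
  return 0;
}
```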
+// +//===----------------------------------------------------------------------===// + +#include "int_lib.h" +#include + +#ifdef CRT_HAS_128BIT + +// Effects: if rem != 0, *rem = a % b +// Returns: a / b + +COMPILER_RT_ABI ti_int __divmodti4(ti_int a, ti_int b, ti_int* rem); + +int test__divmodti4(ti_int a, ti_int b, ti_int expected_q, ti_int expected_r) { + ti_int r; + ti_int q = __divmodti4(a, b, &r); + if (q != expected_q || r != expected_r) + { + utwords at; + at.all = a; + utwords bt; + bt.all = b; + utwords expected_qt; + expected_qt.all = expected_q; + utwords expected_rt; + expected_rt.all = expected_r; + utwords qt; + qt.all = q; + utwords rt; + rt.all = r; + printf("error in __divmodti4: 0x%.16llX%.16llX / 0x%.16llX%.16llX = " + "0x%.16llX%.16llX, R = 0x%.16llX%.16llX, expected 0x%.16llX%.16llX, " + "0x%.16llX%.16llX\n", + at.s.high, at.s.low, bt.s.high, bt.s.low, qt.s.high, qt.s.low, + rt.s.high, rt.s.low, expected_qt.s.high, expected_qt.s.low, + expected_rt.s.high, expected_rt.s.low); + } + return !(q == expected_q && r == expected_r); +} + +char assumption_1[sizeof(ti_int) == 2*sizeof(di_int)] = {0}; + +tu_int tests[][4] = +{ +{ (ti_int) 0, (ti_int) 1, (ti_int) 0, (ti_int) 0 }, +{ (ti_int) 0, (ti_int)-1, (ti_int) 0, (ti_int) 0 }, +{ (ti_int) 2, (ti_int) 1, (ti_int) 2, (ti_int) 0 }, +{ (ti_int) 2, (ti_int)-1, (ti_int)-2, (ti_int) 0 }, +{ (ti_int)-2, (ti_int) 1, (ti_int)-2, (ti_int) 0 }, +{ (ti_int)-2, (ti_int)-1, (ti_int) 2, (ti_int) 0 }, +{ (ti_int) 5, (ti_int) 3, (ti_int) 1, (ti_int) 2 }, +{ (ti_int) 5, (ti_int)-3, (ti_int)-1, (ti_int) 2 }, +{ (ti_int)-5, (ti_int) 3, (ti_int)-1, (ti_int)-2 }, +{ (ti_int)-5, (ti_int)-3, (ti_int) 1, (ti_int)-2 }, +{ (ti_int)0x8000000000000000LL << 64 | 0, (ti_int) 1, (ti_int)0x8000000000000000LL << 64 | 0, (ti_int)0x0LL }, +{ (ti_int)0x8000000000000000LL << 64 | 0, (ti_int)-1, (ti_int)0x8000000000000000LL << 64 | 0, (ti_int)0x0LL }, +{ (ti_int)0x8000000000000000LL << 64 | 0, (ti_int)-2, (ti_int)0x4000000000000000LL << 64 | 0, (ti_int)0x0LL }, +{ (ti_int)0x8000000000000000LL << 64 | 0, (ti_int) 2, (ti_int)0xC000000000000000LL << 64 | 0, (ti_int)0x0LL }, +{ (ti_int)0x8000000000000000LL << 64 | 0, (ti_int)-3, (ti_int)0x2AAAAAAAAAAAAAAALL << 64 | 0xAAAAAAAAAAAAAAAALL, (ti_int)-2 }, +{ (ti_int)0x8000000000000000LL << 64 | 0, (ti_int) 3, (ti_int)0xD555555555555555LL << 64 | 0x5555555555555556LL, (ti_int)-2 }, +}; + +#endif + +int main() +{ +#ifdef CRT_HAS_128BIT + const unsigned N = sizeof(tests) / sizeof(tests[0]); + unsigned i; + for (i = 0; i < N; ++i) + if (test__divmodti4(tests[i][0], tests[i][1], tests[i][2], tests[i][3])) + return 1; + + +#else + printf("skipped\n"); +#endif + return 0; +} diff --git a/compiler-rt/test/dfsan/custom.cpp b/compiler-rt/test/dfsan/custom.cpp index 7802f88f2c248..6d5e06a7799d7 100644 --- a/compiler-rt/test/dfsan/custom.cpp +++ b/compiler-rt/test/dfsan/custom.cpp @@ -17,12 +17,13 @@ #include #include #include -#include #include +#include #include #include -#include +#include #include +#include #include #include #include @@ -86,6 +87,24 @@ void test_memcmp() { #endif } +void test_bcmp() { + char str1[] = "str1", str2[] = "str2"; + dfsan_set_label(i_label, &str1[3], 1); + dfsan_set_label(j_label, &str2[3], 1); + + int rv = bcmp(str1, str2, sizeof(str1)); + assert(rv != 0); +#ifdef STRICT_DATA_DEPENDENCIES + ASSERT_ZERO_LABEL(rv); +#else + ASSERT_LABEL(rv, i_j_label); +#endif + + rv = bcmp(str1, str2, sizeof(str1) - 2); + assert(rv == 0); + ASSERT_ZERO_LABEL(rv); +} + void test_memcpy() { char str1[] = "str1"; char 
str2[sizeof(str1)]; @@ -967,6 +986,7 @@ int main(void) { assert(i_j_label != j_label); assert(i_j_label != k_label); + test_bcmp(); test_calloc(); test_clock_gettime(); test_ctime_r(); diff --git a/compiler-rt/test/dfsan/event_callbacks.c b/compiler-rt/test/dfsan/event_callbacks.c index c0f4fff372822..b154c9679d45f 100644 --- a/compiler-rt/test/dfsan/event_callbacks.c +++ b/compiler-rt/test/dfsan/event_callbacks.c @@ -114,14 +114,16 @@ int main(int Argc, char *Argv[]) { LabelArgv = dfsan_create_label("Argv", 0); dfsan_set_label(LabelArgv, Argv[1], LenArgv); - char SinkBuf[64]; - assert(LenArgv < sizeof(SinkBuf) - 1); + char Buf[64]; + assert(LenArgv < sizeof(Buf) - 1); // CHECK: Label 4 copied to memory - memcpy(SinkBuf, Argv[1], LenArgv); + void *volatile SinkPtr = Buf; + memcpy(SinkPtr, Argv[1], LenArgv); // CHECK: Label 4 copied to memory - memmove(&SinkBuf[1], SinkBuf, LenArgv); + SinkPtr = &Buf[1]; + memmove(SinkPtr, Buf, LenArgv); return 0; } diff --git a/compiler-rt/test/fuzzer/CustomMutatorWithLongSequencesTest.cpp b/compiler-rt/test/fuzzer/CustomMutatorWithLongSequencesTest.cpp new file mode 100644 index 0000000000000..4c9714788f569 --- /dev/null +++ b/compiler-rt/test/fuzzer/CustomMutatorWithLongSequencesTest.cpp @@ -0,0 +1,40 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Simple test for a custom mutator that results in long sequences of mutations. +#include <assert.h> +#include <cstddef> +#include <cstdint> +#include <cstdlib> +#include <iostream> +#include <ostream> + +#include "FuzzerInterface.h" + +static volatile int Sink; + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { + assert(Data); + if (Size > 0 && Data[0] == 'H') { + Sink = 1; + if (Size > 1 && Data[1] == 'i') { + Sink = 2; + if (Size > 2 && Data[2] == '!') { + std::cout << "BINGO; Found the target, exiting\n" + << std::flush; + exit(1); + } + } + } + return 0; +} + +extern "C" size_t LLVMFuzzerCustomMutator(uint8_t *Data, size_t Size, + size_t MaxSize, unsigned int Seed) { + // Run this 25 times to generate a large mutation sequence.
+ for (size_t i = 0; i < 25; i++) { + LLVMFuzzerMutate(Data, Size, MaxSize); + } + return LLVMFuzzerMutate(Data, Size, MaxSize); +} diff --git a/compiler-rt/test/fuzzer/cross_over_uniform_dist.test b/compiler-rt/test/fuzzer/cross_over_uniform_dist.test index 0dff5fd628f37..b5ae7e4659230 100644 --- a/compiler-rt/test/fuzzer/cross_over_uniform_dist.test +++ b/compiler-rt/test/fuzzer/cross_over_uniform_dist.test @@ -6,11 +6,11 @@ RUN: mkdir %t-corpus RUN: echo -n "@SELECT" > %t-corpus/A RUN: echo -n "@FROM WHERE" > %t-corpus/B -RUN: not %run %t-CrossOverUniformDistTest -keep_seed=1 -cross_over_uniform_dist=1 -seed=1 -runs=2000000 %t-corpus 2>&1 | FileCheck %s +RUN: not %run %t-CrossOverUniformDistTest -keep_seed=1 -cross_over_uniform_dist=1 -seed=1 -runs=5000000 %t-corpus 2>&1 | FileCheck %s CHECK: BINGO RUN: rm -rf %t-corpus RUN: mkdir %t-corpus RUN: echo -n "@SELECT" > %t-corpus/A RUN: echo -n "@FROM WHERE" > %t-corpus/B -RUN: %run %t-CrossOverUniformDistTest -keep_seed=1 -seed=1 -runs=2000000 %t-corpus 2>&1 +RUN: %run %t-CrossOverUniformDistTest -keep_seed=1 -seed=1 -runs=5000000 %t-corpus 2>&1 diff --git a/compiler-rt/test/fuzzer/fuzzer-custommutator.test b/compiler-rt/test/fuzzer/fuzzer-custommutator.test index 25f5fe697b43f..7d94ae064bf96 100644 --- a/compiler-rt/test/fuzzer/fuzzer-custommutator.test +++ b/compiler-rt/test/fuzzer/fuzzer-custommutator.test @@ -11,3 +11,17 @@ LLVMFuzzerCustomMutatorWithLenControl: INFO: found LLVMFuzzerCustomMutator LLVMFuzzerCustomMutatorWithLenControl: In LLVMFuzzerCustomMutator LLVMFuzzerCustomMutatorWithLenControl: {{.*}} lim: {{[1-9][0-9]?}} {{.*}} LLVMFuzzerCustomMutatorWithLenControl: BINGO + +# sanity check: verify that we do get long lines with verbose printing on +RUN: %cpp_compiler %S/CustomMutatorWithLongSequencesTest.cpp -o %t-CustomMutatorWithLongSequencesTest +RUN: not %run %t-CustomMutatorWithLongSequencesTest -verbosity=2 2>&1 | FileCheck %s --check-prefix=LLVMFuzzerCustomMutatorLongSequence +LLVMFuzzerCustomMutatorLongSequence: Flag: verbosity 2 +LLVMFuzzerCustomMutatorLongSequence: {{.*}} MS: {{[0-9]*}} {{(([a-zA-Z]*-){11,})}} {{.*}} +LLVMFuzzerCustomMutatorLongSequence: BINGO + +# check a target that prints long mutation sequences and verifies the printed +# output is capped at 10 entries +RUN: not %run %t-CustomMutatorWithLongSequencesTest 2>&1 | FileCheck %s --check-prefix=LLVMFuzzerCustomMutatorLongSequenceTrimmed +LLVMFuzzerCustomMutatorLongSequenceTrimmed-NOT: Flag: verbosity 2 +LLVMFuzzerCustomMutatorLongSequenceTrimmed-NOT: {{.*}} MS: {{[0-9]*}} {{(([a-zA-Z]*-){11,})}} {{.*}} +LLVMFuzzerCustomMutatorLongSequenceTrimmed: BINGO diff --git a/compiler-rt/test/fuzzer/fuzzer-leak.test b/compiler-rt/test/fuzzer/fuzzer-leak.test index 2b61811d5d1b7..dd22fdec8677e 100644 --- a/compiler-rt/test/fuzzer/fuzzer-leak.test +++ b/compiler-rt/test/fuzzer/fuzzer-leak.test @@ -7,7 +7,7 @@ RUN: %cpp_compiler %S/LeakTimeoutTest.cpp -o %t-LeakTimeoutTest RUN: rm -rf %t-corpus && mkdir -p %t-corpus RUN: not %run %t-LeakTest -runs=100000 -detect_leaks=1 %t-corpus 2>&1 | FileCheck %s --check-prefix=LEAK_DURING LEAK_DURING: ERROR: LeakSanitizer: detected memory leaks -LEAK_DURING: Direct leak of 4 byte(s) in 1 object(s) allocated from: +LEAK_DURING: Direct leak of {{.*}} byte(s) in {{.*}} object(s) allocated from: LEAK_DURING: INFO: to ignore leaks on libFuzzer side use -detect_leaks=0 LEAK_DURING: Test unit written to ./leak- LEAK_DURING-NOT: DONE diff --git a/compiler-rt/test/fuzzer/keep-seed.test b/compiler-rt/test/fuzzer/keep-seed.test index 
29212ac7c177c..a21cf46e8fe55 100644 --- a/compiler-rt/test/fuzzer/keep-seed.test +++ b/compiler-rt/test/fuzzer/keep-seed.test @@ -5,7 +5,7 @@ RUN: rm -rf %t-corpus RUN: mkdir %t-corpus RUN: echo -n SELECTxFROMxWHERE > %t-corpus/valid-fragments -RUN: not %run %t-KeepSeedTest -keep_seed=1 -seed=1 -runs=2000000 %t-corpus 2>&1 | FileCheck %s +RUN: not %run %t-KeepSeedTest -keep_seed=1 -seed=1 -runs=3000000 %t-corpus 2>&1 | FileCheck %s CHECK: BINGO RUN: rm -rf %t-corpus-baseline @@ -13,5 +13,5 @@ RUN: mkdir %t-corpus-baseline RUN: echo -n SELECTxFROMxWHERE > %t-corpus-baseline/valid-fragments # The following checks whether without -keep_seed=1 libFuzzer does not find the -# crashing input "SELECT FROM WHERE" even with 2x more runs. +# crashing input "SELECT FROM WHERE" even with more runs. RUN: %run %t-KeepSeedTest -seed=1 -runs=4000000 %t-corpus-baseline -print_final_stats=1 diff --git a/compiler-rt/test/fuzzer/mutation-graph.test b/compiler-rt/test/fuzzer/mutation-graph.test new file mode 100644 index 0000000000000..7774a500395e0 --- /dev/null +++ b/compiler-rt/test/fuzzer/mutation-graph.test @@ -0,0 +1,17 @@ +REQUIRES: linux, x86_64 +RUN: %cpp_compiler %S/SimpleTest.cpp -o %t-SimpleTest + +RUN: rm -rf %t-SimpleTestGraph + +RUN: not %run %t-SimpleTest -seed=1 -max_len=3 -mutation_graph_file=%t-SimpleTestGraph 2>&1 | FileCheck %s +CHECK: BINGO + +RUN: cat %t-SimpleTestGraph | FileCheck %s --check-prefix=GRAPH + +# A vertex and edge that correspond to the discovery of "H" +GRAPH: "7cf184f4c67ad58283ecb19349720b0cae756829" +GRAPH: {{.*}} -> "7cf184f4c67ad58283ecb19349720b0cae756829" [label="{{.*}}"]; + +# A vertex and edge that correspond to the discovery of "Hi" +GRAPH: "94dd9e08c129c785f7f256e82fbe0a30e6d1ae40" +GRAPH: {{.*}} -> "94dd9e08c129c785f7f256e82fbe0a30e6d1ae40" [label="{{.*}}"]; diff --git a/compiler-rt/test/lsan/TestCases/use_registers.cpp b/compiler-rt/test/lsan/TestCases/use_registers.cpp index 63ab282d4340c..2a7d97e0fb45e 100644 --- a/compiler-rt/test/lsan/TestCases/use_registers.cpp +++ b/compiler-rt/test/lsan/TestCases/use_registers.cpp @@ -16,6 +16,9 @@ extern "C" void *registers_thread_func(void *arg) { int *sync = reinterpret_cast(arg); void *p = malloc(1337); + print_address("Test alloc: ", 1, p); + fflush(stderr); + // To store the pointer, choose a register which is unlikely to be reused by // a function call. #if defined(__i386__) @@ -50,8 +53,6 @@ void *registers_thread_func(void *arg) { #else #error "Test is not supported on this architecture." #endif - print_address("Test alloc: ", 1, p); - fflush(stderr); __sync_fetch_and_xor(sync, 1); while (true) sched_yield(); diff --git a/compiler-rt/test/lsan/TestCases/use_registers_extra.cpp b/compiler-rt/test/lsan/TestCases/use_registers_extra.cpp new file mode 100644 index 0000000000000..fef5c36a9edef --- /dev/null +++ b/compiler-rt/test/lsan/TestCases/use_registers_extra.cpp @@ -0,0 +1,61 @@ +// Test that registers of running threads are included in the root set. +// RUN: LSAN_BASE="report_objects=1:use_stacks=0" +// RUN: %clangxx_lsan -pthread %s -o %t +// RUN: %env_lsan_opts=$LSAN_BASE:"use_registers=0" not %run %t 2>&1 | FileCheck %s +// RUN: %env_lsan_opts=$LSAN_BASE:"use_registers=1" %run %t 2>&1 +// RUN: %env_lsan_opts="" %run %t 2>&1 + +// FIXME: Support more platforms. 
+// REQUIRES: x86-target-arch + +#include "sanitizer_common/print_address.h" +#include +#include +#include +#include +#include + +extern "C" void *registers_thread_func(void *arg) { + int *sync = reinterpret_cast(arg); + void *p = malloc(1337); + print_address("Test alloc: ", 1, p); + fflush(stderr); + + // To store the pointer, choose a register which is unlikely to be reused by + // a function call. +#if defined(__i386__) + asm(R"( + movd %0, %%xmm0 + mov $0, %0 + )" + : + : "r"(p)); +#elif defined(__x86_64__) + asm(R"( + movq %0, %%xmm0 + mov $0, %0 + )" + : + : "r"(p)); +#else +#error "Test is not supported on this architecture." +#endif + + __sync_fetch_and_xor(sync, 1); + while (true) + sched_yield(); +} + +int main() { + int sync = 0; + pthread_t thread_id; + int res = pthread_create(&thread_id, 0, registers_thread_func, &sync); + assert(res == 0); + while (!__sync_fetch_and_xor(&sync, 0)) + sched_yield(); + return 0; +} +// CHECK: Test alloc: [[ADDR:0x[0-9,a-f]+]] +// CHECK: LeakSanitizer: detected memory leaks +// CHECK: [[ADDR]] (1337 bytes) +// CHECK: SUMMARY: {{(Leak|Address)}}Sanitizer: diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov index d1104b7f5bbf2..4debf8fc1b680 100644 --- a/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov +++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov @@ -3,7 +3,7 @@ // CHECK-NEXT: -: 0:Data:instrprof-gcov-multiple-bbs-single-line.gcda // CHECK-NEXT: -: 0:Runs:1 // CHECK-NEXT: -: 0:Programs:1 -// CHECK-NEXT:function main called 1 returned 100% blocks executed 80% +// CHECK-NEXT:function main called 1 returned 100% blocks executed 77% // CHECK-NEXT: 1: 1:int main(void) // CHECK-NEXT: -: 2:{ // CHECK-NEXT: -: 3: int var; diff --git a/compiler-rt/test/profile/Posix/gcov-fork.c b/compiler-rt/test/profile/Posix/gcov-fork.c index b89eb64922f0c..e66690a961e2e 100644 --- a/compiler-rt/test/profile/Posix/gcov-fork.c +++ b/compiler-rt/test/profile/Posix/gcov-fork.c @@ -17,7 +17,7 @@ int main(void) { // CHECK-NEXT: 1: [[#@LINE]]: int status; // CHECK-NEXT: -: [[#@LINE]]: func1(); // CHECK-NEXT: 1: [[#@LINE]]: pid_t pid = fork(); // CHECK-NEXT: 1: [[#@LINE]]: - if (pid == -1) return 1; // CHECK-NEXT: 2: [[#@LINE]]: + if (pid == -1) return 1; // CHECK-NEXT: 1: [[#@LINE]]: if (pid) // CHECK-NEXT: 2: [[#@LINE]]: wait(&status); // CHECK-NEXT: 1: [[#@LINE]]: func2(); // CHECK-NEXT: 2: [[#@LINE]]: diff --git a/compiler-rt/test/profile/Posix/instrprof-gcov-parallel.test b/compiler-rt/test/profile/Posix/instrprof-gcov-parallel.test index 52b51e6269f53..0c7198e3c4e9e 100644 --- a/compiler-rt/test/profile/Posix/instrprof-gcov-parallel.test +++ b/compiler-rt/test/profile/Posix/instrprof-gcov-parallel.test @@ -10,9 +10,6 @@ RUN: %run %t.driver %t.target RUN: llvm-cov gcov instrprof-gcov-parallel.target.gcda RUN: FileCheck --input-file instrprof-gcov-parallel.target.c.gcov %s -# Bug 42535 -# XFAIL: sparc-target-arch - # Test if the .gcda file is correctly created from one of child processes # and counters of all processes are recorded correctly. 
# 707 = CHILDREN * COUNT diff --git a/compiler-rt/test/profile/gcov-basic.c b/compiler-rt/test/profile/gcov-basic.c index e00cebf4b781c..0d8be6d7de087 100644 --- a/compiler-rt/test/profile/gcov-basic.c +++ b/compiler-rt/test/profile/gcov-basic.c @@ -27,6 +27,8 @@ // CHECK: Runs:2 +#include + int main(int argc, char *argv[]) { // CHECK: 2: [[@LINE]]:int main if (argc > 1) // CHECK-NEXT: 2: [[@LINE]]: puts("hello"); // CHECK-NEXT: 1: [[@LINE]]: diff --git a/compiler-rt/test/profile/gcov-dump-and-remove.c b/compiler-rt/test/profile/gcov-dump-and-remove.c index b7f80535aada3..c35640f93b3de 100644 --- a/compiler-rt/test/profile/gcov-dump-and-remove.c +++ b/compiler-rt/test/profile/gcov-dump-and-remove.c @@ -11,10 +11,10 @@ extern void __gcov_dump(void); extern void __gcov_reset(void); extern int remove(const char *); // CHECK: -: [[#@LINE]]:extern int remove -int main(void) { // CHECK-NEXT: #####: [[#@LINE]]: - __gcov_dump(); // CHECK-NEXT: #####: [[#@LINE]]: - __gcov_reset(); // CHECK-NEXT: #####: [[#@LINE]]: - if (remove("gcov-dump-and-remove.gcda") != 0) // CHECK-NEXT: #####: [[#@LINE]]: +int main(void) { // CHECK-NEXT: 1: [[#@LINE]]: + __gcov_dump(); // CHECK-NEXT: 1: [[#@LINE]]: + __gcov_reset(); // CHECK-NEXT: 1: [[#@LINE]]: + if (remove("gcov-dump-and-remove.gcda") != 0) // CHECK-NEXT: 1: [[#@LINE]]: return 1; // CHECK-NEXT: #####: [[#@LINE]]: return 1; // CHECK-NEXT: -: [[#@LINE]]: __gcov_dump(); // CHECK-NEXT: 1: [[#@LINE]]: diff --git a/compiler-rt/test/ubsan/TestCases/Float/cast-overflow.cpp b/compiler-rt/test/ubsan/TestCases/Float/cast-overflow.cpp index 1c680259a2471..479c39f28428a 100644 --- a/compiler-rt/test/ubsan/TestCases/Float/cast-overflow.cpp +++ b/compiler-rt/test/ubsan/TestCases/Float/cast-overflow.cpp @@ -11,9 +11,6 @@ // FIXME: not %run %t 8 2>&1 | FileCheck %s --check-prefix=CHECK-8 // RUN: not %run %t 9 2>&1 | FileCheck %s --check-prefix=CHECK-9 -// Bug 42535 -// XFAIL: sparc-target-arch - // This test assumes float and double are IEEE-754 single- and double-precision. #if defined(__APPLE__) diff --git a/compiler-rt/test/ubsan/TestCases/TypeCheck/vptr.cpp b/compiler-rt/test/ubsan/TestCases/TypeCheck/vptr.cpp index 67239e82d340d..ac35e42275710 100644 --- a/compiler-rt/test/ubsan/TestCases/TypeCheck/vptr.cpp +++ b/compiler-rt/test/ubsan/TestCases/TypeCheck/vptr.cpp @@ -162,7 +162,7 @@ int access_p(T *p, char type) { case 'm': // CHECK-MEMBER: vptr.cpp:[[@LINE+6]]:15: runtime error: member access within address [[PTR:0x[0-9a-f]*]] which does not point to an object of type 'T' // CHECK-MEMBER-NEXT: [[PTR]]: note: object is of type [[DYN_TYPE:'S'|'U']] - // CHECK-MEMBER-NEXT: {{^ .. .. .. .. .. .. .. .. .. .. .. .. }} + // CHECK-MEMBER-NEXT: {{^ ?.. .. .. .. ?.. .. .. .. ?.. .. .. .. ?}} // CHECK-MEMBER-NEXT: {{^ \^~~~~~~~~~~(~~~~~~~~~~~~)? *$}} // CHECK-MEMBER-NEXT: {{^ vptr for}} [[DYN_TYPE]] // CHECK-Linux-MEMBER: #0 {{.*}}access_p{{.*}}vptr.cpp:[[@LINE+1]] @@ -178,7 +178,7 @@ int access_p(T *p, char type) { case 'f': // CHECK-MEMFUN: vptr.cpp:[[@LINE+6]]:15: runtime error: member call on address [[PTR:0x[0-9a-f]*]] which does not point to an object of type 'T' // CHECK-MEMFUN-NEXT: [[PTR]]: note: object is of type [[DYN_TYPE:'S'|'U']] - // CHECK-MEMFUN-NEXT: {{^ .. .. .. .. .. .. .. .. .. .. .. .. }} + // CHECK-MEMFUN-NEXT: {{^ ?.. .. .. .. ?.. .. .. .. ?.. .. .. .. ?}} // CHECK-MEMFUN-NEXT: {{^ \^~~~~~~~~~~(~~~~~~~~~~~~)? *$}} // CHECK-MEMFUN-NEXT: {{^ vptr for}} [[DYN_TYPE]] // TODO: Add check for stacktrace here. 
@@ -196,7 +196,7 @@ int access_p(T *p, char type) { case 'c': // CHECK-DOWNCAST: vptr.cpp:[[@LINE+6]]:11: runtime error: downcast of address [[PTR:0x[0-9a-f]*]] which does not point to an object of type 'T' // CHECK-DOWNCAST-NEXT: [[PTR]]: note: object is of type [[DYN_TYPE:'S'|'U']] - // CHECK-DOWNCAST-NEXT: {{^ .. .. .. .. .. .. .. .. .. .. .. .. }} + // CHECK-DOWNCAST-NEXT: {{^ ?.. .. .. .. ?.. .. .. .. ?.. .. .. .. ?}} // CHECK-DOWNCAST-NEXT: {{^ \^~~~~~~~~~~(~~~~~~~~~~~~)? *$}} // CHECK-DOWNCAST-NEXT: {{^ vptr for}} [[DYN_TYPE]] // CHECK-Linux-DOWNCAST: #0 {{.*}}access_p{{.*}}vptr.cpp:[[@LINE+1]] diff --git a/compiler-rt/utils/generate_netbsd_syscalls.awk b/compiler-rt/utils/generate_netbsd_syscalls.awk index cc7ba314ea551..1bddc0f2f2bff 100755 --- a/compiler-rt/utils/generate_netbsd_syscalls.awk +++ b/compiler-rt/utils/generate_netbsd_syscalls.awk @@ -1167,6 +1167,8 @@ function syscall_body(syscall, mode) pcmd("/* TODO */") } else if (syscall == "dup2") { pcmd("/* Nothing to do */") + } else if (syscall == "getrandom") { + pcmd("/* TODO */") } else if (syscall == "fcntl") { pcmd("/* Nothing to do */") } else if (syscall == "compat_50_select") { @@ -1431,6 +1433,12 @@ function syscall_body(syscall, mode) pcmd("/* TODO */") } else if (syscall == "sysarch") { pcmd("/* TODO */") + } else if (syscall == "__futex") { + pcmd("/* TODO */") + } else if (syscall == "__futex_set_robust_list") { + pcmd("/* TODO */") + } else if (syscall == "__futex_get_robust_list") { + pcmd("/* TODO */") } else if (syscall == "compat_10_osemsys") { pcmd("/* TODO */") } else if (syscall == "compat_10_omsgsys") { @@ -3027,6 +3035,32 @@ function syscall_body(syscall, mode) pcmd(" PRE_READ(fhp_, fh_size_);") pcmd("}") } + } else if (syscall == "__acl_get_link") { + pcmd("/* TODO */") + } else if (syscall == "__acl_set_link") { + pcmd("/* TODO */") + } else if (syscall == "__acl_delete_link") { + pcmd("/* TODO */") + } else if (syscall == "__acl_aclcheck_link") { + pcmd("/* TODO */") + } else if (syscall == "__acl_get_file") { + pcmd("/* TODO */") + } else if (syscall == "__acl_set_file") { + pcmd("/* TODO */") + } else if (syscall == "__acl_get_fd") { + pcmd("/* TODO */") + } else if (syscall == "__acl_set_fd") { + pcmd("/* TODO */") + } else if (syscall == "__acl_delete_file") { + pcmd("/* TODO */") + } else if (syscall == "__acl_delete_fd") { + pcmd("/* TODO */") + } else if (syscall == "__acl_aclcheck_file") { + pcmd("/* TODO */") + } else if (syscall == "__acl_aclcheck_fd") { + pcmd("/* TODO */") + } else if (syscall == "lpathconf") { + pcmd("/* TODO */") } else { print "Unrecognized syscall: " syscall abnormal_exit = 1 diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt index 03440b72ec8ca..daae9e9b1246e 100644 --- a/flang/CMakeLists.txt +++ b/flang/CMakeLists.txt @@ -17,6 +17,7 @@ if (POLICY CMP0077) endif() option(LINK_WITH_FIR "Link driver with FIR and LLVM" ON) +option(FLANG_BUILD_NEW_DRIVER "Build the flang compiler driver" OFF) # Flang requires C++17. set(CMAKE_CXX_STANDARD 17) @@ -56,7 +57,16 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) # We need a pre-built/installed version of LLVM. find_package(LLVM REQUIRED HINTS "${LLVM_CMAKE_PATH}") - list(APPEND CMAKE_MODULE_PATH ${LLVM_DIR}) + # If the user specifies a relative path to LLVM_DIR, the calls to include + # LLVM modules fail. Append the absolute path to LLVM_DIR instead. 
+ get_filename_component(LLVM_DIR_ABSOLUTE ${LLVM_DIR} REALPATH) + list(APPEND CMAKE_MODULE_PATH ${LLVM_DIR_ABSOLUTE}) + + if(FLANG_BUILD_NEW_DRIVER) + # TODO: Remove when libclangDriver is lifted out of Clang + list(APPEND CMAKE_MODULE_PATH ${CLANG_DIR}) + find_package(Clang REQUIRED HINTS "${CLANG_DIR}") + endif() # If LLVM links to zlib we need the imported targets so we can too. @@ -78,7 +88,10 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) find_package(MLIR REQUIRED CONFIG) # Use SYSTEM for the same reasons as for LLVM includes include_directories(SYSTEM ${MLIR_INCLUDE_DIRS}) - list(APPEND CMAKE_MODULE_PATH ${MLIR_DIR}) + # If the user specifies a relative path to MLIR_DIR, the calls to include + # MLIR modules fail. Append the absolute path to MLIR_DIR instead. + get_filename_component(MLIR_DIR_ABSOLUTE ${MLIR_DIR} REALPATH) + list(APPEND CMAKE_MODULE_PATH ${MLIR_DIR_ABSOLUTE}) include(AddMLIR) find_program(MLIR_TABLEGEN_EXE "mlir-tblgen" ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH) @@ -194,6 +207,21 @@ else() endif() endif() +if(FLANG_BUILD_NEW_DRIVER) + # TODO: Remove when libclangDriver is lifted out of Clang + if(FLANG_STANDALONE_BUILD) + set(CLANG_INCLUDE_DIR ${CLANG_INCLUDE_DIRS} ) + # No need to specify TableGen output dir as that's embedded in CLANG_DIR + else() + set(CLANG_INCLUDE_DIR ${LLVM_MAIN_SRC_DIR}/../clang/include ) + # Specify TableGen output dir for things like DiagnosticCommonKinds.inc, + # DiagnosticDriverKinds.inc (required for reporting diagnostics) + set(CLANG_TABLEGEN_OUTPUT_DIR ${CMAKE_BINARY_DIR}/tools/clang/include) + include_directories(SYSTEM ${CLANG_TABLEGEN_OUTPUT_DIR}) + endif() + include_directories(SYSTEM ${CLANG_INCLUDE_DIR}) +endif() + if(LINK_WITH_FIR) # tco tool and FIR lib output directories if(FLANG_STANDALONE_BUILD) diff --git a/flang/README.md b/flang/README.md index 3a58c277bacf3..934169b9ae6ac 100644 --- a/flang/README.md +++ b/flang/README.md @@ -143,6 +143,21 @@ cd ~/flang/build cmake -DLLVM_DIR=$LLVM -DMLIR_DIR=$MLIR ~/flang/src make ``` + +### Build The New Flang Driver +The new Flang driver, `flang-new`, is currently under active development and +should be considered an experimental feature. For this reason it is disabled +by default. This will change once the new driver replaces the _throwaway_ +driver, `flang`. + +In order to build the new driver, add `-DFLANG_BUILD_NEW_DRIVER=ON` to your +CMake invocation line. Additionally, when building out-of-tree, use `CLANG_DIR` +(similarly to `LLVM_DIR` and `MLIR_DIR`) to find the installed Clang +components. + +**Note:** `CLANG_DIR` is only required when building the new Flang driver, +which currently depends on Clang. + # How to Run Tests Flang supports 2 different categories of tests diff --git a/flang/docs/ArrayComposition.md b/flang/docs/ArrayComposition.md index 18194caadf09c..9e61abe5670f3 100644 --- a/flang/docs/ArrayComposition.md +++ b/flang/docs/ArrayComposition.md @@ -1,3 +1,18 @@ + + +# Array Composition + +```eval_rst +.. contents:: + :local: +``` + This note attempts to describe the motivation for and design of an implementation of Fortran 90 (and later) array expression evaluation that minimizes the use of dynamically allocated temporary storage for @@ -26,8 +41,8 @@ Other Fortran intrinsic functions are technically transformational (e.g., `COMMAND_ARGUMENT_COUNT`) but not of interest for this note. The generic `REDUCE` is also not considered here.
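An editorial illustration of the note's central idea: an array viewed as a function from index tuples to element values, so that a transformational intrinsic such as TRANSPOSE becomes plain function composition with no temporary. This sketch is in C++ for concreteness and is not taken from f18 itself:

```cpp
#include <cassert>
#include <functional>
#include <utility>

using Index = std::pair<int, int>;            // a 1-based (j,k) index tuple
using ArrayFn = std::function<double(Index)>; // an "array" as a function

// TRANSPOSE(A) composes an index swap with A; nothing is materialized.
ArrayFn transpose(ArrayFn a) {
  return [a](Index jk) { return a({jk.second, jk.first}); };
}

int main() {
  // A(j,k) = 10*j + k over a notional 2x3 array.
  ArrayFn a = [](Index jk) { return 10.0 * jk.first + jk.second; };
  ArrayFn t = transpose(a);
  assert(t({3, 2}) == a({2, 3}));
  return 0;
}
```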
-Arrays as functions -=================== +## Arrays as functions + A whole array can be viewed as a function that maps its indices to the values of its elements. Specifically, it is a map from a tuple of integers to its element type. @@ -37,8 +52,8 @@ and the shape of the array delimits the domain of the map. `REAL :: A(N,M)` can be seen as a function mapping ordered pairs of integers `(J,K)` with `1<=J<=N` and `1<=K<=M` to real values. -Array expressions as functions -============================== +## Array expressions as functions + The same perspective can be taken of an array expression comprising intrinsic operators and elemental functions. Fortran doesn't allow one to apply subscripts directly to an expression, @@ -75,8 +90,8 @@ side variable as an operand of the right-hand side expression, and any function calls on the right-hand side are elemental or scalar-valued, we can avoid the use of a temporary. -Transformational intrinsic functions as function composition -============================================================ +## Transformational intrinsic functions as function composition + Many of the transformational intrinsic functions listed above can, when their array arguments are viewed as functions over their index tuples, be seen as compositions of those functions with @@ -119,8 +134,8 @@ More completely: * `SPREAD(A,DIM=d,NCOPIES=n)` for compile-time `d` simply applies `A` to a reduced index tuple. -Determination of rank and shape -=============================== +## Determination of rank and shape + An important part of evaluating array expressions without the use of temporary storage is determining the shape of the result prior to, or without, evaluating the elements of the result. @@ -165,8 +180,8 @@ In cases where the analyzed shape is known at compile time, we should be able to have the opportunity to avoid heap allocation in favor of stack storage, if the scope of the variable is local. -Automatic reallocation of allocatables -====================================== +## Automatic reallocation of allocatables + Fortran 2003 introduced the ability to assign non-conforming array expressions to ALLOCATABLE arrays with the implied semantics of reallocation to the new shape. @@ -174,8 +189,8 @@ The implementation of this feature also becomes more straightforward if our implementation of array expressions has decoupled calculation of shapes from the evaluation of the elements of the result. -Rewriting rules -=============== +## Rewriting rules + Let `{...}` denote an ordered tuple of 1-based indices, e.g. `{j,k}`, into the result of an array expression or subexpression. diff --git a/flang/docs/BijectiveInternalNameUniquing.md b/flang/docs/BijectiveInternalNameUniquing.md index b302d389c664f..7a6e8a4f4e644 100644 --- a/flang/docs/BijectiveInternalNameUniquing.md +++ b/flang/docs/BijectiveInternalNameUniquing.md @@ -1,4 +1,9 @@ -## Bijective Internal Name Uniquing +# Bijective Internal Name Uniquing + +```eval_rst +.. contents:: + :local: +``` FIR has a flat namespace. No two objects may have the same name at the module level. (These would be functions, globals, etc.) @@ -13,14 +18,14 @@ Fortran is case insensitive, which allows the compiler to convert the user's identifiers to all lower case. Such a universal conversion implies that all upper case letters are available for use in uniquing. -### Prefix `_Q` +## Prefix `_Q` All uniqued names have the prefix sequence `_Q` to indicate the name has been uniqued.
(Q is chosen because it is a [low frequency letter](http://pi.math.cornell.edu/~mec/2003-2004/cryptography/subs/frequencies.html) in English.) -### Scope Building +## Scope Building Symbols can be scoped by the module, submodule, or procedure that contains that symbol. After the `_Q` sigil, names are constructed from outermost to @@ -45,7 +50,7 @@ The uniqued name of `fun` becomes: _QMmodSs1modSs2modFsubPfun ``` -### Common blocks +## Common blocks * A common block name will be prefixed with `B` @@ -69,7 +74,7 @@ The uniqued name in case of `blank common block` becomes: _QB ``` -### Module scope global data +## Module scope global data * A global data entity is prefixed with `E` * A global entity that is constant (parameter) will be prefixed with `EC` @@ -92,7 +97,7 @@ The uniqued name of `pi` becomes: _QMmodECpi ``` -### Procedures/Subprograms +## Procedures/Subprograms * A procedure/subprogram is prefixed with `P` @@ -105,7 +110,7 @@ The uniqued name of `sub` becomes: _QPsub ``` -### Derived types and related +## Derived types and related * A derived type is prefixed with `T` * If a derived type has KIND parameters, they are listed in a consistent @@ -148,7 +153,7 @@ The uniqued name of `yourtype` where `k1=4` and `k2=-6` (at compile-time): type `yourtype` above would be `_QCTyourtypeK4KN6`. The type descriptor for `REAL(4)` would be `_QCrealK4`. -### Compiler generated names +## Compiler generated names Compiler generated names do not have to be mapped back to Fortran. These names will be prefixed with `_QQ` and followed by a unique compiler diff --git a/flang/docs/C++17.md b/flang/docs/C++17.md index ea8395cfdedc7..9e0120d2e4c5e 100644 --- a/flang/docs/C++17.md +++ b/flang/docs/C++17.md @@ -1,4 +1,17 @@ -## C++14/17 features used in f18 + + +# C++14/17 features used in f18 + +```eval_rst +.. contents:: + :local: +``` The C++ dialect used in this project constitutes a subset of the standard C++ programming language and library features. @@ -24,7 +37,7 @@ The most important of these are: (`std::tuple` is actually a C++11 feature, but I include it in this list because it's not particularly well known.) -### Sum types +## Sum types First, some background information to explain the need for sum types in f18. @@ -103,7 +116,7 @@ would be to: functions (or the forbidden `dynamic_cast`) to identify alternatives during analysis -### Product types +## Product types Many productions in the Fortran grammar describe a sequence of various sub-parses. @@ -125,7 +138,7 @@ So we use `std::tuple` for such things. It has also been handy for template metaprogramming that needs to work with lists of types. -### `std::optional` +## `std::optional` This simple little type is used wherever a value might or might not be present. diff --git a/flang/docs/C++style.md b/flang/docs/C++style.md index 77e0a04638238..fb11e64116141 100644 --- a/flang/docs/C++style.md +++ b/flang/docs/C++style.md @@ -1,3 +1,20 @@ + + +# Flang C++ Style Guide + +```eval_rst +.. contents:: + :local: +``` + +This document captures the style guide rules that are followed in the Flang codebase. + ## In brief: * Use *clang-format* from llvm 7 diff --git a/flang/docs/Calls.md b/flang/docs/Calls.md index 8a4d65820d19f..440d0bd147c2d 100644 --- a/flang/docs/Calls.md +++ b/flang/docs/Calls.md @@ -1,3 +1,18 @@ + + +# Representation of Fortran function calls + +```eval_rst +.. contents:: + :local: +``` + ## Procedure reference implementation protocol Fortran function and subroutine references are complicated. 
diff --git a/flang/docs/Character.md b/flang/docs/Character.md index f66b144389450..603dd8848ba1b 100644 --- a/flang/docs/Character.md +++ b/flang/docs/Character.md @@ -1,6 +1,19 @@ -## Implementation of `CHARACTER` types in f18 + + +# Implementation of `CHARACTER` types in f18 + +```eval_rst +.. contents:: + :local: +``` + +## Kinds and Character Sets The f18 compiler and runtime support three kinds of the intrinsic `CHARACTER` type of Fortran 2018. @@ -40,7 +53,7 @@ We might want to support one or more environment variables to change these assumptions, especially for `KIND=1` users of ISO-8859 character sets besides Latin-1. -### Lengths +## Lengths Allocatable `CHARACTER` objects in Fortran may defer the specification of their lengths until the time of their allocation or whole (non-substring) @@ -68,7 +81,7 @@ Fortran substrings are rather like subscript triplets into a hidden "zero" dimension of a scalar `CHARACTER` value, but they cannot have strides. -### Concatenation +## Concatenation Fortran has one `CHARACTER`-valued intrinsic operator, `//`, which concatenates its operands (10.1.5.3). @@ -97,7 +110,7 @@ The result of `//` may be used The f18 compiler has a general (but slow) means of implementing concatenation and a specialized (fast) option to optimize the most common case. -#### General concatenation +### General concatenation In the most general case, the f18 compiler's generated code and runtime support library represent the result as a deferred-length allocatable @@ -122,7 +135,7 @@ When the left-hand side of a `CHARACTER` assignment is a deferred-length allocatable and the right-hand side is a temporary, use of the runtime's `MoveAlloc()` subroutine instead can save an allocation and a copy. -#### Optimized concatenation +### Optimized concatenation Scalar `CHARACTER(KIND=1)` expressions evaluated as the right-hand sides of assignments to independent substrings or whole variables that are not diff --git a/flang/docs/ControlFlowGraph.md b/flang/docs/ControlFlowGraph.md index 7d1e514a87adb..dcdecf1b77f65 100644 --- a/flang/docs/ControlFlowGraph.md +++ b/flang/docs/ControlFlowGraph.md @@ -1,3 +1,18 @@ + + +# Control Flow Graph + +```eval_rst +.. contents:: + :local: +``` + ## Concept After a Fortran subprogram has been parsed, its names resolved, and all its semantic constraints successfully checked, the parse tree of its diff --git a/flang/docs/Directives.md b/flang/docs/Directives.md index 554dc4608dd43..a1a99b674cef2 100644 --- a/flang/docs/Directives.md +++ b/flang/docs/Directives.md @@ -1,5 +1,14 @@ -Compiler directives supported by F18 -==================================== + + +# Compiler directives supported by Flang + +A list of non-standard directives supported by Flang * `!dir$ fixed` and `!dir$ free` select Fortran source forms. Their effect persists to the end of the current source file. diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 027927f67dfd4..1c85c3f42d1b1 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -1,3 +1,18 @@ + + +# Fortran Extensions supported by Flang + +```eval_rst +.. contents:: + :local: +``` + As a general principle, this compiler will accept by default and without complaint many legacy features, extensions to the standard language, and features that have been deleted from the standard, @@ -8,8 +23,8 @@ Other non-standard features, which do conflict with the current standard specification of the Fortran programming language, are accepted if enabled by command-line options. 
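
As a quick illustration (an invented snippet; the governing lists follow below), the legacy `<>` comparison operator is the sort of extension accepted by default, while others are recognized only behind a command-line flag:

```fortran
! Invented snippet: `<>` is a legacy synonym for /= that this compiler
! accepts by default (see the list of default extensions below).
program legacy_demo
  implicit none
  integer :: i = 1
  if (i <> 2) print *, 'legacy inequality operator accepted'
end program
```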
-Intentional violations of the standard -====================================== +## Intentional violations of the standard + * Scalar `INTEGER` actual argument expressions (not variables!) are converted to the kinds of scalar `INTEGER` dummy arguments when the interface is explicit and the kinds differ. @@ -21,8 +36,8 @@ Intentional violations of the standard so long as they contain no executable code, no internal subprograms, and allocate no storage outside a named `COMMON` block. (C1415) -Extensions, deletions, and legacy features supported by default -=============================================================== +## Extensions, deletions, and legacy features supported by default + * Tabs in source * `<>` as synonym for `.NE.` and `/=` * `$` and `@` as legal characters in names @@ -115,8 +130,8 @@ Extensions, deletions, and legacy features supported by default * DATA statement initialization is allowed for procedure pointers outside structure constructors. -Extensions supported when enabled by options --------------------------------------------- +### Extensions supported when enabled by options + * C-style backslash escape sequences in quoted CHARACTER literals (but not Hollerith) [-fbackslash] * Logical abbreviations `.T.`, `.F.`, `.N.`, `.A.`, `.O.`, and `.X.` @@ -137,8 +152,8 @@ Extensions supported when enabled by options * Ignore occurrences of `IMPLICIT NONE` and `IMPLICIT NONE(TYPE)` [-fimplicit-none-type-never] -Extensions and legacy features deliberately not supported ---------------------------------------------------------- +### Extensions and legacy features deliberately not supported + * `.LG.` as synonym for `.NE.` * `REDIMENSION` * Allocatable `COMMON` @@ -181,8 +196,8 @@ Extensions and legacy features deliberately not supported PGI, Intel, and XLF support this in ways that are not numerically equivalent. PGI converts the arguments while Intel and XLF replace the specific by the related generic. -Preprocessing behavior -====================== +## Preprocessing behavior + * The preprocessor is always run, whatever the filename extension may be. * We respect Fortran comments in macro actual arguments (like GNU, Intel, NAG; unlike PGI and XLF) on the principle that macro calls should be treated diff --git a/flang/docs/FortranForCProgrammers.md b/flang/docs/FortranForCProgrammers.md index 542034f3ea833..572433ab7c154 100644 --- a/flang/docs/FortranForCProgrammers.md +++ b/flang/docs/FortranForCProgrammers.md @@ -1,5 +1,17 @@ -Fortran For C Programmers -========================= + + +# Fortran For C Programmers + +```eval_rst +.. contents:: + :local: +``` This note is limited to essential information about Fortran so that a C or C++ programmer can get started more quickly with the language, @@ -8,8 +20,8 @@ to write or modify Fortran code. Please see other sources to learn about Fortran's rich history, current applications, and modern best practices in new code. -Know This At Least ------------------- +## Know This At Least + * There have been many implementations of Fortran, often from competing vendors, and the standard language has been defined by U.S. and international standards organizations. The various editions of @@ -45,8 +57,8 @@ Know This At Least interfaces in compiled "modules", as well as legacy mechanisms for sharing data and interconnecting subprograms. -A Rosetta Stone ---------------- +## A Rosetta Stone + Fortran's language standard and other documentation uses some terminology in particular ways that might be unfamiliar. 
@@ -73,8 +85,8 @@ in particular ways that might be unfamiliar. | Type-bound procedure | Kind of a C++ member function but not really | | Unformatted | Raw binary | -Data Types ----------- +## Data Types + There are five built-in ("intrinsic") types: `INTEGER`, `REAL`, `COMPLEX`, `LOGICAL`, and `CHARACTER`. They are parameterized with "kind" values, which should be treated as @@ -109,8 +121,8 @@ Last, there are "typeless" binary constants that can be used in a few situations, like static data initialization or immediate conversion, where type is not necessary. -Arrays ------- +## Arrays + Arrays are not types in Fortran. Being an array is a property of an object or function, not of a type. Unlike C, one cannot have an array of arrays or an array of pointers, @@ -125,8 +137,8 @@ And yes, the default lower bound on each dimension is 1, not 0. Expressions can manipulate arrays as multidimensional values, and the compiler will create the necessary loops. -Allocatables ------------- +## Allocatables + Modern Fortran programs use `ALLOCATABLE` data extensively. Such variables and derived type components are allocated dynamically. They are automatically deallocated when they go out of scope, much @@ -139,8 +151,8 @@ and follow up all the references that are made in the documentation from the description of `ALLOCATABLE` to other topics; it's a feature that interacts with much of the rest of the language.) -I/O ---- +## I/O + Fortran's input/output features are built into the syntax of the language, rather than being defined by library interfaces as in C and C++. There are means for raw binary I/O and for "formatted" transfers to @@ -165,8 +177,8 @@ One can also use compiler-generated formatting in "list-directed" I/O, in which the compiler derives reasonable default formats based on data types. -Subprograms ------------ +## Subprograms + Fortran has both `FUNCTION` and `SUBROUTINE` subprograms. They share the same name space, but functions cannot be called as subroutines or vice versa. @@ -180,8 +192,8 @@ their own internal procedures. As is the case with C++ lambda expressions, internal procedures can reference names from their host subprograms. -Modules -------- +## Modules + Modern Fortran has good support for separate compilation and namespace management. The *module* is the basic unit of compilation, although independent @@ -196,8 +208,8 @@ All references to objects in modules are done with direct names or aliases that have been added to the local scope, as Fortran has no means of qualifying references with module names. -Arguments ---------- +## Arguments + Functions and subroutines have "dummy" arguments that are dynamically associated with actual arguments during calls. Essentially, all argument passing in Fortran is by reference, not value. @@ -228,8 +240,8 @@ scope. This is the opposite of the assumptions under which a C or C++ compiler must labor when trying to optimize code with pointers. -Overloading ------------ +## Overloading + Fortran supports a form of overloading via its interface feature. By default, an interface is a means for specifying prototypes for a set of subroutines and functions. @@ -242,8 +254,8 @@ A similar feature can be used for generic type-bound procedures. This feature can be used to overload the built-in operators and some I/O statements, too. 
-Polymorphism ------------- +## Polymorphism + Fortran code can be written to accept data of some derived type or any extension thereof using `CLASS`, deferring the actual type to execution, rather than the usual `TYPE` syntax. @@ -253,8 +265,8 @@ Fortran's `SELECT TYPE` construct is used to distinguish between possible specific types dynamically, when necessary. It's a little like C++17's `std::visit()` on a discriminated union. -Pointers --------- +## Pointers + Pointers are objects in Fortran, not data types. Pointers can point to data, arrays, and subprograms. A pointer can only point to data that has the `TARGET` attribute. @@ -279,8 +291,8 @@ out of scope. A legacy feature, "Cray pointers", implements dynamic base addressing of one variable using an address stored in another. -Preprocessing -------------- +## Preprocessing + There is no standard preprocessing feature, but every real Fortran implementation has some support for passing Fortran source code through a variant of the standard C source preprocessor. @@ -294,8 +306,8 @@ suffix (e.g., "foo.F90") or a compiler command line option. (Since the F18 compiler always runs its built-in preprocessing stage, no special option or filename suffix is required.) -"Object Oriented" Programming ------------------------------ +## "Object Oriented" Programming + Fortran doesn't have member functions (or subroutines) in the sense that C++ does, in which a function has immediate access to the members of a specific instance of a derived type. @@ -317,8 +329,8 @@ There's a lot more that can be said about type-bound procedures (e.g., how they support overloading) but this should be enough to get you started with the most common usage. -Pitfalls --------- +## Pitfalls + Variable initializers, e.g. `INTEGER :: J=123`, are _static_ initializers! They imply that the variable is stored in static storage, not on the stack, and the initialized value lasts only until the variable is assigned. diff --git a/flang/docs/FortranIR.md b/flang/docs/FortranIR.md index 83193ff27a359..f1f643a1d17da 100644 --- a/flang/docs/FortranIR.md +++ b/flang/docs/FortranIR.md @@ -1,5 +1,18 @@ + + # Design: Fortran IR +```eval_rst +.. contents:: + :local: +``` + ## Introduction After semantic analysis is complete and it has been determined that the compiler has a legal Fortran program as input, the parse tree will be lowered to an intermediate representation for the purposes of high-level analysis and optimization. In this document, that intermediate representation will be called Fortran IR or FIR. The pass that converts from the parse tree and other data structures of the front-end to FIR will be called the "Burnside bridge". diff --git a/flang/docs/GettingInvolved.md b/flang/docs/GettingInvolved.md new file mode 100644 index 0000000000000..a244fbcee56a0 --- /dev/null +++ b/flang/docs/GettingInvolved.md @@ -0,0 +1,72 @@ + +# Getting Involved + +```eval_rst +.. contents:: + :local: +``` + +The Flang Project welcomes contributions of all kinds. +Please feel free to join the mailing list or the slack channel for discussions related to development of Flang. +To understand the status of various developments in Flang please join the respective call. + +## Mailing Lists + +[Developer's List (flang-dev)](http://lists.llvm.org/mailman/listinfo/flang-dev) + + This list is for people who want to be included in technical discussions related to Flang. People post to this list when they have questions about writing code + for or using the Flang tools. It is relatively low volume. 
+ + +[Commits Archive (flang-commits)](http://lists.llvm.org/pipermail/flang-commits) + + This list contains all commit messages that are made when Flang developers + commit code changes to the repository. It also serves as a forum for + patch review (i.e. send patches here). It is useful for those who want to + stay on the bleeding edge of Flang development. This list is high + volume. + +## Chat + +### Flang Slack Workspace + +- There is a Slack workspace dedicated to Flang. +- There are a number of topic-oriented channels available (e.g., #driver, #f18-semantics, #fir). +- Add yourself via the *[invitation link](https://join.slack.com/t/flang-compiler/shared_invite/zt-2pcn51lh-VrRQL_YUOkxA_1CEfMGQhw "title")* + +## Calls + +### Flang Community Biweekly Call + +- General updates on the Flang Project, both LLVM Flang and current Flang. +- Join [Flang Community Biweekly Call](https://nvmeet.webex.com/nvmeet/j.php?MTID=mb4edb8c799f69ec2dc0554acc969a162) +- Time: On Wednesdays 8:30 Pacific Time, on the weeks alternating with regular Flang Community Technical Biweekly Call. +- Minutes: They are sent to [flang-dev](http://lists.llvm.org/mailman/listinfo/flang-dev). Search for `Flang Biweekly Sync - Notes`. + +### Flang Community Technical Biweekly Call + +- Technical topics call. +- Join [Flang Community Technical Biweekly Call](https://bluejeans.com/625064848?src=join_info) +- Time: On Mondays 8:30 Pacific Time, on the weeks alternating with regular Flang Community Biweekly Call. +- The agenda is in this [Google Doc](https://docs.google.com/document/d/1Z2U5UAtJ-Dag5wlMaLaW1KRmNgENNAYynJqLW2j2AZQ/). + +### LLVM Alias Analysis Technical Call + +- For people working on improvements to LLVM alias analysis. +- Join [LLVM Alias Analysis Technical Call](https://bluejeans.com/101176001?src=join_info) +- Time: Tuesdays 10:00 AM Pacific Time, every 4 weeks. +- The agenda is in this [Google Doc](https://docs.google.com/document/d/1ybwEKDVtIbhIhK50qYtwKsL50K-NvB6LfuBsfepBZ9Y/). + +### OpenMP Technical Call + +- Development updates on OpenMP and OpenACC in the Flang Project. +- Join [OpenMP Technical Call](https://bit.ly/39eQW3o) +- Time: Weekly call on every Thursdays 8:00 AM Pacific time. +- Meeting minutes are [here](https://docs.google.com/document/d/1yA-MeJf6RYY-ZXpdol0t7YoDoqtwAyBhFLr5thu5pFI). +- Status tracking [page](https://docs.google.com/spreadsheets/d/1FvHPuSkGbl4mQZRAwCIndvQx9dQboffiD-xD0oqxgU0/edit#gid=0). diff --git a/flang/docs/IORuntimeInternals.md b/flang/docs/IORuntimeInternals.md index 8ff464ee9c8f7..2748fcf16fa3c 100644 --- a/flang/docs/IORuntimeInternals.md +++ b/flang/docs/IORuntimeInternals.md @@ -1,5 +1,17 @@ -Fortran I/O Runtime Library Internal Design -=========================================== + + +# Fortran I/O Runtime Library Internal Design + +```eval_rst +.. contents:: + :local: +``` This note is meant to be an overview of the design of the *implementation* of the f18 Fortran compiler's runtime support library for I/O statements. @@ -58,8 +70,7 @@ template library of fast conversion algorithms used to interpret floating-point values in Fortran source programs and to emit them to module files. -Overview of Classes -=================== +## Overview of Classes A suite of C++ classes and class templates are composed to construct the Fortran I/O runtime support library. @@ -71,16 +82,16 @@ classes are in the process of being vigorously rearranged and modified; use `grep` or an IDE to discover these classes in the source for now. (Sorry!) 
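
To keep the roles of these classes concrete, here is one invented Fortran statement and the concepts it exercises; each concept maps onto one of the classes described below.

```fortran
! Invented example: an external unit (6), a FORMAT with a data edit
! descriptor (E20.14), an io-list ('x = ', x), and IOSTAT= capture of
! error conditions -- the runtime manages each of these separately.
program io_demo
  implicit none
  integer :: ios
  real :: x = 3.14159
  write (6, '(A, E20.14)', iostat=ios) 'x = ', x
  if (ios /= 0) print *, 'I/O error: ', ios
end program
```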
-`Terminator` ----------- +### `Terminator` + A general facility for the entire library, `Terminator` latches a source program statement location in terms of an unowned pointer to its source file path name and line number and uses them to construct a fatal error message if needed. It is used for both user program errors and internal runtime library crashes. -`IoErrorHandler` --------------- +### `IoErrorHandler` + When I/O error conditions arise at runtime that the Fortran program might have the privilege to handle itself via `ERR=`, `END=`, or `EOR=` labels and/or by an `IOSTAT=` variable, this subclass of @@ -88,8 +99,8 @@ might have the privilege to handle itself via `ERR=`, `END=`, or It sorts out priorities in the case of multiple errors and determines the final `IOSTAT=` value at the end of an I/O statement. -`MutableModes` ------------- +### `MutableModes` + Fortran's formatted I/O statements are affected by a suite of modes that can be configured by `OPEN` statements, overridden by data transfer I/O statement control lists, and further overridden @@ -100,8 +111,8 @@ order to properly isolate their modifications. The modes in force at the time each data item is processed constitute a member of each `DataEdit`. -`DataEdit` --------- +### `DataEdit` + Represents a single data edit descriptor from a `FORMAT` statement or `FMT=` character value, with some hidden extensions to also support formatting of list-directed transfers. @@ -111,8 +122,8 @@ For simplicity and efficiency, each data edit descriptor is encoded in the `DataEdit` as a simple capitalized character (or two) and some optional field widths. -`FormatControl<>` ---------------- +### `FormatControl<>` + This class template traverses a `FORMAT` statement's contents (or `FMT=` character value) to extract data edit descriptors like `E20.14` to serve each item in an I/O data transfer statement's *io-list*, @@ -134,32 +145,32 @@ output strings or record positionings at the end of the *io-list*. The `DefaultFormatControlCallbacks` structure summarizes the API expected by `FormatControl` from its class template actual arguments. -`OpenFile` --------- +### `OpenFile` + This class encapsulates all (I hope) the operating system interfaces used to interact with the host's filesystems for operations on external units. Asynchronous I/O interfaces are faked for now with synchronous operations and deferred results. -`ConnectionState` ---------------- +### `ConnectionState` + An active connection to an external or internal unit maintains the common parts of its state in this subclass of `ConnectionAttributes`. The base class holds state that should not change during the lifetime of the connection, while the subclass maintains state that may change during I/O statement execution. -`InternalDescriptorUnit` ----------------------- +### `InternalDescriptorUnit` + When I/O is being performed from/to a Fortran `CHARACTER` array rather than an external file, this class manages the standard interoperable descriptor used to access its elements as records. It has the necessary interfaces to serve as an actual argument to the `FormatControl` class template. -`FileFrame<>` ------------ +### `FileFrame<>` + This CRTP class template isolates all of the complexity involved between an external unit's `OpenFile` and the buffering requirements imposed by the capabilities of Fortran `FORMAT` control edit @@ -184,8 +195,8 @@ a frame may come up short. As a CRTP class template, `FileFrame` accesses the raw filesystem facilities it needs from `*this`. 
-`ExternalFileUnit` ---------------- +### `ExternalFileUnit` + This class mixes in `ConnectionState`, `OpenFile`, and `FileFrame` to represent the state of an open (or soon to be opened) external file descriptor as a Fortran @@ -202,8 +213,8 @@ Static member functions `LookUp()`, `LookUpOrCrash()`, and `LookUpOrCreate()` probe the map to convert Fortran `UNIT=` numbers from I/O statements into references to active units. -`IoStatementBase` ---------------- +### `IoStatementBase` + The subclasses of `IoStatementBase` each encapsulate and maintain the state of one active Fortran I/O statement across the several I/O runtime library API function calls it may comprise. @@ -231,8 +242,8 @@ the I/O API supports a means whereby the code generated for the Fortran program may supply stack space to the I/O runtime support library for this purpose. -`IoStatementState` ---------------- +### `IoStatementState` + F18's Fortran I/O runtime support library defines and implements an API that uses a sequence of function calls to implement each Fortran I/O statement. @@ -261,8 +272,8 @@ unit, the library has to treat that (expected to be rare) situation as a weird variation of internal I/O since there's no `ExternalFileUnit` available to hold its `IoStatementBase` subclass or `IoStatementState`. -A Narrative Overview Of `PRINT *, 'HELLO, WORLD'` -================================================= +## A Narrative Overview Of `PRINT *, 'HELLO, WORLD'` + 1. When the compiled Fortran program begins execution at the `main()` entry point exported from its main program, it calls `ProgramStart()` with its arguments and environment. diff --git a/flang/docs/ImplementingASemanticCheck.md b/flang/docs/ImplementingASemanticCheck.md index 2406f5bc2a58c..35b107e4988eb 100644 --- a/flang/docs/ImplementingASemanticCheck.md +++ b/flang/docs/ImplementingASemanticCheck.md @@ -1,10 +1,24 @@ + +# How to implement a Semantic Check in Flang + +```eval_rst
.. contents::
   :local:
``` + I recently added a semantic check to the f18 compiler front end. This document describes my thought process and the resulting implementation. For more information about the compiler, start with the [compiler overview](Overview.md). -# Problem definition +## Problem definition In the 2018 Fortran standard, section 11.1.7.4.3, paragraph 2, states that: @@ -21,7 +35,7 @@ emit a warning if an active DO variable was passed to a dummy argument with INTENT(INOUT). Previously, I had implemented similar checks for SUBROUTINE calls. -# Creating a test +## Creating a test My first step was to create a test case to cause the problem. I called it testfun.f90 and used it to check the behavior of other Fortran compilers. Here's the initial version: @@ -86,14 +100,14 @@ constant 216 in the statement: ```fortran dummyArg = 216 ``` -# Analysis and implementation planning +## Analysis and implementation planning I then considered what I needed to do. I needed to detect situations where an active DO variable was passed to a dummy argument with `INTENT(OUT)` or `INTENT(INOUT)`. Once I detected such a situation, I needed to produce a message that highlighted the erroneous source code. -## Deciding where to add the code to the compiler +### Deciding where to add the code to the compiler This new semantic check would depend on several types of information -- the parse tree, source code location information, symbols, and expressions.
Thus I needed to put my new code in a place in the compiler after the parse tree had @@ -143,7 +157,7 @@ Since my semantic check was focused on DO CONCURRENT statements, I added it to the file `lib/Semantics/check-do.cpp` where most of the semantic checking for DO statements already lived. -## Taking advantage of prior work +### Taking advantage of prior work When implementing a similar check for SUBROUTINE calls, I created utility functions in `lib/Semantics/semantics.cpp` to emit messages if a symbol corresponding to an active DO variable was being potentially modified: @@ -165,7 +179,7 @@ information -- The first and third are needed since they're required to call the utility functions. The second is needed to determine whether to call them. -## Finding the source location +### Finding the source location The source code location information that I'd need for the error message must come from the parse tree. I looked in the file `include/flang/Parser/parse-tree.h` and determined that a `struct Expr` @@ -173,7 +187,7 @@ contained source location information since it had the field `CharBlock source`. Thus, if I visited a `parser::Expr` node, I could get the source location information for the associated expression. -## Determining the `INTENT` +### Determining the `INTENT` I knew that I could find the `INTENT` of the dummy argument associated with the actual argument from the function called `dummyIntent()` in the class `evaluate::ActualArgument` in the file `include/flang/Evaluate/call.h`. So @@ -240,7 +254,7 @@ This combination of the traversal framework and `dummyIntent()` would give me the `INTENT` of all of the dummy arguments in a FUNCTION call. Thus, I would have the second piece of information I needed. -## Determining if the actual argument is a variable +### Determining if the actual argument is a variable I also guessed that I could determine if the `evaluate::ActualArgument` consisted of a variable. @@ -256,9 +270,9 @@ needed -- the source location of the erroneous text, the `INTENT` of the dummy argument, and a symbol that I could use to determine whether the actual argument was an active DO variable. -# Implementation +## Implementation -## Adding a parse tree visitor +### Adding a parse tree visitor I started my implementation by adding a visitor for `parser::Expr` nodes. Since this analysis is part of DO construct checking, I did this in `lib/Semantics/check-do.cpp`. I added a print statement to the visitor to @@ -300,7 +314,7 @@ source position of the associated expression (`CharBlock source`). So I now had one of the three pieces of information needed to detect and report errors. -## Collecting the actual arguments +### Collecting the actual arguments To get the `INTENT` of the dummy arguments and the `semantics::Symbol` associated with the actual argument, I needed to find all of the actual arguments embedded in an expression that contained a FUNCTION call. So my next step was to write the @@ -466,7 +480,7 @@ node. So far, so good. -## Finding the `INTENT` of the dummy argument +### Finding the `INTENT` of the dummy argument I now wanted to find the `INTENT` of the dummy argument associated with the arguments in the set. As mentioned earlier, the type `evaluate::ActualArgument` has a member function called `dummyIntent()` @@ -510,7 +524,7 @@ I then modified my test case to convince myself that I was getting the correct So far, so good.
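
Putting those pieces together, here is a trimmed sketch of the situation the new check must flag (identifier names are illustrative; the complete test case appears later in this document): the active DO variable `ivar` reaches an `INTENT(OUT)` dummy argument through a function reference.

```fortran
! Trimmed, illustrative sketch: ivar is an active DO variable when it
! is passed to the INTENT(OUT) dummy argument of intentOutFunc().
subroutine s()
  implicit none
  integer :: ivar, jvar
  do ivar = 1, 10
    jvar = intentOutFunc(ivar)   ! bad usage: ivar may be redefined
  end do
contains
  integer function intentOutFunc(dummyArg)
    integer, intent(out) :: dummyArg
    intentOutFunc = 0
    dummyArg = 216
  end function
end subroutine
```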
-## Finding the symbols for arguments that are variables +### Finding the symbols for arguments that are variables The third and last piece of information I needed was to determine if a variable was being passed as an actual argument. In such cases, I wanted to get the symbol table node (`semantics::Symbol`) for the variable. My starting point was the @@ -630,7 +644,7 @@ Here's the result of running the modified compiler on my Fortran test case: Sweet. -## Emitting the messages +### Emitting the messages At this point, using the source location information from the original `parser::Expr`, I had enough information to plug into the existing interfaces for emitting messages for active DO variables. I modified the @@ -693,7 +707,7 @@ output: Even sweeter. -# Improving the test case +## Improving the test case At this point, my implementation seemed to be working. But I was concerned about the limitations of my test case. So I augmented it to include arguments other than `INTENT(OUT)` and more complex expressions. Luckily, my @@ -754,7 +768,7 @@ Here's the test I ended up with: end subroutine s ``` -# Submitting the pull request +## Submitting the pull request At this point, my implementation seemed functionally complete, so I stripped out all of the debug statements, ran `clang-format` on it and reviewed it to make sure that the names were clear. Here's what I ended up with: @@ -782,7 +796,7 @@ to make sure that the names were clear. I then created a pull request to get review comments. -# Responding to pull request comments +## Responding to pull request comments I got feedback suggesting that I use an `if` statement rather than a `case` statement. Another comment reminded me that I should look at the code I'd previously written to do a similar check for SUBROUTINE calls to see diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index 6f4dec4678233..f9e47e5893bff 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -1,5 +1,18 @@ + + # A categorization of standard (2018) and extended Fortran intrinsic procedures +```eval_rst
.. contents::
   :local:
``` + This note attempts to group the intrinsic procedures of Fortran into categories of functions or subroutines with similar interfaces as an aid to comprehension beyond that which might be gained from the standard's @@ -45,14 +58,14 @@ Intrinsic modules are not covered here. may appear within the brackets to preserve the order of arguments (e.g., `COUNT`). -# Elemental intrinsic functions +## Elemental intrinsic functions Pure elemental semantics apply to these functions, to wit: when one or more of the actual arguments are arrays, the arguments must be conformable, and the result is also an array. Scalar arguments are expanded when the arguments are not all scalars. -## Elemental intrinsic functions that may have unrestricted specific procedures +### Elemental intrinsic functions that may have unrestricted specific procedures When an elemental intrinsic function is documented here as having an _unrestricted specific name_, that name may be passed as an actual @@ -341,7 +354,7 @@ that is present in `SET`, or zero if none is. `VERIFY` is essentially the opposite: it returns the index of the first (or last) character in `STRING` that is *not* present in `SET`, or zero if all are.
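
For example (an added illustration):

```fortran
! Added illustration of SCAN versus VERIFY on a scalar string.
program scan_verify_demo
  implicit none
  character(*), parameter :: s = 'fortran'
  print *, scan(s, 'aeiou')              ! 2: first character in SET ('o')
  print *, verify(s, 'aeiou')            ! 1: first character not in SET ('f')
  print *, scan(s, 'aeiou', back=.true.) ! 6: last character in SET ('a')
end program
```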
-# Transformational intrinsic functions +## Transformational intrinsic functions This category comprises a large collection of intrinsic functions that are collected together because they somehow transform their arguments @@ -364,7 +377,7 @@ Some general rules apply to the transformational intrinsic functions: 1. The type `any` here denotes any intrinsic or derived type. 1. The notation `(..)` denotes an array of any rank (but not an assumed-rank array). -## Logical reduction transformational intrinsic functions +### Logical reduction transformational intrinsic functions ``` ALL(LOGICAL(k) MASK(..) [, DIM ]) -> LOGICAL(k) ANY(LOGICAL(k) MASK(..) [, DIM ]) -> LOGICAL(k) @@ -372,7 +385,7 @@ COUNT(LOGICAL(any) MASK(..) [, DIM, KIND=KIND(0) ]) -> INTEGER(KIND) PARITY(LOGICAL(k) MASK(..) [, DIM ]) -> LOGICAL(k) ``` -## Numeric reduction transformational intrinsic functions +### Numeric reduction transformational intrinsic functions ``` IALL(INTEGER(k) ARRAY(..) [, DIM, MASK ]) -> INTEGER(k) IANY(INTEGER(k) ARRAY(..) [, DIM, MASK ]) -> INTEGER(k) @@ -384,7 +397,7 @@ SUM(numeric ARRAY(..) [, DIM, MASK ]) -> numeric `NORM2` generalizes `HYPOT` by computing `SQRT(SUM(X*X))` while avoiding spurious overflows. -## Extrema reduction transformational intrinsic functions +### Extrema reduction transformational intrinsic functions ``` MAXVAL(relational(k) ARRAY(..) [, DIM, MASK ]) -> relational(k) MINVAL(relational(k) ARRAY(..) [, DIM, MASK ]) -> relational(k) @@ -411,7 +424,7 @@ MAXLOC(relational ARRAY(..) [, DIM, MASK, KIND=KIND(0), BACK=.FALSE. ]) MINLOC(relational ARRAY(..) [, DIM, MASK, KIND=KIND(0), BACK=.FALSE. ]) ``` -## Data rearrangement transformational intrinsic functions +### Data rearrangement transformational intrinsic functions The optional `DIM` argument to these functions must be a scalar integer of any kind, and it takes a default value of 1 when absent. @@ -467,7 +480,7 @@ UNPACK(any VECTOR(n), LOGICAL(any) MASK(..), FIELD) -> type and kind of VECTOR, ``` `FIELD` has same type and kind as `VECTOR` and is conformable with `MASK`. -## Other transformational intrinsic functions +### Other transformational intrinsic functions ``` BESSEL_JN(INTEGER(n1) N1, INTEGER(n2) N2, REAL(k) X) -> REAL(k) vector (MAX(N2-N1+1,0)) BESSEL_YN(INTEGER(n1) N1, INTEGER(n2) N2, REAL(k) X) -> REAL(k) vector (MAX(N2-N1+1,0)) @@ -509,7 +522,7 @@ At least one argument must be present in a call to `SELECTED_REAL_KIND`. An assumed-rank array may be passed to `SHAPE`, and if it is associated with an assumed-size array, the last element of the result will be -1. -## Coarray transformational intrinsic functions +### Coarray transformational intrinsic functions ``` FAILED_IMAGES([scalar TEAM_TYPE TEAM, KIND=KIND(0)]) -> INTEGER(KIND) vector GET_TEAM([scalar INTEGER(?) LEVEL]) -> scalar TEAM_TYPE @@ -524,10 +537,10 @@ THIS_IMAGE([COARRAY, DIM, scalar TEAM_TYPE TEAM]) -> default INTEGER The result of `THIS_IMAGE` is a scalar if `DIM` is present or if `COARRAY` is absent, and a vector whose length is the corank of `COARRAY` otherwise. -# Inquiry intrinsic functions +## Inquiry intrinsic functions These are neither elemental nor transformational; all are pure. -## Type inquiry intrinsic functions +### Type inquiry intrinsic functions All of these functions return constants. The value of the argument is not used, and may well be undefined. 
``` @@ -546,7 +559,7 @@ RANGE(INTEGER(k) or REAL(k) or COMPLEX(k) X(..)) -> scalar default INTEGER TINY(REAL(k) X(..)) -> scalar REAL(k) ``` -## Bound and size inquiry intrinsic functions +### Bound and size inquiry intrinsic functions The results are scalar when `DIM` is present, and a vector of length=(co)rank(`(CO)ARRAY`) when `DIM` is absent. ``` @@ -559,7 +572,7 @@ UCOBOUND(any COARRAY [, DIM, KIND=KIND(0) ]) -> INTEGER(KIND) Assumed-rank arrays may be used with `LBOUND`, `SIZE`, and `UBOUND`. -## Object characteristic inquiry intrinsic functions +### Object characteristic inquiry intrinsic functions ``` ALLOCATED(any type ALLOCATABLE ARRAY) -> scalar default LOGICAL ALLOCATED(any type ALLOCATABLE SCALAR) -> scalar default LOGICAL @@ -576,11 +589,11 @@ The arguments to `EXTENDS_TYPE_OF` must be of extensible derived types or be unl An assumed-rank array may be used with `IS_CONTIGUOUS` and `RANK`. -# Intrinsic subroutines +## Intrinsic subroutines (*TODO*: complete these descriptions) -## One elemental intrinsic subroutine +### One elemental intrinsic subroutine ``` INTERFACE SUBROUTINE MVBITS(FROM, FROMPOS, LEN, TO, TOPOS) @@ -594,7 +607,7 @@ INTERFACE END INTERFACE ``` -## Non-elemental intrinsic subroutines +### Non-elemental intrinsic subroutines ``` CALL CPU_TIME(REAL INTENT(OUT) TIME) ``` @@ -619,7 +632,7 @@ CALL RANDOM_SEED([SIZE, PUT, GET]) CALL SYSTEM_CLOCK([COUNT, COUNT_RATE, COUNT_MAX]) ``` -## Atomic intrinsic subroutines +### Atomic intrinsic subroutines ``` CALL ATOMIC_ADD(ATOM, VALUE [, STAT=]) CALL ATOMIC_AND(ATOM, VALUE [, STAT=]) @@ -634,7 +647,7 @@ CALL ATOMIC_REF(VALUE, ATOM [, STAT=]) CALL ATOMIC_XOR(ATOM, VALUE [, STAT=]) ``` -## Collective intrinsic subroutines +### Collective intrinsic subroutines ``` CALL CO_BROADCAST CALL CO_MAX @@ -643,8 +656,8 @@ CALL CO_REDUCE CALL CO_SUM ``` -# Non-standard intrinsics -## PGI +## Non-standard intrinsics +### PGI ``` AND, OR, XOR LSHIFT, RSHIFT, SHIFT @@ -658,7 +671,7 @@ JINT, JNINT, KNINT LOC ``` -## Intel +### Intel ``` DCMPLX(X,Y), QCMPLX(X,Y) DREAL(DOUBLE COMPLEX A) -> DOUBLE PRECISION @@ -681,12 +694,12 @@ CACHESIZE, EOF, FP_CLASS, INT_PTR_KIND, ISNAN, LOC MALLOC ``` -# Intrinsic Procedure Support in f18 +## Intrinsic Procedure Support in f18 This section gives an overview of the support inside f18 libraries for the intrinsic procedures listed above. It may be outdated, refer to f18 code base for the actual support status. -## Semantic Analysis +### Semantic Analysis F18 semantic expression analysis phase detects intrinsic procedure references, validates the argument types and deduces the return types. This phase currently supports all the intrinsic procedures listed above but the ones in the table below. @@ -702,7 +715,7 @@ This phase currently supports all the intrinsic procedures listed above but the | Collective intrinsic subroutines | CO_BROADCAST &al. | -## Intrinsic Function Folding +### Intrinsic Function Folding Fortran Constant Expressions can contain references to a certain number of intrinsic functions (see Fortran 2018 standard section 10.1.12 for more details). Constant Expressions may be used to define kind arguments. Therefore, the semantic @@ -716,7 +729,7 @@ arrays when an implementation is provided for the scalars (regardless of whether it is using host hardware types or not). The status of intrinsic function folding support is given in the sub-sections below. 
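
As an added illustration, both intrinsic references below occur in constant expressions, so semantic analysis must fold them to integer constants before it can lay out the array:

```fortran
! Added illustration: intrinsic calls folded at compile time because
! they define a kind parameter and an array extent.
program folding_demo
  implicit none
  integer, parameter :: dp = selected_real_kind(15)  ! folded to a kind value
  integer, parameter :: n = max(4, digits(0.0) / 8)  ! folds to 4 for IEEE real
  real(dp) :: x(n)
  x = 0.0_dp
  print *, kind(x), size(x)
end program
```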
-### Intrinsic Functions with Host Independent Folding Support +#### Intrinsic Functions with Host Independent Folding Support Implementations using f18 scalar types enables folding intrinsic functions on any host and with any possible type kind supported by f18. The intrinsic functions listed below are folded using host independent implementations. @@ -728,7 +741,7 @@ listed below are folded using host independent implementations. | COMPLEX | CMPLX, CONJG | | LOGICAL | BGE, BGT, BLE, BLT | -### Intrinsic Functions with Host Dependent Folding Support +#### Intrinsic Functions with Host Dependent Folding Support Implementations using the host runtime may not be available for all supported f18 types depending on the host hardware types and the libraries available on the host. The actual support on a host depends on what the host hardware types are. diff --git a/flang/docs/LabelResolution.md b/flang/docs/LabelResolution.md index 2dfa5a30bb3ca..c1227a8bc35a1 100644 --- a/flang/docs/LabelResolution.md +++ b/flang/docs/LabelResolution.md @@ -1,5 +1,18 @@ + + # Semantics: Resolving Labels and Construct Names +```eval_rst +.. contents:: + :local: +``` + ## Overview After the Fortran input file(s) has been parsed into a syntax tree, the compiler must check that the program checks semantically. Target labels must be checked and violations of legal semantics should be reported to the user. diff --git a/flang/docs/ModFiles.md b/flang/docs/ModFiles.md index 367cd4cd54f7c..ccb849ab0decd 100644 --- a/flang/docs/ModFiles.md +++ b/flang/docs/ModFiles.md @@ -1,5 +1,18 @@ + + # Module Files +```eval_rst +.. contents:: + :local: +``` + Module files hold information from a module that is necessary to compile program units that depend on the module. diff --git a/flang/docs/OpenMP-4.5-grammar.txt b/flang/docs/OpenMP-4.5-grammar.md similarity index 97% rename from flang/docs/OpenMP-4.5-grammar.txt rename to flang/docs/OpenMP-4.5-grammar.md index c74072ba1ef27..bc8a18a84e500 100644 --- a/flang/docs/OpenMP-4.5-grammar.txt +++ b/flang/docs/OpenMP-4.5-grammar.md @@ -1,18 +1,16 @@ -#===-- docs/OpenMP-4.5-grammar.txt --------------------------------===# -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#===------------------------------------------------------------------------===# +# OpenMP 4.5 Grammar -# OpenMP 4.5 Specifications +Grammar used by Flang to parse OpenMP 4.5. +## OpenMP 4.5 Specifications +``` 2 omp-directive -> sentinel directive-name [clause[ [,] clause]...] 2.1.1 sentinel -> !$omp | c$omp | *$omp 2.1.2 sentinel -> !$omp +``` -# directive-name +## directive-name +``` 2.5 parallel -> PARALLEL [parallel-clause[ [,] parallel-clause]...] parallel-clause -> if-clause | num-threads-clause | @@ -344,6 +342,8 @@ ATOMIC [seq_cst] atomic-clause -> READ | WRITE | UPDATE | CAPTURE +2.13.6 end-atomic -> END ATOMIC + 2.13.7 flush -> FLUSH [(variable-name-list)] 2.13.8 ordered -> ORDERED ordered-construct-clause [[[,] ordered-construct-clause]...] @@ -462,3 +462,4 @@ ALLOC | RELEASE | DELETE 2.15.5.2 defaultmap -> DEFAULTMAP (TOFROM:SCALAR) +``` diff --git a/flang/docs/OpenMP-semantics.md b/flang/docs/OpenMP-semantics.md index 22a3ca5614ebc..1511bc9e7b3b5 100644 --- a/flang/docs/OpenMP-semantics.md +++ b/flang/docs/OpenMP-semantics.md @@ -1,5 +1,18 @@ + + # OpenMP Semantic Analysis +```eval_rst +.. contents:: + :local: +``` + ## OpenMP for F18 1. 
Define and document the parse tree representation for diff --git a/flang/docs/OptionComparison.md b/flang/docs/OptionComparison.md index 5c04450a7bb34..347a1d6000ee2 100644 --- a/flang/docs/OptionComparison.md +++ b/flang/docs/OptionComparison.md @@ -1,11 +1,26 @@ -# Compiler options + + +# Compiler options comparison + +```eval_rst +.. contents:: + :local: +``` This document catalogs the options processed by F18's peers/competitors. Much of the document is taken up by a set of tables that list the options categorized into different topics. Some of the table headings link to more information about the contents of the tables. For example, the table on **Standards conformance** options links to [notes on Standards conformance](#standards). -**There's also important information in the ___[Notes section](#notes)___ near the end of the document on how this data was gathered and what ___is___ and ___is not___ included in this document.** +**There's also important information in the ___[Appendix section](#appendix)___ near the end of the document on how this data was gathered and what ___is___ and ___is not___ included in this document.** Note that compilers may support language features without having an option for them. Such cases are frequently, but not always noted in this document. +## Categorisation of Options +
Standards conformance @@ -1175,7 +1190,7 @@ Mcuda -## Notes +## Notes **Standards conformance:** @@ -1282,7 +1297,7 @@ GNU is the only compiler with options governing the use of non-standard intrinsi **Warn for bad call checking**: This Cray option ("-eb") issues a warning message rather than an error message when the compiler detects a call to a procedure with one or more dummy arguments having the TARGET, VOLATILE or ASYNCHRONOUS attribute and there is not an explicit interface definition. -## Notes +## Appendix ### What is and is not included diff --git a/flang/docs/Overview.md b/flang/docs/Overview.md index 807efda2ed9a3..9878589438450 100644 --- a/flang/docs/Overview.md +++ b/flang/docs/Overview.md @@ -1,5 +1,18 @@ + + # Overview of Compiler Phases +```eval_rst +.. contents:: + :local: +``` + Each phase produces either correct output or fatal errors. ## Prescan and Preprocess diff --git a/flang/docs/ParserCombinators.md b/flang/docs/ParserCombinators.md index 757684dcfda60..ff94d341c1501 100644 --- a/flang/docs/ParserCombinators.md +++ b/flang/docs/ParserCombinators.md @@ -1,3 +1,20 @@ + + +# Parser Combinators + +```eval_rst +.. contents:: + :local: +``` + +This document is a primer on Parser Combinators and their use in Flang. + ## Concept The Fortran language recognizer here can be classified as an LL recursive descent parser. It is composed from a *parser combinator* library that diff --git a/flang/docs/Parsing.md b/flang/docs/Parsing.md index 54a4fd752f6c1..dec63e6fbdab4 100644 --- a/flang/docs/Parsing.md +++ b/flang/docs/Parsing.md @@ -1,5 +1,18 @@ -The F18 Parser -============== + + +# The F18 Parser + +```eval_rst +.. contents:: + :local: +``` + This program source code implements a parser for the Fortran programming language. @@ -34,8 +47,8 @@ source file and receive its parse tree and error messages. The interfaces of the Parsing class correspond to the two major passes of the parser, which are described below. -Prescanning and Preprocessing ------------------------------ +## Prescanning and Preprocessing + The first pass is performed by an instance of the Prescanner class, with help from an instance of Preprocessor. @@ -92,8 +105,8 @@ The content of the cooked character stream is available and useful for debugging, being as it is a simple value forwarded from the first major pass of the compiler to the second. -Source Provenance ------------------ +## Source Provenance + The prescanner constructs a chronicle of every file that is read by the parser, viz. the original source file and all others that it directly or indirectly includes. One copy of the content of each of these files @@ -116,8 +129,8 @@ Simple `const char *` pointers to characters in the cooked character stream, or to contiguous ranges thereof, are used as source position indicators within the parser and in the parse tree. -Messages --------- +## Messages + Message texts, and snprintf-like formatting strings for constructing messages, are instantiated in the various components of the parser with C++ user defined character literals tagged with `_err_en_US` and `_en_US` @@ -126,8 +139,8 @@ English used in the United States) so that they may be easily identified for localization. As described above, messages are associated with source code positions by means of provenance values. -The Parse Tree --------------- +## The Parse Tree + Each of the ca. 
450 numbered requirement productions in the standard Fortran language grammar, as well as the productions implied by legacy extensions and preserved obsolescent features, maps to a distinct class @@ -166,8 +179,8 @@ stability of pointers into these lists. There is a general purpose library by means of which parse trees may be traversed. -Parsing -------- +## Parsing + This compiler attempts to recognize the entire cooked character stream (see above) as a Fortran program. It records the reductions made during a successful recognition as a parse tree value. The recognized grammar @@ -195,8 +208,8 @@ of "parser combinator" template functions that compose them to form more complicated recognizers and their correspondences to the construction of parse tree values. -Unparsing ---------- +## Unparsing + Parse trees can be converted back into free form Fortran source code. This formatter is not really a classical "pretty printer", but is more of a data structure dump whose output is suitable for compilation diff --git a/flang/docs/Preprocessing.md b/flang/docs/Preprocessing.md index 9b4d905177b7f..3c6984cfa2fd0 100644 --- a/flang/docs/Preprocessing.md +++ b/flang/docs/Preprocessing.md @@ -1,8 +1,20 @@ -Fortran Preprocessing -===================== + + +# Fortran Preprocessing + +```eval_rst +.. contents:: + :local: +``` + +## Behavior common to (nearly) all compilers: -Behavior common to (nearly) all compilers: ------------------------------------------- * Macro and argument names are sensitive to case. * Fixed form right margin clipping after column 72 (or 132) has precedence over macro name recognition, and also over @@ -31,9 +43,8 @@ Behavior common to (nearly) all compilers: * A `#define` directive intermixed with continuation lines can't define a macro that's invoked earlier in the same continued statement. -Behavior that is not consistent over all extant compilers but which -probably should be uncontroversial: ------------------------------------ +## Behavior that is not consistent over all extant compilers but which probably should be uncontroversial: + * Invoked macro names can straddle a Fortran line continuation. * ... unless implicit fixed form card padding intervenes; i.e., in fixed form, a continued macro name has to be split at column @@ -57,8 +68,8 @@ probably should be uncontroversial: directive indicator. * `#define KWM !` allows KWM to signal a comment. -Judgement calls, where precedents are unclear: ----------------------------------------------- +## Judgement calls, where precedents are unclear: + * Expressions in `#if` and `#elif` should support both Fortran and C operators; e.g., `#if 2 .LT. 3` should work. * If a function-like macro does not close its parentheses, line @@ -76,16 +87,16 @@ Judgement calls, where precedents are unclear: lines, it may or may not affect text in the continued statement that appeared before the directive. -Behavior that few compilers properly support (or none), but should: -------------------------------------------------------------------- +## Behavior that few compilers properly support (or none), but should: + * A macro invocation can straddle free form continuation lines in all of their forms, with continuation allowed in the name, before the arguments, and within the arguments. * Directives can be capitalized in free form, too. * `__VA_ARGS__` and `__VA_OPT__` work in variadic function-like macros. 
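
As an added sketch of the first of those behaviors (the macro name and values are invented), a Fortran-aware preprocessor should accept a function-like macro invocation whose argument list is split across free-form continuation lines:

```fortran
! Invented sketch: the invocation of ADD continues across a free-form
! line break, which a Fortran-aware preprocessor should accept.
#define ADD(x, y) ((x) + (y))
program pp_demo
  implicit none
  integer :: n
  n = ADD(1, &
          2)
  print *, n   ! expect 3
end program
```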
-In short, a Fortran preprocessor should work as if: ---------------------------------------------------- +## In short, a Fortran preprocessor should work as if: + 1. Fixed form lines are padded up to column 72 (or 132) and clipped thereafter. 2. Fortran comments are removed. 3. C-style line continuations are processed in preprocessing directives. @@ -117,8 +128,7 @@ text. OpenMP-style directives that look like comments are not addressed by this scheme but are obvious extensions. -Appendix -======== +## Appendix `N` in the table below means "not supported"; this doesn't mean a bug, it just means that a particular behavior was not observed. diff --git a/flang/docs/PullRequestChecklist.md b/flang/docs/PullRequestChecklist.md index 17b6d64923f58..b253c153f61ec 100644 --- a/flang/docs/PullRequestChecklist.md +++ b/flang/docs/PullRequestChecklist.md @@ -1,3 +1,11 @@ + + # Pull request checklist Please review the following items before submitting a pull request. This list can also be used when reviewing pull requests. @@ -28,7 +36,7 @@ even though I've read the style guide, they regularly trip me up. clang-format will do this for most code. But you may need to break up long strings. * Review declarations for proper use of `constexpr` and `const`. -* Follow the C++ [naming guidelines](C++style.md#naming). +* Follow the C++ [naming guidelines](C++style.html#naming) * Ensure that the names evoke their purpose and are consistent with existing code. * Used braced initializers. * Review pointer and reference types to make sure that you're using them diff --git a/flang/docs/ReleaseNotes.md b/flang/docs/ReleaseNotes.md new file mode 100644 index 0000000000000..b4b00ee65ffb2 --- /dev/null +++ b/flang/docs/ReleaseNotes.md @@ -0,0 +1,87 @@ +# Flang 12.0.0 (In-Progress) Release Notes + +> **warning** +> +> These are in-progress notes for the upcoming LLVM 12.0.0 release. +> Release notes for previous releases can be found on [the Download +> Page](https://releases.llvm.org/download.html). + +## Introduction + +This document contains the release notes for the Flang Fortran frontend, +part of the LLVM Compiler Infrastructure, release 12.0.0. Here we +describe the status of Flang in some detail, including major +improvements from the previous release and new feature work. For the +general LLVM release notes, see [the LLVM +documentation](https://llvm.org/docs/ReleaseNotes.html). All LLVM +releases may be downloaded from the [LLVM releases web +site](https://llvm.org/releases/). + +Note that if you are reading this file from a Git checkout, this +document applies to the *next* release, not the current one. To see the +release notes for a specific release, please see the [releases +page](https://llvm.org/releases/). + +## Known Issues + +These are issues that couldn't be fixed before the release. See the bug +reports for the latest status. + + * ... + +## Introducing Flang + +Flang is LLVM's Fortran front end and is new for the LLVM 11 release. + +Flang is still a work in progress for this release and is included for +experimentation and feedback. + +Flang is able to parse a comprehensive subset of the Fortran language +and check it for correctness. Flang is not yet able to generate LLVM IR +for the source code and thus is unable to compile a running binary. + +Flang is able to unparse the input source code into a canonical form and +emit it to allow testing. Flang can also invoke an external Fortran +compiler on this canonical input. 
+ +Flang's parser has comprehensive support for: + * Fortran 2018 + * OpenMP 4.5 + * OpenACC 3.0 + +Interested users are invited to try to compile their Fortran codes with +flang in and report any issues in parsing or semantic checking in +[bugzilla](https://bugs.llvm.org/enter_bug.cgi?product=flang). + +### Major missing features + + * Flang is not supported on Windows platforms. + +## Using Flang + +Usage: `flang hello.f90 -o hello.bin` + +By default, Flang will parse the Fortran file `hello.f90` then unparse it to a +canonical Fortran source file. Flang will then invoke an external +Fortran compiler to compile this source file and link it, placing the +resulting executable in `hello.bin`. + +To specify the external Fortran compiler, set the `F18_FC` environment +variable to the name of the compiler binary and ensure that it is on your +`PATH`. The default value for `F18_FC` is `gfortran`. + +When invoked with no source input, Flang will wait for input on stdin. +When invoked in this way, Flang performs the same actions as if +called with `-fdebug-measure-parse-tree -funparse` and does not invoke +`F18_FC`. + +For a full list of options that Flang supports, run `flang --help`. + +## Additional Information + +Flang's documentation is located in the `flang/docs/` directory in the +LLVM monorepo. + +If you have any questions or comments about Flang, please feel free to +contact us via the [mailing +list](https://lists.llvm.org/mailman/listinfo/flang-dev). diff --git a/flang/docs/ReleaseNotes.rst b/flang/docs/ReleaseNotes.rst deleted file mode 100644 index bbc7377412d63..0000000000000 --- a/flang/docs/ReleaseNotes.rst +++ /dev/null @@ -1,96 +0,0 @@ -======================================== -Flang 11.0.0 (In-Progress) Release Notes -======================================== - -.. contents:: - :local: - :depth: 2 - -.. warning:: - - These are in-progress notes for the upcoming LLVM 11.0.0 release. - Release notes for previous releases can be found on - `the Download Page `_. - -Introduction -============ - -This document contains the release notes for the Flang Fortran -frontend, part of the LLVM Compiler Infrastructure, release 11.0.0. Here we -describe the status of Flang in some detail, including major -improvements from the previous release and new feature work. For the -general LLVM release notes, see `the LLVM -documentation `_. All LLVM -releases may be downloaded from the `LLVM releases web -site `_. - -Note that if you are reading this file from a Git checkout, this document -applies to the *next* release, not -the current one. To see the release notes for a specific release, please -see the `releases page `_. - -Known Issues -============ - -These are issues that couldn't be fixed before the release. See the bug reports for the latest status. - -- ... - -Introducing Flang -================= - -Flang is LLVM's Fortran front end and is new for the LLVM 11 release. - -Flang is still a work in progress for this release and is included for -experimentation and feedback. - -Flang status ------------- - -Flang is able to parse a comprehensive subset of the Fortran language -and check it for correctness. Flang is not yet able to generate LLVM IR for -the source code and thus is unable to compile a running binary. - -Flang is able to unparse the input source code into a canonical form and emit -it to allow testing. Flang can also invoke an external Fortran compiler on this -canonical input. 
- -Flang's parser has comprehensive support for: -- Fortran 2018 -- OpenMP 4.5 -- OpenACC 3.0 - -Major missing features ----------------------- - -- Flang is not supported on Windows platforms. - -Using Flang -=========== - -Usage: ``flang hello.f90 -o hello.bin`` - -Flang will parse the Fortran file ``hello.f90`` then unparse it to a canonical -Fortran source file. Flang will then invoke an external Fortran compiler to -compile this source file and link it, placing the resulting executable -in ``hello.bin``. - -To specify the external Fortran compiler, set the ``F18_FC`` environment -variable to the name of the compiler binary and ensure it is on your ``PATH``. -The default value for ``F18_FC`` is ``gfortran``. - -When invoked with no source input, Flang will wait for input on standard in. -When invoked in this way, Flang performs the same actions as if called with -``-fdebug-measure-parse-tree -funparse`` and does not invoke ``F18_FC``. - -For a full list of options that Flang supports, run ``flang --help``. - -Additional Information -====================== - -Flang's documentation is located in the ``flang/docs/`` directory in -the LLVM monorepo. - -If you have any questions or comments about Flang, please feel free to -contact us via the `mailing -list `_. diff --git a/flang/docs/RuntimeDescriptor.md b/flang/docs/RuntimeDescriptor.md index a8eff33f65211..f0bbd2e3fedaf 100644 --- a/flang/docs/RuntimeDescriptor.md +++ b/flang/docs/RuntimeDescriptor.md @@ -1,3 +1,18 @@ + + +# Runtime Descriptors + +```eval_rst +.. contents:: + :local: +``` + ## Concept The properties that characterize data values and objects in Fortran programs must sometimes be materialized when the program runs. diff --git a/flang/docs/Semantics.md b/flang/docs/Semantics.md index f879671b4f4ed..361426c936c24 100644 --- a/flang/docs/Semantics.md +++ b/flang/docs/Semantics.md @@ -1,5 +1,18 @@ + + # Semantic Analysis +```eval_rst +.. contents:: + :local: +``` + The semantic analysis pass determines if a syntactically correct Fortran program is is legal by enforcing the constraints of the language. diff --git a/flang/docs/_templates/indexsidebar.html b/flang/docs/_templates/indexsidebar.html new file mode 100644 index 0000000000000..3c8f1abdf9000 --- /dev/null +++ b/flang/docs/_templates/indexsidebar.html @@ -0,0 +1,26 @@ +{# This template defines sidebar which can be used to provide common links on + all documentation pages. #} + +

+<h3>Documentation</h3>
+  ...
+
+<h3>Getting Involved</h3>
+  ...
+
+<h3>Additional Links</h3>
+  ...
diff --git a/flang/docs/_templates/layout.html b/flang/docs/_templates/layout.html new file mode 100644 index 0000000000000..12b7731ccca7d --- /dev/null +++ b/flang/docs/_templates/layout.html @@ -0,0 +1,14 @@
+{% extends "!layout.html" %}
+
+{% block extrahead %}
+  ...
+{% endblock %}
+
+{% block rootrellink %}
+  <li>Flang Home | </li>
+  <li>Documentation » </li>
+{% endblock %}
diff --git a/flang/docs/conf.py b/flang/docs/conf.py index 045d0a2c41678..197721a4e4c80 100644 --- a/flang/docs/conf.py +++ b/flang/docs/conf.py @@ -46,12 +46,34 @@ else: source_parsers = {'.md': 'recommonmark.parser.CommonMarkParser'} source_suffix['.md'] = 'markdown'
+  extensions.append('sphinx_markdown_tables')
+
+  # Setup AutoStructify for inline .rst toctrees in index.md
+  from recommonmark.transform import AutoStructify
+
+  # Stolen from https://github.com/readthedocs/recommonmark/issues/93
+  # Monkey patch to fix recommonmark 0.4 doc reference issues.
+  from recommonmark.states import DummyStateMachine
+  orig_run_role = DummyStateMachine.run_role
+  def run_role(self, name, options=None, content=None):
+    if name == 'doc':
+      name = 'any'
+    return orig_run_role(self, name, options, content)
+  DummyStateMachine.run_role = run_role
+
+  def setup(app):
+    # Disable inline math to avoid
+    # https://github.com/readthedocs/recommonmark/issues/120 in Extensions.md
+    app.add_config_value('recommonmark_config', {
+      'enable_inline_math': False
+    }, True)
+    app.add_transform(AutoStructify)
 # The encoding of source files. #source_encoding = 'utf-8-sig' # The master toctree document.
-master_doc = 'Overview'
+master_doc = 'index'
 # General information about the project. project = u'Flang' @@ -156,7 +178,13 @@ #html_use_smartypants = True # Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
+html_sidebars = {
+  '**': [
+    'indexsidebar.html',
+    'searchbox.html',
+  ]
+}
+
 # Additional templates that should be rendered to pages, maps page names to # template names.
diff --git a/flang/docs/f2018-grammar.txt b/flang/docs/f2018-grammar.md similarity index 99% rename from flang/docs/f2018-grammar.txt rename to flang/docs/f2018-grammar.md index 9b2819d69c724..70f9ebc7f7641 100644 --- a/flang/docs/f2018-grammar.txt +++ b/flang/docs/f2018-grammar.md @@ -1,11 +1,8 @@
-#===-- docs/f2018-grammar.txt -------------------------------------===#
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-#===------------------------------------------------------------------------===#
+# Fortran 2018 Grammar
+Grammar used by Flang to parse Fortran 2018.
+
+```
 R0001 digit -> 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 R0002 letter -> A | B | C | D | E | F | G | H | I | J | K | L | M | @@ -801,3 +798,4 @@ R1542 return-stmt -> RETURN [scalar-int-expr] R1543 contains-stmt -> CONTAINS R1544 stmt-function-stmt -> function-name ( [dummy-arg-name-list] ) = scalar-expr
+```
diff --git a/flang/docs/index.md b/flang/docs/index.md new file mode 100644 index 0000000000000..bd7092a418f33 --- /dev/null +++ b/flang/docs/index.md @@ -0,0 +1,62 @@
+# Welcome to Flang's documentation
+
+Flang is LLVM's Fortran frontend
+
+```eval_rst
+.. toctree::
+   :titlesonly:
+
+   ReleaseNotes
+```
+
+# Contributing to Flang
+
+```eval_rst
+.. toctree::
+   :titlesonly:
+
+   GettingInvolved
+   FortranForCProgrammers
+   C++style
+   C++17
+   PullRequestChecklist
+   ImplementingASemanticCheck
+```
+
+# Design Documents
+
+```eval_rst
+.. 
toctree:: + :titlesonly: + + Overview + Preprocessing + Parsing + LabelResolution + ModFiles + Semantics + OpenMP-semantics + ControlFlowGraph + FortranIR + IORuntimeInternals + f2018-grammar.md + OpenMP-4.5-grammar.md + Directives + Extensions + Intrinsics + OptionComparison + ParserCombinators + RuntimeDescriptor + Calls + Character + ArrayComposition + BijectiveInternalNameUniquing +``` + +# Indices and tables + +```eval_rst +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` +``` diff --git a/flang/include/flang/Common/enum-set.h b/flang/include/flang/Common/enum-set.h index a7bdc757a1c97..5d2eda57aa819 100644 --- a/flang/include/flang/Common/enum-set.h +++ b/flang/include/flang/Common/enum-set.h @@ -37,8 +37,8 @@ template class EnumSet { constexpr EnumSet() {} constexpr EnumSet(const std::initializer_list &enums) { - for (auto x : enums) { - set(x); + for (auto it{enums.begin()}; it != enums.end(); ++it) { + set(*it); } } constexpr EnumSet(const EnumSet &) = default; diff --git a/flang/include/flang/Evaluate/expression.h b/flang/include/flang/Evaluate/expression.h index 09847ec954072..f0ce375da0153 100644 --- a/flang/include/flang/Evaluate/expression.h +++ b/flang/include/flang/Evaluate/expression.h @@ -717,7 +717,8 @@ class StructureConstructor { return values_.end(); } - const Expr *Find(const Symbol &) const; // can return null + // can return nullopt + std::optional> Find(const Symbol &) const; StructureConstructor &Add(const semantics::Symbol &, Expr &&); int Rank() const { return 0; } @@ -725,6 +726,7 @@ class StructureConstructor { llvm::raw_ostream &AsFortran(llvm::raw_ostream &) const; private: + std::optional> CreateParentComponent(const Symbol &) const; Result result_; StructureConstructorValues values_; }; diff --git a/flang/include/flang/Evaluate/integer.h b/flang/include/flang/Evaluate/integer.h index 6b91cb250c98e..20b6731768de8 100644 --- a/flang/include/flang/Evaluate/integer.h +++ b/flang/include/flang/Evaluate/integer.h @@ -176,22 +176,22 @@ class Integer { constexpr Integer &operator=(const Integer &) = default; constexpr bool operator<(const Integer &that) const { - return CompareUnsigned(that) == Ordering::Less; + return CompareSigned(that) == Ordering::Less; } constexpr bool operator<=(const Integer &that) const { - return CompareUnsigned(that) != Ordering::Greater; + return CompareSigned(that) != Ordering::Greater; } constexpr bool operator==(const Integer &that) const { - return CompareUnsigned(that) == Ordering::Equal; + return CompareSigned(that) == Ordering::Equal; } constexpr bool operator!=(const Integer &that) const { return !(*this == that); } constexpr bool operator>=(const Integer &that) const { - return CompareUnsigned(that) != Ordering::Less; + return CompareSigned(that) != Ordering::Less; } constexpr bool operator>(const Integer &that) const { - return CompareUnsigned(that) == Ordering::Greater; + return CompareSigned(that) == Ordering::Greater; } // Left-justified mask (e.g., MASKL(1) has only its sign bit set) diff --git a/flang/include/flang/Evaluate/type.h b/flang/include/flang/Evaluate/type.h index cf13ba6e27d96..663ece6eb4a09 100644 --- a/flang/include/flang/Evaluate/type.h +++ b/flang/include/flang/Evaluate/type.h @@ -217,6 +217,8 @@ class DynamicType { const semantics::DerivedTypeSpec *GetDerivedTypeSpec(const DynamicType &); const semantics::DerivedTypeSpec *GetDerivedTypeSpec( const std::optional &); +const semantics::DerivedTypeSpec *GetParentTypeSpec( + const semantics::DerivedTypeSpec &); std::string DerivedTypeSpecAsFortran(const 
semantics::DerivedTypeSpec &); diff --git a/flang/include/flang/Frontend/CompilerInstance.h b/flang/include/flang/Frontend/CompilerInstance.h new file mode 100644 index 0000000000000..298be676ea4a5 --- /dev/null +++ b/flang/include/flang/Frontend/CompilerInstance.h @@ -0,0 +1,105 @@
+//===-- CompilerInstance.h - Flang Compiler Instance ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_FLANG_FRONTEND_COMPILERINSTANCE_H
+#define LLVM_FLANG_FRONTEND_COMPILERINSTANCE_H
+
+#include "flang/Frontend/CompilerInvocation.h"
+
+#include <cassert>
+#include <memory>
+
+namespace Fortran::frontend {
+
+class CompilerInstance {
+
+  /// The options used in this compiler instance.
+  std::shared_ptr<CompilerInvocation> invocation_;
+
+  /// The diagnostics engine instance.
+  llvm::IntrusiveRefCntPtr<clang::DiagnosticsEngine> diagnostics_;
+
+public:
+  explicit CompilerInstance();
+
+  ~CompilerInstance();
+  CompilerInvocation &GetInvocation() {
+    assert(invocation_ && "Compiler instance has no invocation!");
+    return *invocation_;
+  };
+
+  /// }
+  /// @name Forwarding Methods
+  /// {
+
+  clang::DiagnosticOptions &GetDiagnosticOpts() {
+    return invocation_->GetDiagnosticOpts();
+  }
+  const clang::DiagnosticOptions &GetDiagnosticOpts() const {
+    return invocation_->GetDiagnosticOpts();
+  }
+
+  FrontendOptions &GetFrontendOpts() { return invocation_->GetFrontendOpts(); }
+  const FrontendOptions &GetFrontendOpts() const {
+    return invocation_->GetFrontendOpts();
+  }
+
+  /// }
+  /// @name Diagnostics Engine
+  /// {
+
+  bool HasDiagnostics() const { return diagnostics_ != nullptr; }
+
+  /// Get the current diagnostics engine.
+  clang::DiagnosticsEngine &GetDiagnostics() const {
+    assert(diagnostics_ && "Compiler instance has no diagnostics!");
+    return *diagnostics_;
+  }
+
+  /// SetDiagnostics - Replace the current diagnostics engine.
+  void SetDiagnostics(clang::DiagnosticsEngine *value);
+
+  clang::DiagnosticConsumer &GetDiagnosticClient() const {
+    assert(diagnostics_ && diagnostics_->getClient() &&
+        "Compiler instance has no diagnostic client!");
+    return *diagnostics_->getClient();
+  }
+
+  /// Get the current diagnostics engine.
+  clang::DiagnosticsEngine &getDiagnostics() const {
+    assert(diagnostics_ && "Compiler instance has no diagnostics!");
+    return *diagnostics_;
+  }
+
+  /// }
+  /// @name Construction Utility Methods
+  /// {
+
+  /// Create a DiagnosticsEngine object with the TextDiagnosticPrinter.
+  ///
+  /// If no diagnostic client is provided, this creates a
+  /// DiagnosticConsumer that is owned by the returned diagnostic
+  /// object; if used directly, the caller is responsible for
+  /// eventually releasing the returned DiagnosticsEngine's client.
+  ///
+  /// \param opts - The diagnostic options; note that the created text
+  /// diagnostic object contains a reference to these options.
+  ///
+  /// \param client If non-NULL, a diagnostic client that will be
+  /// attached to (and then owned by) the returned DiagnosticsEngine
+  /// object.
+  ///
+  /// \return The new object on success, or null on failure.
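+  ///
+  /// A minimal usage sketch (hypothetical call site; the diagnostic ID shown
+  /// is one used elsewhere in this patch):
+  /// \code
+  ///   llvm::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagOpts(
+  ///       new clang::DiagnosticOptions());
+  ///   auto diags = CompilerInstance::CreateDiagnostics(diagOpts.get());
+  ///   diags->Report(clang::diag::err_drv_invalid_value) << "-x" << "f77";
+  /// \endcode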
+  static clang::IntrusiveRefCntPtr<clang::DiagnosticsEngine> CreateDiagnostics(
+      clang::DiagnosticOptions *opts,
+      clang::DiagnosticConsumer *client = nullptr, bool shouldOwnClient = true);
+  void CreateDiagnostics(
+      clang::DiagnosticConsumer *client = nullptr, bool shouldOwnClient = true);
+};
+
+} // end namespace Fortran::frontend
+#endif // LLVM_FLANG_FRONTEND_COMPILERINSTANCE_H
diff --git a/flang/include/flang/Frontend/CompilerInvocation.h b/flang/include/flang/Frontend/CompilerInvocation.h new file mode 100644 index 0000000000000..0fa169fd16200 --- /dev/null +++ b/flang/include/flang/Frontend/CompilerInvocation.h @@ -0,0 +1,53 @@
+//===- CompilerInvocation.h - Compiler Invocation Helper Data ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_FLANG_FRONTEND_COMPILERINVOCATION_H
+#define LLVM_FLANG_FRONTEND_COMPILERINVOCATION_H
+
+#include "flang/Frontend/FrontendOptions.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticOptions.h"
+
+namespace Fortran::frontend {
+class CompilerInvocationBase {
+public:
+  /// Options controlling the diagnostic engine.
+  llvm::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagnosticOpts_;
+
+  CompilerInvocationBase();
+  CompilerInvocationBase(const CompilerInvocationBase &x);
+  ~CompilerInvocationBase();
+
+  clang::DiagnosticOptions &GetDiagnosticOpts() {
+    return *diagnosticOpts_.get();
+  }
+  const clang::DiagnosticOptions &GetDiagnosticOpts() const {
+    return *diagnosticOpts_.get();
+  }
+};
+
+class CompilerInvocation : public CompilerInvocationBase {
+  /// Options controlling the frontend itself.
+  FrontendOptions frontendOpts_;
+
+public:
+  CompilerInvocation() = default;
+
+  FrontendOptions &GetFrontendOpts() { return frontendOpts_; }
+  const FrontendOptions &GetFrontendOpts() const { return frontendOpts_; }
+
+  /// Create a compiler invocation from a list of input options.
+  /// \returns true on success.
+  /// \returns false if an error was encountered while parsing the arguments
+  /// \param [out] res - The resulting invocation.
+  static bool CreateFromArgs(CompilerInvocation &res,
+      llvm::ArrayRef<const char *> commandLineArgs,
+      clang::DiagnosticsEngine &diags);
+};
+
+} // end namespace Fortran::frontend
+#endif // LLVM_FLANG_FRONTEND_COMPILERINVOCATION_H
diff --git a/flang/include/flang/Frontend/FrontendOptions.h b/flang/include/flang/Frontend/FrontendOptions.h new file mode 100644 index 0000000000000..474086f44e3b1 --- /dev/null +++ b/flang/include/flang/Frontend/FrontendOptions.h @@ -0,0 +1,58 @@
+//===- FrontendOptions.h ----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_FLANG_FRONTEND_FRONTENDOPTIONS_H
+#define LLVM_FLANG_FRONTEND_FRONTENDOPTIONS_H
+
+#include <cstdint>
+#include <string>
+namespace Fortran::frontend {
+
+enum class Language : uint8_t {
+  Unknown,
+
+  /// LLVM IR: we accept this so that we can run the optimizer on it,
+  /// and compile it to assembly or object code.
+  LLVM_IR,
+
+  ///@{ Languages that the frontend can parse and compile.
+  Fortran,
+  ///@}
+};
+
+/// The kind of a file that we've been handed as an input.
+class InputKind {
+private:
+  Language lang_;
+
+public:
+  /// The input file format.
+  enum Format { Source, ModuleMap, Precompiled };
+
+  constexpr InputKind(Language l = Language::Unknown) : lang_(l) {}
+
+  Language GetLanguage() const { return static_cast<Language>(lang_); }
+
+  /// Is the input kind fully-unknown?
+  bool IsUnknown() const { return lang_ == Language::Unknown; }
+};
+
+/// FrontendOptions - Options for controlling the behavior of the frontend.
+class FrontendOptions {
+public:
+  /// Show the -help text.
+  unsigned showHelp_ : 1;
+
+  /// Show the -version text.
+  unsigned showVersion_ : 1;
+
+public:
+  FrontendOptions() : showHelp_(false), showVersion_(false) {}
+};
+} // namespace Fortran::frontend
+
+#endif // LLVM_FLANG_FRONTEND_FRONTENDOPTIONS_H
diff --git a/flang/include/flang/FrontendTool/Utils.h b/flang/include/flang/FrontendTool/Utils.h new file mode 100644 index 0000000000000..f49c4e6dae62d --- /dev/null +++ b/flang/include/flang/FrontendTool/Utils.h @@ -0,0 +1,29 @@
+//===--- Utils.h - Misc utilities for the flang front-end --------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header contains miscellaneous utilities for various front-end actions
+// which were split from Frontend to minimise Frontend's dependencies.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_FLANG_FRONTENDTOOL_UTILS_H
+#define LLVM_FLANG_FRONTENDTOOL_UTILS_H
+
+namespace Fortran::frontend {
+
+class CompilerInstance;
+
+/// ExecuteCompilerInvocation - Execute the given actions described by the
+/// compiler invocation object in the given compiler instance.
+///
+/// \return - True on success.
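+///
+/// A hypothetical driver call site, sketched from the interfaces declared in
+/// CompilerInstance.h and CompilerInvocation.h (the argv plumbing here is
+/// assumed, not part of this patch):
+/// \code
+///   CompilerInstance flang;
+///   flang.CreateDiagnostics();
+///   if (CompilerInvocation::CreateFromArgs(flang.GetInvocation(), argv,
+///           flang.GetDiagnostics()))
+///     return ExecuteCompilerInvocation(&flang) ? 0 : 1;
+/// \endcode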
+bool ExecuteCompilerInvocation(CompilerInstance *flang); + +} // end namespace Fortran::frontend + +#endif // LLVM_FLANG_FRONTENDTOOL_UTILS_H diff --git a/flang/include/flang/Optimizer/Dialect/FIRDialect.h b/flang/include/flang/Optimizer/Dialect/FIRDialect.h index 9702c54367b8b..a4b0e3f9aa7fd 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRDialect.h +++ b/flang/include/flang/Optimizer/Dialect/FIRDialect.h @@ -37,6 +37,7 @@ inline void registerFIRDialects(mlir::DialectRegistry ®istry) { // clang-format off registry.insert #include @@ -206,13 +211,75 @@ bool Expr::operator==(const Expr &that) const { DynamicType StructureConstructor::GetType() const { return result_.GetType(); } -const Expr *StructureConstructor::Find( +std::optional> StructureConstructor::CreateParentComponent( + const Symbol &component) const { + if (const semantics::DerivedTypeSpec * + parentSpec{GetParentTypeSpec(derivedTypeSpec())}) { + StructureConstructor structureConstructor{*parentSpec}; + if (const auto *parentDetails{ + component.detailsIf()}) { + auto parentIter{parentDetails->componentNames().begin()}; + for (const auto &childIter : values_) { + if (parentIter == parentDetails->componentNames().end()) { + break; // There are more components in the child + } + SymbolRef componentSymbol{childIter.first}; + structureConstructor.Add( + *componentSymbol, common::Clone(childIter.second.value())); + ++parentIter; + } + Constant constResult{std::move(structureConstructor)}; + Expr result{std::move(constResult)}; + return std::optional>{result}; + } + } + return std::nullopt; +} + +static const Symbol *GetParentComponentSymbol(const Symbol &symbol) { + if (symbol.test(Symbol::Flag::ParentComp)) { + // we have a created parent component + const auto &compObject{symbol.get()}; + if (const semantics::DeclTypeSpec * compType{compObject.type()}) { + const semantics::DerivedTypeSpec &dtSpec{compType->derivedTypeSpec()}; + const semantics::Symbol &compTypeSymbol{dtSpec.typeSymbol()}; + return &compTypeSymbol; + } + } + if (symbol.detailsIf()) { + // we have an implicit parent type component + return &symbol; + } + return nullptr; +} + +std::optional> StructureConstructor::Find( const Symbol &component) const { if (auto iter{values_.find(component)}; iter != values_.end()) { - return &iter->second.value(); - } else { - return nullptr; + return iter->second.value(); + } + // The component wasn't there directly, see if we're looking for the parent + // component of an extended type + if (const Symbol * typeSymbol{GetParentComponentSymbol(component)}) { + return CreateParentComponent(*typeSymbol); + } + // Look for the component in the parent type component. 
The parent type + // component is always the first one + if (!values_.empty()) { + const Expr *parentExpr{&values_.begin()->second.value()}; + if (const Expr *derivedExpr{ + std::get_if>(&parentExpr->u)}) { + if (const Constant *constExpr{ + std::get_if>(&derivedExpr->u)}) { + if (std::optional parentComponentValue{ + constExpr->GetScalarValue()}) { + // Try to find the component in the parent structure constructor + return parentComponentValue->Find(component); + } + } + } } + return std::nullopt; } StructureConstructor &StructureConstructor::Add( diff --git a/flang/lib/Evaluate/fold-implementation.h b/flang/lib/Evaluate/fold-implementation.h index e01c7de72f8d9..bb5463e697fe1 100644 --- a/flang/lib/Evaluate/fold-implementation.h +++ b/flang/lib/Evaluate/fold-implementation.h @@ -296,8 +296,8 @@ std::optional> Folder::ApplyComponent( Constant &&structures, const Symbol &component, const std::vector> *subscripts) { if (auto scalar{structures.GetScalarValue()}) { - if (auto *expr{scalar->Find(component)}) { - if (const Constant *value{UnwrapConstantValue(*expr)}) { + if (std::optional> expr{scalar->Find(component)}) { + if (const Constant *value{UnwrapConstantValue(expr.value())}) { if (!subscripts) { return std::move(*value); } else { @@ -314,12 +314,12 @@ std::optional> Folder::ApplyComponent( ConstantSubscripts at{structures.lbounds()}; do { StructureConstructor scalar{structures.At(at)}; - if (auto *expr{scalar.Find(component)}) { - if (const Constant *value{UnwrapConstantValue(*expr)}) { + if (std::optional> expr{scalar.Find(component)}) { + if (const Constant *value{UnwrapConstantValue(expr.value())}) { if (!array.get()) { // This technique ensures that character length or derived type // information is propagated to the array constructor. - auto *typedExpr{UnwrapExpr>(*expr)}; + auto *typedExpr{UnwrapExpr>(expr.value())}; CHECK(typedExpr); array = std::make_unique>(*typedExpr); } diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp index 128a73ad4c78f..4edf90d37fa59 100644 --- a/flang/lib/Evaluate/tools.cpp +++ b/flang/lib/Evaluate/tools.cpp @@ -813,8 +813,8 @@ parser::Message *AttachDeclaration( unhosted->detailsIf()}) { if (binding->symbol().name() != symbol.name()) { message.Attach(binding->symbol().name(), - "Procedure '%s' is bound to '%s'"_en_US, symbol.name(), - binding->symbol().name()); + "Procedure '%s' of type '%s' is bound to '%s'"_en_US, symbol.name(), + symbol.owner().GetName().value(), binding->symbol().name()); return &message; } unhosted = &binding->symbol(); diff --git a/flang/lib/Evaluate/type.cpp b/flang/lib/Evaluate/type.cpp index e1eec19e896b9..e96e19150f4ee 100644 --- a/flang/lib/Evaluate/type.cpp +++ b/flang/lib/Evaluate/type.cpp @@ -207,7 +207,7 @@ static const semantics::Symbol *FindParentComponent( return nullptr; } -static const semantics::DerivedTypeSpec *GetParentTypeSpec( +const semantics::DerivedTypeSpec *GetParentTypeSpec( const semantics::DerivedTypeSpec &derived) { if (const semantics::Symbol * parent{FindParentComponent(derived)}) { return &parent->get() diff --git a/flang/lib/Evaluate/variable.cpp b/flang/lib/Evaluate/variable.cpp index d87c71688f1af..c81f2b175ed5e 100644 --- a/flang/lib/Evaluate/variable.cpp +++ b/flang/lib/Evaluate/variable.cpp @@ -204,9 +204,11 @@ std::optional> Substring::Fold(FoldingContext &context) { *ubi = *length; } if (lbi && literal) { - CHECK(*ubi >= *lbi); auto newStaticData{StaticDataObject::Create()}; - auto items{*ubi - *lbi + 1}; + auto items{0}; // If the lower bound is greater, the length is 
0
+    if (*ubi >= *lbi) {
+      items = *ubi - *lbi + 1;
+    }
     auto width{(*literal)->itemBytes()}; auto bytes{items * width}; auto startByte{(*lbi - 1) * width};
diff --git a/flang/lib/Frontend/CMakeLists.txt b/flang/lib/Frontend/CMakeLists.txt new file mode 100644 index 0000000000000..fac3f955987f1 --- /dev/null +++ b/flang/lib/Frontend/CMakeLists.txt @@ -0,0 +1,16 @@
+add_flang_library(flangFrontend
+  CompilerInstance.cpp
+  CompilerInvocation.cpp
+  FrontendOptions.cpp
+
+  LINK_LIBS
+  clangBasic
+  clangDriver
+  # TODO: Added to re-use clang's TextDiagnosticBuffer & TextDiagnosticPrinter.
+  # Add a custom implementation for Flang and remove this dependency.
+  clangFrontend
+
+  LINK_COMPONENTS
+  Option
+  Support
+)
diff --git a/flang/lib/Frontend/CompilerInstance.cpp b/flang/lib/Frontend/CompilerInstance.cpp new file mode 100644 index 0000000000000..bf1461dd16ad6 --- /dev/null +++ b/flang/lib/Frontend/CompilerInstance.cpp @@ -0,0 +1,42 @@
+//===--- CompilerInstance.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Frontend/CompilerInstance.h"
+#include "flang/Frontend/CompilerInvocation.h"
+#include "clang/Frontend/TextDiagnosticPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace Fortran::frontend;
+
+CompilerInstance::CompilerInstance() : invocation_(new CompilerInvocation()) {}
+
+CompilerInstance::~CompilerInstance() = default;
+
+void CompilerInstance::CreateDiagnostics(
+    clang::DiagnosticConsumer *client, bool shouldOwnClient) {
+  diagnostics_ =
+      CreateDiagnostics(&GetDiagnosticOpts(), client, shouldOwnClient);
+}
+
+clang::IntrusiveRefCntPtr<clang::DiagnosticsEngine>
+CompilerInstance::CreateDiagnostics(clang::DiagnosticOptions *opts,
+    clang::DiagnosticConsumer *client, bool shouldOwnClient) {
+  clang::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagID(
+      new clang::DiagnosticIDs());
+  clang::IntrusiveRefCntPtr<clang::DiagnosticsEngine> diags(
+      new clang::DiagnosticsEngine(diagID, opts));
+
+  // Create the diagnostic client for reporting errors or for
+  // implementing -verify.
+  if (client) {
+    diags->setClient(client, shouldOwnClient);
+  } else {
+    diags->setClient(new clang::TextDiagnosticPrinter(llvm::errs(), opts));
+  }
+  return diags;
+}
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp new file mode 100644 index 0000000000000..c68ad5c11d65a --- /dev/null +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -0,0 +1,115 @@
+//===- CompilerInvocation.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Frontend/CompilerInvocation.h"
+#include "clang/Basic/AllDiagnostics.h"
+#include "clang/Basic/DiagnosticDriver.h"
+#include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Driver/DriverDiagnostic.h"
+#include "clang/Driver/Options.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Option/Arg.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Option/OptTable.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace Fortran::frontend;
+
+//===----------------------------------------------------------------------===//
+// Initialization.
+//===----------------------------------------------------------------------===//
+CompilerInvocationBase::CompilerInvocationBase()
+    : diagnosticOpts_(new clang::DiagnosticOptions()) {}
+
+CompilerInvocationBase::CompilerInvocationBase(const CompilerInvocationBase &x)
+    : diagnosticOpts_(new clang::DiagnosticOptions(x.GetDiagnosticOpts())) {}
+
+CompilerInvocationBase::~CompilerInvocationBase() = default;
+
+//===----------------------------------------------------------------------===//
+// Deserialization (from args)
+//===----------------------------------------------------------------------===//
+static InputKind ParseFrontendArgs(FrontendOptions &opts,
+    llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) {
+  // Identify the action (i.e. opts.ProgramAction)
+  if (const llvm::opt::Arg *a =
+          args.getLastArg(clang::driver::options::OPT_Action_Group)) {
+    switch (a->getOption().getID()) {
+    default: {
+      llvm_unreachable("Invalid option in group!");
+    }
+      // TODO:
+      // case clang::driver::options::OPT_E:
+      // case clang::driver::options::OPT_emit_obj:
+      // case clang::driver::options::OPT_emit_llvm:
+      // case clang::driver::options::OPT_emit_llvm_only:
+      // case clang::driver::options::OPT_emit_codegen_only:
+      // case clang::driver::options::OPT_emit_module:
+      // (...)
    }
  }
+
+  opts.showHelp_ = args.hasArg(clang::driver::options::OPT_help);
+  opts.showVersion_ = args.hasArg(clang::driver::options::OPT_version);
+
+  // Get the input kind (from the value passed via `-x`)
+  InputKind dashX(Language::Unknown);
+  if (const llvm::opt::Arg *a =
+          args.getLastArg(clang::driver::options::OPT_x)) {
+    llvm::StringRef XValue = a->getValue();
+    // Principal languages.
+    dashX = llvm::StringSwitch<InputKind>(XValue)
+                .Case("f90", Language::Fortran)
+                .Default(Language::Unknown);
+
+    // Some special cases cannot be combined with suffixes.
+    if (dashX.IsUnknown())
+      dashX = llvm::StringSwitch<InputKind>(XValue)
+                  .Case("ir", Language::LLVM_IR)
+                  .Default(Language::Unknown);
+
+    if (dashX.IsUnknown())
+      diags.Report(clang::diag::err_drv_invalid_value)
+          << a->getAsString(args) << a->getValue();
+  }
+
+  return dashX;
+}
+
+bool CompilerInvocation::CreateFromArgs(CompilerInvocation &res,
+    llvm::ArrayRef<const char *> commandLineArgs,
+    clang::DiagnosticsEngine &diags) {
+
+  bool success = true;
+
+  // Parse the arguments
+  const llvm::opt::OptTable &opts = clang::driver::getDriverOptTable();
+  const unsigned includedFlagsBitmask =
+      clang::driver::options::FC1Option;
+  unsigned missingArgIndex, missingArgCount;
+  llvm::opt::InputArgList args = opts.ParseArgs(
+      commandLineArgs, missingArgIndex, missingArgCount, includedFlagsBitmask);
+
+  // Issue errors on unknown arguments
+  for (const auto *a : args.filtered(clang::driver::options::OPT_UNKNOWN)) {
+    auto argString = a->getAsString(args);
+    std::string nearest;
+    if (opts.findNearest(argString, nearest, includedFlagsBitmask) > 1)
+      diags.Report(clang::diag::err_drv_unknown_argument) << argString;
+    else
+      diags.Report(clang::diag::err_drv_unknown_argument_with_suggestion)
+          << argString << nearest;
+    success = false;
+  }
+
+  // Parse the frontend args
+  ParseFrontendArgs(res.GetFrontendOpts(), args, diags);
+
+  return success;
+}
diff --git a/flang/lib/Frontend/FrontendOptions.cpp b/flang/lib/Frontend/FrontendOptions.cpp new file mode 100644 index 0000000000000..ea5d54aa7ff06 --- /dev/null +++ b/flang/lib/Frontend/FrontendOptions.cpp @@ -0,0 +1,9 @@
+//===- FrontendOptions.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Frontend/FrontendOptions.h"
diff --git a/flang/lib/FrontendTool/CMakeLists.txt b/flang/lib/FrontendTool/CMakeLists.txt new file mode 100644 index 0000000000000..eda040f7c7161 --- /dev/null +++ b/flang/lib/FrontendTool/CMakeLists.txt @@ -0,0 +1,11 @@
+add_flang_library(flangFrontendTool
+  ExecuteCompilerInvocation.cpp
+
+  LINK_LIBS
+  clangBasic
+  clangDriver
+
+  LINK_COMPONENTS
+  Option
+  Support
+)
diff --git a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp new file mode 100644 index 0000000000000..ab773c95c85dd --- /dev/null +++ b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -0,0 +1,39 @@
+//===--- ExecuteCompilerInvocation.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file holds ExecuteCompilerInvocation(). It is split into its own file to
+// minimize the impact of pulling in essentially everything else in Flang.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Frontend/CompilerInstance.h"
+#include "clang/Driver/Options.h"
+#include "llvm/Option/OptTable.h"
+#include "llvm/Support/CommandLine.h"
+
+namespace Fortran::frontend {
+bool ExecuteCompilerInvocation(CompilerInstance *flang) {
+  // Honor -help.
+ if (flang->GetFrontendOpts().showHelp_) { + clang::driver::getDriverOptTable().PrintHelp(llvm::outs(), + "flang-new -fc1 [options] file...", "LLVM 'Flang' Compiler", + /*Include=*/clang::driver::options::FlangOption, + /*Exclude=*/0, /*ShowAllAliases=*/false); + return true; + } + + // Honor -version. + if (flang->GetFrontendOpts().showVersion_) { + llvm::cl::PrintVersionMessage(); + return true; + } + + return true; +} + +} // namespace Fortran::frontend diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 7202d4ec03199..f91aff792cbd4 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -1,4 +1,4 @@ -//===-- OpenMP.cpp -- OpenACC directive lowering --------------------------===// +//===-- OpenACC.cpp -- OpenACC directive lowering -------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -11,16 +11,202 @@ //===----------------------------------------------------------------------===// #include "flang/Lower/OpenACC.h" +#include "flang/Common/idioms.h" #include "flang/Lower/Bridge.h" #include "flang/Lower/FIRBuilder.h" #include "flang/Lower/PFTBuilder.h" #include "flang/Parser/parse-tree.h" +#include "flang/Semantics/tools.h" +#include "mlir/Dialect/OpenACC/OpenACC.h" #include "llvm/Frontend/OpenACC/ACC.h.inc" #define TODO() llvm_unreachable("not yet implemented") +static const Fortran::parser::Name * +getDesignatorNameIfDataRef(const Fortran::parser::Designator &designator) { + const auto *dataRef{std::get_if(&designator.u)}; + return dataRef ? std::get_if(&dataRef->u) : nullptr; +} + +static void genObjectList(const Fortran::parser::AccObjectList &objectList, + Fortran::lower::AbstractConverter &converter, + std::int32_t &objectsCount, + SmallVector &operands) { + for (const auto &accObject : objectList.v) { + std::visit( + Fortran::common::visitors{ + [&](const Fortran::parser::Designator &designator) { + if (const auto *name = getDesignatorNameIfDataRef(designator)) { + ++objectsCount; + const auto variable = converter.getSymbolAddress(*name->symbol); + operands.push_back(variable); + } + }, + [&](const Fortran::parser::Name &name) { + ++objectsCount; + const auto variable = converter.getSymbolAddress(*name.symbol); + operands.push_back(variable); + }}, + accObject.u); + } +} + +static void genACC(Fortran::lower::AbstractConverter &converter, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenACCLoopConstruct &loopConstruct) { + + const auto &beginLoopDirective = + std::get(loopConstruct.t); + const auto &loopDirective = + std::get(beginLoopDirective.t); + + if (loopDirective.v == llvm::acc::ACCD_loop) { + auto &firOpBuilder = converter.getFirOpBuilder(); + auto currentLocation = converter.getCurrentLocation(); + llvm::ArrayRef argTy; + + // Add attribute extracted from clauses. + const auto &accClauseList = + std::get(beginLoopDirective.t); + + mlir::Value workerNum; + mlir::Value vectorLength; + mlir::Value gangNum; + mlir::Value gangStatic; + std::int32_t tileOperands = 0; + std::int32_t privateOperands = 0; + std::int32_t reductionOperands = 0; + std::int64_t executionMapping = mlir::acc::OpenACCExecMapping::NONE; + SmallVector operands; + + // Lower clauses values mapped to operands. 
+ for (const auto &clause : accClauseList.v) { + if (const auto *gangClause = + std::get_if(&clause.u)) { + if (gangClause->v) { + const Fortran::parser::AccGangArgument &x = *gangClause->v; + if (const auto &gangNumValue = + std::get>( + x.t)) { + gangNum = converter.genExprValue( + *Fortran::semantics::GetExpr(gangNumValue.value())); + operands.push_back(gangNum); + } + if (const auto &gangStaticValue = + std::get>(x.t)) { + const auto &expr = + std::get>( + gangStaticValue.value().t); + if (expr) { + gangStatic = + converter.genExprValue(*Fortran::semantics::GetExpr(*expr)); + } else { + // * was passed as value and will be represented as a -1 constant + // integer. + gangStatic = firOpBuilder.createIntegerConstant( + currentLocation, firOpBuilder.getIntegerType(32), + /* STAR */ -1); + } + operands.push_back(gangStatic); + } + } + executionMapping |= mlir::acc::OpenACCExecMapping::GANG; + } else if (const auto *workerClause = + std::get_if( + &clause.u)) { + if (workerClause->v) { + workerNum = converter.genExprValue( + *Fortran::semantics::GetExpr(*workerClause->v)); + operands.push_back(workerNum); + } + executionMapping |= mlir::acc::OpenACCExecMapping::WORKER; + } else if (const auto *vectorClause = + std::get_if( + &clause.u)) { + if (vectorClause->v) { + vectorLength = converter.genExprValue( + *Fortran::semantics::GetExpr(*vectorClause->v)); + operands.push_back(vectorLength); + } + executionMapping |= mlir::acc::OpenACCExecMapping::VECTOR; + } else if (const auto *tileClause = + std::get_if(&clause.u)) { + const Fortran::parser::AccTileExprList &accTileExprList = tileClause->v; + for (const auto &accTileExpr : accTileExprList.v) { + const auto &expr = + std::get>( + accTileExpr.t); + ++tileOperands; + if (expr) { + operands.push_back( + converter.genExprValue(*Fortran::semantics::GetExpr(*expr))); + } else { + // * was passed as value and will be represented as a -1 constant + // integer. + mlir::Value tileStar = firOpBuilder.createIntegerConstant( + currentLocation, firOpBuilder.getIntegerType(32), + /* STAR */ -1); + operands.push_back(tileStar); + } + } + } else if (const auto *privateClause = + std::get_if( + &clause.u)) { + const Fortran::parser::AccObjectList &accObjectList = privateClause->v; + genObjectList(accObjectList, converter, privateOperands, operands); + } + // Reduction clause is left out for the moment as the clause will probably + // end up having its own operation. + } + + auto loopOp = firOpBuilder.create(currentLocation, argTy, + operands); + + firOpBuilder.createBlock(&loopOp.getRegion()); + auto &block = loopOp.getRegion().back(); + firOpBuilder.setInsertionPointToStart(&block); + // ensure the block is well-formed. + firOpBuilder.create(currentLocation); + + loopOp.setAttr(mlir::acc::LoopOp::getOperandSegmentSizeAttr(), + firOpBuilder.getI32VectorAttr( + {gangNum ? 1 : 0, gangStatic ? 1 : 0, workerNum ? 1 : 0, + vectorLength ? 
1 : 0, tileOperands, privateOperands, + reductionOperands})); + + loopOp.setAttr(mlir::acc::LoopOp::getExecutionMappingAttrName(), + firOpBuilder.getI64IntegerAttr(executionMapping)); + + // Lower clauses mapped to attributes + for (const auto &clause : accClauseList.v) { + if (const auto *collapseClause = + std::get_if(&clause.u)) { + const auto *expr = Fortran::semantics::GetExpr(collapseClause->v); + const auto collapseValue = Fortran::evaluate::ToInt64(*expr); + if (collapseValue) { + loopOp.setAttr(mlir::acc::LoopOp::getCollapseAttrName(), + firOpBuilder.getI64IntegerAttr(*collapseValue)); + } + } else if (std::get_if(&clause.u)) { + loopOp.setAttr(mlir::acc::LoopOp::getSeqAttrName(), + firOpBuilder.getUnitAttr()); + } else if (std::get_if( + &clause.u)) { + loopOp.setAttr(mlir::acc::LoopOp::getIndependentAttrName(), + firOpBuilder.getUnitAttr()); + } else if (std::get_if(&clause.u)) { + loopOp.setAttr(mlir::acc::LoopOp::getAutoAttrName(), + firOpBuilder.getUnitAttr()); + } + } + + // Place the insertion point to the start of the first block. + firOpBuilder.setInsertionPointToStart(&block); + } +} + void Fortran::lower::genOpenACCConstruct( - Fortran::lower::AbstractConverter &absConv, + Fortran::lower::AbstractConverter &converter, Fortran::lower::pft::Evaluation &eval, const Fortran::parser::OpenACCConstruct &accConstruct) { @@ -32,7 +218,7 @@ void Fortran::lower::genOpenACCConstruct( [&](const Fortran::parser::OpenACCCombinedConstruct &combinedConstruct) { TODO(); }, [&](const Fortran::parser::OpenACCLoopConstruct &loopConstruct) { - TODO(); + genACC(converter, eval, loopConstruct); }, [&](const Fortran::parser::OpenACCStandaloneConstruct &standaloneConstruct) { TODO(); }, diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index 36334167184d5..079d16d74181a 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -1552,11 +1552,8 @@ fir::GlobalOp fir::createGlobalOp(mlir::Location loc, mlir::ModuleOp module, return modBuilder.create(loc, name, type, attrs); } -namespace fir { - // Tablegen operators #define GET_OP_CLASSES #include "flang/Optimizer/Dialect/FIROps.cpp.inc" -} // namespace fir diff --git a/flang/lib/Parser/basic-parsers.h b/flang/lib/Parser/basic-parsers.h index 56d9ff1b07069..c92ece0ef6777 100644 --- a/flang/lib/Parser/basic-parsers.h +++ b/flang/lib/Parser/basic-parsers.h @@ -729,13 +729,7 @@ template class ApplyConstructor { return RESULT{}; } else { if constexpr (sizeof...(PARSER) == 1) { - if constexpr (std::is_same_v) { - if (std::get<0>(parsers_).Parse(state)) { - return RESULT{}; - } - } else if (auto arg{std::get<0>(parsers_).Parse(state)}) { - return RESULT{std::move(*arg)}; - } + return ParseOne(state); } else { ApplyArgs results; using Sequence = std::index_sequence_for; @@ -749,6 +743,17 @@ template class ApplyConstructor { } private: + std::optional ParseOne(ParseState &state) const { + if constexpr (std::is_same_v) { + if (std::get<0>(parsers_).Parse(state)) { + return RESULT{}; + } + } else if (auto arg{std::get<0>(parsers_).Parse(state)}) { + return RESULT{std::move(*arg)}; + } + return std::nullopt; + } + const std::tuple parsers_; }; diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index cd5ee0de556dc..a7f4a1ae492c7 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -300,9 +300,9 @@ TYPE_PARSER(sourced(construct(verbatim("CANCEL"_tok), // release // acquire 
TYPE_PARSER(sourced(construct( - "ACQ_REL" >> pure(OmpFlushMemoryClause::FlushMemoryOrder::AcqRel) || - "RELEASE" >> pure(OmpFlushMemoryClause::FlushMemoryOrder::Release) || - "ACQUIRE" >> pure(OmpFlushMemoryClause::FlushMemoryOrder::Acquire)))) + "ACQ_REL" >> pure(llvm::omp::Clause::OMPC_acq_rel) || + "RELEASE" >> pure(llvm::omp::Clause::OMPC_release) || + "ACQUIRE" >> pure(llvm::omp::Clause::OMPC_acquire)))) TYPE_PARSER(sourced(construct(verbatim("FLUSH"_tok), maybe(Parser{}), @@ -384,51 +384,74 @@ TYPE_PARSER(construct(Parser{}) || construct(Parser{}, parenthesized(optionalList(actualArgSpec)))))) -// 2.13.6 ATOMIC [seq_cst[,]] atomic-clause [[,]seq_cst] | ATOMIC [seq_cst] -// atomic-clause -> READ | WRITE | UPDATE | CAPTURE +// Hint Expression => HINT(hint-expression) +TYPE_PARSER("HINT" >> construct(parenthesized(constantExpr))) + +// 2.17.7 atomic -> ATOMIC [clause [,]] atomic-clause [[,] clause] | +// ATOMIC [clause] +// clause -> memory-order-clause | HINT(hint-expression) +// memory-order-clause -> SEQ_CST | ACQ_REL | RELEASE | ACQUIRE | RELAXED +// atomic-clause -> READ | WRITE | UPDATE | CAPTURE // OMP END ATOMIC TYPE_PARSER(construct(startOmpLine >> "END ATOMIC"_tok)) -// ATOMIC Memory related clause -TYPE_PARSER(sourced(construct( - "SEQ_CST" >> pure(OmpMemoryClause::MemoryOrder::SeqCst)))) +// Memory order clause +TYPE_PARSER(sourced(construct( + "SEQ_CST" >> pure(llvm::omp::Clause::OMPC_seq_cst) || + "ACQ_REL" >> pure(llvm::omp::Clause::OMPC_acq_rel) || + "RELEASE" >> pure(llvm::omp::Clause::OMPC_release) || + "ACQUIRE" >> pure(llvm::omp::Clause::OMPC_acquire) || + "RELAXED" >> pure(llvm::omp::Clause::OMPC_relaxed)))) -// ATOMIC Memory Clause List -TYPE_PARSER(construct( - many(maybe(","_tok) >> Parser{}))) +// ATOMIC Memory order clause or Hint expression +TYPE_PARSER( + construct(Parser{}) || + construct(Parser{})) -TYPE_PARSER(construct( - many(maybe(","_tok) >> Parser{}))) +// ATOMIC Memory order Clause List +TYPE_PARSER(construct( + many(maybe(","_tok) >> Parser{}))) -// OMP [SEQ_CST] ATOMIC READ [SEQ_CST] -TYPE_PARSER("ATOMIC" >> - construct(Parser{} / maybe(","_tok), - verbatim("READ"_tok), Parser{} / endOmpLine, - statement(assignmentStmt), maybe(Parser{} / endOmpLine))) +TYPE_PARSER(construct( + many(maybe(","_tok) >> Parser{}))) -// OMP ATOMIC [SEQ_CST] CAPTURE [SEQ_CST] +// OMP ATOMIC [MEMORY-ORDER-CLAUSE-LIST] READ [MEMORY-ORDER-CLAUSE-LIST] TYPE_PARSER("ATOMIC" >> - construct(Parser{} / maybe(","_tok), - verbatim("CAPTURE"_tok), Parser{} / endOmpLine, - statement(assignmentStmt), statement(assignmentStmt), - Parser{} / endOmpLine)) + construct( + Parser{} / maybe(","_tok), + verbatim("READ"_tok), + Parser{} / endOmpLine, + statement(assignmentStmt), maybe(Parser{} / endOmpLine))) -// OMP ATOMIC [SEQ_CST] UPDATE [SEQ_CST] +// OMP ATOMIC [MEMORY-ORDER-CLAUSE-LIST] CAPTURE [MEMORY-ORDER-CLAUSE-LIST] +TYPE_PARSER( + "ATOMIC" >> construct( + Parser{} / maybe(","_tok), + verbatim("CAPTURE"_tok), + Parser{} / endOmpLine, + statement(assignmentStmt), statement(assignmentStmt), + Parser{} / endOmpLine)) + +// OMP ATOMIC [MEMORY-ORDER-CLAUSE-LIST] UPDATE [MEMORY-ORDER-CLAUSE-LIST] TYPE_PARSER("ATOMIC" >> - construct(Parser{} / maybe(","_tok), - verbatim("UPDATE"_tok), Parser{} / endOmpLine, + construct( + Parser{} / maybe(","_tok), + verbatim("UPDATE"_tok), + Parser{} / endOmpLine, statement(assignmentStmt), maybe(Parser{} / endOmpLine))) -// OMP ATOMIC [SEQ_CST] +// OMP ATOMIC [MEMORY-ORDER-CLAUSE-LIST] TYPE_PARSER(construct(verbatim("ATOMIC"_tok), - Parser{} / 
endOmpLine, statement(assignmentStmt), - maybe(Parser{} / endOmpLine))) + Parser{} / endOmpLine, + statement(assignmentStmt), maybe(Parser{} / endOmpLine))) -// ATOMIC [SEQ_CST] WRITE [SEQ_CST] +// OMP ATOMIC [MEMORY-ORDER-CLAUSE-LIST] WRITE [MEMORY-ORDER-CLAUSE-LIST] TYPE_PARSER("ATOMIC" >> - construct(Parser{} / maybe(","_tok), - verbatim("WRITE"_tok), Parser{} / endOmpLine, + construct( + Parser{} / maybe(","_tok), + verbatim("WRITE"_tok), + Parser{} / endOmpLine, statement(assignmentStmt), maybe(Parser{} / endOmpLine))) // Atomic Construct @@ -444,9 +467,7 @@ TYPE_PARSER(startOmpLine >> verbatim("END CRITICAL"_tok), maybe(parenthesized(name)))) / endOmpLine) TYPE_PARSER(sourced(construct(verbatim("CRITICAL"_tok), - maybe(parenthesized(name)), - maybe("HINT" >> construct( - parenthesized(constantExpr))))) / + maybe(parenthesized(name)), maybe(Parser{}))) / endOmpLine) TYPE_PARSER(construct( diff --git a/flang/lib/Parser/preprocessor.cpp b/flang/lib/Parser/preprocessor.cpp index a1f07967d9b08..823adda8e95af 100644 --- a/flang/lib/Parser/preprocessor.cpp +++ b/flang/lib/Parser/preprocessor.cpp @@ -540,7 +540,7 @@ void Preprocessor::Directive(const TokenSequence &dir, Prescanner *prescanner) { return; } std::string include; - if (dir.TokenAt(j).ToString() == "<") { + if (dir.TokenAt(j).ToString() == "<") { // #include std::size_t k{j + 1}; if (k >= tokens) { prescanner->Say(dir.GetIntervalProvenanceRange(j, tokens - j), @@ -553,15 +553,12 @@ void Preprocessor::Directive(const TokenSequence &dir, Prescanner *prescanner) { if (k >= tokens) { prescanner->Say(dir.GetIntervalProvenanceRange(j, tokens - j), "#include: expected '>' at end of included file"_en_US); - } else if (k + 1 < tokens) { - prescanner->Say(dir.GetIntervalProvenanceRange(k + 1, tokens - k - 1), - "#include: extra stuff ignored after '>'"_en_US); } TokenSequence braced{dir, j + 1, k - j - 1}; include = ReplaceMacros(braced, *prescanner).ToString(); - } else if (j + 1 == tokens && - (include = dir.TokenAt(j).ToString()).substr(0, 1) == "\"" && - include.substr(include.size() - 1, 1) == "\"") { + j = k; + } else if ((include = dir.TokenAt(j).ToString()).substr(0, 1) == "\"" && + include.substr(include.size() - 1, 1) == "\"") { // #include "foo" include = include.substr(1, include.size() - 2); } else { prescanner->Say(dir.GetTokenProvenanceRange(j < tokens ? 
j : tokens - 1), @@ -573,6 +570,11 @@ void Preprocessor::Directive(const TokenSequence &dir, Prescanner *prescanner) { "#include: empty include file name"_err_en_US); return; } + j = dir.SkipBlanks(j + 1); + if (j < tokens && dir.TokenAt(j).ToString() != "!") { + prescanner->Say(dir.GetIntervalProvenanceRange(j, tokens - j), + "#include: extra stuff ignored after file name"_en_US); + } std::string buf; llvm::raw_string_ostream error{buf}; const SourceFile *included{allSources_.Open(include, error)}; diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index 8e8e57c1334d9..3eb909fc1ae86 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -62,11 +62,8 @@ static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) { void Prescanner::Prescan(ProvenanceRange range) { startProvenance_ = range.start(); - std::size_t offset{0}; - const SourceFile *source{ - allSources_.GetSourceFile(startProvenance_, &offset)}; - CHECK(source); - start_ = source->content().data() + offset; + start_ = allSources_.GetSource(range); + CHECK(start_); limit_ = start_ + range.size(); nextLine_ = start_; const bool beganInFixedForm{inFixedForm_}; @@ -75,7 +72,7 @@ void Prescanner::Prescan(ProvenanceRange range) { "too many nested INCLUDE/#include files, possibly circular"_err_en_US); return; } - while (nextLine_ < limit_) { + while (!IsAtEnd()) { Statement(); } if (inFixedForm_ != beganInFixedForm) { @@ -232,7 +229,7 @@ void Prescanner::Statement() { } TokenSequence Prescanner::TokenizePreprocessorDirective() { - CHECK(nextLine_ < limit_ && !inPreprocessorDirective_); + CHECK(!IsAtEnd() && !inPreprocessorDirective_); inPreprocessorDirective_ = true; BeginStatementAndAdvance(); TokenSequence tokens; @@ -360,7 +357,7 @@ void Prescanner::SkipCComments() { break; } } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ && - at_[1] == '\n' && nextLine_ < limit_) { + at_[1] == '\n' && !IsAtEnd()) { BeginSourceLineAndAdvance(); } else { break; @@ -804,7 +801,7 @@ bool Prescanner::IsNextLinePreprocessorDirective() const { } bool Prescanner::SkipCommentLine(bool afterAmpersand) { - if (nextLine_ >= limit_) { + if (IsAtEnd()) { if (afterAmpersand && prescannerNesting_ > 0) { // A continuation marker at the end of the last line in an // include file inhibits the newline for that line. @@ -843,7 +840,7 @@ bool Prescanner::SkipCommentLine(bool afterAmpersand) { } const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) { - if (nextLine_ >= limit_) { + if (IsAtEnd()) { return nullptr; } tabInCurrentLine_ = false; @@ -995,7 +992,7 @@ bool Prescanner::FreeFormContinuation() { // arguments to span multiple lines. bool Prescanner::IsImplicitContinuation() const { return !inPreprocessorDirective_ && !inCharLiteral_ && - delimiterNesting_ > 0 && nextLine_ < limit_ && + delimiterNesting_ > 0 && !IsAtEnd() && ClassifyLine(nextLine_).kind == LineClassification::Kind::Source; } diff --git a/flang/lib/Parser/provenance.cpp b/flang/lib/Parser/provenance.cpp index bcb871bd7cb41..46a0dc9268225 100644 --- a/flang/lib/Parser/provenance.cpp +++ b/flang/lib/Parser/provenance.cpp @@ -301,6 +301,14 @@ const SourceFile *AllSources::GetSourceFile( origin.u); } +const char *AllSources::GetSource(ProvenanceRange range) const { + Provenance start{range.start()}; + const Origin &origin{MapToOrigin(start)}; + return origin.covers.Contains(range) + ? 
&origin[origin.covers.MemberOffset(start)] + : nullptr; +} + std::optional AllSources::GetSourcePosition( Provenance prov) const { const Origin &origin{MapToOrigin(prov)}; @@ -402,7 +410,7 @@ const AllSources::Origin &AllSources::MapToOrigin(Provenance at) const { std::optional CookedSource::GetProvenanceRange( CharBlock cookedRange) const { - if (!Contains(cookedRange)) { + if (!AsCharBlock().Contains(cookedRange)) { return std::nullopt; } ProvenanceRange first{provenanceMap_.Map(cookedRange.begin() - &data_[0])}; diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index e26795d0825bb..ab94aa2e00c26 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2222,19 +2222,36 @@ class UnparseVisitor { break; } } - void Unparse(const OmpMemoryClause &x) { + void Unparse(const OmpHintExpr &x) { Word("HINT("), Walk(x.v), Put(')'); } + void Unparse(const OmpMemoryOrderClause &x) { switch (x.v) { - case OmpMemoryClause::MemoryOrder::SeqCst: + case llvm::omp::Clause::OMPC_seq_cst: Word("SEQ_CST"); break; + case llvm::omp::Clause::OMPC_acq_rel: + Word("ACQ_REL"); + break; + case llvm::omp::Clause::OMPC_release: + Word("RELEASE"); + break; + case llvm::omp::Clause::OMPC_acquire: + Word("ACQUIRE"); + break; + case llvm::omp::Clause::OMPC_relaxed: + Word("RELAXED"); + break; + default: + break; } } - void Unparse(const OmpMemoryClauseList &x) { Walk(" ", x.v, " "); } - void Unparse(const OmpMemoryClausePostList &x) { Walk(" ", x.v, " "); } + void Unparse(const OmpAtomicMemoryOrderClauseList &x) { Walk(" ", x.v, " "); } + void Unparse(const OmpAtomicMemoryOrderClausePostList &x) { + Walk(" ", x.v, " "); + } void Unparse(const OmpAtomic &x) { BeginOpenMP(); Word("!$OMP ATOMIC"); - Walk(std::get(x.t)); + Walk(std::get(x.t)); Put("\n"); EndOpenMP(); Walk(std::get>(x.t)); @@ -2245,9 +2262,9 @@ class UnparseVisitor { void Unparse(const OmpAtomicCapture &x) { BeginOpenMP(); Word("!$OMP ATOMIC"); - Walk(std::get(x.t)); + Walk(std::get(x.t)); Word(" CAPTURE"); - Walk(std::get(x.t)); + Walk(std::get(x.t)); Put("\n"); EndOpenMP(); Walk(std::get(x.t)); @@ -2260,9 +2277,9 @@ class UnparseVisitor { void Unparse(const OmpAtomicRead &x) { BeginOpenMP(); Word("!$OMP ATOMIC"); - Walk(std::get(x.t)); + Walk(std::get(x.t)); Word(" READ"); - Walk(std::get(x.t)); + Walk(std::get(x.t)); Put("\n"); EndOpenMP(); Walk(std::get>(x.t)); @@ -2273,9 +2290,9 @@ class UnparseVisitor { void Unparse(const OmpAtomicUpdate &x) { BeginOpenMP(); Word("!$OMP ATOMIC"); - Walk(std::get(x.t)); + Walk(std::get(x.t)); Word(" UPDATE"); - Walk(std::get(x.t)); + Walk(std::get(x.t)); Put("\n"); EndOpenMP(); Walk(std::get>(x.t)); @@ -2286,9 +2303,9 @@ class UnparseVisitor { void Unparse(const OmpAtomicWrite &x) { BeginOpenMP(); Word("!$OMP ATOMIC"); - Walk(std::get(x.t)); + Walk(std::get(x.t)); Word(" WRITE"); - Walk(std::get(x.t)); + Walk(std::get(x.t)); Put("\n"); EndOpenMP(); Walk(std::get>(x.t)); @@ -2300,8 +2317,7 @@ class UnparseVisitor { BeginOpenMP(); Word("!$OMP CRITICAL"); Walk(" (", std::get>(x.t), ")"); - Walk(" HINT(", std::get>(x.t), - ")"); + Walk(std::get>(x.t)); Put("\n"); EndOpenMP(); } @@ -2431,15 +2447,17 @@ class UnparseVisitor { } void Unparse(const OmpFlushMemoryClause &x) { switch (x.v) { - case OmpFlushMemoryClause::FlushMemoryOrder::AcqRel: + case llvm::omp::Clause::OMPC_acq_rel: Word("ACQ_REL "); break; - case OmpFlushMemoryClause::FlushMemoryOrder::Release: + case llvm::omp::Clause::OMPC_release: Word("RELEASE "); break; - case OmpFlushMemoryClause::FlushMemoryOrder::Acquire: + 
case llvm::omp::Clause::OMPC_acquire: Word("ACQUIRE "); break; + default: + break; } } void Unparse(const OpenMPFlushConstruct &x) { diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index df7ae6e53b1f6..896af3cc83e08 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -21,17 +21,19 @@ namespace Fortran::semantics { -using evaluate::characteristics::DummyArgument; -using evaluate::characteristics::DummyDataObject; -using evaluate::characteristics::DummyProcedure; -using evaluate::characteristics::FunctionResult; -using evaluate::characteristics::Procedure; +namespace characteristics = evaluate::characteristics; +using characteristics::DummyArgument; +using characteristics::DummyDataObject; +using characteristics::DummyProcedure; +using characteristics::FunctionResult; +using characteristics::Procedure; class CheckHelper { public: explicit CheckHelper(SemanticsContext &c) : context_{c} {} CheckHelper(SemanticsContext &c, const Scope &s) : context_{c}, scope_{&s} {} + SemanticsContext &context() { return context_; } void Check() { Check(context_.globalScope()); } void Check(const ParamValue &, bool canBeAssumed); void Check(const Bound &bound) { CheckSpecExpr(bound.GetExplicit()); } @@ -44,6 +46,7 @@ class CheckHelper { void Check(const Symbol &); void Check(const Scope &); void CheckInitialization(const Symbol &); + const Procedure *Characterize(const Symbol &); private: template void CheckSpecExpr(const A &x) { @@ -63,24 +66,20 @@ class CheckHelper { void CheckSubprogram(const Symbol &, const SubprogramDetails &); void CheckAssumedTypeEntity(const Symbol &, const ObjectEntityDetails &); void CheckDerivedType(const Symbol &, const DerivedTypeDetails &); - void CheckHostAssoc(const Symbol &, const HostAssocDetails &); void CheckGeneric(const Symbol &, const GenericDetails &); - std::optional> Characterize(const SymbolVector &); - bool CheckDefinedOperator(const SourceName &, const GenericKind &, - const Symbol &, const Procedure &); + void CheckHostAssoc(const Symbol &, const HostAssocDetails &); + bool CheckDefinedOperator( + SourceName, GenericKind, const Symbol &, const Procedure &); std::optional CheckNumberOfArgs( const GenericKind &, std::size_t); bool CheckDefinedOperatorArg( const SourceName &, const Symbol &, const Procedure &, std::size_t); bool CheckDefinedAssignment(const Symbol &, const Procedure &); bool CheckDefinedAssignmentArg(const Symbol &, const DummyArgument &, int); - void CheckSpecificsAreDistinguishable( - const Symbol &, const GenericDetails &, const std::vector &); + void CheckSpecificsAreDistinguishable(const Symbol &, const GenericDetails &); void CheckEquivalenceSet(const EquivalenceSet &); void CheckBlockData(const Scope &); - - void SayNotDistinguishable( - const SourceName &, GenericKind, const Symbol &, const Symbol &); + void CheckGenericOps(const Scope &); bool CheckConflicting(const Symbol &, Attr, Attr); bool InPure() const { return innermostSymbol_ && IsPureProcedure(*innermostSymbol_); @@ -108,6 +107,27 @@ class CheckHelper { // This symbol is the one attached to the innermost enclosing scope // that has a symbol. 
   const Symbol *innermostSymbol_{nullptr};
+  // Cache of calls to Procedure::Characterize(Symbol)
+  std::map<SymbolRef, std::optional<Procedure>> characterizeCache_;
+};
+
+class DistinguishabilityHelper {
+public:
+  DistinguishabilityHelper(SemanticsContext &context) : context_{context} {}
+  void Add(const Symbol &, GenericKind, const Symbol &, const Procedure &);
+  void Check();
+
+private:
+  void SayNotDistinguishable(
+      const SourceName &, GenericKind, const Symbol &, const Symbol &);
+
+  SemanticsContext &context_;
+  struct ProcedureInfo {
+    GenericKind kind;
+    const Symbol &symbol;
+    const Procedure &procedure;
+  };
+  std::map<SourceName, std::vector<ProcedureInfo>> nameToInfo_;
 };

 void CheckHelper::Check(const ParamValue &value, bool canBeAssumed) {
@@ -664,12 +684,13 @@ void CheckHelper::CheckProcEntity(
 // - C1551: NON_RECURSIVE prefix
 class SubprogramMatchHelper {
 public:
-  explicit SubprogramMatchHelper(SemanticsContext &context)
-      : context{context} {}
+  explicit SubprogramMatchHelper(CheckHelper &checkHelper)
+      : checkHelper{checkHelper} {}

   void Check(const Symbol &, const Symbol &);

 private:
+  SemanticsContext &context() { return checkHelper.context(); }
   void CheckDummyArg(const Symbol &, const Symbol &, const DummyArgument &,
       const DummyArgument &);
   void CheckDummyDataObject(const Symbol &, const Symbol &,
@@ -692,7 +713,7 @@ class SubprogramMatchHelper {
     return parser::ToUpperCaseLetters(DummyProcedure::EnumToString(attr));
   }

-  SemanticsContext &context;
+  CheckHelper &checkHelper;
 };

 // 15.6.2.6 para 3 - can the result of an ENTRY differ from its function?
@@ -719,7 +740,7 @@ bool CheckHelper::IsResultOkToDiffer(const FunctionResult &result) {
 void CheckHelper::CheckSubprogram(
     const Symbol &symbol, const SubprogramDetails &details) {
   if (const Symbol * iface{FindSeparateModuleSubprogramInterface(&symbol)}) {
-    SubprogramMatchHelper{context_}.Check(symbol, *iface);
+    SubprogramMatchHelper{*this}.Check(symbol, *iface);
   }
   if (const Scope * entryScope{details.entryScope()}) { // ENTRY 15.6.2.6, esp.
C1571 @@ -834,66 +855,25 @@ void CheckHelper::CheckHostAssoc( void CheckHelper::CheckGeneric( const Symbol &symbol, const GenericDetails &details) { - const SymbolVector &specifics{details.specificProcs()}; - const auto &bindingNames{details.bindingNames()}; - std::optional> procs{Characterize(specifics)}; - if (!procs) { - return; - } - bool ok{true}; - if (details.kind().IsIntrinsicOperator()) { - for (std::size_t i{0}; i < specifics.size(); ++i) { - auto restorer{messages_.SetLocation(bindingNames[i])}; - ok &= CheckDefinedOperator( - symbol.name(), details.kind(), specifics[i], (*procs)[i]); - } - } - if (details.kind().IsAssignment()) { - for (std::size_t i{0}; i < specifics.size(); ++i) { - auto restorer{messages_.SetLocation(bindingNames[i])}; - ok &= CheckDefinedAssignment(specifics[i], (*procs)[i]); - } - } - if (ok) { - CheckSpecificsAreDistinguishable(symbol, details, *procs); - } + CheckSpecificsAreDistinguishable(symbol, details); } // Check that the specifics of this generic are distinguishable from each other -void CheckHelper::CheckSpecificsAreDistinguishable(const Symbol &generic, - const GenericDetails &details, const std::vector &procs) { +void CheckHelper::CheckSpecificsAreDistinguishable( + const Symbol &generic, const GenericDetails &details) { + GenericKind kind{details.kind()}; const SymbolVector &specifics{details.specificProcs()}; std::size_t count{specifics.size()}; - if (count < 2) { + if (count < 2 || !kind.IsName()) { return; } - GenericKind kind{details.kind()}; - auto distinguishable{kind.IsAssignment() || kind.IsOperator() - ? evaluate::characteristics::DistinguishableOpOrAssign - : evaluate::characteristics::Distinguishable}; - for (std::size_t i1{0}; i1 < count - 1; ++i1) { - auto &proc1{procs[i1]}; - for (std::size_t i2{i1 + 1}; i2 < count; ++i2) { - auto &proc2{procs[i2]}; - if (!distinguishable(proc1, proc2)) { - SayNotDistinguishable( - generic.name(), kind, specifics[i1], specifics[i2]); - } + DistinguishabilityHelper helper{context_}; + for (const Symbol &specific : specifics) { + if (const Procedure * procedure{Characterize(specific)}) { + helper.Add(generic, kind, specific, *procedure); } } -} - -void CheckHelper::SayNotDistinguishable(const SourceName &name, - GenericKind kind, const Symbol &proc1, const Symbol &proc2) { - auto &&text{kind.IsDefinedOperator() - ? "Generic operator '%s' may not have specific procedures '%s'" - " and '%s' as their interfaces are not distinguishable"_err_en_US - : "Generic '%s' may not have specific procedures '%s'" - " and '%s' as their interfaces are not distinguishable"_err_en_US}; - auto &msg{ - context_.Say(name, std::move(text), name, proc1.name(), proc2.name())}; - evaluate::AttachDeclaration(msg, proc1); - evaluate::AttachDeclaration(msg, proc2); + helper.Check(); } static bool ConflictsWithIntrinsicAssignment(const Procedure &proc) { @@ -905,6 +885,9 @@ static bool ConflictsWithIntrinsicAssignment(const Procedure &proc) { static bool ConflictsWithIntrinsicOperator( const GenericKind &kind, const Procedure &proc) { + if (!kind.IsIntrinsicOperator()) { + return false; + } auto arg0{std::get(proc.dummyArguments[0].u).type}; auto type0{arg0.type()}; if (proc.dummyArguments.size() == 1) { // unary @@ -942,8 +925,11 @@ static bool ConflictsWithIntrinsicOperator( } // Check if this procedure can be used for defined operators (see 15.4.3.4.2). 
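// Editorial sketch of the gather-then-check pattern this patch introduces:
// Characterize() results are memoized in characterizeCache_, and
// DistinguishabilityHelper defers the pairwise comparison until Check(), so
// one erroneous specific no longer suppresses checking of the others.
//
//   DistinguishabilityHelper helper{context_};
//   for (const Symbol &specific : specifics) {
//     if (const Procedure * proc{Characterize(specific)}) {
//       helper.Add(generic, kind, specific, *proc); // skips symbols in error
//     }
//   }
//   helper.Check(); // reports each pair whose interfaces are ambiguous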
-bool CheckHelper::CheckDefinedOperator(const SourceName &opName, - const GenericKind &kind, const Symbol &specific, const Procedure &proc) { +bool CheckHelper::CheckDefinedOperator(SourceName opName, GenericKind kind, + const Symbol &specific, const Procedure &proc) { + if (context_.HasError(specific)) { + return false; + } std::optional msg; if (specific.attrs().test(Attr::NOPASS)) { // C774 msg = "%s procedure '%s' may not have NOPASS attribute"_err_en_US; @@ -962,8 +948,9 @@ bool CheckHelper::CheckDefinedOperator(const SourceName &opName, } else { return true; // OK } - SayWithDeclaration(specific, std::move(msg.value()), - parser::ToUpperCaseLetters(opName.ToString()), specific.name()); + SayWithDeclaration( + specific, std::move(*msg), MakeOpName(opName), specific.name()); + context_.SetError(specific); return false; } @@ -971,6 +958,9 @@ bool CheckHelper::CheckDefinedOperator(const SourceName &opName, // false and return the error message in msg. std::optional CheckHelper::CheckNumberOfArgs( const GenericKind &kind, std::size_t nargs) { + if (!kind.IsIntrinsicOperator()) { + return std::nullopt; + } std::size_t min{2}, max{2}; // allowed number of args; default is binary std::visit(common::visitors{ [&](const common::NumericOperator &x) { @@ -1035,6 +1025,9 @@ bool CheckHelper::CheckDefinedOperatorArg(const SourceName &opName, // Check if this procedure can be used for defined assignment (see 15.4.3.4.3). bool CheckHelper::CheckDefinedAssignment( const Symbol &specific, const Procedure &proc) { + if (context_.HasError(specific)) { + return false; + } std::optional msg; if (specific.attrs().test(Attr::NOPASS)) { // C774 msg = "Defined assignment procedure '%s' may not have" @@ -1054,6 +1047,7 @@ bool CheckHelper::CheckDefinedAssignment( return true; // OK } SayWithDeclaration(specific, std::move(msg.value()), specific.name()); + context_.SetError(specific); return false; } @@ -1086,6 +1080,7 @@ bool CheckHelper::CheckDefinedAssignmentArg( } if (msg) { SayWithDeclaration(symbol, std::move(*msg), symbol.name(), arg.name); + context_.SetError(symbol); return false; } return true; @@ -1102,17 +1097,14 @@ bool CheckHelper::CheckConflicting(const Symbol &symbol, Attr a1, Attr a2) { } } -std::optional> CheckHelper::Characterize( - const SymbolVector &specifics) { - std::vector result; - for (const Symbol &specific : specifics) { - auto proc{Procedure::Characterize(specific, context_.intrinsics())}; - if (!proc || context_.HasError(specific)) { - return std::nullopt; - } - result.emplace_back(*proc); - } - return result; +const Procedure *CheckHelper::Characterize(const Symbol &symbol) { + auto it{characterizeCache_.find(symbol)}; + if (it == characterizeCache_.end()) { + auto pair{characterizeCache_.emplace(SymbolRef{symbol}, + Procedure::Characterize(symbol, context_.intrinsics()))}; + it = pair.first; + } + return common::GetPtrFromOptional(it->second); } void CheckHelper::CheckVolatile(const Symbol &symbol, bool isAssociated, @@ -1298,10 +1290,8 @@ void CheckHelper::CheckProcBinding( ? 
"A NOPASS type-bound procedure may not override a passed-argument procedure"_err_en_US : "A passed-argument type-bound procedure may not override a NOPASS procedure"_err_en_US); } else { - auto bindingChars{evaluate::characteristics::Procedure::Characterize( - binding.symbol(), context_.intrinsics())}; - auto overriddenChars{evaluate::characteristics::Procedure::Characterize( - overriddenBinding->symbol(), context_.intrinsics())}; + const auto *bindingChars{Characterize(binding.symbol())}; + const auto *overriddenChars{Characterize(overriddenBinding->symbol())}; if (bindingChars && overriddenChars) { if (isNopass) { if (!bindingChars->CanOverride(*overriddenChars, std::nullopt)) { @@ -1357,6 +1347,7 @@ void CheckHelper::Check(const Scope &scope) { if (scope.kind() == Scope::Kind::BlockData) { CheckBlockData(scope); } + CheckGenericOps(scope); } void CheckHelper::CheckEquivalenceSet(const EquivalenceSet &set) { @@ -1417,6 +1408,53 @@ void CheckHelper::CheckBlockData(const Scope &scope) { } } +// Check distinguishability of generic assignment and operators. +// For these, generics and generic bindings must be considered together. +void CheckHelper::CheckGenericOps(const Scope &scope) { + DistinguishabilityHelper helper{context_}; + auto addSpecifics{[&](const Symbol &generic) { + const auto *details{generic.GetUltimate().detailsIf()}; + if (!details) { + return; + } + GenericKind kind{details->kind()}; + if (!kind.IsAssignment() && !kind.IsOperator()) { + return; + } + const SymbolVector &specifics{details->specificProcs()}; + const std::vector &bindingNames{details->bindingNames()}; + for (std::size_t i{0}; i < specifics.size(); ++i) { + const Symbol &specific{*specifics[i]}; + if (const Procedure * proc{Characterize(specific)}) { + auto restorer{messages_.SetLocation(bindingNames[i])}; + if (kind.IsAssignment()) { + if (!CheckDefinedAssignment(specific, *proc)) { + continue; + } + } else { + if (!CheckDefinedOperator(generic.name(), kind, specific, *proc)) { + continue; + } + } + helper.Add(generic, kind, specific, *proc); + } + } + }}; + for (const auto &pair : scope) { + const Symbol &symbol{*pair.second}; + addSpecifics(symbol); + const Symbol &ultimate{symbol.GetUltimate()}; + if (ultimate.has()) { + if (const Scope * typeScope{ultimate.scope()}) { + for (const auto &pair2 : *typeScope) { + addSpecifics(*pair2.second); + } + } + } + } + helper.Check(); +} + void SubprogramMatchHelper::Check( const Symbol &symbol1, const Symbol &symbol2) { const auto details1{symbol1.get()}; @@ -1469,8 +1507,8 @@ void SubprogramMatchHelper::Check( string1, string2); } } - auto proc1{Procedure::Characterize(symbol1, context.intrinsics())}; - auto proc2{Procedure::Characterize(symbol2, context.intrinsics())}; + const Procedure *proc1{checkHelper.Characterize(symbol1)}; + const Procedure *proc2{checkHelper.Characterize(symbol2)}; if (!proc1 || !proc2) { return; } @@ -1583,7 +1621,7 @@ bool SubprogramMatchHelper::CheckSameIntent(const Symbol &symbol1, template void SubprogramMatchHelper::Say(const Symbol &symbol1, const Symbol &symbol2, parser::MessageFixedText &&text, A &&...args) { - auto &message{context.Say(symbol1.name(), std::move(text), symbol1.name(), + auto &message{context().Say(symbol1.name(), std::move(text), symbol1.name(), std::forward(args)...)}; evaluate::AttachDeclaration(message, symbol2); } @@ -1615,7 +1653,7 @@ bool SubprogramMatchHelper::CheckSameAttrs( bool SubprogramMatchHelper::ShapesAreCompatible( const DummyDataObject &obj1, const DummyDataObject &obj2) { - return 
evaluate::characteristics::ShapesAreCompatible( + return characteristics::ShapesAreCompatible( FoldShape(obj1.type.shape()), FoldShape(obj2.type.shape())); } @@ -1623,11 +1661,58 @@ evaluate::Shape SubprogramMatchHelper::FoldShape(const evaluate::Shape &shape) { evaluate::Shape result; for (const auto &extent : shape) { result.emplace_back( - evaluate::Fold(context.foldingContext(), common::Clone(extent))); + evaluate::Fold(context().foldingContext(), common::Clone(extent))); } return result; } +void DistinguishabilityHelper::Add(const Symbol &generic, GenericKind kind, + const Symbol &specific, const Procedure &procedure) { + if (!context_.HasError(specific)) { + nameToInfo_[generic.name()].emplace_back( + ProcedureInfo{kind, specific, procedure}); + } +} + +void DistinguishabilityHelper::Check() { + for (const auto &[name, info] : nameToInfo_) { + auto count{info.size()}; + for (std::size_t i1{0}; i1 < count - 1; ++i1) { + const auto &[kind1, symbol1, proc1] = info[i1]; + for (std::size_t i2{i1 + 1}; i2 < count; ++i2) { + const auto &[kind2, symbol2, proc2] = info[i2]; + auto distinguishable{kind1.IsName() + ? evaluate::characteristics::Distinguishable + : evaluate::characteristics::DistinguishableOpOrAssign}; + if (!distinguishable(proc1, proc2)) { + SayNotDistinguishable(name, kind1, symbol1, symbol2); + } + } + } + } +} + +void DistinguishabilityHelper::SayNotDistinguishable(const SourceName &name, + GenericKind kind, const Symbol &proc1, const Symbol &proc2) { + std::string name1{proc1.name().ToString()}; + std::string name2{proc2.name().ToString()}; + if (kind.IsOperator() || kind.IsAssignment()) { + // proc1 and proc2 may come from different scopes so qualify their names + if (proc1.owner().IsDerivedType()) { + name1 = proc1.owner().GetName()->ToString() + '%' + name1; + } + if (proc2.owner().IsDerivedType()) { + name2 = proc2.owner().GetName()->ToString() + '%' + name2; + } + } + auto &msg{context_.Say(name, + "Generic '%s' may not have specific procedures '%s' and '%s'" + " as their interfaces are not distinguishable"_err_en_US, + MakeOpName(name), name1, name2)}; + evaluate::AttachDeclaration(msg, proc1); + evaluate::AttachDeclaration(msg, proc2); +} + void CheckDeclarations(SemanticsContext &context) { CheckHelper{context}.Check(); } diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 6a4980ebcd544..3e360b8ec4ca4 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -456,6 +456,9 @@ void OmpStructureChecker::Enter(const parser::OmpAlignedClause &x) { } // 2.8.1 TODO: list-item attribute check } +void OmpStructureChecker::Enter(const parser::OmpAllocateClause &) { + CheckAllowed(llvm::omp::Clause::OMPC_allocate); +} void OmpStructureChecker::Enter(const parser::OmpDefaultClause &) { CheckAllowed(llvm::omp::Clause::OMPC_default); } diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index 9a0c1e2c0a2d4..fbe95d0ee2e0a 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -150,6 +150,7 @@ class OmpStructureChecker void Enter(const parser::OmpClause::IsDevicePtr &); void Enter(const parser::OmpAlignedClause &); + void Enter(const parser::OmpAllocateClause &); void Enter(const parser::OmpDefaultClause &); void Enter(const parser::OmpDefaultmapClause &); void Enter(const parser::OmpDependClause &); diff --git a/flang/lib/Semantics/expression.cpp 
b/flang/lib/Semantics/expression.cpp index ae53559ea5db2..5a2a7df9fb98d 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -1684,7 +1684,6 @@ auto ExpressionAnalyzer::AnalyzeProcedureComponentRef( const parser::ProcComponentRef &pcr, ActualArguments &&arguments) -> std::optional { const parser::StructureComponent &sc{pcr.v.thing}; - const auto &name{sc.component.source}; if (MaybeExpr base{Analyze(sc.base)}) { if (const Symbol * sym{sc.component.symbol}) { if (auto *dtExpr{UnwrapExpr>(*base)}) { @@ -1722,7 +1721,7 @@ auto ExpressionAnalyzer::AnalyzeProcedureComponentRef( } } } - Say(name, + Say(sc.component.source, "Base of procedure component reference is not a derived-type object"_err_en_US); } } @@ -1997,11 +1996,18 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::FunctionReference &funcRef, const auto &designator{std::get(call.t)}; if (const auto *name{std::get_if(&designator.u)}) { semantics::Scope &scope{context_.FindScope(name->source)}; + semantics::DerivedTypeSpec dtSpec{ + name->source, derivedType.GetUltimate()}; + if (dtSpec.IsForwardReferenced()) { + Say(call.source, + "Cannot construct value for derived type '%s' " + "before it is defined"_err_en_US, + name->source); + return std::nullopt; + } const semantics::DeclTypeSpec &type{ - semantics::FindOrInstantiateDerivedType(scope, - semantics::DerivedTypeSpec{ - name->source, derivedType.GetUltimate()}, - context_)}; + semantics::FindOrInstantiateDerivedType( + scope, std::move(dtSpec), context_)}; auto &mutableRef{const_cast(funcRef)}; *structureConstructor = mutableRef.ConvertToStructureConstructor(type.derivedTypeSpec()); @@ -2940,18 +2946,26 @@ std::optional ArgumentAnalyzer::GetDefinedAssignmentProc() { context_.EmitGenericResolutionError(*symbol); } } - for (std::size_t passIndex{0}; passIndex < actuals_.size(); ++passIndex) { - if (const Symbol * specific{FindBoundOp(oprName, passIndex)}) { - proc = specific; + int passedObjectIndex{-1}; + for (std::size_t i{0}; i < actuals_.size(); ++i) { + if (const Symbol * specific{FindBoundOp(oprName, i)}) { + if (const Symbol * + resolution{GetBindingResolution(GetType(i), *specific)}) { + proc = resolution; + } else { + proc = specific; + passedObjectIndex = i; + } } } - if (proc) { - ActualArguments actualsCopy{actuals_}; - actualsCopy[1]->Parenthesize(); - return ProcedureRef{ProcedureDesignator{*proc}, std::move(actualsCopy)}; - } else { + if (!proc) { return std::nullopt; } + ActualArguments actualsCopy{actuals_}; + if (passedObjectIndex >= 0) { + actualsCopy[passedObjectIndex]->set_isPassedObject(); + } + return ProcedureRef{ProcedureDesignator{*proc}, std::move(actualsCopy)}; } void ArgumentAnalyzer::Dump(llvm::raw_ostream &os) { diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index e73bfa7c37ccf..f68bcd1e1fa86 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -13,6 +13,7 @@ #include "resolve-names-utils.h" #include "flang/Common/idioms.h" #include "flang/Evaluate/fold.h" +#include "flang/Evaluate/type.h" #include "flang/Parser/parse-tree-visitor.h" #include "flang/Parser/parse-tree.h" #include "flang/Parser/tools.h" @@ -226,7 +227,8 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { } bool Pre(const parser::OpenMPBlockConstruct &); - void Post(const parser::OpenMPBlockConstruct &) { PopContext(); } + void Post(const parser::OpenMPBlockConstruct &); + void Post(const parser::OmpBeginBlockDirective &) { 
GetContext().withinConstruct = true; } @@ -254,6 +256,11 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { ResolveOmpObjectList(x.v, Symbol::Flag::OmpPrivate); return false; } + bool Pre(const parser::OmpAllocateClause &x) { + const auto &objectList{std::get(x.t)}; + ResolveOmpObjectList(objectList, Symbol::Flag::OmpAllocate); + return false; + } bool Pre(const parser::OmpClause::Firstprivate &x) { ResolveOmpObjectList(x.v, Symbol::Flag::OmpFirstPrivate); return false; @@ -273,6 +280,10 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { Symbol::Flag::OmpFirstPrivate, Symbol::Flag::OmpLastPrivate, Symbol::Flag::OmpReduction, Symbol::Flag::OmpLinear}; + static constexpr Symbol::Flags privateDataSharingAttributeFlags{ + Symbol::Flag::OmpPrivate, Symbol::Flag::OmpFirstPrivate, + Symbol::Flag::OmpLastPrivate}; + static constexpr Symbol::Flags ompFlagsRequireNewSymbol{ Symbol::Flag::OmpPrivate, Symbol::Flag::OmpLinear, Symbol::Flag::OmpFirstPrivate, Symbol::Flag::OmpLastPrivate, @@ -281,6 +292,21 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { static constexpr Symbol::Flags ompFlagsRequireMark{ Symbol::Flag::OmpThreadprivate}; + std::vector allocateNames_; // on one directive + SymbolSet privateDataSharingAttributeObjects_; // on one directive + + void AddAllocateName(const parser::Name *&object) { + allocateNames_.push_back(object); + } + void ClearAllocateNames() { allocateNames_.clear(); } + + void AddPrivateDataSharingAttributeObjects(SymbolRef object) { + privateDataSharingAttributeObjects_.insert(object); + } + void ClearPrivateDataSharingAttributeObjects() { + privateDataSharingAttributeObjects_.clear(); + } + // Predetermined DSA rules void PrivatizeAssociatedLoopIndex(const parser::OpenMPLoopConstruct &); void ResolveSeqLoopIndexInParallelOrTaskConstruct(const parser::Name &); @@ -632,9 +658,49 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPBlockConstruct &x) { break; } ClearDataSharingAttributeObjects(); + ClearPrivateDataSharingAttributeObjects(); + ClearAllocateNames(); return true; } +void OmpAttributeVisitor::Post(const parser::OpenMPBlockConstruct &x) { + const auto &beginBlockDir{std::get(x.t)}; + const auto &beginDir{std::get(beginBlockDir.t)}; + switch (beginDir.v) { + case llvm::omp::Directive::OMPD_parallel: + case llvm::omp::Directive::OMPD_single: + case llvm::omp::Directive::OMPD_target: + case llvm::omp::Directive::OMPD_task: + case llvm::omp::Directive::OMPD_teams: + case llvm::omp::Directive::OMPD_parallel_workshare: + case llvm::omp::Directive::OMPD_target_teams: + case llvm::omp::Directive::OMPD_target_parallel: { + bool hasPrivate; + for (const auto *allocName : allocateNames_) { + hasPrivate = false; + for (auto privateObj : privateDataSharingAttributeObjects_) { + const Symbol &symbolPrivate{*privateObj}; + if (allocName->source == symbolPrivate.name()) { + hasPrivate = true; + break; + } + } + if (!hasPrivate) { + context_.Say(allocName->source, + "The ALLOCATE clause requires that '%s' must be listed in a " + "private " + "data-sharing attribute clause on the same directive"_err_en_US, + allocName->ToString()); + } + } + break; + } + default: + break; + } + PopContext(); +} + bool OmpAttributeVisitor::Pre(const parser::OpenMPLoopConstruct &x) { const auto &beginLoopDir{std::get(x.t)}; const auto &beginDir{std::get(beginLoopDir.t)}; @@ -879,6 +945,9 @@ void OmpAttributeVisitor::ResolveOmpObject( if (dataSharingAttributeFlags.test(ompFlag)) { CheckMultipleAppearances(*name, *symbol, ompFlag); } + if (ompFlag == 
Symbol::Flag::OmpAllocate) { + AddAllocateName(name); + } } } else { // Array sections to be changed to substrings as needed @@ -976,6 +1045,9 @@ void OmpAttributeVisitor::CheckMultipleAppearances( name.ToString()); } else { AddDataSharingAttributeObject(*target); + if (privateDataSharingAttributeFlags.test(ompFlag)) { + AddPrivateDataSharingAttributeObjects(*target); + } } } diff --git a/flang/lib/Semantics/resolve-names-utils.cpp b/flang/lib/Semantics/resolve-names-utils.cpp index d6f0302e98545..8dbd25e163acb 100644 --- a/flang/lib/Semantics/resolve-names-utils.cpp +++ b/flang/lib/Semantics/resolve-names-utils.cpp @@ -47,12 +47,6 @@ parser::MessageFixedText WithIsFatal( msg.text().begin(), msg.text().size(), isFatal}; } -bool IsDefinedOperator(const SourceName &name) { - const char *begin{name.begin()}; - const char *end{name.end()}; - return begin != end && begin[0] == '.' && end[-1] == '.'; -} - bool IsIntrinsicOperator( const SemanticsContext &context, const SourceName &name) { std::string str{name.ToString()}; diff --git a/flang/lib/Semantics/resolve-names-utils.h b/flang/lib/Semantics/resolve-names-utils.h index 08db70345f152..17462d111d970 100644 --- a/flang/lib/Semantics/resolve-names-utils.h +++ b/flang/lib/Semantics/resolve-names-utils.h @@ -47,8 +47,6 @@ Symbol *Resolve(const parser::Name &, Symbol *); parser::MessageFixedText WithIsFatal( const parser::MessageFixedText &msg, bool isFatal); -// Is this the name of a defined operator, e.g. ".foo." -bool IsDefinedOperator(const SourceName &); bool IsIntrinsicOperator(const SemanticsContext &, const SourceName &); bool IsLogicalConstant(const SemanticsContext &, const SourceName &); diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index a75c5b6a829e3..b501ac69098f9 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -2276,19 +2276,13 @@ ModuleVisitor::SymbolRename ModuleVisitor::AddUse( return {}; // error occurred finding module } if (!useSymbol) { - Say(useName, - IsDefinedOperator(useName) - ? "Operator '%s' not found in module '%s'"_err_en_US - : "'%s' not found in module '%s'"_err_en_US, - useName, useModuleScope_->GetName().value()); + Say(useName, "'%s' not found in module '%s'"_err_en_US, MakeOpName(useName), + useModuleScope_->GetName().value()); return {}; } if (useSymbol->attrs().test(Attr::PRIVATE)) { - Say(useName, - IsDefinedOperator(useName) - ? "Operator '%s' is PRIVATE in '%s'"_err_en_US - : "'%s' is PRIVATE in '%s'"_err_en_US, - useName, useModuleScope_->GetName().value()); + Say(useName, "'%s' is PRIVATE in '%s'"_err_en_US, MakeOpName(useName), + useModuleScope_->GetName().value()); return {}; } auto &localSymbol{MakeSymbol(localName)}; @@ -2550,11 +2544,9 @@ void InterfaceVisitor::ResolveSpecificsInGeneric(Symbol &generic) { } } if (!namesSeen.insert(name->source).second) { - Say(*name, - details.kind().IsDefinedOperator() - ? 
"Procedure '%s' is already specified in generic operator '%s'"_err_en_US - : "Procedure '%s' is already specified in generic '%s'"_err_en_US, - name->source, generic.name()); + Say(name->source, + "Procedure '%s' is already specified in generic '%s'"_err_en_US, + name->source, MakeOpName(generic.name())); continue; } details.AddSpecificProc(*symbol, name->source); @@ -5044,6 +5036,9 @@ void ConstructVisitor::Post(const parser::Association &x) { const auto &name{std::get(x.t)}; GetCurrentAssociation().name = &name; if (auto *symbol{MakeAssocEntity()}) { + if (ExtractCoarrayRef(GetCurrentAssociation().selector.expr)) { // C1103 + Say("Selector must not be a coindexed object"_err_en_US); + } SetTypeFromAssociation(*symbol); SetAttrsFromAssociation(*symbol); } @@ -5098,6 +5093,9 @@ void ConstructVisitor::Post(const parser::SelectTypeStmt &x) { MakePlaceholder(*name, MiscDetails::Kind::SelectTypeAssociateName); association.name = &*name; auto exprType{association.selector.expr->GetType()}; + if (ExtractCoarrayRef(association.selector.expr)) { // C1103 + Say("Selector must not be a coindexed object"_err_en_US); + } if (exprType && !exprType->IsPolymorphic()) { // C1159 Say(association.selector.source, "Selector '%s' in SELECT TYPE statement must be " @@ -5926,10 +5924,11 @@ Symbol &ModuleVisitor::SetAccess( if (attrs.HasAny({Attr::PUBLIC, Attr::PRIVATE})) { // PUBLIC/PRIVATE already set: make it a fatal error if it changed Attr prev = attrs.test(Attr::PUBLIC) ? Attr::PUBLIC : Attr::PRIVATE; - auto msg{IsDefinedOperator(name) - ? "The accessibility of operator '%s' has already been specified as %s"_en_US - : "The accessibility of '%s' has already been specified as %s"_en_US}; - Say(name, WithIsFatal(msg, attr != prev), name, EnumToString(prev)); + Say(name, + WithIsFatal( + "The accessibility of '%s' has already been specified as %s"_en_US, + attr != prev), + MakeOpName(name), EnumToString(prev)); } else { attrs.set(attr); } diff --git a/flang/lib/Semantics/scope.cpp b/flang/lib/Semantics/scope.cpp index c7635c0b1a3bb..768f9f5aab1b8 100644 --- a/flang/lib/Semantics/scope.cpp +++ b/flang/lib/Semantics/scope.cpp @@ -114,14 +114,6 @@ Symbol *Scope::FindComponent(SourceName name) const { } } -std::optional Scope::GetName() const { - if (const auto *sym{GetSymbol()}) { - return sym->name(); - } else { - return std::nullopt; - } -} - bool Scope::Contains(const Scope &that) const { for (const Scope *scope{&that};; scope = &scope->parent()) { if (*scope == *this) { diff --git a/flang/lib/Semantics/symbol.cpp b/flang/lib/Semantics/symbol.cpp index e0d80ec6d1c8b..c15c60406c36c 100644 --- a/flang/lib/Semantics/symbol.cpp +++ b/flang/lib/Semantics/symbol.cpp @@ -541,13 +541,11 @@ const DerivedTypeSpec *Symbol::GetParentTypeSpec(const Scope *scope) const { const Symbol *Symbol::GetParentComponent(const Scope *scope) const { if (const auto *dtDetails{detailsIf()}) { - if (!scope) { - scope = scope_; + if (const Scope * localScope{scope ? 
scope : scope_}) {
+      return dtDetails->GetParentComponent(DEREF(localScope));
     }
-    return dtDetails->GetParentComponent(DEREF(scope));
-  } else {
-    return nullptr;
   }
+  return nullptr;
 }

 void DerivedTypeDetails::add_component(const Symbol &symbol) {
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index 3f93944cd3c33..848aef08e3a1f 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -156,6 +156,19 @@ bool IsGenericDefinedOp(const Symbol &symbol) {
   }
 }

+bool IsDefinedOperator(SourceName name) {
+  const char *begin{name.begin()};
+  const char *end{name.end()};
+  return begin != end && begin[0] == '.' && end[-1] == '.';
+}
+
+std::string MakeOpName(SourceName name) {
+  std::string result{name.ToString()};
+  return IsDefinedOperator(name) ? "OPERATOR(" + result + ")"
+      : result.find("operator(", 0) == 0 ? parser::ToUpperCaseLetters(result)
+                                         : result;
+}
+
 bool IsCommonBlockContaining(const Symbol &block, const Symbol &object) {
   const auto &objects{block.get<CommonBlockDetails>().objects()};
   auto found{std::find(objects.begin(), objects.end(), object)};
@@ -739,7 +752,6 @@ bool InProtectedContext(const Symbol &symbol, const Scope &currentScope) {
 }

 // C1101 and C1158
-// TODO Need to check for a coindexed object (why? C1103?)
 std::optional<parser::MessageFixedText> WhyNotModifiable(
     const Symbol &symbol, const Scope &scope) {
   const Symbol *root{GetAssociationRoot(symbol)};
diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp
index 998edc954ba75..da281aa68e435 100644
--- a/flang/runtime/edit-input.cpp
+++ b/flang/runtime/edit-input.cpp
@@ -180,10 +180,11 @@ static int ScanRealInput(char *buffer, int bufferSize, IoStatementState &io,
       first == 'E' || first == 'D' || first == 'Q') {
     Put('.'); // input field is normalized to a fraction
     auto start{got};
+    bool bzMode{(edit.modes.editingFlags & blankZero) != 0};
     for (; next; next = io.NextInField(remaining)) {
       char32_t ch{*next};
       if (ch == ' ' || ch == '\t') {
-        if (edit.modes.editingFlags & blankZero) {
+        if (bzMode) {
           ch = '0'; // BZ mode - treat blank as if it were zero
         } else {
           continue;
@@ -206,19 +207,29 @@ static int ScanRealInput(char *buffer, int bufferSize, IoStatementState &io,
     if (next &&
         (*next == 'e' || *next == 'E' || *next == 'd' || *next == 'D' ||
             *next == 'q' || *next == 'Q')) {
+      // Optional exponent letter. Blanks are allowed between the
+      // optional exponent letter and the exponent value.
       io.SkipSpaces(remaining);
       next = io.NextInField(remaining);
     }
-    exponent = -edit.modes.scale; // default exponent is -kP
+    // The default exponent is -kP, but the scale factor doesn't affect
+    // an explicit exponent.
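// Worked example of the BZ handling below: with BLANK='ZERO' in effect, an
// exponent field '1 2' scans as the digits 1, 0, 2 (each embedded blank
// multiplies the accumulated value by 10), yielding exponent 102; without
// BZ, the first blank terminates the exponent, yielding 1.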
+ exponent = -edit.modes.scale; if (next && - (*next == '-' || *next == '+' || (*next >= '0' && *next <= '9'))) { + (*next == '-' || *next == '+' || (*next >= '0' && *next <= '9') || + (bzMode && (*next == ' ' || *next == '\t')))) { bool negExpo{*next == '-'}; if (negExpo || *next == '+') { next = io.NextInField(remaining); } - for (exponent = 0; next && (*next >= '0' && *next <= '9'); - next = io.NextInField(remaining)) { - exponent = 10 * exponent + *next - '0'; + for (exponent = 0; next; next = io.NextInField(remaining)) { + if (*next >= '0' && *next <= '9') { + exponent = 10 * exponent + *next - '0'; + } else if (bzMode && (*next == ' ' || *next == '\t')) { + exponent = 10 * exponent; + } else { + break; + } } if (negExpo) { exponent = -exponent; diff --git a/flang/test/CMakeLists.txt b/flang/test/CMakeLists.txt index a1532dc7141ff..635d3d88b61c6 100644 --- a/flang/test/CMakeLists.txt +++ b/flang/test/CMakeLists.txt @@ -41,6 +41,10 @@ if (LINK_WITH_FIR) list(APPEND FLANG_TEST_DEPENDS tco) endif() +if (FLANG_BUILD_NEW_DRIVER) + list(APPEND FLANG_TEST_DEPENDS flang-new) +endif() + if (FLANG_INCLUDE_TESTS) if (FLANG_GTEST_AVAIL) list(APPEND FLANG_TEST_DEPENDS FlangUnitTests) diff --git a/flang/test/Evaluate/folding12.f90 b/flang/test/Evaluate/folding12.f90 new file mode 100644 index 0000000000000..657ddc6a34ae5 --- /dev/null +++ b/flang/test/Evaluate/folding12.f90 @@ -0,0 +1,163 @@ +! RUN: %S/test_folding.sh %s %t %f18 +! Test folding of structure constructors +module m1 + type parent_type + integer :: parent_field + end type parent_type + type, extends(parent_type) :: child_type + integer :: child_field + end type child_type + type parent_array_type + integer, dimension(2) :: parent_field + end type parent_array_type + type, extends(parent_array_type) :: child_array_type + integer :: child_field + end type child_array_type + + type(child_type), parameter :: child_const1 = child_type(10, 11) + logical, parameter :: test_child1 = child_const1%child_field == 11 + logical, parameter :: test_parent = child_const1%parent_field == 10 + + type(child_type), parameter :: child_const2 = child_type(12, 13) + type(child_type), parameter :: array_var(2) = & + [child_type(14, 15), child_type(16, 17)] + logical, parameter :: test_array_child = array_var(2)%child_field == 17 + logical, parameter :: test_array_parent = array_var(2)%parent_field == 16 + + type array_type + real, dimension(3) :: real_field + end type array_type + type(array_type), parameter :: array_var2 = & + array_type([(real(i*i), i = 1,3)]) + logical, parameter :: test_array_var = array_var2%real_field(2) == 4.0 + + type(child_type), parameter, dimension(2) :: child_const3 = & + [child_type(18, 19), child_type(20, 21)] + integer, dimension(2), parameter :: int_const4 = & + child_const3(:)%parent_field + logical, parameter :: test_child2 = int_const4(1) == 18 + + type(child_array_type), parameter, dimension(2) :: child_const5 = & + [child_array_type([22, 23], 24), child_array_type([25, 26], 27)] + integer, dimension(2), parameter :: int_const6 = child_const5(:)%parent_field(2) + logical, parameter :: test_child3 = int_const6(1) == 23 + + type(child_type), parameter :: child_const7 = child_type(28, 29) + type(parent_type), parameter :: parent_const8 = child_const7%parent_type + logical, parameter :: test_child4 = parent_const8%parent_field == 28 + + type(child_type), parameter :: child_const9 = & + child_type(parent_type(30), 31) + integer, parameter :: int_const10 = child_const9%parent_field + logical, parameter :: test_child5 = 
int_const10 == 30 + +end module m1 + +module m2 + type grandparent_type + real :: grandparent_field + end type grandparent_type + type, extends(grandparent_type) :: parent_type + integer :: parent_field + end type parent_type + type, extends(parent_type) :: child_type + real :: child_field + end type child_type + + type(child_type), parameter :: child_const1 = child_type(10.0, 11, 12.0) + integer, parameter :: int_const2 = & + child_const1%grandparent_type%grandparent_field + logical, parameter :: test_child1 = int_const2 == 10.0 + integer, parameter :: int_const3 = & + child_const1%grandparent_field + logical, parameter :: test_child2 = int_const3 == 10.0 + + type(child_type), parameter :: child_const4 = & + child_type(parent_type(13.0, 14), 15.0) + integer, parameter :: int_const5 = & + child_const4%grandparent_type%grandparent_field + logical, parameter :: test_child3 = int_const5 == 13.0 + + type(child_type), parameter :: child_const6 = & + child_type(parent_type(grandparent_type(16.0), 17), 18.0) + integer, parameter :: int_const7 = & + child_const6%grandparent_type%grandparent_field + logical, parameter :: test_child4 = int_const7 == 16.0 + integer, parameter :: int_const8 = & + child_const6%grandparent_field + logical, parameter :: test_child5 = int_const8 == 16.0 +end module m2 + +module m3 + ! tests that use components with default initializations and with the + ! components in the structure constructors in a different order from the + ! declared order + type parent_type + integer :: parent_field1 + real :: parent_field2 = 20.0 + logical :: parent_field3 + end type parent_type + type, extends(parent_type) :: child_type + real :: child_field1 + logical :: child_field2 = .false. + integer :: child_field3 + end type child_type + + type(child_type), parameter :: child_const1 = & + child_type( & + parent_field2 = 10.0, child_field3 = 11, & + child_field2 = .true., parent_field3 = .false., & + parent_field1 = 12, child_field1 = 13.3) + logical, parameter :: test_child1 = child_const1%child_field1 == 13.3 + logical, parameter :: test_child2 = child_const1%child_field2 .eqv. .true. + logical, parameter :: test_child3 = child_const1%child_field3 == 11 + logical, parameter :: test_parent1 = child_const1%parent_field1 == 12 + logical, parameter :: test_parent2 = child_const1%parent_field2 == 10.0 + logical, parameter :: test_parent3 = child_const1%parent_field3 .eqv. .false. + logical, parameter :: test_parent4 = & + child_const1%parent_type%parent_field1 == 12 + logical, parameter :: test_parent5 = & + child_const1%parent_type%parent_field2 == 10.0 + logical, parameter :: test_parent6 = & + child_const1%parent_type%parent_field3 .eqv. .false. + + type(parent_type), parameter ::parent_const1 = child_const1%parent_type + logical, parameter :: test_parent7 = parent_const1%parent_field1 == 12 + logical, parameter :: test_parent8 = parent_const1%parent_field2 == 10.0 + logical, parameter :: test_parent9 = & + parent_const1%parent_field3 .eqv. .false. + + type(child_type), parameter :: child_const2 = & + child_type( & + child_field3 = 14, parent_field3 = .true., & + parent_field1 = 15, child_field1 = 16.6) + logical, parameter :: test_child4 = child_const2%child_field1 == 16.6 + logical, parameter :: test_child5 = child_const2%child_field2 .eqv. .false. 
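! Note the folding at work in child_const2 above: omitted components fall
! back to their default initializers, so child_field2 folds to .false. and
! parent_field2 folds to 20.0 (checked by test_child5 above and
! test_parent11 below).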
+ logical, parameter :: test_child6 = child_const2%child_field3 == 14 + logical, parameter :: test_parent10 = child_const2%parent_field1 == 15 + logical, parameter :: test_parent11 = child_const2%parent_field2 == 20.0 + logical, parameter :: test_parent12 = child_const2%parent_field3 .eqv. .true. + + type(child_type), parameter :: child_const3 = & + child_type(parent_type( & + parent_field2 = 17.7, parent_field3 = .false., parent_field1 = 18), & + child_field2 = .false., child_field1 = 19.9, child_field3 = 21) + logical, parameter :: test_child7 = child_const3%parent_field1 == 18 + logical, parameter :: test_child8 = child_const3%parent_field2 == 17.7 + logical, parameter :: test_child9 = child_const3%parent_field3 .eqv. .false. + logical, parameter :: test_child10 = child_const3%child_field1 == 19.9 + logical, parameter :: test_child11 = child_const3%child_field2 .eqv. .false. + logical, parameter :: test_child12 = child_const3%child_field3 == 21 + + type(child_type), parameter :: child_const4 = & + child_type(parent_type( & + parent_field3 = .true., parent_field1 = 22), & + child_field1 = 23.4, child_field3 = 24) + logical, parameter :: test_child13 = child_const4%parent_field1 == 22 + logical, parameter :: test_child14 = child_const4%parent_field2 == 20.0 + logical, parameter :: test_child15 = child_const4%parent_field3 .eqv. .true. + logical, parameter :: test_child16 = child_const4%child_field1 == 23.4 + logical, parameter :: test_child17 = child_const4%child_field2 .eqv. .false. + logical, parameter :: test_child18 = child_const4%child_field3 == 24 + +end module m3 diff --git a/flang/test/Flang-Driver/driver-error-cc1.c b/flang/test/Flang-Driver/driver-error-cc1.c new file mode 100644 index 0000000000000..1563ee431579f --- /dev/null +++ b/flang/test/Flang-Driver/driver-error-cc1.c @@ -0,0 +1,7 @@ +// RUN: not %flang-new %s 2>&1 | FileCheck %s + +// REQUIRES: new-flang-driver + +// C files are currently not supported (i.e. `flang -cc1`) + +// CHECK:error: unknown integrated tool '-cc1'. Valid tools include '-fc1'. diff --git a/flang/test/Flang-Driver/driver-error-cc1.cpp b/flang/test/Flang-Driver/driver-error-cc1.cpp new file mode 100644 index 0000000000000..20e469733bc9a --- /dev/null +++ b/flang/test/Flang-Driver/driver-error-cc1.cpp @@ -0,0 +1,7 @@ +// RUN: not %flang-new %s 2>&1 | FileCheck %s + +// REQUIRES: new-flang-driver + +// C++ files are currently not supported (i.e. `flang -cc1`) + +// CHECK:error: unknown integrated tool '-cc1'. Valid tools include '-fc1'. diff --git a/flang/test/Flang-Driver/driver-help.f90 b/flang/test/Flang-Driver/driver-help.f90 new file mode 100644 index 0000000000000..6ecd076efee4e --- /dev/null +++ b/flang/test/Flang-Driver/driver-help.f90 @@ -0,0 +1,13 @@ +! RUN: %flang-new -help 2>&1 | FileCheck %s +! RUN: %flang-new -fc1 -help 2>&1 | FileCheck %s +! RUN: not %flang-new -helps 2>&1 | FileCheck %s --check-prefix=ERROR + +! REQUIRES: new-flang-driver + +! CHECK:USAGE: flang-new +! CHECK-EMPTY: +! CHECK-NEXT:OPTIONS: +! CHECK-NEXT: -help Display available options +! CHECK-NEXT: --version Print version information + +! ERROR: error: unknown argument '-helps'; did you mean '-help' diff --git a/flang/test/Flang-Driver/driver-version.f90 b/flang/test/Flang-Driver/driver-version.f90 new file mode 100644 index 0000000000000..8552d0b2f28b4 --- /dev/null +++ b/flang/test/Flang-Driver/driver-version.f90 @@ -0,0 +1,11 @@ +! RUN: %flang-new --version 2>&1 | FileCheck %s +! RUN: not %flang-new --versions 2>&1 | FileCheck %s --check-prefix=ERROR + +! 
REQUIRES: new-flang-driver
+
+! CHECK:flang-new version
+! CHECK-NEXT:Target:
+! CHECK-NEXT:Thread model:
+! CHECK-NEXT:InstalledDir:
+
+! ERROR: error: unsupported option '--versions'; did you mean '--version'?
diff --git a/flang/test/Flang-Driver/emit-obj.f90 b/flang/test/Flang-Driver/emit-obj.f90
new file mode 100644
index 0000000000000..4ddd483828626
--- /dev/null
+++ b/flang/test/Flang-Driver/emit-obj.f90
@@ -0,0 +1,17 @@
+! RUN: not %flang-new %s 2>&1 | FileCheck %s --check-prefix=ERROR-IMPLICIT
+! RUN: not %flang-new -emit-obj %s 2>&1 | FileCheck %s --check-prefix=ERROR-EXPLICIT
+! RUN: not %flang-new -fc1 -emit-obj %s 2>&1 | FileCheck %s --check-prefix=ERROR-FC1
+
+! REQUIRES: new-flang-driver
+
+! By default (e.g. when no options like `-E` are passed) flang-new
+! creates a job that corresponds to `-emit-obj`. This option/action is
+! not yet supported. Verify that this is correctly reported as error.
+
+! ERROR-IMPLICIT: error: unknown argument: '-triple'
+! ERROR-IMPLICIT: error: unknown argument: '-emit-obj'
+! ERROR-IMPLICIT: error: unknown argument: '-o'
+
+! ERROR-EXPLICIT: error: unknown argument: '-o'
+
+! ERROR-FC1: error: unknown argument: '-emit-obj'
diff --git a/flang/test/Flang-Driver/missing-input.f90 b/flang/test/Flang-Driver/missing-input.f90
new file mode 100644
index 0000000000000..96818bc4bd385
--- /dev/null
+++ b/flang/test/Flang-Driver/missing-input.f90
@@ -0,0 +1,5 @@
+! RUN: not %flang-new 2>&1 | FileCheck %s
+
+! REQUIRES: new-flang-driver
+
+! CHECK: error: no input files
diff --git a/flang/test/Preprocessing/empty.h b/flang/test/Preprocessing/empty.h
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/flang/test/Preprocessing/include-comment.F90 b/flang/test/Preprocessing/include-comment.F90
new file mode 100644
index 0000000000000..6ac475f76e46e
--- /dev/null
+++ b/flang/test/Preprocessing/include-comment.F90
@@ -0,0 +1,18 @@
+! RUN: %f18 -I%S -E %s 2>&1 | FileCheck %s
+! CHECK-NOT: :3:
+#include <empty.h> ! comment
+! CHECK-NOT: :5:
+#include <empty.h> /* comment */
+! CHECK-NOT: :7:
+#include <empty.h> !comment
+! CHECK: :9:20: #include: extra stuff ignored after file name
+#include <empty.h> comment
+! CHECK-NOT: :11:
+#include "empty.h" ! comment
+! CHECK-NOT: :13:
+#include "empty.h" /* comment */
+! CHECK-NOT: :15:
+#include "empty.h" !comment
+!
CHECK: :17:20: #include: extra stuff ignored after file name +#include "empty.h" comment +end diff --git a/flang/test/Semantics/bad-forward-type.f90 b/flang/test/Semantics/bad-forward-type.f90 index 5fe17ad833ad4..b7857e1f8af42 100644 --- a/flang/test/Semantics/bad-forward-type.f90 +++ b/flang/test/Semantics/bad-forward-type.f90 @@ -70,3 +70,12 @@ subroutine s7(x) type, extends(undef) :: t end type end subroutine + +subroutine s8 + implicit type(t2)(x) + !ERROR: Cannot construct value for derived type 't2' before it is defined + parameter(y=t2(12.3)) + type t2 + real :: c + end type +end subroutine diff --git a/flang/test/Semantics/case01.f90 b/flang/test/Semantics/case01.f90 index e1965db573b6d..6342233a727e8 100644 --- a/flang/test/Semantics/case01.f90 +++ b/flang/test/Semantics/case01.f90 @@ -163,3 +163,17 @@ program selectCaseProg end select end program + +program test_overlap + integer :: i + !OK: these cases do not overlap + select case(i) + case(0:) + case(:-1) + end select + select case(i) + case(-1:) + !ERROR: CASE (:0_4) conflicts with previous cases + case(:0) + end select +end diff --git a/flang/test/Semantics/defined-ops.f90 b/flang/test/Semantics/defined-ops.f90 new file mode 100644 index 0000000000000..24e72677c6eb1 --- /dev/null +++ b/flang/test/Semantics/defined-ops.f90 @@ -0,0 +1,88 @@ +! RUN: %f18 -funparse %s 2>&1 | FileCheck %s + +! Check the analyzed form of a defined operator or assignment. + +! Type-bound defined assignment +module m1 + type :: t + contains + procedure :: b1 => s1 + procedure, pass(y) :: b2 => s2 + generic :: assignment(=) => b1, b2 + end type +contains + subroutine s1(x, y) + class(t), intent(out) :: x + integer, intent(in) :: y + end + subroutine s2(x, y) + real, intent(out) :: x + class(t), intent(in) :: y + end + subroutine test1(x) + type(t) :: x + real :: a + !CHECK: CALL s1(x,1_4) + x = 1 + !CHECK: CALL s2(a,x) + a = x + end + subroutine test2(x) + class(t) :: x + real :: a + !CHECK: CALL x%b1(1_4) + x = 1 + !CHECK: CALL x%b2(a) + a = x + end +end + +! Type-bound operator +module m2 + type :: t2 + contains + procedure, pass(x2) :: b2 => f + generic :: operator(+) => b2 + end type +contains + integer pure function f(x1, x2) + class(t2), intent(in) :: x1 + class(t2), intent(in) :: x2 + end + subroutine test2(x, y) + class(t2) :: x + type(t2) :: y + !CHECK: i=f(x,y) + i = x + y + !CHECK: i=x%b2(y) + i = y + x + end +end module + +! Non-type-bound assignment and operator +module m3 + type t + end type + interface assignment(=) + subroutine s1(x, y) + import + class(t), intent(out) :: x + integer, intent(in) :: y + end + end interface + interface operator(+) + integer function f(x, y) + import + class(t), intent(in) :: x, y + end + end interface +contains + subroutine test(x, y) + class(t) :: x, y + !CHECK: CALL s1(x,2_4) + x = 2 + !CHECK: i=f(x,y) + i = x + y + end +end + diff --git a/flang/test/Semantics/omp-atomic.f90 b/flang/test/Semantics/omp-atomic.f90 index d5cb87aaba32d..8d3f95a770454 100644 --- a/flang/test/Semantics/omp-atomic.f90 +++ b/flang/test/Semantics/omp-atomic.f90 @@ -1,5 +1,5 @@ ! RUN: %S/test_errors.sh %s %t %f18 -fopenmp - +use omp_lib ! 
Check OpenMP 2.13.6 atomic Construct a = 1.0 @@ -11,12 +11,32 @@ a = b !$omp end atomic + !$omp atomic read acquire hint(OMP_LOCK_HINT_CONTENDED) + a = b + + !$omp atomic release hint(OMP_LOCK_HINT_UNCONTENDED) write + a = b + !$omp atomic capture seq_cst b = a a = a + 1 !$omp end atomic + !$omp atomic hint(1) acq_rel capture + b = a + a = a + 1 + !$omp end atomic + + !ERROR: expected end of line + !ERROR: expected end of line + !$omp atomic read write + a = a + 1 + !$omp atomic a = a + 1 + + !$omp atomic relaxed + a = a + 1 + !$omp end parallel end diff --git a/flang/test/Semantics/omp-clause-validity01.f90 b/flang/test/Semantics/omp-clause-validity01.f90 index d3f77a432de86..07f55733c8dc8 100644 --- a/flang/test/Semantics/omp-clause-validity01.f90 +++ b/flang/test/Semantics/omp-clause-validity01.f90 @@ -9,7 +9,7 @@ ! TODO: all the internal errors integer :: b = 128 - integer :: c = 32 + integer :: z, c = 32 integer, parameter :: num = 16 real(8) :: arrayA(256), arrayB(512) @@ -39,29 +39,54 @@ enddo !$omp end parallel - !$omp parallel allocate(b) + !$omp parallel private(b) allocate(b) do i = 1, N a = 3.14 enddo !$omp end parallel - !$omp parallel allocate(omp_default_mem_space : b, c) + !$omp parallel private(c, b) allocate(omp_default_mem_space : b, c) do i = 1, N a = 3.14 enddo !$omp end parallel - !$omp parallel allocate(b) allocate(c) + !$omp parallel allocate(b) allocate(c) private(b, c) do i = 1, N a = 3.14 enddo !$omp end parallel - !$omp parallel allocate(xy_alloc :b) + !$omp parallel allocate(xy_alloc :b) private(b) do i = 1, N a = 3.14 enddo !$omp end parallel + + !$omp task private(b) allocate(b) + do i = 1, N + z = 2 + end do + !$omp end task + + !$omp teams private(b) allocate(b) + do i = 1, N + z = 2 + end do + !$omp end teams + + !$omp target private(b) allocate(b) + do i = 1, N + z = 2 + end do + !$omp end target + + !ERROR: ALLOCATE clause is not allowed on the TARGET DATA directive + !$omp target data map(from: b) allocate(b) + do i = 1, N + z = 2 + enddo + !$omp end target data !ERROR: SCHEDULE clause is not allowed on the PARALLEL directive !$omp parallel schedule(static) diff --git a/flang/test/Semantics/omp-resolve06.f90 b/flang/test/Semantics/omp-resolve06.f90 new file mode 100644 index 0000000000000..0909c0f54a576 --- /dev/null +++ b/flang/test/Semantics/omp-resolve06.f90 @@ -0,0 +1,54 @@ +! RUN: %S/test_errors.sh %s %t %f18 -fopenmp +use omp_lib +!2.11.4 Allocate Clause +!For any list item that is specified in the allocate +!clause on a directive, a data-sharing attribute clause +!that may create a private copy of that list item must be +!specified on the same directive. 
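! A conforming counterpart (illustrative): name the allocated list item in a
! private data-sharing clause on the same directive, e.g.
!   !$omp parallel allocate(omp_default_mem_space : x) private(x)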
+ + integer :: N = 2 + + !ERROR: The ALLOCATE clause requires that 'x' must be listed in a private data-sharing attribute clause on the same directive + !$omp parallel allocate(omp_default_mem_space : x) + do i = 1, N + x = 2 + enddo + !$omp end parallel + + !ERROR: The ALLOCATE clause requires that 'y' must be listed in a private data-sharing attribute clause on the same directive + !$omp parallel allocate(omp_default_mem_space : y) firstprivate(x) + do i = 1, N + x = 2 + enddo + !$omp end parallel + + !ERROR: The ALLOCATE clause requires that 'x' must be listed in a private data-sharing attribute clause on the same directive + !ERROR: The ALLOCATE clause requires that 'x' must be listed in a private data-sharing attribute clause on the same directive + !$omp parallel allocate(omp_default_mem_space : x) allocate(omp_default_mem_space : x) + do i = 1, N + x = 2 + enddo + !$omp end parallel + + !ERROR: The ALLOCATE clause requires that 'f' must be listed in a private data-sharing attribute clause on the same directive + !$omp parallel allocate(omp_default_mem_space : f) shared(f) + do i = 1, N + x = 2 + enddo + !$omp end parallel + + !ERROR: The ALLOCATE clause requires that 'q' must be listed in a private data-sharing attribute clause on the same directive + !$omp parallel private(t) allocate(omp_default_mem_space : z, t, q, r) firstprivate(z, r) + do i = 1, N + x = 2 + enddo + !$omp end parallel + + !ERROR: The ALLOCATE clause requires that 'b' must be listed in a private data-sharing attribute clause on the same directive + !ERROR: The ALLOCATE clause requires that 'c' must be listed in a private data-sharing attribute clause on the same directive + !$omp parallel allocate(omp_default_mem_space : a, b, c, d) firstprivate(a, d) + do i = 1, N + x = 2 + enddo + !$omp end parallel +end diff --git a/flang/test/Semantics/resolve11.f90 b/flang/test/Semantics/resolve11.f90 index 60dfcb8a10247..06c57b6e4cb89 100644 --- a/flang/test/Semantics/resolve11.f90 +++ b/flang/test/Semantics/resolve11.f90 @@ -13,13 +13,13 @@ module m2 module procedure ifoo end interface public :: operator(.foo.) - !ERROR: The accessibility of operator '.foo.' has already been specified as PUBLIC + !ERROR: The accessibility of 'OPERATOR(.foo.)' has already been specified as PUBLIC private :: operator(.foo.) interface operator(+) module procedure ifoo end interface public :: operator(+) - !ERROR: The accessibility of 'operator(+)' has already been specified as PUBLIC + !ERROR: The accessibility of 'OPERATOR(+)' has already been specified as PUBLIC private :: operator(+) , ifoo contains integer function ifoo(x, y) @@ -37,7 +37,7 @@ logical function lt(x, y) type(t), intent(in) :: x, y end function end interface - !ERROR: The accessibility of 'operator(<)' has already been specified as PRIVATE + !ERROR: The accessibility of 'OPERATOR(<)' has already been specified as PRIVATE public :: operator(<) interface operator(.gt.) logical function gt(x, y) @@ -46,6 +46,6 @@ logical function gt(x, y) end function end interface public :: operator(>) - !ERROR: The accessibility of 'operator(.gt.)' has already been specified as PUBLIC + !ERROR: The accessibility of 'OPERATOR(.GT.)' has already been specified as PUBLIC private :: operator(.gt.) 
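! As the updated expectations in this test show, diagnostics now render
! operator names through MakeOpName: defined operators become
! 'OPERATOR(.foo.)' and intrinsic spellings like 'operator(.gt.)' are
! upper-cased to 'OPERATOR(.GT.)'.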
end diff --git a/flang/test/Semantics/resolve13.f90 b/flang/test/Semantics/resolve13.f90 index a611aa09e5ccf..f6105b1ec8a87 100644 --- a/flang/test/Semantics/resolve13.f90 +++ b/flang/test/Semantics/resolve13.f90 @@ -27,24 +27,24 @@ integer function ifoo(x, y) !ERROR: 'z' not found in module 'm1' use m1, local_z => z use m1, operator(.localfoo.) => operator(.foo.) -!ERROR: Operator '.bar.' not found in module 'm1' +!ERROR: 'OPERATOR(.bar.)' not found in module 'm1' use m1, operator(.localbar.) => operator(.bar.) !ERROR: 'y' is PRIVATE in 'm1' use m1, only: y -!ERROR: Operator '.priv.' is PRIVATE in 'm1' +!ERROR: 'OPERATOR(.priv.)' is PRIVATE in 'm1' use m1, only: operator(.priv.) -!ERROR: 'operator(*)' is PRIVATE in 'm1' +!ERROR: 'OPERATOR(*)' is PRIVATE in 'm1' use m1, only: operator(*) !ERROR: 'z' not found in module 'm1' use m1, only: z !ERROR: 'z' not found in module 'm1' use m1, only: my_x => z use m1, only: operator(.foo.) -!ERROR: Operator '.bar.' not found in module 'm1' +!ERROR: 'OPERATOR(.bar.)' not found in module 'm1' use m1, only: operator(.bar.) use m1, only: operator(-) , ifoo -!ERROR: 'operator(+)' not found in module 'm1' +!ERROR: 'OPERATOR(+)' not found in module 'm1' use m1, only: operator(+) end diff --git a/flang/test/Semantics/resolve15.f90 b/flang/test/Semantics/resolve15.f90 index 3658a68e1e884..c520c5886599b 100644 --- a/flang/test/Semantics/resolve15.f90 +++ b/flang/test/Semantics/resolve15.f90 @@ -9,7 +9,9 @@ module m end interface interface operator(.foo.) !ERROR: 'var' is not a subprogram - procedure :: sub, var + procedure :: var + !ERROR: OPERATOR(.foo.) procedure 'sub' must be a function + procedure :: sub !ERROR: Procedure 'bad' not found procedure :: bad end interface diff --git a/flang/test/Semantics/resolve25.f90 b/flang/test/Semantics/resolve25.f90 index 3264194993ead..ec0a98ad6a59a 100644 --- a/flang/test/Semantics/resolve25.f90 +++ b/flang/test/Semantics/resolve25.f90 @@ -1,7 +1,7 @@ ! RUN: %S/test_errors.sh %s %t %f18 module m interface foo - subroutine s1(x) + real function s1(x) real x end !ERROR: 's2' is not a module procedure @@ -12,12 +12,12 @@ subroutine s1(x) procedure s1 end interface interface - subroutine s4(x,y) - real x,y - end subroutine - subroutine s2(x,y) - complex x,y - end subroutine + real function s4(x,y) + real, intent(in) :: x,y + end function + complex function s2(x,y) + complex, intent(in) :: x,y + end function end interface generic :: bar => s4 generic :: bar => s2 @@ -26,7 +26,7 @@ subroutine s2(x,y) generic :: operator(.foo.)=> s4 generic :: operator(.foo.)=> s2 - !ERROR: Procedure 's4' is already specified in generic operator '.foo.' + !ERROR: Procedure 's4' is already specified in generic 'OPERATOR(.foo.)' generic :: operator(.foo.)=> s4 end module @@ -37,7 +37,7 @@ integer function f(x, y) end function end interface generic :: operator(+)=> f - !ERROR: Procedure 'f' is already specified in generic 'operator(+)' + !ERROR: Procedure 'f' is already specified in generic 'OPERATOR(+)' generic :: operator(+)=> f end @@ -46,11 +46,11 @@ module m3 procedure f end interface interface operator(>=) - !ERROR: Procedure 'f' is already specified in generic 'operator(.ge.)' + !ERROR: Procedure 'f' is already specified in generic 'OPERATOR(.GE.)' procedure f end interface generic :: operator(>) => f - !ERROR: Procedure 'f' is already specified in generic 'operator(>)' + !ERROR: Procedure 'f' is already specified in generic 'OPERATOR(>)' generic :: operator(.gt.) 
=> f contains logical function f(x, y) result(result) diff --git a/flang/test/Semantics/resolve49.f90 b/flang/test/Semantics/resolve49.f90 index b0bca059c0412..5ead0784603b1 100644 --- a/flang/test/Semantics/resolve49.f90 +++ b/flang/test/Semantics/resolve49.f90 @@ -17,6 +17,7 @@ program p2 end type character :: a(10) character :: b(5) + character :: c(0) integer :: n n = 3 b = a(n:7) @@ -26,6 +27,7 @@ program p2 a(n+3:) = b a(:n+2) = b n = iachar(1_'ABCDEFGHIJ'(1:1)) + c = 'ABCDEFGHIJ'(1:0) end ! Test pointer assignment with bounds diff --git a/flang/test/Semantics/resolve53.f90 b/flang/test/Semantics/resolve53.f90 index acb27c8575b7d..1487873bd86b3 100644 --- a/flang/test/Semantics/resolve53.f90 +++ b/flang/test/Semantics/resolve53.f90 @@ -210,7 +210,7 @@ module m14 module procedure f1 module procedure f2 end interface - !ERROR: Generic 'operator(+)' may not have specific procedures 'f1' and 'f3' as their interfaces are not distinguishable + !ERROR: Generic 'OPERATOR(+)' may not have specific procedures 'f1' and 'f3' as their interfaces are not distinguishable interface operator(+) module procedure f1 module procedure f3 @@ -219,7 +219,7 @@ module m14 module procedure f1 module procedure f2 end interface - !ERROR: Generic operator '.bar.' may not have specific procedures 'f1' and 'f3' as their interfaces are not distinguishable + !ERROR: Generic 'OPERATOR(.bar.)' may not have specific procedures 'f1' and 'f3' as their interfaces are not distinguishable interface operator(.bar.) module procedure f1 module procedure f3 @@ -332,7 +332,6 @@ subroutine s9(x) end subroutine end - ! Check that specifics for type-bound generics can be distinguished module m16 type :: t @@ -441,20 +440,20 @@ module m19 module procedure f1 module procedure f2 end interface - !ERROR: Generic operator '.bar.' may not have specific procedures 'f2' and 'f3' as their interfaces are not distinguishable + !ERROR: Generic 'OPERATOR(.bar.)' may not have specific procedures 'f2' and 'f3' as their interfaces are not distinguishable interface operator(.bar.) module procedure f2 module procedure f3 end interface contains integer function f1(i) - integer :: i + integer, intent(in) :: i end integer function f2(i, j) - integer :: i, j + integer, value :: i, j end integer function f3(i, j) - integer :: i, j + integer, intent(in) :: i, j end end @@ -472,11 +471,11 @@ real function f(x) subroutine s1() use m20 interface operator(.not.) - !ERROR: Procedure 'f' is already specified in generic 'operator(.not.)' + !ERROR: Procedure 'f' is already specified in generic 'OPERATOR(.NOT.)' procedure f end interface interface operator(+) - !ERROR: Procedure 'f' is already specified in generic 'operator(+)' + !ERROR: Procedure 'f' is already specified in generic 'OPERATOR(+)' procedure f end interface end subroutine s1 diff --git a/flang/test/Semantics/resolve95.f90 b/flang/test/Semantics/resolve95.f90 new file mode 100644 index 0000000000000..78ff09d88d324 --- /dev/null +++ b/flang/test/Semantics/resolve95.f90 @@ -0,0 +1,15 @@ +! RUN: %S/test_errors.sh %s %t %f18 +! 
Test SELECT TYPE and ASSOCIATE errors: C1103 + +subroutine s1() + class(*),allocatable :: calc[:] + integer,save :: icoa[*] + !ERROR: Selector must not be a coindexed object + associate(sel=>icoa[2]) + end associate + icoa = 2 + allocate(integer::calc[*]) + !ERROR: Selector must not be a coindexed object + select type(sel=>calc[2]) + end select +end subroutine diff --git a/flang/test/Semantics/resolve96.f90 b/flang/test/Semantics/resolve96.f90 new file mode 100644 index 0000000000000..b026e042397ec --- /dev/null +++ b/flang/test/Semantics/resolve96.f90 @@ -0,0 +1,62 @@ +! RUN: %S/test_errors.sh %s %t %f18 + +! Check distinguishability for specific procedures of defined operators and +! assignment. These are different from names because a normal generic +! is invoked the same way as a type-bound generic. +! E.g. for a generic name like 'foo', the generic name is invoked as 'foo(x, y)' +! while the type-bound generic is invoked as 'x%foo(y)'. +! But for 'operator(.foo.)', it is 'x .foo. y' in either case. +! So to check the specifics of 'operator(.foo.)' we have to consider all +! definitions of it visible in the current scope. + +! One operator(.foo.) comes from interface-stmt, the other is type-bound. +module m1 + type :: t1 + contains + procedure, pass :: p => s1 + generic :: operator(.foo.) => p + end type + type :: t2 + end type + !ERROR: Generic 'OPERATOR(.foo.)' may not have specific procedures 's2' and 't1%p' as their interfaces are not distinguishable + interface operator(.foo.) + procedure :: s2 + end interface +contains + integer function s1(x1, x2) + class(t1), intent(in) :: x1 + class(t2), intent(in) :: x2 + end + integer function s2(x1, x2) + class(t1), intent(in) :: x1 + class(t2), intent(in) :: x2 + end +end module + +! assignment(=) as type-bound generic in each type +module m2 + type :: t1 + integer :: n + contains + procedure, pass(x1) :: p1 => s1 + !ERROR: Generic 'assignment(=)' may not have specific procedures 't1%p1' and 't2%p2' as their interfaces are not distinguishable + generic :: assignment(=) => p1 + end type + type :: t2 + integer :: n + contains + procedure, pass(x2) :: p2 => s2 + generic :: assignment(=) => p2 + end type +contains + subroutine s1(x1, x2) + class(t1), intent(out) :: x1 + class(t2), intent(in) :: x2 + x1%n = x2%n + 1 + end subroutine + subroutine s2(x1, x2) + class(t1), intent(out) :: x1 + class(t2), intent(in) :: x2 + x1%n = x2%n + 2 + end subroutine +end module diff --git a/flang/test/Semantics/test_errors.sh b/flang/test/Semantics/test_errors.sh index 15383475c5051..5411482e4d3b6 100755 --- a/flang/test/Semantics/test_errors.sh +++ b/flang/test/Semantics/test_errors.sh @@ -2,7 +2,7 @@ # Compile a source file and check errors against those listed in the file. # Change the compiler by setting the F18 environment variable. -F18_OPTIONS="-fdebug-resolve-names -fparse-only" +F18_OPTIONS="-fparse-only" srcdir=$(dirname $0) source $srcdir/common.sh [[ ! -f $src ]] && die "File not found: $src" diff --git a/flang/test/lit.cfg.py b/flang/test/lit.cfg.py index 25c63890832fe..21d8530434312 100644 --- a/flang/test/lit.cfg.py +++ b/flang/test/lit.cfg.py @@ -25,7 +25,7 @@ config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell) # suffixes: A list of file extensions to treat as test files.
-config.suffixes = ['.f', '.F', '.ff', '.FOR', '.for', '.f77', '.f90', '.F90', +config.suffixes = ['.c', '.cpp', '.f', '.F', '.ff', '.FOR', '.for', '.f77', '.f90', '.F90', '.ff90', '.f95', '.F95', '.ff95', '.fpp', '.FPP', '.cuf', '.CUF', '.f18', '.F18', '.fir'] @@ -38,6 +38,13 @@ # directories. config.excludes = ['Inputs', 'CMakeLists.txt', 'README.txt', 'LICENSE.txt'] +# If the new Flang driver is enabled, add the corresponding feature to +# config. Otherwise, exclude the corresponding test directory. +if config.include_flang_new_driver_test: + config.available_features.add('new-flang-driver') +else: + config.excludes.append('Flang-Driver') + # test_source_root: The root path where tests are located. config.test_source_root = os.path.dirname(__file__) @@ -63,6 +70,9 @@ unresolved='fatal') ] +if config.include_flang_new_driver_test: + tools.append(ToolSubst('%flang-new', command=FindTool('flang-new'), unresolved='fatal')) + if config.flang_standalone_build: llvm_config.add_tool_substitutions(tools, [config.flang_llvm_tools_dir]) else: diff --git a/flang/test/lit.site.cfg.py.in b/flang/test/lit.site.cfg.py.in index 10ec132081544..7a59280283813 100644 --- a/flang/test/lit.site.cfg.py.in +++ b/flang/test/lit.site.cfg.py.in @@ -11,6 +11,11 @@ config.flang_llvm_tools_dir = "@CMAKE_BINARY_DIR@/bin" config.python_executable = "@PYTHON_EXECUTABLE@" config.flang_standalone_build = @FLANG_STANDALONE_BUILD@ +# Control the regression test for flang-new driver +import lit.util +config.include_flang_new_driver_test = \ + lit.util.pythonize_bool("@FLANG_BUILD_NEW_DRIVER@") + # Support substitution of the tools_dir with user parameters. This is # used when we can't determine the tool dir at configuration time. try: diff --git a/flang/tools/CMakeLists.txt b/flang/tools/CMakeLists.txt index b973127d34435..0fbf828253ef7 100644 --- a/flang/tools/CMakeLists.txt +++ b/flang/tools/CMakeLists.txt @@ -7,6 +7,9 @@ #===------------------------------------------------------------------------===# add_subdirectory(f18) +if(FLANG_BUILD_NEW_DRIVER) + add_subdirectory(flang-driver) +endif() if(LINK_WITH_FIR) add_subdirectory(tco) endif() diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt index b92733d8374e7..64ccf12505fea 100644 --- a/flang/tools/f18/CMakeLists.txt +++ b/flang/tools/f18/CMakeLists.txt @@ -84,4 +84,4 @@ set(FLANG_INTRINSIC_MODULES_DIR ${CMAKE_INSTALL_PREFIX}/include/flang) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/flang.sh.in ${FLANG_BINARY_DIR}/bin/flang-install.sh @ONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/f18_version.h.in ${CMAKE_CURRENT_BINARY_DIR}/f18_version.h @ONLY) -install(PROGRAMS ${FLANG_BINARY_DIR}/bin/flang-install.sh DESTINATION bin RENAME flang PERMISSIONS OWNER_EXECUTE OWNER_READ OWNER_WRITE) +install(PROGRAMS ${FLANG_BINARY_DIR}/bin/flang-install.sh DESTINATION bin RENAME flang) diff --git a/flang/tools/f18/f18.cpp b/flang/tools/f18/f18.cpp index a33a167686e49..54a905133db76 100644 --- a/flang/tools/f18/f18.cpp +++ b/flang/tools/f18/f18.cpp @@ -251,7 +251,7 @@ std::string CompileFortran(std::string path, Fortran::parser::Options options, driver.dumpSymbols || driver.dumpUnparseWithSymbols || driver.getDefinition || driver.getSymbolsSources) { Fortran::semantics::Semantics semantics{semanticsContext, parseTree, - parsing.cooked(), driver.debugModuleWriter}; + parsing.cooked().AsCharBlock(), driver.debugModuleWriter}; semantics.Perform(); semantics.EmitMessages(llvm::errs()); if (driver.dumpSymbols) { diff --git a/flang/tools/flang-driver/CMakeLists.txt 
b/flang/tools/flang-driver/CMakeLists.txt new file mode 100644 index 0000000000000..d7bab277287f5 --- /dev/null +++ b/flang/tools/flang-driver/CMakeLists.txt @@ -0,0 +1,25 @@ +# Infrastructure to build flang driver entry point. Flang driver depends on +# LLVM libraries. + +# Set your project compile flags. +link_directories(${LLVM_LIBRARY_DIR}) + +add_flang_tool(flang-new + driver.cpp + fc1_main.cpp +) + +# Link against LLVM and Clang libraries +target_link_libraries(flang-new + PRIVATE + ${LLVM_COMMON_LIBS} + flangFrontend + flangFrontendTool + clangDriver + clangBasic + LLVMSupport + LLVMTarget + LLVMOption +) + +install(TARGETS flang-new DESTINATION bin) diff --git a/flang/tools/flang-driver/driver.cpp b/flang/tools/flang-driver/driver.cpp new file mode 100644 index 0000000000000..9d04994d98435 --- /dev/null +++ b/flang/tools/flang-driver/driver.cpp @@ -0,0 +1,129 @@ +//===-- driver.cpp - Flang Driver -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the entry point to the flang driver; it is a thin wrapper +// for functionality in the flang Driver library. +// +//===----------------------------------------------------------------------===// +#include "clang/Driver/Driver.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/DiagnosticIDs.h" +#include "clang/Basic/DiagnosticOptions.h" +#include "clang/Driver/Compilation.h" +#include "clang/Frontend/TextDiagnosticPrinter.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/IntrusiveRefCntPtr.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Support/Host.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/VirtualFileSystem.h" + +// Main frontend method. Lives inside fc1_main.cpp +extern int fc1_main(llvm::ArrayRef<const char *> argv, const char *argv0); + +std::string GetExecutablePath(const char *argv0) { + // This just needs to be some symbol in the binary + void *p = (void *)(intptr_t)GetExecutablePath; + return llvm::sys::fs::getMainExecutable(argv0, p); + } + +// This lets us create the DiagnosticsEngine with a properly-filled-out +// DiagnosticOptions instance +static clang::DiagnosticOptions *CreateAndPopulateDiagOpts( + llvm::ArrayRef<const char *> argv) { + auto *diagOpts = new clang::DiagnosticOptions; + return diagOpts; +} + +static int ExecuteFC1Tool(llvm::SmallVectorImpl<const char *> &argV) { + llvm::StringRef tool = argV[1]; + if (tool == "-fc1") + return fc1_main(makeArrayRef(argV).slice(2), argV[0]); + + // Reject unknown tools. + // ATM it only supports fc1. Any fc1[*] is rejected. + llvm::errs() << "error: unknown integrated tool '" << tool << "'. " + << "Valid tools include '-fc1'.\n"; + return 1; +} + +int main(int argc_, const char **argv_) { + + // Initialize variables to call the driver + llvm::InitLLVM x(argc_, argv_); + llvm::SmallVector<const char *, 256> argv(argv_, argv_ + argc_); + + clang::driver::ParsedClangName targetandMode("flang", "--driver-mode=flang"); + std::string driverPath = GetExecutablePath(argv[0]); + + // Check if flang-new is in the frontend mode + auto firstArg = std::find_if( + argv.begin() + 1, argv.end(), [](const char *a) { return a != nullptr; }); + if (firstArg != argv.end()) { + if (llvm::StringRef(argv[1]).startswith("-cc1")) { + llvm::errs() << "error: unknown integrated tool '" << argv[1] << "'. 
" << "Valid tools include '-fc1'.\n"; + return 1; + } + // Call flang-new frontend + if (llvm::StringRef(argv[1]).startswith("-fc1")) { + return ExecuteFC1Tool(argv); + } + } + + // Not in the frontend mode - continue in the compiler driver mode. + + // Create DiagnosticsEngine for the compiler driver + llvm::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagOpts = + CreateAndPopulateDiagOpts(argv); + llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagID( + new clang::DiagnosticIDs()); + clang::TextDiagnosticPrinter *diagClient = + new clang::TextDiagnosticPrinter(llvm::errs(), &*diagOpts); + clang::DiagnosticsEngine diags(diagID, &*diagOpts, diagClient); + + // Prepare the driver + clang::driver::Driver theDriver(driverPath, + llvm::sys::getDefaultTargetTriple(), diags, "flang LLVM compiler"); + theDriver.setTargetAndMode(targetandMode); + std::unique_ptr<clang::driver::Compilation> c( + theDriver.BuildCompilation(argv)); + llvm::SmallVector<std::pair<int, const clang::driver::Command *>, 4> + failingCommands; + + // Run the driver + int res = 1; + bool isCrash = false; + res = theDriver.ExecuteCompilation(*c, failingCommands); + + for (const auto &p : failingCommands) { + int CommandRes = p.first; + const clang::driver::Command *failingCommand = p.second; + if (!res) + res = CommandRes; + + // If result status is < 0 (e.g. when sys::ExecuteAndWait returns -1), + // then the driver command signalled an error. On Windows, abort will + // return an exit code of 3. In these cases, generate additional diagnostic + // information if possible. + isCrash = CommandRes < 0; +#ifdef _WIN32 + isCrash |= CommandRes == 3; +#endif + if (isCrash) { + theDriver.generateCompilationDiagnostics(*c, *failingCommand); + break; + } + } + + diags.getClient()->finish(); + + // If we have multiple failing commands, we return the result of the first + // failing command. + return res; +} diff --git a/flang/tools/flang-driver/fc1_main.cpp b/flang/tools/flang-driver/fc1_main.cpp new file mode 100644 index 0000000000000..bb69517edde28 --- /dev/null +++ b/flang/tools/flang-driver/fc1_main.cpp @@ -0,0 +1,56 @@ +//===-- fc1_main.cpp - Flang FC1 Compiler Frontend ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the entry point to the flang -fc1 functionality, which implements the +// core compiler functionality along with a number of additional tools for +// demonstration and testing purposes.
+// +//===----------------------------------------------------------------------===// + +#include "flang/Frontend/CompilerInstance.h" +#include "flang/Frontend/CompilerInvocation.h" +#include "flang/FrontendTool/Utils.h" +#include "clang/Driver/DriverDiagnostic.h" +#include "clang/Frontend/TextDiagnosticBuffer.h" +#include "llvm/Option/Arg.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/OptTable.h" + +#include <cstdio> + +using namespace Fortran::frontend; + +int fc1_main(llvm::ArrayRef<const char *> argv, const char *argv0) { + // Create CompilerInstance + std::unique_ptr<CompilerInstance> flang(new CompilerInstance()); + + // Create DiagnosticsEngine for the frontend driver + flang->CreateDiagnostics(); + if (!flang->HasDiagnostics()) + return 1; + + // Create CompilerInvocation - use a dedicated instance of DiagnosticsEngine + // for parsing the arguments + llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagID( + new clang::DiagnosticIDs()); + llvm::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagOpts = + new clang::DiagnosticOptions(); + clang::TextDiagnosticBuffer *diagsBuffer = new clang::TextDiagnosticBuffer; + clang::DiagnosticsEngine diags(diagID, &*diagOpts, diagsBuffer); + bool success = + CompilerInvocation::CreateFromArgs(flang->GetInvocation(), argv, diags); + + diagsBuffer->FlushDiagnostics(flang->getDiagnostics()); + if (!success) + return 1; + + // Execute the frontend actions. + success = ExecuteCompilerInvocation(flang.get()); + + return !success; +} diff --git a/flang/unittests/CMakeLists.txt b/flang/unittests/CMakeLists.txt index a30f0edaec615..c88e9fc660f16 100644 --- a/flang/unittests/CMakeLists.txt +++ b/flang/unittests/CMakeLists.txt @@ -22,3 +22,7 @@ add_subdirectory(Decimal) add_subdirectory(Evaluate) add_subdirectory(Runtime) add_subdirectory(Lower) + +if (FLANG_BUILD_NEW_DRIVER) + add_subdirectory(Frontend) +endif() diff --git a/flang/unittests/Evaluate/intrinsics.cpp b/flang/unittests/Evaluate/intrinsics.cpp index 4f2a21dfe6048..52507b8ef8b67 100644 --- a/flang/unittests/Evaluate/intrinsics.cpp +++ b/flang/unittests/Evaluate/intrinsics.cpp @@ -26,10 +26,10 @@ class CookedStrings { } void Marshal() { cooked_.Marshal(allSources_); } parser::CharBlock operator()(const std::string &s) { - return {cooked_.data().data() + offsets_[s], s.size()}; + return {cooked_.AsCharBlock().begin() + offsets_[s], s.size()}; } parser::ContextualMessages Messages(parser::Messages &buffer) { - return parser::ContextualMessages{cooked_.data(), &buffer}; + return parser::ContextualMessages{cooked_.AsCharBlock(), &buffer}; } void Emit(llvm::raw_ostream &o, const parser::Messages &messages) { messages.Emit(o, allCookedSources_); diff --git a/flang/unittests/Frontend/CMakeLists.txt b/flang/unittests/Frontend/CMakeLists.txt new file mode 100644 index 0000000000000..dd5cbedb0f91d --- /dev/null +++ b/flang/unittests/Frontend/CMakeLists.txt @@ -0,0 +1,10 @@ +add_flang_unittest(FlangFrontendTests + CompilerInstanceTest.cpp +) + +target_link_libraries(FlangFrontendTests + PRIVATE + LLVMSupport + clangBasic + flangFrontend + flangFrontendTool) diff --git a/flang/unittests/Frontend/CompilerInstanceTest.cpp b/flang/unittests/Frontend/CompilerInstanceTest.cpp new file mode 100644 index 0000000000000..a971c4c2b6c97 --- /dev/null +++ b/flang/unittests/Frontend/CompilerInstanceTest.cpp @@ -0,0 +1,52 @@ +//===- unittests/Frontend/CompilerInstanceTest.cpp - CI tests -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
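For context, the dispatch performed by main above follows a common LLVM multi-entry pattern: argv[1] selects an integrated tool ("-fc1" hands the remaining arguments to the frontend entry point), and anything else falls through to the compilation-driver path. A minimal self-contained sketch of that pattern (function names here are illustrative assumptions, not the actual flang API):

#include <cstring>
#include <cstdio>

// Hypothetical frontend entry point, standing in for fc1_main().
static int frontendMain(int argc, const char **argv) {
  std::printf("frontend invoked with %d args\n", argc);
  return 0;
}

int main(int argc, const char **argv) {
  // "tool -fc1 <args...>" runs the integrated frontend directly;
  // anything else falls through to the compiler-driver path.
  if (argc > 1 && std::strcmp(argv[1], "-fc1") == 0)
    return frontendMain(argc - 2, argv + 2);

  // ... compiler-driver path: build and execute a compilation ...
  std::printf("driver mode\n");
  return 0;
}

Invoked as `tool -fc1 foo.f90`, the sketch hands `foo.f90` straight to the frontend entry point, mirroring ExecuteFC1Tool above.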
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang/Frontend/CompilerInstance.h" +#include "gtest/gtest.h" +#include "flang/Frontend/CompilerInvocation.h" +#include "clang/Basic/DiagnosticOptions.h" +#include "clang/Driver/Options.h" +#include "clang/Frontend/TextDiagnosticPrinter.h" +#include "llvm/Support/raw_ostream.h" + +#include +using namespace llvm; +using namespace Fortran::frontend; + +namespace { + +TEST(CompilerInstance, AllowDiagnosticLogWithUnownedDiagnosticConsumer) { + // 1. Set-up a basic DiagnosticConsumer + std::string diagnosticOutput; + llvm::raw_string_ostream diagnosticsOS(diagnosticOutput); + auto diagPrinter = std::make_unique<clang::TextDiagnosticPrinter>( + diagnosticsOS, new clang::DiagnosticOptions()); + + // 2. Create a CompilerInstance (to manage a DiagnosticEngine) + CompilerInstance compInst; + + // 3. Set-up DiagnosticOptions + auto diagOpts = new clang::DiagnosticOptions(); + // Tell the diagnostics engine to emit the diagnostic log to STDERR. This + // ensures that a chained diagnostic consumer is created so that the test can + // exercise the unowned diagnostic consumer in a chained consumer. + diagOpts->DiagnosticLogFile = "-"; + + // 4. Create a DiagnosticEngine with an unowned consumer + IntrusiveRefCntPtr<clang::DiagnosticsEngine> diags = + compInst.CreateDiagnostics(diagOpts, diagPrinter.get(), + /*ShouldOwnClient=*/false); + + // 5. Report a diagnostic + diags->Report(clang::diag::err_expected) << "no crash"; + + // 6. Verify that the reported diagnostic wasn't lost and did end up in the + // output stream + ASSERT_EQ(diagnosticsOS.str(), "error: expected no crash\n"); +} +} // namespace diff --git a/flang/unittests/Runtime/hello.cpp b/flang/unittests/Runtime/hello.cpp index c38aedf4f6549..c1daccae383ac 100644 --- a/flang/unittests/Runtime/hello.cpp +++ b/flang/unittests/Runtime/hello.cpp @@ -481,6 +481,7 @@ int main() { realInTest("(-1P,F18.0)", " 125", 0x4093880000000000); // 1250 realInTest("(1P,F18.0)", " 125", 0x4029000000000000); // 12.5 realInTest("(BZ,F18.0)", " 125 ", 0x4093880000000000); // 1250 + realInTest("(BZ,F18.0)", " 125 . 
e +1 ", 0x42a6bcc41e900000); // 1.25e13 realInTest("(DC,F18.0)", " 12,5", 0x4029000000000000); listInputTest(); diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 34d07c24505d9..e654d594bce0b 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -64,6 +64,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.frexp libc.src.math.frexpf libc.src.math.frexpl + libc.src.math.hypotf libc.src.math.logb libc.src.math.logbf libc.src.math.logbl diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td index 33ae64c0a08cb..40eec8f55c1c6 100644 --- a/libc/config/linux/api.td +++ b/libc/config/linux/api.td @@ -191,6 +191,7 @@ def MathAPI : PublicAPI<"math.h"> { "frexp", "frexpf", "frexpl", + "hypotf", "logb", "logbf", "logbl", diff --git a/libc/config/linux/platfrom_defs.h.inc b/libc/config/linux/platform_defs.h.inc similarity index 100% rename from libc/config/linux/platfrom_defs.h.inc rename to libc/config/linux/platform_defs.h.inc diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 6aca5e400d68a..a67e4084dd5e4 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -97,6 +97,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.frexp libc.src.math.frexpf libc.src.math.frexpl + libc.src.math.hypotf libc.src.math.logb libc.src.math.logbf libc.src.math.logbl diff --git a/libc/spec/posix.td b/libc/spec/posix.td index c20cbefe42ce0..1bf64f082c62b 100644 --- a/libc/spec/posix.td +++ b/libc/spec/posix.td @@ -228,7 +228,9 @@ def POSIX : StandardSpec<"POSIX"> { FunctionSpec< "strtok_r", RetValSpec, - [ArgSpec, ArgSpec] + [ArgSpec, + ArgSpec, + ArgSpec] >, ] >; diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 77fa971adc614..61b3dcb24ef06 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -296,6 +296,8 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"frexpf", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"frexpl", RetValSpec, [ArgSpec, ArgSpec]>, + FunctionSpec<"hypotf", RetValSpec, [ArgSpec, ArgSpec]>, + FunctionSpec<"logb", RetValSpec, [ArgSpec]>, FunctionSpec<"logbf", RetValSpec, [ArgSpec]>, FunctionSpec<"logbl", RetValSpec, [ArgSpec]>, diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt index c1ee46cd62cf6..e9f9579b6d0fe 100644 --- a/libc/src/__support/CMakeLists.txt +++ b/libc/src/__support/CMakeLists.txt @@ -2,8 +2,8 @@ add_gen_header( common DEF_FILE common.h.def PARAMS - platform_defs=../../config/${LIBC_TARGET_OS}/platfrom_defs.h.inc + platform_defs=../../config/${LIBC_TARGET_OS}/platform_defs.h.inc GEN_HDR common.h DATA_FILES - ../../config/${LIBC_TARGET_OS}/platfrom_defs.h.inc + ../../config/${LIBC_TARGET_OS}/platform_defs.h.inc ) diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 3b4f821726576..633a1cdddc540 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -593,3 +593,15 @@ add_entrypoint_object( COMPILE_OPTIONS -O2 ) + +add_entrypoint_object( + hypotf + SRCS + hypotf.cpp + HDRS + hypotf.h + DEPENDS + libc.utils.FPUtil.fputil + COMPILE_OPTIONS + -O2 +) diff --git a/libc/src/math/hypotf.cpp b/libc/src/math/hypotf.cpp new file mode 100644 index 0000000000000..10ebbb1b9ec9d --- /dev/null +++ b/libc/src/math/hypotf.cpp @@ -0,0 +1,222 @@ +//===-- Implementation of hypotf function ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "src/__support/common.h" +#include "utils/FPUtil/BasicOperations.h" +#include "utils/FPUtil/FPBits.h" + +namespace __llvm_libc { + +using namespace fputil; + +uint32_t findLeadingOne(uint32_t mant, int &shift_length) { + shift_length = 0; + constexpr int nsteps = 5; + constexpr uint32_t bounds[nsteps] = {1 << 16, 1 << 8, 1 << 4, 1 << 2, 1 << 1}; + constexpr int shifts[nsteps] = {16, 8, 4, 2, 1}; + for (int i = 0; i < nsteps; ++i) { + if (mant >= bounds[i]) { + shift_length += shifts[i]; + mant >>= shifts[i]; + } + } + return 1U << shift_length; +} + +// Correctly rounded IEEE 754 HYPOT(x, y) with round to nearest, ties to even. +// +// Algorithm: +// - Let a = max(|x|, |y|), b = min(|x|, |y|), then we have that: +// a <= sqrt(a^2 + b^2) <= min(a + b, a*sqrt(2)) +// 1. So if b < eps(a)/2, then HYPOT(x, y) = a. +// +// - Moreover, the exponent part of HYPOT(x, y) is either the same or 1 more +// than the exponent part of a. +// +// 2. For the remaining cases, we will use the digit-by-digit (shift-and-add) +// algorithm to compute SQRT(Z): +// +// - For Y = y0.y1...yn... = SQRT(Z), +// let Y(n) = y0.y1...yn be the first n fractional digits of Y. +// +// - The nth scaled residual R(n) is defined to be: +// R(n) = 2^n * (Z - Y(n)^2) +// +// - Since Y(n) = Y(n - 1) + yn * 2^(-n), the scaled residual +// satisfies the following recurrence formula: +// R(n) = 2*R(n - 1) - yn*(2*Y(n - 1) + 2^(-n)), +// with the initial conditions: +// Y(0) = y0, and R(0) = Z - y0. +// +// - So the nth fractional digit of Y = SQRT(Z) can be decided by: +// yn = 1 if 2*R(n - 1) >= 2*Y(n - 1) + 2^(-n), +// 0 otherwise. +// +// 3. Precision analysis: +// +// - Notice that in the decision function: +// 2*R(n - 1) >= 2*Y(n - 1) + 2^(-n), +// the right hand side only uses up to the 2^(-n)-bit, and both sides are +// non-negative, so R(n - 1) can be truncated at the 2^(-(n + 1))-bit, so +// that 2*R(n - 1) is corrected up to the 2^(-n)-bit. +// +// - Thus, in order to round SQRT(a^2 + b^2) correctly up to n-fractional +// bits, we need to perform the summation (a^2 + b^2) correctly up to (2n + +// 2)-fractional bits, and the remaining bits are sticky bits (i.e. we only +// care if they are 0 or > 0), and the comparisons, additions/subtractions +// can be done in n-fractional bits precision. +// +// - For single precision (float), we can use uint64_t to store the sum a^2 + +// b^2 exact up to (2n + 2)-fractional bits. +// +// - Then we can feed this sum into the digit-by-digit algorithm for SQRT(Z) +// described above. +// +// +// Special cases: +// - HYPOT(x, y) is +Inf if x or y is +Inf or -Inf; else +// - HYPOT(x, y) is NaN if x or y is NaN. 
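The recurrence above maps directly onto integer shift-and-add. The following standalone sketch (illustrative only: `fixedPointSqrt`, the 16.16 fixed-point format, and the [1, 4) input range are assumptions of this example, not code from the commit) computes SQRT(Z) digit by digit with exactly the decision rule yn = 1 iff 2*R(n-1) >= 2*Y(n-1) + 2^(-n):

#include <cstdint>
#include <cstdio>

// Computes Y ~= sqrt(Z) for a 16.16 fixed-point Z in [1, 4), using the
// scaled-residual recurrence documented above:
//   R(0) = Z - y0,  R(n) = 2*R(n-1) - yn*(2*Y(n-1) + 2^(-n)).
uint32_t fixedPointSqrt(uint32_t z) {
  const uint32_t kOne = 1u << 16; // y0 = 1 in 16.16 fixed point
  uint32_t y = kOne;              // Y(0)
  uint32_t r = z - kOne;          // R(0) = Z - y0 (scaled residual)
  for (uint32_t bit = kOne >> 1; bit; bit >>= 1) {
    // Decide digit yn: is 2*R(n-1) >= 2*Y(n-1) + 2^(-n)?
    uint64_t twoR = 2 * static_cast<uint64_t>(r);
    uint64_t t = 2 * static_cast<uint64_t>(y) + bit;
    if (twoR >= t) {
      r = static_cast<uint32_t>(twoR - t); // yn = 1 branch of the recurrence
      y += bit;                            // append digit to Y(n)
    } else {
      r = static_cast<uint32_t>(twoR);     // yn = 0: R(n) = 2*R(n-1)
    }
  }
  return y; // truncated 16.16 approximation of sqrt(z)
}

int main() {
  // sqrt(2) ~ 1.41421; 2.0 in 16.16 fixed point is 2u << 16.
  std::printf("%f\n", fixedPointSqrt(2u << 16) / 65536.0);
  return 0;
}

For example, fixedPointSqrt(2u << 16) returns 0x16A09, i.e. 92681/65536 ≈ 1.41421, the correctly truncated square root.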
+// +float LLVM_LIBC_ENTRYPOINT(hypotf)(float x, float y) { + FPBits<float> x_bits(x), y_bits(y); + + if (x_bits.isInf() || y_bits.isInf()) { + return FPBits<float>::inf(); + } + if (x_bits.isNaN()) { + return x; + } + if (y_bits.isNaN()) { + return y; + } + + uint16_t a_exp, b_exp, out_exp; + uint32_t a_mant, b_mant; + uint64_t a_mant_sq, b_mant_sq; + bool sticky_bits; + + if ((x_bits.exponent >= y_bits.exponent + MantissaWidth<float>::value + 2) || + (y == 0)) { + return abs(x); + } else if ((y_bits.exponent >= + x_bits.exponent + MantissaWidth<float>::value + 2) || + (x == 0)) { + y_bits.sign = 0; + return abs(y); + } + + if (x >= y) { + a_exp = x_bits.exponent; + a_mant = x_bits.mantissa; + b_exp = y_bits.exponent; + b_mant = y_bits.mantissa; + } else { + a_exp = y_bits.exponent; + a_mant = y_bits.mantissa; + b_exp = x_bits.exponent; + b_mant = x_bits.mantissa; + } + + out_exp = a_exp; + + // Add an extra bit to simplify the final rounding bit computation. + constexpr uint32_t one = 1U << (MantissaWidth<float>::value + 1); + + a_mant <<= 1; + b_mant <<= 1; + + uint32_t leading_one; + int y_mant_width; + if (a_exp != 0) { + leading_one = one; + a_mant |= one; + y_mant_width = MantissaWidth<float>::value + 1; + } else { + leading_one = findLeadingOne(a_mant, y_mant_width); + } + + if (b_exp != 0) { + b_mant |= one; + } + + a_mant_sq = static_cast<uint64_t>(a_mant) * a_mant; + b_mant_sq = static_cast<uint64_t>(b_mant) * b_mant; + + // At this point, a_exp >= b_exp > a_exp - 25, so in order to line up + // a_mant_sq and b_mant_sq, we need to shift b_mant_sq to the right. + // But before that, remember to store the losing bits to sticky. + // The shift length is for a^2 and b^2, so it's double the exponent + // difference between a and b. + uint16_t shift_length = 2 * (a_exp - b_exp); + sticky_bits = ((b_mant_sq & ((1ULL << shift_length) - 1)) != 0); + b_mant_sq >>= shift_length; + + uint64_t sum = a_mant_sq + b_mant_sq; + if (sum >= (1ULL << (2 * y_mant_width + 2))) { + // a^2 + b^2 >= 4 * leading_one^2, so we will need an extra bit to the left. + if (leading_one == one) { + // For normal result, we discard the last 2 bits of the sum and increase + // the exponent. + sticky_bits = sticky_bits || ((sum & 0x3U) != 0); + sum >>= 2; + ++out_exp; + if (out_exp >= FPBits<float>::maxExponent) { + return FPBits<float>::inf(); + } + } else { + // For denormal result, we simply move the leading bit of the result to + // the left by 1. + leading_one <<= 1; + ++y_mant_width; + } + } + + uint32_t Y = leading_one; + uint32_t R = static_cast<uint32_t>(sum >> y_mant_width) - leading_one; + uint32_t tailBits = static_cast<uint32_t>(sum) & (leading_one - 1); + + for (uint32_t current_bit = leading_one >> 1; current_bit; + current_bit >>= 1) { + R = (R << 1) + ((tailBits & current_bit) ? 1 : 0); + uint32_t tmp = (Y << 1) + current_bit; // 2*y(n - 1) + 2^(-n) + if (R >= tmp) { + R -= tmp; + Y += current_bit; + } + } + + bool round_bit = Y & 1U; + bool lsb = Y & 2U; + + if (Y >= one) { + Y -= one; + + if (out_exp == 0) { + out_exp = 1; + } + } + + Y >>= 1; + + // Round to the nearest, tie to even.
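+  // round_bit is the bit just below the final mantissa LSB; sticky_bits and
+  // R record whether anything nonzero lies below round_bit. Round up when the
+  // value is above the halfway point (round bit set with something below it),
+  // or exactly halfway with an odd LSB (ties-to-even).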
+ if (round_bit && (lsb || sticky_bits || (R != 0))) { + ++Y; + } + + if (Y >= (one >> 1)) { + Y -= one >> 1; + ++out_exp; + if (out_exp >= FPBits<float>::maxExponent) { + return FPBits<float>::inf(); + } + } + + Y |= static_cast<uint32_t>(out_exp) << MantissaWidth<float>::value; + return *reinterpret_cast<float *>(&Y); +} + +} // namespace __llvm_libc diff --git a/libc/src/math/hypotf.h b/libc/src/math/hypotf.h new file mode 100644 index 0000000000000..084fd7f3ef814 --- /dev/null +++ b/libc/src/math/hypotf.h @@ -0,0 +1,18 @@ +//===-- Implementation header for hypotf ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_HYPOTF_H +#define LLVM_LIBC_SRC_MATH_HYPOTF_H + +namespace __llvm_libc { + +float hypotf(float x, float y); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_MATH_HYPOTF_H diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt index 8efe8c89e9e7f..8a2adbe08e0b0 100644 --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -16,8 +16,7 @@ add_entrypoint_object( strcat.h DEPENDS .strcpy - .strlen - libc.include.string + .string_utils ) add_entrypoint_object( @@ -28,8 +27,7 @@ strcpy.h DEPENDS .memcpy - .strlen - libc.include.string + .string_utils ) add_entrypoint_object( @@ -48,8 +46,6 @@ strcmp.cpp HDRS strcmp.h - DEPENDS - libc.include.string ) add_entrypoint_object( @@ -58,6 +54,8 @@ memchr.cpp HDRS memchr.h + DEPENDS + .string_utils ) add_entrypoint_object( @@ -83,7 +81,7 @@ HDRS strnlen.h DEPENDS - .memchr + .string_utils ) add_entrypoint_object( diff --git a/libc/src/string/bzero.h b/libc/src/string/bzero.h index a16e1d097f953..064800bad29b5 100644 --- a/libc/src/string/bzero.h +++ b/libc/src/string/bzero.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STRING_BZERO_H #define LLVM_LIBC_SRC_STRING_BZERO_H -#include "include/string.h" +#include <stddef.h> // size_t namespace __llvm_libc { diff --git a/libc/src/string/memchr.cpp b/libc/src/string/memchr.cpp index 303f78185f49c..c95e2724f1a16 100644 --- a/libc/src/string/memchr.cpp +++ b/libc/src/string/memchr.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/string/memchr.h" +#include "src/string/string_utils.h" + #include "src/__support/common.h" #include <stddef.h> @@ -14,11 +16,8 @@ namespace __llvm_libc { // TODO: Look at performance benefits of comparing words. void *LLVM_LIBC_ENTRYPOINT(memchr)(const void *src, int c, size_t n) { - const unsigned char *str = reinterpret_cast<const unsigned char *>(src); - const unsigned char ch = c; - for (; n && *str != ch; --n, ++str) - ; - return n ? 
const_cast<unsigned char *>(str) : nullptr; + return internal::find_first_character( + reinterpret_cast<const unsigned char *>(src), c, n); } } // namespace __llvm_libc diff --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp index a8056714a225f..00d66ea677d25 100644 --- a/libc/src/string/memcpy.cpp +++ b/libc/src/string/memcpy.cpp @@ -44,12 +44,8 @@ static void memcpy_impl(char *__restrict dst, const char *__restrict src, return CopyBlock<4>(dst, src); if (count < 8) return CopyBlockOverlap<4>(dst, src, count); - if (count == 8) - return CopyBlock<8>(dst, src); if (count < 16) return CopyBlockOverlap<8>(dst, src, count); - if (count == 16) - return CopyBlock<16>(dst, src); if (count < 32) return CopyBlockOverlap<16>(dst, src, count); if (count < 64) diff --git a/libc/src/string/memcpy.h b/libc/src/string/memcpy.h index 39ca4a46f7f35..f643f1de6294e 100644 --- a/libc/src/string/memcpy.h +++ b/libc/src/string/memcpy.h @@ -9,7 +9,6 @@ #ifndef LLVM_LIBC_SRC_STRING_MEMCPY_H #define LLVM_LIBC_SRC_STRING_MEMCPY_H -#include "include/string.h" #include <stddef.h> // size_t namespace __llvm_libc { diff --git a/libc/src/string/memset.h b/libc/src/string/memset.h index 611e70705b205..e38eb7d78a976 100644 --- a/libc/src/string/memset.h +++ b/libc/src/string/memset.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STRING_MEMSET_H #define LLVM_LIBC_SRC_STRING_MEMSET_H -#include "include/string.h" +#include <stddef.h> // size_t namespace __llvm_libc { diff --git a/libc/src/string/strcat.cpp b/libc/src/string/strcat.cpp index c02de2d21b93f..f5e8616f022ac 100644 --- a/libc/src/string/strcat.cpp +++ b/libc/src/string/strcat.cpp @@ -8,7 +8,7 @@ #include "src/string/strcat.h" #include "src/string/strcpy.h" -#include "src/string/strlen.h" +#include "src/string/string_utils.h" #include "src/__support/common.h" @@ -16,7 +16,7 @@ namespace __llvm_libc { char *LLVM_LIBC_ENTRYPOINT(strcat)(char *__restrict dest, const char *__restrict src) { - __llvm_libc::strcpy(dest + __llvm_libc::strlen(dest), src); + __llvm_libc::strcpy(dest + internal::string_length(dest), src); return dest; } diff --git a/libc/src/string/strcpy.cpp b/libc/src/string/strcpy.cpp index 6927d9d3ec898..69a40c9f53925 100644 --- a/libc/src/string/strcpy.cpp +++ b/libc/src/string/strcpy.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/string/strcpy.h" -#include "src/string/strlen.h" #include "src/string/memcpy.h" +#include "src/string/string_utils.h" #include "src/__support/common.h" @@ -17,7 +17,7 @@ namespace __llvm_libc { char *LLVM_LIBC_ENTRYPOINT(strcpy)(char *__restrict dest, const char *__restrict src) { return reinterpret_cast<char *>( - __llvm_libc::memcpy(dest, src, __llvm_libc::strlen(src) + 1)); + __llvm_libc::memcpy(dest, src, internal::string_length(src) + 1)); } } // namespace __llvm_libc diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h index 234246c10b065..dfb2c8af45279 100644 --- a/libc/src/string/string_utils.h +++ b/libc/src/string/string_utils.h @@ -15,6 +15,24 @@ namespace __llvm_libc { namespace internal { +// Returns the length of a string, denoted by the first occurrence +// of a null terminator. +static inline size_t string_length(const char *src) { + size_t length; + for (length = 0; *src; ++src, ++length) + ; + return length; +} + +// Returns the first occurrence of 'ch' within the first 'n' characters of +// 'src'. If 'ch' is not found, returns nullptr.
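+// Example (illustrative): given buf = "banana" and n = 6,
+// find_first_character(reinterpret_cast<const unsigned char *>(buf), 'n', 6)
+// points at buf[2]; with n = 2 the scan stops early and nullptr is returned.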
+static inline void *find_first_character(const unsigned char *src, + unsigned char ch, size_t n) { + for (; n && *src != ch; --n, ++src) + ; + return n ? const_cast<unsigned char *>(src) : nullptr; +} + // Returns the maximum length span that contains only characters not found in // 'segment'. If no characters are found, returns the length of 'src'. static inline size_t complementary_span(const char *src, const char *segment) { diff --git a/libc/src/string/strlen.cpp b/libc/src/string/strlen.cpp index 0b7597ec52b6f..81e1f17e7c118 100644 --- a/libc/src/string/strlen.cpp +++ b/libc/src/string/strlen.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/string/strlen.h" +#include "src/string/string_utils.h" #include "src/__support/common.h" @@ -15,10 +16,7 @@ namespace __llvm_libc { // TODO: investigate the performance of this function. // There might be potential for compiler optimization. size_t LLVM_LIBC_ENTRYPOINT(strlen)(const char *src) { - const char *end = src; - while (*end != '\0') - ++end; - return end - src; + return internal::string_length(src); } } // namespace __llvm_libc diff --git a/libc/src/string/strnlen.cpp b/libc/src/string/strnlen.cpp index 17dd6e171504a..ea8fa9c26d54b 100644 --- a/libc/src/string/strnlen.cpp +++ b/libc/src/string/strnlen.cpp @@ -7,17 +7,17 @@ //===----------------------------------------------------------------------===// #include "src/string/strnlen.h" +#include "src/string/string_utils.h" #include "src/__support/common.h" -#include "src/string/memchr.h" #include <stddef.h> namespace __llvm_libc { size_t LLVM_LIBC_ENTRYPOINT(strnlen)(const char *src, size_t n) { - const char *temp = - reinterpret_cast<const char *>(__llvm_libc::memchr(src, '\0', n)); - return temp ? temp - src : n; + const void *temp = internal::find_first_character( + reinterpret_cast<const unsigned char *>(src), '\0', n); + return temp ? reinterpret_cast<const char *>(temp) - src : n; } } // namespace __llvm_libc diff --git a/libc/src/string/x86/memcpy.cpp b/libc/src/string/x86/memcpy.cpp index 811ce5183fe4e..2e2148eb7289b 100644 --- a/libc/src/string/x86/memcpy.cpp +++ b/libc/src/string/x86/memcpy.cpp @@ -59,12 +59,8 @@ static void memcpy_x86(char *__restrict dst, const char *__restrict src, return CopyBlock<4>(dst, src); if (count < 8) return CopyBlockOverlap<4>(dst, src, count); - if (count == 8) - return CopyBlock<8>(dst, src); if (count < 16) return CopyBlockOverlap<8>(dst, src, count); - if (count == 16) - return CopyBlock<16>(dst, src); if (count < 32) return CopyBlockOverlap<16>(dst, src, count); if (count < 64) diff --git a/libc/test/src/CMakeLists.txt b/libc/test/src/CMakeLists.txt index e6390fc7a1d65..aa606ae630bc4 100644 --- a/libc/test/src/CMakeLists.txt +++ b/libc/test/src/CMakeLists.txt @@ -22,6 +22,8 @@ endforeach() list(REMOVE_ITEM entrypoints_name_list "__assert_fail" "__errno_location") list(TRANSFORM entrypoints_name_list PREPEND "-e=") +file(GLOB spec_files ${LIBC_SOURCE_DIR}/spec/*.td) + # Generate integration test source code. 
add_custom_command( OUTPUT ${public_test} @@ -30,7 +32,7 @@ -I ${LIBC_SOURCE_DIR} ${LIBC_SOURCE_DIR}/config/${LIBC_TARGET_OS}/api.td - DEPENDS ${LIBC_SOURCE_DIR}/config/${LIBC_TARGET_OS}/api.td + DEPENDS ${LIBC_SOURCE_DIR}/config/${LIBC_TARGET_OS}/api.td ${spec_files} libc-prototype-testgen ${TARGET_PUBLIC_HEADERS} llvmlibc llvmlibm ) diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index e1bac1a339067..a90736992f1f8 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -591,3 +591,16 @@ add_fp_unittest( libc.src.math.remquol libc.utils.FPUtil.fputil ) + +add_fp_unittest( + hypotf_test + NEED_MPFR + SUITE + libc_math_unittests + SRCS + hypotf_test.cpp + DEPENDS + libc.include.math + libc.src.math.hypotf + libc.utils.FPUtil.fputil +) diff --git a/libc/test/src/math/hypotf_test.cpp b/libc/test/src/math/hypotf_test.cpp new file mode 100644 index 0000000000000..7b1ffd5241dbb --- /dev/null +++ b/libc/test/src/math/hypotf_test.cpp @@ -0,0 +1,65 @@ +//===-- Unittests for hypotf ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "include/math.h" +#include "src/math/hypotf.h" +#include "utils/FPUtil/FPBits.h" +#include "utils/FPUtil/TestHelpers.h" +#include "utils/MPFRWrapper/MPFRUtils.h" +#include "utils/UnitTest/Test.h" + +using FPBits = __llvm_libc::fputil::FPBits<float>; +using UIntType = FPBits::UIntType; + +namespace mpfr = __llvm_libc::testing::mpfr; + +static const float zero = FPBits::zero(); +static const float negZero = FPBits::negZero(); +static const float nan = FPBits::buildNaN(1); +static const float inf = FPBits::inf(); +static const float negInf = FPBits::negInf(); + +TEST(HypotfTest, SpecialNumbers) { + EXPECT_FP_EQ(__llvm_libc::hypotf(inf, nan), inf); + EXPECT_FP_EQ(__llvm_libc::hypotf(nan, negInf), inf); + EXPECT_FP_EQ(__llvm_libc::hypotf(zero, inf), inf); + EXPECT_FP_EQ(__llvm_libc::hypotf(negInf, negZero), inf); + + EXPECT_FP_EQ(__llvm_libc::hypotf(nan, nan), nan); + EXPECT_FP_EQ(__llvm_libc::hypotf(nan, zero), nan); + EXPECT_FP_EQ(__llvm_libc::hypotf(negZero, nan), nan); + + EXPECT_FP_EQ(__llvm_libc::hypotf(negZero, zero), zero); +} + +TEST(HypotfTest, SubnormalRange) { + constexpr UIntType count = 1000001; + constexpr UIntType step = + (FPBits::maxSubnormal - FPBits::minSubnormal) / count; + for (UIntType v = FPBits::minSubnormal, w = FPBits::maxSubnormal; + v <= FPBits::maxSubnormal && w >= FPBits::minSubnormal; + v += step, w -= step) { + float x = FPBits(v), y = FPBits(w); + float result = __llvm_libc::hypotf(x, y); + mpfr::BinaryInput<float> input{x, y}; + ASSERT_MPFR_MATCH(mpfr::Operation::Hypot, input, result, 0.5); + } +} + +TEST(HypotfTest, NormalRange) { + constexpr UIntType count = 1000001; + constexpr UIntType step = (FPBits::maxNormal - FPBits::minNormal) / count; + for (UIntType v = FPBits::minNormal, w = FPBits::maxNormal; + v <= FPBits::maxNormal && w >= FPBits::minNormal; v += step, w -= step) { + float x = FPBits(v), y = FPBits(w); + float result = __llvm_libc::hypotf(x, y); + mpfr::BinaryInput<float> input{x, y}; + ASSERT_MPFR_MATCH(mpfr::Operation::Hypot, input, result, 0.5); + } +} diff --git a/libc/utils/FPUtil/SqrtLongDoubleX86.h index 
2ac73044cf92f..df80d7d932bac 100644 --- a/libc/utils/FPUtil/SqrtLongDoubleX86.h +++ b/libc/utils/FPUtil/SqrtLongDoubleX86.h @@ -10,6 +10,8 @@ #define LLVM_LIBC_UTILS_FPUTIL_SQRT_LONG_DOUBLE_X86_H #include "FPBits.h" +#include "Sqrt.h" + #include "utils/CPP/TypeTraits.h" namespace __llvm_libc { diff --git a/libc/utils/LibcTableGenUtil/CMakeLists.txt b/libc/utils/LibcTableGenUtil/CMakeLists.txt index ae887a8bdb03a..d2632a240bd3d 100644 --- a/libc/utils/LibcTableGenUtil/CMakeLists.txt +++ b/libc/utils/LibcTableGenUtil/CMakeLists.txt @@ -2,6 +2,6 @@ add_llvm_library( LibcTableGenUtil APIIndexer.cpp APIIndexer.h - LINK_COMPONENTS Support + LINK_COMPONENTS Support TableGen ) target_include_directories(LibcTableGenUtil PUBLIC ${LIBC_SOURCE_DIR}) diff --git a/libc/utils/MPFRWrapper/CMakeLists.txt b/libc/utils/MPFRWrapper/CMakeLists.txt index 6a3c24e27b158..cc66d1c47d62c 100644 --- a/libc/utils/MPFRWrapper/CMakeLists.txt +++ b/libc/utils/MPFRWrapper/CMakeLists.txt @@ -13,7 +13,7 @@ if(LIBC_TESTS_CAN_USE_MPFR) MPFRUtils.h ) add_dependencies(libcMPFRWrapper libc.utils.CPP.standalone_cpp libc.utils.FPUtil.fputil LibcUnitTest LLVMSupport) - target_link_libraries(libcMPFRWrapper -lmpfr -lgmp LibcUnitTest LLVMSupport) + target_link_libraries(libcMPFRWrapper -lmpfr -lgmp LibcFPTestHelpers LibcUnitTest LLVMSupport) else() message(WARNING "Math tests using MPFR will be skipped.") endif() diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp index a121234e62246..56764e9740b01 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -15,10 +15,20 @@ #include "llvm/ADT/StringRef.h" #include -#include <mpfr.h> #include #include +#ifdef CUSTOM_MPFR_INCLUDER +// Some downstream repos are monoliths carrying MPFR sources in their third +// party directory. In such repos, including the MPFR header as +// `#include <mpfr.h>` is either disallowed or not possible. If that is the +// case, a file named `CustomMPFRIncluder.h` should be added through which the +// MPFR header can be included in a manner allowed in that repo.
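+// For example, a downstream CustomMPFRIncluder.h (hypothetical path) could
+// contain nothing but: #include "third_party/mpfr/mpfr.h"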
+#include "CustomMPFRIncluder.h" +#else +#include <mpfr.h> +#endif + +template <typename T> using FPBits = __llvm_libc::fputil::FPBits<T>; + namespace __llvm_libc { @@ -123,6 +133,12 @@ class MPFRNumber { return result; } + MPFRNumber hypot(const MPFRNumber &b) { + MPFRNumber result; + mpfr_hypot(result.value, value, b.value, MPFR_RNDN); + return result; + } + MPFRNumber remquo(const MPFRNumber &divisor, int &quotient) { MPFRNumber remainder; long q; @@ -266,6 +282,18 @@ unaryOperationTwoOutputs(Operation op, InputType input, int &output) { } } +template <typename InputType> +cpp::EnableIfType<cpp::IsFloatingPointType<InputType>::Value, MPFRNumber> +binaryOperationOneOutput(Operation op, InputType x, InputType y) { + MPFRNumber inputX(x), inputY(y); + switch (op) { + case Operation::Hypot: + return inputX.hypot(inputY); + default: + __builtin_unreachable(); + } +} + template <typename InputType> cpp::EnableIfType<cpp::IsFloatingPointType<InputType>::Value, MPFRNumber> binaryOperationTwoOutputs(Operation op, InputType x, InputType y, int &output) { @@ -391,6 +419,41 @@ template void explainBinaryOperationTwoOutputsError<long double>( Operation, const BinaryInput<long double> &, const BinaryOutput<long double> &, testutils::StreamWrapper &); +template <typename T> +void explainBinaryOperationOneOutputError(Operation op, + const BinaryInput<T> &input, + T libcResult, + testutils::StreamWrapper &OS) { + MPFRNumber mpfrX(input.x); + MPFRNumber mpfrY(input.y); + FPBits<T> xbits(input.x); + FPBits<T> ybits(input.y); + MPFRNumber mpfrResult = binaryOperationOneOutput(op, input.x, input.y); + MPFRNumber mpfrMatchValue(libcResult); + + OS << "Input decimal: x: " << mpfrX.str() << " y: " << mpfrY.str() << '\n'; + __llvm_libc::fputil::testing::describeValue("First input bits: ", input.x, + OS); + __llvm_libc::fputil::testing::describeValue("Second input bits: ", input.y, + OS); + + OS << "Libc result: " << mpfrMatchValue.str() << '\n' + << "MPFR result: " << mpfrResult.str() << '\n'; + __llvm_libc::fputil::testing::describeValue( + "Libc floating point result bits: ", libcResult, OS); + __llvm_libc::fputil::testing::describeValue( + " MPFR rounded bits: ", mpfrResult.as<T>(), OS); + OS << "ULP error: " << std::to_string(mpfrResult.ulp(libcResult)) << '\n'; +} + +template void explainBinaryOperationOneOutputError<float>( + Operation, const BinaryInput<float> &, float, testutils::StreamWrapper &); +template void explainBinaryOperationOneOutputError<double>( + Operation, const BinaryInput<double> &, double, testutils::StreamWrapper &); +template void explainBinaryOperationOneOutputError<long double>( + Operation, const BinaryInput<long double> &, long double, + testutils::StreamWrapper &); + template <typename T> bool compareUnaryOperationSingleOutput(Operation op, T input, T libcResult, double ulpError) { @@ -470,6 +533,26 @@ template bool compareBinaryOperationTwoOutputs<long double>( Operation, const BinaryInput<long double> &, const BinaryOutput<long double> &, double); +template <typename T> +bool compareBinaryOperationOneOutput(Operation op, const BinaryInput<T> &input, + T libcResult, double ulpError) { + MPFRNumber mpfrResult = binaryOperationOneOutput(op, input.x, input.y); + double ulp = mpfrResult.ulp(libcResult); + + bool bitsAreEven = ((FPBits<T>(libcResult).bitsAsUInt() & 1) == 0); + return (ulp < ulpError) || + ((ulp == ulpError) && ((ulp != 0.5) || bitsAreEven)); +} + +template bool compareBinaryOperationOneOutput<float>(Operation, + const BinaryInput<float> &, + float, double); +template bool +compareBinaryOperationOneOutput<double>(Operation, const BinaryInput<double> &, + double, double); +template bool compareBinaryOperationOneOutput<long double>( + Operation, const BinaryInput<long double> &, long double, double); + } // namespace internal } // namespace mpfr diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h index 
b46f09dd5e558..6fb9fe5c47b65 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.h +++ b/libc/utils/MPFRWrapper/MPFRUtils.h @@ -47,7 +47,7 @@ enum class Operation : int { // input and produce a single floating point number of the same type as // output. BeginBinaryOperationsSingleOutput, - // TODO: Add operations like hypot. + Hypot, EndBinaryOperationsSingleOutput, // Operations which take two floating point numbers of the same type as @@ -109,6 +109,10 @@ bool compareBinaryOperationTwoOutputs(Operation op, const BinaryInput<T> &input, const BinaryOutput<T> &libcOutput, double t); +template <typename T> +bool compareBinaryOperationOneOutput(Operation op, const BinaryInput<T> &input, + T libcOutput, double t); + template <typename T> void explainUnaryOperationSingleOutputError(Operation op, T input, T matchValue, testutils::StreamWrapper &OS); @@ -122,6 +126,12 @@ void explainBinaryOperationTwoOutputsError(Operation op, const BinaryOutput<T> &matchValue, testutils::StreamWrapper &OS); +template <typename T> +void explainBinaryOperationOneOutputError(Operation op, + const BinaryInput<T> &input, + T matchValue, + testutils::StreamWrapper &OS); + template class MPFRMatcher : public testing::Matcher { InputType input; @@ -153,7 +163,7 @@ class MPFRMatcher : public testing::Matcher { template <typename T> static bool match(const BinaryInput<T> &in, T out, double tolerance) { - // TODO: Implement the comparision function and error reporter. + return compareBinaryOperationOneOutput(op, in, out, tolerance); } template @@ -183,6 +193,12 @@ class MPFRMatcher : public testing::Matcher { testutils::StreamWrapper &OS) { explainBinaryOperationTwoOutputsError(op, in, out, OS); } + + template <typename T> + static void explainError(const BinaryInput<T> &in, T out, + testutils::StreamWrapper &OS) { + explainBinaryOperationOneOutputError(op, in, out, OS); + } }; } // namespace internal diff --git a/libclc/generic/lib/math/math.h b/libclc/generic/lib/math/math.h index 3790d4cf67762..9c37633f57e4c 100644 --- a/libclc/generic/lib/math/math.h +++ b/libclc/generic/lib/math/math.h @@ -40,6 +40,9 @@ #if (defined __AMDGCN__ || defined __R600__) && !defined __HAS_FMAF__ #define HAVE_HW_FMA32() (0) +#elif defined CLC_SPIRV || defined CLC_SPIRV64 +bool __attribute__((noinline)) __clc_runtime_has_hw_fma32(void); +#define HAVE_HW_FMA32() __clc_runtime_has_hw_fma32() #else #define HAVE_HW_FMA32() (1) #endif diff --git a/libclc/spirv/lib/SOURCES b/libclc/spirv/lib/SOURCES index f594fa7e85d49..854cba614c8bf 100644 --- a/libclc/spirv/lib/SOURCES +++ b/libclc/spirv/lib/SOURCES @@ -41,6 +41,10 @@ subnormal_config.cl ../../generic/lib/math/exp2.cl ../../generic/lib/math/clc_exp10.cl ../../generic/lib/math/exp10.cl +../../generic/lib/math/clc_fma.cl +math/fma.cl +../../generic/lib/math/clc_fmod.cl +../../generic/lib/math/fmod.cl ../../generic/lib/math/fract.cl ../../generic/lib/math/frexp.cl ../../generic/lib/math/half_rsqrt.cl @@ -48,6 +52,8 @@ subnormal_config.cl ../../generic/lib/math/clc_hypot.cl ../../generic/lib/math/hypot.cl ../../generic/lib/math/ilogb.cl +../../generic/lib/math/clc_ldexp.cl +../../generic/lib/math/ldexp.cl ../../generic/lib/math/lgamma.cl ../../generic/lib/math/lgamma_r.cl ../../generic/lib/math/log.cl diff --git a/libclc/spirv/lib/math/fma.cl b/libclc/spirv/lib/math/fma.cl new file mode 100644 index 0000000000000..79142425e52d2 --- /dev/null +++ b/libclc/spirv/lib/math/fma.cl @@ -0,0 +1,11 @@ +#include <clc/clc.h> +#include <math/clc_fma.h> + +#define __CLC_BODY <fma.inc> +#define __FLOAT_ONLY +#include <clc/math/gentype.inc> + +bool __clc_runtime_has_hw_fma32() +{ + return false; +} diff --git a/libclc/spirv/lib/math/fma.inc 
b/libclc/spirv/lib/math/fma.inc new file mode 100644 index 0000000000000..0f12c565758ff --- /dev/null +++ b/libclc/spirv/lib/math/fma.inc @@ -0,0 +1,3 @@ +_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE fma(__CLC_GENTYPE a, __CLC_GENTYPE b, __CLC_GENTYPE c) { + return __clc_sw_fma(a, b, c); +} diff --git a/libclc/spirv64/lib/SOURCES b/libclc/spirv64/lib/SOURCES index f594fa7e85d49..854cba614c8bf 100644 --- a/libclc/spirv64/lib/SOURCES +++ b/libclc/spirv64/lib/SOURCES @@ -41,6 +41,10 @@ subnormal_config.cl ../../generic/lib/math/exp2.cl ../../generic/lib/math/clc_exp10.cl ../../generic/lib/math/exp10.cl +../../generic/lib/math/clc_fma.cl +math/fma.cl +../../generic/lib/math/clc_fmod.cl +../../generic/lib/math/fmod.cl ../../generic/lib/math/fract.cl ../../generic/lib/math/frexp.cl ../../generic/lib/math/half_rsqrt.cl @@ -48,6 +52,8 @@ subnormal_config.cl ../../generic/lib/math/clc_hypot.cl ../../generic/lib/math/hypot.cl ../../generic/lib/math/ilogb.cl +../../generic/lib/math/clc_ldexp.cl +../../generic/lib/math/ldexp.cl ../../generic/lib/math/lgamma.cl ../../generic/lib/math/lgamma_r.cl ../../generic/lib/math/log.cl diff --git a/libclc/spirv64/lib/math/fma.cl b/libclc/spirv64/lib/math/fma.cl new file mode 100644 index 0000000000000..79142425e52d2 --- /dev/null +++ b/libclc/spirv64/lib/math/fma.cl @@ -0,0 +1,11 @@ +#include <clc/clc.h> +#include <math/clc_fma.h> + +#define __CLC_BODY <fma.inc> +#define __FLOAT_ONLY +#include <clc/math/gentype.inc> + +bool __clc_runtime_has_hw_fma32() +{ + return false; +} diff --git a/libclc/spirv64/lib/math/fma.inc b/libclc/spirv64/lib/math/fma.inc new file mode 100644 index 0000000000000..0f12c565758ff --- /dev/null +++ b/libclc/spirv64/lib/math/fma.inc @@ -0,0 +1,3 @@ +_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE fma(__CLC_GENTYPE a, __CLC_GENTYPE b, __CLC_GENTYPE c) { + return __clc_sw_fma(a, b, c); +} diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index ea0aa0a259a22..8e7df5d19610e 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -41,33 +41,19 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR LIBCXX_STANDALONE_BUILD) endif() if (LIBCXX_STANDALONE_BUILD) - if(CMAKE_VERSION VERSION_LESS 3.12) - include(FindPythonInterp) - if( NOT PYTHONINTERP_FOUND ) - message(WARNING "Failed to find python interpreter. 
" - "The libc++ test suite will be disabled.") - set(LLVM_INCLUDE_TESTS OFF) - else() - add_executable(Python3::Interpreter IMPORTED) - set_target_properties(Python3::Interpreter PROPERTIES - IMPORTED_LOCATION ${PYTHON_EXECUTABLE}) - set(Python3_EXECUTABLE ${PYTHON_EXECUTABLE}) + find_package(Python3 COMPONENTS Interpreter) + if(NOT Python3_Interpreter_FOUND) + message(WARNING "Python3 not found, using python2 as a fallback") + find_package(Python2 COMPONENTS Interpreter REQUIRED) + if(Python2_VERSION VERSION_LESS 2.7) + message(SEND_ERROR "Python 2.7 or newer is required") endif() - else() - find_package(Python3 COMPONENTS Interpreter) - if(NOT Python3_Interpreter_FOUND) - message(WARNING "Python3 not found, using python2 as a fallback") - find_package(Python2 COMPONENTS Interpreter REQUIRED) - if(Python2_VERSION VERSION_LESS 2.7) - message(SEND_ERROR "Python 2.7 or newer is required") - endif() - # Treat python2 as python3 - add_executable(Python3::Interpreter IMPORTED) - set_target_properties(Python3::Interpreter PROPERTIES - IMPORTED_LOCATION ${Python2_EXECUTABLE}) - set(Python3_EXECUTABLE ${Python2_EXECUTABLE}) - endif() + # Treat python2 as python3 + add_executable(Python3::Interpreter IMPORTED) + set_target_properties(Python3::Interpreter PROPERTIES + IMPORTED_LOCATION ${Python2_EXECUTABLE}) + set(Python3_EXECUTABLE ${Python2_EXECUTABLE}) endif() endif() @@ -110,7 +96,7 @@ option(LIBCXX_INCLUDE_TESTS "Build the libc++ tests." ${LLVM_INCLUDE_TESTS}) option(LIBCXX_ENABLE_PARALLEL_ALGORITHMS "Enable the parallel algorithms library. This requires the PSTL to be available." OFF) option(LIBCXX_TEST_GDB_PRETTY_PRINTERS "Test gdb pretty printers." OFF) set(LIBCXX_TEST_CONFIG "${CMAKE_CURRENT_SOURCE_DIR}/test/configs/legacy.cfg.in" CACHE STRING - "The Lit testing configuration to use when running the tests." FORCE) # TODO: Stop using 'FORCE' once we can assume all CMake build dirs have been re-generated + "The Lit testing configuration to use when running the tests.") set(LIBCXX_TEST_PARAMS "" CACHE STRING "A list of parameters to run the Lit test suite with.") diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt index 8480ede23a49f..42d25c20c8115 100644 --- a/libcxx/benchmarks/CMakeLists.txt +++ b/libcxx/benchmarks/CMakeLists.txt @@ -70,18 +70,9 @@ set(BENCHMARK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}) set(BENCHMARK_LIBCXX_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/benchmark-libcxx) set(BENCHMARK_NATIVE_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/benchmark-native) -check_flag_supported("-std=c++17") -mangle_name("LIBCXX_SUPPORTS_STD_EQ_c++17_FLAG" BENCHMARK_SUPPORTS_STD_CXX17_FLAG) -if (${BENCHMARK_SUPPORTS_STD_CXX17_FLAG}) - set(BENCHMARK_DIALECT_FLAG "-std=c++17") -else() - # If the compiler doesn't support -std=c++17, attempt to fall back to -std=c++1z while still - # requiring C++17 language features. 
- set(BENCHMARK_DIALECT_FLAG "-std=c++1z") -endif() set(BENCHMARK_TEST_COMPILE_FLAGS - ${BENCHMARK_DIALECT_FLAG} -O2 + -O2 -fsized-deallocation -I${BENCHMARK_LIBCXX_INSTALL}/include -I${LIBCXX_SOURCE_DIR}/test/support @@ -90,6 +81,7 @@ set(BENCHMARK_TEST_LIBCXX_COMPILE_FLAGS ${BENCHMARK_TEST_COMPILE_FLAGS} ${SANITIZER_FLAGS} -Wno-user-defined-literals + -Wno-suggest-override ) set(BENCHMARK_TEST_LIBCXX_LINK_FLAGS @@ -147,7 +139,10 @@ function(add_benchmark_test name source_file) OUTPUT_NAME "${name}.libcxx.out" RUNTIME_OUTPUT_DIRECTORY "${BENCHMARK_OUTPUT_DIR}" COMPILE_FLAGS "${BENCHMARK_TEST_LIBCXX_COMPILE_FLAGS}" - LINK_FLAGS "${BENCHMARK_TEST_LIBCXX_LINK_FLAGS}") + LINK_FLAGS "${BENCHMARK_TEST_LIBCXX_LINK_FLAGS}" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED YES + CXX_EXTENSIONS NO) cxx_link_system_libraries(${libcxx_target}) if (LIBCXX_BENCHMARK_NATIVE_STDLIB) if (LIBCXX_BENCHMARK_NATIVE_STDLIB STREQUAL "libstdc++" AND NOT DEFINED LIBSTDCXX_FILESYSTEM_LIB @@ -174,7 +169,10 @@ function(add_benchmark_test name source_file) RUNTIME_OUTPUT_DIRECTORY "${BENCHMARK_OUTPUT_DIR}" INCLUDE_DIRECTORIES "" COMPILE_FLAGS "${BENCHMARK_TEST_NATIVE_COMPILE_FLAGS}" - LINK_FLAGS "${BENCHMARK_TEST_NATIVE_LINK_FLAGS}") + LINK_FLAGS "${BENCHMARK_TEST_NATIVE_LINK_FLAGS}" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED YES + CXX_EXTENSIONS NO) endif() endfunction() diff --git a/libcxx/benchmarks/map.bench.cpp b/libcxx/benchmarks/map.bench.cpp new file mode 100644 index 0000000000000..dd1884f65032e --- /dev/null +++ b/libcxx/benchmarks/map.bench.cpp @@ -0,0 +1,1037 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <algorithm> +#include <cstdint> +#include <map> +#include <random> +#include <vector> + +#include "CartesianBenchmarks.h" +#include "benchmark/benchmark.h" +#include "test_macros.h" + +// When VALIDATE is defined the benchmark will run to validate the benchmarks. +// The time taken by several operations depends on whether or not an element +// exists. To avoid errors in the benchmark these operations have a validation +// mode to test the benchmark. Since they are not meant to be benchmarked the +// number of sizes tested is limited to 1. +//#define VALIDATE + +namespace { + +enum class Mode { Hit, Miss }; + +struct AllModes : EnumValuesAsTuple<AllModes, Mode, 2> { + static constexpr const char* Names[] = {"ExistingElement", "NewElement"}; +}; + +// The positions of the hints to pick: +// - Begin picks the first item. The item cannot be put before this element. +// - Third picks the third item. This is just an element with a valid entry +// before and after it. +// - Correct contains the correct hint. +// - End contains a hint to the end of the map.
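+// (Hint position matters because std::map::insert with a correct hint runs in
+// amortized constant time, while a wrong hint degrades to the usual O(log n)
+// search; that difference is what these benchmarks measure.)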
+enum class Hint { Begin, Third, Correct, End }; +struct AllHints : EnumValuesAsTuple { + static constexpr const char* Names[] = {"Begin", "Third", "Correct", "End"}; +}; + +enum class Order { Sorted, Random }; +struct AllOrders : EnumValuesAsTuple { + static constexpr const char* Names[] = {"Sorted", "Random"}; +}; + +struct TestSets { + std::vector Keys; + std::vector > Maps; + std::vector< + std::vector::const_iterator> > + Hints; +}; + +enum class Shuffle { None, Keys, Hints }; + +TestSets makeTestingSets(size_t MapSize, Mode mode, Shuffle shuffle, + size_t max_maps) { + /* + * The shuffle does not retain the random number generator to use the same + * set of random numbers for every iteration. + */ + TestSets R; + + int MapCount = std::min(max_maps, 1000000 / MapSize); + + for (uint64_t I = 0; I < MapSize; ++I) { + R.Keys.push_back(mode == Mode::Hit ? 2 * I + 2 : 2 * I + 1); + } + if (shuffle == Shuffle::Keys) + std::shuffle(R.Keys.begin(), R.Keys.end(), std::mt19937()); + + for (int M = 0; M < MapCount; ++M) { + auto& map = R.Maps.emplace_back(); + auto& hints = R.Hints.emplace_back(); + for (uint64_t I = 0; I < MapSize; ++I) { + hints.push_back(map.insert(std::make_pair(2 * I + 2, 0)).first); + } + if (shuffle == Shuffle::Hints) + std::shuffle(hints.begin(), hints.end(), std::mt19937()); + } + + return R; +} + +struct Base { + size_t MapSize; + Base(size_t T) : MapSize(T) {} + + std::string baseName() const { return "_MapSize=" + std::to_string(MapSize); } +}; + +//*******************************************************************| +// Member functions | +//*******************************************************************| + +struct ConstructorDefault { + void run(benchmark::State& State) const { + for (auto _ : State) { + benchmark::DoNotOptimize(std::map()); + } + } + + std::string name() const { return "BM_ConstructorDefault"; } +}; + +struct ConstructorIterator : Base { + using Base::Base; + + void run(benchmark::State& State) const { + auto Data = makeTestingSets(MapSize, Mode::Hit, Shuffle::None, 1); + auto& Map = Data.Maps.front(); + while (State.KeepRunningBatch(MapSize)) { +#ifndef VALIDATE + benchmark::DoNotOptimize( + std::map(Map.begin(), Map.end())); +#else + std::map M{Map.begin(), Map.end()}; + if (M != Map) + State.SkipWithError("Map copy not identical"); +#endif + } + } + + std::string name() const { return "BM_ConstructorIterator" + baseName(); } +}; + +struct ConstructorCopy : Base { + using Base::Base; + + void run(benchmark::State& State) const { + auto Data = makeTestingSets(MapSize, Mode::Hit, Shuffle::None, 1); + auto& Map = Data.Maps.front(); + while (State.KeepRunningBatch(MapSize)) { +#ifndef VALIDATE + std::map M(Map); + benchmark::DoNotOptimize(M); +#else + std::map M(Map); + if (M != Map) + State.SkipWithError("Map copy not identical"); +#endif + } + } + + std::string name() const { return "BM_ConstructorCopy" + baseName(); } +}; + +struct ConstructorMove : Base { + using Base::Base; + + void run(benchmark::State& State) const { + auto Data = makeTestingSets(MapSize, Mode::Hit, Shuffle::None, 1000); + while (State.KeepRunningBatch(MapSize * Data.Maps.size())) { + for (auto& Map : Data.Maps) { + std::map M(std::move(Map)); + benchmark::DoNotOptimize(M); + } + State.PauseTiming(); + Data = makeTestingSets(MapSize, Mode::Hit, Shuffle::None, 1000); + State.ResumeTiming(); + } + } + + std::string name() const { return "BM_ConstructorMove" + baseName(); } +}; + +//*******************************************************************| +// Capacity | 
+//*******************************************************************| + +struct Empty : Base { + using Base::Base; + + void run(benchmark::State& State) const { + auto Data = makeTestingSets(MapSize, Mode::Hit, Shuffle::None, 1); + auto& Map = Data.Maps.front(); + for (auto _ : State) { +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.empty()); +#else + if (Map.empty()) + State.SkipWithError("Map contains an invalid number of elements."); +#endif + } + } + + std::string name() const { return "BM_Empty" + baseName(); } +}; + +struct Size : Base { + using Base::Base; + + void run(benchmark::State& State) const { + auto Data = makeTestingSets(MapSize, Mode::Hit, Shuffle::None, 1); + auto& Map = Data.Maps.front(); + for (auto _ : State) { +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.size()); +#else + if (Map.size() != MapSize) + State.SkipWithError("Map contains an invalid number of elements."); +#endif + } + } + + std::string name() const { return "BM_Size" + baseName(); } +}; + +//*******************************************************************| +// Modifiers | +//*******************************************************************| + +struct Clear : Base { + using Base::Base; + + void run(benchmark::State& State) const { + auto Data = makeTestingSets(MapSize, Mode::Hit, Shuffle::None, 1000); + while (State.KeepRunningBatch(MapSize * Data.Maps.size())) { + for (auto& Map : Data.Maps) { + Map.clear(); + benchmark::DoNotOptimize(Map); + } + State.PauseTiming(); + Data = makeTestingSets(MapSize, Mode::Hit, Shuffle::None, 1000); + State.ResumeTiming(); + } + } + + std::string name() const { return "BM_Clear" + baseName(); } +}; + +template +struct Insert : Base { + using Base::Base; + + void run(benchmark::State& State) const { + auto Data = makeTestingSets( + MapSize, Mode(), + Order::value == ::Order::Random ? Shuffle::Keys : Shuffle::None, 1000); + while (State.KeepRunningBatch(MapSize * Data.Maps.size())) { + for (auto& Map : Data.Maps) { + for (auto K : Data.Keys) { +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.insert(std::make_pair(K, 1))); +#else + bool Inserted = Map.insert(std::make_pair(K, 1)).second; + if (Mode() == ::Mode::Hit) { + if (Inserted) + State.SkipWithError("Inserted a duplicate element"); + } else { + if (!Inserted) + State.SkipWithError("Failed to insert e new element"); + } +#endif + } + } + + State.PauseTiming(); + Data = makeTestingSets(MapSize, Mode(), + Order::value == ::Order::Random ? 
Shuffle::Keys + : Shuffle::None, + 1000); + State.ResumeTiming(); + } + } + + std::string name() const { + return "BM_Insert" + baseName() + Mode::name() + Order::name(); + } +}; + +template +struct InsertHint : Base { + using Base::Base; + + template < ::Hint hint> + typename std::enable_if::type + run(benchmark::State& State) const { + auto Data = makeTestingSets(MapSize, Mode(), Shuffle::None, 1000); + while (State.KeepRunningBatch(MapSize * Data.Maps.size())) { + for (size_t I = 0; I < Data.Maps.size(); ++I) { + auto& Map = Data.Maps[I]; + auto H = Data.Hints[I].begin(); + for (auto K : Data.Keys) { +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.insert(*H, std::make_pair(K, 1))); +#else + auto Inserted = Map.insert(*H, std::make_pair(K, 1)); + if (Mode() == ::Mode::Hit) { + if (Inserted != *H) + State.SkipWithError("Inserted a duplicate element"); + } else { + if (++Inserted != *H) + State.SkipWithError("Failed to insert a new element"); + } +#endif + ++H; + } + } + + State.PauseTiming(); + Data = makeTestingSets(MapSize, Mode(), Shuffle::None, 1000); + State.ResumeTiming(); + } + } + + template < ::Hint hint> + typename std::enable_if::type + run(benchmark::State& State) const { + auto Data = makeTestingSets(MapSize, Mode(), Shuffle::None, 1000); + while (State.KeepRunningBatch(MapSize * Data.Maps.size())) { + for (size_t I = 0; I < Data.Maps.size(); ++I) { + auto& Map = Data.Maps[I]; + auto Third = *(Data.Hints[I].begin() + 2); + for (auto K : Data.Keys) { + auto Itor = hint == ::Hint::Begin + ? Map.begin() + : hint == ::Hint::Third ? Third : Map.end(); +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.insert(Itor, std::make_pair(K, 1))); +#else + size_t Size = Map.size(); + Map.insert(Itor, std::make_pair(K, 1)); + if (Mode() == ::Mode::Hit) { + if (Size != Map.size()) + State.SkipWithError("Inserted a duplicate element"); + } else { + if (Size + 1 != Map.size()) + State.SkipWithError("Failed to insert a new element"); + } +#endif + } + } + + State.PauseTiming(); + Data = makeTestingSets(MapSize, Mode(), Shuffle::None, 1000); + State.ResumeTiming(); + } + } + + void run(benchmark::State& State) const { + static constexpr auto h = Hint(); + run(State); + } + + std::string name() const { + return "BM_InsertHint" + baseName() + Mode::name() + Hint::name(); + } +}; + +template +struct InsertAssign : Base { + using Base::Base; + + void run(benchmark::State& State) const { + auto Data = makeTestingSets( + MapSize, Mode(), + Order::value == ::Order::Random ? Shuffle::Keys : Shuffle::None, 1000); + while (State.KeepRunningBatch(MapSize * Data.Maps.size())) { + for (auto& Map : Data.Maps) { + for (auto K : Data.Keys) { +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.insert_or_assign(K, 1)); +#else + bool Inserted = Map.insert_or_assign(K, 1).second; + if (Mode() == ::Mode::Hit) { + if (Inserted) + State.SkipWithError("Inserted a duplicate element"); + } else { + if (!Inserted) + State.SkipWithError("Failed to insert e new element"); + } +#endif + } + } + + State.PauseTiming(); + Data = makeTestingSets(MapSize, Mode(), + Order::value == ::Order::Random ? 
Shuffle::Keys + : Shuffle::None, + 1000); + State.ResumeTiming(); + } + } + + std::string name() const { + return "BM_InsertAssign" + baseName() + Mode::name() + Order::name(); + } +}; + +template +struct InsertAssignHint : Base { + using Base::Base; + + template < ::Hint hint> + typename std::enable_if::type + run(benchmark::State& State) const { + auto Data = makeTestingSets(MapSize, Mode(), Shuffle::None, 1000); + while (State.KeepRunningBatch(MapSize * Data.Maps.size())) { + for (size_t I = 0; I < Data.Maps.size(); ++I) { + auto& Map = Data.Maps[I]; + auto H = Data.Hints[I].begin(); + for (auto K : Data.Keys) { +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.insert_or_assign(*H, K, 1)); +#else + auto Inserted = Map.insert_or_assign(*H, K, 1); + if (Mode() == ::Mode::Hit) { + if (Inserted != *H) + State.SkipWithError("Inserted a duplicate element"); + } else { + if (++Inserted != *H) + State.SkipWithError("Failed to insert a new element"); + } +#endif + ++H; + } + } + + State.PauseTiming(); + Data = makeTestingSets(MapSize, Mode(), Shuffle::None, 1000); + State.ResumeTiming(); + } + } + + template < ::Hint hint> + typename std::enable_if::type + run(benchmark::State& State) const { + auto Data = makeTestingSets(MapSize, Mode(), Shuffle::None, 1000); + while (State.KeepRunningBatch(MapSize * Data.Maps.size())) { + for (size_t I = 0; I < Data.Maps.size(); ++I) { + auto& Map = Data.Maps[I]; + auto Third = *(Data.Hints[I].begin() + 2); + for (auto K : Data.Keys) { + auto Itor = hint == ::Hint::Begin + ? Map.begin() + : hint == ::Hint::Third ? Third : Map.end(); +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.insert_or_assign(Itor, K, 1)); +#else + size_t Size = Map.size(); + Map.insert_or_assign(Itor, K, 1); + if (Mode() == ::Mode::Hit) { + if (Size != Map.size()) + State.SkipWithError("Inserted a duplicate element"); + } else { + if (Size + 1 != Map.size()) + State.SkipWithError("Failed to insert a new element"); + } +#endif + } + } + + State.PauseTiming(); + Data = makeTestingSets(MapSize, Mode(), Shuffle::None, 1000); + State.ResumeTiming(); + } + } + + void run(benchmark::State& State) const { + static constexpr auto h = Hint(); + run(State); + } + + std::string name() const { + return "BM_InsertAssignHint" + baseName() + Mode::name() + Hint::name(); + } +}; + +template +struct Emplace : Base { + using Base::Base; + + void run(benchmark::State& State) const { + + auto Data = makeTestingSets( + MapSize, Mode(), + Order::value == ::Order::Random ? Shuffle::Keys : Shuffle::None, 1000); + while (State.KeepRunningBatch(MapSize * Data.Maps.size())) { + for (auto& Map : Data.Maps) { + for (auto K : Data.Keys) { +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.emplace(K, 1)); +#else + bool Inserted = Map.emplace(K, 1).second; + if (Mode() == ::Mode::Hit) { + if (Inserted) + State.SkipWithError("Emplaced a duplicate element"); + } else { + if (!Inserted) + State.SkipWithError("Failed to emplace a new element"); + } +#endif + } + } + + State.PauseTiming(); + Data = makeTestingSets(MapSize, Mode(), + Order::value == ::Order::Random ? 
Shuffle::Keys + : Shuffle::None, + 1000); + State.ResumeTiming(); + } + } + + std::string name() const { + return "BM_Emplace" + baseName() + Mode::name() + Order::name(); + } +}; + +template +struct EmplaceHint : Base { + using Base::Base; + + template < ::Hint hint> + typename std::enable_if::type + run(benchmark::State& State) const { + auto Data = makeTestingSets(MapSize, Mode(), Shuffle::None, 1000); + while (State.KeepRunningBatch(MapSize * Data.Maps.size())) { + for (size_t I = 0; I < Data.Maps.size(); ++I) { + auto& Map = Data.Maps[I]; + auto H = Data.Hints[I].begin(); + for (auto K : Data.Keys) { +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.emplace_hint(*H, K, 1)); +#else + auto Inserted = Map.emplace_hint(*H, K, 1); + if (Mode() == ::Mode::Hit) { + if (Inserted != *H) + State.SkipWithError("Emplaced a duplicate element"); + } else { + if (++Inserted != *H) + State.SkipWithError("Failed to emplace a new element"); + } +#endif + ++H; + } + } + + State.PauseTiming(); + Data = makeTestingSets(MapSize, Mode(), Shuffle::None, 1000); + State.ResumeTiming(); + } + } + + template < ::Hint hint> + typename std::enable_if::type + run(benchmark::State& State) const { + auto Data = makeTestingSets(MapSize, Mode(), Shuffle::None, 1000); + while (State.KeepRunningBatch(MapSize * Data.Maps.size())) { + for (size_t I = 0; I < Data.Maps.size(); ++I) { + auto& Map = Data.Maps[I]; + auto Third = *(Data.Hints[I].begin() + 2); + for (auto K : Data.Keys) { + auto Itor = hint == ::Hint::Begin + ? Map.begin() + : hint == ::Hint::Third ? Third : Map.end(); +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.emplace_hint(Itor, K, 1)); +#else + size_t Size = Map.size(); + Map.emplace_hint(Itor, K, 1); + if (Mode() == ::Mode::Hit) { + if (Size != Map.size()) + State.SkipWithError("Emplaced a duplicate element"); + } else { + if (Size + 1 != Map.size()) + State.SkipWithError("Failed to emplace a new element"); + } +#endif + } + } + + State.PauseTiming(); + Data = makeTestingSets(MapSize, Mode(), Shuffle::None, 1000); + State.ResumeTiming(); + } + } + + void run(benchmark::State& State) const { + static constexpr auto h = Hint(); + run(State); + } + + std::string name() const { + return "BM_EmplaceHint" + baseName() + Mode::name() + Hint::name(); + } +}; + +template +struct TryEmplace : Base { + using Base::Base; + + void run(benchmark::State& State) const { + + auto Data = makeTestingSets( + MapSize, Mode(), + Order::value == ::Order::Random ? Shuffle::Keys : Shuffle::None, 1000); + while (State.KeepRunningBatch(MapSize * Data.Maps.size())) { + for (auto& Map : Data.Maps) { + for (auto K : Data.Keys) { +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.try_emplace(K, 1)); +#else + bool Inserted = Map.try_emplace(K, 1).second; + if (Mode() == ::Mode::Hit) { + if (Inserted) + State.SkipWithError("Emplaced a duplicate element"); + } else { + if (!Inserted) + State.SkipWithError("Failed to emplace a new element"); + } +#endif + } + } + + State.PauseTiming(); + Data = makeTestingSets(MapSize, Mode(), + Order::value == ::Order::Random ? 
Shuffle::Keys + : Shuffle::None, + 1000); + State.ResumeTiming(); + } + } + + std::string name() const { + return "BM_TryEmplace" + baseName() + Mode::name() + Order::name(); + } +}; + +template +struct TryEmplaceHint : Base { + using Base::Base; + + template < ::Hint hint> + typename std::enable_if::type + run(benchmark::State& State) const { + auto Data = makeTestingSets(MapSize, Mode(), Shuffle::None, 1000); + while (State.KeepRunningBatch(MapSize * Data.Maps.size())) { + for (size_t I = 0; I < Data.Maps.size(); ++I) { + auto& Map = Data.Maps[I]; + auto H = Data.Hints[I].begin(); + for (auto K : Data.Keys) { +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.try_emplace(*H, K, 1)); +#else + auto Inserted = Map.try_emplace(*H, K, 1); + if (Mode() == ::Mode::Hit) { + if (Inserted != *H) + State.SkipWithError("Emplaced a duplicate element"); + } else { + if (++Inserted != *H) + State.SkipWithError("Failed to emplace a new element"); + } +#endif + ++H; + } + } + + State.PauseTiming(); + Data = makeTestingSets(MapSize, Mode(), Shuffle::None, 1000); + State.ResumeTiming(); + } + } + + template < ::Hint hint> + typename std::enable_if::type + run(benchmark::State& State) const { + auto Data = makeTestingSets(MapSize, Mode(), Shuffle::None, 1000); + while (State.KeepRunningBatch(MapSize * Data.Maps.size())) { + for (size_t I = 0; I < Data.Maps.size(); ++I) { + auto& Map = Data.Maps[I]; + auto Third = *(Data.Hints[I].begin() + 2); + for (auto K : Data.Keys) { + auto Itor = hint == ::Hint::Begin + ? Map.begin() + : hint == ::Hint::Third ? Third : Map.end(); +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.try_emplace(Itor, K, 1)); +#else + size_t Size = Map.size(); + Map.try_emplace(Itor, K, 1); + if (Mode() == ::Mode::Hit) { + if (Size != Map.size()) + State.SkipWithError("Emplaced a duplicate element"); + } else { + if (Size + 1 != Map.size()) + State.SkipWithError("Failed to emplace a new element"); + } +#endif + } + } + + State.PauseTiming(); + Data = makeTestingSets(MapSize, Mode(), Shuffle::None, 1000); + State.ResumeTiming(); + } + } + + void run(benchmark::State& State) const { + static constexpr auto h = Hint(); + run(State); + } + + std::string name() const { + return "BM_TryEmplaceHint" + baseName() + Mode::name() + Hint::name(); + } +}; + +template +struct Erase : Base { + using Base::Base; + + void run(benchmark::State& State) const { + auto Data = makeTestingSets( + MapSize, Mode(), + Order::value == ::Order::Random ? Shuffle::Keys : Shuffle::None, 1000); + while (State.KeepRunningBatch(MapSize * Data.Maps.size())) { + for (auto& Map : Data.Maps) { + for (auto K : Data.Keys) { +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.erase(K)); +#else + size_t I = Map.erase(K); + if (Mode() == ::Mode::Hit) { + if (I == 0) + State.SkipWithError("Did not find the existing element"); + } else { + if (I == 1) + State.SkipWithError("Did find the non-existing element"); + } +#endif + } + } + + State.PauseTiming(); + Data = makeTestingSets(MapSize, Mode(), + Order::value == ::Order::Random ? Shuffle::Keys + : Shuffle::None, + 1000); + State.ResumeTiming(); + } + } + + std::string name() const { + return "BM_Erase" + baseName() + Mode::name() + Order::name(); + } +}; + +template +struct EraseIterator : Base { + using Base::Base; + + void run(benchmark::State& State) const { + auto Data = makeTestingSets( + MapSize, Mode::Hit, + Order::value == ::Order::Random ? 
Shuffle::Hints : Shuffle::None, 1000); + while (State.KeepRunningBatch(MapSize * Data.Maps.size())) { + for (size_t I = 0; I < Data.Maps.size(); ++I) { + auto& Map = Data.Maps[I]; + for (auto H : Data.Hints[I]) { + benchmark::DoNotOptimize(Map.erase(H)); + } +#ifdef VALIDATE + if (!Map.empty()) + State.SkipWithError("Did not erase the entire map"); +#endif + } + + State.PauseTiming(); + Data = makeTestingSets(MapSize, Mode::Hit, + Order::value == ::Order::Random ? Shuffle::Hints + : Shuffle::None, + 1000); + State.ResumeTiming(); + } + } + + std::string name() const { + return "BM_EraseIterator" + baseName() + Order::name(); + } +}; + +struct EraseRange : Base { + using Base::Base; + + void run(benchmark::State& State) const { + auto Data = makeTestingSets(MapSize, Mode::Hit, Shuffle::None, 1000); + while (State.KeepRunningBatch(MapSize * Data.Maps.size())) { + for (auto& Map : Data.Maps) { +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.erase(Map.begin(), Map.end())); +#else + Map.erase(Map.begin(), Map.end()); + if (!Map.empty()) + State.SkipWithError("Did not erase the entire map"); +#endif + } + + State.PauseTiming(); + Data = makeTestingSets(MapSize, Mode::Hit, Shuffle::None, 1000); + State.ResumeTiming(); + } + } + + std::string name() const { return "BM_EraseRange" + baseName(); } +}; + +//*******************************************************************| +// Lookup | +//*******************************************************************| + +template +struct Count : Base { + using Base::Base; + + void run(benchmark::State& State) const { + auto Data = makeTestingSets( + MapSize, Mode(), + Order::value == ::Order::Random ? Shuffle::Keys : Shuffle::None, 1); + auto& Map = Data.Maps.front(); + while (State.KeepRunningBatch(MapSize)) { + for (auto K : Data.Keys) { +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.count(K)); +#else + size_t I = Map.count(K); + if (Mode() == ::Mode::Hit) { + if (I == 0) + State.SkipWithError("Did not find the existing element"); + } else { + if (I == 1) + State.SkipWithError("Did find the non-existing element"); + } +#endif + } + } + } + + std::string name() const { + return "BM_Count" + baseName() + Mode::name() + Order::name(); + } +}; + +template +struct Find : Base { + using Base::Base; + + void run(benchmark::State& State) const { + auto Data = makeTestingSets( + MapSize, Mode(), + Order::value == ::Order::Random ? Shuffle::Keys : Shuffle::None, 1); + auto& Map = Data.Maps.front(); + while (State.KeepRunningBatch(MapSize)) { + for (auto K : Data.Keys) { +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.find(K)); +#else + auto Itor = Map.find(K); + if (Mode() == ::Mode::Hit) { + if (Itor == Map.end()) + State.SkipWithError("Did not find the existing element"); + } else { + if (Itor != Map.end()) + State.SkipWithError("Did find the non-existing element"); + } +#endif + } + } + } + + std::string name() const { + return "BM_Find" + baseName() + Mode::name() + Order::name(); + } +}; + +template +struct EqualRange : Base { + using Base::Base; + + void run(benchmark::State& State) const { + auto Data = makeTestingSets( + MapSize, Mode(), + Order::value == ::Order::Random ? Shuffle::Keys : Shuffle::None, 1); + auto& Map = Data.Maps.front(); + while (State.KeepRunningBatch(MapSize)) { + for (auto K : Data.Keys) { +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.equal_range(K)); +#else + auto Range = Map.equal_range(K); + if (Mode() == ::Mode::Hit) { + // Adjust validation for the last element. 
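+          // For the largest key in the map, equal_range's second iterator is
+          // Map.end() and cannot be dereferenced; step it back one element and
+          // lower the expected key so the check below still compares against a
+          // real element.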
+ auto Key = K; + if (Range.second == Map.end() && K == 2 * MapSize) { + --Range.second; + Key -= 2; + } + if (Range.first == Map.end() || Range.first->first != K || + Range.second == Map.end() || Range.second->first - 2 != Key) + State.SkipWithError("Did not find the existing element"); + } else { + if (Range.first == Map.end() || Range.first->first - 1 != K || + Range.second == Map.end() || Range.second->first - 1 != K) + State.SkipWithError("Did find the non-existing element"); + } +#endif + } + } + } + + std::string name() const { + return "BM_EqualRange" + baseName() + Mode::name() + Order::name(); + } +}; + +template +struct LowerBound : Base { + using Base::Base; + + void run(benchmark::State& State) const { + auto Data = makeTestingSets( + MapSize, Mode(), + Order::value == ::Order::Random ? Shuffle::Keys : Shuffle::None, 1); + auto& Map = Data.Maps.front(); + while (State.KeepRunningBatch(MapSize)) { + for (auto K : Data.Keys) { +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.lower_bound(K)); +#else + auto Itor = Map.lower_bound(K); + if (Mode() == ::Mode::Hit) { + if (Itor == Map.end() || Itor->first != K) + State.SkipWithError("Did not find the existing element"); + } else { + if (Itor == Map.end() || Itor->first - 1 != K) + State.SkipWithError("Did find the non-existing element"); + } +#endif + } + } + } + + std::string name() const { + return "BM_LowerBound" + baseName() + Mode::name() + Order::name(); + } +}; + +template +struct UpperBound : Base { + using Base::Base; + + void run(benchmark::State& State) const { + auto Data = makeTestingSets( + MapSize, Mode(), + Order::value == ::Order::Random ? Shuffle::Keys : Shuffle::None, 1); + auto& Map = Data.Maps.front(); + while (State.KeepRunningBatch(MapSize)) { + for (auto K : Data.Keys) { +#ifndef VALIDATE + benchmark::DoNotOptimize(Map.upper_bound(K)); +#else + std::map::iterator Itor = Map.upper_bound(K); + if (Mode() == ::Mode::Hit) { + // Adjust validation for the last element. 
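+          // upper_bound of the largest key returns Map.end(), which cannot be
+          // dereferenced; step the iterator back and lower the expected key so
+          // the check below still compares against a real element.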
+          auto Key = K;
+          if (Itor == Map.end() && K == 2 * MapSize) {
+            --Itor;
+            Key -= 2;
+          }
+          if (Itor == Map.end() || Itor->first - 2 != Key)
+            State.SkipWithError("Did not find the existing element");
+        } else {
+          if (Itor == Map.end() || Itor->first - 1 != K)
+            State.SkipWithError("Did find the non-existing element");
+        }
+#endif
+      }
+    }
+  }
+
+  std::string name() const {
+    return "BM_UpperBound" + baseName() + Mode::name() + Order::name();
+  }
+};
+
+} // namespace
+
+int main(int argc, char** argv) {
+  benchmark::Initialize(&argc, argv);
+  if (benchmark::ReportUnrecognizedArguments(argc, argv))
+    return 1;
+
+#ifdef VALIDATE
+  const std::vector<size_t> MapSize{10};
+#else
+  const std::vector<size_t> MapSize{10, 100, 1000, 10000, 100000, 1000000};
+#endif
+
+  // Member functions
+  makeCartesianProductBenchmark<ConstructorDefault>();
+  makeCartesianProductBenchmark<ConstructorIterator>(MapSize);
+  makeCartesianProductBenchmark<ConstructorCopy>(MapSize);
+  makeCartesianProductBenchmark<ConstructorMove>(MapSize);
+
+  // Capacity
+  makeCartesianProductBenchmark<Empty>(MapSize);
+  makeCartesianProductBenchmark<Size>(MapSize);
+
+  // Modifiers
+  makeCartesianProductBenchmark<Clear>(MapSize);
+  makeCartesianProductBenchmark<Insert, AllModes, AllOrders>(MapSize);
+  makeCartesianProductBenchmark<InsertHint, AllModes, AllHints>(MapSize);
+  makeCartesianProductBenchmark<InsertAssign, AllModes, AllOrders>(MapSize);
+  makeCartesianProductBenchmark<InsertAssignHint, AllModes, AllHints>(MapSize);
+
+  makeCartesianProductBenchmark<Emplace, AllModes, AllOrders>(MapSize);
+  makeCartesianProductBenchmark<EmplaceHint, AllModes, AllHints>(MapSize);
+  makeCartesianProductBenchmark<TryEmplace, AllModes, AllOrders>(MapSize);
+  makeCartesianProductBenchmark<TryEmplaceHint, AllModes, AllHints>(MapSize);
+  makeCartesianProductBenchmark<Erase, AllModes, AllOrders>(MapSize);
+  makeCartesianProductBenchmark<EraseIterator, AllOrders>(MapSize);
+  makeCartesianProductBenchmark<EraseRange>(MapSize);
+
+  // Lookup
+  makeCartesianProductBenchmark<Count, AllModes, AllOrders>(MapSize);
+  makeCartesianProductBenchmark<Find, AllModes, AllOrders>(MapSize);
+  makeCartesianProductBenchmark<EqualRange, AllModes, AllOrders>(MapSize);
+  makeCartesianProductBenchmark<LowerBound, AllModes, AllOrders>(MapSize);
+  makeCartesianProductBenchmark<UpperBound, AllModes, AllOrders>(MapSize);
+
+  benchmark::RunSpecifiedBenchmarks();
+}
diff --git a/libcxx/docs/DesignDocs/AvailabilityMarkup.rst b/libcxx/docs/DesignDocs/AvailabilityMarkup.rst
index 87ad0abb62d79..26975a7370683 100644
--- a/libcxx/docs/DesignDocs/AvailabilityMarkup.rst
+++ b/libcxx/docs/DesignDocs/AvailabilityMarkup.rst
@@ -64,31 +64,33 @@ Testing
 Some parameters can be passed to lit to run the test-suite and exercise the
 availability.
 
-* The `platform` parameter controls the deployment target. For example lit can
-  be invoked with `--param=platform=macosx10.12`. Default is the current host.
-* The `use_system_cxx_lib` parameter indicates that the test suite is being run
-  against a system library.
+* The `target_triple` parameter controls the deployment target. For example,
+  lit can be invoked with `--param=target_triple=x86_64-apple-macosx10.12`.
+  The default is the current host.
+* The `use_system_cxx_lib` parameter indicates that the test suite is being
+  compiled with the intent of being run against the system library for the
+  given triple, AND that it is being run against it.
 
-Tests can be marked as XFAIL based on multiple features made available by lit:
-
-* if `--param=platform=macosx10.12` is passed, the following features will be available:
-
-  - availability=macosx
-  - availability=macosx10.12
-
-  This feature is used to XFAIL a test that *is* using a class or a method marked
-  as unavailable *and* that is expected to *fail* if deployed on an older system.
-
-* if `use_system_cxx_lib` and `--param=platform=macosx10.12` are passed to lit,
-  the following features will also be available:
+Tests can be marked as XFAIL based on multiple features made available by lit.
+If `use_system_cxx_lib` is true, then assuming `target_triple=x86_64-apple-macosx10.12`, +the following features will be made available: - with_system_cxx_lib=macosx - with_system_cxx_lib=macosx10.12 - with_system_cxx_lib=x86_64-apple-macosx10.12 - This feature is used to XFAIL a test that is *not* using a class or a method - marked as unavailable *but* that is expected to fail if deployed on an older - system. For example, if the test exhibits a bug in the libc on a particular - system version, or if the test uses a symbol that is not available on an - older version of the dylib (but for which there is no availability markup, - otherwise the XFAIL should use `availability` above). +These features are used to XFAIL a test that fails when deployed on (or is +compiled for) an older system. For example, if the test exhibits a bug in the +libc on a particular system version, or if the test uses a symbol that is not +available on an older version of the dylib, it can be marked as XFAIL with +one of the above features. + +It is sometimes useful to check that a test fails specifically when compiled +for a given deployment target. For example, this is the case when testing +availability markup, where we want to make sure that using the annotated +facility on a deployment target that doesn't support it will fail at compile +time, not at runtime. This can be achieved by creating a `.compile.pass.cpp` +and XFAILing it for the right deployment target. If the test doesn't fail at +compile-time like it's supposed to, the test will XPASS. Another option is to +create a `.verify.cpp` test that checks for the right errors, and mark that +test as requiring `with_system_cxx_lib=`. diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index f5c6e5b8251aa..61773381c15f8 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -170,8 +170,20 @@ Status ------------------------------------------------------------------- ``__cpp_lib_array_constexpr`` ``201811L`` ------------------------------------------------- ----------------- + ``__cpp_lib_atomic_flag_test`` ``201907L`` + ------------------------------------------------- ----------------- + ``__cpp_lib_atomic_float`` *unimplemented* + ------------------------------------------------- ----------------- + ``__cpp_lib_atomic_lock_free_type_aliases`` ``201907L`` + ------------------------------------------------- ----------------- ``__cpp_lib_atomic_ref`` *unimplemented* ------------------------------------------------- ----------------- + ``__cpp_lib_atomic_shared_ptr`` *unimplemented* + ------------------------------------------------- ----------------- + ``__cpp_lib_atomic_value_initialization`` *unimplemented* + ------------------------------------------------- ----------------- + ``__cpp_lib_atomic_wait`` ``201907L`` + ------------------------------------------------- ----------------- ``__cpp_lib_bind_front`` *unimplemented* ------------------------------------------------- ----------------- ``__cpp_lib_bit_cast`` *unimplemented* diff --git a/libcxx/include/__config b/libcxx/include/__config index 17e6bfe207aaf..c29fd4267f323 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -456,10 +456,6 @@ typedef __char32_t char32_t; #define _LIBCPP_HAS_NO_AUTO_TYPE #endif -#if !(__has_feature(cxx_variadic_templates)) -#define _LIBCPP_HAS_NO_VARIADICS -#endif - // Objective-C++ features (opt-in) #if __has_feature(objc_arc) #define _LIBCPP_HAS_OBJC_ARC diff --git 
a/libcxx/include/algorithm b/libcxx/include/algorithm index 83e49f19ab987..5d09b6c3c0150 100644 --- a/libcxx/include/algorithm +++ b/libcxx/include/algorithm @@ -1631,7 +1631,7 @@ search_n(_ForwardIterator __first, _ForwardIterator __last, _Size __count, const // copy template -inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR _Iter __unwrap_iter(_Iter __i) { @@ -1639,7 +1639,7 @@ __unwrap_iter(_Iter __i) } template -inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR typename enable_if < is_trivially_copy_assignable<_Tp>::value, @@ -1653,7 +1653,7 @@ __unwrap_iter(move_iterator<_Tp*> __i) #if _LIBCPP_DEBUG_LEVEL < 2 template -inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_IF_NODEBUG +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR typename enable_if < is_trivially_copy_assignable<_Tp>::value, @@ -1665,7 +1665,7 @@ __unwrap_iter(__wrap_iter<_Tp*> __i) } template -inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_IF_NODEBUG +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR typename enable_if < is_trivially_copy_assignable<_Tp>::value, @@ -1679,7 +1679,7 @@ __unwrap_iter(__wrap_iter __i) #else template -inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_IF_NODEBUG +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR typename enable_if < is_trivially_copy_assignable<_Tp>::value, @@ -1859,18 +1859,28 @@ copy_n(_InputIterator __first, _Size __orig_n, _OutputIterator __result) // move +// __move_constexpr exists so that __move doesn't call itself when delegating to the constexpr +// version of __move. template -inline _LIBCPP_INLINE_VISIBILITY +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14 _OutputIterator -__move(_InputIterator __first, _InputIterator __last, _OutputIterator __result) +__move_constexpr(_InputIterator __first, _InputIterator __last, _OutputIterator __result) { for (; __first != __last; ++__first, (void) ++__result) *__result = _VSTD::move(*__first); return __result; } +template +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14 +_OutputIterator +__move(_InputIterator __first, _InputIterator __last, _OutputIterator __result) +{ + return __move_constexpr(__first, __last, __result); +} + template -inline _LIBCPP_INLINE_VISIBILITY +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14 typename enable_if < is_same::type, _Up>::value && @@ -1879,6 +1889,8 @@ typename enable_if >::type __move(_Tp* __first, _Tp* __last, _Up* __result) { + if (__libcpp_is_constant_evaluated()) + return __move_constexpr(__first, __last, __result); const size_t __n = static_cast(__last - __first); if (__n > 0) _VSTD::memmove(__result, __first, __n * sizeof(_Up)); @@ -1886,7 +1898,7 @@ __move(_Tp* __first, _Tp* __last, _Up* __result) } template -inline _LIBCPP_INLINE_VISIBILITY +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 _OutputIterator move(_InputIterator __first, _InputIterator __last, _OutputIterator __result) { @@ -1895,18 +1907,28 @@ move(_InputIterator __first, _InputIterator __last, _OutputIterator __result) // move_backward +// __move_backward_constexpr exists so that __move_backward doesn't call itself when delegating to +// the constexpr version of __move_backward. 
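The comment above, like the `__move` hunk earlier in this file's diff, is one half of a dispatch pattern: the pointer overload wants to call `memmove`, but `memmove` cannot be used during constant evaluation, so the plain loop helper is chosen when the call is constant-evaluated. Below is a minimal standalone sketch of the same pattern; it uses the standard C++20 `std::is_constant_evaluated()` in place of libc++'s internal `__libcpp_is_constant_evaluated()`, and all names are invented for illustration.

```cpp
#include <cstring>
#include <type_traits>

// Copy a range of trivially copy-assignable elements: a plain loop during
// constant evaluation, a single memcpy at run time.
template <class T>
constexpr T* copy_elems(const T* first, const T* last, T* out) {
  static_assert(std::is_trivially_copy_assignable<T>::value,
                "fast path only applies to trivially copy-assignable types");
  if (std::is_constant_evaluated()) {
    // memcpy/memmove are not permitted in constant expressions.
    for (; first != last; ++first, ++out)
      *out = *first;
    return out;
  }
  const std::size_t n = static_cast<std::size_t>(last - first);
  if (n > 0)
    std::memcpy(out, first, n * sizeof(T));
  return out + n;
}

// The same call works in both contexts.
constexpr int demo() {
  int src[3] = {1, 2, 3};
  int dst[3] = {};
  copy_elems(src, src + 3, dst);
  return dst[2];
}
static_assert(demo() == 3, "loop path taken during constant evaluation");
```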
template -inline _LIBCPP_INLINE_VISIBILITY +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14 _OutputIterator -__move_backward(_InputIterator __first, _InputIterator __last, _OutputIterator __result) +__move_backward_constexpr(_InputIterator __first, _InputIterator __last, _OutputIterator __result) { while (__first != __last) *--__result = _VSTD::move(*--__last); return __result; } +template +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14 +_OutputIterator +__move_backward(_InputIterator __first, _InputIterator __last, _OutputIterator __result) +{ + return __move_backward_constexpr(__first, __last, __result); +} + template -inline _LIBCPP_INLINE_VISIBILITY +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14 typename enable_if < is_same::type, _Up>::value && @@ -1915,6 +1937,8 @@ typename enable_if >::type __move_backward(_Tp* __first, _Tp* __last, _Up* __result) { + if (__libcpp_is_constant_evaluated()) + return __move_backward_constexpr(__first, __last, __result); const size_t __n = static_cast(__last - __first); if (__n > 0) { @@ -1925,7 +1949,7 @@ __move_backward(_Tp* __first, _Tp* __last, _Up* __result) } template -inline _LIBCPP_INLINE_VISIBILITY +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 _BidirectionalIterator2 move_backward(_BidirectionalIterator1 __first, _BidirectionalIterator1 __last, _BidirectionalIterator2 __result) @@ -2333,7 +2357,7 @@ reverse_copy(_BidirectionalIterator __first, _BidirectionalIterator __last, _Out // rotate template -_ForwardIterator +_LIBCPP_CONSTEXPR_AFTER_CXX11 _ForwardIterator __rotate_left(_ForwardIterator __first, _ForwardIterator __last) { typedef typename iterator_traits<_ForwardIterator>::value_type value_type; @@ -2344,7 +2368,7 @@ __rotate_left(_ForwardIterator __first, _ForwardIterator __last) } template -_BidirectionalIterator +_LIBCPP_CONSTEXPR_AFTER_CXX11 _BidirectionalIterator __rotate_right(_BidirectionalIterator __first, _BidirectionalIterator __last) { typedef typename iterator_traits<_BidirectionalIterator>::value_type value_type; @@ -2356,7 +2380,7 @@ __rotate_right(_BidirectionalIterator __first, _BidirectionalIterator __last) } template -_ForwardIterator +_LIBCPP_CONSTEXPR_AFTER_CXX14 _ForwardIterator __rotate_forward(_ForwardIterator __first, _ForwardIterator __middle, _ForwardIterator __last) { _ForwardIterator __i = __middle; @@ -2392,7 +2416,7 @@ __rotate_forward(_ForwardIterator __first, _ForwardIterator __middle, _ForwardIt template inline _LIBCPP_INLINE_VISIBILITY -_Integral +_LIBCPP_CONSTEXPR_AFTER_CXX14 _Integral __algo_gcd(_Integral __x, _Integral __y) { do @@ -2405,7 +2429,7 @@ __algo_gcd(_Integral __x, _Integral __y) } template -_RandomAccessIterator +_LIBCPP_CONSTEXPR_AFTER_CXX14 _RandomAccessIterator __rotate_gcd(_RandomAccessIterator __first, _RandomAccessIterator __middle, _RandomAccessIterator __last) { typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type; @@ -2441,7 +2465,7 @@ __rotate_gcd(_RandomAccessIterator __first, _RandomAccessIterator __middle, _Ran template inline _LIBCPP_INLINE_VISIBILITY -_ForwardIterator +_LIBCPP_CONSTEXPR_AFTER_CXX11 _ForwardIterator __rotate(_ForwardIterator __first, _ForwardIterator __middle, _ForwardIterator __last, _VSTD::forward_iterator_tag) { @@ -2456,7 +2480,7 @@ __rotate(_ForwardIterator __first, _ForwardIterator __middle, _ForwardIterator _ template inline _LIBCPP_INLINE_VISIBILITY -_BidirectionalIterator +_LIBCPP_CONSTEXPR_AFTER_CXX11 _BidirectionalIterator 
__rotate(_BidirectionalIterator __first, _BidirectionalIterator __middle, _BidirectionalIterator __last, _VSTD::bidirectional_iterator_tag) { @@ -2473,7 +2497,7 @@ __rotate(_BidirectionalIterator __first, _BidirectionalIterator __middle, _Bidir template inline _LIBCPP_INLINE_VISIBILITY -_RandomAccessIterator +_LIBCPP_CONSTEXPR_AFTER_CXX11 _RandomAccessIterator __rotate(_RandomAccessIterator __first, _RandomAccessIterator __middle, _RandomAccessIterator __last, _VSTD::random_access_iterator_tag) { @@ -2491,7 +2515,7 @@ __rotate(_RandomAccessIterator __first, _RandomAccessIterator __middle, _RandomA template inline _LIBCPP_INLINE_VISIBILITY -_ForwardIterator +_LIBCPP_CONSTEXPR_AFTER_CXX17 _ForwardIterator rotate(_ForwardIterator __first, _ForwardIterator __middle, _ForwardIterator __last) { if (__first == __middle) @@ -2505,7 +2529,7 @@ rotate(_ForwardIterator __first, _ForwardIterator __middle, _ForwardIterator __l // rotate_copy template -inline _LIBCPP_INLINE_VISIBILITY +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 _OutputIterator rotate_copy(_ForwardIterator __first, _ForwardIterator __middle, _ForwardIterator __last, _OutputIterator __result) { @@ -4370,6 +4394,7 @@ binary_search(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __va // merge template +_LIBCPP_CONSTEXPR_AFTER_CXX17 _OutputIterator __merge(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _InputIterator2 __last2, _OutputIterator __result, _Compare __comp) @@ -4393,7 +4418,7 @@ __merge(_InputIterator1 __first1, _InputIterator1 __last1, } template -inline _LIBCPP_INLINE_VISIBILITY +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 _OutputIterator merge(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _InputIterator2 __last2, _OutputIterator __result, _Compare __comp) @@ -4403,7 +4428,7 @@ merge(_InputIterator1 __first1, _InputIterator1 __last1, } template -inline _LIBCPP_INLINE_VISIBILITY +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 _OutputIterator merge(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _InputIterator2 __last2, _OutputIterator __result) diff --git a/libcxx/include/any b/libcxx/include/any index 36b07c9d7e753..7546f31248772 100644 --- a/libcxx/include/any +++ b/libcxx/include/any @@ -82,7 +82,6 @@ namespace std { #include #include -#include #include #include #include @@ -368,7 +367,11 @@ namespace __any_imp template _LIBCPP_INLINE_VISIBILITY static _Tp& __create(any & __dest, _Args&&... __args) { - _Tp* __ret = ::new (static_cast(&__dest.__s.__buf)) _Tp(_VSTD::forward<_Args>(__args)...); + typedef allocator<_Tp> _Alloc; + typedef allocator_traits<_Alloc> _ATraits; + _Alloc __a; + _Tp * __ret = static_cast<_Tp*>(static_cast(&__dest.__s.__buf)); + _ATraits::construct(__a, __ret, _VSTD::forward<_Args>(__args)...); __dest.__h = &_SmallHandler::__handle; return *__ret; } @@ -376,8 +379,11 @@ namespace __any_imp private: _LIBCPP_INLINE_VISIBILITY static void __destroy(any & __this) { - _Tp & __value = *static_cast<_Tp *>(static_cast(&__this.__s.__buf)); - __value.~_Tp(); + typedef allocator<_Tp> _Alloc; + typedef allocator_traits<_Alloc> _ATraits; + _Alloc __a; + _Tp * __p = static_cast<_Tp *>(static_cast(&__this.__s.__buf)); + _ATraits::destroy(__a, __p); __this.__h = nullptr; } @@ -445,10 +451,12 @@ namespace __any_imp _LIBCPP_INLINE_VISIBILITY static _Tp& __create(any & __dest, _Args&&... 
__args) { typedef allocator<_Tp> _Alloc; + typedef allocator_traits<_Alloc> _ATraits; typedef __allocator_destructor<_Alloc> _Dp; _Alloc __a; - unique_ptr<_Tp, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - _Tp* __ret = ::new ((void*)__hold.get()) _Tp(_VSTD::forward<_Args>(__args)...); + unique_ptr<_Tp, _Dp> __hold(_ATraits::allocate(__a, 1), _Dp(__a, 1)); + _Tp * __ret = __hold.get(); + _ATraits::construct(__a, __ret, _VSTD::forward<_Args>(__args)...); __dest.__s.__ptr = __hold.release(); __dest.__h = &_LargeHandler::__handle; return *__ret; @@ -458,7 +466,12 @@ namespace __any_imp _LIBCPP_INLINE_VISIBILITY static void __destroy(any & __this){ - delete static_cast<_Tp*>(__this.__s.__ptr); + typedef allocator<_Tp> _Alloc; + typedef allocator_traits<_Alloc> _ATraits; + _Alloc __a; + _Tp * __p = static_cast<_Tp *>(__this.__s.__ptr); + _ATraits::destroy(__a, __p); + _ATraits::deallocate(__a, __p, 1); __this.__h = nullptr; } diff --git a/libcxx/include/atomic b/libcxx/include/atomic index 9c28986537882..56bd03584c9b4 100644 --- a/libcxx/include/atomic +++ b/libcxx/include/atomic @@ -16,9 +16,12 @@ namespace std { -// feature test macro +// feature test macro [version.syn] -#define __cpp_lib_atomic_is_always_lock_free // as specified by SG10 +#define __cpp_lib_atomic_is_always_lock_free +#define __cpp_lib_atomic_flag_test +#define __cpp_lib_atomic_lock_free_type_aliases +#define __cpp_lib_atomic_wait // order and consistency @@ -108,6 +111,7 @@ template <> struct atomic { using value_type = integral; + using difference_type = value_type; static constexpr bool is_always_lock_free; bool is_lock_free() const volatile noexcept; @@ -190,6 +194,7 @@ template struct atomic { using value_type = T*; + using difference_type = ptrdiff_t; static constexpr bool is_always_lock_free; bool is_lock_free() const volatile noexcept; @@ -1245,10 +1250,10 @@ template _LIBCPP_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_strong(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) { - __a->__lock(); _Tp __temp; + __a->__lock(); __cxx_atomic_assign_volatile(__temp, __a->__a_value); - bool __ret = __temp == *__expected; + bool __ret = (memcmp(&__temp, __expected, sizeof(_Tp)) == 0); if(__ret) __cxx_atomic_assign_volatile(__a->__a_value, __value); else @@ -1261,11 +1266,11 @@ _LIBCPP_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) { __a->__lock(); - bool __ret = __a->__a_value == *__expected; + bool __ret = (memcmp(&__a->__a_value, __expected, sizeof(_Tp)) == 0); if(__ret) - __a->__a_value = __value; + memcpy(&__a->__a_value, &__value, sizeof(_Tp)); else - *__expected = __a->__a_value; + memcpy(__expected, &__a->__a_value, sizeof(_Tp)); __a->__unlock(); return __ret; } @@ -1274,10 +1279,10 @@ template _LIBCPP_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_weak(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) { - __a->__lock(); _Tp __temp; + __a->__lock(); __cxx_atomic_assign_volatile(__temp, __a->__a_value); - bool __ret = __temp == *__expected; + bool __ret = (memcmp(&__temp, __expected, sizeof(_Tp)) == 0); if(__ret) __cxx_atomic_assign_volatile(__a->__a_value, __value); else @@ -1290,11 +1295,11 @@ _LIBCPP_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) { __a->__lock(); - bool __ret = __a->__a_value == 
*__expected; + bool __ret = (memcmp(&__a->__a_value, __expected, sizeof(_Tp)) == 0); if(__ret) - __a->__a_value = __value; + memcpy(&__a->__a_value, &__value, sizeof(_Tp)); else - *__expected = __a->__a_value; + memcpy(__expected, &__a->__a_value, sizeof(_Tp)); __a->__unlock(); return __ret; } @@ -1775,6 +1780,7 @@ struct atomic { typedef __atomic_base<_Tp> __base; typedef _Tp value_type; + typedef value_type difference_type; _LIBCPP_INLINE_VISIBILITY atomic() _NOEXCEPT _LIBCPP_DEFAULT _LIBCPP_INLINE_VISIBILITY @@ -1796,6 +1802,7 @@ struct atomic<_Tp*> { typedef __atomic_base<_Tp*> __base; typedef _Tp* value_type; + typedef ptrdiff_t difference_type; _LIBCPP_INLINE_VISIBILITY atomic() _NOEXCEPT _LIBCPP_DEFAULT _LIBCPP_INLINE_VISIBILITY @@ -1872,7 +1879,7 @@ atomic_is_lock_free(const atomic<_Tp>* __o) _NOEXCEPT template _LIBCPP_INLINE_VISIBILITY void -atomic_init(volatile atomic<_Tp>* __o, _Tp __d) _NOEXCEPT +atomic_init(volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __d) _NOEXCEPT { __cxx_atomic_init(&__o->__a_, __d); } @@ -1880,7 +1887,7 @@ atomic_init(volatile atomic<_Tp>* __o, _Tp __d) _NOEXCEPT template _LIBCPP_INLINE_VISIBILITY void -atomic_init(atomic<_Tp>* __o, _Tp __d) _NOEXCEPT +atomic_init(atomic<_Tp>* __o, typename atomic<_Tp>::value_type __d) _NOEXCEPT { __cxx_atomic_init(&__o->__a_, __d); } @@ -1890,7 +1897,7 @@ atomic_init(atomic<_Tp>* __o, _Tp __d) _NOEXCEPT template _LIBCPP_INLINE_VISIBILITY void -atomic_store(volatile atomic<_Tp>* __o, _Tp __d) _NOEXCEPT +atomic_store(volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __d) _NOEXCEPT { __o->store(__d); } @@ -1898,7 +1905,7 @@ atomic_store(volatile atomic<_Tp>* __o, _Tp __d) _NOEXCEPT template _LIBCPP_INLINE_VISIBILITY void -atomic_store(atomic<_Tp>* __o, _Tp __d) _NOEXCEPT +atomic_store(atomic<_Tp>* __o, typename atomic<_Tp>::value_type __d) _NOEXCEPT { __o->store(__d); } @@ -1908,7 +1915,7 @@ atomic_store(atomic<_Tp>* __o, _Tp __d) _NOEXCEPT template _LIBCPP_INLINE_VISIBILITY void -atomic_store_explicit(volatile atomic<_Tp>* __o, _Tp __d, memory_order __m) _NOEXCEPT +atomic_store_explicit(volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __d, memory_order __m) _NOEXCEPT _LIBCPP_CHECK_STORE_MEMORY_ORDER(__m) { __o->store(__d, __m); @@ -1917,7 +1924,7 @@ atomic_store_explicit(volatile atomic<_Tp>* __o, _Tp __d, memory_order __m) _NOE template _LIBCPP_INLINE_VISIBILITY void -atomic_store_explicit(atomic<_Tp>* __o, _Tp __d, memory_order __m) _NOEXCEPT +atomic_store_explicit(atomic<_Tp>* __o, typename atomic<_Tp>::value_type __d, memory_order __m) _NOEXCEPT _LIBCPP_CHECK_STORE_MEMORY_ORDER(__m) { __o->store(__d, __m); @@ -1966,7 +1973,7 @@ atomic_load_explicit(const atomic<_Tp>* __o, memory_order __m) _NOEXCEPT template _LIBCPP_INLINE_VISIBILITY _Tp -atomic_exchange(volatile atomic<_Tp>* __o, _Tp __d) _NOEXCEPT +atomic_exchange(volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __d) _NOEXCEPT { return __o->exchange(__d); } @@ -1974,7 +1981,7 @@ atomic_exchange(volatile atomic<_Tp>* __o, _Tp __d) _NOEXCEPT template _LIBCPP_INLINE_VISIBILITY _Tp -atomic_exchange(atomic<_Tp>* __o, _Tp __d) _NOEXCEPT +atomic_exchange(atomic<_Tp>* __o, typename atomic<_Tp>::value_type __d) _NOEXCEPT { return __o->exchange(__d); } @@ -1984,7 +1991,7 @@ atomic_exchange(atomic<_Tp>* __o, _Tp __d) _NOEXCEPT template _LIBCPP_INLINE_VISIBILITY _Tp -atomic_exchange_explicit(volatile atomic<_Tp>* __o, _Tp __d, memory_order __m) _NOEXCEPT +atomic_exchange_explicit(volatile atomic<_Tp>* __o, typename 
atomic<_Tp>::value_type __d, memory_order __m) _NOEXCEPT { return __o->exchange(__d, __m); } @@ -1992,7 +1999,7 @@ atomic_exchange_explicit(volatile atomic<_Tp>* __o, _Tp __d, memory_order __m) _ template _LIBCPP_INLINE_VISIBILITY _Tp -atomic_exchange_explicit(atomic<_Tp>* __o, _Tp __d, memory_order __m) _NOEXCEPT +atomic_exchange_explicit(atomic<_Tp>* __o, typename atomic<_Tp>::value_type __d, memory_order __m) _NOEXCEPT { return __o->exchange(__d, __m); } @@ -2002,7 +2009,7 @@ atomic_exchange_explicit(atomic<_Tp>* __o, _Tp __d, memory_order __m) _NOEXCEPT template _LIBCPP_INLINE_VISIBILITY bool -atomic_compare_exchange_weak(volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d) _NOEXCEPT +atomic_compare_exchange_weak(volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type* __e, typename atomic<_Tp>::value_type __d) _NOEXCEPT { return __o->compare_exchange_weak(*__e, __d); } @@ -2010,7 +2017,7 @@ atomic_compare_exchange_weak(volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d) _NOEX template _LIBCPP_INLINE_VISIBILITY bool -atomic_compare_exchange_weak(atomic<_Tp>* __o, _Tp* __e, _Tp __d) _NOEXCEPT +atomic_compare_exchange_weak(atomic<_Tp>* __o, typename atomic<_Tp>::value_type* __e, typename atomic<_Tp>::value_type __d) _NOEXCEPT { return __o->compare_exchange_weak(*__e, __d); } @@ -2020,7 +2027,7 @@ atomic_compare_exchange_weak(atomic<_Tp>* __o, _Tp* __e, _Tp __d) _NOEXCEPT template _LIBCPP_INLINE_VISIBILITY bool -atomic_compare_exchange_strong(volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d) _NOEXCEPT +atomic_compare_exchange_strong(volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type* __e, typename atomic<_Tp>::value_type __d) _NOEXCEPT { return __o->compare_exchange_strong(*__e, __d); } @@ -2028,7 +2035,7 @@ atomic_compare_exchange_strong(volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d) _NO template _LIBCPP_INLINE_VISIBILITY bool -atomic_compare_exchange_strong(atomic<_Tp>* __o, _Tp* __e, _Tp __d) _NOEXCEPT +atomic_compare_exchange_strong(atomic<_Tp>* __o, typename atomic<_Tp>::value_type* __e, typename atomic<_Tp>::value_type __d) _NOEXCEPT { return __o->compare_exchange_strong(*__e, __d); } @@ -2038,8 +2045,8 @@ atomic_compare_exchange_strong(atomic<_Tp>* __o, _Tp* __e, _Tp __d) _NOEXCEPT template _LIBCPP_INLINE_VISIBILITY bool -atomic_compare_exchange_weak_explicit(volatile atomic<_Tp>* __o, _Tp* __e, - _Tp __d, +atomic_compare_exchange_weak_explicit(volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type* __e, + typename atomic<_Tp>::value_type __d, memory_order __s, memory_order __f) _NOEXCEPT _LIBCPP_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) { @@ -2049,7 +2056,7 @@ atomic_compare_exchange_weak_explicit(volatile atomic<_Tp>* __o, _Tp* __e, template _LIBCPP_INLINE_VISIBILITY bool -atomic_compare_exchange_weak_explicit(atomic<_Tp>* __o, _Tp* __e, _Tp __d, +atomic_compare_exchange_weak_explicit(atomic<_Tp>* __o, typename atomic<_Tp>::value_type* __e, typename atomic<_Tp>::value_type __d, memory_order __s, memory_order __f) _NOEXCEPT _LIBCPP_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) { @@ -2062,7 +2069,7 @@ template _LIBCPP_INLINE_VISIBILITY bool atomic_compare_exchange_strong_explicit(volatile atomic<_Tp>* __o, - _Tp* __e, _Tp __d, + typename atomic<_Tp>::value_type* __e, typename atomic<_Tp>::value_type __d, memory_order __s, memory_order __f) _NOEXCEPT _LIBCPP_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) { @@ -2072,8 +2079,8 @@ atomic_compare_exchange_strong_explicit(volatile atomic<_Tp>* __o, template _LIBCPP_INLINE_VISIBILITY bool -atomic_compare_exchange_strong_explicit(atomic<_Tp>* __o, _Tp* __e, - 
_Tp __d, +atomic_compare_exchange_strong_explicit(atomic<_Tp>* __o, typename atomic<_Tp>::value_type* __e, + typename atomic<_Tp>::value_type __d, memory_order __s, memory_order __f) _NOEXCEPT _LIBCPP_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) { @@ -2156,10 +2163,10 @@ template _LIBCPP_INLINE_VISIBILITY typename enable_if < - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, + is_integral<_Tp>::value && !is_same<_Tp, bool>::value && !is_const<_Tp>::value, _Tp >::type -atomic_fetch_add(volatile atomic<_Tp>* __o, _Tp __op) _NOEXCEPT +atomic_fetch_add(volatile atomic<_Tp>* __o, typename atomic<_Tp>::difference_type __op) _NOEXCEPT { return __o->fetch_add(__op); } @@ -2168,26 +2175,26 @@ template _LIBCPP_INLINE_VISIBILITY typename enable_if < - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, + is_integral<_Tp>::value && !is_same<_Tp, bool>::value && !is_const<_Tp>::value, _Tp >::type -atomic_fetch_add(atomic<_Tp>* __o, _Tp __op) _NOEXCEPT +atomic_fetch_add(atomic<_Tp>* __o, typename atomic<_Tp>::difference_type __op) _NOEXCEPT { return __o->fetch_add(__op); } template _LIBCPP_INLINE_VISIBILITY -_Tp* -atomic_fetch_add(volatile atomic<_Tp*>* __o, ptrdiff_t __op) _NOEXCEPT +_Tp* +atomic_fetch_add(volatile atomic<_Tp*>* __o, typename atomic<_Tp*>::difference_type __op) _NOEXCEPT { return __o->fetch_add(__op); } template _LIBCPP_INLINE_VISIBILITY -_Tp* -atomic_fetch_add(atomic<_Tp*>* __o, ptrdiff_t __op) _NOEXCEPT +_Tp* +atomic_fetch_add(atomic<_Tp*>* __o, typename atomic<_Tp*>::difference_type __op) _NOEXCEPT { return __o->fetch_add(__op); } @@ -2198,10 +2205,10 @@ template _LIBCPP_INLINE_VISIBILITY typename enable_if < - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, + is_integral<_Tp>::value && !is_same<_Tp, bool>::value && !is_const<_Tp>::value, _Tp >::type -atomic_fetch_add_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) _NOEXCEPT +atomic_fetch_add_explicit(volatile atomic<_Tp>* __o, typename atomic<_Tp>::difference_type __op, memory_order __m) _NOEXCEPT { return __o->fetch_add(__op, __m); } @@ -2210,10 +2217,10 @@ template _LIBCPP_INLINE_VISIBILITY typename enable_if < - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, + is_integral<_Tp>::value && !is_same<_Tp, bool>::value && !is_const<_Tp>::value, _Tp >::type -atomic_fetch_add_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) _NOEXCEPT +atomic_fetch_add_explicit(atomic<_Tp>* __o, typename atomic<_Tp>::difference_type __op, memory_order __m) _NOEXCEPT { return __o->fetch_add(__op, __m); } @@ -2221,8 +2228,7 @@ atomic_fetch_add_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) _NOEXCEP template _LIBCPP_INLINE_VISIBILITY _Tp* -atomic_fetch_add_explicit(volatile atomic<_Tp*>* __o, ptrdiff_t __op, - memory_order __m) _NOEXCEPT +atomic_fetch_add_explicit(volatile atomic<_Tp*>* __o, typename atomic<_Tp*>::difference_type __op, memory_order __m) _NOEXCEPT { return __o->fetch_add(__op, __m); } @@ -2230,7 +2236,7 @@ atomic_fetch_add_explicit(volatile atomic<_Tp*>* __o, ptrdiff_t __op, template _LIBCPP_INLINE_VISIBILITY _Tp* -atomic_fetch_add_explicit(atomic<_Tp*>* __o, ptrdiff_t __op, memory_order __m) _NOEXCEPT +atomic_fetch_add_explicit(atomic<_Tp*>* __o, typename atomic<_Tp*>::difference_type __op, memory_order __m) _NOEXCEPT { return __o->fetch_add(__op, __m); } @@ -2241,10 +2247,10 @@ template _LIBCPP_INLINE_VISIBILITY typename enable_if < - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, + is_integral<_Tp>::value && !is_same<_Tp, bool>::value && !is_const<_Tp>::value, _Tp >::type 
-atomic_fetch_sub(volatile atomic<_Tp>* __o, _Tp __op) _NOEXCEPT +atomic_fetch_sub(volatile atomic<_Tp>* __o, typename atomic<_Tp>::difference_type __op) _NOEXCEPT { return __o->fetch_sub(__op); } @@ -2253,10 +2259,10 @@ template _LIBCPP_INLINE_VISIBILITY typename enable_if < - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, + is_integral<_Tp>::value && !is_same<_Tp, bool>::value && !is_const<_Tp>::value, _Tp >::type -atomic_fetch_sub(atomic<_Tp>* __o, _Tp __op) _NOEXCEPT +atomic_fetch_sub(atomic<_Tp>* __o, typename atomic<_Tp>::difference_type __op) _NOEXCEPT { return __o->fetch_sub(__op); } @@ -2264,7 +2270,7 @@ atomic_fetch_sub(atomic<_Tp>* __o, _Tp __op) _NOEXCEPT template _LIBCPP_INLINE_VISIBILITY _Tp* -atomic_fetch_sub(volatile atomic<_Tp*>* __o, ptrdiff_t __op) _NOEXCEPT +atomic_fetch_sub(volatile atomic<_Tp*>* __o, typename atomic<_Tp*>::difference_type __op) _NOEXCEPT { return __o->fetch_sub(__op); } @@ -2272,7 +2278,7 @@ atomic_fetch_sub(volatile atomic<_Tp*>* __o, ptrdiff_t __op) _NOEXCEPT template _LIBCPP_INLINE_VISIBILITY _Tp* -atomic_fetch_sub(atomic<_Tp*>* __o, ptrdiff_t __op) _NOEXCEPT +atomic_fetch_sub(atomic<_Tp*>* __o, typename atomic<_Tp*>::difference_type __op) _NOEXCEPT { return __o->fetch_sub(__op); } @@ -2283,10 +2289,10 @@ template _LIBCPP_INLINE_VISIBILITY typename enable_if < - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, + is_integral<_Tp>::value && !is_same<_Tp, bool>::value && !is_const<_Tp>::value, _Tp >::type -atomic_fetch_sub_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) _NOEXCEPT +atomic_fetch_sub_explicit(volatile atomic<_Tp>* __o, typename atomic<_Tp>::difference_type __op, memory_order __m) _NOEXCEPT { return __o->fetch_sub(__op, __m); } @@ -2295,10 +2301,10 @@ template _LIBCPP_INLINE_VISIBILITY typename enable_if < - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, + is_integral<_Tp>::value && !is_same<_Tp, bool>::value && !is_const<_Tp>::value, _Tp >::type -atomic_fetch_sub_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) _NOEXCEPT +atomic_fetch_sub_explicit(atomic<_Tp>* __o, typename atomic<_Tp>::difference_type __op, memory_order __m) _NOEXCEPT { return __o->fetch_sub(__op, __m); } @@ -2306,8 +2312,7 @@ atomic_fetch_sub_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) _NOEXCEP template _LIBCPP_INLINE_VISIBILITY _Tp* -atomic_fetch_sub_explicit(volatile atomic<_Tp*>* __o, ptrdiff_t __op, - memory_order __m) _NOEXCEPT +atomic_fetch_sub_explicit(volatile atomic<_Tp*>* __o, typename atomic<_Tp*>::difference_type __op, memory_order __m) _NOEXCEPT { return __o->fetch_sub(__op, __m); } @@ -2315,7 +2320,7 @@ atomic_fetch_sub_explicit(volatile atomic<_Tp*>* __o, ptrdiff_t __op, template _LIBCPP_INLINE_VISIBILITY _Tp* -atomic_fetch_sub_explicit(atomic<_Tp*>* __o, ptrdiff_t __op, memory_order __m) _NOEXCEPT +atomic_fetch_sub_explicit(atomic<_Tp*>* __o, typename atomic<_Tp*>::difference_type __op, memory_order __m) _NOEXCEPT { return __o->fetch_sub(__op, __m); } @@ -2329,7 +2334,7 @@ typename enable_if is_integral<_Tp>::value && !is_same<_Tp, bool>::value, _Tp >::type -atomic_fetch_and(volatile atomic<_Tp>* __o, _Tp __op) _NOEXCEPT +atomic_fetch_and(volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __op) _NOEXCEPT { return __o->fetch_and(__op); } @@ -2341,7 +2346,7 @@ typename enable_if is_integral<_Tp>::value && !is_same<_Tp, bool>::value, _Tp >::type -atomic_fetch_and(atomic<_Tp>* __o, _Tp __op) _NOEXCEPT +atomic_fetch_and(atomic<_Tp>* __o, typename atomic<_Tp>::value_type __op) _NOEXCEPT { return 
__o->fetch_and(__op); } @@ -2355,7 +2360,7 @@ typename enable_if is_integral<_Tp>::value && !is_same<_Tp, bool>::value, _Tp >::type -atomic_fetch_and_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) _NOEXCEPT +atomic_fetch_and_explicit(volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __op, memory_order __m) _NOEXCEPT { return __o->fetch_and(__op, __m); } @@ -2367,7 +2372,7 @@ typename enable_if is_integral<_Tp>::value && !is_same<_Tp, bool>::value, _Tp >::type -atomic_fetch_and_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) _NOEXCEPT +atomic_fetch_and_explicit(atomic<_Tp>* __o, typename atomic<_Tp>::value_type __op, memory_order __m) _NOEXCEPT { return __o->fetch_and(__op, __m); } @@ -2381,7 +2386,7 @@ typename enable_if is_integral<_Tp>::value && !is_same<_Tp, bool>::value, _Tp >::type -atomic_fetch_or(volatile atomic<_Tp>* __o, _Tp __op) _NOEXCEPT +atomic_fetch_or(volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __op) _NOEXCEPT { return __o->fetch_or(__op); } @@ -2393,7 +2398,7 @@ typename enable_if is_integral<_Tp>::value && !is_same<_Tp, bool>::value, _Tp >::type -atomic_fetch_or(atomic<_Tp>* __o, _Tp __op) _NOEXCEPT +atomic_fetch_or(atomic<_Tp>* __o, typename atomic<_Tp>::value_type __op) _NOEXCEPT { return __o->fetch_or(__op); } @@ -2407,7 +2412,7 @@ typename enable_if is_integral<_Tp>::value && !is_same<_Tp, bool>::value, _Tp >::type -atomic_fetch_or_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) _NOEXCEPT +atomic_fetch_or_explicit(volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __op, memory_order __m) _NOEXCEPT { return __o->fetch_or(__op, __m); } @@ -2419,7 +2424,7 @@ typename enable_if is_integral<_Tp>::value && !is_same<_Tp, bool>::value, _Tp >::type -atomic_fetch_or_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) _NOEXCEPT +atomic_fetch_or_explicit(atomic<_Tp>* __o, typename atomic<_Tp>::value_type __op, memory_order __m) _NOEXCEPT { return __o->fetch_or(__op, __m); } @@ -2433,7 +2438,7 @@ typename enable_if is_integral<_Tp>::value && !is_same<_Tp, bool>::value, _Tp >::type -atomic_fetch_xor(volatile atomic<_Tp>* __o, _Tp __op) _NOEXCEPT +atomic_fetch_xor(volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __op) _NOEXCEPT { return __o->fetch_xor(__op); } @@ -2445,7 +2450,7 @@ typename enable_if is_integral<_Tp>::value && !is_same<_Tp, bool>::value, _Tp >::type -atomic_fetch_xor(atomic<_Tp>* __o, _Tp __op) _NOEXCEPT +atomic_fetch_xor(atomic<_Tp>* __o, typename atomic<_Tp>::value_type __op) _NOEXCEPT { return __o->fetch_xor(__op); } @@ -2459,7 +2464,7 @@ typename enable_if is_integral<_Tp>::value && !is_same<_Tp, bool>::value, _Tp >::type -atomic_fetch_xor_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) _NOEXCEPT +atomic_fetch_xor_explicit(volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __op, memory_order __m) _NOEXCEPT { return __o->fetch_xor(__op, __m); } @@ -2471,7 +2476,7 @@ typename enable_if is_integral<_Tp>::value && !is_same<_Tp, bool>::value, _Tp >::type -atomic_fetch_xor_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) _NOEXCEPT +atomic_fetch_xor_explicit(atomic<_Tp>* __o, typename atomic<_Tp>::value_type __op, memory_order __m) _NOEXCEPT { return __o->fetch_xor(__op, __m); } diff --git a/libcxx/include/future b/libcxx/include/future index 483266dddec4e..295b6ac5d6ee7 100644 --- a/libcxx/include/future +++ b/libcxx/include/future @@ -1605,8 +1605,6 @@ template struct _LIBCPP_TEMPLATE_VIS uses_allocator, _Alloc> : public true_type {}; -#ifndef 
_LIBCPP_HAS_NO_VARIADICS - // packaged_task template class __packaged_task_base; @@ -2158,6 +2156,8 @@ __make_async_assoc_state(_Fp&& __f) return future<_Rp>(__h.get()); } +#ifndef _LIBCPP_CXX03_LANG + template class _LIBCPP_HIDDEN __async_func { @@ -2225,7 +2225,7 @@ async(_Fp&& __f, _Args&&... __args) _VSTD::forward<_Args>(__args)...); } -#endif // _LIBCPP_HAS_NO_VARIADICS +#endif // C++03 // shared_future diff --git a/libcxx/include/iterator b/libcxx/include/iterator index a13214fca5e4b..e2910e9fdc2a1 100644 --- a/libcxx/include/iterator +++ b/libcxx/include/iterator @@ -1052,9 +1052,19 @@ class _LIBCPP_TEMPLATE_VIS ostream_iterator : public iterator { public: - typedef _CharT char_type; - typedef _Traits traits_type; - typedef basic_ostream<_CharT,_Traits> ostream_type; + typedef output_iterator_tag iterator_category; + typedef void value_type; +#if _LIBCPP_STD_VER > 17 + typedef std::ptrdiff_t difference_type; +#else + typedef void difference_type; +#endif + typedef void pointer; + typedef void reference; + typedef _CharT char_type; + typedef _Traits traits_type; + typedef basic_ostream<_CharT, _Traits> ostream_type; + private: ostream_type* __out_stream_; const char_type* __delim_; @@ -1151,10 +1161,20 @@ class _LIBCPP_TEMPLATE_VIS ostreambuf_iterator : public iterator { public: - typedef _CharT char_type; - typedef _Traits traits_type; - typedef basic_streambuf<_CharT,_Traits> streambuf_type; - typedef basic_ostream<_CharT,_Traits> ostream_type; + typedef output_iterator_tag iterator_category; + typedef void value_type; +#if _LIBCPP_STD_VER > 17 + typedef std::ptrdiff_t difference_type; +#else + typedef void difference_type; +#endif + typedef void pointer; + typedef void reference; + typedef _CharT char_type; + typedef _Traits traits_type; + typedef basic_streambuf<_CharT, _Traits> streambuf_type; + typedef basic_ostream<_CharT, _Traits> ostream_type; + private: streambuf_type* __sbuf_; public: @@ -1373,13 +1393,13 @@ operator+(typename __wrap_iter<_Iter>::difference_type, __wrap_iter<_Iter>) _NOE template _Op _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 copy(_Ip, _Ip, _Op); template _B2 _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 copy_backward(_B1, _B1, _B2); -template _Op _LIBCPP_INLINE_VISIBILITY move(_Ip, _Ip, _Op); -template _B2 _LIBCPP_INLINE_VISIBILITY move_backward(_B1, _B1, _B2); +template _Op _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 move(_Ip, _Ip, _Op); +template _B2 _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 move_backward(_B1, _B1, _B2); #if _LIBCPP_DEBUG_LEVEL < 2 template -_LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_IF_NODEBUG +_LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR typename enable_if < is_trivially_copy_assignable<_Tp>::value, @@ -1390,7 +1410,7 @@ __unwrap_iter(__wrap_iter<_Tp*>); #else template -inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_IF_NODEBUG +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR typename enable_if < is_trivially_copy_assignable<_Tp>::value, @@ -1584,12 +1604,12 @@ private: template friend _LIBCPP_CONSTEXPR_AFTER_CXX17 _Op copy(_Ip, _Ip, _Op); template friend _LIBCPP_CONSTEXPR_AFTER_CXX17 _B2 copy_backward(_B1, _B1, _B2); - template friend _Op move(_Ip, _Ip, _Op); - template friend _B2 move_backward(_B1, _B1, _B2); + template friend _LIBCPP_CONSTEXPR_AFTER_CXX17 _Op move(_Ip, _Ip, _Op); + template friend _LIBCPP_CONSTEXPR_AFTER_CXX17 _B2 move_backward(_B1, _B1, _B2); #if _LIBCPP_DEBUG_LEVEL < 2 template - _LIBCPP_CONSTEXPR_IF_NODEBUG friend + _LIBCPP_CONSTEXPR friend typename 
enable_if < is_trivially_copy_assignable<_Tp>::value, @@ -1598,7 +1618,7 @@ private: __unwrap_iter(__wrap_iter<_Tp*>); #else template - inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_IF_NODEBUG + inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR friend typename enable_if < is_trivially_copy_assignable<_Tp>::value, diff --git a/libcxx/include/memory b/libcxx/include/memory index ebb0a723a162a..0ce7d092a2e11 100644 --- a/libcxx/include/memory +++ b/libcxx/include/memory @@ -762,8 +762,6 @@ struct __pointer_traits_element_type<_Ptr, true> typedef _LIBCPP_NODEBUG_TYPE typename _Ptr::element_type type; }; -#ifndef _LIBCPP_HAS_NO_VARIADICS - template { + using value_type = Small; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using propagate_on_container_move_assignment = std::true_type; + using is_always_equal = std::true_type; + + Small* allocate(std::size_t) { assert(false); } + + template + void construct(Small* p, Args&& ...args) { + new (p) Small(std::forward(args)...); + Small_was_constructed = true; + } + + void destroy(Small* p) { + p->~Small(); + Small_was_destroyed = true; + } + + void deallocate(Small*, std::size_t) { assert(false); } + }; +} // end namespace std + + +int main(int, char**) { + // Test large types + { + { + std::any a = Large(); + (void)a; + + assert(Large_was_allocated); + assert(Large_was_constructed); + } + + assert(Large_was_destroyed); + assert(Large_was_deallocated); + } + + // Test small types + { + { + std::any a = Small(); + (void)a; + + assert(Small_was_constructed); + } + + assert(Small_was_destroyed); + } + + return 0; +} diff --git a/libcxx/test/libcxx/utilities/charconv/charconv.to.chars/availability.fail.cpp b/libcxx/test/libcxx/utilities/charconv/charconv.to.chars/availability.fail.cpp index cd099420d1829..70f5d3c1808d7 100644 --- a/libcxx/test/libcxx/utilities/charconv/charconv.to.chars/availability.fail.cpp +++ b/libcxx/test/libcxx/utilities/charconv/charconv.to.chars/availability.fail.cpp @@ -7,8 +7,12 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03 -// REQUIRES: with_system_cxx_lib=macosx -// REQUIRES: availability=macosx10.9 || availability=macosx10.10 || availability=macosx10.11 || availability=macosx10.12 || availability=macosx10.13 || availability=macosx10.14 +// REQUIRES: with_system_cxx_lib=macosx10.9 || \ +// REQUIRES: with_system_cxx_lib=macosx10.10 || \ +// REQUIRES: with_system_cxx_lib=macosx10.11 || \ +// REQUIRES: with_system_cxx_lib=macosx10.12 || \ +// REQUIRES: with_system_cxx_lib=macosx10.13 || \ +// REQUIRES: with_system_cxx_lib=macosx10.14 // Test the availability markup on std::to_chars. diff --git a/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/function_type_default_deleter.fail.cpp b/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/function_type_default_deleter.fail.cpp index 5dea3cb7cc175..0bba136ade6dc 100644 --- a/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/function_type_default_deleter.fail.cpp +++ b/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/function_type_default_deleter.fail.cpp @@ -1,3 +1,11 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + // UNSUPPORTED: c++03 #include diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp index cdb126d4942ce..7e69c54797c82 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp @@ -13,6 +13,11 @@ // OutIter // move(InIter first, InIter last, OutIter result); +// Older compilers don't support std::is_constant_evaluated +// UNSUPPORTED: clang-4, clang-5, clang-6, clang-7, clang-8 +// UNSUPPORTED: apple-clang-9, apple-clang-10, apple-clang-11 +// UNSUPPORTED: gcc-5, gcc-6, gcc-7, gcc-8 + #include #include #include @@ -21,11 +26,11 @@ #include "test_iterators.h" template -void +_LIBCPP_CONSTEXPR_AFTER_CXX17 bool test() { const unsigned N = 1000; - int ia[N]; + int ia[N] = {}; for (unsigned i = 0; i < N; ++i) ia[i] = i; int ib[N] = {0}; @@ -34,6 +39,8 @@ test() assert(base(r) == ib+N); for (unsigned i = 0; i < N; ++i) assert(ia[i] == ib[i]); + + return true; } #if TEST_STD_VER >= 11 @@ -128,5 +135,37 @@ int main(int, char**) test1*, std::unique_ptr*>(); #endif // TEST_STD_VER >= 11 +#if TEST_STD_VER > 17 + static_assert(test, input_iterator >()); + static_assert(test, forward_iterator >()); + static_assert(test, bidirectional_iterator >()); + static_assert(test, random_access_iterator >()); + static_assert(test, int*>()); + + static_assert(test, input_iterator >()); + static_assert(test, forward_iterator >()); + static_assert(test, bidirectional_iterator >()); + static_assert(test, random_access_iterator >()); + static_assert(test, int*>()); + + static_assert(test, input_iterator >()); + static_assert(test, forward_iterator >()); + static_assert(test, bidirectional_iterator >()); + static_assert(test, random_access_iterator >()); + static_assert(test, int*>()); + + static_assert(test, input_iterator >()); + static_assert(test, forward_iterator >()); + static_assert(test, bidirectional_iterator >()); + static_assert(test, random_access_iterator >()); + static_assert(test, int*>()); + + static_assert(test >()); + static_assert(test >()); + static_assert(test >()); + static_assert(test >()); + static_assert(test()); +#endif // TEST_STD_VER > 17 + return 0; } diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp index 365c1a1158d7e..5e1afe857cca2 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp @@ -6,6 +6,10 @@ // //===----------------------------------------------------------------------===// +// Older compilers don't support std::is_constant_evaluated +// UNSUPPORTED: clang-4, clang-5, clang-6, clang-7, clang-8 +// UNSUPPORTED: apple-clang-9, apple-clang-10 + // // template @@ -21,11 +25,11 @@ #include "test_iterators.h" template -void +_LIBCPP_CONSTEXPR_AFTER_CXX17 bool test() { const unsigned N = 1000; - int ia[N]; + int ia[N] = {}; for (unsigned i = 0; i < N; ++i) ia[i] = i; int ib[N] = {0}; @@ -34,6 +38,8 @@ test() assert(base(r) == ib); for (unsigned i = 0; i < N; ++i) assert(ia[i] == ib[i]); + + return true; } #if TEST_STD_VER >= 11 @@ -82,5 +88,19 @@ int main(int, 
char**) test1*, std::unique_ptr*>(); #endif // TEST_STD_VER >= 11 +#if TEST_STD_VER > 17 + static_assert(test, bidirectional_iterator >()); + static_assert(test, random_access_iterator >()); + static_assert(test, int*>()); + + static_assert(test, bidirectional_iterator >()); + static_assert(test, random_access_iterator >()); + static_assert(test, int*>()); + + static_assert(test >()); + static_assert(test >()); + static_assert(test()); +#endif // TEST_STD_VER > 17 + return 0; } diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/rotate.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/rotate.pass.cpp index 007faf685bfc2..2617f9a6a126e 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/rotate.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/rotate.pass.cpp @@ -12,6 +12,10 @@ // Iter // rotate(Iter first, Iter middle, Iter last); +// Older compilers don't support std::is_constant_evaluated +// UNSUPPORTED: clang-4, clang-5, clang-6, clang-7, clang-8 +// UNSUPPORTED: apple-clang-9, apple-clang-10 + #include #include #include @@ -20,7 +24,7 @@ #include "test_iterators.h" template -void +_LIBCPP_CONSTEXPR_AFTER_CXX17 bool test() { int ia[] = {0}; @@ -209,6 +213,8 @@ test() assert(ig[3] == 0); assert(ig[4] == 1); assert(ig[5] == 2); + + return true; } #if TEST_STD_VER >= 11 @@ -435,5 +441,12 @@ int main(int, char**) #endif +#if TEST_STD_VER > 17 + static_assert(test >()); + static_assert(test >()); + static_assert(test >()); + static_assert(test()); +#endif // TEST_STD_VER > 17 + return 0; } diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/rotate_copy.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/rotate_copy.pass.cpp index d66bf8caad6e6..d9dca0c6ebf09 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/rotate_copy.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/rotate_copy.pass.cpp @@ -12,145 +12,149 @@ // constexpr OutIter // constexpr after C++17 // rotate_copy(InIter first, InIter middle, InIter last, OutIter result); +// Older compilers don't support std::is_constant_evaluated +// UNSUPPORTED: clang-4, clang-5, clang-6, clang-7, clang-8 +// UNSUPPORTED: apple-clang-9, apple-clang-10 + #include #include #include "test_macros.h" #include "test_iterators.h" -// #if TEST_STD_VER > 17 -// TEST_CONSTEXPR bool test_constexpr() { -// int ia[] = {1, 3, 5, 2, 5, 6}; -// int ib[std::size(ia)] = {0}; -// -// const size_t N = 2; -// const auto middle = std::begin(ia) + N; -// auto it = std::rotate_copy(std::begin(ia), middle, std::end(ia), std::begin(ib)); -// -// return std::distance(std::begin(ib), it) == std::size(ia) -// && std::equal (std::begin(ia), middle, std::begin(ib) + std::size(ia) - N) -// && std::equal (middle, std::end(ia), std::begin(ib)) -// ; -// } -// #endif template -void -test() -{ - int ia[] = {0, 1, 2, 3}; - const unsigned sa = sizeof(ia)/sizeof(ia[0]); - int ib[sa] = {0}; - - OutIter r = std::rotate_copy(InIter(ia), InIter(ia), InIter(ia), OutIter(ib)); - assert(base(r) == ib); - - r = std::rotate_copy(InIter(ia), InIter(ia), InIter(ia+1), OutIter(ib)); - assert(base(r) == ib+1); - assert(ib[0] == 0); - - r = std::rotate_copy(InIter(ia), InIter(ia+1), InIter(ia+1), OutIter(ib)); - assert(base(r) == ib+1); - assert(ib[0] == 0); - - r = std::rotate_copy(InIter(ia), InIter(ia), InIter(ia+2), OutIter(ib)); - assert(base(r) == ib+2); - assert(ib[0] == 0); - assert(ib[1] == 1); - 
- r = std::rotate_copy(InIter(ia), InIter(ia+1), InIter(ia+2), OutIter(ib)); - assert(base(r) == ib+2); - assert(ib[0] == 1); - assert(ib[1] == 0); - - r = std::rotate_copy(InIter(ia), InIter(ia+2), InIter(ia+2), OutIter(ib)); - assert(base(r) == ib+2); - assert(ib[0] == 0); - assert(ib[1] == 1); - - r = std::rotate_copy(InIter(ia), InIter(ia), InIter(ia+3), OutIter(ib)); - assert(base(r) == ib+3); - assert(ib[0] == 0); - assert(ib[1] == 1); - assert(ib[2] == 2); - - r = std::rotate_copy(InIter(ia), InIter(ia+1), InIter(ia+3), OutIter(ib)); - assert(base(r) == ib+3); - assert(ib[0] == 1); - assert(ib[1] == 2); - assert(ib[2] == 0); - - r = std::rotate_copy(InIter(ia), InIter(ia+2), InIter(ia+3), OutIter(ib)); - assert(base(r) == ib+3); - assert(ib[0] == 2); - assert(ib[1] == 0); - assert(ib[2] == 1); - - r = std::rotate_copy(InIter(ia), InIter(ia+3), InIter(ia+3), OutIter(ib)); - assert(base(r) == ib+3); - assert(ib[0] == 0); - assert(ib[1] == 1); - assert(ib[2] == 2); - - r = std::rotate_copy(InIter(ia), InIter(ia), InIter(ia+4), OutIter(ib)); - assert(base(r) == ib+4); - assert(ib[0] == 0); - assert(ib[1] == 1); - assert(ib[2] == 2); - assert(ib[3] == 3); - - r = std::rotate_copy(InIter(ia), InIter(ia+1), InIter(ia+4), OutIter(ib)); - assert(base(r) == ib+4); - assert(ib[0] == 1); - assert(ib[1] == 2); - assert(ib[2] == 3); - assert(ib[3] == 0); - - r = std::rotate_copy(InIter(ia), InIter(ia+2), InIter(ia+4), OutIter(ib)); - assert(base(r) == ib+4); - assert(ib[0] == 2); - assert(ib[1] == 3); - assert(ib[2] == 0); - assert(ib[3] == 1); - - r = std::rotate_copy(InIter(ia), InIter(ia+3), InIter(ia+4), OutIter(ib)); - assert(base(r) == ib+4); - assert(ib[0] == 3); - assert(ib[1] == 0); - assert(ib[2] == 1); - assert(ib[3] == 2); - - r = std::rotate_copy(InIter(ia), InIter(ia+4), InIter(ia+4), OutIter(ib)); - assert(base(r) == ib+4); - assert(ib[0] == 0); - assert(ib[1] == 1); - assert(ib[2] == 2); - assert(ib[3] == 3); +TEST_CONSTEXPR_CXX20 void test() { + int ia[] = {0, 1, 2, 3}; + const unsigned sa = sizeof(ia) / sizeof(ia[0]); + int ib[sa] = {0}; + + OutIter r = std::rotate_copy(InIter(ia), InIter(ia), InIter(ia), OutIter(ib)); + assert(base(r) == ib); + + r = std::rotate_copy(InIter(ia), InIter(ia), InIter(ia + 1), OutIter(ib)); + assert(base(r) == ib + 1); + assert(ib[0] == 0); + + r = std::rotate_copy(InIter(ia), InIter(ia + 1), InIter(ia + 1), OutIter(ib)); + assert(base(r) == ib + 1); + assert(ib[0] == 0); + + r = std::rotate_copy(InIter(ia), InIter(ia), InIter(ia + 2), OutIter(ib)); + assert(base(r) == ib + 2); + assert(ib[0] == 0); + assert(ib[1] == 1); + + r = std::rotate_copy(InIter(ia), InIter(ia + 1), InIter(ia + 2), OutIter(ib)); + assert(base(r) == ib + 2); + assert(ib[0] == 1); + assert(ib[1] == 0); + + r = std::rotate_copy(InIter(ia), InIter(ia + 2), InIter(ia + 2), OutIter(ib)); + assert(base(r) == ib + 2); + assert(ib[0] == 0); + assert(ib[1] == 1); + + r = std::rotate_copy(InIter(ia), InIter(ia), InIter(ia + 3), OutIter(ib)); + assert(base(r) == ib + 3); + assert(ib[0] == 0); + assert(ib[1] == 1); + assert(ib[2] == 2); + + r = std::rotate_copy(InIter(ia), InIter(ia + 1), InIter(ia + 3), OutIter(ib)); + assert(base(r) == ib + 3); + assert(ib[0] == 1); + assert(ib[1] == 2); + assert(ib[2] == 0); + + r = std::rotate_copy(InIter(ia), InIter(ia + 2), InIter(ia + 3), OutIter(ib)); + assert(base(r) == ib + 3); + assert(ib[0] == 2); + assert(ib[1] == 0); + assert(ib[2] == 1); + + r = std::rotate_copy(InIter(ia), InIter(ia + 3), InIter(ia + 3), OutIter(ib)); + assert(base(r) == 
ib + 3); + assert(ib[0] == 0); + assert(ib[1] == 1); + assert(ib[2] == 2); + + r = std::rotate_copy(InIter(ia), InIter(ia), InIter(ia + 4), OutIter(ib)); + assert(base(r) == ib + 4); + assert(ib[0] == 0); + assert(ib[1] == 1); + assert(ib[2] == 2); + assert(ib[3] == 3); + + r = std::rotate_copy(InIter(ia), InIter(ia + 1), InIter(ia + 4), OutIter(ib)); + assert(base(r) == ib + 4); + assert(ib[0] == 1); + assert(ib[1] == 2); + assert(ib[2] == 3); + assert(ib[3] == 0); + + r = std::rotate_copy(InIter(ia), InIter(ia + 2), InIter(ia + 4), OutIter(ib)); + assert(base(r) == ib + 4); + assert(ib[0] == 2); + assert(ib[1] == 3); + assert(ib[2] == 0); + assert(ib[3] == 1); + + r = std::rotate_copy(InIter(ia), InIter(ia + 3), InIter(ia + 4), OutIter(ib)); + assert(base(r) == ib + 4); + assert(ib[0] == 3); + assert(ib[1] == 0); + assert(ib[2] == 1); + assert(ib[3] == 2); + + r = std::rotate_copy(InIter(ia), InIter(ia + 4), InIter(ia + 4), OutIter(ib)); + assert(base(r) == ib + 4); + assert(ib[0] == 0); + assert(ib[1] == 1); + assert(ib[2] == 2); + assert(ib[3] == 3); + + { + int ints[] = {1, 3, 5, 2, 5, 6}; + int const n_ints = sizeof(ints)/sizeof(int); + int zeros[n_ints] = {0}; + + const size_t N = 2; + const auto middle = std::begin(ints) + N; + auto it = std::rotate_copy(std::begin(ints), middle, std::end(ints), std::begin(zeros)); + assert(std::distance(std::begin(zeros), it) == n_ints); + assert(std::equal(std::begin(ints), middle, std::begin(zeros) + n_ints - N)); + assert(std::equal(middle, std::end(ints), std::begin(zeros))); + } +} + +TEST_CONSTEXPR_CXX20 bool all_tests() { + test, output_iterator >(); + test, forward_iterator >(); + test, bidirectional_iterator >(); + test, random_access_iterator >(); + test, int*>(); + + test, output_iterator >(); + test, forward_iterator >(); + test, bidirectional_iterator >(); + test, random_access_iterator >(); + test, int*>(); + + test >(); + test >(); + test >(); + test >(); + test(); + + return true; } -int main(int, char**) -{ - test, output_iterator >(); - test, forward_iterator >(); - test, bidirectional_iterator >(); - test, random_access_iterator >(); - test, int*>(); - - test, output_iterator >(); - test, forward_iterator >(); - test, bidirectional_iterator >(); - test, random_access_iterator >(); - test, int*>(); - - test >(); - test >(); - test >(); - test >(); - test(); - -// #if TEST_STD_VER > 17 -// static_assert(test_constexpr()); -// #endif +int main(int, char**) { + all_tests(); +#if TEST_STD_VER > 17 + static_assert(all_tests()); +#endif return 0; } diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.merge/merge.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.merge/merge.pass.cpp index 6c6f0c46d446f..8730ecdbd572b 100644 --- a/libcxx/test/std/algorithms/alg.sorting/alg.merge/merge.pass.cpp +++ b/libcxx/test/std/algorithms/alg.sorting/alg.merge/merge.pass.cpp @@ -8,6 +8,10 @@ // // REQUIRES: long_tests +// Older compilers don't support std::is_constant_evaluated +// UNSUPPORTED: clang-4, clang-5, clang-6, clang-7, clang-8 +// UNSUPPORTED: apple-clang-9, apple-clang-10 + // // template @@ -24,28 +28,26 @@ #include "test_macros.h" #include "test_iterators.h" - -// #if TEST_STD_VER > 17 -// TEST_CONSTEXPR bool test_constexpr() { -// int ia[] = {0, 1, 2, 3, 4}; -// int ib[] = {2, 4, 6, 8}; -// int ic[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; -// const int expected[] = {0, 1, 2, 2, 3, 4, 4, 6, 8}; -// -// auto it = std::merge(std::begin(ia), std::end(ia), std::begin(ib), std::end(ib), std::begin(ic)); -// return 
std::distance(std::begin(ic), it) == (std::size(ia) + std::size(ib)) -// && *it == 0 -// && std::equal(std::begin(ic), it, std::begin(expected), std::end(expected)) -// ; -// } -// #endif +#if TEST_STD_VER > 17 +TEST_CONSTEXPR bool test_constexpr() { + int ia[] = {0, 1, 2, 3, 4}; + int ib[] = {2, 4, 6, 8}; + int ic[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + const int expected[] = {0, 1, 2, 2, 3, 4, 4, 6, 8}; + + auto it = std::merge(std::begin(ia), std::end(ia), std::begin(ib), + std::end(ib), std::begin(ic)); + assert(std::distance(std::begin(ic), it) == (std::size(ia) + std::size(ib))); + assert(*it == 0); + assert(std::equal(std::begin(ic), it, std::begin(expected), std::end(expected))); + return true; +} +#endif std::mt19937 randomness; template -void -test() -{ +void test() { { unsigned N = 100000; int* ia = new int[N]; @@ -242,9 +244,8 @@ int main(int, char**) test(); #if TEST_STD_VER > 17 -// Not yet - waiting on std::copy -// static_assert(test_constexpr()); + static_assert(test_constexpr()); #endif - return 0; + return 0; } diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.merge/merge_comp.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.merge/merge_comp.pass.cpp index afa7073581e54..376ffd0d1d59a 100644 --- a/libcxx/test/std/algorithms/alg.sorting/alg.merge/merge_comp.pass.cpp +++ b/libcxx/test/std/algorithms/alg.sorting/alg.merge/merge_comp.pass.cpp @@ -8,6 +8,10 @@ // // REQUIRES: long_tests +// Older compilers don't support std::is_constant_evaluated +// UNSUPPORTED: clang-4, clang-5, clang-6, clang-7, clang-8 +// UNSUPPORTED: apple-clang-9, apple-clang-10 + // // template 17 -// TEST_CONSTEXPR bool test_constexpr() { -// int ia[] = {0, 1, 2, 3, 4}; -// int ib[] = {2, 4, 6, 8}; -// int ic[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; -// const int expected[] = {0, 1, 2, 2, 3, 4, 4, 6, 8}; -// -// auto it = std::merge(std::begin(ia), std::end(ia), -// std::begin(ib), std::end(ib), -// std::begin(ic), [](int a, int b) {return a == b; }); -// return std::distance(std::begin(ic), it) == (std::size(ia) + std::size(ib)) -// && *it == 0 -// && std::equal(std::begin(ic), it, std::begin(expected), std::end(expected)) -// ; -// } -// #endif +#if TEST_STD_VER > 17 +TEST_CONSTEXPR bool test_constexpr() { + int ia[] = {0, 1, 2, 3, 4}; + int ib[] = {2, 4, 6, 8}; + int ic[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + const int expected[] = {0, 1, 2, 2, 3, 4, 4, 6, 8}; + + auto it = + std::merge(std::begin(ia), std::end(ia), std::begin(ib), std::end(ib), + std::begin(ic), [](int a, int b) { return a == b; }); + assert(std::distance(std::begin(ic), it) == (std::size(ia) + std::size(ib))); + assert(*it == 0); + assert( + std::equal(std::begin(ic), it, std::begin(expected), std::end(expected))); + return true; +} +#endif std::mt19937 randomness; @@ -253,8 +258,7 @@ int main(int, char**) test(); #if TEST_STD_VER > 17 -// Not yet - waiting on std::copy -// static_assert(test_constexpr()); + static_assert(test_constexpr()); #endif return 0; diff --git a/libcxx/test/std/atomics/atomics.flag/atomic_flag_test.pass.cpp b/libcxx/test/std/atomics/atomics.flag/atomic_flag_test.pass.cpp new file mode 100644 index 0000000000000..22e4b66d45c5a --- /dev/null +++ b/libcxx/test/std/atomics/atomics.flag/atomic_flag_test.pass.cpp @@ -0,0 +1,39 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: libcpp-has-no-threads

+// <atomic>
+
+// struct atomic_flag
+
+// bool atomic_flag_test(volatile atomic_flag*);
+// bool atomic_flag_test(atomic_flag*);
+
+#include <atomic>
+#include <cassert>
+
+#include "test_macros.h"
+
+int main(int, char**)
+{
+    {
+        std::atomic_flag f;
+        f.clear();
+        assert(atomic_flag_test(&f) == 0);
+        assert(f.test_and_set() == 0);
+        assert(atomic_flag_test(&f) == 1);
+    }
+    {
+        volatile std::atomic_flag f;
+        f.clear();
+        assert(atomic_flag_test(&f) == 0);
+        assert(f.test_and_set() == 0);
+        assert(atomic_flag_test(&f) == 1);
+    }
+
+  return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.flag/atomic_flag_test_explicit.pass.cpp b/libcxx/test/std/atomics/atomics.flag/atomic_flag_test_explicit.pass.cpp
new file mode 100644
index 0000000000000..45ac737b59846
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.flag/atomic_flag_test_explicit.pass.cpp
@@ -0,0 +1,111 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: libcpp-has-no-threads

+// <atomic>
+
+// struct atomic_flag
+
+// bool atomic_flag_test_explicit(volatile atomic_flag*, memory_order);
+// bool atomic_flag_test_explicit(atomic_flag*, memory_order);
+
+#include <atomic>
+#include <cassert>
+
+#include "test_macros.h"
+
+int main(int, char**)
+{
+    {
+        std::atomic_flag f;
+        f.clear();
+        assert(atomic_flag_test_explicit(&f, std::memory_order_relaxed) == 0);
+        assert(f.test_and_set() == 0);
+        assert(atomic_flag_test_explicit(&f, std::memory_order_relaxed) == 1);
+    }
+    {
+        std::atomic_flag f;
+        f.clear();
+        assert(atomic_flag_test_explicit(&f, std::memory_order_consume) == 0);
+        assert(f.test_and_set() == 0);
+        assert(atomic_flag_test_explicit(&f, std::memory_order_consume) == 1);
+    }
+    {
+        std::atomic_flag f;
+        f.clear();
+        assert(atomic_flag_test_explicit(&f, std::memory_order_acquire) == 0);
+        assert(f.test_and_set() == 0);
+        assert(atomic_flag_test_explicit(&f, std::memory_order_acquire) == 1);
+    }
+    {
+        std::atomic_flag f;
+        f.clear();
+        assert(atomic_flag_test_explicit(&f, std::memory_order_release) == 0);
+        assert(f.test_and_set() == 0);
+        assert(atomic_flag_test_explicit(&f, std::memory_order_release) == 1);
+    }
+    {
+        std::atomic_flag f;
+        f.clear();
+        assert(atomic_flag_test_explicit(&f, std::memory_order_acq_rel) == 0);
+        assert(f.test_and_set() == 0);
+        assert(atomic_flag_test_explicit(&f, std::memory_order_acq_rel) == 1);
+    }
+    {
+        std::atomic_flag f;
+        f.clear();
+        assert(atomic_flag_test_explicit(&f, std::memory_order_seq_cst) == 0);
+        assert(f.test_and_set() == 0);
+        assert(atomic_flag_test_explicit(&f, std::memory_order_seq_cst) == 1);
+    }
+    {
+        volatile std::atomic_flag f;
+        f.clear();
+        assert(atomic_flag_test_explicit(&f, std::memory_order_relaxed) == 0);
+        assert(f.test_and_set() == 0);
+        assert(atomic_flag_test_explicit(&f, std::memory_order_relaxed) == 1);
+    }
+    {
+        volatile std::atomic_flag f;
+        f.clear();
+        assert(atomic_flag_test_explicit(&f, std::memory_order_consume) == 0);
+        assert(f.test_and_set() == 0);
+        assert(atomic_flag_test_explicit(&f, std::memory_order_consume) == 1);
+    }
+    {
+        volatile std::atomic_flag f;
+        f.clear();
+        assert(atomic_flag_test_explicit(&f, 
std::memory_order_acquire) == 0); + assert(f.test_and_set() == 0); + assert(atomic_flag_test_explicit(&f, std::memory_order_acquire) == 1); + } + { + volatile std::atomic_flag f; + f.clear(); + assert(atomic_flag_test_explicit(&f, std::memory_order_release) == 0); + assert(f.test_and_set() == 0); + assert(atomic_flag_test_explicit(&f, std::memory_order_release) == 1); + } + { + volatile std::atomic_flag f; + f.clear(); + assert(atomic_flag_test_explicit(&f, std::memory_order_acq_rel) == 0); + assert(f.test_and_set() == 0); + assert(atomic_flag_test_explicit(&f, std::memory_order_acq_rel) == 1); + } + { + volatile std::atomic_flag f; + f.clear(); + assert(atomic_flag_test_explicit(&f, std::memory_order_seq_cst) == 0); + assert(f.test_and_set() == 0); + assert(atomic_flag_test_explicit(&f, std::memory_order_seq_cst) == 1); + } + + return 0; +} diff --git a/libcxx/test/std/atomics/atomics.lockfree/isalwayslockfree.pass.cpp b/libcxx/test/std/atomics/atomics.lockfree/isalwayslockfree.pass.cpp index 34a0689182867..8dd8c345592bf 100644 --- a/libcxx/test/std/atomics/atomics.lockfree/isalwayslockfree.pass.cpp +++ b/libcxx/test/std/atomics/atomics.lockfree/isalwayslockfree.pass.cpp @@ -134,6 +134,11 @@ void run() checkLongLongTypes(); static_assert(std::atomic::is_always_lock_free == (2 == ATOMIC_POINTER_LOCK_FREE), ""); static_assert(std::atomic::is_always_lock_free == (2 == ATOMIC_POINTER_LOCK_FREE), ""); + +#if TEST_STD_VER >= 20 + static_assert(std::atomic::is_always_lock_free, ""); + static_assert(std::atomic::is_always_lock_free, ""); +#endif } int main(int, char**) { run(); return 0; } diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_fetch_add.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_fetch_add.pass.cpp index e584ea955d754..38ce06e2817b5 100644 --- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_fetch_add.pass.cpp +++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_fetch_add.pass.cpp @@ -63,6 +63,7 @@ void testp() A t; std::atomic_init(&t, T(1*sizeof(X))); assert(std::atomic_fetch_add(&t, 2) == T(1*sizeof(X))); + std::atomic_fetch_add(&t, 0); assert(t == T(3*sizeof(X))); } { @@ -71,6 +72,7 @@ void testp() volatile A t; std::atomic_init(&t, T(1*sizeof(X))); assert(std::atomic_fetch_add(&t, 2) == T(1*sizeof(X))); + std::atomic_fetch_add(&t, 0); assert(t == T(3*sizeof(X))); } } diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_fetch_add_explicit.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_fetch_add_explicit.pass.cpp index 548101a409e9e..f39adb14effac 100644 --- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_fetch_add_explicit.pass.cpp +++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_fetch_add_explicit.pass.cpp @@ -67,6 +67,7 @@ testp() std::atomic_init(&t, T(1*sizeof(X))); assert(std::atomic_fetch_add_explicit(&t, 2, std::memory_order_seq_cst) == T(1*sizeof(X))); + std::atomic_fetch_add_explicit(&t, 0, std::memory_order_relaxed); assert(t == T(3*sizeof(X))); } { @@ -76,6 +77,7 @@ testp() std::atomic_init(&t, T(1*sizeof(X))); assert(std::atomic_fetch_add_explicit(&t, 2, std::memory_order_seq_cst) == T(1*sizeof(X))); + std::atomic_fetch_add_explicit(&t, 0, std::memory_order_relaxed); assert(t == T(3*sizeof(X))); } } diff --git 
a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_fetch_sub.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_fetch_sub.pass.cpp index 20ec7688bb2ba..3568d2fa60ff6 100644 --- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_fetch_sub.pass.cpp +++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_fetch_sub.pass.cpp @@ -63,6 +63,7 @@ void testp() A t; std::atomic_init(&t, T(3*sizeof(X))); assert(std::atomic_fetch_sub(&t, 2) == T(3*sizeof(X))); + std::atomic_fetch_sub(&t, 0); assert(t == T(1*sizeof(X))); } { @@ -71,6 +72,7 @@ void testp() volatile A t; std::atomic_init(&t, T(3*sizeof(X))); assert(std::atomic_fetch_sub(&t, 2) == T(3*sizeof(X))); + std::atomic_fetch_sub(&t, 0); assert(t == T(1*sizeof(X))); } } diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_fetch_sub_explicit.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_fetch_sub_explicit.pass.cpp index f26cefcbdb074..261917f8087e0 100644 --- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_fetch_sub_explicit.pass.cpp +++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_fetch_sub_explicit.pass.cpp @@ -67,6 +67,7 @@ void testp() std::atomic_init(&t, T(3*sizeof(X))); assert(std::atomic_fetch_sub_explicit(&t, 2, std::memory_order_seq_cst) == T(3*sizeof(X))); + std::atomic_fetch_sub_explicit(&t, 0, std::memory_order_relaxed); assert(t == T(1*sizeof(X))); } { @@ -76,6 +77,7 @@ void testp() std::atomic_init(&t, T(3*sizeof(X))); assert(std::atomic_fetch_sub_explicit(&t, 2, std::memory_order_seq_cst) == T(3*sizeof(X))); + std::atomic_fetch_sub_explicit(&t, 0, std::memory_order_relaxed); assert(t == T(1*sizeof(X))); } } diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_helpers.h b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_helpers.h index 65676339c7429..c248e3ab17585 100644 --- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_helpers.h +++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_helpers.h @@ -23,6 +23,43 @@ struct UserAtomicType { return x.i == y.i; } }; +/* + +Enable these once we have P0528 + +struct WeirdUserAtomicType +{ + char i, j, k; // the 3 chars of doom + + explicit WeirdUserAtomicType(int d = 0) TEST_NOEXCEPT : i(d) {} + + friend bool operator==(const WeirdUserAtomicType& x, const WeirdUserAtomicType& y) + { return x.i == y.i; } +}; + +struct PaddedUserAtomicType +{ + char i; int j; // probably lock-free? 
+ + explicit PaddedUserAtomicType(int d = 0) TEST_NOEXCEPT : i(d) {} + + friend bool operator==(const PaddedUserAtomicType& x, const PaddedUserAtomicType& y) + { return x.i == y.i; } +}; + +*/ + +struct LargeUserAtomicType +{ + int i, j[127]; /* decidedly not lock-free */ + + LargeUserAtomicType(int d = 0) TEST_NOEXCEPT : i(d) + {} + + friend bool operator==(const LargeUserAtomicType& x, const LargeUserAtomicType& y) + { return x.i == y.i; } +}; + template < template class TestFunctor > struct TestEachIntegralType { void operator()() const { @@ -58,8 +95,23 @@ struct TestEachAtomicType { void operator()() const { TestEachIntegralType()(); TestFunctor()(); +#ifndef __APPLE__ + /* + These aren't going to be lock-free, + so some libatomic.a is necessary. + */ + TestFunctor()(); +#endif +/* + Enable these once we have P0528 + + TestFunctor()(); + TestFunctor()(); +*/ TestFunctor()(); TestFunctor()(); + TestFunctor()(); + TestFunctor()(); } }; diff --git a/libcxx/test/std/atomics/types.pass.cpp b/libcxx/test/std/atomics/types.pass.cpp index f891f90e116bf..891bbbbd6d515 100644 --- a/libcxx/test/std/atomics/types.pass.cpp +++ b/libcxx/test/std/atomics/types.pass.cpp @@ -30,15 +30,43 @@ #include "test_macros.h" +template +struct test_atomic +{ + test_atomic() + { + A a; (void)a; +#if TEST_STD_VER >= 17 + static_assert((std::is_same_v), ""); +#endif + } +}; + template -void -test_atomic() +struct test_atomic { - A a; (void)a; + test_atomic() + { + A a; (void)a; #if TEST_STD_VER >= 17 - static_assert((std::is_same::value), ""); + static_assert((std::is_same_v), ""); + static_assert((std::is_same_v), ""); #endif -} + } +}; + +template +struct test_atomic +{ + test_atomic() + { + A a; (void)a; +#if TEST_STD_VER >= 17 + static_assert((std::is_same_v), ""); + static_assert((std::is_same_v), ""); +#endif + } +}; template void @@ -46,15 +74,30 @@ test() { using A = std::atomic; #if TEST_STD_VER >= 17 - static_assert((std::is_same::value), ""); + static_assert((std::is_same_v), ""); #endif - test_atomic(); + test_atomic::value && !std::is_same::value>(); } struct TriviallyCopyable { int i_; }; +struct WeirdTriviallyCopyable +{ + char i, j, k; /* the 3 chars of doom */ +}; + +struct PaddedTriviallyCopyable +{ + char i; int j; /* probably lock-free? */ +}; + +struct LargeTriviallyCopyable +{ + int i, j[127]; /* decidedly not lock-free */ +}; + int main(int, char**) { test (); @@ -111,13 +154,23 @@ int main(int, char**) test (); test(); + test(); +#ifndef __APPLE__ // Apple doesn't ship libatomic + /* + These aren't going to be lock-free, + so some libatomic.a is necessary. 
+ */ + test(); + test(); +#endif + test(); test(); test(); #if TEST_STD_VER >= 20 - test_atomic(); - test_atomic(); + test(); + test(); /* test>(); */ diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/default.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/default.pass.cpp index a7a3fbcf96f42..6ec30127ae592 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/default.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/default.pass.cpp @@ -26,7 +26,7 @@ int main(int, char**) } { std::strstreambuf s(1024); - assert(s.str() == nullptr); + LIBCPP_ASSERT(s.str() == nullptr); assert(s.pcount() == 0); } diff --git a/libcxx/test/std/input.output/iostream.objects/init.pass.cpp b/libcxx/test/std/input.output/iostream.objects/init.pass.cpp new file mode 100644 index 0000000000000..62a9ffbca3ea3 --- /dev/null +++ b/libcxx/test/std/input.output/iostream.objects/init.pass.cpp @@ -0,0 +1,88 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: libcpp-has-no-stdin, libcpp-has-no-stdout + +// Make sure that the iostreams are initialized before everything else. +// This has been an issue when statically linking libc++ in some contexts. +// See https://llvm.org/PR28954 for details. +// +// This test works by checking that std::{cin,cout,cerr} is the same in a +// static object constructor and in the main function. It dumps the memory of +// each stream in the static object constructor and compares it with the memory +// in the main function. +// +// The assumption is that if there are no uses of the stream object (such as +// construction), then its memory must be the same. In the case where the test +// "fails" and we are actually accessing an uninitialized object when we perform +// the memcpy, the behavior is technically undefined (so the test could still +// pass). 
+
+#include <cassert>
+#include <cstring>
+#include <iostream>
+
+struct Checker {
+    char *cerr_mem_dump;
+    char *cin_mem_dump;
+    char *cout_mem_dump;
+    char *clog_mem_dump;
+
+    char *wcerr_mem_dump;
+    char *wcin_mem_dump;
+    char *wcout_mem_dump;
+    char *wclog_mem_dump;
+
+    Checker()
+        : cerr_mem_dump(new char[sizeof(std::cerr)])
+        , cin_mem_dump(new char[sizeof(std::cin)])
+        , cout_mem_dump(new char[sizeof(std::cout)])
+        , clog_mem_dump(new char[sizeof(std::clog)])
+
+        , wcerr_mem_dump(new char[sizeof(std::wcerr)])
+        , wcin_mem_dump(new char[sizeof(std::wcin)])
+        , wcout_mem_dump(new char[sizeof(std::wcout)])
+        , wclog_mem_dump(new char[sizeof(std::wclog)])
+    {
+        std::memcpy(cerr_mem_dump, (char*)&std::cerr, sizeof(std::cerr));
+        std::memcpy(cin_mem_dump, (char*)&std::cin, sizeof(std::cin));
+        std::memcpy(cout_mem_dump, (char*)&std::cout, sizeof(std::cout));
+        std::memcpy(clog_mem_dump, (char*)&std::clog, sizeof(std::clog));
+
+        std::memcpy(wcerr_mem_dump, (char*)&std::wcerr, sizeof(std::wcerr));
+        std::memcpy(wcin_mem_dump, (char*)&std::wcin, sizeof(std::wcin));
+        std::memcpy(wcout_mem_dump, (char*)&std::wcout, sizeof(std::wcout));
+        std::memcpy(wclog_mem_dump, (char*)&std::wclog, sizeof(std::wclog));
+    }
+
+    ~Checker() {
+        delete[] cerr_mem_dump;
+        delete[] cin_mem_dump;
+        delete[] cout_mem_dump;
+        delete[] clog_mem_dump;
+
+        delete[] wcerr_mem_dump;
+        delete[] wcin_mem_dump;
+        delete[] wcout_mem_dump;
+        delete[] wclog_mem_dump;
+    }
+};
+
+static Checker check;
+
+int main() {
+    assert(std::memcmp(check.cerr_mem_dump, (char const*)&std::cerr, sizeof(std::cerr)) == 0);
+    assert(std::memcmp(check.cin_mem_dump, (char const*)&std::cin, sizeof(std::cin)) == 0);
+    assert(std::memcmp(check.cout_mem_dump, (char const*)&std::cout, sizeof(std::cout)) == 0);
+    assert(std::memcmp(check.clog_mem_dump, (char const*)&std::clog, sizeof(std::clog)) == 0);
+
+    assert(std::memcmp(check.wcerr_mem_dump, (char const*)&std::wcerr, sizeof(std::wcerr)) == 0);
+    assert(std::memcmp(check.wcin_mem_dump, (char const*)&std::wcin, sizeof(std::wcin)) == 0);
+    assert(std::memcmp(check.wcout_mem_dump, (char const*)&std::wcout, sizeof(std::wcout)) == 0);
+    assert(std::memcmp(check.wclog_mem_dump, (char const*)&std::wclog, sizeof(std::wclog)) == 0);
+}
diff --git a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.put.area/pbump2gig.pass.cpp b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.put.area/pbump2gig.pass.cpp
index eee48f3dfdb12..e34dbc999592f 100644
--- a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.put.area/pbump2gig.pass.cpp
+++ b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.put.area/pbump2gig.pass.cpp
@@ -15,6 +15,10 @@
 //
 // REQUIRES: long_tests
 
+// Unsupported for no-exceptions builds because they have no way to report an
+// allocation failure when attempting to allocate the 2GiB string.
+// UNSUPPORTED: no-exceptions + #include #include #include "test_macros.h" @@ -28,18 +32,14 @@ struct SB : std::stringbuf int main(int, char**) { -#ifndef TEST_HAS_NO_EXCEPTIONS try { -#endif std::string str(2147483648, 'a'); SB sb; sb.str(str); assert(sb.pubpbase() <= sb.pubpptr()); -#ifndef TEST_HAS_NO_EXCEPTIONS } catch (const std::length_error &) {} // maybe the string can't take 2GB catch (const std::bad_alloc &) {} // maybe we don't have enough RAM -#endif return 0; } diff --git a/libcxx/test/std/iterators/stream.iterators/ostream.iterator/types.pass.cpp b/libcxx/test/std/iterators/stream.iterators/ostream.iterator/types.pass.cpp index 950c7dfe8c0b5..739e39d62b78f 100644 --- a/libcxx/test/std/iterators/stream.iterators/ostream.iterator/types.pass.cpp +++ b/libcxx/test/std/iterators/stream.iterators/ostream.iterator/types.pass.cpp @@ -19,6 +19,7 @@ // typedef basic_istream istream_type; // ... +#include #include #include @@ -33,7 +34,11 @@ int main(int, char**) #else static_assert((std::is_same::value), ""); static_assert((std::is_same::value), ""); +#if TEST_STD_VER <= 17 static_assert((std::is_same::value), ""); +#else + static_assert((std::is_same::value), ""); +#endif static_assert((std::is_same::value), ""); static_assert((std::is_same::value), ""); #endif @@ -47,7 +52,11 @@ int main(int, char**) #else static_assert((std::is_same::value), ""); static_assert((std::is_same::value), ""); +#if TEST_STD_VER <= 17 static_assert((std::is_same::value), ""); +#else + static_assert((std::is_same::value), ""); +#endif static_assert((std::is_same::value), ""); static_assert((std::is_same::value), ""); #endif diff --git a/libcxx/test/std/iterators/stream.iterators/ostreambuf.iterator/types.pass.cpp b/libcxx/test/std/iterators/stream.iterators/ostreambuf.iterator/types.pass.cpp index 671a09bb7a3fa..2a4e6ffa5e6b6 100644 --- a/libcxx/test/std/iterators/stream.iterators/ostreambuf.iterator/types.pass.cpp +++ b/libcxx/test/std/iterators/stream.iterators/ostreambuf.iterator/types.pass.cpp @@ -19,6 +19,7 @@ // typedef basic_ostream ostream_type; // ... +#include #include #include #include @@ -34,7 +35,11 @@ int main(int, char**) #else static_assert((std::is_same::value), ""); static_assert((std::is_same::value), ""); +#if TEST_STD_VER <= 17 static_assert((std::is_same::value), ""); +#else + static_assert((std::is_same::value), ""); +#endif static_assert((std::is_same::value), ""); static_assert((std::is_same::value), ""); #endif @@ -50,7 +55,11 @@ int main(int, char**) #else static_assert((std::is_same::value), ""); static_assert((std::is_same::value), ""); +#if TEST_STD_VER <= 17 static_assert((std::is_same::value), ""); +#else + static_assert((std::is_same::value), ""); +#endif static_assert((std::is_same::value), ""); static_assert((std::is_same::value), ""); #endif diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/delete_align_val_t_replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/delete_align_val_t_replace.pass.cpp index b092fa141e611..eb7f5ad4aafd1 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/delete_align_val_t_replace.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/delete_align_val_t_replace.pass.cpp @@ -15,21 +15,14 @@ // Aligned allocation was not provided before macosx10.14 and as a result we // get availability errors when the deployment target is older than macosx10.14. 
-// However, AppleClang 10 (and older) don't trigger availability errors, and -// Clang < 8.0 doesn't warn for 10.13. -// XFAIL: !(apple-clang-9 || apple-clang-10 || clang-7) && availability=macosx10.13 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.12 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.11 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.10 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.9 - -// On AppleClang 10 (and older), instead of getting an availability failure -// like above, we get a link error when we link against a dylib that does -// not export the aligned allocation functions. -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.12 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.11 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.10 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.9 +// However, support for that was broken prior to Clang 8 and AppleClang 11. +// UNSUPPORTED: apple-clang-9, apple-clang-10 +// UNSUPPORTED: clang-5, clang-6, clang-7 +// XFAIL: with_system_cxx_lib=macosx10.13 +// XFAIL: with_system_cxx_lib=macosx10.12 +// XFAIL: with_system_cxx_lib=macosx10.11 +// XFAIL: with_system_cxx_lib=macosx10.10 +// XFAIL: with_system_cxx_lib=macosx10.9 // On Windows libc++ doesn't provide its own definitions for new/delete // but instead depends on the ones in VCRuntime. However VCRuntime does not diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t.pass.cpp index bfa5f155a9c56..6b372e076915a 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t.pass.cpp @@ -13,21 +13,14 @@ // Aligned allocation was not provided before macosx10.14 and as a result we // get availability errors when the deployment target is older than macosx10.14. -// However, AppleClang 10 (and older) don't trigger availability errors, and -// Clang < 8.0 doesn't warn for 10.13. -// XFAIL: !(apple-clang-9 || apple-clang-10 || clang-7) && availability=macosx10.13 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.12 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.11 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.10 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.9 - -// On AppleClang 10 (and older), instead of getting an availability failure -// like above, we get a link error when we link against a dylib that does -// not export the aligned allocation functions. -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.12 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.11 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.10 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.9 +// However, support for that was broken prior to Clang 8 and AppleClang 11. 
+// UNSUPPORTED: apple-clang-9, apple-clang-10 +// UNSUPPORTED: clang-5, clang-6, clang-7 +// XFAIL: with_system_cxx_lib=macosx10.13 +// XFAIL: with_system_cxx_lib=macosx10.12 +// XFAIL: with_system_cxx_lib=macosx10.11 +// XFAIL: with_system_cxx_lib=macosx10.10 +// XFAIL: with_system_cxx_lib=macosx10.9 // On Windows libc++ doesn't provide its own definitions for new/delete // but instead depends on the ones in VCRuntime. However VCRuntime does not diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_nothrow.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_nothrow.pass.cpp index 869e29a8e87be..e9e9d95e83a3c 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_nothrow.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_nothrow.pass.cpp @@ -13,21 +13,14 @@ // Aligned allocation was not provided before macosx10.14 and as a result we // get availability errors when the deployment target is older than macosx10.14. -// However, AppleClang 10 (and older) don't trigger availability errors, and -// Clang < 8.0 doesn't warn for 10.13. -// XFAIL: !(apple-clang-9 || apple-clang-10 || clang-7) && availability=macosx10.13 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.12 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.11 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.10 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.9 - -// On AppleClang 10 (and older), instead of getting an availability failure -// like above, we get a link error when we link against a dylib that does -// not export the aligned allocation functions. -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.12 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.11 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.10 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.9 +// However, support for that was broken prior to Clang 8 and AppleClang 11. +// UNSUPPORTED: apple-clang-9, apple-clang-10 +// UNSUPPORTED: clang-5, clang-6, clang-7 +// XFAIL: with_system_cxx_lib=macosx10.13 +// XFAIL: with_system_cxx_lib=macosx10.12 +// XFAIL: with_system_cxx_lib=macosx10.11 +// XFAIL: with_system_cxx_lib=macosx10.10 +// XFAIL: with_system_cxx_lib=macosx10.9 // On Windows libc++ doesn't provide its own definitions for new/delete // but instead depends on the ones in VCRuntime. However VCRuntime does not diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_nothrow_replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_nothrow_replace.pass.cpp index 6f346a72a0ae6..e7a1e403d73dd 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_nothrow_replace.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_nothrow_replace.pass.cpp @@ -11,21 +11,14 @@ // Aligned allocation was not provided before macosx10.14 and as a result we // get availability errors when the deployment target is older than macosx10.14. -// However, AppleClang 10 (and older) don't trigger availability errors, and -// Clang < 8.0 doesn't warn for 10.13. 
-// XFAIL: !(apple-clang-9 || apple-clang-10 || clang-7) && availability=macosx10.13 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.12 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.11 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.10 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.9 - -// On AppleClang 10 (and older), instead of getting an availability failure -// like above, we get a link error when we link against a dylib that does -// not export the aligned allocation functions. -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.12 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.11 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.10 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.9 +// However, support for that was broken prior to Clang 8 and AppleClang 11. +// UNSUPPORTED: apple-clang-9, apple-clang-10 +// UNSUPPORTED: clang-5, clang-6, clang-7 +// XFAIL: with_system_cxx_lib=macosx10.13 +// XFAIL: with_system_cxx_lib=macosx10.12 +// XFAIL: with_system_cxx_lib=macosx10.11 +// XFAIL: with_system_cxx_lib=macosx10.10 +// XFAIL: with_system_cxx_lib=macosx10.9 // On Windows libc++ doesn't provide its own definitions for new/delete // but instead depends on the ones in VCRuntime. However VCRuntime does not diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array_fsizeddeallocation.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array_fsizeddeallocation.pass.cpp index cdebcda46a0b7..1274ddff54236 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array_fsizeddeallocation.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array_fsizeddeallocation.pass.cpp @@ -12,12 +12,10 @@ // when sized deallocation is not supported, e.g., prior to C++14. // UNSUPPORTED: sanitizer-new-delete -// XFAIL: availability=macosx10.11 -// XFAIL: availability=macosx10.10 -// XFAIL: availability=macosx10.9 +// XFAIL: with_system_cxx_lib=macosx10.11 +// XFAIL: with_system_cxx_lib=macosx10.10 +// XFAIL: with_system_cxx_lib=macosx10.9 - -// NOTE: Only clang-3.7 and GCC 5.1 and greater support -fsized-deallocation. // REQUIRES: -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS: -fsized-deallocation diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/delete_align_val_t_replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/delete_align_val_t_replace.pass.cpp index f50507a815d43..4d0100d04597d 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/delete_align_val_t_replace.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/delete_align_val_t_replace.pass.cpp @@ -15,21 +15,14 @@ // Aligned allocation was not provided before macosx10.14 and as a result we // get availability errors when the deployment target is older than macosx10.14. 
-// However, AppleClang 10 (and older) don't trigger availability errors, and -// Clang < 8.0 doesn't warn for 10.13 -// XFAIL: !(apple-clang-9 || apple-clang-10 || clang-7) && availability=macosx10.13 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.12 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.11 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.10 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.9 - -// On AppleClang 10 (and older), instead of getting an availability failure -// like above, we get a link error when we link against a dylib that does -// not export the aligned allocation functions. -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.12 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.11 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.10 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.9 +// However, support for that was broken prior to Clang 8 and AppleClang 11. +// UNSUPPORTED: apple-clang-9, apple-clang-10 +// UNSUPPORTED: clang-5, clang-6, clang-7 +// XFAIL: with_system_cxx_lib=macosx10.13 +// XFAIL: with_system_cxx_lib=macosx10.12 +// XFAIL: with_system_cxx_lib=macosx10.11 +// XFAIL: with_system_cxx_lib=macosx10.10 +// XFAIL: with_system_cxx_lib=macosx10.9 // On Windows libc++ doesn't provide its own definitions for new/delete // but instead depends on the ones in VCRuntime. However VCRuntime does not diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t.pass.cpp index 80ec88e437fe0..01cb88658954e 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t.pass.cpp @@ -10,21 +10,14 @@ // Aligned allocation was not provided before macosx10.14 and as a result we // get availability errors when the deployment target is older than macosx10.14. -// However, AppleClang 10 (and older) don't trigger availability errors, and -// Clang < 8.0 doesn't warn for 10.13. -// XFAIL: !(apple-clang-9 || apple-clang-10 || clang-7) && availability=macosx10.13 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.12 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.11 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.10 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.9 - -// On AppleClang 10 (and older), instead of getting an availability failure -// like above, we get a link error when we link against a dylib that does -// not export the aligned allocation functions. -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.12 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.11 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.10 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.9 +// However, support for that was broken prior to Clang 8 and AppleClang 11. 
+// UNSUPPORTED: apple-clang-9, apple-clang-10 +// UNSUPPORTED: clang-5, clang-6, clang-7 +// XFAIL: with_system_cxx_lib=macosx10.13 +// XFAIL: with_system_cxx_lib=macosx10.12 +// XFAIL: with_system_cxx_lib=macosx10.11 +// XFAIL: with_system_cxx_lib=macosx10.10 +// XFAIL: with_system_cxx_lib=macosx10.9 // asan and msan will not call the new handler. // UNSUPPORTED: sanitizer-new-delete diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_nothrow.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_nothrow.pass.cpp index 0a42fbac6fd4c..930eff95bb999 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_nothrow.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_nothrow.pass.cpp @@ -10,21 +10,14 @@ // Aligned allocation was not provided before macosx10.14 and as a result we // get availability errors when the deployment target is older than macosx10.14. -// However, AppleClang 10 (and older) don't trigger availability errors, and -// Clang < 8.0 doesn't warn for 10.13 -// XFAIL: !(apple-clang-9 || apple-clang-10 || clang-7) && availability=macosx10.13 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.12 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.11 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.10 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.9 - -// On AppleClang 10 (and older), instead of getting an availability failure -// like above, we get a link error when we link against a dylib that does -// not export the aligned allocation functions. -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.12 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.11 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.10 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.9 +// However, support for that was broken prior to Clang 8 and AppleClang 11. +// UNSUPPORTED: apple-clang-9, apple-clang-10 +// UNSUPPORTED: clang-5, clang-6, clang-7 +// XFAIL: with_system_cxx_lib=macosx10.13 +// XFAIL: with_system_cxx_lib=macosx10.12 +// XFAIL: with_system_cxx_lib=macosx10.11 +// XFAIL: with_system_cxx_lib=macosx10.10 +// XFAIL: with_system_cxx_lib=macosx10.9 // asan and msan will not call the new handler. // UNSUPPORTED: sanitizer-new-delete diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_nothrow_replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_nothrow_replace.pass.cpp index 655ec9352d682..62ceafb7644af 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_nothrow_replace.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_nothrow_replace.pass.cpp @@ -11,21 +11,14 @@ // Aligned allocation was not provided before macosx10.14 and as a result we // get availability errors when the deployment target is older than macosx10.14. 
-// However, AppleClang 10 (and older) don't trigger availability errors, and -// Clang < 8.0 doesn't warn for 10.13 -// XFAIL: !(apple-clang-9 || apple-clang-10 || clang-7) && availability=macosx10.13 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.12 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.11 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.10 -// XFAIL: !(apple-clang-9 || apple-clang-10) && availability=macosx10.9 - -// On AppleClang 10 (and older), instead of getting an availability failure -// like above, we get a link error when we link against a dylib that does -// not export the aligned allocation functions. -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.12 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.11 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.10 -// XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.9 +// However, support for that was broken prior to Clang 8 and AppleClang 11. +// UNSUPPORTED: apple-clang-9, apple-clang-10 +// UNSUPPORTED: clang-5, clang-6, clang-7 +// XFAIL: with_system_cxx_lib=macosx10.13 +// XFAIL: with_system_cxx_lib=macosx10.12 +// XFAIL: with_system_cxx_lib=macosx10.11 +// XFAIL: with_system_cxx_lib=macosx10.10 +// XFAIL: with_system_cxx_lib=macosx10.9 // On Windows libc++ doesn't provide its own definitions for new/delete // but instead depends on the ones in VCRuntime. However VCRuntime does not diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete_fsizeddeallocation.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete_fsizeddeallocation.pass.cpp index e827ff618ec5a..22ea35ebced97 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete_fsizeddeallocation.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete_fsizeddeallocation.pass.cpp @@ -12,9 +12,9 @@ // when sized deallocation is not supported, e.g., prior to C++14. // UNSUPPORTED: sanitizer-new-delete -// XFAIL: availability=macosx10.11 -// XFAIL: availability=macosx10.10 -// XFAIL: availability=macosx10.9 +// XFAIL: with_system_cxx_lib=macosx10.11 +// XFAIL: with_system_cxx_lib=macosx10.10 +// XFAIL: with_system_cxx_lib=macosx10.9 // NOTE: Only clang-3.7 and GCC 5.1 and greater support -fsized-deallocation. 
// REQUIRES: -fsized-deallocation diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.pass.cpp index d8f6f548cd23f..d4c63edb5b8a3 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.pass.cpp @@ -15,10 +15,16 @@ // Test the feature test macros defined by -/* Constant Value - __cpp_lib_atomic_is_always_lock_free 201603L [C++17] - __cpp_lib_atomic_ref 201806L [C++2a] - __cpp_lib_char8_t 201811L [C++2a] +/* Constant Value + __cpp_lib_atomic_flag_test 201907L [C++2a] + __cpp_lib_atomic_float 201711L [C++2a] + __cpp_lib_atomic_is_always_lock_free 201603L [C++17] + __cpp_lib_atomic_lock_free_type_aliases 201907L [C++2a] + __cpp_lib_atomic_ref 201806L [C++2a] + __cpp_lib_atomic_shared_ptr 201711L [C++2a] + __cpp_lib_atomic_value_initialization 201911L [C++2a] + __cpp_lib_atomic_wait 201907L [C++2a] + __cpp_lib_char8_t 201811L [C++2a] */ #include @@ -26,34 +32,90 @@ #if TEST_STD_VER < 14 +# ifdef __cpp_lib_atomic_flag_test +# error "__cpp_lib_atomic_flag_test should not be defined before c++2a" +# endif + +# ifdef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should not be defined before c++2a" +# endif + # ifdef __cpp_lib_atomic_is_always_lock_free # error "__cpp_lib_atomic_is_always_lock_free should not be defined before c++17" # endif +# ifdef __cpp_lib_atomic_lock_free_type_aliases +# error "__cpp_lib_atomic_lock_free_type_aliases should not be defined before c++2a" +# endif + # ifdef __cpp_lib_atomic_ref # error "__cpp_lib_atomic_ref should not be defined before c++2a" # endif +# ifdef __cpp_lib_atomic_shared_ptr +# error "__cpp_lib_atomic_shared_ptr should not be defined before c++2a" +# endif + +# ifdef __cpp_lib_atomic_value_initialization +# error "__cpp_lib_atomic_value_initialization should not be defined before c++2a" +# endif + +# ifdef __cpp_lib_atomic_wait +# error "__cpp_lib_atomic_wait should not be defined before c++2a" +# endif + # ifdef __cpp_lib_char8_t # error "__cpp_lib_char8_t should not be defined before c++2a" # endif #elif TEST_STD_VER == 14 +# ifdef __cpp_lib_atomic_flag_test +# error "__cpp_lib_atomic_flag_test should not be defined before c++2a" +# endif + +# ifdef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should not be defined before c++2a" +# endif + # ifdef __cpp_lib_atomic_is_always_lock_free # error "__cpp_lib_atomic_is_always_lock_free should not be defined before c++17" # endif +# ifdef __cpp_lib_atomic_lock_free_type_aliases +# error "__cpp_lib_atomic_lock_free_type_aliases should not be defined before c++2a" +# endif + # ifdef __cpp_lib_atomic_ref # error "__cpp_lib_atomic_ref should not be defined before c++2a" # endif +# ifdef __cpp_lib_atomic_shared_ptr +# error "__cpp_lib_atomic_shared_ptr should not be defined before c++2a" +# endif + +# ifdef __cpp_lib_atomic_value_initialization +# error "__cpp_lib_atomic_value_initialization should not be defined before c++2a" +# endif + +# ifdef __cpp_lib_atomic_wait +# error "__cpp_lib_atomic_wait should not be defined before c++2a" +# endif + # ifdef __cpp_lib_char8_t # error "__cpp_lib_char8_t should not be defined before c++2a" # endif #elif TEST_STD_VER == 17 +# ifdef __cpp_lib_atomic_flag_test +# error "__cpp_lib_atomic_flag_test should not be defined before c++2a" +# endif + +# ifdef 
__cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should not be defined before c++2a" +# endif + # if !defined(_LIBCPP_HAS_NO_THREADS) # ifndef __cpp_lib_atomic_is_always_lock_free # error "__cpp_lib_atomic_is_always_lock_free should be defined in c++17" @@ -67,16 +129,58 @@ # endif # endif +# ifdef __cpp_lib_atomic_lock_free_type_aliases +# error "__cpp_lib_atomic_lock_free_type_aliases should not be defined before c++2a" +# endif + # ifdef __cpp_lib_atomic_ref # error "__cpp_lib_atomic_ref should not be defined before c++2a" # endif +# ifdef __cpp_lib_atomic_shared_ptr +# error "__cpp_lib_atomic_shared_ptr should not be defined before c++2a" +# endif + +# ifdef __cpp_lib_atomic_value_initialization +# error "__cpp_lib_atomic_value_initialization should not be defined before c++2a" +# endif + +# ifdef __cpp_lib_atomic_wait +# error "__cpp_lib_atomic_wait should not be defined before c++2a" +# endif + # ifdef __cpp_lib_char8_t # error "__cpp_lib_char8_t should not be defined before c++2a" # endif #elif TEST_STD_VER > 17 +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_atomic_flag_test +# error "__cpp_lib_atomic_flag_test should be defined in c++2a" +# endif +# if __cpp_lib_atomic_flag_test != 201907L +# error "__cpp_lib_atomic_flag_test should have the value 201907L in c++2a" +# endif +# else +# ifdef __cpp_lib_atomic_flag_test +# error "__cpp_lib_atomic_flag_test should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!" +# endif +# endif + +# if !defined(_LIBCPP_VERSION) +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++2a" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++2a" +# endif +# else // _LIBCPP_VERSION +# ifdef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!" +# endif +# endif + # if !defined(_LIBCPP_HAS_NO_THREADS) # ifndef __cpp_lib_atomic_is_always_lock_free # error "__cpp_lib_atomic_is_always_lock_free should be defined in c++2a" @@ -90,6 +194,19 @@ # endif # endif +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_atomic_lock_free_type_aliases +# error "__cpp_lib_atomic_lock_free_type_aliases should be defined in c++2a" +# endif +# if __cpp_lib_atomic_lock_free_type_aliases != 201907L +# error "__cpp_lib_atomic_lock_free_type_aliases should have the value 201907L in c++2a" +# endif +# else +# ifdef __cpp_lib_atomic_lock_free_type_aliases +# error "__cpp_lib_atomic_lock_free_type_aliases should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!" +# endif +# endif + # if !defined(_LIBCPP_VERSION) # ifndef __cpp_lib_atomic_ref # error "__cpp_lib_atomic_ref should be defined in c++2a" @@ -103,6 +220,45 @@ # endif # endif +# if !defined(_LIBCPP_VERSION) +# ifndef __cpp_lib_atomic_shared_ptr +# error "__cpp_lib_atomic_shared_ptr should be defined in c++2a" +# endif +# if __cpp_lib_atomic_shared_ptr != 201711L +# error "__cpp_lib_atomic_shared_ptr should have the value 201711L in c++2a" +# endif +# else // _LIBCPP_VERSION +# ifdef __cpp_lib_atomic_shared_ptr +# error "__cpp_lib_atomic_shared_ptr should not be defined because it is unimplemented in libc++!" 
+# endif +# endif + +# if !defined(_LIBCPP_VERSION) +# ifndef __cpp_lib_atomic_value_initialization +# error "__cpp_lib_atomic_value_initialization should be defined in c++2a" +# endif +# if __cpp_lib_atomic_value_initialization != 201911L +# error "__cpp_lib_atomic_value_initialization should have the value 201911L in c++2a" +# endif +# else // _LIBCPP_VERSION +# ifdef __cpp_lib_atomic_value_initialization +# error "__cpp_lib_atomic_value_initialization should not be defined because it is unimplemented in libc++!" +# endif +# endif + +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_atomic_wait +# error "__cpp_lib_atomic_wait should be defined in c++2a" +# endif +# if __cpp_lib_atomic_wait != 201907L +# error "__cpp_lib_atomic_wait should have the value 201907L in c++2a" +# endif +# else +# ifdef __cpp_lib_atomic_wait +# error "__cpp_lib_atomic_wait should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!" +# endif +# endif + # if defined(__cpp_char8_t) # ifndef __cpp_lib_char8_t # error "__cpp_lib_char8_t should be defined in c++2a" diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/concepts.version.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/concepts.version.pass.cpp index 16febf8d3e24a..9ec2157d974ce 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/concepts.version.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/concepts.version.pass.cpp @@ -1,4 +1,3 @@ - //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. @@ -7,29 +6,53 @@ // //===----------------------------------------------------------------------===// // -// feature macros +// WARNING: This test was generated by generate_feature_test_macro_components.py +// and should not be edited manually. + +// -/* Constant Value - __cpp_lib_concepts 201806L +// Test the feature test macros defined by +/* Constant Value + __cpp_lib_concepts 201806L [C++2a] */ -// XFAIL -// #include -#include +#include #include "test_macros.h" -int main(int, char**) -{ -// ensure that the macros that are supposed to be defined in are defined. +#if TEST_STD_VER < 14 -/* -#if !defined(__cpp_lib_fooby) -# error "__cpp_lib_fooby is not defined" -#elif __cpp_lib_fooby < 201606L -# error "__cpp_lib_fooby has an invalid value" -#endif -*/ +# ifdef __cpp_lib_concepts +# error "__cpp_lib_concepts should not be defined before c++2a" +# endif + +#elif TEST_STD_VER == 14 + +# ifdef __cpp_lib_concepts +# error "__cpp_lib_concepts should not be defined before c++2a" +# endif + +#elif TEST_STD_VER == 17 + +# ifdef __cpp_lib_concepts +# error "__cpp_lib_concepts should not be defined before c++2a" +# endif + +#elif TEST_STD_VER > 17 + +# if !defined(_LIBCPP_VERSION) +# ifndef __cpp_lib_concepts +# error "__cpp_lib_concepts should be defined in c++2a" +# endif +# if __cpp_lib_concepts != 201806L +# error "__cpp_lib_concepts should have the value 201806L in c++2a" +# endif +# else // _LIBCPP_VERSION +# ifdef __cpp_lib_concepts +# error "__cpp_lib_concepts should not be defined because it is unimplemented in libc++!" 
+# endif +# endif + +#endif // TEST_STD_VER > 17 - return 0; -} +int main(int, char**) { return 0; } diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/execution.version.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/execution.version.pass.cpp index b05f41bb1731c..1244efa4aebaf 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/execution.version.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/execution.version.pass.cpp @@ -1,4 +1,3 @@ - //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. @@ -7,29 +6,62 @@ // //===----------------------------------------------------------------------===// // -// feature macros +// WARNING: This test was generated by generate_feature_test_macro_components.py +// and should not be edited manually. + +// -/* Constant Value - __cpp_lib_execution 201603L +// Test the feature test macros defined by +/* Constant Value + __cpp_lib_execution 201603L [C++17] */ -// XFAIL -// #include -#include +#include #include "test_macros.h" -int main(int, char**) -{ -// ensure that the macros that are supposed to be defined in are defined. +#if TEST_STD_VER < 14 -/* -#if !defined(__cpp_lib_fooby) -# error "__cpp_lib_fooby is not defined" -#elif __cpp_lib_fooby < 201606L -# error "__cpp_lib_fooby has an invalid value" -#endif -*/ +# ifdef __cpp_lib_execution +# error "__cpp_lib_execution should not be defined before c++17" +# endif + +#elif TEST_STD_VER == 14 + +# ifdef __cpp_lib_execution +# error "__cpp_lib_execution should not be defined before c++17" +# endif + +#elif TEST_STD_VER == 17 + +# if !defined(_LIBCPP_VERSION) +# ifndef __cpp_lib_execution +# error "__cpp_lib_execution should be defined in c++17" +# endif +# if __cpp_lib_execution != 201603L +# error "__cpp_lib_execution should have the value 201603L in c++17" +# endif +# else // _LIBCPP_VERSION +# ifdef __cpp_lib_execution +# error "__cpp_lib_execution should not be defined because it is unimplemented in libc++!" +# endif +# endif + +#elif TEST_STD_VER > 17 + +# if !defined(_LIBCPP_VERSION) +# ifndef __cpp_lib_execution +# error "__cpp_lib_execution should be defined in c++2a" +# endif +# if __cpp_lib_execution != 201603L +# error "__cpp_lib_execution should have the value 201603L in c++2a" +# endif +# else // _LIBCPP_VERSION +# ifdef __cpp_lib_execution +# error "__cpp_lib_execution should not be defined because it is unimplemented in libc++!" 
+# endif +# endif + +#endif // TEST_STD_VER > 17 - return 0; -} +int main(int, char**) { return 0; } diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.pass.cpp index 6c845d71febd7..0117fd83a60c6 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.pass.cpp @@ -16,6 +16,7 @@ /* Constant Value __cpp_lib_addressof_constexpr 201603L [C++17] __cpp_lib_allocator_traits_is_always_equal 201411L [C++17] + __cpp_lib_atomic_value_initialization 201911L [C++2a] __cpp_lib_enable_shared_from_this 201603L [C++17] __cpp_lib_make_unique 201304L [C++14] __cpp_lib_ranges 201811L [C++2a] @@ -37,6 +38,10 @@ # error "__cpp_lib_allocator_traits_is_always_equal should not be defined before c++17" # endif +# ifdef __cpp_lib_atomic_value_initialization +# error "__cpp_lib_atomic_value_initialization should not be defined before c++2a" +# endif + # ifdef __cpp_lib_enable_shared_from_this # error "__cpp_lib_enable_shared_from_this should not be defined before c++17" # endif @@ -71,6 +76,10 @@ # error "__cpp_lib_allocator_traits_is_always_equal should not be defined before c++17" # endif +# ifdef __cpp_lib_atomic_value_initialization +# error "__cpp_lib_atomic_value_initialization should not be defined before c++2a" +# endif + # ifdef __cpp_lib_enable_shared_from_this # error "__cpp_lib_enable_shared_from_this should not be defined before c++17" # endif @@ -120,6 +129,10 @@ # error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++17" # endif +# ifdef __cpp_lib_atomic_value_initialization +# error "__cpp_lib_atomic_value_initialization should not be defined before c++2a" +# endif + # ifndef __cpp_lib_enable_shared_from_this # error "__cpp_lib_enable_shared_from_this should be defined in c++17" # endif @@ -187,6 +200,19 @@ # error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++2a" # endif +# if !defined(_LIBCPP_VERSION) +# ifndef __cpp_lib_atomic_value_initialization +# error "__cpp_lib_atomic_value_initialization should be defined in c++2a" +# endif +# if __cpp_lib_atomic_value_initialization != 201911L +# error "__cpp_lib_atomic_value_initialization should have the value 201911L in c++2a" +# endif +# else // _LIBCPP_VERSION +# ifdef __cpp_lib_atomic_value_initialization +# error "__cpp_lib_atomic_value_initialization should not be defined because it is unimplemented in libc++!" 
+# endif +# endif + # ifndef __cpp_lib_enable_shared_from_this # error "__cpp_lib_enable_shared_from_this should be defined in c++2a" # endif diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.pass.cpp index afbee586df3c6..46b2e1f21d183 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.pass.cpp @@ -21,8 +21,14 @@ __cpp_lib_array_constexpr 201603L [C++17] 201811L [C++2a] __cpp_lib_as_const 201510L [C++17] + __cpp_lib_atomic_flag_test 201907L [C++2a] + __cpp_lib_atomic_float 201711L [C++2a] __cpp_lib_atomic_is_always_lock_free 201603L [C++17] + __cpp_lib_atomic_lock_free_type_aliases 201907L [C++2a] __cpp_lib_atomic_ref 201806L [C++2a] + __cpp_lib_atomic_shared_ptr 201711L [C++2a] + __cpp_lib_atomic_value_initialization 201911L [C++2a] + __cpp_lib_atomic_wait 201907L [C++2a] __cpp_lib_bind_front 201811L [C++2a] __cpp_lib_bit_cast 201806L [C++2a] __cpp_lib_bool_constant 201505L [C++17] @@ -135,14 +141,38 @@ # error "__cpp_lib_as_const should not be defined before c++17" # endif +# ifdef __cpp_lib_atomic_flag_test +# error "__cpp_lib_atomic_flag_test should not be defined before c++2a" +# endif + +# ifdef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should not be defined before c++2a" +# endif + # ifdef __cpp_lib_atomic_is_always_lock_free # error "__cpp_lib_atomic_is_always_lock_free should not be defined before c++17" # endif +# ifdef __cpp_lib_atomic_lock_free_type_aliases +# error "__cpp_lib_atomic_lock_free_type_aliases should not be defined before c++2a" +# endif + # ifdef __cpp_lib_atomic_ref # error "__cpp_lib_atomic_ref should not be defined before c++2a" # endif +# ifdef __cpp_lib_atomic_shared_ptr +# error "__cpp_lib_atomic_shared_ptr should not be defined before c++2a" +# endif + +# ifdef __cpp_lib_atomic_value_initialization +# error "__cpp_lib_atomic_value_initialization should not be defined before c++2a" +# endif + +# ifdef __cpp_lib_atomic_wait +# error "__cpp_lib_atomic_wait should not be defined before c++2a" +# endif + # ifdef __cpp_lib_bind_front # error "__cpp_lib_bind_front should not be defined before c++2a" # endif @@ -489,14 +519,38 @@ # error "__cpp_lib_as_const should not be defined before c++17" # endif +# ifdef __cpp_lib_atomic_flag_test +# error "__cpp_lib_atomic_flag_test should not be defined before c++2a" +# endif + +# ifdef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should not be defined before c++2a" +# endif + # ifdef __cpp_lib_atomic_is_always_lock_free # error "__cpp_lib_atomic_is_always_lock_free should not be defined before c++17" # endif +# ifdef __cpp_lib_atomic_lock_free_type_aliases +# error "__cpp_lib_atomic_lock_free_type_aliases should not be defined before c++2a" +# endif + # ifdef __cpp_lib_atomic_ref # error "__cpp_lib_atomic_ref should not be defined before c++2a" # endif +# ifdef __cpp_lib_atomic_shared_ptr +# error "__cpp_lib_atomic_shared_ptr should not be defined before c++2a" +# endif + +# ifdef __cpp_lib_atomic_value_initialization +# error "__cpp_lib_atomic_value_initialization should not be defined before c++2a" +# endif + +# ifdef __cpp_lib_atomic_wait +# error "__cpp_lib_atomic_wait should not be defined before c++2a" +# endif + # ifdef __cpp_lib_bind_front # error "__cpp_lib_bind_front should not be defined before 
c++2a" # endif @@ -933,6 +987,14 @@ # error "__cpp_lib_as_const should have the value 201510L in c++17" # endif +# ifdef __cpp_lib_atomic_flag_test +# error "__cpp_lib_atomic_flag_test should not be defined before c++2a" +# endif + +# ifdef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should not be defined before c++2a" +# endif + # if !defined(_LIBCPP_HAS_NO_THREADS) # ifndef __cpp_lib_atomic_is_always_lock_free # error "__cpp_lib_atomic_is_always_lock_free should be defined in c++17" @@ -946,10 +1008,26 @@ # endif # endif +# ifdef __cpp_lib_atomic_lock_free_type_aliases +# error "__cpp_lib_atomic_lock_free_type_aliases should not be defined before c++2a" +# endif + # ifdef __cpp_lib_atomic_ref # error "__cpp_lib_atomic_ref should not be defined before c++2a" # endif +# ifdef __cpp_lib_atomic_shared_ptr +# error "__cpp_lib_atomic_shared_ptr should not be defined before c++2a" +# endif + +# ifdef __cpp_lib_atomic_value_initialization +# error "__cpp_lib_atomic_value_initialization should not be defined before c++2a" +# endif + +# ifdef __cpp_lib_atomic_wait +# error "__cpp_lib_atomic_wait should not be defined before c++2a" +# endif + # ifdef __cpp_lib_bind_front # error "__cpp_lib_bind_front should not be defined before c++2a" # endif @@ -1575,6 +1653,32 @@ # error "__cpp_lib_as_const should have the value 201510L in c++2a" # endif +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_atomic_flag_test +# error "__cpp_lib_atomic_flag_test should be defined in c++2a" +# endif +# if __cpp_lib_atomic_flag_test != 201907L +# error "__cpp_lib_atomic_flag_test should have the value 201907L in c++2a" +# endif +# else +# ifdef __cpp_lib_atomic_flag_test +# error "__cpp_lib_atomic_flag_test should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!" +# endif +# endif + +# if !defined(_LIBCPP_VERSION) +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++2a" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++2a" +# endif +# else // _LIBCPP_VERSION +# ifdef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!" +# endif +# endif + # if !defined(_LIBCPP_HAS_NO_THREADS) # ifndef __cpp_lib_atomic_is_always_lock_free # error "__cpp_lib_atomic_is_always_lock_free should be defined in c++2a" @@ -1588,6 +1692,19 @@ # endif # endif +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_atomic_lock_free_type_aliases +# error "__cpp_lib_atomic_lock_free_type_aliases should be defined in c++2a" +# endif +# if __cpp_lib_atomic_lock_free_type_aliases != 201907L +# error "__cpp_lib_atomic_lock_free_type_aliases should have the value 201907L in c++2a" +# endif +# else +# ifdef __cpp_lib_atomic_lock_free_type_aliases +# error "__cpp_lib_atomic_lock_free_type_aliases should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!" 
+# endif +# endif + # if !defined(_LIBCPP_VERSION) # ifndef __cpp_lib_atomic_ref # error "__cpp_lib_atomic_ref should be defined in c++2a" @@ -1601,6 +1718,45 @@ # endif # endif +# if !defined(_LIBCPP_VERSION) +# ifndef __cpp_lib_atomic_shared_ptr +# error "__cpp_lib_atomic_shared_ptr should be defined in c++2a" +# endif +# if __cpp_lib_atomic_shared_ptr != 201711L +# error "__cpp_lib_atomic_shared_ptr should have the value 201711L in c++2a" +# endif +# else // _LIBCPP_VERSION +# ifdef __cpp_lib_atomic_shared_ptr +# error "__cpp_lib_atomic_shared_ptr should not be defined because it is unimplemented in libc++!" +# endif +# endif + +# if !defined(_LIBCPP_VERSION) +# ifndef __cpp_lib_atomic_value_initialization +# error "__cpp_lib_atomic_value_initialization should be defined in c++2a" +# endif +# if __cpp_lib_atomic_value_initialization != 201911L +# error "__cpp_lib_atomic_value_initialization should have the value 201911L in c++2a" +# endif +# else // _LIBCPP_VERSION +# ifdef __cpp_lib_atomic_value_initialization +# error "__cpp_lib_atomic_value_initialization should not be defined because it is unimplemented in libc++!" +# endif +# endif + +# if !defined(_LIBCPP_HAS_NO_THREADS) +# ifndef __cpp_lib_atomic_wait +# error "__cpp_lib_atomic_wait should be defined in c++2a" +# endif +# if __cpp_lib_atomic_wait != 201907L +# error "__cpp_lib_atomic_wait should have the value 201907L in c++2a" +# endif +# else +# ifdef __cpp_lib_atomic_wait +# error "__cpp_lib_atomic_wait should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!" +# endif +# endif + # if !defined(_LIBCPP_VERSION) # ifndef __cpp_lib_bind_front # error "__cpp_lib_bind_front should be defined in c++2a" diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.cat/member_function_pointer_no_variadics.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.cat/member_function_pointer_no_variadics.pass.cpp deleted file mode 100644 index 916c580d59120..0000000000000 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.cat/member_function_pointer_no_variadics.pass.cpp +++ /dev/null @@ -1,84 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// type_traits - -// member_function_pointer - -#define _LIBCPP_HAS_NO_VARIADICS -#include - -#include "test_macros.h" - -template -void test_member_function_pointer_imp() -{ - static_assert(!std::is_void::value, ""); -#if TEST_STD_VER > 11 - static_assert(!std::is_null_pointer::value, ""); -#endif - static_assert(!std::is_integral::value, ""); - static_assert(!std::is_floating_point::value, ""); - static_assert(!std::is_array::value, ""); - static_assert(!std::is_pointer::value, ""); - static_assert(!std::is_lvalue_reference::value, ""); - static_assert(!std::is_rvalue_reference::value, ""); - static_assert(!std::is_member_object_pointer::value, ""); - static_assert( std::is_member_function_pointer::value, ""); - static_assert(!std::is_enum::value, ""); - static_assert(!std::is_union::value, ""); - static_assert(!std::is_class::value, ""); - static_assert(!std::is_function::value, ""); -} - -template -void test_member_function_pointer() -{ - test_member_function_pointer_imp(); - test_member_function_pointer_imp(); - test_member_function_pointer_imp(); - test_member_function_pointer_imp(); -} - -class Class -{ -}; - -struct incomplete_type; - -int main(int, char**) -{ - test_member_function_pointer(); - test_member_function_pointer(); - test_member_function_pointer(); - - test_member_function_pointer(); - test_member_function_pointer(); - test_member_function_pointer(); - - test_member_function_pointer(); - test_member_function_pointer(); - test_member_function_pointer(); - - test_member_function_pointer(); - test_member_function_pointer(); - test_member_function_pointer(); - - test_member_function_pointer(); - test_member_function_pointer(); - test_member_function_pointer(); - - test_member_function_pointer(); - test_member_function_pointer(); - test_member_function_pointer(); - -// LWG#2582 - static_assert(!std::is_member_function_pointer::value, ""); - - return 0; -} diff --git a/libcxx/test/support/cmpxchg_loop.h b/libcxx/test/support/cmpxchg_loop.h index 50bd00a30bdba..e341606098131 100644 --- a/libcxx/test/support/cmpxchg_loop.h +++ b/libcxx/test/support/cmpxchg_loop.h @@ -8,8 +8,8 @@ #include -template -bool cmpxchg_weak_loop(A& atomic, T& expected, T desired) { +template +bool cmpxchg_weak_loop(A& atomic, typename A::value_type& expected, typename A::value_type desired) { for (int i = 0; i < 10; i++) { if (atomic.compare_exchange_weak(expected, desired) == true) { return true; @@ -19,8 +19,8 @@ bool cmpxchg_weak_loop(A& atomic, T& expected, T desired) { return false; } -template -bool cmpxchg_weak_loop(A& atomic, T& expected, T desired, +template +bool cmpxchg_weak_loop(A& atomic, typename A::value_type& expected, typename A::value_type desired, std::memory_order success, std::memory_order failure) { for (int i = 0; i < 10; i++) { @@ -33,8 +33,8 @@ bool cmpxchg_weak_loop(A& atomic, T& expected, T desired, return false; } -template -bool c_cmpxchg_weak_loop(A* atomic, T* expected, T desired) { +template +bool c_cmpxchg_weak_loop(A* atomic, typename A::value_type* expected, typename A::value_type desired) { for (int i = 0; i < 10; i++) { if (std::atomic_compare_exchange_weak(atomic, expected, desired) == true) { return true; @@ -44,8 +44,8 @@ bool c_cmpxchg_weak_loop(A* atomic, T* expected, T desired) { return false; } -template -bool c_cmpxchg_weak_loop(A* atomic, T* expected, T desired, +template +bool c_cmpxchg_weak_loop(A* atomic, 
typename A::value_type* expected, typename A::value_type desired, std::memory_order success, std::memory_order failure) { for (int i = 0; i < 10; i++) { diff --git a/libcxx/utils/ci/macos-backdeployment.sh b/libcxx/utils/ci/macos-backdeployment.sh index 24b866cdc1aef..04549aa346456 100755 --- a/libcxx/utils/ci/macos-backdeployment.sh +++ b/libcxx/utils/ci/macos-backdeployment.sh @@ -134,7 +134,7 @@ echo "@@@ Running tests for libc++ @@@" ${ENABLE_FILESYSTEM} \ --param=cxx_headers="${LLVM_INSTALL_DIR}/include/c++/v1" \ --param=std="${STD}" \ - --param=platform="macosx${DEPLOYMENT_TARGET}" \ + --param=target_triple="x86_64-apple-macosx${DEPLOYMENT_TARGET}" \ --param=cxx_library_root="${LLVM_INSTALL_DIR}/lib" \ --param=cxx_runtime_root="${LIBCXX_ROOT_ON_DEPLOYMENT_TARGET}" \ --param=abi_library_path="${LIBCXXABI_ROOT_ON_DEPLOYMENT_TARGET}" \ diff --git a/libcxx/utils/docker/debian9/buildbot/Dockerfile b/libcxx/utils/docker/debian9/buildbot/Dockerfile index ea2ac9d55933e..7da50687b9527 100644 --- a/libcxx/utils/docker/debian9/buildbot/Dockerfile +++ b/libcxx/utils/docker/debian9/buildbot/Dockerfile @@ -14,7 +14,6 @@ ADD install-packages.sh /tmp/ RUN /tmp/install-packages.sh && rm /tmp/install-packages.sh COPY --from=ericwf/gcc:5.5.0 /compiler /opt/gcc-5 -COPY --from=ericwf/llvm:9.x /compiler /opt/llvm-9 FROM base-image as worker-image diff --git a/libcxx/utils/docker/debian9/buildbot/docker-compose.yml b/libcxx/utils/docker/debian9/buildbot/docker-compose.yml index f9a2a2ad9c31c..bd61dea4871c6 100644 --- a/libcxx/utils/docker/debian9/buildbot/docker-compose.yml +++ b/libcxx/utils/docker/debian9/buildbot/docker-compose.yml @@ -5,7 +5,7 @@ services: context: https://github.com/llvm/llvm-project.git#master:libcxx/utils/docker/debian9/buildbot args: gcc_tot: "ericwf/gcc:9.2.0" - llvm_tot: "ericwf/llvm:9.x" + llvm_tot: "ericwf/llvm:11.x" image: llvm-buildbot-worker volumes: - /var/run/docker.sock:/var/run/docker.sock diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index 6ad1a18569893..211702e9982c9 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -613,6 +613,57 @@ def add_version_header(tc): }, "headers": ["utility"], }, + {"name": "__cpp_lib_atomic_flag_test", + "values": { + "c++2a": int(201907), + }, + "headers": ["atomic"], + "depends": "!defined(_LIBCPP_HAS_NO_THREADS)", + "internal_depends": "!defined(_LIBCPP_HAS_NO_THREADS)", + }, + {"name": "__cpp_lib_atomic_lock_free_type_aliases", + "values": { + "c++2a": int(201907), + }, + "headers": ["atomic"], + "depends": "!defined(_LIBCPP_HAS_NO_THREADS)", + "internal_depends": "!defined(_LIBCPP_HAS_NO_THREADS)", + }, + {"name": "__cpp_lib_atomic_wait", + "values": { + "c++2a": int(201907), + }, + "headers": ["atomic"], + "depends": "!defined(_LIBCPP_HAS_NO_THREADS)", + "internal_depends": "!defined(_LIBCPP_HAS_NO_THREADS)", + }, + {"name": "__cpp_lib_atomic_float", + "values": { + "c++2a": int(201711), + }, + "headers": ["atomic"], + "unimplemented": True, + "depends": "!defined(_LIBCPP_HAS_NO_THREADS)", + "internal_depends": "!defined(_LIBCPP_HAS_NO_THREADS)", + }, + {"name": "__cpp_lib_atomic_shared_ptr", + "values": { + "c++2a": int(201711), + }, + "headers": ["atomic"], + "unimplemented": True, + "depends": "!defined(_LIBCPP_HAS_NO_THREADS)", + "internal_depends": "!defined(_LIBCPP_HAS_NO_THREADS)", + }, + {"name": "__cpp_lib_atomic_value_initialization", + "values": { + "c++2a": 
int(201911), + }, + "headers": ["atomic", "memory"], + "unimplemented": True, + "depends": "!defined(_LIBCPP_HAS_NO_THREADS)", + "internal_depends": "!defined(_LIBCPP_HAS_NO_THREADS)", + }, ]], key=lambda tc: tc["name"]) def get_std_dialects(): diff --git a/libcxx/utils/libcxx/test/config.py b/libcxx/utils/libcxx/test/config.py index d54ee8fa32913..fdc8bbce1cf18 100644 --- a/libcxx/utils/libcxx/test/config.py +++ b/libcxx/utils/libcxx/test/config.py @@ -8,7 +8,6 @@ import copy import os -import platform import pkgutil import pipes import re @@ -72,7 +71,6 @@ def __init__(self, lit_config, config): self.link_shared = self.get_lit_bool('enable_shared', default=True) self.debug_build = self.get_lit_bool('debug_build', default=False) self.exec_env = dict() - self.use_target = False self.use_system_cxx_lib = self.get_lit_bool('use_system_cxx_lib', False) self.use_clang_verify = False @@ -123,7 +121,6 @@ def configure(self): self.executor = self.get_lit_conf('executor') self.configure_cxx() self.configure_triple() - self.configure_deployment() self.configure_src_root() self.configure_obj_root() self.cxx_stdlib_under_test = self.get_lit_conf('cxx_stdlib_under_test', 'libc++') @@ -148,6 +145,8 @@ def configure(self): self.lit_config ) + self.lit_config.note("All available features: {}".format(self.config.available_features)) + def print_config_info(self): if self.cxx.use_modules: self.lit_config.note('Using modules flags: %s' % @@ -246,22 +245,12 @@ def configure_features(self): # XFAIL markers for tests that are known to fail with versions of # libc++ as were shipped with a particular triple. if self.use_system_cxx_lib: - self.config.available_features.add('with_system_cxx_lib=%s' % self.config.target_triple) - - # Add available features for more generic versions of the target - # triple attached to with_system_cxx_lib. - if self.use_deployment: - (_, name, version) = self.config.deployment - self.config.available_features.add('with_system_cxx_lib=%s' % name) - self.config.available_features.add('with_system_cxx_lib=%s%s' % (name, version)) - - # Configure the availability feature. Availability is only enabled - # with libc++, because other standard libraries do not provide - # availability markup. - if self.use_deployment and self.cxx_stdlib_under_test == 'libc++': - (_, name, version) = self.config.deployment - self.config.available_features.add('availability=%s' % name) - self.config.available_features.add('availability=%s%s' % (name, version)) + (arch, vendor, platform) = self.config.target_triple.split('-', 2) + (sysname, version) = re.match(r'([^0-9]+)([0-9\.]*)', platform).groups() + + self.config.available_features.add('with_system_cxx_lib={}-{}-{}{}'.format(arch, vendor, sysname, version)) + self.config.available_features.add('with_system_cxx_lib={}{}'.format(sysname, version)) + self.config.available_features.add('with_system_cxx_lib={}'.format(sysname)) if self.target_info.is_windows(): if self.cxx_stdlib_under_test == 'libc++': @@ -297,6 +286,7 @@ def configure_default_compile_flags(self): # Configure include paths self.configure_compile_flags_header_includes() self.target_info.add_cxx_compile_flags(self.cxx.compile_flags) + self.target_info.add_cxx_flags(self.cxx.flags) # Configure feature flags. enable_32bit = self.get_lit_bool('enable_32bit', False) if enable_32bit: @@ -315,20 +305,19 @@ def configure_default_compile_flags(self): # being elided. 
if self.target_info.is_windows() and self.debug_build: self.cxx.compile_flags += ['-D_DEBUG'] - if self.use_target: - if not self.cxx.addFlagIfSupported( - ['--target=' + self.config.target_triple]): - self.lit_config.warning('use_target is true but --target is '\ - 'not supported by the compiler') - if self.use_deployment: - arch, name, version = self.config.deployment - self.cxx.flags += ['-arch', arch] - self.cxx.flags += ['-m' + name + '-version-min=' + version] + if not self.cxx.addFlagIfSupported(['--target=' + self.config.target_triple]): + self.lit_config.warning('Not adding any target triple -- the compiler does ' + 'not support --target=') # Add includes for support headers used in the tests. support_path = os.path.join(self.libcxx_src_root, 'test/support') self.cxx.compile_flags += ['-I' + support_path] + # If we're testing the upstream LLVM libc++, disable availability markup, + # which is not relevant for non-shipped flavors of libc++. + if not self.use_system_cxx_lib: + self.cxx.compile_flags += ['-D_LIBCPP_DISABLE_AVAILABILITY'] + # Add includes for the PSTL headers pstl_src_root = self.get_lit_conf('pstl_src_root') pstl_obj_root = self.get_lit_conf('pstl_obj_root') @@ -639,37 +628,15 @@ def configure_substitutions(self): if self.get_lit_conf('libcxx_gdb'): sub.append(('%{libcxx_gdb}', self.get_lit_conf('libcxx_gdb'))) - def can_use_deployment(self): - # Check if the host is on an Apple platform using clang. - if not self.target_info.is_darwin(): - return False - if not self.target_info.is_host_macosx(): - return False - if not self.cxx.type.endswith('clang'): - return False - return True - def configure_triple(self): # Get or infer the target triple. target_triple = self.get_lit_conf('target_triple') - self.use_target = self.get_lit_bool('use_target', False) - if self.use_target and target_triple: - self.lit_config.warning('use_target is true but no triple is specified') - - # Use deployment if possible. - self.use_deployment = not self.use_target and self.can_use_deployment() - if self.use_deployment: - return - - # Save the triple (and warn on Apple platforms). - self.config.target_triple = target_triple - if self.use_target and 'apple' in target_triple: - self.lit_config.warning('consider using arch and platform instead' - ' of target_triple on Apple platforms') # If no target triple was given, try to infer it from the compiler # under test. - if not self.config.target_triple: + if not target_triple: + self.lit_config.note('Trying to infer the target_triple because none was specified') + target_triple = self.cxx.getTriple() # Drop sub-major version components from the triple, because the # current XFAIL handling expects exact matches for feature checks. @@ -684,44 +651,10 @@ def configure_triple(self): if (target_triple.endswith('redhat-linux') or target_triple.endswith('suse-linux')): target_triple += '-gnu' - self.config.target_triple = target_triple - self.lit_config.note( - "inferred target_triple as: %r" % self.config.target_triple) - - def configure_deployment(self): - assert not self.use_deployment is None - assert not self.use_target is None - if not self.use_deployment: - # Warn about ignored parameters. 
- if self.get_lit_conf('arch'): - self.lit_config.warning('ignoring arch, using target_triple') - if self.get_lit_conf('platform'): - self.lit_config.warning('ignoring platform, using target_triple') - return - - assert not self.use_target - assert self.target_info.is_host_macosx() - - # Always specify deployment explicitly on Apple platforms, since - # otherwise a platform is picked up from the SDK. If the SDK version - # doesn't match the system version, tests that use the system library - # may fail spuriously. - arch = self.get_lit_conf('arch') - if not arch: - arch = self.cxx.getTriple().split('-', 1)[0] - - _, name, version = self.target_info.get_platform() - self.config.deployment = (arch, name, version) - - # Set the target triple for use by lit. - self.config.target_triple = arch + '-apple-' + name + version - self.lit_config.note( - "computed target_triple as: %r" % self.config.target_triple) - # If we're testing the upstream LLVM libc++, disable availability markup, - # which is not relevant for non-shipped flavors of libc++. - if not self.use_system_cxx_lib: - self.cxx.compile_flags += ['-D_LIBCPP_DISABLE_AVAILABILITY'] + # Save the triple + self.lit_config.note("Setting target_triple to {}".format(target_triple)) + self.config.target_triple = target_triple def configure_env(self): self.config.environment = dict(os.environ) diff --git a/libcxx/utils/libcxx/test/target_info.py b/libcxx/utils/libcxx/test/target_info.py index 3197276ffa5b5..130d5600ed173 100644 --- a/libcxx/utils/libcxx/test/target_info.py +++ b/libcxx/utils/libcxx/test/target_info.py @@ -30,6 +30,7 @@ def is_windows(self): def is_darwin(self): return self.platform() == 'darwin' + def add_cxx_flags(self, flags): pass def add_cxx_compile_flags(self, flags): pass def add_cxx_link_flags(self, flags): pass def allow_cxxabi_link(self): return True @@ -73,34 +74,8 @@ def get_sdk_version(self, name): return re.sub(r'.*/[^0-9]+([0-9.]+)\.sdk', r'\1', out) - def get_platform(self): - platform = self.full_config.get_lit_conf('platform') - if platform: - platform = re.sub(r'([^0-9]+)([0-9\.]*)', r'\1-\2', platform) - name, version = tuple(platform.split('-', 1)) - else: - name = 'macosx' - version = None - - if version: - return (False, name, version) - - # Infer the version, either from the SDK or the system itself. For - # macosx, ignore the SDK version; what matters is what's at - # /usr/lib/libc++.dylib. - if name == 'macosx': - version = self.get_macosx_version() - else: - version = self.get_sdk_version(name) - return (True, name, version) - - def add_cxx_compile_flags(self, flags): - if self.full_config.use_deployment: - _, name, _ = self.full_config.config.deployment - cmd = ['xcrun', '--sdk', name, '--show-sdk-path'] - else: - cmd = ['xcrun', '--show-sdk-path'] - out, err, exit_code = executeCommand(cmd) + def add_cxx_flags(self, flags): + out, err, exit_code = executeCommand(['xcrun', '--show-sdk-path']) if exit_code != 0: self.full_config.lit_config.warning("Could not determine macOS SDK path! stderr was " + err) if exit_code == 0 and out: diff --git a/libcxx/www/cxx2a_status.html b/libcxx/www/cxx2a_status.html index 73a2c50c71c90..88df02bcb117d 100644 --- a/libcxx/www/cxx2a_status.html +++ b/libcxx/www/cxx2a_status.html @@ -261,7 +261,7 @@

    Paper Status

    The missing bits in P0600 are in [mem.res.class], [mem.poly.allocator.class], and [container.node.overview]

-    The missing bits in P0202 are in copy, copy_backwards, move, and move_backwards (and the ones that call them: copy_n, rotate_copy, merge, set_union, set_difference, and set_symmetric_difference). This is because the first four algorithms have specializations that call memmove which is not constexpr. See Bug 25165
+    The missing bits in P0202 are in copy and copy_backward (and the ones that call them: copy_n, set_union, set_difference, and set_symmetric_difference). This is because the first two algorithms have specializations that call memmove, which is not constexpr. See Bug 25165

    Library Working group Issues Status

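The P0202 note above comes down to one implementation detail: the optimized copy paths bottom out in memmove, and memmove cannot be called during constant evaluation. A minimal sketch of the usual workaround, assuming C++20's std::is_constant_evaluated() (illustrative only, not the actual libc++ implementation):

    #include <cstring>
    #include <type_traits>

    // Element-wise copy during constant evaluation, raw-byte copy at run time.
    template <class T>
    constexpr T* constexpr_copy(const T* first, const T* last, T* out) {
      static_assert(std::is_trivially_copyable_v<T>,
                    "this sketch covers only the memmove-eligible case");
      if (std::is_constant_evaluated()) {
        for (; first != last; ++first, ++out)
          *out = *first; // plain loads and stores are fine in constexpr
        return out;
      }
      std::memmove(out, first,
                   static_cast<std::size_t>(last - first) * sizeof(T));
      return out + (last - first);
    }

With a dispatch like this in place, the callers listed above would pick up constexpr support automatically, since they only forward to copy and copy_backward.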
    diff --git a/libcxxabi/CMakeLists.txt b/libcxxabi/CMakeLists.txt index 96a1c625222a8..10ac112c90d9f 100644 --- a/libcxxabi/CMakeLists.txt +++ b/libcxxabi/CMakeLists.txt @@ -352,6 +352,7 @@ if (NOT LIBCXXABI_ENABLE_THREADS) " is also set to ON.") endif() add_definitions(-D_LIBCXXABI_HAS_NO_THREADS) + add_definitions(-D_LIBCPP_HAS_NO_THREADS) endif() if (LIBCXXABI_HAS_EXTERNAL_THREAD_API) diff --git a/libcxxabi/include/cxxabi.h b/libcxxabi/include/cxxabi.h index 29e28a69a9195..43ce6f5f740d5 100644 --- a/libcxxabi/include/cxxabi.h +++ b/libcxxabi/include/cxxabi.h @@ -137,9 +137,9 @@ __cxa_vec_cctor(void *dest_array, void *src_array, size_t element_count, void (*destructor)(void *)); // 3.3.5.3 Runtime API -extern _LIBCXXABI_FUNC_VIS int __cxa_atexit(void (*f)(void *), void *p, - void *d); -extern _LIBCXXABI_FUNC_VIS int __cxa_finalize(void *); +// These functions are part of the C++ ABI, but they are not defined in libc++abi: +// int __cxa_atexit(void (*)(void *), void *, void *); +// void __cxa_finalize(void *); // 3.4 Demangler API extern _LIBCXXABI_FUNC_VIS char *__cxa_demangle(const char *mangled_name, diff --git a/libcxxabi/test/lit.site.cfg.in b/libcxxabi/test/lit.site.cfg.in index 06d5706da7d24..87f955e321610 100644 --- a/libcxxabi/test/lit.site.cfg.in +++ b/libcxxabi/test/lit.site.cfg.in @@ -25,7 +25,6 @@ config.enable_shared = @LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXX@ config.enable_exceptions = @LIBCXXABI_ENABLE_EXCEPTIONS@ config.host_triple = "@LLVM_HOST_TRIPLE@" config.target_triple = "@TARGET_TRIPLE@" -config.use_target = bool("@LIBCXXABI_TARGET_TRIPLE@") config.sysroot = "@LIBCXXABI_SYSROOT@" config.gcc_toolchain = "@LIBCXXABI_GCC_TOOLCHAIN@" config.cxx_ext_threads = @LIBCXXABI_BUILD_EXTERNAL_THREAD_LIBRARY@ diff --git a/libunwind/src/AddressSpace.hpp b/libunwind/src/AddressSpace.hpp index e6f2609d679b9..26397c28798e1 100644 --- a/libunwind/src/AddressSpace.hpp +++ b/libunwind/src/AddressSpace.hpp @@ -98,22 +98,15 @@ extern char __eh_frame_hdr_end; extern char __exidx_start; extern char __exidx_end; -#elif defined(_LIBUNWIND_ARM_EHABI) || defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) - -// ELF-based systems may use dl_iterate_phdr() to access sections -// containing unwinding information. The ElfW() macro for pointer-size -// independent ELF header traversal is not provided by on some -// systems (e.g., FreeBSD). On these systems the data structures are -// just called Elf_XXX. Define ElfW() locally. -#ifndef _WIN32 -#include -#else +#elif defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) && defined(_WIN32) + #include #include -#endif -#if !defined(ElfW) -#define ElfW(type) Elf_##type -#endif + +#elif defined(_LIBUNWIND_USE_DL_ITERATE_PHDR) || \ + defined(_LIBUNWIND_USE_DL_UNWIND_FIND_EXIDX) + +#include #endif @@ -126,6 +119,10 @@ struct UnwindInfoSections { // No dso_base for SEH or ARM EHABI. 
uintptr_t dso_base; #endif +#if defined(_LIBUNWIND_USE_DL_ITERATE_PHDR) && \ + defined(_LIBUNWIND_SUPPORT_DWARF_INDEX) + uintptr_t text_segment_length; +#endif #if defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) uintptr_t dwarf_section; uintptr_t dwarf_section_length; @@ -351,23 +348,14 @@ LocalAddressSpace::getEncodedP(pint_t &addr, pint_t end, uint8_t encoding, return result; } -#ifdef __APPLE__ -#elif defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) && defined(_LIBUNWIND_IS_BAREMETAL) -#elif defined(_LIBUNWIND_ARM_EHABI) && defined(_LIBUNWIND_IS_BAREMETAL) -#elif defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) && defined(_WIN32) -#elif defined(_LIBUNWIND_SUPPORT_SEH_UNWIND) && defined(_WIN32) -#elif defined(_LIBUNWIND_ARM_EHABI) && defined(__BIONIC__) -// Code inside findUnwindSections handles all these cases. -// -// Although the above ifdef chain is ugly, there doesn't seem to be a cleaner -// way to handle it. The generalized boolean expression is: -// -// A OR (B AND C) OR (D AND C) OR (B AND E) OR (F AND E) OR (D AND G) -// -// Running it through various boolean expression simplifiers gives expressions -// that don't help at all. -#elif defined(_LIBUNWIND_ARM_EHABI) || defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) +#if defined(_LIBUNWIND_USE_DL_ITERATE_PHDR) +// The ElfW() macro for pointer-size independent ELF header traversal is not +// provided by on some systems (e.g., FreeBSD). On these systems the +// data structures are just called Elf_XXX. Define ElfW() locally. +#if !defined(ElfW) + #define ElfW(type) Elf_##type +#endif #if !defined(Elf_Half) typedef ElfW(Half) Elf_Half; #endif @@ -426,7 +414,7 @@ static bool checkAddrInSegment(const Elf_Phdr *phdr, size_t image_base, uintptr_t end = begin + phdr->p_memsz; if (cbdata->targetAddr >= begin && cbdata->targetAddr < end) { cbdata->sects->dso_base = begin; - cbdata->sects->dwarf_section_length = phdr->p_memsz; + cbdata->sects->text_segment_length = phdr->p_memsz; return true; } } @@ -466,8 +454,12 @@ static int findUnwindSectionsByPhdr(struct dl_phdr_info *pinfo, found_hdr = EHHeaderParser::decodeEHHdr( *cbdata->addressSpace, eh_frame_hdr_start, phdr->p_memsz, hdrInfo); - if (found_hdr) + if (found_hdr) { + // .eh_frame_hdr records the start of .eh_frame, but not its size. + // Rely on a zero terminator to find the end of the section. 
cbdata->sects->dwarf_section = hdrInfo.eh_frame_ptr; + cbdata->sects->dwarf_section_length = UINTPTR_MAX; + } } else if (!found_obj) { found_obj = checkAddrInSegment(phdr, image_base, cbdata); } @@ -478,13 +470,10 @@ static int findUnwindSectionsByPhdr(struct dl_phdr_info *pinfo, return 1; } } - cbdata->sects->dwarf_section_length = 0; return 0; } -#else // defined(LIBUNWIND_SUPPORT_DWARF_UNWIND) -// Given all the #ifdef's above, the code here is for -// defined(LIBUNWIND_ARM_EHABI) +#elif defined(_LIBUNWIND_ARM_EHABI) static int findUnwindSectionsByPhdr(struct dl_phdr_info *pinfo, size_t, void *data) { @@ -516,8 +505,9 @@ static int findUnwindSectionsByPhdr(struct dl_phdr_info *pinfo, size_t, } return found_obj && found_hdr; } -#endif // defined(LIBUNWIND_SUPPORT_DWARF_UNWIND) -#endif // defined(_LIBUNWIND_ARM_EHABI) || defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) + +#endif +#endif // defined(_LIBUNWIND_USE_DL_ITERATE_PHDR) inline bool LocalAddressSpace::findUnwindSections(pint_t targetAddr, @@ -535,6 +525,7 @@ inline bool LocalAddressSpace::findUnwindSections(pint_t targetAddr, return true; } #elif defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) && defined(_LIBUNWIND_IS_BAREMETAL) + info.dso_base = 0; // Bare metal is statically linked, so no need to ask the dynamic loader info.dwarf_section_length = (uintptr_t)(&__eh_frame_end - &__eh_frame_start); info.dwarf_section = (uintptr_t)(&__eh_frame_start); @@ -601,16 +592,14 @@ inline bool LocalAddressSpace::findUnwindSections(pint_t targetAddr, (void)targetAddr; (void)info; return true; -#elif defined(_LIBUNWIND_ARM_EHABI) && defined(__BIONIC__) - // For ARM EHABI, Bionic didn't implement dl_iterate_phdr until API 21. After - // API 21, dl_iterate_phdr exists, but dl_unwind_find_exidx is much faster. +#elif defined(_LIBUNWIND_USE_DL_UNWIND_FIND_EXIDX) int length = 0; info.arm_section = (uintptr_t)dl_unwind_find_exidx((_Unwind_Ptr)targetAddr, &length); info.arm_section_length = (uintptr_t)length * sizeof(EHABIIndexEntry); if (info.arm_section && info.arm_section_length) return true; -#elif defined(_LIBUNWIND_ARM_EHABI) || defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) +#elif defined(_LIBUNWIND_USE_DL_ITERATE_PHDR) dl_iterate_cb_data cb_data = {this, &info, targetAddr}; int found = dl_iterate_phdr(findUnwindSectionsByPhdr, &cb_data); return static_cast(found); diff --git a/libunwind/src/DwarfInstructions.hpp b/libunwind/src/DwarfInstructions.hpp index ee98f538d437e..c39cabe1f7830 100644 --- a/libunwind/src/DwarfInstructions.hpp +++ b/libunwind/src/DwarfInstructions.hpp @@ -93,7 +93,8 @@ typename A::pint_t DwarfInstructions::getSavedRegister( case CFI_Parser::kRegisterInRegister: return registers.getRegister((int)savedReg.value); - + case CFI_Parser::kRegisterUndefined: + return 0; case CFI_Parser::kRegisterUnused: case CFI_Parser::kRegisterOffsetFromCFA: // FIX ME @@ -117,6 +118,7 @@ double DwarfInstructions::getSavedFloatRegister( case CFI_Parser::kRegisterIsExpression: case CFI_Parser::kRegisterUnused: + case CFI_Parser::kRegisterUndefined: case CFI_Parser::kRegisterOffsetFromCFA: case CFI_Parser::kRegisterInRegister: // FIX ME @@ -140,6 +142,7 @@ v128 DwarfInstructions::getSavedVectorRegister( case CFI_Parser::kRegisterIsExpression: case CFI_Parser::kRegisterUnused: + case CFI_Parser::kRegisterUndefined: case CFI_Parser::kRegisterOffsetFromCFA: case CFI_Parser::kRegisterInRegister: // FIX ME @@ -190,6 +193,10 @@ int DwarfInstructions::stepWithDwarf(A &addressSpace, pint_t pc, prolog.savedRegisters[i])); else return UNW_EBADREG; + } else if (i == 
(int)cieInfo.returnAddressRegister) { + // A leaf function keeps the return address in a register, and there are + // no explicit instructions for how to restore it. + returnAddress = registers.getRegister(cieInfo.returnAddressRegister); } } diff --git a/libunwind/src/DwarfParser.hpp b/libunwind/src/DwarfParser.hpp index c98c4f92a6ad3..86c0522afd3ff 100644 --- a/libunwind/src/DwarfParser.hpp +++ b/libunwind/src/DwarfParser.hpp @@ -69,6 +69,7 @@ class CFI_Parser { }; enum RegisterSavedWhere { kRegisterUnused, + kRegisterUndefined, kRegisterInCFA, kRegisterOffsetFromCFA, kRegisterInRegister, @@ -135,7 +136,7 @@ class CFI_Parser { }; static bool findFDE(A &addressSpace, pint_t pc, pint_t ehSectionStart, - uint32_t sectionLength, pint_t fdeHint, FDE_Info *fdeInfo, + uintptr_t sectionLength, pint_t fdeHint, FDE_Info *fdeInfo, CIE_Info *cieInfo); static const char *decodeFDE(A &addressSpace, pint_t fdeStart, FDE_Info *fdeInfo, CIE_Info *cieInfo); @@ -166,7 +167,7 @@ const char *CFI_Parser::decodeFDE(A &addressSpace, pint_t fdeStart, p += 8; } if (cfiLength == 0) - return "FDE has zero length"; // end marker + return "FDE has zero length"; // zero terminator uint32_t ciePointer = addressSpace.get32(p); if (ciePointer == 0) return "FDE is really a CIE"; // this is a CIE not an FDE @@ -211,11 +212,13 @@ const char *CFI_Parser::decodeFDE(A &addressSpace, pint_t fdeStart, /// Scan an eh_frame section to find an FDE for a pc template bool CFI_Parser::findFDE(A &addressSpace, pint_t pc, pint_t ehSectionStart, - uint32_t sectionLength, pint_t fdeHint, + uintptr_t sectionLength, pint_t fdeHint, FDE_Info *fdeInfo, CIE_Info *cieInfo) { //fprintf(stderr, "findFDE(0x%llX)\n", (long long)pc); pint_t p = (fdeHint != 0) ? fdeHint : ehSectionStart; - const pint_t ehSectionEnd = p + sectionLength; + const pint_t ehSectionEnd = (sectionLength == UINTPTR_MAX) + ? static_cast(-1) + : (ehSectionStart + sectionLength); while (p < ehSectionEnd) { pint_t currentCFI = p; //fprintf(stderr, "findFDE() CFI at 0x%llX\n", (long long)p); @@ -227,7 +230,7 @@ bool CFI_Parser::findFDE(A &addressSpace, pint_t pc, pint_t ehSectionStart, p += 8; } if (cfiLength == 0) - return false; // end marker + return false; // zero terminator uint32_t id = addressSpace.get32(p); if (id == 0) { // Skip over CIEs.
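To make the new unknown-length scan concrete, here is a minimal sketch of the loop findFDE now performs when findUnwindSectionsByPhdr reports a dwarf_section_length of UINTPTR_MAX. The read32() helper is a stand-in for LocalAddressSpace::get32(), and the sketch ignores the 64-bit DWARF length escape (0xffffffff) that the real parser also handles:

```cpp
// Sketch: walk CFI records when the section size is unknown. With a
// length of UINTPTR_MAX the loop is bounded only by the zero terminator,
// a 4-byte length field of 0 that ends .eh_frame.
#include <cstdint>
#include <cstring>

static uint32_t read32(uintptr_t addr) {
  uint32_t v;
  std::memcpy(&v, reinterpret_cast<const void *>(addr), sizeof(v));
  return v;
}

uintptr_t scanUntilTerminator(uintptr_t sectionStart, uintptr_t sectionLength) {
  uintptr_t end = (sectionLength == UINTPTR_MAX)
                      ? UINTPTR_MAX // no known end; rely on the terminator
                      : sectionStart + sectionLength;
  uintptr_t p = sectionStart;
  while (p < end) {
    uint32_t cfiLength = read32(p);
    p += 4;
    if (cfiLength == 0)
      break; // zero terminator reached: end of .eh_frame
    // ... a real parser decodes the CIE or FDE starting at p here ...
    p += cfiLength; // the length field excludes itself
  }
  return p;
}
```

This is also why FrameHeaderCache (further down) switches its HighPC() bound to text_segment_length: dwarf_section_length may now hold the UINTPTR_MAX sentinel rather than a usable size.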
@@ -503,7 +506,7 @@ bool CFI_Parser::parseInstructions(A &addressSpace, pint_t instructions, "malformed DW_CFA_undefined DWARF unwind, reg too big"); return false; } - results->setRegisterLocation(reg, kRegisterUnused, initialState); + results->setRegisterLocation(reg, kRegisterUndefined, initialState); _LIBUNWIND_TRACE_DWARF("DW_CFA_undefined(reg=%" PRIu64 ")\n", reg); break; case DW_CFA_same_value: diff --git a/libunwind/src/FrameHeaderCache.hpp b/libunwind/src/FrameHeaderCache.hpp index 813fcd408b262..54d5d33c3cd7e 100644 --- a/libunwind/src/FrameHeaderCache.hpp +++ b/libunwind/src/FrameHeaderCache.hpp @@ -32,7 +32,7 @@ class _LIBUNWIND_HIDDEN FrameHeaderCache { struct CacheEntry { uintptr_t LowPC() { return Info.dso_base; }; - uintptr_t HighPC() { return Info.dso_base + Info.dwarf_section_length; }; + uintptr_t HighPC() { return Info.dso_base + Info.text_segment_length; }; UnwindInfoSections Info; CacheEntry *Next; }; diff --git a/libunwind/src/UnwindCursor.hpp b/libunwind/src/UnwindCursor.hpp index e6a36764fc793..9f8fa65107b41 100644 --- a/libunwind/src/UnwindCursor.hpp +++ b/libunwind/src/UnwindCursor.hpp @@ -81,6 +81,7 @@ template class _LIBUNWIND_HIDDEN DwarfFDECache { typedef typename A::pint_t pint_t; public: + static constexpr pint_t kSearchAll = static_cast(-1); static pint_t findFDE(pint_t mh, pint_t pc); static void add(pint_t mh, pint_t ip_start, pint_t ip_end, pint_t fde); static void removeAllIn(pint_t mh); @@ -138,7 +139,7 @@ typename A::pint_t DwarfFDECache::findFDE(pint_t mh, pint_t pc) { pint_t result = 0; _LIBUNWIND_LOG_IF_FALSE(_lock.lock_shared()); for (entry *p = _buffer; p < _bufferUsed; ++p) { - if ((mh == p->mh) || (mh == 0)) { + if ((mh == p->mh) || (mh == kSearchAll)) { if ((p->ip_start <= pc) && (pc < p->ip_end)) { result = p->fde; break; @@ -1516,7 +1517,7 @@ bool UnwindCursor::getInfoFromDwarfSection(pint_t pc, // If compact encoding table gave offset into dwarf section, go directly there if (fdeSectionOffsetHint != 0) { foundFDE = CFI_Parser::findFDE(_addressSpace, pc, sects.dwarf_section, - (uint32_t)sects.dwarf_section_length, + sects.dwarf_section_length, sects.dwarf_section + fdeSectionOffsetHint, &fdeInfo, &cieInfo); } @@ -1533,7 +1534,7 @@ bool UnwindCursor::getInfoFromDwarfSection(pint_t pc, if (cachedFDE != 0) { foundFDE = CFI_Parser::findFDE(_addressSpace, pc, sects.dwarf_section, - (uint32_t)sects.dwarf_section_length, + sects.dwarf_section_length, cachedFDE, &fdeInfo, &cieInfo); foundInCache = foundFDE; } @@ -1541,7 +1542,7 @@ bool UnwindCursor::getInfoFromDwarfSection(pint_t pc, if (!foundFDE) { // Still not found, do full scan of __eh_frame section. foundFDE = CFI_Parser::findFDE(_addressSpace, pc, sects.dwarf_section, - (uint32_t)sects.dwarf_section_length, 0, + sects.dwarf_section_length, 0, &fdeInfo, &cieInfo); } if (foundFDE) { @@ -1945,7 +1946,8 @@ void UnwindCursor::setInfoBasedOnIPRegister(bool isReturnAddress) { #if defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) // There is no static unwind info for this pc. Look to see if an FDE was // dynamically registered for it. 
- pint_t cachedFDE = DwarfFDECache::findFDE(0, pc); + pint_t cachedFDE = DwarfFDECache::findFDE(DwarfFDECache::kSearchAll, + pc); if (cachedFDE != 0) { typename CFI_Parser::FDE_Info fdeInfo; typename CFI_Parser::CIE_Info cieInfo; diff --git a/libunwind/src/config.h b/libunwind/src/config.h index fd177dd7338c1..0885dccda07eb 100644 --- a/libunwind/src/config.h +++ b/libunwind/src/config.h @@ -34,7 +34,18 @@ #else #define _LIBUNWIND_SUPPORT_DWARF_UNWIND 1 #endif +#elif defined(_LIBUNWIND_IS_BAREMETAL) + #if !defined(_LIBUNWIND_ARM_EHABI) + #define _LIBUNWIND_SUPPORT_DWARF_UNWIND 1 + #define _LIBUNWIND_SUPPORT_DWARF_INDEX 1 + #endif +#elif defined(__BIONIC__) && defined(_LIBUNWIND_ARM_EHABI) + // For ARM EHABI, Bionic didn't implement dl_iterate_phdr until API 21. After + // API 21, dl_iterate_phdr exists, but dl_unwind_find_exidx is much faster. + #define _LIBUNWIND_USE_DL_UNWIND_FIND_EXIDX 1 #else + // Assume an ELF system with a dl_iterate_phdr function. + #define _LIBUNWIND_USE_DL_ITERATE_PHDR 1 #if !defined(_LIBUNWIND_ARM_EHABI) #define _LIBUNWIND_SUPPORT_DWARF_UNWIND 1 #define _LIBUNWIND_SUPPORT_DWARF_INDEX 1 diff --git a/libunwind/test/frameheadercache_test.pass.cpp b/libunwind/test/frameheadercache_test.pass.cpp index ebbc00464e072..15c7c67c58eae 100644 --- a/libunwind/test/frameheadercache_test.pass.cpp +++ b/libunwind/test/frameheadercache_test.pass.cpp @@ -3,27 +3,10 @@ #include "../src/config.h" // Only run this test under supported configurations. -// The frame header cache should work fine for other architectures, -// but the #ifdefs end up being even more complicated than this. -#if defined(__x86_64__) && defined(_LIBUNWIND_USE_FRAME_HEADER_CACHE) - -// This #if chain is ugly, but see the comments in AddressSpace.hpp for -// the reasoning. - -#ifdef __APPLE__ -int main() { return 0; } -#elif defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) && defined(_LIBUNWIND_IS_BAREMETAL) -int main() { return 0; } -#elif defined(_LIBUNWIND_ARM_EHABI) && defined(_LIBUNWIND_IS_BAREMETAL) -int main() { return 0; } -#elif defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) && defined(_WIN32) -int main() { return 0; } -#elif defined(_LIBUNWIND_SUPPORT_SEH_UNWIND) && defined(_WIN32) -int main() { return 0; } -#elif defined(_LIBUNWIND_ARM_EHABI) && defined(__BIONIC__) -int main() { return 0; } -#elif defined(_LIBUNWIND_ARM_EHABI) || defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) +#if defined(_LIBUNWIND_USE_DL_ITERATE_PHDR) && \ + defined(_LIBUNWIND_SUPPORT_DWARF_INDEX) && \ + defined(_LIBUNWIND_USE_FRAME_HEADER_CACHE) #include #include @@ -33,7 +16,7 @@ int main() { return 0; } #include "../src/AddressSpace.hpp" #define kBaseAddr 0xFFF000 -#define kDwarfSectionLength 0xFF +#define kTextSegmentLength 0xFF using namespace libunwind; @@ -49,7 +32,7 @@ int main() { UnwindInfoSections UIS; UIS.dso_base = kBaseAddr; - UIS.dwarf_section_length = kDwarfSectionLength; + UIS.text_segment_length = kTextSegmentLength; dl_iterate_cb_data CBData; // Unused by the cache. CBData.addressSpace = nullptr; @@ -75,7 +58,7 @@ int main() { abort(); // Add enough things to the cache that the entry is evicted. 
for (int i = 0; i < 9; i++) { - UIS.dso_base = kBaseAddr + (kDwarfSectionLength * i); + UIS.dso_base = kBaseAddr + (kTextSegmentLength * i); FHC.add(&UIS); } CBData.targetAddr = kBaseAddr; @@ -84,9 +67,7 @@ int main() { abort(); return 0; } -#else -int main() { return 0; } -#endif + #else int main() { return 0;} #endif diff --git a/libunwind/test/lit.site.cfg.in b/libunwind/test/lit.site.cfg.in index 30a996cf37837..84dae3c2bfb0d 100644 --- a/libunwind/test/lit.site.cfg.in +++ b/libunwind/test/lit.site.cfg.in @@ -25,7 +25,6 @@ config.enable_shared = @LIBCXX_ENABLE_SHARED@ config.arm_ehabi = @LIBUNWIND_USES_ARM_EHABI@ config.host_triple = "@LLVM_HOST_TRIPLE@" config.target_triple = "@TARGET_TRIPLE@" -config.use_target = bool("@LIBUNWIND_TARGET_TRIPLE@") config.sysroot = "@LIBUNWIND_SYSROOT@" config.gcc_toolchain = "@LIBUNWIND_GCC_TOOLCHAIN@" config.cxx_ext_threads = @LIBUNWIND_BUILD_EXTERNAL_THREAD_LIBRARY@ @@ -45,6 +44,10 @@ config.test_source_root = os.path.join(config.libunwind_src_root, 'test') # Allow expanding substitutions that are based on other substitutions config.recursiveExpansionLimit = 10 +# Make symbols available in the tests. +config.test_compiler_flags += " -funwind-tables " +config.test_linker_flags += " -Wl,--export-dynamic " + # Infer the test_exec_root from the build directory. config.test_exec_root = os.path.join(config.libunwind_obj_root, 'test') diff --git a/libunwind/test/signal_unwind.pass.cpp b/libunwind/test/signal_unwind.pass.cpp new file mode 100644 index 0000000000000..295dd75bb7264 --- /dev/null +++ b/libunwind/test/signal_unwind.pass.cpp @@ -0,0 +1,44 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Ensure that the unwinder can cope with a signal handler. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +_Unwind_Reason_Code frame_handler(struct _Unwind_Context* ctx, void* arg) { + (void)arg; + Dl_info info = { 0, 0, 0, 0 }; + assert(dladdr((void*)_Unwind_GetIP(ctx), &info)); + + // Unwind until main is reached; the frames above it depend on the platform and architecture. + if(info.dli_sname && !strcmp("main", info.dli_sname)) { + _Exit(0); + } + return _URC_NO_REASON; +} + +void signal_handler(int signum) { + (void)signum; + _Unwind_Backtrace(frame_handler, NULL); + _Exit(-1); +} + +int main() { + signal(SIGUSR1, signal_handler); + kill(getpid(), SIGUSR1); + return -2; +} diff --git a/libunwind/test/unwind_leaffunction.pass.cpp b/libunwind/test/unwind_leaffunction.pass.cpp new file mode 100644 index 0000000000000..b8a114516d0a6 --- /dev/null +++ b/libunwind/test/unwind_leaffunction.pass.cpp @@ -0,0 +1,50 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Ensure that a leaf function can be unwound.
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +_Unwind_Reason_Code frame_handler(struct _Unwind_Context* ctx, void* arg) { + (void)arg; + Dl_info info = { 0, 0, 0, 0 }; + assert(dladdr((void*)_Unwind_GetIP(ctx), &info)); + + // Unwind until main is reached; the frames above it depend on the platform and architecture. + if(info.dli_sname && !strcmp("main", info.dli_sname)) { + _Exit(0); + } + return _URC_NO_REASON; +} + +void signal_handler(int signum) { + (void)signum; + _Unwind_Backtrace(frame_handler, NULL); + _Exit(-1); +} + +int* faultyPointer = NULL; + +__attribute__((noinline)) void crashing_leaf_func(void) { + *faultyPointer = 0; +} + +int main() { + signal(SIGSEGV, signal_handler); + crashing_leaf_func(); + return -2; +} \ No newline at end of file diff --git a/lld/CMakeLists.txt b/lld/CMakeLists.txt index 7dae682cdef07..8b8c7178c616c 100644 --- a/lld/CMakeLists.txt +++ b/lld/CMakeLists.txt @@ -57,38 +57,19 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) include(CheckAtomic) if(LLVM_INCLUDE_TESTS) - if(CMAKE_VERSION VERSION_LESS 3.12) - include(FindPythonInterp) - if(NOT PYTHONINTERP_FOUND) - message(FATAL_ERROR - "Unable to find Python interpreter, required for testing. - - Please install Python or specify the PYTHON_EXECUTABLE CMake variable.") - endif() - - if(${PYTHON_VERSION_STRING} VERSION_LESS 2.7) - message(FATAL_ERROR "Python 2.7 or newer is required") + find_package(Python3 COMPONENTS Interpreter) + if(NOT Python3_Interpreter_FOUND) + message(WARNING "Python3 not found, using python2 as a fallback") + find_package(Python2 COMPONENTS Interpreter REQUIRED) + if(Python2_VERSION VERSION_LESS 2.7) + message(SEND_ERROR "Python 2.7 or newer is required") endif() - add_executable(Python3::Interpeter IMPORTED) + # Treat python2 as python3 + add_executable(Python3::Interpreter IMPORTED) set_target_properties(Python3::Interpreter PROPERTIES - IMPORTED_LOCATION ${PYTHON_EXECUTABLE}) - set(Python3_EXECUTABLE ${PYTHON_EXECUTABLE}) - else() - find_package(Python3 COMPONENTS Interpreter) - if(NOT Python3_Interpreter_FOUND) - message(WARNING "Python3 not found, using python2 as a fallback") - find_package(Python2 COMPONENTS Interpreter REQUIRED) - if(Python2_VERSION VERSION_LESS 2.7) - message(SEND_ERROR "Python 2.7 or newer is required") - endif() - - # Treat python2 as python3 - add_executable(Python3::Interpreter IMPORTED) - set_target_properties(Python3::Interpreter PROPERTIES - IMPORTED_LOCATION ${Python2_EXECUTABLE}) - set(Python3_EXECUTABLE ${Python2_EXECUTABLE}) - endif() + IMPORTED_LOCATION ${Python2_EXECUTABLE}) + set(Python3_EXECUTABLE ${Python2_EXECUTABLE}) endif() # Check prebuilt llvm/utils. @@ -193,6 +174,12 @@ endif() option(LLD_BUILD_TOOLS "Build the lld tools. If OFF, just generate build targets." ON) +option(LLD_DEFAULT_LD_LLD_IS_MINGW + "Use MinGW as the default backend for ld.lld. If OFF, ELF will be used." OFF) +if (LLD_DEFAULT_LD_LLD_IS_MINGW) + add_definitions("-DLLD_DEFAULT_LD_LLD_IS_MINGW=1") +endif() + if (MSVC) add_definitions(-wd4530) # Suppress 'warning C4530: C++ exception handler used, but unwind semantics are not enabled.' add_definitions(-wd4062) # Suppress 'warning C4062: enumerator X in switch of enum Y is not handled' from system header.
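Since LLD_DEFAULT_LD_LLD_IS_MINGW is surfaced only as a preprocessor define, a sketch of how such a compile-time default is typically consumed may help. This is an illustrative stand-in, not the actual lld driver code: the real flavor dispatch lives in lld's universal driver, and the enum below is hypothetical.

```cpp
// Hypothetical sketch of consuming a compile-time default such as
// -DLLD_DEFAULT_LD_LLD_IS_MINGW=1 when choosing what ld.lld emulates.
#include <cstdio>

enum class Flavor { Gnu, MinGW };

static Flavor defaultLdFlavor() {
#if defined(LLD_DEFAULT_LD_LLD_IS_MINGW)
  return Flavor::MinGW; // ld.lld acts as a MinGW linker by default
#else
  return Flavor::Gnu;   // ld.lld acts as an ELF linker by default
#endif
}

int main() {
  std::printf("ld.lld defaults to: %s\n",
              defaultLdFlavor() == Flavor::MinGW ? "MinGW" : "ELF");
}
```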
diff --git a/lld/COFF/DebugTypes.cpp b/lld/COFF/DebugTypes.cpp index b8c488f26908a..46959334e6676 100644 --- a/lld/COFF/DebugTypes.cpp +++ b/lld/COFF/DebugTypes.cpp @@ -29,6 +29,8 @@ using namespace lld; using namespace lld::coff; namespace { +class TypeServerIpiSource; + // The TypeServerSource class represents a PDB type server, a file referenced by // OBJ files compiled with MSVC /Zi. A single PDB can be shared by several OBJ // files, therefore there must be only once instance per OBJ lot. The file path @@ -49,20 +51,35 @@ class TypeServerSource : public TpiSource { auto it = mappings.emplace(expectedInfo->getGuid(), this); assert(it.second); (void)it; - tsIndexMap.isTypeServerMap = true; } - Expected mergeDebugT(TypeMerger *m, - CVIndexMap *indexMap) override; + Error mergeDebugT(TypeMerger *m) override; bool isDependency() const override { return true; } PDBInputFile *pdbInputFile = nullptr; - CVIndexMap tsIndexMap; + // TpiSource for IPI stream. + TypeServerIpiSource *ipiSrc = nullptr; static std::map mappings; }; +// Companion to TypeServerSource. Stores the index map for the IPI stream in the +// PDB. Modeling PDBs with two sources for TPI and IPI helps establish the +// invariant of one type index space per source. +class TypeServerIpiSource : public TpiSource { +public: + explicit TypeServerIpiSource() : TpiSource(PDBIpi, nullptr) {} + + friend class TypeServerSource; + + // IPI merging is handled in TypeServerSource::mergeDebugT, since it depends + // directly on type merging. + Error mergeDebugT(TypeMerger *m) override { return Error::success(); } + + bool isDependency() const override { return true; } +}; + // This class represents the debug type stream of an OBJ file that depends on a // PDB type server (see TypeServerSource). class UseTypeServerSource : public TpiSource { @@ -70,8 +87,7 @@ class UseTypeServerSource : public TpiSource { UseTypeServerSource(ObjFile *f, TypeServer2Record ts) : TpiSource(UsingPDB, f), typeServerDependency(ts) {} - Expected mergeDebugT(TypeMerger *m, - CVIndexMap *indexMap) override; + Error mergeDebugT(TypeMerger *m) override; // Information about the PDB type server dependency, that needs to be loaded // in before merging this OBJ. @@ -92,15 +108,10 @@ class PrecompSource : public TpiSource { if (!it.second) fatal("a PCH object with the same signature has already been provided (" + toString(it.first->second->file) + " and " + toString(file) + ")"); - precompIndexMap.isPrecompiledTypeMap = true; } - Expected mergeDebugT(TypeMerger *m, - CVIndexMap *indexMap) override; bool isDependency() const override { return true; } - CVIndexMap precompIndexMap; - static std::map mappings; }; @@ -111,8 +122,7 @@ class UsePrecompSource : public TpiSource { UsePrecompSource(ObjFile *f, PrecompRecord precomp) : TpiSource(UsingPCH, f), precompDependency(precomp) {} - Expected mergeDebugT(TypeMerger *m, - CVIndexMap *indexMap) override; + Error mergeDebugT(TypeMerger *m) override; // Information about the Precomp OBJ dependency, that needs to be loaded in // before merging this OBJ. @@ -134,7 +144,11 @@ TpiSource *lld::coff::makeTpiSource(ObjFile *file) { } TpiSource *lld::coff::makeTypeServerSource(PDBInputFile *pdbInputFile) { - return make(pdbInputFile); + // Type server sources come in pairs: the TPI stream, and the IPI stream. 
+ auto *tpiSource = make(pdbInputFile); + if (pdbInputFile->session->getPDBFile().hasPDBIpiStream()) + tpiSource->ipiSrc = make(); + return tpiSource; } TpiSource *lld::coff::makeUseTypeServerSource(ObjFile *file, @@ -196,8 +210,7 @@ getHashesFromDebugH(ArrayRef debugH) { } // Merge .debug$T for a generic object file. -Expected TpiSource::mergeDebugT(TypeMerger *m, - CVIndexMap *indexMap) { +Error TpiSource::mergeDebugT(TypeMerger *m) { CVTypeArray types; BinaryStreamReader reader(file->debugTypes, support::little); cantFail(reader.readArray(types, reader.getLength())); @@ -213,18 +226,22 @@ Expected TpiSource::mergeDebugT(TypeMerger *m, } if (auto err = mergeTypeAndIdRecords(m->globalIDTable, m->globalTypeTable, - indexMap->tpiMap, types, hashes, + indexMapStorage, types, hashes, file->pchSignature)) fatal("codeview::mergeTypeAndIdRecords failed: " + toString(std::move(err))); } else { if (auto err = - mergeTypeAndIdRecords(m->idTable, m->typeTable, indexMap->tpiMap, + mergeTypeAndIdRecords(m->idTable, m->typeTable, indexMapStorage, types, file->pchSignature)) fatal("codeview::mergeTypeAndIdRecords failed: " + toString(std::move(err))); } + // In an object, there is only one mapping for both types and items. + tpiMap = indexMapStorage; + ipiMap = indexMapStorage; + if (config->showSummary) { // Count how many times we saw each type record in our input. This // calculation requires a second pass over the type records to classify each @@ -234,7 +251,7 @@ Expected TpiSource::mergeDebugT(TypeMerger *m, m->ipiCounts.resize(m->getIDTable().size()); uint32_t srcIdx = 0; for (CVType &ty : types) { - TypeIndex dstIdx = indexMap->tpiMap[srcIdx++]; + TypeIndex dstIdx = tpiMap[srcIdx++]; // Type merging may fail, so a complex source type may become the simple // NotTranslated type, which cannot be used as an array index. if (dstIdx.isSimple()) @@ -245,12 +262,11 @@ Expected TpiSource::mergeDebugT(TypeMerger *m, } } - return indexMap; + return Error::success(); } // Merge types from a type server PDB. -Expected TypeServerSource::mergeDebugT(TypeMerger *m, - CVIndexMap *) { +Error TypeServerSource::mergeDebugT(TypeMerger *m) { pdb::PDBFile &pdbFile = pdbInputFile->session->getPDBFile(); Expected expectedTpi = pdbFile.getPDBTpiStream(); if (auto e = expectedTpi.takeError()) @@ -273,30 +289,34 @@ Expected TypeServerSource::mergeDebugT(TypeMerger *m, Optional endPrecomp; // Merge TPI first, because the IPI stream will reference type indices. if (auto err = - mergeTypeRecords(m->globalTypeTable, tsIndexMap.tpiMap, + mergeTypeRecords(m->globalTypeTable, indexMapStorage, expectedTpi->typeArray(), tpiHashes, endPrecomp)) fatal("codeview::mergeTypeRecords failed: " + toString(std::move(err))); + tpiMap = indexMapStorage; // Merge IPI. if (maybeIpi) { auto ipiHashes = GloballyHashedType::hashIds(maybeIpi->typeArray(), tpiHashes); - if (auto err = mergeIdRecords(m->globalIDTable, tsIndexMap.tpiMap, - tsIndexMap.ipiMap, maybeIpi->typeArray(), - ipiHashes)) + if (auto err = + mergeIdRecords(m->globalIDTable, tpiMap, ipiSrc->indexMapStorage, + maybeIpi->typeArray(), ipiHashes)) fatal("codeview::mergeIdRecords failed: " + toString(std::move(err))); + ipiMap = ipiSrc->indexMapStorage; } } else { // Merge TPI first, because the IPI stream will reference type indices. 
- if (auto err = mergeTypeRecords(m->typeTable, tsIndexMap.tpiMap, + if (auto err = mergeTypeRecords(m->typeTable, indexMapStorage, expectedTpi->typeArray())) fatal("codeview::mergeTypeRecords failed: " + toString(std::move(err))); + tpiMap = indexMapStorage; // Merge IPI. if (maybeIpi) { - if (auto err = mergeIdRecords(m->idTable, tsIndexMap.tpiMap, - tsIndexMap.ipiMap, maybeIpi->typeArray())) + if (auto err = mergeIdRecords(m->idTable, tpiMap, ipiSrc->indexMapStorage, + maybeIpi->typeArray())) fatal("codeview::mergeIdRecords failed: " + toString(std::move(err))); + ipiMap = ipiSrc->indexMapStorage; } } @@ -306,19 +326,18 @@ Expected TypeServerSource::mergeDebugT(TypeMerger *m, // map, that means we saw it once in the input. Add it to our histogram. m->tpiCounts.resize(m->getTypeTable().size()); m->ipiCounts.resize(m->getIDTable().size()); - for (TypeIndex ti : tsIndexMap.tpiMap) + for (TypeIndex ti : tpiMap) if (!ti.isSimple()) ++m->tpiCounts[ti.toArrayIndex()]; - for (TypeIndex ti : tsIndexMap.ipiMap) + for (TypeIndex ti : ipiMap) if (!ti.isSimple()) ++m->ipiCounts[ti.toArrayIndex()]; } - return &tsIndexMap; + return Error::success(); } -Expected -UseTypeServerSource::mergeDebugT(TypeMerger *m, CVIndexMap *indexMap) { +Error UseTypeServerSource::mergeDebugT(TypeMerger *m) { const codeview::GUID &tsId = typeServerDependency.getGuid(); StringRef tsPath = typeServerDependency.getName(); @@ -342,7 +361,7 @@ UseTypeServerSource::mergeDebugT(TypeMerger *m, CVIndexMap *indexMap) { pdb::PDBFile &pdbSession = tsSrc->pdbInputFile->session->getPDBFile(); auto expectedInfo = pdbSession.getPDBInfoStream(); if (!expectedInfo) - return &tsSrc->tsIndexMap; + return expectedInfo.takeError(); // Just because a file with a matching name was found and it was an actual // PDB file doesn't mean it matches. For it to match the InfoStream's GUID @@ -352,7 +371,10 @@ UseTypeServerSource::mergeDebugT(TypeMerger *m, CVIndexMap *indexMap) { tsPath, make_error(pdb::pdb_error_code::signature_out_of_date)); - return &tsSrc->tsIndexMap; + // Reuse the type index map of the type server. + tpiMap = tsSrc->tpiMap; + ipiMap = tsSrc->ipiMap; + return Error::success(); } static bool equalsPath(StringRef path1, StringRef path2) { @@ -377,8 +399,8 @@ static PrecompSource *findObjByName(StringRef fileNameOnly) { return nullptr; } -static Expected findPrecompMap(ObjFile *file, - PrecompRecord &pr) { +static Expected findPrecompMap(ObjFile *file, + PrecompRecord &pr) { // Cross-compile warning: given that Clang doesn't generate LF_PRECOMP // records, we assume the OBJ comes from a Windows build of cl.exe. Thusly, // the paths embedded in the OBJs are in the Windows format. @@ -409,63 +431,42 @@ static Expected findPrecompMap(ObjFile *file, toString(precomp->file), make_error(pdb::pdb_error_code::no_matching_pch)); - return &precomp->precompIndexMap; + return precomp; } /// Merges a precompiled headers TPI map into the current TPI map. The /// precompiled headers object will also be loaded and remapped in the /// process. 
-static Expected -mergeInPrecompHeaderObj(ObjFile *file, CVIndexMap *indexMap, +static Error +mergeInPrecompHeaderObj(ObjFile *file, + SmallVectorImpl &indexMapStorage, PrecompRecord &precomp) { auto e = findPrecompMap(file, precomp); if (!e) return e.takeError(); - const CVIndexMap *precompIndexMap = *e; - assert(precompIndexMap->isPrecompiledTypeMap); - - if (precompIndexMap->tpiMap.empty()) - return precompIndexMap; + PrecompSource *precompSrc = *e; + if (precompSrc->tpiMap.empty()) + return Error::success(); assert(precomp.getStartTypeIndex() == TypeIndex::FirstNonSimpleIndex); - assert(precomp.getTypesCount() <= precompIndexMap->tpiMap.size()); + assert(precomp.getTypesCount() <= precompSrc->tpiMap.size()); // Use the previously remapped index map from the precompiled headers. - indexMap->tpiMap.append(precompIndexMap->tpiMap.begin(), - precompIndexMap->tpiMap.begin() + - precomp.getTypesCount()); - return indexMap; + indexMapStorage.append(precompSrc->tpiMap.begin(), + precompSrc->tpiMap.begin() + precomp.getTypesCount()); + return Error::success(); } -Expected -UsePrecompSource::mergeDebugT(TypeMerger *m, CVIndexMap *indexMap) { +Error UsePrecompSource::mergeDebugT(TypeMerger *m) { // This object was compiled with /Yu, so process the corresponding // precompiled headers object (/Yc) first. Some type indices in the current // object are referencing data in the precompiled headers object, so we need // both to be loaded. - auto e = mergeInPrecompHeaderObj(file, indexMap, precompDependency); - if (!e) - return e.takeError(); - - // Drop LF_PRECOMP record from the input stream, as it has been replaced - // with the precompiled headers Type stream in the mergeInPrecompHeaderObj() - // call above. Note that we can't just call Types.drop_front(), as we - // explicitly want to rebase the stream. - CVTypeArray types; - BinaryStreamReader reader(file->debugTypes, support::little); - cantFail(reader.readArray(types, reader.getLength())); - auto firstType = types.begin(); - file->debugTypes = file->debugTypes.drop_front(firstType->RecordData.size()); - - return TpiSource::mergeDebugT(m, indexMap); -} + if (Error e = + mergeInPrecompHeaderObj(file, indexMapStorage, precompDependency)) + return e; -Expected PrecompSource::mergeDebugT(TypeMerger *m, - CVIndexMap *) { - // Note that we're not using the provided CVIndexMap. Instead, we use our - // local one. Precompiled headers objects need to save the index map for - // further reference by other objects which use the precompiled headers. 
- return TpiSource::mergeDebugT(m, &precompIndexMap); + return TpiSource::mergeDebugT(m); } uint32_t TpiSource::countTypeServerPDBs() { diff --git a/lld/COFF/DebugTypes.h b/lld/COFF/DebugTypes.h index 24d79d83e4c6d..f97c0f7617445 100644 --- a/lld/COFF/DebugTypes.h +++ b/lld/COFF/DebugTypes.h @@ -9,6 +9,8 @@ #ifndef LLD_COFF_DEBUGTYPES_H #define LLD_COFF_DEBUGTYPES_H +#include "lld/Common/LLVM.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" @@ -25,14 +27,15 @@ class NativeSession; namespace lld { namespace coff { +using llvm::codeview::TypeIndex; + class ObjFile; class PDBInputFile; -struct CVIndexMap; class TypeMerger; class TpiSource { public: - enum TpiKind { Regular, PCH, UsingPCH, PDB, UsingPDB }; + enum TpiKind { Regular, PCH, UsingPCH, PDB, PDBIpi, UsingPDB }; TpiSource(TpiKind k, ObjFile *f); virtual ~TpiSource(); @@ -48,8 +51,8 @@ class TpiSource { /// If the object does not use a type server PDB (compiled with /Z7), we merge /// all the type and item records from the .debug$S stream and fill in the /// caller-provided ObjectIndexMap. - virtual llvm::Expected mergeDebugT(TypeMerger *m, - CVIndexMap *indexMap); + virtual Error mergeDebugT(TypeMerger *m); + /// Is this a dependent file that needs to be processed first, before other /// OBJs? virtual bool isDependency() const { return false; } @@ -64,6 +67,15 @@ class TpiSource { const TpiKind kind; ObjFile *file; + + // Storage for tpiMap or ipiMap, depending on the kind of source. + llvm::SmallVector indexMapStorage; + + // Source type index to PDB type index mapping for type and item records. + // These mappings will be the same for /Z7 objects, and distinct for /Zi + // objects. + llvm::ArrayRef tpiMap; + llvm::ArrayRef ipiMap; }; TpiSource *makeTpiSource(ObjFile *file); diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index a692dfe95d6d9..aaa00d0f7279a 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -785,8 +785,14 @@ void ObjFile::initializeDependencies() { else data = getDebugSection(".debug$T"); - if (data.empty()) + // Don't make a TpiSource for objects with no debug info. If the object has + // symbols but no types, make a plain, empty TpiSource anyway, because it + // simplifies adding the symbols later. + if (data.empty()) { + if (!debugChunks.empty()) + debugTypesObj = makeTpiSource(this); return; + } // Get the first type record. It will indicate if this object uses a type // server (/Zi) or a PCH file (/Yu). @@ -821,6 +827,8 @@ void ObjFile::initializeDependencies() { PrecompRecord precomp = cantFail( TypeDeserializer::deserializeAs(firstType->data())); debugTypesObj = makeUsePrecompSource(this, precomp); + // Drop the LF_PRECOMP record from the input stream. + debugTypes = debugTypes.drop_front(firstType->RecordData.size()); return; } diff --git a/lld/COFF/PDB.cpp b/lld/COFF/PDB.cpp index 49d04add5be04..bfa7bd8148dfd 100644 --- a/lld/COFF/PDB.cpp +++ b/lld/COFF/PDB.cpp @@ -112,11 +112,11 @@ class PDBLinker { /// externally. 
void addDebug(TpiSource *source); - const CVIndexMap *mergeTypeRecords(TpiSource *source, CVIndexMap *localMap); + bool mergeTypeRecords(TpiSource *source); - void addDebugSymbols(ObjFile *file, const CVIndexMap *indexMap); + void addDebugSymbols(TpiSource *source); - void mergeSymbolRecords(ObjFile *file, const CVIndexMap &indexMap, + void mergeSymbolRecords(TpiSource *source, std::vector &stringTableRefs, BinaryStreamRef symData); @@ -156,7 +156,7 @@ class DebugSHandler { ObjFile &file; /// The result of merging type indices. - const CVIndexMap *indexMap; + TpiSource *source; /// The DEBUG_S_STRINGTABLE subsection. These strings are referred to by /// index from other records in the .debug$S section. All of these strings @@ -188,8 +188,8 @@ class DebugSHandler { void mergeInlineeLines(const DebugSubsectionRecord &inlineeLines); public: - DebugSHandler(PDBLinker &linker, ObjFile &file, const CVIndexMap *indexMap) - : linker(linker), file(file), indexMap(indexMap) {} + DebugSHandler(PDBLinker &linker, ObjFile &file, TpiSource *source) + : linker(linker), file(file), source(source) {} void handleDebugS(ArrayRef relocatedDebugContents); @@ -261,7 +261,7 @@ static bool remapTypeIndex(TypeIndex &ti, ArrayRef typeIndexMap) { static void remapTypesInSymbolRecord(ObjFile *file, SymbolKind symKind, MutableArrayRef recordBytes, - const CVIndexMap &indexMap, + TpiSource *source, ArrayRef typeRefs) { MutableArrayRef contents = recordBytes.drop_front(sizeof(RecordPrefix)); @@ -271,10 +271,9 @@ static void remapTypesInSymbolRecord(ObjFile *file, SymbolKind symKind, fatal("symbol record too short"); // This can be an item index or a type index. Choose the appropriate map. - ArrayRef typeOrItemMap = indexMap.tpiMap; bool isItemIndex = ref.Kind == TiRefKind::IndexRef; - if (isItemIndex && indexMap.isTypeServerMap) - typeOrItemMap = indexMap.ipiMap; + ArrayRef typeOrItemMap = + isItemIndex ? source->ipiMap : source->tpiMap; MutableArrayRef tIs( reinterpret_cast(contents.data() + ref.Offset), ref.Count); @@ -505,9 +504,10 @@ static void addGlobalSymbol(pdb::GSIStreamBuilder &builder, uint16_t modIndex, } } -void PDBLinker::mergeSymbolRecords(ObjFile *file, const CVIndexMap &indexMap, +void PDBLinker::mergeSymbolRecords(TpiSource *source, std::vector &stringTableRefs, BinaryStreamRef symData) { + ObjFile *file = source->file; ArrayRef symsBuffer; cantFail(symData.readBytes(0, symData.getLength(), symsBuffer)); SmallVector scopes; @@ -571,7 +571,7 @@ void PDBLinker::mergeSymbolRecords(ObjFile *file, const CVIndexMap &indexMap, } // Re-map all the type index references. - remapTypesInSymbolRecord(file, sym.kind(), recordBytes, indexMap, + remapTypesInSymbolRecord(file, sym.kind(), recordBytes, source, typeRefs); // An object file may have S_xxx_ID symbols, but these get converted to @@ -665,11 +665,6 @@ void DebugSHandler::handleDebugS(ArrayRef relocatedDebugContents) { BinaryStreamReader reader(relocatedDebugContents, support::little); exitOnErr(reader.readArray(subsections, relocatedDebugContents.size())); - // If there is no index map, use an empty one. - CVIndexMap tempIndexMap; - if (!indexMap) - indexMap = &tempIndexMap; - for (const DebugSubsectionRecord &ss : subsections) { // Ignore subsections with the 'ignore' bit. Some versions of the Visual C++ // runtime have subsections with this bit set. 
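The map-selection rule that remapTypesInSymbolRecord now applies can be captured in a few lines. A simplified sketch, assuming plain integer indices (the real codeview::TypeIndex has a simple-index range and a 0x1000 base that are elided here):

```cpp
#include <cstdint>
#include <vector>

// Mirrors the distinction llvm::codeview::TiRefKind draws: IndexRef marks
// an item (ID) reference; anything else is a type reference.
enum class TiRefKind { TypeRef, IndexRef };

// Stand-ins for TpiSource::tpiMap and TpiSource::ipiMap.
struct IndexMaps {
  std::vector<uint32_t> tpiMap;
  std::vector<uint32_t> ipiMap;
};

// Remap one object-file index to its destination-PDB index.
bool remapIndex(uint32_t &ti, TiRefKind kind, const IndexMaps &maps) {
  const std::vector<uint32_t> &map =
      (kind == TiRefKind::IndexRef) ? maps.ipiMap : maps.tpiMap;
  if (ti >= map.size())
    return false; // corrupt record: index out of range
  ti = map[ti];
  return true;
}

int main() {
  // A /Zi type server keeps distinct maps; for a /Z7 object both alias the
  // same storage, so the selection is harmless there.
  IndexMaps maps{{100, 101}, {200, 201}};
  uint32_t ti = 1;
  remapIndex(ti, TiRefKind::IndexRef, maps); // item ref: uses ipiMap -> 201
  return ti == 201 ? 0 : 1;
}
```

This is exactly the invariant the TPI/IPI source split establishes: every source carries one index map per type index space, and record remapping just picks the right one.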
@@ -709,7 +704,7 @@ void DebugSHandler::handleDebugS(ArrayRef relocatedDebugContents) { break; } case DebugSubsectionKind::Symbols: { - linker.mergeSymbolRecords(&file, *indexMap, stringTableReferences, + linker.mergeSymbolRecords(source, stringTableReferences, ss.getRecordData()); break; } @@ -757,9 +752,7 @@ void DebugSHandler::mergeInlineeLines( // Remap type indices in inlinee line records in place. for (const InlineeSourceLine &line : inlineeLines) { TypeIndex &inlinee = *const_cast(&line.Header->Inlinee); - ArrayRef typeOrItemMap = - indexMap->isTypeServerMap ? indexMap->ipiMap : indexMap->tpiMap; - if (!remapTypeIndex(inlinee, typeOrItemMap)) { + if (!remapTypeIndex(inlinee, source->ipiMap)) { log("bad inlinee line record in " + file.getName() + " with bad inlinee index 0x" + utohexstr(inlinee.getIndex())); } @@ -834,21 +827,18 @@ static void warnUnusable(InputFile *f, Error e) { warn(msg); } -const CVIndexMap *PDBLinker::mergeTypeRecords(TpiSource *source, - CVIndexMap *localMap) { +bool PDBLinker::mergeTypeRecords(TpiSource *source) { ScopedTimer t(typeMergingTimer); // Before we can process symbol substreams from .debug$S, we need to process // type information, file checksums, and the string table. Add type info to // the PDB first, so that we can get the map from object file type and item // indices to PDB type and item indices. - Expected r = source->mergeDebugT(&tMerger, localMap); - - // If the .debug$T sections fail to merge, assume there is no debug info. - if (!r) { - warnUnusable(source->file, r.takeError()); - return nullptr; + if (Error e = source->mergeDebugT(&tMerger)) { + // If the .debug$T sections fail to merge, assume there is no debug info. + warnUnusable(source->file, std::move(e)); + return false; } - return *r; + return true; } // Allocate memory for a .debug$S / .debug$F section and relocate it. @@ -860,12 +850,17 @@ static ArrayRef relocateDebugChunk(SectionChunk &debugChunk) { return makeArrayRef(buffer, debugChunk.getSize()); } -void PDBLinker::addDebugSymbols(ObjFile *file, const CVIndexMap *indexMap) { +void PDBLinker::addDebugSymbols(TpiSource *source) { + // If this TpiSource doesn't have an object file, it must be from a type + // server PDB. Type server PDBs do not contain symbols, so stop here. + if (!source->file) + return; + ScopedTimer t(symbolMergingTimer); pdb::DbiStreamBuilder &dbiBuilder = builder.getDbiBuilder(); - DebugSHandler dsh(*this, *file, indexMap); + DebugSHandler dsh(*this, *source->file, source); // Now do all live .debug$S and .debug$F sections. - for (SectionChunk *debugChunk : file->getDebugChunks()) { + for (SectionChunk *debugChunk : source->file->getDebugChunks()) { if (!debugChunk->live || debugChunk->getSize() == 0) continue; @@ -925,13 +920,9 @@ static void createModuleDBI(pdb::PDBFileBuilder &builder, ObjFile *file) { } void PDBLinker::addDebug(TpiSource *source) { - CVIndexMap localMap; - const CVIndexMap *indexMap = mergeTypeRecords(source, &localMap); - - if (source->kind == TpiSource::PDB) - return; // No symbols in TypeServer PDBs - - addDebugSymbols(source->file, indexMap); + // If type merging failed, ignore the symbols. 
+ if (mergeTypeRecords(source)) + addDebugSymbols(source); } static pdb::BulkPublic createPublic(Defined *def) { @@ -964,15 +955,6 @@ void PDBLinker::addObjectsToPDB() { for_each(ObjFile::instances, [&](ObjFile *obj) { createModuleDBI(builder, obj); }); - // Merge OBJs that do not have debug types - for_each(ObjFile::instances, [&](ObjFile *obj) { - if (obj->debugTypesObj) - return; - // Even if there're no types, still merge non-symbol .Debug$S and .Debug$F - // sections - addDebugSymbols(obj, nullptr); - }); - // Merge dependencies TpiSource::forEachSource([&](TpiSource *source) { if (source->isDependency()) diff --git a/lld/COFF/TypeMerger.h b/lld/COFF/TypeMerger.h index 858f55b6856d0..d3184a7f18d74 100644 --- a/lld/COFF/TypeMerger.h +++ b/lld/COFF/TypeMerger.h @@ -55,15 +55,6 @@ class TypeMerger { SmallVector ipiCounts; }; -/// Map from type index and item index in a type server PDB to the -/// corresponding index in the destination PDB. -struct CVIndexMap { - llvm::SmallVector tpiMap; - llvm::SmallVector ipiMap; - bool isTypeServerMap = false; - bool isPrecompiledTypeMap = false; -}; - } // namespace coff } // namespace lld diff --git a/lld/ELF/AArch64ErrataFix.h b/lld/ELF/AArch64ErrataFix.h index 0548b58751ff9..dfe57b95dd996 100644 --- a/lld/ELF/AArch64ErrataFix.h +++ b/lld/ELF/AArch64ErrataFix.h @@ -18,7 +18,7 @@ namespace elf { class Defined; class InputSection; -struct InputSectionDescription; +class InputSectionDescription; class OutputSection; class Patch843419Section; diff --git a/lld/ELF/ARMErrataFix.h b/lld/ELF/ARMErrataFix.h index 5a39bcc75cd3b..a93609b35bafc 100644 --- a/lld/ELF/ARMErrataFix.h +++ b/lld/ELF/ARMErrataFix.h @@ -19,7 +19,7 @@ namespace elf { class Defined; class InputSection; -struct InputSectionDescription; +class InputSectionDescription; class OutputSection; class Patch657417Section; diff --git a/lld/ELF/Arch/AMDGPU.cpp b/lld/ELF/Arch/AMDGPU.cpp index 3610a38692d6d..4f4ce0094bbfd 100644 --- a/lld/ELF/Arch/AMDGPU.cpp +++ b/lld/ELF/Arch/AMDGPU.cpp @@ -41,7 +41,7 @@ AMDGPU::AMDGPU() { } static uint32_t getEFlags(InputFile *file) { - return cast>(file)->getObj().getHeader()->e_flags; + return cast>(file)->getObj().getHeader().e_flags; } uint32_t AMDGPU::calcEFlags() const { diff --git a/lld/ELF/Arch/Hexagon.cpp b/lld/ELF/Arch/Hexagon.cpp index 7740ce9a71e03..4896c75c44911 100644 --- a/lld/ELF/Arch/Hexagon.cpp +++ b/lld/ELF/Arch/Hexagon.cpp @@ -66,7 +66,7 @@ uint32_t Hexagon::calcEFlags() const { // greatest revision in the list of inputs. uint32_t ret = 0; for (InputFile *f : objectFiles) { - uint32_t eflags = cast>(f)->getObj().getHeader()->e_flags; + uint32_t eflags = cast>(f)->getObj().getHeader().e_flags; if (eflags > ret) ret = eflags; } diff --git a/lld/ELF/Arch/Mips.cpp b/lld/ELF/Arch/Mips.cpp index fd1c5f5077342..d5eaf94625e00 100644 --- a/lld/ELF/Arch/Mips.cpp +++ b/lld/ELF/Arch/Mips.cpp @@ -372,7 +372,7 @@ bool MIPS::needsThunk(RelExpr expr, RelType type, const InputFile *file, if (!f) return false; // If current file has PIC code, LA25 stub is not required. 
- if (f->getObj().getHeader()->e_flags & EF_MIPS_PIC) + if (f->getObj().getHeader().e_flags & EF_MIPS_PIC) return false; auto *d = dyn_cast(&s); // LA25 is required if target file has PIC code @@ -749,7 +749,7 @@ template bool elf::isMipsPIC(const Defined *sym) { if (!file) return false; - return file->getObj().getHeader()->e_flags & EF_MIPS_PIC; + return file->getObj().getHeader().e_flags & EF_MIPS_PIC; } template TargetInfo *elf::getMipsTargetInfo() { diff --git a/lld/ELF/Arch/MipsArchTree.cpp b/lld/ELF/Arch/MipsArchTree.cpp index 85329c3bef536..77c05a818a5d3 100644 --- a/lld/ELF/Arch/MipsArchTree.cpp +++ b/lld/ELF/Arch/MipsArchTree.cpp @@ -297,7 +297,7 @@ static uint32_t getArchFlags(ArrayRef files) { template uint32_t elf::calcMipsEFlags() { std::vector v; for (InputFile *f : objectFiles) - v.push_back({f, cast>(f)->getObj().getHeader()->e_flags}); + v.push_back({f, cast>(f)->getObj().getHeader().e_flags}); if (v.empty()) { // If we don't have any input files, we'll have to rely on the information // we can derive from emulation information, since this at least gets us @@ -363,7 +363,7 @@ uint8_t elf::getMipsFpAbiFlag(uint8_t oldFlag, uint8_t newFlag, template static bool isN32Abi(const InputFile *f) { if (auto *ef = dyn_cast(f)) - return ef->template getObj().getHeader()->e_flags & EF_MIPS_ABI2; + return ef->template getObj().getHeader().e_flags & EF_MIPS_ABI2; return false; } diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp index cfb3ca9df4066..522546331f51f 100644 --- a/lld/ELF/Arch/PPC64.cpp +++ b/lld/ELF/Arch/PPC64.cpp @@ -22,8 +22,8 @@ using namespace llvm::ELF; using namespace lld; using namespace lld::elf; -static uint64_t ppc64TocOffset = 0x8000; -static uint64_t dynamicThreadPointerOffset = 0x8000; +constexpr uint64_t ppc64TocOffset = 0x8000; +constexpr uint64_t dynamicThreadPointerOffset = 0x8000; // The instruction encoding of bits 21-30 from the ISA for the Xform and Dform // instructions that can be used as part of the initial exec TLS sequence. @@ -62,6 +62,8 @@ enum DFormOpcd { ADDI = 14 }; +constexpr uint32_t NOP = 0x60000000; + enum class PPCLegacyInsn : uint32_t { NOINSN = 0, // Loads. @@ -618,8 +620,8 @@ int PPC64::getTlsGdRelaxSkip(RelType type) const { static uint32_t getEFlags(InputFile *file) { if (config->ekind == ELF64BEKind) - return cast>(file)->getObj().getHeader()->e_flags; - return cast>(file)->getObj().getHeader()->e_flags; + return cast>(file)->getObj().getHeader().e_flags; + return cast>(file)->getObj().getHeader().e_flags; } // This file implements v2 ABI. This function makes sure that all @@ -691,7 +693,7 @@ void PPC64::relaxGot(uint8_t *loc, const Relocation &rel, uint64_t val) const { writePrefixedInstruction(loc, pcRelInsn | ((totalDisp & 0x3ffff0000) << 16) | (totalDisp & 0xffff)); - write32(loc + rel.addend, 0x60000000); // nop accessInsn. + write32(loc + rel.addend, NOP); // nop accessInsn. 
break; } default: @@ -718,7 +720,7 @@ void PPC64::relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, switch (rel.type) { case R_PPC64_GOT_TLSGD16_HA: - writeFromHalf16(loc, 0x60000000); // nop + writeFromHalf16(loc, NOP); break; case R_PPC64_GOT_TLSGD16: case R_PPC64_GOT_TLSGD16_LO: @@ -726,7 +728,7 @@ void PPC64::relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, relocateNoSym(loc, R_PPC64_TPREL16_HA, val); break; case R_PPC64_TLSGD: - write32(loc, 0x60000000); // nop + write32(loc, NOP); write32(loc + 4, 0x38630000); // addi r3, r3 // Since we are relocating a half16 type relocation and Loc + 4 points to // the start of an instruction we need to advance the buffer by an extra @@ -758,13 +760,13 @@ void PPC64::relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, switch (rel.type) { case R_PPC64_GOT_TLSLD16_HA: - writeFromHalf16(loc, 0x60000000); // nop + writeFromHalf16(loc, NOP); break; case R_PPC64_GOT_TLSLD16_LO: writeFromHalf16(loc, 0x3c6d0000); // addis r3, r13, 0 break; case R_PPC64_TLSLD: - write32(loc, 0x60000000); // nop + write32(loc, NOP); write32(loc + 4, 0x38631000); // addi r3, r3, 4096 break; case R_PPC64_DTPREL16: @@ -829,7 +831,7 @@ void PPC64::relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, unsigned offset = (config->ekind == ELF64BEKind) ? 2 : 0; switch (rel.type) { case R_PPC64_GOT_TPREL16_HA: - write32(loc - offset, 0x60000000); // nop + write32(loc - offset, NOP); break; case R_PPC64_GOT_TPREL16_LO_DS: case R_PPC64_GOT_TPREL16_DS: { @@ -936,6 +938,7 @@ RelExpr PPC64::getRelExpr(RelType type, const Symbol &s, case R_PPC64_TPREL16_HIGHERA: case R_PPC64_TPREL16_HIGHEST: case R_PPC64_TPREL16_HIGHESTA: + case R_PPC64_TPREL34: return R_TLS; case R_PPC64_DTPREL16: case R_PPC64_DTPREL16_DS: @@ -1128,7 +1131,7 @@ void PPC64::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { case R_PPC64_REL16_HA: case R_PPC64_TPREL16_HA: if (config->tocOptimize && shouldTocOptimize && ha(val) == 0) - writeFromHalf16(loc, 0x60000000); + writeFromHalf16(loc, NOP); else write16(loc, ha(val)); break; @@ -1233,7 +1236,8 @@ void PPC64::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { (val & si1Mask)); break; } - case R_PPC64_GOT_PCREL34: { + case R_PPC64_GOT_PCREL34: + case R_PPC64_TPREL34: { const uint64_t si0Mask = 0x00000003ffff0000; const uint64_t si1Mask = 0x000000000000ffff; const uint64_t fullMask = 0x0003ffff0000ffff; @@ -1353,7 +1357,7 @@ void PPC64::relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, return; } case R_PPC64_TLSGD: - write32(loc, 0x60000000); // bl __tls_get_addr(sym@tlsgd) --> nop + write32(loc, NOP); // bl __tls_get_addr(sym@tlsgd) --> nop write32(loc + 4, 0x7c636A14); // nop --> add r3, r3, r13 return; default: @@ -1424,7 +1428,7 @@ bool PPC64::adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end, uint32_t secondInstr = read32(loc + 8); if (!loImm && getPrimaryOpCode(secondInstr) == 14) { loImm = secondInstr & 0xFFFF; - } else if (secondInstr != 0x60000000) { + } else if (secondInstr != NOP) { return false; } @@ -1438,7 +1442,7 @@ bool PPC64::adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end, }; if (!checkRegOperands(firstInstr, 12, 1)) return false; - if (secondInstr != 0x60000000 && !checkRegOperands(secondInstr, 12, 12)) + if (secondInstr != NOP && !checkRegOperands(secondInstr, 12, 12)) return false; int32_t stackFrameSize = (hiImm * 65536) + loImm; @@ -1457,12 +1461,12 @@ bool PPC64::adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end, if (hiImm) { write32(loc + 4, 0x3D810000 | (uint16_t)hiImm); // 
If the low immediate is zero the second instruction will be a nop. - secondInstr = loImm ? 0x398C0000 | (uint16_t)loImm : 0x60000000; + secondInstr = loImm ? 0x398C0000 | (uint16_t)loImm : NOP; write32(loc + 8, secondInstr); } else { // addi r12, r1, imm write32(loc + 4, (0x39810000) | (uint16_t)loImm); - write32(loc + 8, 0x60000000); + write32(loc + 8, NOP); } return true; diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp index b340fd00deee6..4cbf925dcfa26 100644 --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -104,8 +104,8 @@ RISCV::RISCV() { static uint32_t getEFlags(InputFile *f) { if (config->is64) - return cast>(f)->getObj().getHeader()->e_flags; - return cast>(f)->getObj().getHeader()->e_flags; + return cast>(f)->getObj().getHeader().e_flags; + return cast>(f)->getObj().getHeader().e_flags; } uint32_t RISCV::calcEFlags() const { diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 34f2cd633e425..0f2e80b659879 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -1719,7 +1719,7 @@ static void findKeepUniqueSections(opt::InputArgList &args) { ArrayRef syms = obj->getSymbols(); if (obj->addrsigSec) { ArrayRef contents = - check(obj->getObj().getSectionContents(obj->addrsigSec)); + check(obj->getObj().getSectionContents(*obj->addrsigSec)); const uint8_t *cur = contents.begin(); while (cur != contents.end()) { unsigned size; diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index acdb5c71efb96..bd079b41ac908 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -274,6 +274,16 @@ std::string InputFile::getSrcMsg(const Symbol &sym, InputSectionBase &sec, } } +StringRef InputFile::getNameForScript() const { + if (archiveName.empty()) + return getName(); + + if (nameForScriptCache.empty()) + nameForScriptCache = (archiveName + Twine(':') + getName()).str(); + + return nameForScriptCache; +} + template DWARFCache *ObjFile::getDwarf() { llvm::call_once(initDwarf, [this]() { dwarf = std::make_unique(std::make_unique( @@ -348,9 +358,9 @@ template void ELFFileBase::init() { // Initialize trivial attributes. const ELFFile &obj = getObj(); - emachine = obj.getHeader()->e_machine; - osabi = obj.getHeader()->e_ident[llvm::ELF::EI_OSABI]; - abiVersion = obj.getHeader()->e_ident[llvm::ELF::EI_ABIVERSION]; + emachine = obj.getHeader().e_machine; + osabi = obj.getHeader().e_ident[llvm::ELF::EI_OSABI]; + abiVersion = obj.getHeader().e_ident[llvm::ELF::EI_ABIVERSION]; ArrayRef sections = CHECK(obj.sections(), this); @@ -378,7 +388,7 @@ template void ELFFileBase::init() { template uint32_t ObjFile::getSectionIndex(const Elf_Sym &sym) const { return CHECK( - this->getObj().getSectionIndex(&sym, getELFSyms(), shndxTable), + this->getObj().getSectionIndex(sym, getELFSyms(), shndxTable), this); } @@ -566,7 +576,7 @@ void ObjFile::initializeSections(bool ignoreComdats) { if (sec.sh_type == ELF::SHT_LLVM_CALL_GRAPH_PROFILE) cgProfile = - check(obj.template getSectionContentsAsArray(&sec)); + check(obj.template getSectionContentsAsArray(sec)); // SHF_EXCLUDE'ed sections are discarded by the linker. However, // if -r is given, we'll let the final link discard such sections. 
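The getNameForScript() addition in InputFiles.cpp above is a small memoization; condensed to its essentials it looks like the sketch below (a standalone illustration, not the lld class itself):

```cpp
#include <string>

struct FileName {
  std::string archiveName;                // empty when not an archive member
  std::string name;
  mutable std::string nameForScriptCache; // built lazily, reused afterwards

  // Returns "archive:member" for archive members, else just the name.
  // Linker script matching may query the same file many times, so the
  // concatenation is computed once and cached.
  const std::string &getNameForScript() const {
    if (archiveName.empty())
      return name;
    if (nameForScriptCache.empty())
      nameForScriptCache = archiveName + ':' + name;
    return nameForScriptCache;
  }
};

int main() {
  FileName f{"libfoo.a", "bar.o", {}};
  return f.getNameForScript() == "libfoo.a:bar.o" ? 0 : 1;
}
```

The matchesFile()/excludesFile() caches added to LinkerScript further down build on the same idea: they keep the last (file, result) pair so the glob match is not re-evaluated for every section of the same input file.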
@@ -595,7 +605,7 @@ void ObjFile::initializeSections(bool ignoreComdats) { ArrayRef entries = - CHECK(obj.template getSectionContentsAsArray(&sec), this); + CHECK(obj.template getSectionContentsAsArray(sec), this); if (entries.empty()) fatal(toString(this) + ": empty SHT_GROUP"); @@ -870,7 +880,7 @@ InputSectionBase *ObjFile::createInputSection(const Elf_Shdr &sec) { if (config->emachine == EM_ARM && sec.sh_type == SHT_ARM_ATTRIBUTES) { ARMAttributeParser attributes; - ArrayRef contents = check(this->getObj().getSectionContents(&sec)); + ArrayRef contents = check(this->getObj().getSectionContents(sec)); if (Error e = attributes.parse(contents, config->ekind == ELF32LEKind ? support::little : support::big)) { @@ -894,7 +904,7 @@ InputSectionBase *ObjFile::createInputSection(const Elf_Shdr &sec) { if (config->emachine == EM_RISCV && sec.sh_type == SHT_RISCV_ATTRIBUTES) { RISCVAttributeParser attributes; - ArrayRef contents = check(this->getObj().getSectionContents(&sec)); + ArrayRef contents = check(this->getObj().getSectionContents(sec)); if (Error e = attributes.parse(contents, support::little)) { auto *isec = make(*this, sec, name); warn(toString(isec) + ": " + llvm::toString(std::move(e))); @@ -919,7 +929,7 @@ InputSectionBase *ObjFile::createInputSection(const Elf_Shdr &sec) { if (config->relocatable) break; ArrayRef data = - CHECK(this->getObj().template getSectionContentsAsArray(&sec), this); + CHECK(this->getObj().template getSectionContentsAsArray(sec), this); if (!data.empty() && data.back() != '\0') { error(toString(this) + ": corrupted dependent libraries section (unterminated string): " + @@ -959,12 +969,12 @@ InputSectionBase *ObjFile::createInputSection(const Elf_Shdr &sec) { ": multiple relocation sections to one section are not supported"); if (sec.sh_type == SHT_RELA) { - ArrayRef rels = CHECK(getObj().relas(&sec), this); + ArrayRef rels = CHECK(getObj().relas(sec), this); target->firstRelocation = rels.begin(); target->numRelocations = rels.size(); target->areRelocsRela = true; } else { - ArrayRef rels = CHECK(getObj().rels(&sec), this); + ArrayRef rels = CHECK(getObj().rels(sec), this); target->firstRelocation = rels.begin(); target->numRelocations = rels.size(); target->areRelocsRela = false; @@ -1065,7 +1075,7 @@ InputSectionBase *ObjFile::createInputSection(const Elf_Shdr &sec) { template StringRef ObjFile::getSectionName(const Elf_Shdr &sec) { - return CHECK(getObj().getSectionName(&sec, sectionStringTable), this); + return CHECK(getObj().getSectionName(sec, sectionStringTable), this); } // Initialize this->Symbols. 
this->Symbols is a parallel array as @@ -1279,7 +1289,7 @@ std::vector SharedFile::parseVerneed(const ELFFile &obj, if (!sec) return {}; std::vector verneeds; - ArrayRef data = CHECK(obj.getSectionContents(sec), this); + ArrayRef data = CHECK(obj.getSectionContents(*sec), this); const uint8_t *verneedBuf = data.begin(); for (unsigned i = 0; i != sec->sh_info; ++i) { if (verneedBuf + sizeof(typename ELFT::Verneed) > data.end()) @@ -1355,7 +1365,7 @@ template void SharedFile::parse() { continue; case SHT_DYNAMIC: dynamicTags = - CHECK(obj.template getSectionContentsAsArray(&sec), this); + CHECK(obj.template getSectionContentsAsArray(sec), this); break; case SHT_GNU_versym: versymSec = &sec; @@ -1414,7 +1424,7 @@ template void SharedFile::parse() { std::vector versyms(size, VER_NDX_GLOBAL); if (versymSec) { ArrayRef versym = - CHECK(obj.template getSectionContentsAsArray(versymSec), + CHECK(obj.template getSectionContentsAsArray(*versymSec), this) .slice(firstGlobal); for (size_t i = 0; i < size; ++i) diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h index 7af85e417ca58..b1c83ddf384fb 100644 --- a/lld/ELF/InputFiles.h +++ b/lld/ELF/InputFiles.h @@ -92,6 +92,9 @@ class InputFile { return symbols; } + // Get filename to use for linker script processing. + StringRef getNameForScript() const; + // Filename of .a which contained this file. If this file was // not in an archive file, it is the empty string. We use this // string for creating error messages. @@ -147,6 +150,9 @@ class InputFile { private: const Kind fileKind; + + // Cache for getNameForScript(). + mutable std::string nameForScriptCache; }; class ELFFileBase : public InputFile { diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index ad4a12855ad1d..497fb607f4243 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -53,7 +53,7 @@ static ArrayRef getSectionContents(ObjFile &file, const typename ELFT::Shdr &hdr) { if (hdr.sh_type == SHT_NOBITS) return makeArrayRef(nullptr, hdr.sh_size); - return check(file.getObj().getSectionContents(&hdr)); + return check(file.getObj().getSectionContents(hdr)); } InputSectionBase::InputSectionBase(InputFile *file, uint64_t flags, @@ -456,7 +456,7 @@ void InputSection::copyRelocations(uint8_t *buf, ArrayRef rels) { Elf_Shdr_Impl sec = CHECK(file->getObj().sections(), file)[secIdx]; warn("relocation refers to a discarded section: " + - CHECK(file->getObj().getSectionName(&sec), file) + + CHECK(file->getObj().getSectionName(sec), file) + "\n>>> referenced by " + getObjMsg(p->r_offset)); } p->setSymbolAndType(0, 0, false); diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp index ae77fadcc78d3..30281a1541f1a 100644 --- a/lld/ELF/LTO.cpp +++ b/lld/ELF/LTO.cpp @@ -57,6 +57,19 @@ static std::unique_ptr openFile(StringRef file) { return ret; } +// The merged bitcode after LTO is large. Try opening a file stream that +// supports reading, seeking and writing. Such a file allows BitcodeWriter to +// flush buffered data to reduce memory consumption. If this fails, open a file +// stream that supports only writing.
+static std::unique_ptr openLTOOutputFile(StringRef file) { + std::error_code ec; + std::unique_ptr fs = + std::make_unique(file, ec); + if (!ec) + return fs; + return openFile(file); +} + static std::string getThinLTOOutputFile(StringRef modulePath) { return lto::getThinLTOOutputFile( std::string(modulePath), std::string(config->thinLTOPrefixReplace.first), @@ -151,7 +164,8 @@ static lto::Config createConfig() { if (config->emitLLVM) { c.PostInternalizeModuleHook = [](size_t task, const Module &m) { - if (std::unique_ptr os = openFile(config->outputFile)) + if (std::unique_ptr os = + openLTOOutputFile(config->outputFile)) WriteBitcodeToFile(m, *os, false); return false; }; diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp index 11f0fc9d5fbe2..ba51a8b402fd1 100644 --- a/lld/ELF/LinkerScript.cpp +++ b/lld/ELF/LinkerScript.cpp @@ -320,20 +320,33 @@ void LinkerScript::assignSymbol(SymbolAssignment *cmd, bool inSec) { cmd->sym->type = v.type; } -static std::string getFilename(InputFile *file) { - if (!file) - return ""; - if (file->archiveName.empty()) - return std::string(file->getName()); - return (file->archiveName + ':' + file->getName()).str(); +static inline StringRef getFilename(const InputFile *file) { + return file ? file->getNameForScript() : StringRef(); } -bool LinkerScript::shouldKeep(InputSectionBase *s) { - if (keptSections.empty()) +bool InputSectionDescription::matchesFile(const InputFile *file) const { + if (filePat.isTrivialMatchAll()) + return true; + + if (!matchesFileCache || matchesFileCache->first != file) + matchesFileCache.emplace(file, filePat.match(getFilename(file))); + + return matchesFileCache->second; +} + +bool SectionPattern::excludesFile(const InputFile *file) const { + if (excludedFilePat.empty()) return false; - std::string filename = getFilename(s->file); + + if (!excludesFileCache || excludesFileCache->first != file) + excludesFileCache.emplace(file, excludedFilePat.match(getFilename(file))); + + return excludesFileCache->second; +} + +bool LinkerScript::shouldKeep(InputSectionBase *s) { for (InputSectionDescription *id : keptSections) - if (id->filePat.match(filename)) + if (id->matchesFile(s->file)) for (SectionPattern &p : id->sectionPatterns) if (p.sectionPat.match(s->name) && (s->flags & id->withFlags) == id->withFlags && @@ -433,9 +446,7 @@ LinkerScript::computeInputSections(const InputSectionDescription *cmd, if (!pat.sectionPat.match(sec->name)) continue; - std::string filename = getFilename(sec->file); - if (!cmd->filePat.match(filename) || - pat.excludedFilePat.match(filename) || + if (!cmd->matchesFile(sec->file) || pat.excludesFile(sec->file) || (sec->flags & cmd->withFlags) != cmd->withFlags || (sec->flags & cmd->withoutFlags) != 0) continue; diff --git a/lld/ELF/LinkerScript.h b/lld/ELF/LinkerScript.h index 4a1a5fd71b67f..efa473f45e308 100644 --- a/lld/ELF/LinkerScript.h +++ b/lld/ELF/LinkerScript.h @@ -29,6 +29,7 @@ namespace lld { namespace elf { class Defined; +class InputFile; class InputSection; class InputSectionBase; class OutputSection; @@ -146,19 +147,32 @@ struct MemoryRegion { // This struct represents one section match pattern in SECTIONS() command. // It can optionally have negative match pattern for EXCLUDED_FILE command. // Also it may be surrounded with SORT() command, so contains sorting rules. -struct SectionPattern { +class SectionPattern { + StringMatcher excludedFilePat; + + // Cache of the most recent input argument and result of excludesFile(). 
+ mutable llvm::Optional> excludesFileCache; + +public: SectionPattern(StringMatcher &&pat1, StringMatcher &&pat2) : excludedFilePat(pat1), sectionPat(pat2), sortOuter(SortSectionPolicy::Default), sortInner(SortSectionPolicy::Default) {} - StringMatcher excludedFilePat; + bool excludesFile(const InputFile *file) const; + StringMatcher sectionPat; SortSectionPolicy sortOuter; SortSectionPolicy sortInner; }; -struct InputSectionDescription : BaseCommand { +class InputSectionDescription : public BaseCommand { + SingleStringMatcher filePat; + + // Cache of the most recent input argument and result of matchesFile(). + mutable llvm::Optional> matchesFileCache; + +public: InputSectionDescription(StringRef filePattern, uint64_t withFlags = 0, uint64_t withoutFlags = 0) : BaseCommand(InputSectionKind), filePat(filePattern), @@ -168,7 +182,7 @@ struct InputSectionDescription : BaseCommand { return c->kind == InputSectionKind; } - SingleStringMatcher filePat; + bool matchesFile(const InputFile *file) const; // Input sections that matches at least one of SectionPatterns // will be associated with this InputSectionDescription. diff --git a/lld/ELF/MarkLive.cpp b/lld/ELF/MarkLive.cpp index 28e13e8c1234b..af6c08c215816 100644 --- a/lld/ELF/MarkLive.cpp +++ b/lld/ELF/MarkLive.cpp @@ -152,9 +152,9 @@ void MarkLive::scanEhFrameSection(EhInputSection &eh, // a LSDA. We only need to keep the LSDA alive, so ignore anything that // points to executable sections. uint64_t pieceEnd = piece.inputOff + piece.size; - for (size_t j = firstRelI, end2 = rels.size(); j < end2; ++j) - if (rels[j].r_offset < pieceEnd) - resolveReloc(eh, rels[j], true); + for (size_t j = firstRelI, end2 = rels.size(); + j < end2 && rels[j].r_offset < pieceEnd; ++j) + resolveReloc(eh, rels[j], true); } } diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 3080d53c33295..4c6a70d9034e9 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -113,6 +113,17 @@ void elf::reportRangeError(uint8_t *loc, const Relocation &rel, const Twine &v, ", " + Twine(max).str() + "]" + hint); } +void elf::reportRangeError(uint8_t *loc, int64_t v, int n, const Symbol &sym, + const Twine &msg) { + ErrorPlace errPlace = getErrorPlace(loc); + std::string hint; + if (!sym.getName().empty()) + hint = "; references " + lld::toString(sym) + getDefinedLocation(sym); + errorOrWarn(errPlace.loc + msg + " is out of range: " + Twine(v) + + " is not in [" + Twine(llvm::minIntN(n)) + ", " + + Twine(llvm::maxIntN(n)) + "]" + hint); +} + namespace { // Build a bitmask with one bit set for each RelExpr. 
// @@ -681,7 +692,7 @@ static std::string maybeReportDiscarded(Undefined &sym) { if (sym.type == ELF::STT_SECTION) { msg = "relocation refers to a discarded section: "; msg += CHECK( - file->getObj().getSectionName(&objSections[sym.discardedSecIdx]), file); + file->getObj().getSectionName(objSections[sym.discardedSecIdx]), file); } else { msg = "relocation refers to a symbol in a discarded section: " + toString(sym); diff --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h index 4f48082b8be9d..fccd56880718a 100644 --- a/lld/ELF/Relocations.h +++ b/lld/ELF/Relocations.h @@ -131,7 +131,7 @@ bool hexagonNeedsTLSSymbol(ArrayRef outputSections); class ThunkSection; class Thunk; -struct InputSectionDescription; +class InputSectionDescription; class ThunkCreator { public: diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h index e53ac4d066272..9399ecf526f4f 100644 --- a/lld/ELF/Target.h +++ b/lld/ELF/Target.h @@ -229,6 +229,8 @@ template bool isMipsPIC(const Defined *sym); void reportRangeError(uint8_t *loc, const Relocation &rel, const Twine &v, int64_t min, uint64_t max); +void reportRangeError(uint8_t *loc, int64_t v, int n, const Symbol &sym, + const Twine &msg); // Make sure that V can be represented as an N bit signed integer. inline void checkInt(uint8_t *loc, int64_t v, int n, const Relocation &rel) { diff --git a/lld/ELF/Thunks.cpp b/lld/ELF/Thunks.cpp index 6a8ea4dc0e48f..684ff5154a332 100644 --- a/lld/ELF/Thunks.cpp +++ b/lld/ELF/Thunks.cpp @@ -896,7 +896,7 @@ void PPC64R2SaveStub::writeTo(uint8_t *buf) { int64_t offset = destination.getVA() - (getThunkTargetSym()->getVA() + 4); // The branch offset needs to fit in 26 bits. if (!isInt<26>(offset)) - fatal("R2 save stub branch offset is too large: " + Twine(offset)); + reportRangeError(buf, offset, 26, destination, "R2 save stub offset"); write32(buf + 0, 0xf8410018); // std r2,24(r1) write32(buf + 4, 0x48000000 | (offset & 0x03fffffc)); // b } @@ -910,7 +910,7 @@ void PPC64R2SaveStub::addSymbols(ThunkSection &isec) { void PPC64R12SetupStub::writeTo(uint8_t *buf) { int64_t offset = destination.getVA() - getThunkTargetSym()->getVA(); if (!isInt<34>(offset)) - fatal("offset must fit in 34 bits to encode in the instruction"); + reportRangeError(buf, offset, 34, destination, "R12 setup stub offset"); uint64_t paddi = PADDI_R12_NO_DISP | (((offset >> 16) & 0x3ffff) << 32) | (offset & 0xffff); @@ -927,7 +927,8 @@ void PPC64R12SetupStub::addSymbols(ThunkSection &isec) { void PPC64PCRelPLTStub::writeTo(uint8_t *buf) { int64_t offset = destination.getGotPltVA() - getThunkTargetSym()->getVA(); if (!isInt<34>(offset)) - fatal("offset must fit in 34 bits to encode in the instruction"); + reportRangeError(buf, offset, 34, destination, + "PC-relative PLT stub offset"); uint64_t pld = PLD_R12_NO_DISP | (((offset >> 16) & 0x3ffff) << 32) | (offset & 0xffff); diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index b26817b66e271..f42686f08e640 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -1346,9 +1346,11 @@ static DenseMap buildSectionOrder() { addSym(*sym); for (InputFile *file : objectFiles) - for (Symbol *sym : file->getSymbols()) - if (sym->isLocal()) - addSym(*sym); + for (Symbol *sym : file->getSymbols()) { + if (!sym->isLocal()) + break; + addSym(*sym); + } if (config->warnSymbolOrdering) for (auto orderEntry : symbolOrder) @@ -1699,8 +1701,8 @@ template void Writer::finalizeAddressDependentContent() { bool changed = target->needsThunks && tc.createThunks(outputSections); // With Thunk Size much smaller than branch range we expect 
to - // converge quickly; if we get to 10 something has gone wrong. - if (changed && tc.pass >= 10) { + // converge quickly; if we get to 15 something has gone wrong. + if (changed && tc.pass >= 15) { error("thunk creation not converged"); break; } diff --git a/lld/docs/ELF/warn_backrefs.rst b/lld/docs/ELF/warn_backrefs.rst new file mode 100644 index 0000000000000..d4388f9afbb42 --- /dev/null +++ b/lld/docs/ELF/warn_backrefs.rst @@ -0,0 +1,99 @@ +--warn-backrefs +=============== + +``--warn-backrefs`` gives a warning when an undefined symbol reference is +resolved by a definition in an archive to the left of it on the command line. + +A linker such as GNU ld makes a single pass over the input files from left to +right maintaining the set of undefined symbol references from the files loaded +so far. When encountering an archive, or an object file surrounded by +``--start-lib`` and ``--end-lib``, that archive will be searched to resolve +symbol definitions; this may result in input files being loaded, updating the +set of undefined symbol references. When all resolving definitions have been +loaded from the archive, the linker moves on to the next file and will not return +to it. This means that an input file to the right of an archive cannot have +an undefined symbol resolved by an archive to the left of it. For example: + + ld def.a ref.o + +will result in an ``undefined reference`` error. If there are no cyclic +references, the archives can be ordered in such a way that there are no +backward references. If there are cyclic references then the ``--start-group`` +and ``--end-group`` options can be used, or the same archive can be placed on +the command line twice. + +LLD remembers the symbol table of archives that it has previously seen, so if +there is a reference from an input file to the right of an archive, LLD will +still search that archive to resolve any undefined references. This means +that an archive only needs to be included once on the command line and the +``--start-group`` and ``--end-group`` options are redundant. + +A consequence of the differing archive searching semantics is that the same +linker command line can result in different outcomes. A link that succeeds with +LLD may fail with GNU ld, or, even worse, both links may succeed but +select different objects from different archives that both define the same +symbols. + +The ``--warn-backrefs`` option provides information that helps identify cases +where LLD and GNU ld archive selection may differ. + + % ld.lld --warn-backrefs ... -lB -lA + ld.lld: warning: backward reference detected: system in A.a(a.o) refers to B.a(b.o) + + % ld.lld --warn-backrefs ... --start-lib B/b.o --end-lib --start-lib A/a.o --end-lib + ld.lld: warning: backward reference detected: system in A/a.o refers to B/b.o + + # To suppress the warning, you can specify --warn-backrefs-exclude= to match B/b.o or B.a(b.o) + +The ``--warn-backrefs`` option can also provide a check to enforce a +topological order of archives, which can be useful to detect layering +violations (albeit unable to catch all cases). There are two cases where GNU ld +will result in an ``undefined reference`` error: + +* If adding the dependency does not form a cycle: conceptually ``A`` is a higher + level library while ``B`` is at a lower level.
When you are developing an + application ``P`` which depends on ``A``, but does not directly depend on + ``B``, your link may fail surprisingly with ``undefined symbol: + symbol_defined_in_B`` if the used/linked part of ``A`` happens to need some + components of ``B``. It is inappropriate for ``P`` to add a dependency on + ``B`` since ``P`` does not use ``B`` directly. +* If adding the dependency forms a cycle, e.g. ``B->C->A ~> B``. ``A`` + is supposed to be at the lowest level while ``B`` is supposed to be at the + highest level. When you are developing ``C_test`` testing ``C``, your link may + fail surprisingly with ``undefined symbol`` if there is somehow a dependency on + some components of ``B``. You could fix the issue by adding the missing + dependency (``B``); however, then every test (``A_test``, ``B_test``, + ``C_test``) will link against every library. This defeats the purpose + of splitting ``B``, ``C`` and ``A`` into separate libraries and makes binaries + unnecessarily large. Moreover, the layering violation makes lower-level + libraries (e.g. ``A``) vulnerable to changes to higher-level libraries (e.g. + ``B``, ``C``). + +Resolution: + +* Add a dependency from ``A`` to ``B``. +* The reference may be unintended and can be removed. +* The dependency may be intentionally omitted because there are multiple + libraries like ``B``. Consider linking ``B`` with object semantics by + surrounding it with ``--whole-archive`` and ``--no-whole-archive``. +* In the case of a circular dependency, sometimes merging the libraries is the best option. + +There are two cases, like a library sandwich, where GNU ld will select a +different object. + +* ``A.a B A2.so``: ``A.a`` may be used as an interceptor (e.g. it provides some + optimized libc functions and ``A2`` is libc). ``B`` does not need to know + about ``A.a``, and ``A.a`` may be pulled into the link by another part of the + program. For linker portability, consider ``--whole-archive`` and + ``--no-whole-archive``. + +* ``A.a B A2.a``: similar to the above case but ``--warn-backrefs`` does not + flag the problem, because ``A2.a`` may be a duplicate of ``A.a``, which is + redundant but benign. In some cases ``A.a`` and ``B`` should be surrounded by + a pair of ``--start-group`` and ``--end-group``. This is especially common + among system libraries (e.g. ``-lc __isnanl references -lm``, ``-lc + _IO_funlockfile references -lpthread``, ``-lc __gcc_personality_v0 references + -lgcc_eh``, and ``-lpthread _Unwind_GetCFA references -lunwind``). + + In C++, this is likely an ODR violation. We probably need a dedicated option + for ODR detection. diff --git a/lld/docs/index.rst b/lld/docs/index.rst index b820d57e3d354..900ad8219fe07 100644 --- a/lld/docs/index.rst +++ b/lld/docs/index.rst @@ -177,3 +177,4 @@ document soon. Partitions ReleaseNotes ELF/linker_script + ELF/warn_backrefs diff --git a/lld/include/lld/Common/Strings.h b/lld/include/lld/Common/Strings.h index 3940d2443cd45..38d93e01c0b95 100644 --- a/lld/include/lld/Common/Strings.h +++ b/lld/include/lld/Common/Strings.h @@ -39,6 +39,11 @@ class SingleStringMatcher { // Match s against this pattern, exactly if ExactMatch is true. bool match(llvm::StringRef s) const; + // Returns true for pattern "*" which will match all inputs. + bool isTrivialMatchAll() const { + return !ExactMatch && GlobPatternMatcher.isTrivialMatchAll(); + } + private: // Whether to do an exact match regardless of the presence of a wildcard // character.
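The new isTrivialMatchAll() fast path above works together with the one-entry caches added to InputSectionDescription::matchesFile() and SectionPattern::excludesFile() earlier in this patch. A minimal sketch of that caching pattern, using a hypothetical FileMatcher type rather than lld's actual classes (assumes C++17 for std::optional):

    #include <optional>
    #include <string>
    #include <utility>

    struct FileMatcher {
      std::string pattern; // e.g. "*" or "foo*.o"

      // Stand-in for a real glob match; here "*" matches anything.
      bool matchOnce(const std::string &name) const {
        return pattern == "*" || pattern == name;
      }

      // Linker scripts probe the same input file for many sections in a row,
      // so remember the last (file, result) pair, keyed by pointer identity.
      bool match(const std::string *file) const {
        if (pattern == "*") // mirrors isTrivialMatchAll(): skip globbing
          return true;
        if (!cache || cache->first != file)
          cache.emplace(file, matchOnce(file ? *file : std::string()));
        return cache->second;
      }

      // Cache of the most recent input argument and result.
      mutable std::optional<std::pair<const std::string *, bool>> cache;
    };

Keying the cache on the file pointer rather than on the computed filename is what lets shouldKeep() and computeInputSections() skip both the name construction and the glob match on the hot path.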
@@ -69,7 +74,7 @@ class StringMatcher { // Add a new pattern to the existing ones to match against. void addPattern(SingleStringMatcher Matcher) { patterns.push_back(Matcher); } - bool empty() { return patterns.empty(); } + bool empty() const { return patterns.empty(); } // Match s against the patterns. bool match(llvm::StringRef s) const; diff --git a/lld/test/CMakeLists.txt b/lld/test/CMakeLists.txt index 52e6118ba876b..ff957e8912114 100644 --- a/lld/test/CMakeLists.txt +++ b/lld/test/CMakeLists.txt @@ -6,7 +6,7 @@ set(LLVM_LIBS_DIR "${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/%(build_config)s" llvm_canonicalize_cmake_booleans( LLVM_ENABLE_ZLIB - LLVM_LIBXML2_ENABLED + LLVM_ENABLE_LIBXML2 ) configure_lit_site_cfg( diff --git a/lld/test/ELF/map-file.s b/lld/test/ELF/map-file.s index 1cd3b9087cbea..55b6b9e672812 100644 --- a/lld/test/ELF/map-file.s +++ b/lld/test/ELF/map-file.s @@ -11,7 +11,7 @@ # RUN: ld.lld %t1.o %t2.o %t3.o %t4.a %t5.so -o %t -M | FileCheck --match-full-lines --strict-whitespace %s # RUN: ld.lld %t1.o %t2.o %t3.o %t4.a %t5.so -o %t --print-map | FileCheck --match-full-lines -strict-whitespace %s # RUN: ld.lld %t1.o %t2.o %t3.o %t4.a %t5.so -o %t -Map=%t.map -# RUN: FileCheck -strict-whitespace %s < %t.map +# RUN: FileCheck -match-full-lines -strict-whitespace %s < %t.map .global _start _start: diff --git a/lld/test/ELF/ppc64-tls-pcrel-le.s b/lld/test/ELF/ppc64-tls-pcrel-le.s new file mode 100644 index 0000000000000..bff7d075eda49 --- /dev/null +++ b/lld/test/ELF/ppc64-tls-pcrel-le.s @@ -0,0 +1,56 @@ +# REQUIRES: ppc +# RUN: llvm-mc -filetype=obj -triple=powerpc64le %s -o %t.o +# RUN: ld.lld %t.o -o %t +# RUN: llvm-readelf -s %t | FileCheck %s --check-prefix=SYMBOL +# RUN: llvm-objdump -d --no-show-raw-insn --mcpu=pwr10 %t | FileCheck %s + +# RUN: llvm-mc -filetype=obj -triple=powerpc64 %s -o %t.o +# RUN: ld.lld %t.o -o %t +# RUN: llvm-readelf -s %t | FileCheck %s --check-prefix=SYMBOL +# RUN: llvm-objdump -d --no-show-raw-insn --mcpu=pwr10 %t | FileCheck %s + +## This test checks the LLD implementation of the Local Exec TLS model +## when using prefixed instructions like paddi. 
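The TPREL operands in the SYMBOL/CHECK lines below can be derived by hand: on PPC64, a Local Exec reference resolves to the symbol's offset within the TLS block minus a 0x7000 bias (the thread pointer in r13 points 0x7000 past the start of the block). A worked check, where the tprel helper is purely illustrative and not lld code:

    #include <cstdint>

    // Offset of the symbol inside .tbss, plus any addend, minus the 0x7000 bias.
    constexpr int64_t tprel(int64_t offsetInBlock, int64_t addend = 0) {
      return offsetInBlock + addend - 0x7000;
    }

    static_assert(tprel(0) == -28672, "x@TPREL: x sits at offset 0");
    static_assert(tprel(4) == -28668, "y@TPREL: y sits at offset 4");
    static_assert(tprel(8, 12) == -28652, "z@TPREL+12: z sits at offset 8");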
+ +# SYMBOL: Symbol table '.symtab' contains 6 entries: +# SYMBOL: 3: 0000000000000000 0 TLS LOCAL DEFAULT 2 x +# SYMBOL-NEXT: 4: 0000000000000004 0 TLS LOCAL DEFAULT 2 y +# SYMBOL-NEXT: 5: 0000000000000008 0 TLS LOCAL DEFAULT 2 z + +# CHECK-LABEL: : +# CHECK: paddi 3, 13, -28672, 0 +# CHECK-NEXT: paddi 3, 13, -28668, 0 +# CHECK-NEXT: paddi 3, 13, -28652, 0 +# CHECK-NEXT: blr + +# CHECK-LABEL: : +# CHECK: paddi 3, 13, -28672, 0 +# CHECK-NEXT: lwz 3, 0(3) +# CHECK-NEXT: paddi 3, 13, -28668, 0 +# CHECK-NEXT: lwz 3, 0(3) +# CHECK-NEXT: paddi 3, 13, -28652, 0 +# CHECK-NEXT: lwz 3, 0(3) +# CHECK-NEXT: blr + +LocalExecAddr: + paddi 3, 13, x@TPREL, 0 + paddi 3, 13, y@TPREL, 0 + paddi 3, 13, z@TPREL+12, 0 + blr + +LocalExecVal: + paddi 3, 13, x@TPREL, 0 + lwz 3, 0(3) + paddi 3, 13, y@TPREL, 0 + lwz 3, 0(3) + paddi 3, 13, z@TPREL+12, 0 + lwz 3, 0(3) + blr + +.section .tbss, "awT", @nobits +x: + .long 0 +y: + .long 0 +z: + .space 20 diff --git a/lld/test/ELF/ppc64-toc-call-to-pcrel-long-jump.s b/lld/test/ELF/ppc64-toc-call-to-pcrel-long-jump.s index a6e99db8c5c0b..4175ba3131082 100644 --- a/lld/test/ELF/ppc64-toc-call-to-pcrel-long-jump.s +++ b/lld/test/ELF/ppc64-toc-call-to-pcrel-long-jump.s @@ -10,7 +10,10 @@ # RUN: llvm-mc -filetype=obj -triple=powerpc64 %s -o %t.o # RUN: not ld.lld -T %t.script %t.o -o /dev/null 2>&1 | FileCheck %s -# CHECK: error: R2 save stub branch offset is too large: -268501028 +# CHECK: error: R2 save stub offset is out of range: -268501028 is not in [-33554432, 33554431]; references callee +# CHECK-NEXT: >>> defined in {{.*}}.o + +# RUN: ld.lld -T %t.script %t.o -o /dev/null --noinhibit-exec .section .text_callee, "ax", %progbits callee: diff --git a/lld/test/lit.cfg.py b/lld/test/lit.cfg.py index 267f8c5178584..090a7c21fa782 100644 --- a/lld/test/lit.cfg.py +++ b/lld/test/lit.cfg.py @@ -87,11 +87,11 @@ # Indirectly check if the mt.exe Microsoft utility exists by searching for # cvtres, which always accompanies it. Alternatively, check if we can use # libxml2 to merge manifests. -if (lit.util.which('cvtres', config.environment['PATH']) or - config.llvm_libxml2_enabled): +if (lit.util.which('cvtres', config.environment['PATH']) or + config.have_libxml2): config.available_features.add('manifest_tool') -if config.llvm_libxml2_enabled: +if config.have_libxml2: config.available_features.add('libxml2') if config.have_dia_sdk: diff --git a/lld/test/lit.site.cfg.py.in b/lld/test/lit.site.cfg.py.in index 3d4c51f4ab647..bbc2c892eb715 100644 --- a/lld/test/lit.site.cfg.py.in +++ b/lld/test/lit.site.cfg.py.in @@ -7,7 +7,6 @@ config.llvm_src_root = "@LLVM_SOURCE_DIR@" config.llvm_obj_root = "@LLVM_BINARY_DIR@" config.llvm_tools_dir = "@LLVM_TOOLS_DIR@" config.llvm_libs_dir = "@LLVM_LIBS_DIR@" -config.llvm_libxml2_enabled = @LLVM_LIBXML2_ENABLED@ config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@" config.lld_obj_root = "@LLD_BINARY_DIR@" config.lld_libs_dir = "@LLVM_LIBRARY_OUTPUT_INTDIR@" @@ -15,6 +14,7 @@ config.lld_tools_dir = "@LLVM_RUNTIME_OUTPUT_INTDIR@" config.target_triple = "@TARGET_TRIPLE@" config.python_executable = "@Python3_EXECUTABLE@" config.have_zlib = @LLVM_ENABLE_ZLIB@ +config.have_libxml2 = @LLVM_ENABLE_LIBXML2@ config.sizeof_void_p = @CMAKE_SIZEOF_VOID_P@ # Support substitution of the tools and libs dirs with user parameters. 
This is diff --git a/lld/test/wasm/Inputs/undefined-globals.s b/lld/test/wasm/Inputs/undefined-globals.s new file mode 100644 index 0000000000000..54dc4189a7770 --- /dev/null +++ b/lld/test/wasm/Inputs/undefined-globals.s @@ -0,0 +1,11 @@ +.globl use_undef_global +.globl unused_undef_global +.globl used_undef_global + +use_undef_global: + .functype use_undef_global () -> (i64) + global.get used_undef_global + end_function + +.globaltype unused_undef_global, i64, immutable +.globaltype used_undef_global, i64, immutable diff --git a/lld/test/wasm/Inputs/undefined-globals.yaml b/lld/test/wasm/Inputs/undefined-globals.yaml deleted file mode 100644 index 41bc64356400b..0000000000000 --- a/lld/test/wasm/Inputs/undefined-globals.yaml +++ /dev/null @@ -1,53 +0,0 @@ ---- !WASM -FileHeader: - Version: 0x00000001 -Sections: - - Type: TYPE - Signatures: - - Index: 0 - ParamTypes: - ReturnTypes: - - I64 - - Type: IMPORT - Imports: - - Module: env - Field: unused_undef_global - Kind: GLOBAL - GlobalType: I64 - GlobalMutable: true - - Module: env - Field: used_undef_global - Kind: GLOBAL - GlobalType: I64 - GlobalMutable: true - - Type: FUNCTION - FunctionTypes: [ 0 ] - - Type: CODE - Functions: - - Index: 0 - Locals: - Body: 2381808080000B - Relocations: - - Type: R_WASM_GLOBAL_INDEX_LEB - Index: 1 - Offset: 0x00000004 - - Type: CUSTOM - Name: linking - Version: 2 - SymbolTable: - - Index: 0 - Kind: GLOBAL - Name: unused_undef_global - Flags: [ VISIBILITY_HIDDEN, UNDEFINED ] - Global: 0 - - Index: 1 - Kind: GLOBAL - Name: used_undef_global - Flags: [ VISIBILITY_HIDDEN, UNDEFINED ] - Global: 1 - - Index: 2 - Kind: FUNCTION - Name: use_undef_global - Flags: [ VISIBILITY_HIDDEN ] - Function: 0 -... diff --git a/lld/test/wasm/early-exit-for-bad-paths.s b/lld/test/wasm/early-exit-for-bad-paths.s index 2866bfa62f865..21cec318e4490 100644 --- a/lld/test/wasm/early-exit-for-bad-paths.s +++ b/lld/test/wasm/early-exit-for-bad-paths.s @@ -4,10 +4,16 @@ # RUN: FileCheck %s -check-prefixes=NO-DIR-OUTPUT,CHECK # RUN: not wasm-ld %t.o -o %s/dir_is_a_file 2>&1 | \ # RUN: FileCheck %s -check-prefixes=DIR-IS-OUTPUT,CHECK -# TODO(sbc): check similar check for -Map file once we add that option + +# RUN: not wasm-ld %t.o -o %t -Map=does_not_exist/output 2>&1 | \ +# RUN: FileCheck %s -check-prefixes=NO-DIR-MAP,CHECK +# RUN: not wasm-ld %t.o -o %t -Map=%s/dir_is_a_file 2>&1 | \ +# RUN: FileCheck %s -check-prefixes=DIR-IS-MAP,CHECK # NO-DIR-OUTPUT: error: cannot open output file does_not_exist/output: # DIR-IS-OUTPUT: error: cannot open output file {{.*}}/dir_is_a_file: +# NO-DIR-MAP: error: cannot open map file does_not_exist/output: +# DIR-IS-MAP: error: cannot open map file {{.*}}/dir_is_a_file: # We should exit before doing the actual link. If an undefined symbol error is # discovered we haven't bailed out early as expected. 
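The driver change this test exercises appears further down in Driver.cpp: both -o and -Map are probed with tryCreateFile() before any real link work starts, so a bad path fails fast. A rough stand-in for that probe, hypothetical and simplified (lld's real helper lives in the driver and reports a std::error_code):

    #include <cstdio>
    #include <string>

    // Returns an error message if `path` cannot be created, else "".
    static std::string tryCreateFile(const std::string &path) {
      if (path.empty())
        return ""; // nothing to probe (e.g. no -Map given)
      if (FILE *f = std::fopen(path.c_str(), "ab")) {
        std::fclose(f); // probe succeeded
        return "";
      }
      return "cannot open " + path;
    }

    // Usage mirroring the driver: validate every output path up front,
    // before symbol resolution, and bail out if any probe fails.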
diff --git a/lld/test/wasm/emit-relocs-fpic.s b/lld/test/wasm/emit-relocs-fpic.s index c70e1e6751098..1d81ca62786be 100644 --- a/lld/test/wasm/emit-relocs-fpic.s +++ b/lld/test/wasm/emit-relocs-fpic.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=obj -o %t.o < %s +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %s -o %t.o # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o -# RUN: wasm-ld -pie --export-all --no-gc-sections --no-entry --emit-relocs -o %t.wasm %t.o %t.ret32.o +# RUN: wasm-ld -pie --export-all --no-check-features --no-gc-sections --no-entry --emit-relocs -o %t.wasm %t.o %t.ret32.o # RUN: obj2yaml %t.wasm | FileCheck %s load_hidden_data: diff --git a/lld/test/wasm/export-all.s b/lld/test/wasm/export-all.s new file mode 100644 index 0000000000000..5f013813cdf17 --- /dev/null +++ b/lld/test/wasm/export-all.s @@ -0,0 +1,48 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s +# RUN: wasm-ld --export-all -o %t.wasm %t.o +# RUN: obj2yaml %t.wasm | FileCheck %s + +.globl _start + +_start: + .functype _start () -> () + i32.const 3 + global.set __stack_pointer + end_function + +foo: + .functype foo () -> (i32) + i32.const 42 + end_function + +.globaltype __stack_pointer, i32 + +# CHECK: - Type: EXPORT +# CHECK-NEXT: Exports: +# CHECK-NEXT: - Name: memory +# CHECK-NEXT: Kind: MEMORY +# CHECK-NEXT: Index: 0 +# CHECK-NEXT: - Name: __wasm_call_ctors +# CHECK-NEXT: Kind: FUNCTION +# CHECK-NEXT: Index: 0 +# CHECK-NEXT: - Name: _start +# CHECK-NEXT: Kind: FUNCTION +# CHECK-NEXT: Index: 1 +# CHECK-NEXT: - Name: __dso_handle +# CHECK-NEXT: Kind: GLOBAL +# CHECK-NEXT: Index: 1 +# CHECK-NEXT: - Name: __data_end +# CHECK-NEXT: Kind: GLOBAL +# CHECK-NEXT: Index: 2 +# CHECK-NEXT: - Name: __global_base +# CHECK-NEXT: Kind: GLOBAL +# CHECK-NEXT: Index: 3 +# CHECK-NEXT: - Name: __heap_base +# CHECK-NEXT: Kind: GLOBAL +# CHECK-NEXT: Index: 4 +# CHECK-NEXT: - Name: __memory_base +# CHECK-NEXT: Kind: GLOBAL +# CHECK-NEXT: Index: 5 +# CHECK-NEXT: - Name: __table_base +# CHECK-NEXT: Kind: GLOBAL +# CHECK-NEXT: Index: 6 diff --git a/lld/test/wasm/gc-imports.ll b/lld/test/wasm/gc-imports.ll deleted file mode 100644 index 68d403765916b..0000000000000 --- a/lld/test/wasm/gc-imports.ll +++ /dev/null @@ -1,91 +0,0 @@ -; RUN: llc -filetype=obj %s -o %t.o -; RUN: yaml2obj %S/Inputs/undefined-globals.yaml -o %t_globals.o -; RUN: wasm-ld --allow-undefined -o %t1.wasm %t.o %t_globals.o - -target triple = "wasm32-unknown-unknown" - -declare i64 @unused_undef_function(i64 %arg) - -declare i32 @used_undef_function() - -declare i64 @use_undef_global() - -define hidden void @foo() { -entry: - call i64 @unused_undef_function(i64 0) - ret void -} - -define hidden void @_start() { -entry: - call i32 @used_undef_function() - call i64 @use_undef_global() - ret void -} - -; RUN: obj2yaml %t1.wasm | FileCheck %s - -; CHECK: - Type: IMPORT -; CHECK-NEXT: Imports: -; CHECK-NEXT: - Module: env -; CHECK-NEXT: Field: used_undef_function -; CHECK-NEXT: Kind: FUNCTION -; CHECK-NEXT: SigIndex: 0 -; CHECK-NEXT: - Module: env -; CHECK-NEXT: Field: used_undef_global -; CHECK-NEXT: Kind: GLOBAL -; CHECK-NEXT: GlobalType: I64 -; CHECK-NEXT: GlobalMutable: true -; CHECK-NEXT: - Type: -; CHECK: - Type: CUSTOM -; CHECK-NEXT: Name: name -; CHECK-NEXT: FunctionNames: -; CHECK-NEXT: - Index: 0 -; CHECK-NEXT: Name: used_undef_function -; CHECK-NEXT: - Index: 1 -; CHECK-NEXT: Name: _start -; CHECK-NEXT: - Index: 2 -; CHECK-NEXT: Name: 
use_undef_global -; CHECK-NEXT: ... - -; RUN: wasm-ld --no-gc-sections --allow-undefined \ -; RUN: -o %t1.no-gc.wasm %t.o %t_globals.o -; RUN: obj2yaml %t1.no-gc.wasm | FileCheck %s -check-prefix=NO-GC - -; NO-GC: - Type: IMPORT -; NO-GC-NEXT: Imports: -; NO-GC-NEXT: - Module: env -; NO-GC-NEXT: Field: unused_undef_function -; NO-GC-NEXT: Kind: FUNCTION -; NO-GC-NEXT: SigIndex: 0 -; NO-GC-NEXT: - Module: env -; NO-GC-NEXT: Field: used_undef_function -; NO-GC-NEXT: Kind: FUNCTION -; NO-GC-NEXT: SigIndex: 1 -; NO-GC-NEXT: - Module: env -; NO-GC-NEXT: Field: unused_undef_global -; NO-GC-NEXT: Kind: GLOBAL -; NO-GC-NEXT: GlobalType: I64 -; NO-GC-NEXT: GlobalMutable: true -; NO-GC-NEXT: - Module: env -; NO-GC-NEXT: Field: used_undef_global -; NO-GC-NEXT: Kind: GLOBAL -; NO-GC-NEXT: GlobalType: I64 -; NO-GC-NEXT: GlobalMutable: true -; NO-GC-NEXT: - Type: -; NO-GC: - Type: CUSTOM -; NO-GC-NEXT: Name: name -; NO-GC-NEXT: FunctionNames: -; NO-GC-NEXT: - Index: 0 -; NO-GC-NEXT: Name: unused_undef_function -; NO-GC-NEXT: - Index: 1 -; NO-GC-NEXT: Name: used_undef_function -; NO-GC-NEXT: - Index: 2 -; NO-GC-NEXT: Name: __wasm_call_ctors -; NO-GC-NEXT: - Index: 3 -; NO-GC-NEXT: Name: foo -; NO-GC-NEXT: - Index: 4 -; NO-GC-NEXT: Name: _start -; NO-GC-NEXT: - Index: 5 -; NO-GC-NEXT: Name: use_undef_global -; NO-GC-NEXT: ... diff --git a/lld/test/wasm/gc-imports.s b/lld/test/wasm/gc-imports.s new file mode 100644 index 0000000000000..1f8bca9064e09 --- /dev/null +++ b/lld/test/wasm/gc-imports.s @@ -0,0 +1,87 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %s -o %t.o +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %S/Inputs/undefined-globals.s -o %t_globals.o +# RUN: wasm-ld --allow-undefined -o %t1.wasm %t.o %t_globals.o + +.functype unused_undef_function (i64) -> (i64) +.functype used_undef_function () -> (i32) +.functype use_undef_global () -> (i64) + +foo: + .functype foo () -> () + call unused_undef_function + end_function + +.globl _start + +_start: + .functype _start () -> () + call used_undef_function + call use_undef_global + end_function + +# RUN: obj2yaml %t1.wasm | FileCheck %s + +# CHECK: - Type: IMPORT +# CHECK-NEXT: Imports: +# CHECK-NEXT: - Module: env +# CHECK-NEXT: Field: used_undef_function +# CHECK-NEXT: Kind: FUNCTION +# CHECK-NEXT: SigIndex: 0 +# CHECK-NEXT: - Module: env +# CHECK-NEXT: Field: used_undef_global +# CHECK-NEXT: Kind: GLOBAL +# CHECK-NEXT: GlobalType: I64 +# CHECK-NEXT: GlobalMutable: false +# CHECK-NEXT: - Type: +# CHECK: - Type: CUSTOM +# CHECK-NEXT: Name: name +# CHECK-NEXT: FunctionNames: +# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: Name: used_undef_function +# CHECK-NEXT: - Index: 1 +# CHECK-NEXT: Name: _start +# CHECK-NEXT: - Index: 2 +# CHECK-NEXT: Name: use_undef_global +# CHECK-NEXT: ... 
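What the CHECK block above and the NO-GC block that follows encode: after GC, only symbols transitively reachable from _start stay live, and only live undefined symbols are emitted as imports; --no-gc-sections keeps everything. A toy reachability sketch with a hypothetical Sym type, not lld's symbol classes:

    #include <set>
    #include <string>
    #include <vector>

    struct Sym {
      std::string name;
      bool undefined = false;
      std::vector<Sym *> refs; // symbols this one's body references
    };

    static void markLive(Sym *s, std::set<Sym *> &live) {
      if (!live.insert(s).second)
        return; // already visited
      for (Sym *r : s->refs)
        markLive(r, live);
    }

    // Imports = every symbol that is both undefined and live; with GC
    // disabled, every undefined symbol is imported regardless of liveness.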
+ +# RUN: wasm-ld --no-gc-sections --allow-undefined \ +# RUN: -o %t1.no-gc.wasm %t.o %t_globals.o +# RUN: obj2yaml %t1.no-gc.wasm | FileCheck %s -check-prefix=NO-GC + +# NO-GC: - Type: IMPORT +# NO-GC-NEXT: Imports: +# NO-GC-NEXT: - Module: env +# NO-GC-NEXT: Field: unused_undef_function +# NO-GC-NEXT: Kind: FUNCTION +# NO-GC-NEXT: SigIndex: 0 +# NO-GC-NEXT: - Module: env +# NO-GC-NEXT: Field: used_undef_function +# NO-GC-NEXT: Kind: FUNCTION +# NO-GC-NEXT: SigIndex: 1 +# NO-GC-NEXT: - Module: env +# NO-GC-NEXT: Field: unused_undef_global +# NO-GC-NEXT: Kind: GLOBAL +# NO-GC-NEXT: GlobalType: I64 +# NO-GC-NEXT: GlobalMutable: false +# NO-GC-NEXT: - Module: env +# NO-GC-NEXT: Field: used_undef_global +# NO-GC-NEXT: Kind: GLOBAL +# NO-GC-NEXT: GlobalType: I64 +# NO-GC-NEXT: GlobalMutable: false +# NO-GC-NEXT: - Type: +# NO-GC: - Type: CUSTOM +# NO-GC-NEXT: Name: name +# NO-GC-NEXT: FunctionNames: +# NO-GC-NEXT: - Index: 0 +# NO-GC-NEXT: Name: unused_undef_function +# NO-GC-NEXT: - Index: 1 +# NO-GC-NEXT: Name: used_undef_function +# NO-GC-NEXT: - Index: 2 +# NO-GC-NEXT: Name: __wasm_call_ctors +# NO-GC-NEXT: - Index: 3 +# NO-GC-NEXT: Name: foo +# NO-GC-NEXT: - Index: 4 +# NO-GC-NEXT: Name: _start +# NO-GC-NEXT: - Index: 5 +# NO-GC-NEXT: Name: use_undef_global +# NO-GC-NEXT: ... diff --git a/lld/test/wasm/globals.s b/lld/test/wasm/globals.s index ec8d247779de1..6e049e1e73f91 100644 --- a/lld/test/wasm/globals.s +++ b/lld/test/wasm/globals.s @@ -8,10 +8,11 @@ .globaltype foo_global, i32 .globaltype bar_global, f32 +.globaltype immutable_global, i32, immutable read_global: .functype read_global () -> (i32) - global.get foo_global + global.get immutable_global end_function write_global: @@ -26,10 +27,13 @@ _start: .functype _start () -> () i32.const 1 call write_global + call read_global + drop end_function foo_global: bar_global: +immutable_global: # CHECK: - Type: GLOBAL # CHECK-NEXT: Globals: @@ -39,13 +43,19 @@ bar_global: # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST # CHECK-NEXT: Value: 66560 -# CHECK-NEXT: - Index: 1 +# CHECK-NEXT: - Index: 1 +# CHECK-NEXT: Type: I32 +# CHECK-NEXT: Mutable: false +# CHECK-NEXT: InitExpr: +# CHECK-NEXT: Opcode: I32_CONST +# CHECK-NEXT: Value: 0 +# CHECK-NEXT: - Index: 2 # CHECK-NEXT: Type: I32 # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST # CHECK-NEXT: Value: 0 -# CHECK-NEXT: - Index: 2 +# CHECK-NEXT: - Index: 3 # CHECK-NEXT: Type: F32 # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: diff --git a/lld/test/wasm/map-file.s b/lld/test/wasm/map-file.s new file mode 100644 index 0000000000000..c2ec089ccb137 --- /dev/null +++ b/lld/test/wasm/map-file.s @@ -0,0 +1,47 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %s -o %t1.o +# RUN: wasm-ld %t1.o -o %t -M | FileCheck --match-full-lines --strict-whitespace %s +# RUN: wasm-ld %t1.o -o %t -print-map | FileCheck --match-full-lines --strict-whitespace %s +# RUN: wasm-ld %t1.o -o %t -Map=%t.map +# RUN: FileCheck --match-full-lines --strict-whitespace %s < %t.map + +bar: + .functype bar () -> () + i32.const somedata + end_function + + .globl _start +_start: + .functype _start () -> () + call bar + end_function + +.section .data.somedata,"",@ +somedata: + .int32 123 +.size somedata, 4 + +.section .debug_info,"",@ + .int32 bar + +# CHECK: Addr Off Size Out In Symbol +# CHECK-NEXT: - 8 6 TYPE +# CHECK-NEXT: - e 5 FUNCTION +# CHECK-NEXT: - 13 7 TABLE +# CHECK-NEXT: - 1a 5 MEMORY +# CHECK-NEXT: - 1f a GLOBAL +# CHECK-NEXT: - 29 15 EXPORT +# CHECK-NEXT: - 3e 
15 CODE +# CHECK-NEXT: - 3f 9 {{.*}}{{/|\\}}map-file.s.tmp1.o:(bar) +# CHECK-NEXT: - 3f 9 bar +# CHECK-NEXT: - 48 9 {{.*}}{{/|\\}}map-file.s.tmp1.o:(_start) +# CHECK-NEXT: - 48 9 _start +# CHECK-NEXT: - 53 d DATA +# CHECK-NEXT: 400 54 4 .data +# CHECK-NEXT: 400 5a 4 {{.*}}{{/|\\}}map-file.s.tmp1.o:(.data.somedata) +# CHECK-NEXT: 400 5a 4 somedata +# CHECK-NEXT: - 60 12 CUSTOM(.debug_info) +# CHECK-NEXT: - 72 17 CUSTOM(name) + +# RUN: not wasm-ld %t1.o -o /dev/null -Map=/ 2>&1 \ +# RUN: | FileCheck -check-prefix=FAIL %s +# FAIL: wasm-ld: error: cannot open map file / diff --git a/lld/test/wasm/mutable-globals.s b/lld/test/wasm/mutable-globals.s new file mode 100644 index 0000000000000..ea856e5112895 --- /dev/null +++ b/lld/test/wasm/mutable-globals.s @@ -0,0 +1,15 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s +# RUN: not wasm-ld %t.o -o %t.wasm 2>&1 | FileCheck %s + +.globl _start +_start: + .functype _start () -> () + i32.const 1 + global.set foo + end_function + +.globaltype foo, i32 +.import_module foo, env +.import_name foo, foo + +# CHECK: error: mutable global imported but 'mutable-globals' feature not present in inputs: `foo`. Use --no-check-features to suppress. diff --git a/lld/test/wasm/pie.ll b/lld/test/wasm/pie.ll index c576e7c7bf706..a203d31798c96 100644 --- a/lld/test/wasm/pie.ll +++ b/lld/test/wasm/pie.ll @@ -1,4 +1,4 @@ -; RUN: llc -relocation-model=pic -filetype=obj %s -o %t.o +; RUN: llc -relocation-model=pic -mattr=+mutable-globals -filetype=obj %s -o %t.o ; RUN: wasm-ld --no-gc-sections --allow-undefined -pie -o %t.wasm %t.o ; RUN: obj2yaml %t.wasm | FileCheck %s diff --git a/lld/test/wasm/shared.ll b/lld/test/wasm/shared.ll index 89fae3342ac2a..59c1855bed563 100644 --- a/lld/test/wasm/shared.ll +++ b/lld/test/wasm/shared.ll @@ -1,4 +1,4 @@ -; RUN: llc -relocation-model=pic -filetype=obj %s -o %t.o +; RUN: llc -relocation-model=pic -mattr=+mutable-globals -filetype=obj %s -o %t.o ; RUN: wasm-ld -shared -o %t.wasm %t.o ; RUN: obj2yaml %t.wasm | FileCheck %s diff --git a/lld/tools/lld/lld.cpp b/lld/tools/lld/lld.cpp index 8a8f8d04bbda6..d4e2fbb0309a7 100644 --- a/lld/tools/lld/lld.cpp +++ b/lld/tools/lld/lld.cpp @@ -92,7 +92,12 @@ static bool isPETarget(std::vector &v) { continue; return isPETargetName(*(it + 1)); } + +#ifdef LLD_DEFAULT_LD_LLD_IS_MINGW + return true; +#else return false; +#endif } static Flavor parseProgname(StringRef progname) { diff --git a/lld/wasm/CMakeLists.txt b/lld/wasm/CMakeLists.txt index cd46f0a826ac9..37902ededa0c7 100644 --- a/lld/wasm/CMakeLists.txt +++ b/lld/wasm/CMakeLists.txt @@ -7,6 +7,7 @@ add_lld_library(lldWasm InputChunks.cpp InputFiles.cpp LTO.cpp + MapFile.cpp MarkLive.cpp OutputSections.cpp Relocations.cpp diff --git a/lld/wasm/Config.h b/lld/wasm/Config.h index e8d018f09bf6e..cd6d57333a212 100644 --- a/lld/wasm/Config.h +++ b/lld/wasm/Config.h @@ -58,6 +58,7 @@ struct Configuration { llvm::StringRef thinLTOJobs; llvm::StringRef entry; + llvm::StringRef mapFile; llvm::StringRef outputFile; llvm::StringRef thinLTOCacheDir; diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 7307aaa3f7be1..09318421574c2 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -344,6 +344,7 @@ static void readConfigs(opt::InputArgList &args) { config->importTable = args.hasArg(OPT_import_table); config->ltoo = args::getInteger(args, OPT_lto_O, 2); config->ltoPartitions = args::getInteger(args, OPT_lto_partitions, 1); + config->mapFile = args.getLastArgValue(OPT_Map); config->optimize = args::getInteger(args, 
OPT_O, 0); config->outputFile = args.getLastArgValue(OPT_o); config->relocatable = args.hasArg(OPT_relocatable); @@ -410,6 +411,9 @@ static void readConfigs(opt::InputArgList &args) { for (StringRef s : arg->getValues()) config->features->push_back(std::string(s)); } + + if (args.hasArg(OPT_print_map)) + config->mapFile = "-"; } // Some Config members do not directly correspond to any particular @@ -795,7 +799,8 @@ void LinkerDriver::link(ArrayRef argsArr) { // find that it failed because there was a mistake in their command-line. if (auto e = tryCreateFile(config->outputFile)) error("cannot open output file " + config->outputFile + ": " + e.message()); - // TODO(sbc): add check for map file too once we add support for that. + if (auto e = tryCreateFile(config->mapFile)) + error("cannot open map file " + config->mapFile + ": " + e.message()); if (errorCount()) return; diff --git a/lld/wasm/InputChunks.h b/lld/wasm/InputChunks.h index cadff6883fa4f..be91b19ed452c 100644 --- a/lld/wasm/InputChunks.h +++ b/lld/wasm/InputChunks.h @@ -57,6 +57,8 @@ class InputChunk { void writeRelocations(llvm::raw_ostream &os) const; ObjFile *file; + OutputSection *outputSec = nullptr; + // Offset within the output section int32_t outputOffset = 0; // Signals that the section is part of the output. The garbage collector, @@ -214,8 +216,6 @@ class InputSection : public InputChunk { StringRef getDebugName() const override { return StringRef(); } uint32_t getComdat() const override { return UINT32_MAX; } - OutputSection *outputSec = nullptr; - protected: ArrayRef data() const override { return section.Content; } diff --git a/lld/wasm/MapFile.cpp b/lld/wasm/MapFile.cpp new file mode 100644 index 0000000000000..a08d2a97d74a4 --- /dev/null +++ b/lld/wasm/MapFile.cpp @@ -0,0 +1,148 @@ +//===- MapFile.cpp --------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the -Map option. It lists, in order and +// hierarchically, the output sections, input sections, input files and +// symbols: +// +// Addr Off Size Out In Symbol +// - 00000015 10 .text +// - 0000000e 10 test.o:(.text) +// - 00000000 5 local +// - 00000000 5 f(int) +// +//===----------------------------------------------------------------------===// + +#include "MapFile.h" +#include "InputFiles.h" +#include "OutputSections.h" +#include "OutputSegment.h" +#include "SymbolTable.h" +#include "Symbols.h" +#include "SyntheticSections.h" +#include "lld/Common/Strings.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Support/Parallel.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::object; +using namespace lld; +using namespace lld::wasm; + +using SymbolMapTy = DenseMap>; + +// Print out the first three columns of a line. +static void writeHeader(raw_ostream &os, int64_t vma, uint64_t lma, + uint64_t size) { + // Not all entries in the map have a virtual memory address (e.g. functions) + if (vma == -1) + os << format(" - %8llx %8llx ", lma, size); + else + os << format("%8llx %8llx %8llx ", vma, lma, size); +} + +// Returns a list of all symbols that we want to print out.
+static std::vector getSymbols() { + std::vector v; + for (InputFile *file : symtab->objectFiles) + for (Symbol *b : file->getSymbols()) + if (auto *dr = dyn_cast(b)) + if ((!isa(dr)) && dr->isLive() && + (dr->getFile() == file)) + v.push_back(dr); + return v; +} + +// Returns a map from sections to their symbols. +static SymbolMapTy getSectionSyms(ArrayRef syms) { + SymbolMapTy ret; + for (Symbol *dr : syms) + ret[dr->getChunk()].push_back(dr); + return ret; +} + +// Construct a map from symbols to their stringified representations. +// Demangling symbols (which is what toString() does) is slow, so +// we do that in batch using parallel-for. +static DenseMap +getSymbolStrings(ArrayRef syms) { + std::vector str(syms.size()); + parallelForEachN(0, syms.size(), [&](size_t i) { + raw_string_ostream os(str[i]); + auto &chunk = *syms[i]->getChunk(); + uint64_t fileOffset = chunk.outputSec->getOffset() + chunk.outputOffset; + uint64_t vma = -1; + uint64_t size = 0; + if (auto *DD = dyn_cast(syms[i])) { + vma = DD->getVirtualAddress(); + size = DD->getSize(); + fileOffset += DD->offset; + } + if (auto *DF = dyn_cast(syms[i])) { + size = DF->function->getSize(); + } + writeHeader(os, vma, fileOffset, size); + os.indent(16) << toString(*syms[i]); + }); + + DenseMap ret; + for (size_t i = 0, e = syms.size(); i < e; ++i) + ret[syms[i]] = std::move(str[i]); + return ret; +} + +void lld::wasm::writeMapFile(ArrayRef outputSections) { + if (config->mapFile.empty()) + return; + + // Open a map file for writing. + std::error_code ec; + raw_fd_ostream os(config->mapFile, ec, sys::fs::OF_None); + if (ec) { + error("cannot open " + config->mapFile + ": " + ec.message()); + return; + } + + // Collect symbol info that we want to print out. + std::vector syms = getSymbols(); + SymbolMapTy sectionSyms = getSectionSyms(syms); + DenseMap symStr = getSymbolStrings(syms); + + // Print out the header line. + os << " Addr Off Size Out In Symbol\n"; + + for (OutputSection *osec : outputSections) { + writeHeader(os, -1, osec->getOffset(), osec->getSize()); + os << toString(*osec) << '\n'; + if (auto *code = dyn_cast(osec)) { + for (auto *chunk : code->functions) { + writeHeader(os, -1, chunk->outputSec->getOffset() + chunk->outputOffset, + chunk->getSize()); + os.indent(8) << toString(chunk) << '\n'; + for (Symbol *sym : sectionSyms[chunk]) + os << symStr[sym] << '\n'; + } + } else if (auto *data = dyn_cast(osec)) { + for (auto *oseg : data->segments) { + writeHeader(os, oseg->startVA, data->getOffset() + oseg->sectionOffset, + oseg->size); + os << oseg->name << '\n'; + for (auto *chunk : oseg->inputSegments) { + writeHeader(os, oseg->startVA + chunk->outputSegmentOffset, + chunk->outputSec->getOffset() + chunk->outputOffset, + chunk->getSize()); + os.indent(8) << toString(chunk) << '\n'; + for (Symbol *sym : sectionSyms[chunk]) + os << symStr[sym] << '\n'; + } + } + } + } +} diff --git a/lld/wasm/MapFile.h b/lld/wasm/MapFile.h new file mode 100644 index 0000000000000..ef2cc783a6c2c --- /dev/null +++ b/lld/wasm/MapFile.h @@ -0,0 +1,21 @@ +//===- MapFile.h ------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLD_WASM_MAPFILE_H +#define LLD_WASM_MAPFILE_H + +#include "llvm/ADT/ArrayRef.h" + +namespace lld { +namespace wasm { +class OutputSection; +void writeMapFile(llvm::ArrayRef outputSections); +} // namespace wasm +} // namespace lld + +#endif diff --git a/lld/wasm/Options.td b/lld/wasm/Options.td index 16c784f74828a..27d54c5cdc648 100644 --- a/lld/wasm/Options.td +++ b/lld/wasm/Options.td @@ -66,6 +66,8 @@ def m: JoinedOrSeparate<["-"], "m">, HelpText<"Set target emulation">; def mllvm: S<"mllvm">, HelpText<"Options to pass to LLVM">; +defm Map: Eq<"Map", "Print a link map to the specified file">; + def no_color_diagnostics: F<"no-color-diagnostics">, HelpText<"Do not use colors in diagnostics">; @@ -84,6 +86,9 @@ defm print_gc_sections: B<"print-gc-sections", "List removed unused sections", "Do not list removed unused sections">; +def print_map: F<"print-map">, + HelpText<"Print a link map to the standard output">; + def relocatable: F<"relocatable">, HelpText<"Create relocatable object file">; defm reproduce: Eq<"reproduce", "Dump linker invocation and input files for debugging">; @@ -181,6 +186,7 @@ def: JoinedOrSeparate<["-"], "e">, Alias; def: J<"entry=">, Alias; def: Flag<["-"], "E">, Alias, HelpText<"Alias for --export-dynamic">; def: Flag<["-"], "i">, Alias; +def: Flag<["-"], "M">, Alias, HelpText<"Alias for --print-map">; def: Flag<["-"], "r">, Alias; def: Flag<["-"], "s">, Alias, HelpText<"Alias for --strip-all">; def: Flag<["-"], "S">, Alias, HelpText<"Alias for --strip-debug">; diff --git a/lld/wasm/OutputSections.cpp b/lld/wasm/OutputSections.cpp index a936562992dd3..dbdabddb9320d 100644 --- a/lld/wasm/OutputSections.cpp +++ b/lld/wasm/OutputSections.cpp @@ -87,6 +87,7 @@ void CodeSection::finalizeContents() { bodySize = codeSectionHeader.size(); for (InputFunction *func : functions) { + func->outputSec = this; func->outputOffset = bodySize; func->calculateSize(); bodySize += func->getSize(); @@ -166,9 +167,11 @@ void DataSection::finalizeContents() { log("Data segment: size=" + Twine(segment->size) + ", startVA=" + Twine::utohexstr(segment->startVA) + ", name=" + segment->name); - for (InputSegment *inputSeg : segment->inputSegments) + for (InputSegment *inputSeg : segment->inputSegments) { + inputSeg->outputSec = this; inputSeg->outputOffset = segment->sectionOffset + segment->header.size() + inputSeg->outputSegmentOffset; + } } createHeader(bodySize); @@ -227,8 +230,8 @@ void CustomSection::finalizeContents() { os.flush(); for (InputSection *section : inputSections) { - section->outputOffset = payloadSize; section->outputSec = this; + section->outputOffset = payloadSize; payloadSize += section->getSize(); } diff --git a/lld/wasm/OutputSections.h b/lld/wasm/OutputSections.h index 1fcb5723df980..444116dac7d8c 100644 --- a/lld/wasm/OutputSections.h +++ b/lld/wasm/OutputSections.h @@ -40,6 +40,7 @@ class OutputSection { void createHeader(size_t bodySize); virtual bool isNeeded() const { return true; } virtual size_t getSize() const = 0; + virtual size_t getOffset() { return offset; } virtual void writeTo(uint8_t *buf) = 0; virtual void finalizeContents() = 0; virtual uint32_t getNumRelocations() const { return 0; } @@ -60,6 +61,10 @@ class CodeSection : public OutputSection { explicit CodeSection(ArrayRef functions) : OutputSection(llvm::wasm::WASM_SEC_CODE), functions(functions) {} + static bool classof(const OutputSection 
*sec) { + return sec->type == llvm::wasm::WASM_SEC_CODE; + } + size_t getSize() const override { return header.size() + bodySize; } void writeTo(uint8_t *buf) override; uint32_t getNumRelocations() const override; @@ -67,8 +72,9 @@ class CodeSection : public OutputSection { bool isNeeded() const override { return functions.size() > 0; } void finalizeContents() override; -protected: ArrayRef functions; + +protected: std::string codeSectionHeader; size_t bodySize = 0; }; @@ -78,6 +84,10 @@ class DataSection : public OutputSection { explicit DataSection(ArrayRef segments) : OutputSection(llvm::wasm::WASM_SEC_DATA), segments(segments) {} + static bool classof(const OutputSection *sec) { + return sec->type == llvm::wasm::WASM_SEC_DATA; + } + size_t getSize() const override { return header.size() + bodySize; } void writeTo(uint8_t *buf) override; uint32_t getNumRelocations() const override; @@ -85,8 +95,9 @@ class DataSection : public OutputSection { bool isNeeded() const override; void finalizeContents() override; -protected: ArrayRef segments; + +protected: std::string dataSectionHeader; size_t bodySize = 0; }; @@ -103,6 +114,11 @@ class CustomSection : public OutputSection { CustomSection(std::string name, ArrayRef inputSections) : OutputSection(llvm::wasm::WASM_SEC_CUSTOM, name), inputSections(inputSections) {} + + static bool classof(const OutputSection *sec) { + return sec->type == llvm::wasm::WASM_SEC_CUSTOM; + } + size_t getSize() const override { return header.size() + nameData.size() + payloadSize; } diff --git a/lld/wasm/Relocations.cpp b/lld/wasm/Relocations.cpp index 2559e0f869cce..0a364d1a53ac4 100644 --- a/lld/wasm/Relocations.cpp +++ b/lld/wasm/Relocations.cpp @@ -21,10 +21,13 @@ static bool requiresGOTAccess(const Symbol *sym) { } static bool allowUndefined(const Symbol* sym) { - // Undefined functions with explicit import name are allowed to be undefined - // at link time. - if (auto *F = dyn_cast(sym)) - if (F->importName) + // Undefined functions and globals with explicit import name are allowed to be + // undefined at link time. 
+ if (auto *f = dyn_cast(sym)) + if (f->importName) + return true; + if (auto *g = dyn_cast(sym)) + if (g->importName) return true; return (config->allowUndefined || config->allowUndefinedSymbols.count(sym->getName()) != 0); diff --git a/lld/wasm/Symbols.h b/lld/wasm/Symbols.h index 73f555217f260..eed481a0b44da 100644 --- a/lld/wasm/Symbols.h +++ b/lld/wasm/Symbols.h @@ -284,9 +284,9 @@ class DefinedData : public DataSymbol { uint64_t getSize() const { return size; } InputSegment *segment = nullptr; + uint32_t offset = 0; protected: - uint64_t offset = 0; uint64_t size = 0; }; diff --git a/lld/wasm/SyntheticSections.h b/lld/wasm/SyntheticSections.h index 3e125ca84e401..335bfe843184a 100644 --- a/lld/wasm/SyntheticSections.h +++ b/lld/wasm/SyntheticSections.h @@ -221,6 +221,7 @@ class ExportSection : public SyntheticSection { void writeBody() override; std::vector exports; + std::vector exportedSymbols; }; class StartSection : public SyntheticSection { diff --git a/lld/wasm/Writer.cpp b/lld/wasm/Writer.cpp index 495050c0b6319..8d5b98050cb13 100644 --- a/lld/wasm/Writer.cpp +++ b/lld/wasm/Writer.cpp @@ -11,6 +11,7 @@ #include "InputChunks.h" #include "InputEvent.h" #include "InputGlobal.h" +#include "MapFile.h" #include "OutputSections.h" #include "OutputSegment.h" #include "Relocations.h" @@ -461,6 +462,25 @@ void Writer::populateTargetFeatures() { if (!config->checkFeatures) return; + if (!config->relocatable && used.count("mutable-globals") == 0) { + for (const Symbol *sym : out.importSec->importedSymbols) { + if (auto *global = dyn_cast(sym)) { + if (global->getGlobalType()->Mutable) { + error(Twine("mutable global imported but 'mutable-globals' feature " + "not present in inputs: `") + + toString(*sym) + "`. Use --no-check-features to suppress."); + } + } + } + for (const Symbol *sym : out.exportSec->exportedSymbols) { + if (auto *global = dyn_cast(sym)) { + error(Twine("mutable global exported but 'mutable-globals' feature " + "not present in inputs: `") + + toString(*sym) + "`. Use --no-check-features to suppress."); + } + } + } + if (config->sharedMemory) { if (disallowed.count("shared-mem")) error("--shared-memory is disallowed by " + disallowed["shared-mem"] + @@ -579,6 +599,7 @@ void Writer::calculateExports() { LLVM_DEBUG(dbgs() << "Export: " << name << "\n"); out.exportSec->exports.push_back(export_); + out.exportSec->exportedSymbols.push_back(sym); } } @@ -1051,8 +1072,6 @@ void Writer::run() { createSyntheticSections(); log("-- populateProducers"); populateProducers(); - log("-- populateTargetFeatures"); - populateTargetFeatures(); log("-- calculateImports"); calculateImports(); log("-- layoutMemory"); @@ -1095,6 +1114,8 @@ void Writer::run() { calculateCustomSections(); log("-- populateSymtab"); populateSymtab(); + log("-- populateTargetFeatures"); + populateTargetFeatures(); log("-- addSections"); addSections(); @@ -1114,6 +1135,9 @@ void Writer::run() { log("-- finalizeSections"); finalizeSections(); + log("-- writeMapFile"); + writeMapFile(outputSections); + log("-- openFile"); openFile(); if (errorCount()) diff --git a/lldb/bindings/interface/SBBlock.i b/lldb/bindings/interface/SBBlock.i index 812b41fe5c3ea..3972b939b18b9 100644 --- a/lldb/bindings/interface/SBBlock.i +++ b/lldb/bindings/interface/SBBlock.i @@ -22,7 +22,7 @@ public: ~SBBlock (); %feature("docstring", - "Does this block represent an inlined function?" + "Is this block contained within an inlined function?" 
) IsInlined; bool IsInlined () const; diff --git a/lldb/bindings/interface/SBBreakpoint.i b/lldb/bindings/interface/SBBreakpoint.i index a2d747db0bf6d..e386ace9dee8a 100644 --- a/lldb/bindings/interface/SBBreakpoint.i +++ b/lldb/bindings/interface/SBBreakpoint.i @@ -234,6 +234,8 @@ public: SBError AddLocation(SBAddress &address); + SBStructuredData SBBreakpoint::SerializeToStructuredData(); + static bool EventIsBreakpointEvent (const lldb::SBEvent &event); diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst index c1cb6ec1a9343..579f7574dac53 100644 --- a/lldb/docs/resources/build.rst +++ b/lldb/docs/resources/build.rst @@ -71,7 +71,7 @@ commands below. :: > yum install libedit-devel libxml2-devel ncurses-devel python-devel swig - > sudo apt-get install build-essential subversion swig python2.7-dev libedit-dev libncurses5-dev + > sudo apt-get install build-essential subversion swig python3-dev libedit-dev libncurses5-dev > pkg install swig python > pkgin install swig python27 cmake ninja-build > brew install swig cmake ninja @@ -244,7 +244,7 @@ Windows On Windows the LLDB test suite requires lld. Either add ``lld`` to ``LLVM_ENABLE_PROJECTS`` or disable the test suite with -``LLDB_ENABLE_TESTS=OFF``. +``LLDB_INCLUDE_TESTS=OFF``. Although the following CMake variables are by no means Windows specific, they are commonly used on Windows. @@ -300,7 +300,7 @@ macOS On macOS the LLDB test suite requires libc++. Either add ``libcxx`` to ``LLVM_ENABLE_PROJECTS`` or disable the test suite with -``LLDB_ENABLE_TESTS=OFF``. Further useful options: +``LLDB_INCLUDE_TESTS=OFF``. Further useful options: * ``LLDB_BUILD_FRAMEWORK:BOOL``: Builds the LLDB.framework. * ``LLDB_CODESIGN_IDENTITY:STRING``: Set the identity to use for code-signing diff --git a/lldb/include/lldb/API/SBBreakpoint.h b/lldb/include/lldb/API/SBBreakpoint.h index c9a52fcacf1a4..39a021145fb7b 100644 --- a/lldb/include/lldb/API/SBBreakpoint.h +++ b/lldb/include/lldb/API/SBBreakpoint.h @@ -140,7 +140,9 @@ class LLDB_API SBBreakpoint { // Can only be called from a ScriptedBreakpointResolver... SBError AddLocation(SBAddress &address); - + + SBStructuredData SerializeToStructuredData(); + private: friend class SBBreakpointList; friend class SBBreakpointLocation; diff --git a/lldb/include/lldb/Core/IOHandler.h b/lldb/include/lldb/Core/IOHandler.h index c96dc1cd18880..2e8f3225fd5f7 100644 --- a/lldb/include/lldb/Core/IOHandler.h +++ b/lldb/include/lldb/Core/IOHandler.h @@ -128,11 +128,11 @@ class IOHandler { FILE *GetErrorFILE(); - lldb::FileSP &GetInputFileSP(); + lldb::FileSP GetInputFileSP(); - lldb::StreamFileSP &GetOutputStreamFileSP(); + lldb::StreamFileSP GetOutputStreamFileSP(); - lldb::StreamFileSP &GetErrorStreamFileSP(); + lldb::StreamFileSP GetErrorStreamFileSP(); Debugger &GetDebugger() { return m_debugger; } diff --git a/lldb/include/lldb/Symbol/CompilerType.h b/lldb/include/lldb/Symbol/CompilerType.h index c5e19773d51c7..6143739381659 100644 --- a/lldb/include/lldb/Symbol/CompilerType.h +++ b/lldb/include/lldb/Symbol/CompilerType.h @@ -20,7 +20,7 @@ namespace lldb_private { class DataExtractor; -/// Represents a generic type in a programming language. +/// Generic representation of a type in a programming language. /// /// This class serves as an abstraction for a type inside one of the TypeSystems /// implemented by the language plugins. 
It does not have any actual logic in it diff --git a/lldb/packages/Python/lldbsuite/test/builders/darwin.py b/lldb/packages/Python/lldbsuite/test/builders/darwin.py index 4548217c3fab8..236e4fac13682 100644 --- a/lldb/packages/Python/lldbsuite/test/builders/darwin.py +++ b/lldb/packages/Python/lldbsuite/test/builders/darwin.py @@ -78,7 +78,7 @@ def getExtraMakeArgs(self): {'{}="{}"'.format(key, value) for key, value in args.items()}) - def getArchCFlags(self, architecture): + def getArchCFlags(self, arch): """Returns the ARCH_CFLAGS for the make system.""" # Get the triple components. vendor, os, version, env = get_triple() @@ -86,7 +86,7 @@ def getArchCFlags(self, architecture): return "" # Construct the triple from its components. - triple = "{}-{}-{}-{}".format(vendor, os, version, env) + triple = '-'.join([arch, vendor, os, version, env]) # Construct min version argument version_min = "" diff --git a/lldb/packages/Python/lldbsuite/test/dotest.py b/lldb/packages/Python/lldbsuite/test/dotest.py index 30d6afc231fda..b4eddda914033 100644 --- a/lldb/packages/Python/lldbsuite/test/dotest.py +++ b/lldb/packages/Python/lldbsuite/test/dotest.py @@ -449,6 +449,18 @@ def parseOptionsAndInitTestdirs(): lldbtest_config.codesign_identity = args.codesign_identity +def registerFaulthandler(): + try: + import faulthandler + except ImportError: + # faulthandler is not available until python3 + return + + faulthandler.enable() + # faulthandler.register is not available on Windows. + if getattr(faulthandler, 'register', None): + faulthandler.register(signal.SIGTERM, chain=True) + def setupSysPath(): """ Add LLDB.framework/Resources/Python to the search paths for modules. @@ -875,6 +887,9 @@ def run_suite(): # parseOptionsAndInitTestdirs() + # Print a stack trace if the test hangs or is passed SIGTERM. + registerFaulthandler() + setupSysPath() import lldbconfig diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py index fa5a9c0db1ebd..5710751ec34bf 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py @@ -282,7 +282,7 @@ def launch(self, program=None, args=None, cwd=None, env=None, trace=False, initCommands=None, preRunCommands=None, stopCommands=None, exitCommands=None, terminateCommands=None, sourcePath=None, debuggerRoot=None, launchCommands=None, - sourceMap=None, disconnectAutomatically=True): + sourceMap=None, disconnectAutomatically=True, runInTerminal=False): '''Sending launch request to vscode ''' @@ -316,10 +316,16 @@ def cleanup(): sourcePath=sourcePath, debuggerRoot=debuggerRoot, launchCommands=launchCommands, - sourceMap=sourceMap) + sourceMap=sourceMap, + runInTerminal=runInTerminal) if not (response and response['success']): self.assertTrue(response['success'], 'launch failed (%s)' % (response['message'])) + # We need to trigger a request_configurationDone after we've successfully + # attached a runInTerminal process to finish initialization. 
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py index fa5a9c0db1ebd..5710751ec34bf 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py @@ -282,7 +282,7 @@ def launch(self, program=None, args=None, cwd=None, env=None, trace=False, initCommands=None, preRunCommands=None, stopCommands=None, exitCommands=None, terminateCommands=None, sourcePath=None, debuggerRoot=None, launchCommands=None, - sourceMap=None, disconnectAutomatically=True): + sourceMap=None, disconnectAutomatically=True, runInTerminal=False): '''Sending launch request to vscode ''' @@ -316,10 +316,16 @@ def cleanup(): sourcePath=sourcePath, debuggerRoot=debuggerRoot, launchCommands=launchCommands, - sourceMap=sourceMap) + sourceMap=sourceMap, + runInTerminal=runInTerminal) if not (response and response['success']): self.assertTrue(response['success'], 'launch failed (%s)' % (response['message'])) + # We need to trigger a request_configurationDone after we've successfully + # attached a runInTerminal process to finish initialization. + if runInTerminal: + self.vscode.request_configurationDone() + def build_and_launch(self, program, args=None, cwd=None, env=None, stopOnEntry=False, disableASLR=True, @@ -327,7 +333,7 @@ def build_and_launch(self, program, args=None, cwd=None, env=None, trace=False, initCommands=None, preRunCommands=None, stopCommands=None, exitCommands=None, terminateCommands=None, sourcePath=None, - debuggerRoot=None): + debuggerRoot=None, runInTerminal=False): '''Build the default Makefile target, create the VSCode debug adaptor, and launch the process. ''' @@ -337,4 +343,4 @@ def build_and_launch(self, program, args=None, cwd=None, env=None, self.launch(program, args, cwd, env, stopOnEntry, disableASLR, disableSTDIO, shellExpandArguments, trace, initCommands, preRunCommands, stopCommands, exitCommands, - terminateCommands, sourcePath, debuggerRoot) + terminateCommands, sourcePath, debuggerRoot, runInTerminal=runInTerminal) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py index 6b1c1c961b545..834e33ef5c3da 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py @@ -300,12 +300,29 @@ def send_recv(self, command): self.send_packet(command) done = False while not done: - response = self.recv_packet(filter_type='response') - if response is None: + response_or_request = self.recv_packet(filter_type=['response', 'request']) + if response_or_request is None: desc = 'no response for "%s"' % (command['command']) raise ValueError(desc) - self.validate_response(command, response) - return response + if response_or_request['type'] == 'response': + self.validate_response(command, response_or_request) + return response_or_request + else: + if response_or_request['command'] == 'runInTerminal': + subprocess.Popen(response_or_request['arguments']['args'], + env=response_or_request['arguments']['env']) + self.send_packet({ + "type": "response", + "seq": -1, + "request_seq": response_or_request['seq'], + "success": True, + "command": "runInTerminal", + "body": {} + }, set_sequence=False) + else: + desc = 'unknown reverse request "%s"' % (response_or_request['command']) + raise ValueError(desc) + return None def wait_for_event(self, filter=None, timeout=None): @@ -599,7 +616,8 @@ def request_launch(self, program, args=None, cwd=None, env=None, trace=False, initCommands=None, preRunCommands=None, stopCommands=None, exitCommands=None, terminateCommands=None ,sourcePath=None, - debuggerRoot=None, launchCommands=None, sourceMap=None): + debuggerRoot=None, launchCommands=None, sourceMap=None, + runInTerminal=False): args_dict = { 'program': program } @@ -638,6 +656,8 @@ def request_launch(self, program, args=None, cwd=None, env=None, args_dict['launchCommands'] = launchCommands if sourceMap: args_dict['sourceMap'] = sourceMap + if runInTerminal: + args_dict['runInTerminal'] = runInTerminal command_dict = { 'command': 'launch', 'type': 'request', diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt index 8a7f28c01a9c2..aeb1f15e294b2 100644 --- a/lldb/source/API/CMakeLists.txt +++ b/lldb/source/API/CMakeLists.txt @@ -182,10 +182,10 @@ if (NOT CMAKE_SYSTEM_NAME MATCHES "Windows") set_target_properties(liblldb_exports PROPERTIES FOLDER "lldb misc") endif() -if ( CMAKE_SYSTEM_NAME MATCHES "Windows" ) +if (MSVC) # Only MSVC has the ABI compatibility problem and avoids using FindPythonLibs, # so only it needs to explicitly link
against ${Python3_LIBRARIES} - if (MSVC AND LLDB_ENABLE_PYTHON) + if (LLDB_ENABLE_PYTHON) target_link_libraries(liblldb PRIVATE ${Python3_LIBRARIES}) endif() else() diff --git a/lldb/source/API/SBBreakpoint.cpp b/lldb/source/API/SBBreakpoint.cpp index eb75bf8b33f43..96b77bd8539e8 100644 --- a/lldb/source/API/SBBreakpoint.cpp +++ b/lldb/source/API/SBBreakpoint.cpp @@ -575,7 +575,22 @@ SBError SBBreakpoint::AddLocation(SBAddress &address) { return LLDB_RECORD_RESULT(error); } -void SBBreakpoint ::SetCallback(SBBreakpointHitCallback callback, void *baton) { +SBStructuredData SBBreakpoint::SerializeToStructuredData() { + LLDB_RECORD_METHOD_NO_ARGS(lldb::SBStructuredData, SBBreakpoint, + SerializeToStructuredData); + + SBStructuredData data; + BreakpointSP bkpt_sp = GetSP(); + + if (!bkpt_sp) + return LLDB_RECORD_RESULT(data); + + StructuredData::ObjectSP bkpt_dict = bkpt_sp->SerializeToStructuredData(); + data.m_impl_up->SetObjectSP(bkpt_dict); + return LLDB_RECORD_RESULT(data); +} + +void SBBreakpoint::SetCallback(SBBreakpointHitCallback callback, void *baton) { LLDB_RECORD_DUMMY(void, SBBreakpoint, SetCallback, (lldb::SBBreakpointHitCallback, void *), callback, baton); @@ -1017,6 +1032,8 @@ void RegisterMethods(Registry &R) { (lldb::SBStream &, bool)); LLDB_REGISTER_METHOD(lldb::SBError, SBBreakpoint, AddLocation, (lldb::SBAddress &)); + LLDB_REGISTER_METHOD(lldb::SBStructuredData, SBBreakpoint, + SerializeToStructuredData, ()); LLDB_REGISTER_METHOD(void, SBBreakpoint, SetScriptCallbackFunction, (const char *)); LLDB_REGISTER_METHOD(lldb::SBError, SBBreakpoint, SetScriptCallbackFunction, diff --git a/lldb/source/API/SBPlatform.cpp b/lldb/source/API/SBPlatform.cpp index 3c6422e211fca..f118048156b96 100644 --- a/lldb/source/API/SBPlatform.cpp +++ b/lldb/source/API/SBPlatform.cpp @@ -93,8 +93,8 @@ SBPlatformConnectOptions::SBPlatformConnectOptions( SBPlatformConnectOptions::~SBPlatformConnectOptions() { delete m_opaque_ptr; } -SBPlatformConnectOptions &SBPlatformConnectOptions:: -operator=(const SBPlatformConnectOptions &rhs) { +SBPlatformConnectOptions & +SBPlatformConnectOptions::operator=(const SBPlatformConnectOptions &rhs) { LLDB_RECORD_METHOD( SBPlatformConnectOptions &, SBPlatformConnectOptions, operator=,( @@ -196,8 +196,8 @@ SBPlatformShellCommand::SBPlatformShellCommand( *m_opaque_ptr = *rhs.m_opaque_ptr; } -SBPlatformShellCommand &SBPlatformShellCommand:: -operator=(const SBPlatformShellCommand &rhs) { +SBPlatformShellCommand & +SBPlatformShellCommand::operator=(const SBPlatformShellCommand &rhs) { LLDB_RECORD_METHOD( SBPlatformShellCommand &, @@ -581,25 +581,25 @@ SBError SBPlatform::Install(SBFileSpec &src, SBFileSpec &dst) { SBError SBPlatform::Run(SBPlatformShellCommand &shell_command) { LLDB_RECORD_METHOD(lldb::SBError, SBPlatform, Run, (lldb::SBPlatformShellCommand &), shell_command); - return LLDB_RECORD_RESULT(ExecuteConnected([&](const lldb::PlatformSP - &platform_sp) { - const char *command = shell_command.GetCommand(); - if (!command) - return Status("invalid shell command (empty)"); - - const char *working_dir = shell_command.GetWorkingDirectory(); - if (working_dir == nullptr) { - working_dir = platform_sp->GetWorkingDirectory().GetCString(); - if (working_dir) - shell_command.SetWorkingDirectory(working_dir); - } - return platform_sp->RunShellCommand(shell_command.m_opaque_ptr->m_shell, - command, FileSpec(working_dir), - &shell_command.m_opaque_ptr->m_status, - &shell_command.m_opaque_ptr->m_signo, - &shell_command.m_opaque_ptr->m_output, - 
shell_command.m_opaque_ptr->m_timeout); - })); + return LLDB_RECORD_RESULT( + ExecuteConnected([&](const lldb::PlatformSP &platform_sp) { + const char *command = shell_command.GetCommand(); + if (!command) + return Status("invalid shell command (empty)"); + + const char *working_dir = shell_command.GetWorkingDirectory(); + if (working_dir == nullptr) { + working_dir = platform_sp->GetWorkingDirectory().GetCString(); + if (working_dir) + shell_command.SetWorkingDirectory(working_dir); + } + return platform_sp->RunShellCommand( + shell_command.m_opaque_ptr->m_shell, command, FileSpec(working_dir), + &shell_command.m_opaque_ptr->m_status, + &shell_command.m_opaque_ptr->m_signo, + &shell_command.m_opaque_ptr->m_output, + shell_command.m_opaque_ptr->m_timeout); + })); } SBError SBPlatform::Launch(SBLaunchInfo &launch_info) { @@ -705,8 +705,7 @@ SBEnvironment SBPlatform::GetEnvironment() { namespace lldb_private { namespace repro { -template <> -void RegisterMethods(Registry &R) { +template <> void RegisterMethods(Registry &R) { LLDB_REGISTER_CONSTRUCTOR(SBPlatformConnectOptions, (const char *)); LLDB_REGISTER_CONSTRUCTOR(SBPlatformConnectOptions, (const lldb::SBPlatformConnectOptions &)); @@ -715,8 +714,7 @@ void RegisterMethods(Registry &R) { SBPlatformConnectOptions, operator=,( const lldb::SBPlatformConnectOptions &)); LLDB_REGISTER_METHOD(const char *, SBPlatformConnectOptions, GetURL, ()); - LLDB_REGISTER_METHOD(void, SBPlatformConnectOptions, SetURL, - (const char *)); + LLDB_REGISTER_METHOD(void, SBPlatformConnectOptions, SetURL, (const char *)); LLDB_REGISTER_METHOD(bool, SBPlatformConnectOptions, GetRsyncEnabled, ()); LLDB_REGISTER_METHOD(void, SBPlatformConnectOptions, EnableRsync, (const char *, const char *, bool)); @@ -727,8 +725,7 @@ void RegisterMethods(Registry &R) { (const char *)); } -template <> -void RegisterMethods(Registry &R) { +template <> void RegisterMethods(Registry &R) { LLDB_REGISTER_CONSTRUCTOR(SBPlatformShellCommand, (const char *)); LLDB_REGISTER_CONSTRUCTOR(SBPlatformShellCommand, (const lldb::SBPlatformShellCommand &)); @@ -745,8 +742,7 @@ void RegisterMethods(Registry &R) { GetWorkingDirectory, ()); LLDB_REGISTER_METHOD(void, SBPlatformShellCommand, SetWorkingDirectory, (const char *)); - LLDB_REGISTER_METHOD(uint32_t, SBPlatformShellCommand, GetTimeoutSeconds, - ()); + LLDB_REGISTER_METHOD(uint32_t, SBPlatformShellCommand, GetTimeoutSeconds, ()); LLDB_REGISTER_METHOD(void, SBPlatformShellCommand, SetTimeoutSeconds, (uint32_t)); LLDB_REGISTER_METHOD(int, SBPlatformShellCommand, GetSignal, ()); @@ -754,15 +750,16 @@ void RegisterMethods(Registry &R) { LLDB_REGISTER_METHOD(const char *, SBPlatformShellCommand, GetOutput, ()); } -template <> -void RegisterMethods(Registry &R) { +template <> void RegisterMethods(Registry &R) { LLDB_REGISTER_CONSTRUCTOR(SBPlatform, ()); LLDB_REGISTER_CONSTRUCTOR(SBPlatform, (const char *)); LLDB_REGISTER_CONSTRUCTOR(SBPlatform, (const lldb::SBPlatform &)); + LLDB_REGISTER_CONSTRUCTOR(SBPlatformShellCommand, + (const char *, const char *)); LLDB_REGISTER_METHOD(SBPlatform &, SBPlatform, operator=,(const lldb::SBPlatform &)); LLDB_REGISTER_METHOD_CONST(bool, SBPlatform, IsValid, ()); - LLDB_REGISTER_METHOD_CONST(bool, SBPlatform, operator bool, ()); + LLDB_REGISTER_METHOD_CONST(bool, SBPlatform, operator bool,()); LLDB_REGISTER_METHOD(void, SBPlatform, Clear, ()); LLDB_REGISTER_METHOD(const char *, SBPlatform, GetName, ()); LLDB_REGISTER_METHOD(const char *, SBPlatform, GetWorkingDirectory, ()); @@ -802,5 +799,5 @@ void 
RegisterMethods<SBPlatform>(Registry &R) { ()); } -} -} +} // namespace repro +} // namespace lldb_private diff --git a/lldb/source/Commands/CommandObjectScript.cpp b/lldb/source/Commands/CommandObjectScript.cpp index e5ae244cade19..9dadf11ebfc89 100644 --- a/lldb/source/Commands/CommandObjectScript.cpp +++ b/lldb/source/Commands/CommandObjectScript.cpp @@ -10,36 +10,107 @@ #include "lldb/Core/Debugger.h" #include "lldb/DataFormatters/DataVisualization.h" #include "lldb/Host/Config.h" +#include "lldb/Host/OptionParser.h" #include "lldb/Interpreter/CommandInterpreter.h" #include "lldb/Interpreter/CommandReturnObject.h" +#include "lldb/Interpreter/OptionArgParser.h" #include "lldb/Interpreter/ScriptInterpreter.h" #include "lldb/Utility/Args.h" using namespace lldb; using namespace lldb_private; -// CommandObjectScript +static constexpr OptionEnumValueElement g_script_option_enumeration[] = { + { + eScriptLanguagePython, + "python", + "Python", + }, + { + eScriptLanguageLua, + "lua", + "Lua", + }, + { + eScriptLanguageNone, + "default", + "The default scripting language.", + }, +}; + +static constexpr OptionEnumValues ScriptOptionEnum() { + return OptionEnumValues(g_script_option_enumeration); +} + +#define LLDB_OPTIONS_script +#include "CommandOptions.inc" + +Status CommandObjectScript::CommandOptions::SetOptionValue( + uint32_t option_idx, llvm::StringRef option_arg, + ExecutionContext *execution_context) { + Status error; + const int short_option = m_getopt_table[option_idx].val; + + switch (short_option) { + case 'l': + language = (lldb::ScriptLanguage)OptionArgParser::ToOptionEnum( + option_arg, GetDefinitions()[option_idx].enum_values, + eScriptLanguageNone, error); + if (!error.Success()) + error.SetErrorStringWithFormat("unrecognized value for language '%s'", + option_arg.str().c_str()); + break; + default: + llvm_unreachable("Unimplemented option"); + } + + return error; +} + +void CommandObjectScript::CommandOptions::OptionParsingStarting( + ExecutionContext *execution_context) { + language = lldb::eScriptLanguageNone; +} + +llvm::ArrayRef<OptionDefinition> +CommandObjectScript::CommandOptions::GetDefinitions() { + return llvm::makeArrayRef(g_script_options); +} CommandObjectScript::CommandObjectScript(CommandInterpreter &interpreter) : CommandObjectRaw( interpreter, "script", "Invoke the script interpreter with provided code and display any " "results. Start the interactive interpreter if no code is supplied.", - "script []") {} + "script [--language <scripting-language> --] [<script-code>]") {} CommandObjectScript::~CommandObjectScript() {} bool CommandObjectScript::DoExecute(llvm::StringRef command, CommandReturnObject &result) { - if (m_interpreter.GetDebugger().GetScriptLanguage() == - lldb::eScriptLanguageNone) { + // Try parsing the language option when the command contains a raw part + // separated by the -- delimiter. + OptionsWithRaw raw_args(command); + if (raw_args.HasArgs()) { + if (!ParseOptions(raw_args.GetArgs(), result)) + return false; + command = raw_args.GetRawPart(); + } + + lldb::ScriptLanguage language = + (m_options.language == lldb::eScriptLanguageNone) + ? m_interpreter.GetDebugger().GetScriptLanguage() + : m_options.language; + + if (language == lldb::eScriptLanguageNone) { result.AppendError( "the script-lang setting is set to none - scripting not available"); result.SetStatus(eReturnStatusFailed); return false; } - ScriptInterpreter *script_interpreter = GetDebugger().GetScriptInterpreter(); + ScriptInterpreter *script_interpreter = + GetDebugger().GetScriptInterpreter(true, language); if (script_interpreter == nullptr) { result.AppendError("no script interpreter");
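For reference, the new flag can also be driven through the SB API; a minimal Python sketch (the Lua command at the end is only an example and assumes LLDB was built with Lua support):

import lldb

lldb.SBDebugger.Initialize()
dbg = lldb.SBDebugger.Create()
res = lldb.SBCommandReturnObject()
ci = dbg.GetCommandInterpreter()

# Run one line of Python regardless of the script-lang setting:
ci.HandleCommand('script --language python -- print(6 * 7)', res)
print(res.GetOutput())  # -> 42

# Everything after "--" is handed to the chosen interpreter verbatim:
ci.HandleCommand('script -l lua -- io.stdout:write(6 * 7, "\\n")', res)
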
diff --git a/lldb/source/Commands/CommandObjectScript.h b/lldb/source/Commands/CommandObjectScript.h index 40abf8bd730c7..b9fee7124818a 100644 --- a/lldb/source/Commands/CommandObjectScript.h +++ b/lldb/source/Commands/CommandObjectScript.h @@ -17,9 +17,24 @@ class CommandObjectScript : public CommandObjectRaw { public: CommandObjectScript(CommandInterpreter &interpreter); ~CommandObjectScript() override; + Options *GetOptions() override { return &m_options; } + + class CommandOptions : public Options { + public: + CommandOptions() : Options() {} + ~CommandOptions() override = default; + Status SetOptionValue(uint32_t option_idx, llvm::StringRef option_arg, + ExecutionContext *execution_context) override; + void OptionParsingStarting(ExecutionContext *execution_context) override; + llvm::ArrayRef<OptionDefinition> GetDefinitions() override; + lldb::ScriptLanguage language = lldb::eScriptLanguageNone; + }; protected: bool DoExecute(llvm::StringRef command, CommandReturnObject &result) override; + +private: + CommandOptions m_options; }; } // namespace lldb_private diff --git a/lldb/source/Commands/Options.td b/lldb/source/Commands/Options.td index eacd6de1910c1..b41b1871ad81f 100644 --- a/lldb/source/Commands/Options.td +++ b/lldb/source/Commands/Options.td @@ -717,6 +717,12 @@ let Command = "script add" in { "LLDB event system.">; } +let Command = "script" in { + def script_language : Option<"language", "l">, + EnumArg<"ScriptLang", "ScriptOptionEnum()">, Desc<"Specify the scripting " + "language. If none is specified the default scripting language is used.">; +} + let Command = "source info" in { def source_info_count : Option<"count", "c">, Arg<"Count">, Desc<"The number of line entries to display.">; diff --git a/lldb/source/Core/IOHandler.cpp b/lldb/source/Core/IOHandler.cpp index 0648cf41f28aa..8c654d9d8a98b 100644 --- a/lldb/source/Core/IOHandler.cpp +++ b/lldb/source/Core/IOHandler.cpp @@ -103,11 +103,11 @@ FILE *IOHandler::GetErrorFILE() { return (m_error_sp ? m_error_sp->GetFile().GetStream() : nullptr); } -FileSP &IOHandler::GetInputFileSP() { return m_input_sp; } +FileSP IOHandler::GetInputFileSP() { return m_input_sp; } -StreamFileSP &IOHandler::GetOutputStreamFileSP() { return m_output_sp; } +StreamFileSP IOHandler::GetOutputStreamFileSP() { return m_output_sp; } -StreamFileSP &IOHandler::GetErrorStreamFileSP() { return m_error_sp; } +StreamFileSP IOHandler::GetErrorStreamFileSP() { return m_error_sp; } bool IOHandler::GetIsInteractive() { return GetInputFileSP() ? GetInputFileSP()->GetIsInteractive() : false; diff --git a/lldb/source/Expression/REPL.cpp b/lldb/source/Expression/REPL.cpp index fd7c39686921d..c3d14960f74c5 100644 --- a/lldb/source/Expression/REPL.cpp +++ b/lldb/source/Expression/REPL.cpp @@ -123,10 +123,11 @@ const char *REPL::IOHandlerGetHelpPrologue() { "Valid statements, expressions, and declarations are immediately " "compiled and executed.\n\n" "The complete set of LLDB debugging commands are also available as " - "described below. 
Commands " + "described below.\n\nCommands " "must be prefixed with a colon at the REPL prompt (:quit for " "example.) Typing just a colon " - "followed by return will switch to the LLDB prompt.\n\n"; + "followed by return will switch to the LLDB prompt.\n\n" + "Type “< path” to read in code from a text file “path”.\n\n"; } bool REPL::IOHandlerIsInputComplete(IOHandler &io_handler, StringList &lines) { @@ -179,6 +180,36 @@ int REPL::IOHandlerFixIndentation(IOHandler &io_handler, return (int)desired_indent - actual_indent; } +static bool ReadCode(const std::string &path, std::string &code, + lldb::StreamFileSP &error_sp) { + auto &fs = FileSystem::Instance(); + llvm::Twine pathTwine(path); + if (!fs.Exists(pathTwine)) { + error_sp->Printf("no such file at path '%s'\n", path.c_str()); + return false; + } + if (!fs.Readable(pathTwine)) { + error_sp->Printf("could not read file at path '%s'\n", path.c_str()); + return false; + } + const size_t file_size = fs.GetByteSize(pathTwine); + const size_t max_size = code.max_size(); + if (file_size > max_size) { + error_sp->Printf("file at path '%s' too large: " + "file_size = %zu, max_size = %zu\n", + path.c_str(), file_size, max_size); + return false; + } + auto data_sp = fs.CreateDataBuffer(pathTwine); + if (data_sp == nullptr) { + error_sp->Printf("could not create buffer for file at path '%s'\n", + path.c_str()); + return false; + } + code.assign((const char *)data_sp->GetBytes(), data_sp->GetByteSize()); + return true; +} + void REPL::IOHandlerInputComplete(IOHandler &io_handler, std::string &code) { lldb::StreamFileSP output_sp(io_handler.GetOutputStreamFileSP()); lldb::StreamFileSP error_sp(io_handler.GetErrorStreamFileSP()); @@ -257,6 +288,15 @@ void REPL::IOHandlerInputComplete(IOHandler &io_handler, std::string &code) { } } } else { + if (code[0] == '<') { + // User wants to read code from a file. + // Interpret rest of line as a literal path. 
+ auto path = llvm::StringRef(code.substr(1)).trim().str(); + if (!ReadCode(path, code, error_sp)) { + return; + } + } + // Unwind any expression we might have been running in case our REPL // expression crashed and the user was looking around if (m_dedicated_repl_mode) { diff --git a/lldb/source/Host/netbsd/HostNetBSD.cpp b/lldb/source/Host/netbsd/HostNetBSD.cpp index 4708fb45deed0..38e2aa5c1e058 100644 --- a/lldb/source/Host/netbsd/HostNetBSD.cpp +++ b/lldb/source/Host/netbsd/HostNetBSD.cpp @@ -220,7 +220,7 @@ uint32_t Host::FindProcessesImpl(const ProcessInstanceInfoMatch &match_info, if (proc_kinfo[i].p_nlwps > 1) { bool already_registered = false; for (size_t pi = 0; pi < process_infos.size(); pi++) { - if (process_infos[pi].GetProcessID() == proc_kinfo[i].p_pid) { + if ((::pid_t)process_infos[pi].GetProcessID() == proc_kinfo[i].p_pid) { already_registered = true; break; } diff --git a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp index 5cf9fb4ad37f9..7cae4cc427501 100644 --- a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp +++ b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp @@ -33,6 +33,12 @@ ABIAArch64::GetEHAndDWARFNums(llvm::StringRef name) { return MCBasedABI::GetEHAndDWARFNums(name); } +std::string ABIAArch64::GetMCName(std::string reg) { + MapRegisterName(reg, "v", "q"); + MapRegisterName(reg, "x29", "fp"); + MapRegisterName(reg, "x30", "lr"); + return reg; +} uint32_t ABIAArch64::GetGenericNum(llvm::StringRef name) { return llvm::StringSwitch(name) .Case("pc", LLDB_REGNUM_GENERIC_PC) diff --git a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.h b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.h index 981145e2017e3..bdff648f1b522 100644 --- a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.h +++ b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.h @@ -20,10 +20,7 @@ class ABIAArch64: public lldb_private::MCBasedABI { std::pair GetEHAndDWARFNums(llvm::StringRef name) override; - std::string GetMCName(std::string reg) override { - MapRegisterName(reg, "v", "q"); - return reg; - } + std::string GetMCName(std::string reg) override; uint32_t GetGenericNum(llvm::StringRef name) override; diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h b/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h index 769b18d54cedd..b70ec223df4df 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h @@ -359,15 +359,12 @@ class SemaSourceWithPriorities : public clang::ExternalSemaSource { } void CompleteType(clang::TagDecl *Tag) override { - while (!Tag->isCompleteDefinition()) - for (size_t i = 0; i < Sources.size(); ++i) { - // FIXME: We are technically supposed to loop here too until - // Tag->isCompleteDefinition() is true, but if our low quality source - // is failing to complete the tag this code will deadlock. - Sources[i]->CompleteType(Tag); - if (Tag->isCompleteDefinition()) - break; - } + for (clang::ExternalSemaSource *S : Sources) { + S->CompleteType(Tag); + // Stop after the first source completed the type. 
+ if (Tag->isCompleteDefinition()) + break; + } } void CompleteType(clang::ObjCInterfaceDecl *Class) override { diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTImporter.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTImporter.cpp index 73042c205a5ae..e2601a059bb77 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTImporter.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTImporter.cpp @@ -216,7 +216,12 @@ namespace { /// imported while completing the original Decls). class CompleteTagDeclsScope : public ClangASTImporter::NewDeclListener { ClangASTImporter::ImporterDelegateSP m_delegate; - llvm::SmallVector m_decls_to_complete; + /// List of declarations in the target context that need to be completed. + /// Every declaration should only be completed once and therefore should + /// appear in this list only once. + llvm::SetVector m_decls_to_complete; + /// Set of declarations that already were successfully completed (not just + /// added to m_decls_to_complete). llvm::SmallPtrSet m_decls_already_completed; clang::ASTContext *m_dst_ctx; clang::ASTContext *m_src_ctx; @@ -244,6 +249,9 @@ class CompleteTagDeclsScope : public ClangASTImporter::NewDeclListener { NamedDecl *decl = m_decls_to_complete.pop_back_val(); m_decls_already_completed.insert(decl); + // The decl that should be completed has to be imported into the target + // context from some other context. + assert(to_context_md->hasOrigin(decl)); // We should only complete decls coming from the source context. assert(to_context_md->getOrigin(decl).ctx == m_src_ctx); @@ -287,7 +295,8 @@ class CompleteTagDeclsScope : public ClangASTImporter::NewDeclListener { // Check if we already completed this type. if (m_decls_already_completed.count(to_named_decl) != 0) return; - m_decls_to_complete.push_back(to_named_decl); + // Queue this type to be completed.
+ m_decls_to_complete.insert(to_named_decl); } }; } // namespace diff --git a/lldb/source/Plugins/ExpressionParser/Clang/CxxModuleHandler.cpp b/lldb/source/Plugins/ExpressionParser/Clang/CxxModuleHandler.cpp index 2f8cf1846ee77..38d9f8d1e4b80 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/CxxModuleHandler.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/CxxModuleHandler.cpp @@ -34,6 +34,7 @@ CxxModuleHandler::CxxModuleHandler(ASTImporter &importer, ASTContext *target) "weak_ptr", // utility "allocator", + "pair", }; m_supported_templates.insert(supported_names.begin(), supported_names.end()); } diff --git a/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp b/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp index 3dc07678f92f5..b3209160cecf0 100644 --- a/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp @@ -388,7 +388,7 @@ bool lldb_private::formatters::NSDictionarySummaryProvider( return false; ObjCLanguageRuntime::ClassDescriptorSP descriptor( - runtime->GetClassDescriptor(valobj)); + runtime->GetNonKVOClassDescriptor(valobj)); if (!descriptor || !descriptor->IsValid()) return false; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugAranges.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugAranges.cpp index 7dc52c1e2df06..7062c9bfae235 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugAranges.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugAranges.cpp @@ -39,7 +39,7 @@ DWARFDebugAranges::extract(const DWARFDataExtractor &debug_aranges_data) { Range range; while (debug_aranges_data.ValidOffset(offset)) { llvm::Error error = set.extract(debug_aranges_data, &offset); - if (!error) + if (error) return error; const uint32_t num_descriptors = set.NumDescriptors(); diff --git a/lldb/test/API/commands/expression/import-std-module/forward_decl_from_module/Makefile b/lldb/test/API/commands/expression/import-std-module/forward_decl_from_module/Makefile new file mode 100644 index 0000000000000..4915cdae87641 --- /dev/null +++ b/lldb/test/API/commands/expression/import-std-module/forward_decl_from_module/Makefile @@ -0,0 +1,9 @@ +# We don't have any standard include directories, so we can't +# parse the test_common.h header we usually inject as it includes +# system headers. +NO_TEST_COMMON_H := 1 + +CXXFLAGS_EXTRAS = -I $(SRCDIR)/root/usr/include/c++/v1/ -I $(SRCDIR)/root/usr/include/ -nostdinc -nostdinc++ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/commands/expression/import-std-module/forward_decl_from_module/TestForwardDeclFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/forward_decl_from_module/TestForwardDeclFromStdModule.py new file mode 100644 index 0000000000000..48459abb92668 --- /dev/null +++ b/lldb/test/API/commands/expression/import-std-module/forward_decl_from_module/TestForwardDeclFromStdModule.py @@ -0,0 +1,39 @@ +""" +Tests forward declarations coming from the `std` module. +""" + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil +import os + +class TestCase(TestBase): + + mydir = TestBase.compute_mydir(__file__) + + # We only emulate a fake libc++ in this test and don't use the real libc++, + # but we still add the libc++ category so that this test is only run in + # test configurations where libc++ is actually supposed to be tested. 
+ @add_test_categories(["libc++"]) + @skipIfRemote + @skipIf(compiler=no_match("clang")) + def test(self): + self.build() + + sysroot = os.path.join(os.getcwd(), "root") + + # Set the sysroot where our dummy libc++ exists. + self.runCmd("platform select --sysroot '" + sysroot + "' host", CURRENT_EXECUTABLE_SET) + + lldbutil.run_to_source_breakpoint(self, + "// Set break point at this line.", lldb.SBFileSpec("main.cpp")) + + self.runCmd("settings set target.import-std-module true") + + # Print the dummy `std::vector`. It only has the dummy member in it + # so the standard `std::vector` formatter can't format it. Instead use + # the raw output so LLDB has to show the member variable. + # Both `std::vector` and the type of the member have forward + # declarations before their definitions. + self.expect("expr --raw -- v", + substrs=['(std::__1::vector) $0 = {', 'f = 0x', '}']) diff --git a/lldb/test/API/commands/expression/import-std-module/forward_decl_from_module/main.cpp b/lldb/test/API/commands/expression/import-std-module/forward_decl_from_module/main.cpp new file mode 100644 index 0000000000000..a0b02d5c68141 --- /dev/null +++ b/lldb/test/API/commands/expression/import-std-module/forward_decl_from_module/main.cpp @@ -0,0 +1,8 @@ +#include + +int main(int argc, char **argv) { + // Makes sure we have the mock libc headers in the debug information. + libc_struct s; + std::vector v; + return 0; // Set break point at this line. +} diff --git a/lldb/test/API/commands/expression/import-std-module/forward_decl_from_module/root/usr/include/c++/v1/module.modulemap b/lldb/test/API/commands/expression/import-std-module/forward_decl_from_module/root/usr/include/c++/v1/module.modulemap new file mode 100644 index 0000000000000..f149be7b7d21a --- /dev/null +++ b/lldb/test/API/commands/expression/import-std-module/forward_decl_from_module/root/usr/include/c++/v1/module.modulemap @@ -0,0 +1,3 @@ +module std { + module "vector" { header "vector" export * } +} diff --git a/lldb/test/API/commands/expression/import-std-module/forward_decl_from_module/root/usr/include/c++/v1/vector b/lldb/test/API/commands/expression/import-std-module/forward_decl_from_module/root/usr/include/c++/v1/vector new file mode 100644 index 0000000000000..c2d77aab07110 --- /dev/null +++ b/lldb/test/API/commands/expression/import-std-module/forward_decl_from_module/root/usr/include/c++/v1/vector @@ -0,0 +1,14 @@ +#include "libc_header.h" + +namespace std { + inline namespace __1 { + // A forward decl of `vector`. + template class vector; + // Pretend to be a std::vector template we need to instantiate in LLDB + // when import-std-module is enabled. + template + struct vector { class F; F *f; }; + // The definition of our forward declared nested class. 
+ template class vector::F { int x; }; + } +} diff --git a/lldb/test/API/commands/expression/import-std-module/forward_decl_from_module/root/usr/include/libc_header.h b/lldb/test/API/commands/expression/import-std-module/forward_decl_from_module/root/usr/include/libc_header.h new file mode 100644 index 0000000000000..47525c9db3467 --- /dev/null +++ b/lldb/test/API/commands/expression/import-std-module/forward_decl_from_module/root/usr/include/libc_header.h @@ -0,0 +1 @@ +struct libc_struct {}; diff --git a/lldb/test/API/commands/expression/import-std-module/pair/Makefile b/lldb/test/API/commands/expression/import-std-module/pair/Makefile new file mode 100644 index 0000000000000..f938f7428468a --- /dev/null +++ b/lldb/test/API/commands/expression/import-std-module/pair/Makefile @@ -0,0 +1,3 @@ +USE_LIBCPP := 1 +CXX_SOURCES := main.cpp +include Makefile.rules diff --git a/lldb/test/API/commands/expression/import-std-module/pair/TestPairFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/pair/TestPairFromStdModule.py new file mode 100644 index 0000000000000..4f5b1ea8028b0 --- /dev/null +++ b/lldb/test/API/commands/expression/import-std-module/pair/TestPairFromStdModule.py @@ -0,0 +1,25 @@ +""" +Test basic std::pair functionality. +""" + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + +class TestCase(TestBase): + + mydir = TestBase.compute_mydir(__file__) + + @add_test_categories(["libc++"]) + @skipIf(compiler=no_match("clang")) + def test(self): + self.build() + + lldbutil.run_to_source_breakpoint(self, + "// Set break point at this line.", lldb.SBFileSpec("main.cpp")) + + self.runCmd("settings set target.import-std-module true") + + self.expect_expr("pair_int.first", result_type="int", result_value="1234") + self.expect_expr("pair_int.second", result_type="int", result_value="5678") + self.expect("expr pair_int", substrs=['first = 1234, second = 5678']) \ No newline at end of file diff --git a/lldb/test/API/commands/expression/import-std-module/pair/main.cpp b/lldb/test/API/commands/expression/import-std-module/pair/main.cpp new file mode 100644 index 0000000000000..1363698f1fc7f --- /dev/null +++ b/lldb/test/API/commands/expression/import-std-module/pair/main.cpp @@ -0,0 +1,6 @@ +#include + +int main(int argc, char **argv) { + std::pair pair_int(1234, 5678); + return 0; // Set break point at this line. 
+} diff --git a/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py b/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py index 6a3f40ff3a35b..b26af93525dc9 100644 --- a/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py +++ b/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py @@ -3,6 +3,7 @@ """ import os +import json import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * @@ -56,6 +57,41 @@ def test_scripted_extra_args(self): self.setup_targets_and_cleanup() self.do_check_extra_args() + def test_structured_data_serialization(self): + target = self.dbg.GetDummyTarget() + self.assertTrue(target.IsValid(), VALID_TARGET) + + interpreter = self.dbg.GetCommandInterpreter() + result = lldb.SBCommandReturnObject() + interpreter.HandleCommand("br set -f foo -l 42", result) + result = lldb.SBCommandReturnObject() + interpreter.HandleCommand("br set -c 'argc == 1' -n main", result) + + bkp1 = target.GetBreakpointAtIndex(0) + self.assertTrue(bkp1.IsValid(), VALID_BREAKPOINT) + stream = lldb.SBStream() + sd = bkp1.SerializeToStructuredData() + sd.GetAsJSON(stream) + serialized_data = json.loads(stream.GetData()) + self.assertEqual(serialized_data["Breakpoint"]["BKPTResolver"]["Options"]["FileName"], "foo") + self.assertEqual(serialized_data["Breakpoint"]["BKPTResolver"]["Options"]["LineNumber"], 42) + + bkp2 = target.GetBreakpointAtIndex(1) + self.assertTrue(bkp2.IsValid(), VALID_BREAKPOINT) + stream = lldb.SBStream() + sd = bkp2.SerializeToStructuredData() + sd.GetAsJSON(stream) + serialized_data = json.loads(stream.GetData()) + self.assertIn("main", serialized_data["Breakpoint"]["BKPTResolver"]["Options"]["SymbolNames"]) + self.assertEqual(serialized_data["Breakpoint"]["BKPTOptions"]["ConditionText"],"argc == 1") + + invalid_bkp = lldb.SBBreakpoint() + self.assertFalse(invalid_bkp.IsValid(), "Breakpoint should not be valid.") + stream = lldb.SBStream() + sd = invalid_bkp.SerializeToStructuredData() + sd.GetAsJSON(stream) + self.assertFalse(stream.GetData(), "Invalid breakpoint should have an empty structured data") + def setup_targets_and_cleanup(self): def cleanup (): self.RemoveTempFile(self.bkpts_file_path) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSContainer.py b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSContainer.py index d13d5d5df1d5b..05367c144b302 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSContainer.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSContainer.py @@ -21,7 +21,7 @@ def test_nscontainers_with_run_command(self): def nscontainers_data_formatter_commands(self): self.expect( - 'frame variable newArray nsDictionary newDictionary nscfDictionary cfDictionaryRef newMutableDictionary cfarray_ref mutable_array_ref', + 'frame variable newArray nsDictionary newDictionary nscfDictionary cfDictionaryRef newMutableDictionary newMutableDictionaryRef cfarray_ref mutable_array_ref', substrs=[ '(NSArray *) newArray = ', ' @"50 elements"', @@ -35,6 +35,8 @@ def nscontainers_data_formatter_commands(self): ' 2 key/value pairs', '(NSDictionary *) newMutableDictionary = ', ' 21 key/value pairs', + '(CFMutableDictionaryRef) newMutableDictionaryRef = ', + ' 21 key/value pairs', '(CFArrayRef) cfarray_ref = ', ' @"3 elements"', 
'(CFMutableArrayRef) mutable_array_ref = ', diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/main.m b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/main.m index 169b3aed4f222..409cb0a993f9d 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/main.m +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/main.m @@ -476,6 +476,8 @@ int main(int argc, const char *argv[]) { [newMutableDictionary setObject:@"foo" forKey:@"bar19"]; [newMutableDictionary setObject:@"foo" forKey:@"bar20"]; + CFMutableDictionaryRef newMutableDictionaryRef = CFDictionaryCreateMutableCopy(kCFAllocatorDefault, 0, newMutableDictionary); + id cfKeys[4] = {@"foo", @"bar", @"baz", @"quux"}; id cfValues[4] = {@"foo", @"bar", @"baz", @"quux"}; NSDictionary *nsDictionary = CFBridgingRelease( diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestQemuAArch64TargetXml.py b/lldb/test/API/functionalities/gdb_remote_client/TestQemuAArch64TargetXml.py new file mode 100644 index 0000000000000..9368de7b055aa --- /dev/null +++ b/lldb/test/API/functionalities/gdb_remote_client/TestQemuAArch64TargetXml.py @@ -0,0 +1,73 @@ +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from gdbclientutils import * +from textwrap import dedent + +class MyResponder(MockGDBServerResponder): + def qXferRead(self, obj, annex, offset, length): + if annex == "target.xml": + return dedent("""\ + + + aarch64 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + """), False + else: + return None, False + +class TestQemuAarch64TargetXml(GDBRemoteTestBase): + + @skipIfXmlSupportMissing + @skipIfRemote + @skipIfLLVMTargetMissing("AArch64") + def test_register_augmentation(self): + """ + Test that we correctly associate the register info with the eh_frame + register numbers. + """ + + target = self.createTarget("basic_eh_frame-aarch64.yaml") + self.server.responder = MyResponder() + + process = self.connect(target) + lldbutil.expect_state_changes(self, self.dbg.GetListener(), process, + [lldb.eStateStopped]) + self.filecheck("image show-unwind -n foo", __file__, + "--check-prefix=UNWIND") +# UNWIND: eh_frame UnwindPlan: +# UNWIND: row[0]: 0: CFA=x29+16 => x30=[CFA-8] diff --git a/lldb/test/API/functionalities/gdb_remote_client/basic_eh_frame-aarch64.yaml b/lldb/test/API/functionalities/gdb_remote_client/basic_eh_frame-aarch64.yaml new file mode 100644 index 0000000000000..acc66082495e7 --- /dev/null +++ b/lldb/test/API/functionalities/gdb_remote_client/basic_eh_frame-aarch64.yaml @@ -0,0 +1,25 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x0000000000401000 + AddressAlign: 0x0000000000000001 + Content: DEADBEEF + - Name: .eh_frame + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x0000000000402000 + AddressAlign: 0x0000000000000008 + Content: 0c000000000000000100017C1E0000001c0000001400000000104000000000000100000000000000000C1d109e820000 +Symbols: + - Name: foo + Section: .text + Binding: STB_GLOBAL + Value: 0x0000000000401000 +... 
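The SBBreakpoint::SerializeToStructuredData API exercised in TestBreakpointSerialization above is worth seeing end to end. A minimal standalone sketch, assuming the Python bindings built from this patch (file name and line number are illustrative):

import json
import lldb

lldb.SBDebugger.Initialize()
dbg = lldb.SBDebugger.Create()
target = dbg.GetDummyTarget()
# Any breakpoint works; a file/line breakpoint mirrors the test above.
bkpt = target.BreakpointCreateByLocation("foo.c", 42)

sd = bkpt.SerializeToStructuredData()
stream = lldb.SBStream()
sd.GetAsJSON(stream)
desc = json.loads(stream.GetData())
print(desc["Breakpoint"]["BKPTResolver"]["Options"]["FileName"])    # foo.c
print(desc["Breakpoint"]["BKPTResolver"]["Options"]["LineNumber"])  # 42
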
diff --git a/lldb/test/API/lang/c/record_decl_in_expr/TestRecordDeclInExpr.py b/lldb/test/API/lang/c/record_decl_in_expr/TestRecordDeclInExpr.py new file mode 100644 index 0000000000000..16bf098dce8f3 --- /dev/null +++ b/lldb/test/API/lang/c/record_decl_in_expr/TestRecordDeclInExpr.py @@ -0,0 +1,34 @@ +""" +Tests declaring RecordDecls in non-top-level expressions. +""" + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + +class TestCase(TestBase): + + mydir = TestBase.compute_mydir(__file__) + + @no_debug_info_test + def test_fwd_decl(self): + # Declare a forward decl and import it to the scratch AST. + self.expect_expr("struct S; S *s = nullptr; s", result_type="S *") + + @no_debug_info_test + def test_struct(self): + # Declare a struct and import it to the scratch AST. + self.expect("expr struct S {}; S s; s", substrs=["= {}"]) + + @no_debug_info_test + def test_struct_with_fwd_decl(self): + # Import the forward decl to the scratch AST. + self.expect_expr("struct S; S *s = nullptr; s", result_type="S *") + # Merge the definition into the scratch AST. + self.expect("expr struct S {}; S s; s", substrs=["= {}"]) + + @no_debug_info_test + def test_struct_with_fwd_decl_same_expr(self): + # Test both a forward decl and a definition in one expression and + # import them into the scratch AST. + self.expect("expr struct S; struct S{}; S s; s", substrs=["= {}"]) diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in index 6554d05d7df97..f2e1f855fe390 100644 --- a/lldb/test/API/lit.site.cfg.py.in +++ b/lldb/test/API/lit.site.cfg.py.in @@ -58,6 +58,7 @@ try: config.test_compiler = config.test_compiler % lit_config.params config.dsymutil = config.dsymutil % lit_config.params config.filecheck = config.filecheck % lit_config.params + config.yaml2obj = config.yaml2obj % lit_config.params config.dotest_args_str = config.dotest_args_str % lit_config.params except KeyError as e: key, = e.args diff --git a/lldb/test/API/tools/lldb-vscode/runInTerminal/Makefile b/lldb/test/API/tools/lldb-vscode/runInTerminal/Makefile new file mode 100644 index 0000000000000..10495940055b6 --- /dev/null +++ b/lldb/test/API/tools/lldb-vscode/runInTerminal/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/tools/lldb-vscode/runInTerminal/TestVSCode_runInTerminal.py b/lldb/test/API/tools/lldb-vscode/runInTerminal/TestVSCode_runInTerminal.py new file mode 100644 index 0000000000000..6a463dfacc1f9 --- /dev/null +++ b/lldb/test/API/tools/lldb-vscode/runInTerminal/TestVSCode_runInTerminal.py @@ -0,0 +1,48 @@ +""" +Test lldb-vscode runInTerminal reverse request +""" + + +import unittest2 +import vscode +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil +import lldbvscode_testcase +import time +import os + + +class TestVSCode_runInTerminal(lldbvscode_testcase.VSCodeTestCaseBase): + + mydir = TestBase.compute_mydir(__file__) + + @skipUnlessDarwin + @skipIfRemote + def test_runInTerminal(self): + ''' + Tests the "runInTerminal" reverse request. It makes sure that the IDE can + launch the inferior with the correct environment variables and arguments. 
+ ''' + program = self.getBuildArtifact("a.out") + source = 'main.c' + self.build_and_launch(program, stopOnEntry=True, runInTerminal=True, args=["foobar"], env=["FOO=bar"]) + breakpoint_line = line_number(source, '// breakpoint') + + self.set_source_breakpoints(source, [breakpoint_line]) + self.continue_to_next_stop() + + # We verify we actually stopped inside the loop + counter = int(self.vscode.get_local_variable_value('counter')) + self.assertTrue(counter > 0) + + # We verify we were able to set the launch arguments + argc = int(self.vscode.get_local_variable_value('argc')) + self.assertEqual(argc, 2) + + argv1 = self.vscode.request_evaluate('argv[1]')['body']['result'] + self.assertIn('foobar', argv1) + + # We verify we were able to set the environment + env = self.vscode.request_evaluate('foo')['body']['result'] + self.assertIn('bar', env) diff --git a/lldb/test/API/tools/lldb-vscode/runInTerminal/main.c b/lldb/test/API/tools/lldb-vscode/runInTerminal/main.c new file mode 100644 index 0000000000000..676bd830e657b --- /dev/null +++ b/lldb/test/API/tools/lldb-vscode/runInTerminal/main.c @@ -0,0 +1,11 @@ +#include +#include +#include + +int main(int argc, char *argv[]) { + const char *foo = getenv("FOO"); + for (int counter = 1;; counter++) { + sleep(1); // breakpoint + } + return 0; +} diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/lua-python.test b/lldb/test/Shell/ScriptInterpreter/Lua/lua-python.test new file mode 100644 index 0000000000000..c40b8e068d9fe --- /dev/null +++ b/lldb/test/Shell/ScriptInterpreter/Lua/lua-python.test @@ -0,0 +1,17 @@ +# REQUIRES: lua +# REQUIRES: python +# UNSUPPORTED: lldb-repro + +# RUN: mkdir -p %t +# RUN: cd %t +# RUN: echo "int main() { return 0; }" | %clang_host -x c - -o a.out +# RUN: cat %s | %lldb 2>&1 | FileCheck %s +script -l lua -- +target = lldb.debugger:CreateTarget("a.out") +print("target is valid:", tostring(target:IsValid())) +lldb.debugger:SetSelectedTarget(target) +quit +# CHECK: target is valid: true +script -l python -- +print("selected target: {}".format(lldb.debugger.GetSelectedTarget())) +# CHECK: selected target: a.out diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/lua.test b/lldb/test/Shell/ScriptInterpreter/Lua/lua.test index 70184edbab1a8..28042efa8c813 100644 --- a/lldb/test/Shell/ScriptInterpreter/Lua/lua.test +++ b/lldb/test/Shell/ScriptInterpreter/Lua/lua.test @@ -1,3 +1,7 @@ # REQUIRES: lua -# RUN: %lldb --script-language lua -o 'script print(1000+100+10+1)' 2>&1 | FileCheck %s +# RUN: %lldb --script-language lua -o 'script io.stdout:write(1000+100+10+1, "\n")' 2>&1 | FileCheck %s +# RUN: %lldb --script-language lua -o 'script -- io.stdout:write(1000+100+10+1, "\n")' 2>&1 | FileCheck %s +# RUN: %lldb --script-language lua -o 'script --language default -- io.stdout:write(1000+100+10+1, "\n")' 2>&1 | FileCheck %s +# RUN: %lldb -o 'script -l lua -- io.stdout:write(1000+100+10+1, "\n")' 2>&1 | FileCheck %s +# RUN: %lldb -o 'script --language lua -- io.stdout:write(1000+100+10+1, "\n")' 2>&1 | FileCheck %s # CHECK: 1111 diff --git a/lldb/test/Shell/ScriptInterpreter/Python/python.test b/lldb/test/Shell/ScriptInterpreter/Python/python.test new file mode 100644 index 0000000000000..77d20294bc476 --- /dev/null +++ b/lldb/test/Shell/ScriptInterpreter/Python/python.test @@ -0,0 +1,13 @@ +# REQUIRES: python +# RUN: %lldb --script-language python -o 'script print("{}".format(1000+100+10+1))' 2>&1 | FileCheck %s +# RUN: %lldb --script-language python -o 'script -- print("{}".format(1000+100+10+1))' 2>&1 | FileCheck %s +# RUN: 
%lldb --script-language python -o 'script --language default -- print("{}".format(1000+100+10+1))' 2>&1 | FileCheck %s +# RUN: %lldb -o 'script -l python -- print("{}".format(1000+100+10+1))' 2>&1 | FileCheck %s +# RUN: %lldb -o 'script -lpython -- print("{}".format(1000+100+10+1))' 2>&1 | FileCheck %s +# RUN: %lldb -o 'script --language python -- print("{}".format(1000+100+10+1))' 2>&1 | FileCheck %s +# RUN: %lldb -o 'script --language=python -- print("{}".format(1000+100+10+1))' 2>&1 | FileCheck %s +# CHECK: 1111 + +# RUN: %lldb -o 'script --language invalid -- print("{}".format(1000+100+10+1))' 2>&1 | FileCheck %s --check-prefix INVALID +# INVALID: error: unrecognized value for language 'invalid' +# INVALID-NOT: 1111 diff --git a/lldb/tools/debugserver/source/debugserver.cpp b/lldb/tools/debugserver/source/debugserver.cpp index 04cbd2c8b503e..feb65eb6d3fbe 100644 --- a/lldb/tools/debugserver/source/debugserver.cpp +++ b/lldb/tools/debugserver/source/debugserver.cpp @@ -156,18 +156,36 @@ RNBRunLoopMode RNBRunLoopGetStartModeFromRemote(RNBRemote *remote) { return eRNBRunLoopModeExit; } -// Check the name to see if it ends with .app -static bool is_dot_app (const char *app_name) { - size_t len = strlen(app_name); - if (len < 4) +static nub_launch_flavor_t default_launch_flavor(const char *app_name) { +#if defined(WITH_FBS) || defined(WITH_BKS) || defined(WITH_SPRINGBOARD) + // Check the name to see if it ends with .app + auto is_dot_app = [](const char *app_name) { + size_t len = strlen(app_name); + if (len < 4) + return false; + + if (app_name[len - 4] == '.' && app_name[len - 3] == 'a' && + app_name[len - 2] == 'p' && app_name[len - 1] == 'p') + return true; return false; - - if (app_name[len - 4] == '.' && - app_name[len - 3] == 'a' && - app_name[len - 2] == 'p' && - app_name[len - 1] == 'p') - return true; - return false; + }; + + if (is_dot_app(app_name)) { +#if defined WITH_FBS + // Check if we have an app bundle, if so launch using FrontBoard Services. + return eLaunchFlavorFBS; +#elif defined WITH_BKS + // Check if we have an app bundle, if so launch using BackBoard Services. + return eLaunchFlavorBKS; +#elif defined WITH_SPRINGBOARD + // Check if we have an app bundle, if so launch using SpringBoard. + return eLaunchFlavorSpringBoard; +#endif + } +#endif + + // Our default launch method is posix spawn + return eLaunchFlavorPosixSpawn; } // This run loop mode will wait for the process to launch and hit its @@ -208,29 +226,8 @@ RNBRunLoopMode RNBRunLoopLaunchInferior(RNBRemote *remote, // figure our how we are going to launch automatically. nub_launch_flavor_t launch_flavor = g_launch_flavor; - if (launch_flavor == eLaunchFlavorDefault) { - // Our default launch method is posix spawn - launch_flavor = eLaunchFlavorPosixSpawn; - - const bool dot_app = is_dot_app(inferior_argv[0]); - (void)dot_app; -#if defined WITH_FBS - // Check if we have an app bundle, if so launch using BackBoard Services. - if (dot_app) { - launch_flavor = eLaunchFlavorFBS; - } -#elif defined WITH_BKS - // Check if we have an app bundle, if so launch using BackBoard Services. - if (dot_app) { - launch_flavor = eLaunchFlavorBKS; - } -#elif defined WITH_SPRINGBOARD - // Check if we have an app bundle, if so launch using SpringBoard. 
- if (dot_app) { - launch_flavor = eLaunchFlavorSpringBoard; - } -#endif - } + if (launch_flavor == eLaunchFlavorDefault) + launch_flavor = default_launch_flavor(inferior_argv[0]); ctx.SetLaunchFlavor(launch_flavor); char resolved_path[PATH_MAX]; @@ -1509,27 +1506,8 @@ int main(int argc, char *argv[]) { timeout_ptr = &attach_timeout_abstime; } nub_launch_flavor_t launch_flavor = g_launch_flavor; - if (launch_flavor == eLaunchFlavorDefault) { - // Our default launch method is posix spawn - launch_flavor = eLaunchFlavorPosixSpawn; - -#if defined WITH_FBS - // Check if we have an app bundle, if so launch using SpringBoard. - if (is_dot_app(waitfor_pid_name.c_str())) { - launch_flavor = eLaunchFlavorFBS; - } -#elif defined WITH_BKS - // Check if we have an app bundle, if so launch using SpringBoard. - if (is_dot_app(waitfor_pid_name.c_str())) { - launch_flavor = eLaunchFlavorBKS; - } -#elif defined WITH_SPRINGBOARD - // Check if we have an app bundle, if so launch using SpringBoard. - if (is_dot_app(waitfor_pid_name.c_str())) { - launch_flavor = eLaunchFlavorSpringBoard; - } -#endif - } + if (launch_flavor == eLaunchFlavorDefault) + launch_flavor = default_launch_flavor(waitfor_pid_name.c_str()); ctx.SetLaunchFlavor(launch_flavor); bool ignore_existing = false; diff --git a/lldb/tools/lldb-vscode/JSONUtils.cpp b/lldb/tools/lldb-vscode/JSONUtils.cpp index 36156ca2c42f9..044bfd13ec463 100644 --- a/lldb/tools/lldb-vscode/JSONUtils.cpp +++ b/lldb/tools/lldb-vscode/JSONUtils.cpp @@ -998,4 +998,44 @@ llvm::json::Value CreateCompileUnit(lldb::SBCompileUnit unit) { return llvm::json::Value(std::move(object)); } +/// See +/// https://microsoft.github.io/debug-adapter-protocol/specification#Reverse_Requests_RunInTerminal +llvm::json::Object +CreateRunInTerminalReverseRequest(const llvm::json::Object &launch_request) { + llvm::json::Object reverse_request; + reverse_request.try_emplace("type", "request"); + reverse_request.try_emplace("command", "runInTerminal"); + + llvm::json::Object run_in_terminal_args; + // This tells the IDE to open an embedded terminal, instead of opening the + // terminal in a new window. + run_in_terminal_args.try_emplace("kind", "integrated"); + + auto launch_request_arguments = launch_request.getObject("arguments"); + std::vector<std::string> args = GetStrings(launch_request_arguments, "args"); + // The program path must be the first entry in the "args" field + args.insert(args.begin(), + GetString(launch_request_arguments, "program").str()); + run_in_terminal_args.try_emplace("args", args); + + const auto cwd = GetString(launch_request_arguments, "cwd"); + if (!cwd.empty()) + run_in_terminal_args.try_emplace("cwd", cwd); + + // We need to convert the input list of environment variables into a + // dictionary + std::vector<std::string> envs = GetStrings(launch_request_arguments, "env"); + llvm::json::Object environment; + for (const std::string &env : envs) { + size_t index = env.find("="); + environment.try_emplace(env.substr(0, index), env.substr(index + 1)); + } + run_in_terminal_args.try_emplace("env", + llvm::json::Value(std::move(environment))); + + reverse_request.try_emplace( + "arguments", llvm::json::Value(std::move(run_in_terminal_args))); + return reverse_request; +} + } // namespace lldb_vscode
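The env handling above converts DAP's list-of-strings form ("KEY=value") into the object form that runInTerminal expects, splitting on the first '=' only. The same transformation in a few lines of Python, for reference:

env_list = ["FOO=bar", "PATH=/usr/bin:/bin"]
environment = {}
for e in env_list:
    key, _, value = e.partition("=")  # split on the first '=' only
    environment[key] = value
# environment == {"FOO": "bar", "PATH": "/usr/bin:/bin"}
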
+ run_in_terminal_args.try_emplace("kind", "integrated"); + + auto launch_request_arguments = launch_request.getObject("arguments"); + std::vector args = GetStrings(launch_request_arguments, "args"); + // The program path must be the first entry in the "args" field + args.insert(args.begin(), + GetString(launch_request_arguments, "program").str()); + run_in_terminal_args.try_emplace("args", args); + + const auto cwd = GetString(launch_request_arguments, "cwd"); + if (!cwd.empty()) + run_in_terminal_args.try_emplace("cwd", cwd); + + // We need to convert the input list of environments variables into a + // dictionary + std::vector envs = GetStrings(launch_request_arguments, "env"); + llvm::json::Object environment; + for (const std::string &env : envs) { + size_t index = env.find("="); + environment.try_emplace(env.substr(0, index), env.substr(index + 1)); + } + run_in_terminal_args.try_emplace("env", + llvm::json::Value(std::move(environment))); + + reverse_request.try_emplace( + "arguments", llvm::json::Value(std::move(run_in_terminal_args))); + return reverse_request; +} + } // namespace lldb_vscode diff --git a/lldb/tools/lldb-vscode/JSONUtils.h b/lldb/tools/lldb-vscode/JSONUtils.h index df4428f390ba2..88cbef9e5fdd4 100644 --- a/lldb/tools/lldb-vscode/JSONUtils.h +++ b/lldb/tools/lldb-vscode/JSONUtils.h @@ -443,6 +443,18 @@ llvm::json::Value CreateVariable(lldb::SBValue v, int64_t variablesReference, llvm::json::Value CreateCompileUnit(lldb::SBCompileUnit unit); +/// Create a runInTerminal reverse request object +/// +/// \param[in] launch_request +/// The original launch_request object whose fields are used to construct +/// the reverse request object. +/// +/// \return +/// A "runInTerminal" JSON object that follows the specification outlined by +/// Microsoft. 
+llvm::json::Object +CreateRunInTerminalReverseRequest(const llvm::json::Object &launch_request); + } // namespace lldb_vscode #endif diff --git a/lldb/tools/lldb-vscode/VSCode.cpp b/lldb/tools/lldb-vscode/VSCode.cpp index 537cae7868631..9450cdf3132a1 100644 --- a/lldb/tools/lldb-vscode/VSCode.cpp +++ b/lldb/tools/lldb-vscode/VSCode.cpp @@ -38,7 +38,8 @@ VSCode::VSCode() {"swift_catch", "Swift Catch", lldb::eLanguageTypeSwift}, {"swift_throw", "Swift Throw", lldb::eLanguageTypeSwift}}), focus_tid(LLDB_INVALID_THREAD_ID), sent_terminated_event(false), - stop_at_entry(false), is_attach(false) { + stop_at_entry(false), is_attach(false), + reverse_request_seq(0), waiting_for_run_in_terminal(false) { const char *log_file_path = getenv("LLDBVSCODE_LOG"); #if defined(_WIN32) // Windows opens stdout and stdin in text mode which converts \n to 13,10 @@ -362,4 +363,71 @@ void VSCode::SetTarget(const lldb::SBTarget target) { } } +PacketStatus VSCode::GetNextObject(llvm::json::Object &object) { + std::string json = ReadJSON(); + if (json.empty()) + return PacketStatus::EndOfFile; + + llvm::StringRef json_sref(json); + llvm::Expected<llvm::json::Value> json_value = llvm::json::parse(json_sref); + if (!json_value) { + auto error = json_value.takeError(); + if (log) { + std::string error_str; + llvm::raw_string_ostream strm(error_str); + strm << error; + strm.flush(); + *log << "error: failed to parse JSON: " << error_str << std::endl + << json << std::endl; + } + return PacketStatus::JSONMalformed; + } + if (!json_value->getAsObject()) { + if (log) + *log << "error: json packet isn't an object" << std::endl; + return PacketStatus::JSONNotObject; + } + object = *json_value->getAsObject(); + return PacketStatus::Success; +} + +bool VSCode::HandleObject(const llvm::json::Object &object) { + const auto packet_type = GetString(object, "type"); + if (packet_type == "request") { + const auto command = GetString(object, "command"); + auto handler_pos = request_handlers.find(std::string(command)); + if (handler_pos != request_handlers.end()) { + handler_pos->second(object); + return true; // Success + } else { + if (log) + *log << "error: unhandled command \"" << command.data() << "\"" + << std::endl; + return false; // Fail + } + } + return false; +} + +PacketStatus VSCode::SendReverseRequest(llvm::json::Object request, + llvm::json::Object &response) { + request.try_emplace("seq", ++reverse_request_seq); + SendJSON(llvm::json::Value(std::move(request))); + while (true) { + PacketStatus status = GetNextObject(response); + const auto packet_type = GetString(response, "type"); + if (packet_type == "response") + return status; + else { + // Not our response, we got another packet + HandleObject(response); + } + } + return PacketStatus::EndOfFile; +} + +void VSCode::RegisterRequestCallback(std::string request, + RequestCallback callback) { + request_handlers[request] = callback; +} + } // namespace lldb_vscode diff --git a/lldb/tools/lldb-vscode/VSCode.h b/lldb/tools/lldb-vscode/VSCode.h index 88a0c08de2454..28e9eef13d6b3 100644 --- a/lldb/tools/lldb-vscode/VSCode.h +++ b/lldb/tools/lldb-vscode/VSCode.h @@ -9,6 +9,7 @@ #ifndef LLDB_TOOLS_LLDB_VSCODE_VSCODE_H #define LLDB_TOOLS_LLDB_VSCODE_VSCODE_H +#include <condition_variable> #include #include #include @@ -19,6 +20,7 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/JSON.h" #include "llvm/Support/raw_ostream.h" #include "lldb/API/SBAttachInfo.h" @@ -65,6 +67,15 @@ enum class OutputType { Console, Stdout, Stderr, Telemetry }; enum 
VSCodeBroadcasterBits { eBroadcastBitStopEventThread = 1u << 0 }; +typedef void (*RequestCallback)(const llvm::json::Object &command); + +enum class PacketStatus { + Success = 0, + EndOfFile, + JSONMalformed, + JSONNotObject +}; + struct VSCode { InputStream input; OutputStream output; @@ -91,6 +102,10 @@ struct VSCode { bool sent_terminated_event; bool stop_at_entry; bool is_attach; + uint32_t reverse_request_seq; + std::map<std::string, RequestCallback> request_handlers; + std::condition_variable request_in_terminal_cv; + bool waiting_for_run_in_terminal; // Keep track of the last stop thread index IDs as threads won't go away // unless we send a "thread" event to indicate the thread exited. llvm::DenseSet<lldb::tid_t> thread_ids; @@ -152,6 +167,36 @@ struct VSCode { /// Set the given target object as the current target for lldb-vscode and /// start listening for its breakpoint events. void SetTarget(const lldb::SBTarget target); + + const std::map<std::string, RequestCallback> &GetRequestHandlers(); + + PacketStatus GetNextObject(llvm::json::Object &object); + bool HandleObject(const llvm::json::Object &object); + + /// Send a Debug Adapter Protocol reverse request to the IDE /// /// \param[in] request /// The payload of the request to send. /// /// \param[out] response /// The response of the IDE. It might be undefined if there was an error. /// /// \return /// A \a PacketStatus object indicating the success or failure of the /// request. + PacketStatus SendReverseRequest(llvm::json::Object request, + llvm::json::Object &response); + + /// Registers a callback handler for a Debug Adapter Protocol request /// /// \param[in] request /// The name of the request following the Debug Adapter Protocol /// specification. /// /// \param[in] callback /// The callback to execute when the given request is triggered by the /// IDE. + void RegisterRequestCallback(std::string request, RequestCallback callback); }; extern VSCode g_vsc; diff --git a/lldb/tools/lldb-vscode/lldb-vscode.cpp b/lldb/tools/lldb-vscode/lldb-vscode.cpp index 54f2e653d0697..3b0817c71e62f 100644 --- a/lldb/tools/lldb-vscode/lldb-vscode.cpp +++ b/lldb/tools/lldb-vscode/lldb-vscode.cpp @@ -384,7 +384,12 @@ void EventThreadFunction() { break; case lldb::eStateSuspended: break; - case lldb::eStateStopped: + case lldb::eStateStopped: { + if (g_vsc.waiting_for_run_in_terminal) { + g_vsc.waiting_for_run_in_terminal = false; + g_vsc.request_in_terminal_cv.notify_one(); + } + } // Only report a stopped event if the process was not restarted. if (!lldb::SBProcess::GetRestartedFromEvent(event)) { SendStdOutStdErr(process); @@ -1374,6 +1379,9 @@ void request_initialize(const llvm::json::Object &request) { filters.emplace_back(CreateExceptionBreakpointFilter(exc_bp)); } body.try_emplace("exceptionBreakpointFilters", std::move(filters)); + // The debug adapter supports launching a debuggee in the integrated VSCode + // terminal. + body.try_emplace("supportsRunInTerminalRequest", true); // The debug adapter supports stepping back via the stepBack and // reverseContinue requests. body.try_emplace("supportsStepBack", false); @@ -1433,6 +1441,49 @@ void request_initialize(const llvm::json::Object &request) { g_vsc.SendJSON(llvm::json::Value(std::move(response))); } +void request_runInTerminal(const llvm::json::Object &launch_request, + llvm::json::Object &launch_response) { + // We have already created a target that has a valid "program" path to the + // executable. We will attach to the next process whose name matches that + // of the target executable.
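+ // Sketch of the flow implemented below: kick off an asynchronous + // wait-for-launch attach, ask the IDE to launch the program in a terminal + // via the runInTerminal reverse request, then wait on a condition variable + // until the attach stop event arrives (or a 10 second timeout expires) + // before filling in the launch response.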
+ g_vsc.is_attach = true; + lldb::SBAttachInfo attach_info; + lldb::SBError error; + attach_info.SetWaitForLaunch(true, /*async*/ true); + g_vsc.target.Attach(attach_info, error); + + llvm::json::Object reverse_request = + CreateRunInTerminalReverseRequest(launch_request); + llvm::json::Object reverse_response; + lldb_vscode::PacketStatus status = + g_vsc.SendReverseRequest(reverse_request, reverse_response); + if (status != lldb_vscode::PacketStatus::Success) + error.SetErrorString("Process cannot be launched by the IDE."); + + if (error.Success()) { + // Wait for the attach stop event to happen or for a timeout. + g_vsc.waiting_for_run_in_terminal = true; + static std::mutex mutex; + std::unique_lock<std::mutex> locker(mutex); + g_vsc.request_in_terminal_cv.wait_for(locker, std::chrono::seconds(10)); + + auto attached_pid = g_vsc.target.GetProcess().GetProcessID(); + if (attached_pid == LLDB_INVALID_PROCESS_ID) + error.SetErrorString("Failed to attach to a process"); + else + SendProcessEvent(Attach); + } + + if (error.Fail()) { + launch_response["success"] = llvm::json::Value(false); + EmplaceSafeString(launch_response, "message", + std::string(error.GetCString())); + } else { + launch_response["success"] = llvm::json::Value(true); + g_vsc.SendJSON(CreateEventObject("initialized")); + } +} + // "LaunchRequest": { // "allOf": [ { "$ref": "#/definitions/Request" }, { // "type": "object", @@ -1505,6 +1556,12 @@ void request_launch(const llvm::json::Object &request) { return; } + if (GetBoolean(arguments, "runInTerminal", false)) { + request_runInTerminal(request, response); + g_vsc.SendJSON(llvm::json::Value(std::move(response))); + return; + } + // Instantiate a launch info instance for the target. auto launch_info = g_vsc.target.GetLaunchInfo(); @@ -2831,45 +2888,41 @@ void request__testGetTargetBreakpoints(const llvm::json::Object &request) { g_vsc.SendJSON(llvm::json::Value(std::move(response))); } -const std::map<std::string, RequestCallback> &GetRequestHandlers() { -#define REQUEST_CALLBACK(name) \ - { #name, request_##name } - static std::map<std::string, RequestCallback> g_request_handlers = { - // VSCode Debug Adaptor requests - REQUEST_CALLBACK(attach), - REQUEST_CALLBACK(completions), - REQUEST_CALLBACK(continue), - REQUEST_CALLBACK(configurationDone), - REQUEST_CALLBACK(disconnect), - REQUEST_CALLBACK(evaluate), - REQUEST_CALLBACK(exceptionInfo), - REQUEST_CALLBACK(getCompileUnits), - REQUEST_CALLBACK(initialize), - REQUEST_CALLBACK(launch), - REQUEST_CALLBACK(next), - REQUEST_CALLBACK(pause), - REQUEST_CALLBACK(scopes), - REQUEST_CALLBACK(setBreakpoints), - REQUEST_CALLBACK(setExceptionBreakpoints), - REQUEST_CALLBACK(setFunctionBreakpoints), - REQUEST_CALLBACK(setVariable), - REQUEST_CALLBACK(source), - REQUEST_CALLBACK(stackTrace), - REQUEST_CALLBACK(stepIn), - REQUEST_CALLBACK(stepOut), - REQUEST_CALLBACK(threads), - REQUEST_CALLBACK(variables), - // Testing requests - REQUEST_CALLBACK(_testGetTargetBreakpoints), - }; -#undef REQUEST_CALLBACK - return g_request_handlers; +void RegisterRequestCallbacks() { + g_vsc.RegisterRequestCallback("attach", request_attach); + g_vsc.RegisterRequestCallback("completions", request_completions); + g_vsc.RegisterRequestCallback("continue", request_continue); + g_vsc.RegisterRequestCallback("configurationDone", request_configurationDone); + g_vsc.RegisterRequestCallback("disconnect", request_disconnect); + g_vsc.RegisterRequestCallback("evaluate", request_evaluate); + g_vsc.RegisterRequestCallback("exceptionInfo", request_exceptionInfo); + g_vsc.RegisterRequestCallback("getCompileUnits", request_getCompileUnits); +
g_vsc.RegisterRequestCallback("initialize", request_initialize); + g_vsc.RegisterRequestCallback("launch", request_launch); + g_vsc.RegisterRequestCallback("next", request_next); + g_vsc.RegisterRequestCallback("pause", request_pause); + g_vsc.RegisterRequestCallback("scopes", request_scopes); + g_vsc.RegisterRequestCallback("setBreakpoints", request_setBreakpoints); + g_vsc.RegisterRequestCallback("setExceptionBreakpoints", + request_setExceptionBreakpoints); + g_vsc.RegisterRequestCallback("setFunctionBreakpoints", + request_setFunctionBreakpoints); + g_vsc.RegisterRequestCallback("setVariable", request_setVariable); + g_vsc.RegisterRequestCallback("source", request_source); + g_vsc.RegisterRequestCallback("stackTrace", request_stackTrace); + g_vsc.RegisterRequestCallback("stepIn", request_stepIn); + g_vsc.RegisterRequestCallback("stepOut", request_stepOut); + g_vsc.RegisterRequestCallback("threads", request_threads); + g_vsc.RegisterRequestCallback("variables", request_variables); + // Testing requests + g_vsc.RegisterRequestCallback("_testGetTargetBreakpoints", + request__testGetTargetBreakpoints); } } // anonymous namespace static void printHelp(LLDBVSCodeOptTable &table, llvm::StringRef tool_name) { - std::string usage_str = tool_name.str() + "options"; + std::string usage_str = tool_name.str() + " options"; table.PrintHelp(llvm::outs(), usage_str.c_str(), "LLDB VSCode", false); std::string examples = R"___( @@ -2895,6 +2948,8 @@ int main(int argc, char *argv[]) { // Initialize LLDB first before we do anything. lldb::SBDebugger::Initialize(); + RegisterRequestCallbacks(); + int portno = -1; LLDBVSCodeOptTable T; @@ -2937,49 +2992,17 @@ int main(int argc, char *argv[]) { g_vsc.output.descriptor = StreamDescriptor::from_file(fileno(stdout), false); } - auto request_handlers = GetRequestHandlers(); uint32_t packet_idx = 0; while (!g_vsc.sent_terminated_event) { - std::string json = g_vsc.ReadJSON(); - if (json.empty()) + llvm::json::Object object; + lldb_vscode::PacketStatus status = g_vsc.GetNextObject(object); + if (status == lldb_vscode::PacketStatus::EndOfFile) break; + if (status != lldb_vscode::PacketStatus::Success) + return 1; // Fatal error - llvm::StringRef json_sref(json); - llvm::Expected json_value = llvm::json::parse(json_sref); - if (!json_value) { - auto error = json_value.takeError(); - if (g_vsc.log) { - std::string error_str; - llvm::raw_string_ostream strm(error_str); - strm << error; - strm.flush(); - - *g_vsc.log << "error: failed to parse JSON: " << error_str << std::endl - << json << std::endl; - } - return 1; - } - - auto object = json_value->getAsObject(); - if (!object) { - if (g_vsc.log) - *g_vsc.log << "error: json packet isn't a object" << std::endl; + if (!g_vsc.HandleObject(object)) return 1; - } - - const auto packet_type = GetString(object, "type"); - if (packet_type == "request") { - const auto command = GetString(object, "command"); - auto handler_pos = request_handlers.find(std::string(command)); - if (handler_pos != request_handlers.end()) { - handler_pos->second(*object); - } else { - if (g_vsc.log) - *g_vsc.log << "error: unhandled command \"" << command.data() - << std::endl; - return 1; - } - } ++packet_idx; } diff --git a/lldb/tools/lldb-vscode/package.json b/lldb/tools/lldb-vscode/package.json index 29ca06dd17d63..9077ab51dd7fa 100644 --- a/lldb/tools/lldb-vscode/package.json +++ b/lldb/tools/lldb-vscode/package.json @@ -175,6 +175,11 @@ "type": "array", "description": "Commands executed at the end of debugging session.", "default": [] + }, + 
"runInTerminal": { + "type": "boolean", + "description": "Launch the program inside an integrated terminal in the IDE. Useful for debugging interactive command line programs", + "default": false } } }, diff --git a/lldb/unittests/Expression/CMakeLists.txt b/lldb/unittests/Expression/CMakeLists.txt index 2f5304ab212d9..0e8230d19bad9 100644 --- a/lldb/unittests/Expression/CMakeLists.txt +++ b/lldb/unittests/Expression/CMakeLists.txt @@ -11,5 +11,6 @@ add_lldb_unittest(ExpressionTests lldbPluginTypeSystemClang lldbUtility lldbUtilityHelpers + lldbSymbolHelpers LLVMTestingSupport ) diff --git a/lldb/unittests/SymbolFile/DWARF/CMakeLists.txt b/lldb/unittests/SymbolFile/DWARF/CMakeLists.txt index 64a7b78c478a1..30620a61dc5fd 100644 --- a/lldb/unittests/SymbolFile/DWARF/CMakeLists.txt +++ b/lldb/unittests/SymbolFile/DWARF/CMakeLists.txt @@ -11,8 +11,9 @@ add_lldb_unittest(SymbolFileDWARFTests lldbPluginSymbolFileDWARF lldbPluginSymbolFilePDB lldbPluginTypeSystemClang - lldbUtilityHelpers lldbPluginPlatformMacOSX + lldbUtilityHelpers + lldbSymbolHelpers LINK_COMPONENTS Support DebugInfoPDB diff --git a/lldb/unittests/SymbolFile/DWARF/SymbolFileDWARFTests.cpp b/lldb/unittests/SymbolFile/DWARF/SymbolFileDWARFTests.cpp index 8bf019ea9ed65..4898b94413cab 100644 --- a/lldb/unittests/SymbolFile/DWARF/SymbolFileDWARFTests.cpp +++ b/lldb/unittests/SymbolFile/DWARF/SymbolFileDWARFTests.cpp @@ -19,6 +19,7 @@ #include "Plugins/SymbolFile/DWARF/DWARFDataExtractor.h" #include "Plugins/SymbolFile/DWARF/DWARFDebugAbbrev.h" #include "Plugins/SymbolFile/DWARF/DWARFDebugArangeSet.h" +#include "Plugins/SymbolFile/DWARF/DWARFDebugAranges.h" #include "Plugins/SymbolFile/DWARF/SymbolFileDWARF.h" #include "Plugins/SymbolFile/PDB/SymbolFilePDB.h" #include "Plugins/TypeSystem/Clang/TypeSystemClang.h" @@ -70,7 +71,7 @@ TEST_F(SymbolFileDWARFTests, TestAbilitiesForDWARF) { TEST_F(SymbolFileDWARFTests, TestAbbrevOrder1Start1) { // Test that if we have a .debug_abbrev that contains ordered abbreviation // codes that start at 1, that we get O(1) access. - + const auto byte_order = eByteOrderLittle; const uint8_t addr_size = 4; StreamString encoder(Stream::eBinary, addr_size, byte_order); @@ -81,7 +82,7 @@ TEST_F(SymbolFileDWARFTests, TestAbbrevOrder1Start1) { encoder.PutULEB128(DW_FORM_strp); encoder.PutULEB128(0); encoder.PutULEB128(0); - + encoder.PutULEB128(2); // Abbrev code 2 encoder.PutULEB128(DW_TAG_subprogram); encoder.PutHex8(DW_CHILDREN_no); @@ -89,9 +90,9 @@ TEST_F(SymbolFileDWARFTests, TestAbbrevOrder1Start1) { encoder.PutULEB128(DW_FORM_strp); encoder.PutULEB128(0); encoder.PutULEB128(0); - + encoder.PutULEB128(0); // Abbrev code 0 (termination) - + DWARFDataExtractor data; data.SetData(encoder.GetData(), encoder.GetSize(), byte_order); DWARFAbbreviationDeclarationSet abbrev_set; @@ -101,7 +102,7 @@ TEST_F(SymbolFileDWARFTests, TestAbbrevOrder1Start1) { // Make sure we have O(1) access to each abbreviation by making sure the // index offset is 1 and not UINT32_MAX EXPECT_EQ(abbrev_set.GetIndexOffset(), 1u); - + auto abbrev1 = abbrev_set.GetAbbreviationDeclaration(1); EXPECT_EQ(abbrev1->Tag(), DW_TAG_compile_unit); EXPECT_TRUE(abbrev1->HasChildren()); @@ -115,7 +116,7 @@ TEST_F(SymbolFileDWARFTests, TestAbbrevOrder1Start1) { TEST_F(SymbolFileDWARFTests, TestAbbrevOrder1Start5) { // Test that if we have a .debug_abbrev that contains ordered abbreviation // codes that start at 5, that we get O(1) access. 
- + const auto byte_order = eByteOrderLittle; const uint8_t addr_size = 4; StreamString encoder(Stream::eBinary, addr_size, byte_order); @@ -126,7 +127,7 @@ TEST_F(SymbolFileDWARFTests, TestAbbrevOrder1Start5) { encoder.PutULEB128(DW_FORM_strp); encoder.PutULEB128(0); encoder.PutULEB128(0); - + encoder.PutULEB128(6); // Abbrev code 6 encoder.PutULEB128(DW_TAG_subprogram); encoder.PutHex8(DW_CHILDREN_no); @@ -134,9 +135,9 @@ TEST_F(SymbolFileDWARFTests, TestAbbrevOrder1Start5) { encoder.PutULEB128(DW_FORM_strp); encoder.PutULEB128(0); encoder.PutULEB128(0); - + encoder.PutULEB128(0); // Abbrev code 0 (termination) - + DWARFDataExtractor data; data.SetData(encoder.GetData(), encoder.GetSize(), byte_order); DWARFAbbreviationDeclarationSet abbrev_set; @@ -146,7 +147,7 @@ TEST_F(SymbolFileDWARFTests, TestAbbrevOrder1Start5) { // Make sure we have O(1) access to each abbreviation by making sure the // index offset is 5 and not UINT32_MAX EXPECT_EQ(abbrev_set.GetIndexOffset(), 5u); - + auto abbrev1 = abbrev_set.GetAbbreviationDeclaration(5); EXPECT_EQ(abbrev1->Tag(), DW_TAG_compile_unit); EXPECT_TRUE(abbrev1->HasChildren()); @@ -160,7 +161,7 @@ TEST_F(SymbolFileDWARFTests, TestAbbrevOrder1Start5) { TEST_F(SymbolFileDWARFTests, TestAbbrevOutOfOrder) { // Test that if we have a .debug_abbrev that contains unordered abbreviation // codes, that we can access the information correctly. - + const auto byte_order = eByteOrderLittle; const uint8_t addr_size = 4; StreamString encoder(Stream::eBinary, addr_size, byte_order); @@ -171,7 +172,7 @@ TEST_F(SymbolFileDWARFTests, TestAbbrevOutOfOrder) { encoder.PutULEB128(DW_FORM_strp); encoder.PutULEB128(0); encoder.PutULEB128(0); - + encoder.PutULEB128(1); // Abbrev code 1 encoder.PutULEB128(DW_TAG_subprogram); encoder.PutHex8(DW_CHILDREN_no); @@ -179,9 +180,9 @@ TEST_F(SymbolFileDWARFTests, TestAbbrevOutOfOrder) { encoder.PutULEB128(DW_FORM_strp); encoder.PutULEB128(0); encoder.PutULEB128(0); - + encoder.PutULEB128(0); // Abbrev code 0 (termination) - + DWARFDataExtractor data; data.SetData(encoder.GetData(), encoder.GetSize(), byte_order); DWARFAbbreviationDeclarationSet abbrev_set; @@ -191,7 +192,7 @@ TEST_F(SymbolFileDWARFTests, TestAbbrevOutOfOrder) { // Make sure we don't have O(1) access to each abbreviation by making sure // the index offset is UINT32_MAX EXPECT_EQ(abbrev_set.GetIndexOffset(), UINT32_MAX); - + auto abbrev1 = abbrev_set.GetAbbreviationDeclaration(2); EXPECT_EQ(abbrev1->Tag(), DW_TAG_compile_unit); EXPECT_TRUE(abbrev1->HasChildren()); @@ -205,7 +206,7 @@ TEST_F(SymbolFileDWARFTests, TestAbbrevOutOfOrder) { TEST_F(SymbolFileDWARFTests, TestAbbrevInvalidNULLTag) { // Test that we detect when an abbreviation has a NULL tag and that we get // an error when decoding. - + const auto byte_order = eByteOrderLittle; const uint8_t addr_size = 4; StreamString encoder(Stream::eBinary, addr_size, byte_order); @@ -214,9 +215,9 @@ TEST_F(SymbolFileDWARFTests, TestAbbrevInvalidNULLTag) { encoder.PutHex8(DW_CHILDREN_no); encoder.PutULEB128(0); encoder.PutULEB128(0); - + encoder.PutULEB128(0); // Abbrev code 0 (termination) - + DWARFDataExtractor data; data.SetData(encoder.GetData(), encoder.GetSize(), byte_order); DWARFAbbreviationDeclarationSet abbrev_set; @@ -232,7 +233,7 @@ TEST_F(SymbolFileDWARFTests, TestAbbrevInvalidNULLTag) { TEST_F(SymbolFileDWARFTests, TestAbbrevNullAttrValidForm) { // Test that we detect when an abbreviation has a NULL attribute and a non // NULL form and that we get an error when decoding. 
- + const auto byte_order = eByteOrderLittle; const uint8_t addr_size = 4; StreamString encoder(Stream::eBinary, addr_size, byte_order); @@ -245,7 +246,7 @@ encoder.PutULEB128(0); encoder.PutULEB128(0); // Abbrev code 0 (termination) - + DWARFDataExtractor data; data.SetData(encoder.GetData(), encoder.GetSize(), byte_order); DWARFAbbreviationDeclarationSet abbrev_set; @@ -255,13 +256,12 @@ EXPECT_TRUE(bool(error)); EXPECT_EQ("malformed abbreviation declaration attribute", llvm::toString(std::move(error))); - } TEST_F(SymbolFileDWARFTests, TestAbbrevValidAttrNullForm) { // Test that we detect when an abbreviation has a valid attribute and a // NULL form and that we get an error when decoding. - + const auto byte_order = eByteOrderLittle; const uint8_t addr_size = 4; StreamString encoder(Stream::eBinary, addr_size, byte_order); @@ -272,9 +272,9 @@ encoder.PutULEB128(0); // NULL form encoder.PutULEB128(0); encoder.PutULEB128(0); - + encoder.PutULEB128(0); // Abbrev code 0 (termination) - + DWARFDataExtractor data; data.SetData(encoder.GetData(), encoder.GetSize(), byte_order); DWARFAbbreviationDeclarationSet abbrev_set; @@ -290,7 +290,7 @@ TEST_F(SymbolFileDWARFTests, TestAbbrevMissingTerminator) { // Test that we detect when an abbreviation has a valid attribute and a // form, but is missing the NULL attribute and form that terminates an // abbreviation - + const auto byte_order = eByteOrderLittle; const uint8_t addr_size = 4; StreamString encoder(Stream::eBinary, addr_size, byte_order); @@ -300,7 +300,7 @@ encoder.PutULEB128(DW_AT_name); encoder.PutULEB128(DW_FORM_strp); // Don't add the NULL DW_AT and NULL DW_FORM terminator - + DWARFDataExtractor data; data.SetData(encoder.GetData(), encoder.GetSize(), byte_order); DWARFAbbreviationDeclarationSet abbrev_set; @@ -346,3 +346,42 @@ TEST_F(SymbolFileDWARFTests, ParseArangesNonzeroSegmentSize) { llvm::toString(std::move(error))); EXPECT_EQ(off, 12U); // Parser should read no further than the segment size } + +TEST_F(SymbolFileDWARFTests, ParseAranges) { + // Test that we can successfully parse a DWARFDebugAranges. The initial error + // checking code had a bug where it would always return empty address + // ranges for everything in .debug_aranges and no error.
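+ // Layout of the encoded .debug_aranges set below: a header (unit_length, + // DWARF version, offset into .debug_info, address size, segment size) + // padded out to a tuple boundary, then (address, length) tuples, ended by + // an all-zero terminator tuple.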
+ const unsigned char binary_data[] = { + 60, 0, 0, 0, // unit_length + 2, 0, // DWARF version number + 255, 0, 0, 0, // offset into the .debug_info section + 8, // address size + 0, // segment size + 0, 0, 0, 0, // pad bytes + // BEGIN TUPLES + // First tuple: [0x1000-0x1100) + 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Address 0x1000 + 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Size 0x0100 + // Second tuple: [0x2000-0x2100) + 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Address 0x2000 + 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Size 0x0100 + // Terminating tuple + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Terminator + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 // Terminator + }; + DWARFDataExtractor data; + data.SetData(static_cast<const void *>(binary_data), sizeof binary_data, + lldb::ByteOrder::eByteOrderLittle); + DWARFDebugAranges debug_aranges; + llvm::Error error = debug_aranges.extract(data); + ASSERT_FALSE(bool(error)); + EXPECT_EQ(debug_aranges.GetNumRanges(), 2u); + EXPECT_EQ(debug_aranges.FindAddress(0x0fff), DW_INVALID_OFFSET); + EXPECT_EQ(debug_aranges.FindAddress(0x1000), 255u); + EXPECT_EQ(debug_aranges.FindAddress(0x1100 - 1), 255u); + EXPECT_EQ(debug_aranges.FindAddress(0x1100), DW_INVALID_OFFSET); + EXPECT_EQ(debug_aranges.FindAddress(0x1fff), DW_INVALID_OFFSET); + EXPECT_EQ(debug_aranges.FindAddress(0x2000), 255u); + EXPECT_EQ(debug_aranges.FindAddress(0x2100 - 1), 255u); + EXPECT_EQ(debug_aranges.FindAddress(0x2100), DW_INVALID_OFFSET); +} diff --git a/lldb/unittests/TestingSupport/CMakeLists.txt b/lldb/unittests/TestingSupport/CMakeLists.txt index 4599ada1ec506..c62bc3b023b77 100644 --- a/lldb/unittests/TestingSupport/CMakeLists.txt +++ b/lldb/unittests/TestingSupport/CMakeLists.txt @@ -5,7 +5,6 @@ add_lldb_library(lldbUtilityHelpers LINK_LIBS lldbUtility - lldbSymbolHelpers gtest LINK_COMPONENTS diff --git a/lldb/utils/lldb-dotest/CMakeLists.txt b/lldb/utils/lldb-dotest/CMakeLists.txt index 0ef60c1427610..e5a73c2b1dec3 100644 --- a/lldb/utils/lldb-dotest/CMakeLists.txt +++ b/lldb/utils/lldb-dotest/CMakeLists.txt @@ -49,7 +49,7 @@ if(LLDB_BUILT_STANDALONE) string(REPLACE ${CMAKE_CFG_INTDIR} "." LLDB_TEST_COMPILER_CONFIGURED "${LLDB_TEST_COMPILER}") string(REPLACE ${CMAKE_CFG_INTDIR} "." LLDB_TEST_DSYMUTIL_CONFIGURED "${LLDB_TEST_DSYMUTIL}") string(REPLACE ${CMAKE_CFG_INTDIR} "." LLDB_TEST_FILECHECK_CONFIGURED "${LLDB_TEST_FILECHECK}") - string(REPLACE ${CMAKE_CFG_INTDIR} "." LLDB_TEST_YAML2OBJ_CONFIGURED "${LLDB_TEST_YAML2OBJ_CONFIGURED}") + string(REPLACE ${CMAKE_CFG_INTDIR} "." LLDB_TEST_YAML2OBJ_CONFIGURED "${LLDB_TEST_YAML2OBJ}") string(REPLACE ${CMAKE_CFG_INTDIR} "." 
LLDB_LIBS_DIR_CONFIGURED "${LLDB_LIBS_DIR}") endif() diff --git a/llvm-spirv/.travis.yml b/llvm-spirv/.travis.yml index 8aab9776d1fcf..99e4583b70638 100644 --- a/llvm-spirv/.travis.yml +++ b/llvm-spirv/.travis.yml @@ -23,9 +23,9 @@ before_install: - | if [ $TRAVIS_OS_NAME == "linux" ]; then curl -L "https://apt.llvm.org/llvm-snapshot.gpg.key" | sudo apt-key add - - curl -L "http://packages.lunarg.com/lunarg-signing-key-pub.asc" | sudo apt-key add - + curl -L "https://packages.lunarg.com/lunarg-signing-key-pub.asc" | sudo apt-key add - curl -L "https://apt.kitware.com/keys/kitware-archive-latest.asc" | sudo apt-key add - - echo "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic main" | sudo tee -a ${TRAVIS_ROOT}/etc/apt/sources.list + echo "deb https://apt.llvm.org/bionic/ llvm-toolchain-bionic main" | sudo tee -a ${TRAVIS_ROOT}/etc/apt/sources.list echo "deb https://packages.lunarg.com/vulkan bionic main" | sudo tee -a ${TRAVIS_ROOT}/etc/apt/sources.list echo "deb https://apt.kitware.com/ubuntu/ bionic main" | sudo tee -a ${TRAVIS_ROOT}/etc/apt/sources.list sudo apt-get update @@ -40,7 +40,6 @@ before_install: compiler: - gcc - - clang env: global: @@ -57,7 +56,6 @@ env: - BUILD_TYPE=Debug BUILD_EXTERNAL=1 SHARED_LIBS=OFF MAKE_TARGETS="" MAKE_TEST_TARGET="test" - BUILD_TYPE=Release BUILD_EXTERNAL=0 SHARED_LIBS=OFF MAKE_TARGETS="llvm-spirv" MAKE_TEST_TARGET="check-llvm-spirv" - BUILD_TYPE=Debug BUILD_EXTERNAL=0 SHARED_LIBS=OFF MAKE_TARGETS="llvm-spirv" MAKE_TEST_TARGET="check-llvm-spirv" - # some bug inside clang-5.0.0, works with 5.0.1 matrix: include: @@ -69,6 +67,15 @@ matrix: env: BUILD_TYPE=Debug BUILD_EXTERNAL=0 MAKE_TARGETS="llvm-spirv" MAKE_TEST_TARGET="check-llvm-spirv" osx_image: xcode12 + - compiler: clang + env: BUILD_TYPE=Release BUILD_EXTERNAL=1 SHARED_LIBS=OFF MAKE_TARGETS="" MAKE_TEST_TARGET="test" + + - compiler: clang + env: BUILD_TYPE=Debug BUILD_EXTERNAL=1 SHARED_LIBS=ON MAKE_TARGETS="" MAKE_TEST_TARGET="test" + + - compiler: clang + env: BUILD_TYPE=Release BUILD_EXTERNAL=0 SHARED_LIBS=ON MAKE_TARGETS="llvm-spirv" MAKE_TEST_TARGET="check-llvm-spirv" + - env: BUILD_EXTERNAL=1 CHECK_FORMAT=1 - env: BUILD_EXTERNAL=1 CHECK_TIDY=1 diff --git a/llvm-spirv/include/LLVMSPIRVOpts.h b/llvm-spirv/include/LLVMSPIRVOpts.h index bdc24cd2d22a1..8e62f4dac787e 100644 --- a/llvm-spirv/include/LLVMSPIRVOpts.h +++ b/llvm-spirv/include/LLVMSPIRVOpts.h @@ -148,6 +148,14 @@ class TranslatorOpts { SPIRVAllowUnknownIntrinsics = AllowUnknownIntrinsics; } + bool allowExtraDIExpressions() const noexcept { + return AllowExtraDIExpressions; + } + + void setAllowExtraDIExpressionsEnabled(bool Allow) noexcept { + AllowExtraDIExpressions = Allow; + } + DebugInfoEIS getDebugInfoEIS() const { return DebugInfoVersion; } void setDebugInfoEIS(DebugInfoEIS EIS) { DebugInfoVersion = EIS; } @@ -179,6 +187,10 @@ class TranslatorOpts { // SPIR-V bool SPIRVAllowUnknownIntrinsics = false; + // Enable support for extra DIExpression opcodes not listed in the SPIR-V + // DebugInfo specification. 
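+ // When this flag stays false, the translator reports a fatal error on any + // DIExpression opcode past Fragment (see + // LLVMToSPIRVDbgTran::transDbgExpression); setAllowExtraDIExpressionsEnabled() + // above is the opt-in switch.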
+ bool AllowExtraDIExpressions = false; + DebugInfoEIS DebugInfoVersion = DebugInfoEIS::OpenCL_DebugInfo_100; }; diff --git a/llvm-spirv/lib/SPIRV/LLVMToSPIRVDbgTran.cpp b/llvm-spirv/lib/SPIRV/LLVMToSPIRVDbgTran.cpp index 80815a3d62e44..68161b43186f6 100644 --- a/llvm-spirv/lib/SPIRV/LLVMToSPIRVDbgTran.cpp +++ b/llvm-spirv/lib/SPIRV/LLVMToSPIRVDbgTran.cpp @@ -959,10 +959,14 @@ SPIRVEntry *LLVMToSPIRVDbgTran::transDbgExpression(const DIExpression *Expr) { for (unsigned I = 0, N = Expr->getNumElements(); I < N; ++I) { using namespace SPIRVDebug::Operand::Operation; auto DWARFOpCode = static_cast(Expr->getElement(I)); + SPIRVDebug::ExpressionOpCode OC = SPIRV::DbgExpressionOpCodeMap::map(DWARFOpCode); - assert(OpCountMap.find(OC) != OpCountMap.end() && - "unhandled opcode found in DIExpression"); + if (OpCountMap.find(OC) == OpCountMap.end()) + report_fatal_error("unknown opcode found in DIExpression"); + if (OC > SPIRVDebug::Fragment && !BM->allowExtraDIExpressions()) + report_fatal_error("unsupported opcode found in DIExpression"); + unsigned OpCount = OpCountMap[OC]; SPIRVWordVec Op(OpCount); Op[OpCodeIdx] = OC; diff --git a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp index e4574bebbcb8c..95455e703cb4e 100644 --- a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp +++ b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp @@ -40,6 +40,7 @@ #include "OCLTypeToSPIRV.h" #include "OCLUtil.h" #include "SPIRVInternal.h" +#include "libSPIRV/SPIRVDebug.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/ValueTracking.h" @@ -47,9 +48,7 @@ #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Verifier.h" #include "llvm/Pass.h" -#include "llvm/Support/Debug.h" #include #include @@ -349,11 +348,8 @@ bool OCL20ToSPIRV::runOnModule(Module &Module) { eraseUselessFunctions(M); // remove unused functions declarations LLVM_DEBUG(dbgs() << "After OCL20ToSPIRV:\n" << *M); - std::string Err; - raw_string_ostream ErrorOS(Err); - if (verifyModule(*M, &ErrorOS)) { - LLVM_DEBUG(errs() << "Fails to verify module: " << ErrorOS.str()); - } + verifyRegularizationPass(*M, "OCL20ToSPIRV"); + return true; } @@ -1261,11 +1257,16 @@ void OCL20ToSPIRV::transWorkItemBuiltinsToVariables() { for (auto UI = I.user_begin(), UE = I.user_end(); UI != UE; ++UI) { auto CI = dyn_cast(*UI); assert(CI && "invalid instruction"); - Value *NewValue = new LoadInst(GVType, BV, "", CI); + const DebugLoc &DLoc = CI->getDebugLoc(); + Instruction *NewValue = new LoadInst(GVType, BV, "", CI); + if (DLoc) + NewValue->setDebugLoc(DLoc); LLVM_DEBUG(dbgs() << "Transform: " << *CI << " => " << *NewValue << '\n'); if (IsVec) { NewValue = ExtractElementInst::Create(NewValue, CI->getArgOperand(0), "", CI); + if (DLoc) + NewValue->setDebugLoc(DLoc); LLVM_DEBUG(dbgs() << *NewValue << '\n'); } NewValue->takeName(CI); diff --git a/llvm-spirv/lib/SPIRV/OCL21ToSPIRV.cpp b/llvm-spirv/lib/SPIRV/OCL21ToSPIRV.cpp index 6ec3a41b46530..c82aae6ed2dc2 100644 --- a/llvm-spirv/lib/SPIRV/OCL21ToSPIRV.cpp +++ b/llvm-spirv/lib/SPIRV/OCL21ToSPIRV.cpp @@ -39,13 +39,13 @@ #include "OCLUtil.h" #include "SPIRVInternal.h" +#include "libSPIRV/SPIRVDebug.h" + #include "llvm/ADT/StringSwitch.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Verifier.h" #include "llvm/Pass.h" -#include "llvm/Support/Debug.h" #include @@ -54,7 +54,6 @@ using namespace SPIRV; using namespace OCLUtil; namespace SPIRV { - class OCL21ToSPIRV : public 
ModulePass, public InstVisitor { public: OCL21ToSPIRV() : ModulePass(ID), M(nullptr), Ctx(nullptr), CLVer(0) { @@ -122,11 +121,8 @@ bool OCL21ToSPIRV::runOnModule(Module &Module) { GV->eraseFromParent(); LLVM_DEBUG(dbgs() << "After OCL21ToSPIRV:\n" << *M); - std::string Err; - raw_string_ostream ErrorOS(Err); - if (verifyModule(*M, &ErrorOS)) { - LLVM_DEBUG(errs() << "Fails to verify module: " << ErrorOS.str()); - } + verifyRegularizationPass(*M, "OCL21ToSPIRV"); + return true; } diff --git a/llvm-spirv/lib/SPIRV/OCLUtil.cpp b/llvm-spirv/lib/SPIRV/OCLUtil.cpp index 749bf53abe835..ec6ab317bd8e6 100644 --- a/llvm-spirv/lib/SPIRV/OCLUtil.cpp +++ b/llvm-spirv/lib/SPIRV/OCLUtil.cpp @@ -46,7 +46,6 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Verifier.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" diff --git a/llvm-spirv/lib/SPIRV/PreprocessMetadata.cpp b/llvm-spirv/lib/SPIRV/PreprocessMetadata.cpp index 145b5f60e9a00..80cc214644433 100644 --- a/llvm-spirv/lib/SPIRV/PreprocessMetadata.cpp +++ b/llvm-spirv/lib/SPIRV/PreprocessMetadata.cpp @@ -43,14 +43,12 @@ #include "SPIRVMDBuilder.h" #include "SPIRVMDWalker.h" #include "VectorComputeUtil.h" +#include "libSPIRV/SPIRVDebug.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" -#include "llvm/IR/Verifier.h" #include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" using namespace llvm; using namespace SPIRV; @@ -88,13 +86,10 @@ bool PreprocessMetadata::runOnModule(Module &Module) { LLVM_DEBUG(dbgs() << "Enter PreprocessMetadata:\n"); visit(M); - LLVM_DEBUG(dbgs() << "After PreprocessMetadata:\n" << *M); - std::string Err; - raw_string_ostream ErrorOS(Err); - if (verifyModule(*M, &ErrorOS)) { - LLVM_DEBUG(errs() << "Fails to verify module: " << ErrorOS.str()); - } + + verifyRegularizationPass(*M, "PreprocessMetadata"); + return true; } diff --git a/llvm-spirv/lib/SPIRV/SPIRVLowerBool.cpp b/llvm-spirv/lib/SPIRV/SPIRVLowerBool.cpp index 75b51edee32ea..8c3dea70138c9 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVLowerBool.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVLowerBool.cpp @@ -38,22 +38,17 @@ #define DEBUG_TYPE "spvbool" #include "SPIRVInternal.h" +#include "libSPIRV/SPIRVDebug.h" + #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Verifier.h" #include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" using namespace llvm; using namespace SPIRV; namespace SPIRV { -cl::opt SPIRVLowerBoolValidate( - "spvbool-validate", - cl::desc("Validate module after lowering boolean instructions for SPIR-V")); - class SPIRVLowerBool : public ModulePass, public InstVisitor { public: SPIRVLowerBool() : ModulePass(ID), Context(nullptr) { @@ -119,15 +114,7 @@ class SPIRVLowerBool : public ModulePass, public InstVisitor { Context = &M.getContext(); visit(M); - if (SPIRVLowerBoolValidate) { - LLVM_DEBUG(dbgs() << "After SPIRVLowerBool:\n" << M); - std::string Err; - raw_string_ostream ErrorOS(Err); - if (verifyModule(M, &ErrorOS)) { - Err = std::string("Fails to verify module: ") + Err; - report_fatal_error(Err.c_str(), false); - } - } + verifyRegularizationPass(M, "SPIRVLowerBool"); return true; } diff --git a/llvm-spirv/lib/SPIRV/SPIRVLowerConstExpr.cpp b/llvm-spirv/lib/SPIRV/SPIRVLowerConstExpr.cpp index ceb7b54ce6367..2416e2680eeb0 100644 --- 
a/llvm-spirv/lib/SPIRV/SPIRVLowerConstExpr.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVLowerConstExpr.cpp @@ -41,16 +41,14 @@ #include "SPIRVInternal.h" #include "SPIRVMDBuilder.h" #include "SPIRVMDWalker.h" +#include "libSPIRV/SPIRVDebug.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Verifier.h" #include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include #include @@ -93,12 +91,8 @@ bool SPIRVLowerConstExpr::runOnModule(Module &Module) { LLVM_DEBUG(dbgs() << "Enter SPIRVLowerConstExpr:\n"); visit(M); - LLVM_DEBUG(dbgs() << "After SPIRVLowerConstExpr:\n" << *M); - std::string Err; - raw_string_ostream ErrorOS(Err); - if (verifyModule(*M, &ErrorOS)) { - LLVM_DEBUG(errs() << "Fails to verify module: " << ErrorOS.str()); - } + verifyRegularizationPass(*M, "SPIRVLowerConstExpr"); + return true; } diff --git a/llvm-spirv/lib/SPIRV/SPIRVLowerMemmove.cpp b/llvm-spirv/lib/SPIRV/SPIRVLowerMemmove.cpp index 5d4a764264e73..550185402ee40 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVLowerMemmove.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVLowerMemmove.cpp @@ -38,24 +38,18 @@ #define DEBUG_TYPE "spvmemmove" #include "SPIRVInternal.h" +#include "libSPIRV/SPIRVDebug.h" + #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" -#include "llvm/IR/Verifier.h" #include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" using namespace llvm; using namespace SPIRV; namespace SPIRV { -cl::opt SPIRVLowerMemmoveValidate( - "spvmemmove-validate", - cl::desc("Validate module after lowering llvm.memmove instructions into " - "llvm.memcpy")); - class SPIRVLowerMemmove : public ModulePass, public InstVisitor { public: @@ -119,15 +113,7 @@ class SPIRVLowerMemmove : public ModulePass, Mod = &M; visit(M); - if (SPIRVLowerMemmoveValidate) { - LLVM_DEBUG(dbgs() << "After SPIRVLowerMemmove:\n" << M); - std::string Err; - raw_string_ostream ErrorOS(Err); - if (verifyModule(M, &ErrorOS)) { - Err = std::string("Fails to verify module: ") + Err; - report_fatal_error(Err.c_str(), false); - } - } + verifyRegularizationPass(M, "SPIRVLowerMemmove"); return true; } diff --git a/llvm-spirv/lib/SPIRV/SPIRVLowerSaddWithOverflow.cpp b/llvm-spirv/lib/SPIRV/SPIRVLowerSaddWithOverflow.cpp index 9b12cfd98f760..90978e922ea25 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVLowerSaddWithOverflow.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVLowerSaddWithOverflow.cpp @@ -39,28 +39,23 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "spv-lower-llvm_sadd_with_overflow" -#include "LLVMSPIRVLib.h" #include "LLVMSaddWithOverflow.h" + +#include "LLVMSPIRVLib.h" #include "SPIRVError.h" +#include "libSPIRV/SPIRVDebug.h" + #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" -#include "llvm/IR/Verifier.h" #include "llvm/IRReader/IRReader.h" #include "llvm/Linker/Linker.h" #include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" using namespace llvm; using namespace SPIRV; namespace SPIRV { -cl::opt SPIRVLowerSaddWithOverflowValidate( - "spv-lower-saddwithoverflow-validate", - cl::desc("Validate module after lowering llvm.sadd.with.overflow.*" - "intrinsics")); - class SPIRVLowerSaddWithOverflow : public ModulePass, public InstVisitor { @@ -125,15 +120,7 @@ class 
SPIRVLowerSaddWithOverflow Mod = &M; visit(M); - if (SPIRVLowerSaddWithOverflowValidate) { - LLVM_DEBUG(dbgs() << "After SPIRVLowerSaddWithOverflow:\n" << M); - std::string Err; - raw_string_ostream ErrorOS(Err); - if (verifyModule(M, &ErrorOS)) { - Err = std::string("Fails to verify module: ") + Err; - report_fatal_error(Err.c_str(), false); - } - } + verifyRegularizationPass(M, "SPIRVLowerSaddWithOverflow"); return TheModuleIsModified; } diff --git a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp index 9781d595eadef..5690e61778e00 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp @@ -340,11 +340,16 @@ bool SPIRVToLLVM::transOCLBuiltinFromVariable(GlobalVariable *GV, LD->getPointerOperandType()->getPointerElementType()); Value *EmptyVec = UndefValue::get(VecTy); Vectors.push_back(EmptyVec); + const DebugLoc &DLoc = LD->getDebugLoc(); for (unsigned I = 0; I < VecTy->getNumElements(); ++I) { auto *Idx = ConstantInt::get(Type::getInt32Ty(*Context), I); auto *Call = CallInst::Create(Func, {Idx}, "", LD); + if (DLoc) + Call->setDebugLoc(DLoc); setAttrByCalledFunc(Call); auto *Insert = InsertElementInst::Create(Vectors.back(), Call, Idx); + if (DLoc) + Insert->setDebugLoc(DLoc); Insert->insertAfter(Call); Vectors.push_back(Insert); } @@ -4090,18 +4095,14 @@ Instruction *SPIRVToLLVM::transOCLBuiltinFromExtInst(SPIRVExtInst *BC, assert(BB && "Invalid BB"); std::string MangledName; SPIRVWord EntryPoint = BC->getExtOp(); - bool IsVarArg = false; - bool IsPrintf = false; std::string UnmangledName; - auto BArgs = BC->getArguments(); + std::vector BArgs = BC->getArguments(); assert(BM->getBuiltinSet(BC->getExtSetId()) == SPIRVEIS_OpenCL && "Not OpenCL extended instruction"); - if (EntryPoint == OpenCLLIB::Printf) - IsPrintf = true; - else { - UnmangledName = OCLExtOpMap::map(static_cast(EntryPoint)); - } + + bool IsPrintf = (EntryPoint == OpenCLLIB::Printf); + UnmangledName = OCLExtOpMap::map(static_cast(EntryPoint)); SPIRVDBG(spvdbgs() << "[transOCLBuiltinFromExtInst] OrigUnmangledName: " << UnmangledName << '\n'); @@ -4111,12 +4112,7 @@ Instruction *SPIRVToLLVM::transOCLBuiltinFromExtInst(SPIRVExtInst *BC, if (IsPrintf) { MangledName = "printf"; - IsVarArg = true; ArgTypes.resize(1); - } else if (UnmangledName.find("read_image") == 0) { - auto ModifiedArgTypes = ArgTypes; - ModifiedArgTypes[1] = getOrCreateOpaquePtrType(M, "opencl.sampler_t"); - mangleOpenClBuiltin(UnmangledName, ModifiedArgTypes, MangledName); } else { mangleOpenClBuiltin(UnmangledName, ArgTypes, MangledName); } @@ -4124,8 +4120,8 @@ Instruction *SPIRVToLLVM::transOCLBuiltinFromExtInst(SPIRVExtInst *BC, << UnmangledName << " MangledName: " << MangledName << '\n'); - FunctionType *FT = - FunctionType::get(transType(BC->getType()), ArgTypes, IsVarArg); + FunctionType *FT = FunctionType::get(transType(BC->getType()), ArgTypes, + /* IsVarArg */ IsPrintf); Function *F = M->getFunction(MangledName); if (!F) { F = Function::Create(FT, GlobalValue::ExternalLinkage, MangledName, M); diff --git a/llvm-spirv/lib/SPIRV/SPIRVRegularizeLLVM.cpp b/llvm-spirv/lib/SPIRV/SPIRVRegularizeLLVM.cpp index e9ca33d2e0141..c7a0a7429e644 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVRegularizeLLVM.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVRegularizeLLVM.cpp @@ -39,13 +39,12 @@ #include "OCLUtil.h" #include "SPIRVInternal.h" +#include "libSPIRV/SPIRVDebug.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Operator.h" -#include "llvm/IR/Verifier.h" #include 
"llvm/Pass.h" -#include "llvm/Support/Debug.h" #include #include @@ -90,13 +89,10 @@ bool SPIRVRegularizeLLVM::runOnModule(Module &Module) { LLVM_DEBUG(dbgs() << "Enter SPIRVRegularizeLLVM:\n"); regularize(); - LLVM_DEBUG(dbgs() << "After SPIRVRegularizeLLVM:\n" << *M); - std::string Err; - raw_string_ostream ErrorOS(Err); - if (verifyModule(*M, &ErrorOS)) { - LLVM_DEBUG(errs() << "Fails to verify module: " << ErrorOS.str()); - } + + verifyRegularizationPass(*M, "SPIRVRegularizeLLVM"); + return true; } @@ -206,13 +202,6 @@ bool SPIRVRegularizeLLVM::regularize() { } } - std::string Err; - raw_string_ostream ErrorOS(Err); - if (verifyModule(*M, &ErrorOS)) { - SPIRVDBG(errs() << "Fails to verify module: " << ErrorOS.str();) - return false; - } - if (SPIRVDbgSaveRegularizedModule) saveLLVMModule(M, RegularizedModuleTmpFile); return true; diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRV.debug.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRV.debug.h index 0dfbf0fc5c3f7..c34ae787fa62b 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRV.debug.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRV.debug.h @@ -105,7 +105,162 @@ enum ExpressionOpCode { Xderef = 6, StackValue = 7, Constu = 8, - Fragment = 9 + Fragment = 9, + Convert = 10, + Addr = 11, + Const1u = 12, + Const1s = 13, + Const2u = 14, + Const2s = 15, + Const4u = 16, + Const4s = 17, + Const8u = 18, + Const8s = 19, + Consts = 20, + Dup = 21, + Drop = 22, + Over = 23, + Pick = 24, + Rot = 25, + Abs = 26, + And = 27, + Div = 28, + Mod = 29, + Mul = 30, + Neg = 31, + Not = 32, + Or = 33, + Shl = 34, + Shr = 35, + Shra = 36, + Xor = 37, + Bra = 38, + Eq = 39, + Ge = 40, + Gt = 41, + Le = 42, + Lt = 43, + Ne = 44, + Skip = 45, + Lit0 = 46, + Lit1 = 47, + Lit2 = 48, + Lit3 = 49, + Lit4 = 50, + Lit5 = 51, + Lit6 = 52, + Lit7 = 53, + Lit8 = 54, + Lit9 = 55, + Lit10 = 56, + Lit11 = 57, + Lit12 = 58, + Lit13 = 59, + Lit14 = 60, + Lit15 = 61, + Lit16 = 62, + Lit17 = 63, + Lit18 = 64, + Lit19 = 65, + Lit20 = 66, + Lit21 = 67, + Lit22 = 68, + Lit23 = 69, + Lit24 = 70, + Lit25 = 71, + Lit26 = 72, + Lit27 = 73, + Lit28 = 74, + Lit29 = 75, + Lit30 = 76, + Lit31 = 77, + Reg0 = 78, + Reg1 = 79, + Reg2 = 80, + Reg3 = 81, + Reg4 = 82, + Reg5 = 83, + Reg6 = 84, + Reg7 = 85, + Reg8 = 86, + Reg9 = 87, + Reg10 = 88, + Reg11 = 89, + Reg12 = 90, + Reg13 = 91, + Reg14 = 92, + Reg15 = 93, + Reg16 = 94, + Reg17 = 95, + Reg18 = 96, + Reg19 = 97, + Reg20 = 98, + Reg21 = 99, + Reg22 = 100, + Reg23 = 101, + Reg24 = 102, + Reg25 = 103, + Reg26 = 104, + Reg27 = 105, + Reg28 = 106, + Reg29 = 107, + Reg30 = 108, + Reg31 = 109, + Breg0 = 110, + Breg1 = 111, + Breg2 = 112, + Breg3 = 113, + Breg4 = 114, + Breg5 = 115, + Breg6 = 116, + Breg7 = 117, + Breg8 = 118, + Breg9 = 119, + Breg10 = 120, + Breg11 = 121, + Breg12 = 122, + Breg13 = 123, + Breg14 = 124, + Breg15 = 125, + Breg16 = 126, + Breg17 = 127, + Breg18 = 128, + Breg19 = 129, + Breg20 = 130, + Breg21 = 131, + Breg22 = 132, + Breg23 = 133, + Breg24 = 134, + Breg25 = 135, + Breg26 = 136, + Breg27 = 137, + Breg28 = 138, + Breg29 = 139, + Breg30 = 140, + Breg31 = 141, + Regx = 142, + Fbreg = 143, + Bregx = 144, + Piece = 145, + DerefSize = 146, + XderefSize = 147, + Nop = 148, + PushObjectAddress = 149, + Call2 = 150, + Call4 = 151, + CallRef = 152, + FormTlsAddress = 153, + CallFrameCfa = 154, + ImplicitValue = 155, + ImplicitPointer = 156, + Addrx = 157, + Constx = 158, + EntryValue = 159, + ConstTypeOp = 160, + RegvalType = 161, + DerefType = 162, + XderefType = 163, + Reinterpret = 164 }; enum ImportedEntityTag { @@ -432,16 +587,171 @@ enum { 
OpCodeIdx = 0 }; static std::map OpCountMap { - { Deref, 1 }, - { Plus, 1 }, - { Minus, 1 }, - { PlusUconst, 2 }, - { BitPiece, 3 }, - { Swap, 1 }, - { Xderef, 1 }, - { StackValue, 1 }, - { Constu, 2 }, - { Fragment, 3 } + { Deref, 1 }, + { Plus, 1 }, + { Minus, 1 }, + { PlusUconst, 2 }, + { BitPiece, 3 }, + { Swap, 1 }, + { Xderef, 1 }, + { StackValue, 1 }, + { Constu, 2 }, + { Fragment, 3 }, + { Convert, 3 }, + // { Addr, 2 }, /* not implemented */ + // { Const1u, 2 }, + // { Const1s, 2 }, + // { Const2u, 2 }, + // { Const2s, 2 }, + // { Const4u, 2 }, + // { Const4s, 2 }, + // { Const8u, 2 }, + // { Const8s, 2 }, + { Consts, 2 }, + { Dup, 1 }, + { Drop, 1 }, + { Over, 1 }, + { Pick, 1 }, + { Rot, 1 }, + { Abs, 1 }, + { And, 1 }, + { Div, 1 }, + { Mod, 1 }, + { Mul, 1 }, + { Neg, 1 }, + { Not, 1 }, + { Or, 1 }, + { Shl, 1 }, + { Shr, 1 }, + { Shra, 1 }, + { Xor, 1 }, + // { Bra, 2 }, /* not implemented */ + { Eq, 1 }, + { Ge, 1 }, + { Gt, 1 }, + { Le, 1 }, + { Lt, 1 }, + { Ne, 1 }, + // { Skip, 2 }, /* not implemented */ + { Lit0, 1 }, + { Lit1, 1 }, + { Lit2, 1 }, + { Lit3, 1 }, + { Lit4, 1 }, + { Lit5, 1 }, + { Lit6, 1 }, + { Lit7, 1 }, + { Lit8, 1 }, + { Lit9, 1 }, + { Lit10, 1 }, + { Lit11, 1 }, + { Lit12, 1 }, + { Lit13, 1 }, + { Lit14, 1 }, + { Lit15, 1 }, + { Lit16, 1 }, + { Lit17, 1 }, + { Lit18, 1 }, + { Lit19, 1 }, + { Lit20, 1 }, + { Lit21, 1 }, + { Lit22, 1 }, + { Lit23, 1 }, + { Lit24, 1 }, + { Lit25, 1 }, + { Lit26, 1 }, + { Lit27, 1 }, + { Lit28, 1 }, + { Lit29, 1 }, + { Lit30, 1 }, + { Lit31, 1 }, + { Reg0, 1 }, + { Reg1, 1 }, + { Reg2, 1 }, + { Reg3, 1 }, + { Reg4, 1 }, + { Reg5, 1 }, + { Reg6, 1 }, + { Reg7, 1 }, + { Reg8, 1 }, + { Reg9, 1 }, + { Reg10, 1 }, + { Reg11, 1 }, + { Reg12, 1 }, + { Reg13, 1 }, + { Reg14, 1 }, + { Reg15, 1 }, + { Reg16, 1 }, + { Reg17, 1 }, + { Reg18, 1 }, + { Reg19, 1 }, + { Reg20, 1 }, + { Reg21, 1 }, + { Reg22, 1 }, + { Reg23, 1 }, + { Reg24, 1 }, + { Reg25, 1 }, + { Reg26, 1 }, + { Reg27, 1 }, + { Reg28, 1 }, + { Reg29, 1 }, + { Reg30, 1 }, + { Reg31, 1 }, + { Breg0, 2 }, + { Breg1, 2 }, + { Breg2, 2 }, + { Breg3, 2 }, + { Breg4, 2 }, + { Breg5, 2 }, + { Breg6, 2 }, + { Breg7, 2 }, + { Breg8, 2 }, + { Breg9, 2 }, + { Breg10, 2 }, + { Breg11, 2 }, + { Breg12, 2 }, + { Breg13, 2 }, + { Breg14, 2 }, + { Breg15, 2 }, + { Breg16, 2 }, + { Breg17, 2 }, + { Breg18, 2 }, + { Breg19, 2 }, + { Breg20, 2 }, + { Breg21, 2 }, + { Breg22, 2 }, + { Breg23, 2 }, + { Breg24, 2 }, + { Breg25, 2 }, + { Breg26, 2 }, + { Breg27, 2 }, + { Breg28, 2 }, + { Breg29, 2 }, + { Breg30, 2 }, + { Breg31, 2 }, + { Regx, 2 }, + // { Fbreg, 1 }, /* not implemented */ + { Bregx, 3 }, + // { Piece, 2 }, /* not implemented */ + { DerefSize, 2 }, + { XderefSize, 2 }, + { Nop, 1 }, + { PushObjectAddress, 1 }, + // { Call2, 2 }, /* not implemented */ + // { Call4, 2 }, + // { CallRef, 2 }, + // { FormTlsAddress, 1 }, + // { CallFrameCfa, 1 }, + // { ImplicitValue, 3 }, + // { ImplicitPointer, 3 }, + // { Addrx, 2 }, + // { Constx, 2 }, + // { EntryValue, 3 }, + // { ConstTypeOp, 4 }, + // { RegvalType, 3 }, + // { DerefType, 3 }, + // { XderefType, 3 }, + // { Reinterpret, 2 }, }; } @@ -498,16 +808,144 @@ typedef SPIRVMap DbgExpressionOpCodeMap; template <> inline void DbgExpressionOpCodeMap::init() { - add(dwarf::DW_OP_deref, SPIRVDebug::Deref); - add(dwarf::DW_OP_plus, SPIRVDebug::Plus); - add(dwarf::DW_OP_minus, SPIRVDebug::Minus); - add(dwarf::DW_OP_plus_uconst, SPIRVDebug::PlusUconst); - add(dwarf::DW_OP_bit_piece, SPIRVDebug::BitPiece); - add(dwarf::DW_OP_swap, 
SPIRVDebug::Swap); - add(dwarf::DW_OP_xderef, SPIRVDebug::Xderef); - add(dwarf::DW_OP_stack_value, SPIRVDebug::StackValue); - add(dwarf::DW_OP_constu, SPIRVDebug::Constu); - add(dwarf::DW_OP_LLVM_fragment, SPIRVDebug::Fragment); + add(dwarf::DW_OP_deref, SPIRVDebug::Deref); + add(dwarf::DW_OP_plus, SPIRVDebug::Plus); + add(dwarf::DW_OP_minus, SPIRVDebug::Minus); + add(dwarf::DW_OP_plus_uconst, SPIRVDebug::PlusUconst); + add(dwarf::DW_OP_bit_piece, SPIRVDebug::BitPiece); + add(dwarf::DW_OP_swap, SPIRVDebug::Swap); + add(dwarf::DW_OP_xderef, SPIRVDebug::Xderef); + add(dwarf::DW_OP_stack_value, SPIRVDebug::StackValue); + add(dwarf::DW_OP_constu, SPIRVDebug::Constu); + add(dwarf::DW_OP_LLVM_fragment, SPIRVDebug::Fragment); + add(dwarf::DW_OP_LLVM_convert, SPIRVDebug::Convert); + add(dwarf::DW_OP_consts, SPIRVDebug::Consts); + add(dwarf::DW_OP_dup, SPIRVDebug::Dup); + add(dwarf::DW_OP_drop, SPIRVDebug::Drop); + add(dwarf::DW_OP_over, SPIRVDebug::Over); + add(dwarf::DW_OP_pick, SPIRVDebug::Pick); + add(dwarf::DW_OP_rot, SPIRVDebug::Rot); + add(dwarf::DW_OP_abs, SPIRVDebug::Abs); + add(dwarf::DW_OP_and, SPIRVDebug::And); + add(dwarf::DW_OP_div, SPIRVDebug::Div); + add(dwarf::DW_OP_mod, SPIRVDebug::Mod); + add(dwarf::DW_OP_mul, SPIRVDebug::Mul); + add(dwarf::DW_OP_neg, SPIRVDebug::Neg); + add(dwarf::DW_OP_not, SPIRVDebug::Not); + add(dwarf::DW_OP_or, SPIRVDebug::Or); + add(dwarf::DW_OP_shl, SPIRVDebug::Shl); + add(dwarf::DW_OP_shr, SPIRVDebug::Shr); + add(dwarf::DW_OP_shra, SPIRVDebug::Shra); + add(dwarf::DW_OP_xor, SPIRVDebug::Xor); + add(dwarf::DW_OP_bra, SPIRVDebug::Bra); + add(dwarf::DW_OP_eq, SPIRVDebug::Eq); + add(dwarf::DW_OP_ge, SPIRVDebug::Ge); + add(dwarf::DW_OP_gt, SPIRVDebug::Gt); + add(dwarf::DW_OP_le, SPIRVDebug::Le); + add(dwarf::DW_OP_lt, SPIRVDebug::Lt); + add(dwarf::DW_OP_ne, SPIRVDebug::Ne); + add(dwarf::DW_OP_lit0, SPIRVDebug::Lit0); + add(dwarf::DW_OP_lit1, SPIRVDebug::Lit1); + add(dwarf::DW_OP_lit2, SPIRVDebug::Lit2); + add(dwarf::DW_OP_lit3, SPIRVDebug::Lit3); + add(dwarf::DW_OP_lit4, SPIRVDebug::Lit4); + add(dwarf::DW_OP_lit5, SPIRVDebug::Lit5); + add(dwarf::DW_OP_lit6, SPIRVDebug::Lit6); + add(dwarf::DW_OP_lit7, SPIRVDebug::Lit7); + add(dwarf::DW_OP_lit8, SPIRVDebug::Lit8); + add(dwarf::DW_OP_lit9, SPIRVDebug::Lit9); + add(dwarf::DW_OP_lit10, SPIRVDebug::Lit10); + add(dwarf::DW_OP_lit11, SPIRVDebug::Lit11); + add(dwarf::DW_OP_lit12, SPIRVDebug::Lit12); + add(dwarf::DW_OP_lit13, SPIRVDebug::Lit13); + add(dwarf::DW_OP_lit14, SPIRVDebug::Lit14); + add(dwarf::DW_OP_lit15, SPIRVDebug::Lit15); + add(dwarf::DW_OP_lit16, SPIRVDebug::Lit16); + add(dwarf::DW_OP_lit17, SPIRVDebug::Lit17); + add(dwarf::DW_OP_lit18, SPIRVDebug::Lit18); + add(dwarf::DW_OP_lit19, SPIRVDebug::Lit19); + add(dwarf::DW_OP_lit20, SPIRVDebug::Lit20); + add(dwarf::DW_OP_lit21, SPIRVDebug::Lit21); + add(dwarf::DW_OP_lit22, SPIRVDebug::Lit22); + add(dwarf::DW_OP_lit23, SPIRVDebug::Lit23); + add(dwarf::DW_OP_lit24, SPIRVDebug::Lit24); + add(dwarf::DW_OP_lit25, SPIRVDebug::Lit25); + add(dwarf::DW_OP_lit26, SPIRVDebug::Lit26); + add(dwarf::DW_OP_lit27, SPIRVDebug::Lit27); + add(dwarf::DW_OP_lit28, SPIRVDebug::Lit28); + add(dwarf::DW_OP_lit29, SPIRVDebug::Lit29); + add(dwarf::DW_OP_lit30, SPIRVDebug::Lit30); + add(dwarf::DW_OP_lit31, SPIRVDebug::Lit31); + add(dwarf::DW_OP_reg0, SPIRVDebug::Reg0); + add(dwarf::DW_OP_reg1, SPIRVDebug::Reg1); + add(dwarf::DW_OP_reg2, SPIRVDebug::Reg2); + add(dwarf::DW_OP_reg3, SPIRVDebug::Reg3); + add(dwarf::DW_OP_reg4, SPIRVDebug::Reg4); + add(dwarf::DW_OP_reg5, SPIRVDebug::Reg5); + 
add(dwarf::DW_OP_reg6, SPIRVDebug::Reg6); + add(dwarf::DW_OP_reg7, SPIRVDebug::Reg7); + add(dwarf::DW_OP_reg8, SPIRVDebug::Reg8); + add(dwarf::DW_OP_reg9, SPIRVDebug::Reg9); + add(dwarf::DW_OP_reg10, SPIRVDebug::Reg10); + add(dwarf::DW_OP_reg11, SPIRVDebug::Reg11); + add(dwarf::DW_OP_reg12, SPIRVDebug::Reg12); + add(dwarf::DW_OP_reg13, SPIRVDebug::Reg13); + add(dwarf::DW_OP_reg14, SPIRVDebug::Reg14); + add(dwarf::DW_OP_reg15, SPIRVDebug::Reg15); + add(dwarf::DW_OP_reg16, SPIRVDebug::Reg16); + add(dwarf::DW_OP_reg17, SPIRVDebug::Reg17); + add(dwarf::DW_OP_reg18, SPIRVDebug::Reg18); + add(dwarf::DW_OP_reg19, SPIRVDebug::Reg19); + add(dwarf::DW_OP_reg20, SPIRVDebug::Reg20); + add(dwarf::DW_OP_reg21, SPIRVDebug::Reg21); + add(dwarf::DW_OP_reg22, SPIRVDebug::Reg22); + add(dwarf::DW_OP_reg23, SPIRVDebug::Reg23); + add(dwarf::DW_OP_reg24, SPIRVDebug::Reg24); + add(dwarf::DW_OP_reg25, SPIRVDebug::Reg25); + add(dwarf::DW_OP_reg26, SPIRVDebug::Reg26); + add(dwarf::DW_OP_reg27, SPIRVDebug::Reg27); + add(dwarf::DW_OP_reg28, SPIRVDebug::Reg28); + add(dwarf::DW_OP_reg29, SPIRVDebug::Reg29); + add(dwarf::DW_OP_reg30, SPIRVDebug::Reg30); + add(dwarf::DW_OP_reg31, SPIRVDebug::Reg31); + add(dwarf::DW_OP_breg0, SPIRVDebug::Breg0); + add(dwarf::DW_OP_breg1, SPIRVDebug::Breg1); + add(dwarf::DW_OP_breg2, SPIRVDebug::Breg2); + add(dwarf::DW_OP_breg3, SPIRVDebug::Breg3); + add(dwarf::DW_OP_breg4, SPIRVDebug::Breg4); + add(dwarf::DW_OP_breg5, SPIRVDebug::Breg5); + add(dwarf::DW_OP_breg6, SPIRVDebug::Breg6); + add(dwarf::DW_OP_breg7, SPIRVDebug::Breg7); + add(dwarf::DW_OP_breg8, SPIRVDebug::Breg8); + add(dwarf::DW_OP_breg9, SPIRVDebug::Breg9); + add(dwarf::DW_OP_breg10, SPIRVDebug::Breg10); + add(dwarf::DW_OP_breg11, SPIRVDebug::Breg11); + add(dwarf::DW_OP_breg12, SPIRVDebug::Breg12); + add(dwarf::DW_OP_breg13, SPIRVDebug::Breg13); + add(dwarf::DW_OP_breg14, SPIRVDebug::Breg14); + add(dwarf::DW_OP_breg15, SPIRVDebug::Breg15); + add(dwarf::DW_OP_breg16, SPIRVDebug::Breg16); + add(dwarf::DW_OP_breg17, SPIRVDebug::Breg17); + add(dwarf::DW_OP_breg18, SPIRVDebug::Breg18); + add(dwarf::DW_OP_breg19, SPIRVDebug::Breg19); + add(dwarf::DW_OP_breg20, SPIRVDebug::Breg20); + add(dwarf::DW_OP_breg21, SPIRVDebug::Breg21); + add(dwarf::DW_OP_breg22, SPIRVDebug::Breg22); + add(dwarf::DW_OP_breg23, SPIRVDebug::Breg23); + add(dwarf::DW_OP_breg24, SPIRVDebug::Breg24); + add(dwarf::DW_OP_breg25, SPIRVDebug::Breg25); + add(dwarf::DW_OP_breg26, SPIRVDebug::Breg26); + add(dwarf::DW_OP_breg27, SPIRVDebug::Breg27); + add(dwarf::DW_OP_breg28, SPIRVDebug::Breg28); + add(dwarf::DW_OP_breg29, SPIRVDebug::Breg29); + add(dwarf::DW_OP_breg30, SPIRVDebug::Breg30); + add(dwarf::DW_OP_breg31, SPIRVDebug::Breg31); + add(dwarf::DW_OP_regx, SPIRVDebug::Regx); + add(dwarf::DW_OP_bregx, SPIRVDebug::Bregx); + add(dwarf::DW_OP_deref_size, SPIRVDebug::DerefSize ); + add(dwarf::DW_OP_xderef_size, SPIRVDebug::XderefSize ); + add(dwarf::DW_OP_nop, SPIRVDebug::Nop); + add(dwarf::DW_OP_push_object_address, SPIRVDebug::PushObjectAddress ); } typedef SPIRVMap diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVDebug.cpp b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVDebug.cpp index a169b733f08f7..3c0ff0443bebf 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVDebug.cpp +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVDebug.cpp @@ -39,9 +39,33 @@ #include "SPIRVDebug.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "spirv-regularization" + using namespace SPIRV; bool SPIRV::SPIRVDbgEnable = false; SPIRV::SPIRVDbgErrorHandlingKinds 
SPIRV::SPIRVDbgError = SPIRVDbgErrorHandlingKinds::Exit; bool SPIRV::SPIRVDbgErrorMsgIncludesSourceInfo = true; + +llvm::cl::opt<bool> SPIRV::VerifyRegularizationPasses( + "spirv-verify-regularize-passes", llvm::cl::init(_SPIRVDBG), + llvm::cl::desc( + "Verify module after each pass in LLVM regularization phase")); + +namespace SPIRV { +void verifyRegularizationPass(llvm::Module &M, const std::string &PassName) { + if (VerifyRegularizationPasses) { + std::string Err; + llvm::raw_string_ostream ErrorOS(Err); + if (llvm::verifyModule(M, &ErrorOS)) { + LLVM_DEBUG(llvm::errs() + << "Failed to verify module after pass: " << PassName << "\n" + << ErrorOS.str()); + } + } +} +} // namespace SPIRV diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVDebug.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVDebug.h index 81c5e4146f974..bddb3b857c545 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVDebug.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVDebug.h @@ -41,20 +41,16 @@ #define SPIRV_LIBSPIRV_SPIRVDEBUG_H #include "SPIRVUtil.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + #include <iostream> +#include <string> namespace SPIRV { -#define _SPIRVDBG -#ifdef _SPIRVDBG - -#define SPIRVDBG(x) \ - if (SPIRVDbgEnable) { \ - x; \ - } - -// Enable debug output. -extern bool SPIRVDbgEnable; +extern llvm::cl::opt<bool> VerifyRegularizationPasses; // Include source file and line number in error message. extern bool SPIRVDbgErrorMsgIncludesSourceInfo; @@ -63,6 +59,26 @@ extern bool SPIRVDbgErrorMsgIncludesSourceInfo; enum class SPIRVDbgErrorHandlingKinds { Abort, Exit, Ignore }; extern SPIRVDbgErrorHandlingKinds SPIRVDbgError; +// Enable debug output. +extern bool SPIRVDbgEnable; + +void verifyRegularizationPass(llvm::Module &, const std::string &); + +#ifndef _SPIRVDBG +#if !defined(NDEBUG) || defined(_DEBUG) +#define _SPIRVDBG true +#else +#define _SPIRVDBG false +#endif +#endif + +#if _SPIRVDBG + +#define SPIRVDBG(x) \ + if (SPIRVDbgEnable) { \ + x; \ + } + // Output stream for SPIRV debug information. inline spv_ostream &spvdbgs() { return std::cerr; @@ -72,6 +88,29 @@ inline spv_ostream &spvdbgs() { #define SPIRVDBG(x) +// Minimal std::basic_ostream mock that ignores everything being printed via +// operator<< +class dev_null_stream { +public: + void flush() {} +}; + +template <typename T> +const dev_null_stream &operator<<(const dev_null_stream &Out, const T &) { + return Out; +} + +template <typename T> +const dev_null_stream &&operator<<(const dev_null_stream &&Out, const T &) { + return std::move(Out); +} + +// Output stream for SPIRV debug information. 
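+// When _SPIRVDBG is false, spvdbgs() below hands out this sink so existing +// debug-print call sites compile and run as no-ops without any #ifdefs.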
+inline dev_null_stream &spvdbgs() { + static dev_null_stream Out; + return Out; +} + #endif } // namespace SPIRV diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVError.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVError.h index c285572322654..a02c107d669b3 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVError.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVError.h @@ -41,6 +41,7 @@ #include "SPIRVDebug.h" #include "SPIRVUtil.h" +#include #include #include @@ -115,16 +116,17 @@ inline bool SPIRVErrorLog::checkError(bool Cond, SPIRVErrorCode ErrCode, setError(ErrCode, SS.str()); switch (SPIRVDbgError) { case SPIRVDbgErrorHandlingKinds::Abort: - spvdbgs() << SS.str() << '\n'; - spvdbgs().flush(); + std::cerr << SS.str() << std::endl; abort(); break; case SPIRVDbgErrorHandlingKinds::Exit: - spvdbgs() << SS.str() << '\n'; - spvdbgs().flush(); + std::cerr << SS.str() << std::endl; std::exit(ErrCode); break; case SPIRVDbgErrorHandlingKinds::Ignore: + // Still print info about the error into debug output stream + spvdbgs() << SS.str() << '\n'; + spvdbgs().flush(); break; } return Cond; diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h index 8ac5b8368a084..061c34197f659 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h @@ -2479,8 +2479,11 @@ class SPIRVGroupInstBase : public SPIRVInstTemplateBase { #define _SPIRV_OP(x, ...) \ typedef SPIRVInstTemplate SPIRV##x; -// Group instructions -_SPIRV_OP(GroupWaitEvents, false, 4) +// Group instructions. +// Even though GroupWaitEvents has Group in its name, it doesn't require the +// Group capability +typedef SPIRVInstTemplate + SPIRVGroupWaitEvents; _SPIRV_OP(GroupAll, true, 5) _SPIRV_OP(GroupAny, true, 5) _SPIRV_OP(GroupBroadcast, true, 6) diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h index 9e4bee64e93cc..ff2018551561e 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h @@ -479,6 +479,10 @@ class SPIRVModule { return TranslationOpts.isSPIRVAllowUnknownIntrinsicsEnabled(); } + bool allowExtraDIExpressions() const noexcept { + return TranslationOpts.allowExtraDIExpressions(); + } + SPIRVExtInstSetKind getDebugInfoEIS() const { switch (TranslationOpts.getDebugInfoEIS()) { case DebugInfoEIS::SPIRV_Debug: diff --git a/llvm-spirv/test/DebugInfo/X86/convert-debugloc.ll b/llvm-spirv/test/DebugInfo/X86/convert-debugloc.ll new file mode 100644 index 0000000000000..16920aecc6b0e --- /dev/null +++ b/llvm-spirv/test/DebugInfo/X86/convert-debugloc.ll @@ -0,0 +1,87 @@ +; RUN: llvm-as < %s -o %t.bc +; RUN: llvm-spirv %t.bc -o %t.spv --spirv-allow-extra-diexpressions +; RUN: llvm-spirv -r %t.spv -o - | llvm-dis -o %t.ll + +; RUN: llc -mtriple=%triple -dwarf-version=5 -filetype=obj -O0 < %t.ll | llvm-dwarfdump - \ +; RUN: | FileCheck %s --check-prefix=DW5 "--implicit-check-not={{DW_TAG|NULL}}" +; RUN: llc -mtriple=%triple -dwarf-version=4 -filetype=obj -O0 < %t.ll | llvm-dwarfdump - \ +; RUN: | FileCheck %s --check-prefix=DW4 "--implicit-check-not={{DW_TAG|NULL}}" + +; DW5: .debug_info contents: +; DW5: DW_TAG_compile_unit +; DW5:[[SIG8:.*]]: DW_TAG_base_type +; DW5-NEXT:DW_AT_name ("DW_ATE_signed_8") +; DW5-NEXT:DW_AT_encoding (DW_ATE_signed) +; DW5-NEXT:DW_AT_byte_size (0x01) +; DW5-NOT: DW_AT +; DW5:[[SIG32:.*]]: DW_TAG_base_type +; DW5-NEXT:DW_AT_name ("DW_ATE_signed_32") +; DW5-NEXT:DW_AT_encoding (DW_ATE_signed) +; DW5-NEXT:DW_AT_byte_size 
(0x04) +; DW5-NOT: DW_AT +; DW5: DW_TAG_subprogram +; DW5: DW_TAG_formal_parameter +; DW5: DW_TAG_variable +; DW5: DW_AT_location ( +; DW5: {{.*}}, DW_OP_convert ([[SIG8]]) "DW_ATE_signed_8", DW_OP_convert ([[SIG32]]) "DW_ATE_signed_32", DW_OP_stack_value) +; DW5: DW_AT_name ("y") +; DW5: NULL +; DW5: DW_TAG_base_type +; DW5: DW_AT_name ("signed char") +; DW5: DW_TAG_base_type +; DW5: DW_AT_name ("int") +; DW5: NULL + +; DW4: .debug_info contents: +; DW4: DW_TAG_compile_unit +; DW4: DW_TAG_subprogram +; DW4: DW_TAG_formal_parameter +; DW4: DW_TAG_variable +; DW4: DW_AT_location ( +; DW4: {{.*}}, DW_OP_dup, DW_OP_constu 0x7, DW_OP_shr, DW_OP_lit0, DW_OP_not, DW_OP_mul, DW_OP_constu 0x8, DW_OP_shl, DW_OP_or, DW_OP_stack_value) +; DW4: DW_AT_name ("y") +; DW4: NULL +; DW4: DW_TAG_base_type +; DW4: DW_AT_name ("signed char") +; DW4: DW_TAG_base_type +; DW4: DW_AT_name ("int") +; DW4: NULL + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: noinline nounwind uwtable +define dso_local signext i8 @foo(i8 signext %x) !dbg !7 { +entry: + call void @llvm.dbg.value(metadata i8 %x, metadata !11, metadata !DIExpression()), !dbg !12 + call void @llvm.dbg.value(metadata i8 32, metadata !13, metadata !DIExpression(DW_OP_LLVM_convert, 8, DW_ATE_signed, DW_OP_LLVM_convert, 32, DW_ATE_signed, DW_OP_stack_value)), !dbg !15 + ret i8 %x, !dbg !16 +} + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.declare(metadata, metadata, metadata) + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 9.0.0 (trunk 353791) (llvm/trunk 353801)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "dbg.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "2a034da6937f5b9cf6dd2d89127f57fd") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 5} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 9.0.0 (trunk 353791) (llvm/trunk 353801)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!8 = !DISubroutineType(types: !9) +!9 = !{!10, !10} +!10 = !DIBasicType(name: "signed char", size: 8, encoding: DW_ATE_signed_char) +!11 = !DILocalVariable(name: "x", arg: 1, scope: !7, file: !1, line: 1, type: !10) +!12 = !DILocation(line: 1, column: 29, scope: !7) +!13 = !DILocalVariable(name: "y", scope: !7, file: !1, line: 3, type: !14) +!14 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!15 = !DILocation(line: 3, column: 14, scope: !7) +!16 = !DILocation(line: 4, column: 3, scope: !7) diff --git a/llvm-spirv/test/DebugInfo/builtin-get-global-id.ll b/llvm-spirv/test/DebugInfo/builtin-get-global-id.ll new file mode 100644 index 0000000000000..9653c8849b764 --- /dev/null +++ b/llvm-spirv/test/DebugInfo/builtin-get-global-id.ll @@ -0,0 +1,66 @@ +; Check debug info of builtin get_global_id is preserved from LLVM IR to spirv +; and spirv to LLVM IR translation. 
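The DW4 checks in ``convert-debugloc.ll`` above show how the DWARF 5 ``DW_OP_convert`` pair is lowered for DWARF 4, which has no conversion operator: the emitted expression sign-extends the 8-bit value by hand on the expression stack. A minimal C++ sketch of what that operator sequence computes, assuming the value arrives zero-extended in a 64-bit stack slot (the function name is illustrative):

.. code-block:: c++

  #include <cassert>
  #include <cstdint>

  // Mirrors the DW4 expansion checked above:
  //   DW_OP_dup, DW_OP_constu 0x7, DW_OP_shr  -> isolate the sign bit
  //   DW_OP_lit0, DW_OP_not, DW_OP_mul        -> 0 or an all-ones mask
  //   DW_OP_constu 0x8, DW_OP_shl, DW_OP_or   -> splat the mask over the high bits
  uint64_t signExtend8(uint64_t V) {
    uint64_t Sign = V >> 7;       // DW_OP_dup; DW_OP_constu 0x7; DW_OP_shr
    uint64_t Mask = Sign * ~0ULL; // DW_OP_lit0; DW_OP_not; DW_OP_mul
    return (Mask << 8) | V;       // DW_OP_constu 0x8; DW_OP_shl; DW_OP_or
  }

  int main() {
    assert(signExtend8(0x7f) == 0x7f);                  // positive value unchanged
    assert(signExtend8(0x80) == 0xffffffffffffff80ull); // negative value extended
    return 0;
  }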
+ +; Original .cl source: +; kernel void test() { +; size_t gid = get_global_id(0); +; } + +; Command line: +; ./clang -cc1 1.cl -triple spir64 -cl-std=cl2.0 -emit-llvm -finclude-default-header -debug-info-kind=line-tables-only -O0 + +; RUN: llvm-as %s -o %t.bc +; RUN: llvm-spirv %t.bc -spirv-text -o - | FileCheck %s --check-prefix CHECK-SPIRV +; RUN: llvm-spirv %t.bc -o %t.spv +; RUN: llvm-spirv -r %t.spv -o - | llvm-dis -o - | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64" + +; CHECK-SPIRV: ExtInst {{.*}} DebugScope +; CHECK-SPIRV-NEXT: Line {{[0-9]+}} 2 16 +; CHECK-SPIRV-NEXT: Load {{[0-9]+}} [[LoadRes:[0-9]+]] +; CHECK-SPIRV-NEXT: CompositeExtract {{[0-9]+}} {{[0-9]+}} [[LoadRes]] 0 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define spir_kernel void @test() #0 !dbg !7 !kernel_arg_addr_space !2 !kernel_arg_access_qual !2 !kernel_arg_type !2 !kernel_arg_base_type !2 !kernel_arg_type_qual !2 { +entry: + %gid = alloca i64, align 8 + %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2, !dbg !10 +; CHECK: [[I0:%[0-9]]] = call spir_func i64 @_Z13get_global_idj(i32 0) #1, !dbg [[DBG:![0-9]+]] +; CHECK-NEXT: [[I1:%[0-9]]] = insertelement <3 x i64> undef, i64 [[I0]], i32 0, !dbg [[DBG]] +; CHECK-NEXT: [[I2:%[0-9]]] = call spir_func i64 @_Z13get_global_idj(i32 1) #1, !dbg [[DBG]] +; CHECK-NEXT: [[I3:%[0-9]]] = insertelement <3 x i64> [[I1]], i64 [[I2]], i32 1, !dbg [[DBG]] +; CHECK-NEXT: [[I4:%[0-9]]] = call spir_func i64 @_Z13get_global_idj(i32 2) #1, !dbg [[DBG]] +; CHECK-NEXT: [[I5:%[0-9]]] = insertelement <3 x i64> [[I3]], i64 [[I4]], i32 2, !dbg [[DBG]] +; CHECK-NEXT: %call = extractelement <3 x i64> [[I5]], i32 0, !dbg [[DBG]] + store i64 %call, i64* %gid, align 8, !dbg !11 + ret void, !dbg !12 +} + +; Function Attrs: convergent nounwind readnone +declare spir_func i64 @_Z13get_global_idj(i32) #1 + +attributes #0 = { convergent noinline norecurse nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nounwind readnone } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} +!opencl.ocl.version = !{!5} +!opencl.spir.version = !{!5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0 (https://github.com/llvm/llvm-project.git b5bc56da8aa23dc57db9d286b0591dbcf9b1bdd3)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "", directory: "") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 2, i32 0} +!6 = !{!"clang version 12.0.0 (https://github.com/llvm/llvm-project.git 
b5bc56da8aa23dc57db9d286b0591dbcf9b1bdd3)"} +!7 = distinct !DISubprogram(name: "test", scope: !8, file: !8, line: 1, type: !9, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!8 = !DIFile(filename: "1.cl", directory: "") +!9 = !DISubroutineType(types: !2) +!10 = !DILocation(line: 2, column: 16, scope: !7) +!11 = !DILocation(line: 2, column: 10, scope: !7) +!12 = !DILocation(line: 3, column: 1, scope: !7) diff --git a/llvm-spirv/test/DebugInfo/expr-opcode.ll b/llvm-spirv/test/DebugInfo/expr-opcode.ll new file mode 100644 index 0000000000000..482d7af842a9e --- /dev/null +++ b/llvm-spirv/test/DebugInfo/expr-opcode.ll @@ -0,0 +1,72 @@ +; RUN: llvm-as < %s -o %t.bc +; RUN: llvm-spirv %t.bc -o %t.spv --spirv-allow-extra-diexpressions +; RUN: llvm-spirv -r %t.spv -o - | llvm-dis -o %t.rev.ll +; RUN: FileCheck %s --input-file %t.rev.ll + +; RUN: llc -mtriple=%triple -dwarf-version=5 -filetype=obj -O0 < %t.rev.ll +; RUN: llc -mtriple=%triple -dwarf-version=4 -filetype=obj -O0 < %t.rev.ll + +; CHECK: DW_OP_constu, 42 +; CHECK: DW_OP_plus_uconst, 42 +; CHECK: DW_OP_plus +; CHECK: DW_OP_minus +; CHECK: DW_OP_mul +; CHECK: DW_OP_div +; CHECK: DW_OP_mod +; CHECK: DW_OP_or +; CHECK: DW_OP_and +; CHECK: DW_OP_xor +; CHECK: DW_OP_shl +; CHECK: DW_OP_shr +; CHECK: DW_OP_shra +; CHECK: DW_OP_deref +; CHECK: DW_OP_deref_size, 4 +; CHECK: DW_OP_xderef +; CHECK: DW_OP_lit0 +; CHECK: DW_OP_not +; CHECK: DW_OP_dup +; CHECK: DW_OP_regx, 1 +; CHECK: DW_OP_bregx, 1, 4 +; CHECK: DW_OP_push_object_address +; CHECK: DW_OP_swap +; CHECK: DW_OP_LLVM_convert, 8, DW_ATE_signed +; CHECK: DW_OP_stack_value + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: noinline nounwind uwtable +define dso_local signext i8 @foo(i8 signext %x) !dbg !7 { +entry: + call void @llvm.dbg.value(metadata i8 %x, metadata !11, metadata !DIExpression()), !dbg !12 + call void @llvm.dbg.value(metadata i8 32, metadata !13, metadata !DIExpression(DW_OP_constu, 42, DW_OP_plus_uconst, 42, DW_OP_plus, DW_OP_minus, DW_OP_mul, DW_OP_div, DW_OP_mod, DW_OP_or, DW_OP_and, DW_OP_xor, DW_OP_shl, DW_OP_shr, DW_OP_shra, DW_OP_deref, DW_OP_deref_size, 4, DW_OP_xderef, DW_OP_lit0, DW_OP_not, DW_OP_dup, DW_OP_regx, 1, DW_OP_bregx, 1, 4, DW_OP_push_object_address, DW_OP_swap, DW_OP_LLVM_convert, 8, DW_ATE_signed, DW_OP_stack_value)), !dbg !15 + ret i8 %x, !dbg !16 +} + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.declare(metadata, metadata, metadata) + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 9.0.0 (trunk 353791) (llvm/trunk 353801)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "dbg.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "2a034da6937f5b9cf6dd2d89127f57fd") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 5} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 9.0.0 (trunk 353791) (llvm/trunk 353801)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!8 = !DISubroutineType(types: !9) +!9 = !{!10, !10} +!10 = 
!DIBasicType(name: "signed char", size: 8, encoding: DW_ATE_signed_char) +!11 = !DILocalVariable(name: "x", arg: 1, scope: !7, file: !1, line: 1, type: !10) +!12 = !DILocation(line: 1, column: 29, scope: !7) +!13 = !DILocalVariable(name: "y", scope: !7, file: !1, line: 3, type: !14) +!14 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!15 = !DILocation(line: 3, column: 14, scope: !7) +!16 = !DILocation(line: 4, column: 3, scope: !7) diff --git a/llvm-spirv/test/event_no_group_cap.cl b/llvm-spirv/test/event_no_group_cap.cl new file mode 100644 index 0000000000000..6c20fe9d4ffe0 --- /dev/null +++ b/llvm-spirv/test/event_no_group_cap.cl @@ -0,0 +1,12 @@ +__kernel void test_fn( const __global char *src) +{ + wait_group_events(0, NULL); +} +// RUN: %clang_cc1 -triple spir64 -x cl -cl-std=CL2.0 -finclude-default-header -O0 -emit-llvm-bc %s -o %t.bc +// RUN: llvm-spirv %t.bc -spirv-text -o %t.spt +// RUN: FileCheck < %t.spt %s +// RUN: llvm-spirv %t.bc -o %t.spv +// RUN: spirv-val %t.spv + +// CHECK-NOT:Capability Groups +// CHECK:GroupWaitEvents diff --git a/llvm-spirv/test/exec_mode_float_control_intel.ll b/llvm-spirv/test/exec_mode_float_control_intel.ll old mode 100755 new mode 100644 diff --git a/llvm-spirv/test/nullptr-metadata-test.ll b/llvm-spirv/test/nullptr-metadata-test.ll old mode 100755 new mode 100644 diff --git a/llvm-spirv/test/transcoding/subgroup_avc_intel_generic.cl b/llvm-spirv/test/transcoding/SPV_INTEL_device_side_avc_motion_esimation/subgroup_avc_intel_generic.cl similarity index 100% rename from llvm-spirv/test/transcoding/subgroup_avc_intel_generic.cl rename to llvm-spirv/test/transcoding/SPV_INTEL_device_side_avc_motion_esimation/subgroup_avc_intel_generic.cl diff --git a/llvm-spirv/test/transcoding/subgroup_avc_intel_not_builtin.ll b/llvm-spirv/test/transcoding/SPV_INTEL_device_side_avc_motion_esimation/subgroup_avc_intel_not_builtin.ll similarity index 100% rename from llvm-spirv/test/transcoding/subgroup_avc_intel_not_builtin.ll rename to llvm-spirv/test/transcoding/SPV_INTEL_device_side_avc_motion_esimation/subgroup_avc_intel_not_builtin.ll diff --git a/llvm-spirv/test/transcoding/subgroup_avc_intel_not_builtin.spt b/llvm-spirv/test/transcoding/SPV_INTEL_device_side_avc_motion_esimation/subgroup_avc_intel_not_builtin.spt similarity index 100% rename from llvm-spirv/test/transcoding/subgroup_avc_intel_not_builtin.spt rename to llvm-spirv/test/transcoding/SPV_INTEL_device_side_avc_motion_esimation/subgroup_avc_intel_not_builtin.spt diff --git a/llvm-spirv/test/transcoding/subgroup_avc_intel_types.ll b/llvm-spirv/test/transcoding/SPV_INTEL_device_side_avc_motion_esimation/subgroup_avc_intel_types.ll similarity index 100% rename from llvm-spirv/test/transcoding/subgroup_avc_intel_types.ll rename to llvm-spirv/test/transcoding/SPV_INTEL_device_side_avc_motion_esimation/subgroup_avc_intel_types.ll diff --git a/llvm-spirv/test/transcoding/subgroup_avc_intel_types.spt b/llvm-spirv/test/transcoding/SPV_INTEL_device_side_avc_motion_esimation/subgroup_avc_intel_types.spt similarity index 100% rename from llvm-spirv/test/transcoding/subgroup_avc_intel_types.spt rename to llvm-spirv/test/transcoding/SPV_INTEL_device_side_avc_motion_esimation/subgroup_avc_intel_types.spt diff --git a/llvm-spirv/test/transcoding/subgroup_avc_intel_vme_image.cl b/llvm-spirv/test/transcoding/SPV_INTEL_device_side_avc_motion_esimation/subgroup_avc_intel_vme_image.cl similarity index 100% rename from llvm-spirv/test/transcoding/subgroup_avc_intel_vme_image.cl rename to 
llvm-spirv/test/transcoding/SPV_INTEL_device_side_avc_motion_esimation/subgroup_avc_intel_vme_image.cl diff --git a/llvm-spirv/test/transcoding/subgroup_avc_intel_wrappers.ll b/llvm-spirv/test/transcoding/SPV_INTEL_device_side_avc_motion_esimation/subgroup_avc_intel_wrappers.ll similarity index 100% rename from llvm-spirv/test/transcoding/subgroup_avc_intel_wrappers.ll rename to llvm-spirv/test/transcoding/SPV_INTEL_device_side_avc_motion_esimation/subgroup_avc_intel_wrappers.ll diff --git a/llvm-spirv/test/transcoding/FPGAIVDepLoopAttr.ll b/llvm-spirv/test/transcoding/SPV_INTEL_fpga_loop_controls/FPGAIVDepLoopAttr.ll similarity index 100% rename from llvm-spirv/test/transcoding/FPGAIVDepLoopAttr.ll rename to llvm-spirv/test/transcoding/SPV_INTEL_fpga_loop_controls/FPGAIVDepLoopAttr.ll diff --git a/llvm-spirv/test/transcoding/FPGALoopAttr.ll b/llvm-spirv/test/transcoding/SPV_INTEL_fpga_loop_controls/FPGALoopAttr.ll similarity index 100% rename from llvm-spirv/test/transcoding/FPGALoopAttr.ll rename to llvm-spirv/test/transcoding/SPV_INTEL_fpga_loop_controls/FPGALoopAttr.ll diff --git a/llvm-spirv/test/transcoding/FPGALoopMergeInst.ll b/llvm-spirv/test/transcoding/SPV_INTEL_fpga_loop_controls/FPGALoopMergeInst.ll similarity index 100% rename from llvm-spirv/test/transcoding/FPGALoopMergeInst.ll rename to llvm-spirv/test/transcoding/SPV_INTEL_fpga_loop_controls/FPGALoopMergeInst.ll diff --git a/llvm-spirv/test/transcoding/intel_multiple_fpga_loop_attrs.ll b/llvm-spirv/test/transcoding/SPV_INTEL_fpga_loop_controls/intel_multiple_fpga_loop_attrs.ll similarity index 100% rename from llvm-spirv/test/transcoding/intel_multiple_fpga_loop_attrs.ll rename to llvm-spirv/test/transcoding/SPV_INTEL_fpga_loop_controls/intel_multiple_fpga_loop_attrs.ll diff --git a/llvm-spirv/test/transcoding/inline_asm_basic.cl b/llvm-spirv/test/transcoding/SPV_INTEL_inline_assembly/inline_asm_basic.cl similarity index 100% rename from llvm-spirv/test/transcoding/inline_asm_basic.cl rename to llvm-spirv/test/transcoding/SPV_INTEL_inline_assembly/inline_asm_basic.cl diff --git a/llvm-spirv/test/transcoding/inline_asm_clobbers.cl b/llvm-spirv/test/transcoding/SPV_INTEL_inline_assembly/inline_asm_clobbers.cl similarity index 100% rename from llvm-spirv/test/transcoding/inline_asm_clobbers.cl rename to llvm-spirv/test/transcoding/SPV_INTEL_inline_assembly/inline_asm_clobbers.cl diff --git a/llvm-spirv/test/transcoding/inline_asm_constraints.cl b/llvm-spirv/test/transcoding/SPV_INTEL_inline_assembly/inline_asm_constraints.cl similarity index 100% rename from llvm-spirv/test/transcoding/inline_asm_constraints.cl rename to llvm-spirv/test/transcoding/SPV_INTEL_inline_assembly/inline_asm_constraints.cl diff --git a/llvm-spirv/test/transcoding/buffer_surface_intel.ll b/llvm-spirv/test/transcoding/SPV_INTEL_vector_compute/buffer_surface_intel.ll old mode 100755 new mode 100644 similarity index 100% rename from llvm-spirv/test/transcoding/buffer_surface_intel.ll rename to llvm-spirv/test/transcoding/SPV_INTEL_vector_compute/buffer_surface_intel.ll diff --git a/llvm-spirv/test/transcoding/decoration_byte_offset.ll b/llvm-spirv/test/transcoding/SPV_INTEL_vector_compute/decoration_byte_offset.ll old mode 100755 new mode 100644 similarity index 100% rename from llvm-spirv/test/transcoding/decoration_byte_offset.ll rename to llvm-spirv/test/transcoding/SPV_INTEL_vector_compute/decoration_byte_offset.ll diff --git a/llvm-spirv/test/transcoding/decoration_simt_call.ll 
b/llvm-spirv/test/transcoding/SPV_INTEL_vector_compute/decoration_simt_call.ll old mode 100755 new mode 100644 similarity index 100% rename from llvm-spirv/test/transcoding/decoration_simt_call.ll rename to llvm-spirv/test/transcoding/SPV_INTEL_vector_compute/decoration_simt_call.ll diff --git a/llvm-spirv/test/transcoding/decoration_volatile.ll b/llvm-spirv/test/transcoding/SPV_INTEL_vector_compute/decoration_volatile.ll old mode 100755 new mode 100644 similarity index 100% rename from llvm-spirv/test/transcoding/decoration_volatile.ll rename to llvm-spirv/test/transcoding/SPV_INTEL_vector_compute/decoration_volatile.ll diff --git a/llvm-spirv/test/transcoding/exec_mode_argument_io_kind.ll b/llvm-spirv/test/transcoding/SPV_INTEL_vector_compute/exec_mode_argument_io_kind.ll old mode 100755 new mode 100644 similarity index 100% rename from llvm-spirv/test/transcoding/exec_mode_argument_io_kind.ll rename to llvm-spirv/test/transcoding/SPV_INTEL_vector_compute/exec_mode_argument_io_kind.ll diff --git a/llvm-spirv/test/transcoding/exec_mode_float_control.ll b/llvm-spirv/test/transcoding/SPV_INTEL_vector_compute/exec_mode_float_control.ll old mode 100755 new mode 100644 similarity index 100% rename from llvm-spirv/test/transcoding/exec_mode_float_control.ll rename to llvm-spirv/test/transcoding/SPV_INTEL_vector_compute/exec_mode_float_control.ll diff --git a/llvm-spirv/test/transcoding/exec_mode_shared_local_memory_size.ll b/llvm-spirv/test/transcoding/SPV_INTEL_vector_compute/exec_mode_shared_local_memory_size.ll old mode 100755 new mode 100644 similarity index 100% rename from llvm-spirv/test/transcoding/exec_mode_shared_local_memory_size.ll rename to llvm-spirv/test/transcoding/SPV_INTEL_vector_compute/exec_mode_shared_local_memory_size.ll diff --git a/llvm-spirv/test/transcoding/extension_spirv_intel_vector_compute.ll b/llvm-spirv/test/transcoding/SPV_INTEL_vector_compute/extension_spirv_intel_vector_compute.ll old mode 100755 new mode 100644 similarity index 100% rename from llvm-spirv/test/transcoding/extension_spirv_intel_vector_compute.ll rename to llvm-spirv/test/transcoding/SPV_INTEL_vector_compute/extension_spirv_intel_vector_compute.ll diff --git a/llvm-spirv/test/transcoding/extension_vector_compute_stability.ll b/llvm-spirv/test/transcoding/SPV_INTEL_vector_compute/extension_vector_compute_stability.ll old mode 100755 new mode 100644 similarity index 100% rename from llvm-spirv/test/transcoding/extension_vector_compute_stability.ll rename to llvm-spirv/test/transcoding/SPV_INTEL_vector_compute/extension_vector_compute_stability.ll diff --git a/llvm-spirv/test/transcoding/exec_mode_float_control_empty.ll b/llvm-spirv/test/transcoding/exec_mode_float_control_empty.ll old mode 100755 new mode 100644 diff --git a/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp b/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp index c25d96577eb4e..da5294c25ab88 100644 --- a/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp +++ b/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp @@ -183,6 +183,12 @@ cl::opt SPIRVAllowUnknownIntrinsics( cl::desc("Unknown LLVM intrinsics will be translated as external function " "calls in SPIR-V")); +static cl::opt SPIRVAllowExtraDIExpressions( + "spirv-allow-extra-diexpressions", cl::init(false), + cl::desc("Allow DWARF operations not listed in the OpenCL.DebugInfo.100 " + "specification (experimental, may produce incompatible SPIR-V " + "module)")); + static cl::opt DebugEIS( "spirv-debug-info-version", cl::desc("Set SPIR-V debug info version:"), 
cl::init(SPIRV::DebugInfoEIS::OpenCL_DebugInfo_100), @@ -581,6 +587,10 @@ int main(int Ac, char **Av) { } } + if (SPIRVAllowExtraDIExpressions.getNumOccurrences() != 0) { + Opts.setAllowExtraDIExpressionsEnabled(SPIRVAllowExtraDIExpressions); + } + if (DebugEIS.getNumOccurrences() != 0) { if (IsReverse) { errs() << "Note: --spirv-debug-info-version option ignored as it only " diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index ab8b4cee79483..89a7f4363768b 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -514,6 +514,19 @@ if( WIN32 AND NOT CYGWIN ) set(LLVM_LIT_TOOLS_DIR "" CACHE PATH "Path to GnuWin32 tools") endif() +set(LLVM_INTEGRATED_CRT_ALLOC "" CACHE PATH "Replace the Windows CRT allocator with any of {rpmalloc|mimalloc|snmalloc}. Only works with /MT enabled.") +if(LLVM_INTEGRATED_CRT_ALLOC) + if(NOT WIN32) + message(FATAL_ERROR "LLVM_INTEGRATED_CRT_ALLOC is only supported on Windows.") + endif() + if(LLVM_USE_SANITIZER) + message(FATAL_ERROR "LLVM_INTEGRATED_CRT_ALLOC cannot be used along with LLVM_USE_SANITIZER!") + endif() + if(CMAKE_BUILD_TYPE AND uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG") + message(FATAL_ERROR "The Debug target isn't supported along with LLVM_INTEGRATED_CRT_ALLOC!") + endif() +endif() + # Define options to control the inclusion and default build behavior for # components which may not strictly be necessary (tools, examples, and tests). # @@ -567,19 +580,6 @@ option (LLVM_BUILD_EXTERNAL_COMPILER_RT option (LLVM_VERSION_PRINTER_SHOW_HOST_TARGET_INFO "Show target and host info when tools are invoked with --version." ON) -option(LLVM_INTEGRATED_CRT_ALLOC "Replace the Windows CRT allocator with any of {rpmalloc|mimalloc|snmalloc}. Only works with /MT enabled." OFF) -if(LLVM_INTEGRATED_CRT_ALLOC) - if(NOT WIN32) - message(FATAL_ERROR "LLVM_INTEGRATED_CRT_ALLOC is only supported on Windows.") - endif() - if(LLVM_USE_SANITIZER) - message(FATAL_ERROR "LLVM_INTEGRATED_CRT_ALLOC cannot be used along with LLVM_USE_SANITIZER!") - endif() - if(CMAKE_BUILD_TYPE AND uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG") - message(FATAL_ERROR "The Debug target isn't supported along with LLVM_INTEGRATED_CRT_ALLOC!") - endif() -endif() - # You can configure which libraries from LLVM you want to include in the # shared library by setting LLVM_DYLIB_COMPONENTS to a semi-colon delimited # list of LLVM components. All component names handled by llvm-config are valid. @@ -696,38 +696,19 @@ option(LLVM_ENABLE_PLUGINS "Enable plugin support" ${LLVM_ENABLE_PLUGINS_default include(HandleLLVMOptions) -if(CMAKE_VERSION VERSION_LESS 3.12) - include(FindPythonInterp) - if( NOT PYTHONINTERP_FOUND ) - message(FATAL_ERROR - "Unable to find Python interpreter, required for builds and testing. 
- - Please install Python or specify the PYTHON_EXECUTABLE CMake variable.") - endif() - - if( ${PYTHON_VERSION_STRING} VERSION_LESS 2.7 ) - message(FATAL_ERROR "Python 2.7 or newer is required") +find_package(Python3 COMPONENTS Interpreter) +if(NOT Python3_Interpreter_FOUND) + message(WARNING "Python3 not found, using python2 as a fallback") + find_package(Python2 COMPONENTS Interpreter REQUIRED) + if(Python2_VERSION VERSION_LESS 2.7) + message(SEND_ERROR "Python 2.7 or newer is required") endif() + # Treat python2 as python3 add_executable(Python3::Interpreter IMPORTED) set_target_properties(Python3::Interpreter PROPERTIES - IMPORTED_LOCATION ${PYTHON_EXECUTABLE}) - set(Python3_EXECUTABLE ${PYTHON_EXECUTABLE}) -else() - find_package(Python3 COMPONENTS Interpreter) - if(NOT Python3_Interpreter_FOUND) - message(WARNING "Python3 not found, using python2 as a fallback") - find_package(Python2 COMPONENTS Interpreter REQUIRED) - if(Python2_VERSION VERSION_LESS 2.7) - message(SEND_ERROR "Python 2.7 or newer is required") - endif() - - # Treat python2 as python3 - add_executable(Python3::Interpreter IMPORTED) - set_target_properties(Python3::Interpreter PROPERTIES - IMPORTED_LOCATION ${Python2_EXECUTABLE}) - set(Python3_EXECUTABLE ${Python2_EXECUTABLE}) - endif() + IMPORTED_LOCATION ${Python2_EXECUTABLE}) + set(Python3_EXECUTABLE ${Python2_EXECUTABLE}) endif() ###### diff --git a/llvm/CODE_OWNERS.TXT b/llvm/CODE_OWNERS.TXT index cc1a568032a41..543858c29bd81 100644 --- a/llvm/CODE_OWNERS.TXT +++ b/llvm/CODE_OWNERS.TXT @@ -85,7 +85,11 @@ D: Branch weights and BlockFrequencyInfo N: Hal Finkel E: hfinkel@anl.gov -D: The loop reroller, alias analysis and the PowerPC target +D: The loop reroller and alias analysis + +N: Nemanja Ivanovic +E: nemanja.i.ibm@gmail.com +D: PowerPC Backend N: Dan Gohman E: llvm@sunfishcode.online diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index 6b92180b739e8..eeaebf31c926f 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -137,6 +137,27 @@ if(LLVM_ENABLE_ZLIB) set(LLVM_ENABLE_ZLIB "${HAVE_ZLIB}") endif() +if(LLVM_ENABLE_LIBXML2) + if(LLVM_ENABLE_LIBXML2 STREQUAL FORCE_ON) + find_package(LibXml2 REQUIRED) + elseif(NOT LLVM_USE_SANITIZER MATCHES "Memory.*") + find_package(LibXml2) + endif() + if(LibXml2_FOUND) + # Check if libxml2 we found is usable; for example, we may have found a 32-bit + # library on a 64-bit system which would result in a link-time failure. + cmake_push_check_state() + set(CMAKE_REQUIRED_INCLUDES ${LIBXML2_INCLUDE_DIRS}) + set(CMAKE_REQUIRED_LIBRARIES ${LIBXML2_LIBRARIES}) + check_symbol_exists(xmlReadMemory libxml/xmlreader.h HAVE_LIBXML2) + cmake_pop_check_state() + if(LLVM_ENABLE_LIBXML2 STREQUAL FORCE_ON AND NOT HAVE_LIBXML2) + message(FATAL_ERROR "Failed to configure libxml2") + endif() + endif() + set(LLVM_ENABLE_LIBXML2 "${HAVE_LIBXML2}") +endif() + # Don't look for these libraries if we're using MSan, since uninstrumented third # party code may call MSan interceptors like strlen, leading to false positives. 
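The ``check_symbol_exists(xmlReadMemory libxml/xmlreader.h HAVE_LIBXML2)`` probe in the hunk above works by compiling and linking a tiny translation unit against the candidate library; if the found libxml2 is unusable (for example a 32-bit library on a 64-bit host, as the comment notes), the link fails and ``HAVE_LIBXML2`` ends up false. Roughly what that probe boils down to, as a sketch rather than the literal file CMake generates:

.. code-block:: c++

  #include <libxml/xmlreader.h>

  // Taking the symbol's address forces the linker to resolve xmlReadMemory
  // against the candidate libxml2, so an architecture mismatch surfaces
  // here as a link-time error instead of much later in the build.
  int main() {
    auto *Probe = &xmlReadMemory;
    return Probe != nullptr ? 0 : 1;
  }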
if(NOT LLVM_USE_SANITIZER MATCHES "Memory.*") @@ -161,21 +182,6 @@ if(NOT LLVM_USE_SANITIZER MATCHES "Memory.*") else() set(LLVM_ENABLE_TERMINFO 0) endif() - - find_library(ICONV_LIBRARY_PATH NAMES iconv libiconv libiconv-2 c) - set(LLVM_LIBXML2_ENABLED 0) - set(LIBXML2_FOUND 0) - if((LLVM_ENABLE_LIBXML2) AND ((CMAKE_SYSTEM_NAME MATCHES "Linux") AND (ICONV_LIBRARY_PATH) OR APPLE)) - find_package(LibXml2) - if (LIBXML2_FOUND) - set(LLVM_LIBXML2_ENABLED 1) - if ((CMAKE_OSX_SYSROOT) AND (EXISTS ${CMAKE_OSX_SYSROOT}/${LIBXML2_INCLUDE_DIR})) - include_directories(${CMAKE_OSX_SYSROOT}/${LIBXML2_INCLUDE_DIR}) - else() - include_directories(${LIBXML2_INCLUDE_DIR}) - endif() - endif() - endif() else() set(LLVM_ENABLE_TERMINFO 0) endif() @@ -183,10 +189,6 @@ else() set(LLVM_ENABLE_TERMINFO 0) endif() -if (LLVM_ENABLE_LIBXML2 STREQUAL "FORCE_ON" AND NOT LLVM_LIBXML2_ENABLED) - message(FATAL_ERROR "Failed to congifure libxml2") -endif() - check_library_exists(xar xar_open "" HAVE_LIBXAR) if(HAVE_LIBXAR) set(XAR_LIB xar) diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake index a40cf17426fe0..e57abea427530 100644 --- a/llvm/cmake/modules/AddLLVM.cmake +++ b/llvm/cmake/modules/AddLLVM.cmake @@ -567,7 +567,7 @@ function(llvm_add_library name) endif() if(ARG_SHARED) - if(WIN32) + if(MSVC) set_target_properties(${name} PROPERTIES PREFIX "" ) diff --git a/llvm/cmake/modules/GetLibraryName.cmake b/llvm/cmake/modules/GetLibraryName.cmake new file mode 100644 index 0000000000000..13c0080671a3c --- /dev/null +++ b/llvm/cmake/modules/GetLibraryName.cmake @@ -0,0 +1,17 @@ +# Returns library name for a given path. +function(get_library_name path name) + get_filename_component(path ${path} NAME) + set(prefixes ${CMAKE_FIND_LIBRARY_PREFIXES}) + set(suffixes ${CMAKE_FIND_LIBRARY_SUFFIXES}) + list(FILTER prefixes EXCLUDE REGEX "^\\s*$") + list(FILTER suffixes EXCLUDE REGEX "^\\s*$") + if(prefixes) + string(REPLACE ";" "|" prefixes "${prefixes}") + string(REGEX REPLACE "^(${prefixes})" "" path ${path}) + endif() + if(suffixes) + string(REPLACE ";" "|" suffixes "${suffixes}") + string(REGEX REPLACE "(${suffixes})$" "" path ${path}) + endif() + set(${name} "${path}" PARENT_SCOPE) +endfunction() diff --git a/llvm/cmake/modules/LLVMConfig.cmake.in b/llvm/cmake/modules/LLVMConfig.cmake.in index a5c370bbc25e4..4453020cf4da4 100644 --- a/llvm/cmake/modules/LLVMConfig.cmake.in +++ b/llvm/cmake/modules/LLVMConfig.cmake.in @@ -55,7 +55,10 @@ if(LLVM_ENABLE_ZLIB) find_package(ZLIB) endif() -set(LLVM_LIBXML2_ENABLED @LLVM_LIBXML2_ENABLED@) +set(LLVM_ENABLE_LIBXML2 @LLVM_ENABLE_LIBXML2@) +if(LLVM_ENABLE_LIBXML2) + find_package(LibXml2) +endif() set(LLVM_WITH_Z3 @LLVM_WITH_Z3@) diff --git a/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst b/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst index 8cc29803f2182..777e271423abe 100644 --- a/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst +++ b/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst @@ -2678,7 +2678,7 @@ architectures. DWARF address space identifiers are used by: -* The DWARF expession operations: ``DW_OP_LLVM_aspace_bregx``, +* The DWARF expression operations: ``DW_OP_LLVM_aspace_bregx``, ``DW_OP_LLVM_form_aspace_address``, ``DW_OP_LLVM_implicit_aspace_pointer``, and ``DW_OP_xderef*``. @@ -3387,7 +3387,7 @@ Standard Content Descriptions provided by the* ``DW_LNCT_path`` *field. 
When the source field is absent, consumers can access the file to get the source text.* - *This is particularly useful for programing languages that support runtime + *This is particularly useful for programming languages that support runtime compilation and runtime generation of source text. In these cases, the source text does not reside in any permanent file. For example, the OpenCL language [:ref:`OpenCL `] supports online compilation.* diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 967b667427e05..10f6a3e495092 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -266,9 +266,7 @@ names from both the *Processor* and *Alternative Processor* can be used. .. TODO Add product names. - ``gfx1031`` ``amdgcn`` dGPU - xnack *TBA* - [off] - - wavefrontsize64 + ``gfx1031`` ``amdgcn`` dGPU - wavefrontsize64 *TBA* [off] - cumode [off] diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst index 2972f1dec0e70..5a73b7d45211c 100644 --- a/llvm/docs/CMake.rst +++ b/llvm/docs/CMake.rst @@ -461,6 +461,10 @@ LLVM-specific variables **LLVM_PARALLEL_LINK_JOBS**:STRING Define the maximum number of concurrent link jobs. +**LLVM_EXTERNALIZE_DEBUGINFO**:BOOL + Generate dSYM files and strip executables and libraries (Darwin Only). + Defaults to OFF. + **LLVM_USE_CRT_{target}**:STRING On Windows, tells which version of the C runtime library (CRT) should be used. For example, -DLLVM_USE_CRT_RELEASE=MT would statically link the CRT into the @@ -552,7 +556,7 @@ LLVM-specific variables **SPHINX_EXECUTABLE**:STRING The path to the ``sphinx-build`` executable detected by CMake. For installation instructions, see - http://www.sphinx-doc.org/en/latest/usage/installation.html + https://www.sphinx-doc.org/en/master/usage/installation.html **SPHINX_OUTPUT_HTML**:BOOL If enabled (and ``LLVM_ENABLE_SPHINX`` is enabled) then the targets for diff --git a/llvm/docs/CommandGuide/dsymutil.rst b/llvm/docs/CommandGuide/dsymutil.rst index 78954fcc8d876..ca489cdabf693 100644 --- a/llvm/docs/CommandGuide/dsymutil.rst +++ b/llvm/docs/CommandGuide/dsymutil.rst @@ -111,7 +111,7 @@ OPTIONS debug info. This prints a table after linking with the object file name, the size of the debug info in the object file (in bytes) and the size contributed (in bytes) to the linked dSYM. The table is sorted by the output size listing - the obj ect files with the largest contribution first. + the object files with the largest contribution first. .. option:: --symbol-map diff --git a/llvm/docs/CommandGuide/llvm-install-name-tool.rst b/llvm/docs/CommandGuide/llvm-install-name-tool.rst index da258afbaee8f..87775d4f20d0f 100644 --- a/llvm/docs/CommandGuide/llvm-install-name-tool.rst +++ b/llvm/docs/CommandGuide/llvm-install-name-tool.rst @@ -43,6 +43,10 @@ the same `` value. times to delete multiple rpaths. Throws an error if ```` is not listed in the binary. +.. option:: --help, -h + + Print a summary of command line options. + .. option:: -id Change shared library's identification name under LC_ID_DYLIB to ```` in the @@ -55,6 +59,10 @@ the same `` value. multiple times to change multiple rpaths. Throws an error if ```` is not listed in the binary or ```` is already listed in the binary. +.. option:: --version, -V + + Display the version of the :program:`llvm-install-name-tool` executable. 
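The error rules documented for ``-rpath`` above (the old rpath must already be listed, the new one must not) fit in a few lines. An illustrative sketch with a plain string list standing in for the binary's ``LC_RPATH`` load commands; ``updateRPath`` is a hypothetical helper, not part of the tool:

.. code-block:: c++

  #include <algorithm>
  #include <stdexcept>
  #include <string>
  #include <vector>

  // Mimics the documented behavior of `llvm-install-name-tool -rpath old new`:
  // error if Old is absent or New is already present, otherwise replace.
  void updateRPath(std::vector<std::string> &RPaths, const std::string &Old,
                   const std::string &New) {
    auto It = std::find(RPaths.begin(), RPaths.end(), Old);
    if (It == RPaths.end())
      throw std::runtime_error("no LC_RPATH load command with path: " + Old);
    if (std::find(RPaths.begin(), RPaths.end(), New) != RPaths.end())
      throw std::runtime_error("rpath " + New + " is already listed");
    *It = New;
  }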
+ EXIT STATUS ----------- diff --git a/llvm/docs/Coroutines.rst b/llvm/docs/Coroutines.rst index 3f7cddef9b37d..5afb33fa0a0ab 100644 --- a/llvm/docs/Coroutines.rst +++ b/llvm/docs/Coroutines.rst @@ -257,10 +257,10 @@ Coroutine Transformation One of the steps of coroutine lowering is building the coroutine frame. The def-use chains are analyzed to determine which objects need be kept alive across suspend points. In the coroutine shown in the previous section, use of virtual register -`%n.val` is separated from the definition by a suspend point, therefore, it +`%inc` is separated from the definition by a suspend point, therefore, it cannot reside on the stack frame since the latter goes away once the coroutine is suspended and control is returned back to the caller. An i32 slot is -allocated in the coroutine frame and `%n.val` is spilled and reloaded from that +allocated in the coroutine frame and `%inc` is spilled and reloaded from that slot as needed. We also store addresses of the resume and destroy functions so that the diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 781b2385de500..5e35b913bef4a 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -15824,7 +15824,12 @@ The '``llvm.experimental.vector.reduce.fmax.*``' intrinsics do a floating-point ``MAX`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. -If the intrinsic call has the ``nnan`` fast-math flag then the operation can +This instruction has the same comparison semantics as the '``llvm.maxnum.*``' +intrinsic. That is, the result will always be a number unless all elements of +the vector are NaN. For a vector with maximum element magnitude 0.0 and +containing both +0.0 and -0.0 elements, the sign of the result is unspecified. + +If the intrinsic call has the ``nnan`` fast-math flag, then the operation can assume that NaNs are not present in the input vector. Arguments: @@ -15850,7 +15855,12 @@ The '``llvm.experimental.vector.reduce.fmin.*``' intrinsics do a floating-point ``MIN`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. -If the intrinsic call has the ``nnan`` fast-math flag then the operation can +This instruction has the same comparison semantics as the '``llvm.minnum.*``' +intrinsic. That is, the result will always be a number unless all elements of +the vector are NaN. For a vector with minimum element magnitude 0.0 and +containing both +0.0 and -0.0 elements, the sign of the result is unspecified. + +If the intrinsic call has the ``nnan`` fast-math flag, then the operation can assume that NaNs are not present in the input vector. Arguments: diff --git a/llvm/docs/Lexicon.rst b/llvm/docs/Lexicon.rst index cf194eb0d1d3d..03090827ffe48 100644 --- a/llvm/docs/Lexicon.rst +++ b/llvm/docs/Lexicon.rst @@ -92,6 +92,19 @@ D **DSE** Dead Store Elimination +E +- + +**ento** + This namespace houses the + `Clang Static Analyzer `_. + It is an abbreviaton of `entomology `_. + + *"Entomology is the scientific study of insects."* + + In the past, this namespace had not only the name `GR` (aka. Graph Reachability) + but also `entoSA`. + F - diff --git a/llvm/docs/ORCv2.rst b/llvm/docs/ORCv2.rst index 0396fb0ad8111..67ce6e3d103d3 100644 --- a/llvm/docs/ORCv2.rst +++ b/llvm/docs/ORCv2.rst @@ -468,7 +468,7 @@ are now referred to as ORCv1. 
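The ``llvm.experimental.vector.reduce.fmax`` wording added in the LangRef hunk above ties the reduction to ``llvm.maxnum`` semantics: NaN elements are dropped, so the result is NaN only when every element is NaN. A scalar C++ reference of that behavior, assuming a non-empty input (``reduceFMax`` is an illustrative name; ``std::fmax`` already implements the IEEE-754 maxNum treatment of quiet NaNs, and the ``fmin`` case is symmetric with ``std::fmin``):

.. code-block:: c++

  #include <cmath>
  #include <cstddef>
  #include <cstdio>
  #include <vector>

  // NaN inputs are ignored, matching the llvm.maxnum comparison semantics
  // described in the LangRef change above.
  float reduceFMax(const std::vector<float> &V) {
    float Acc = V[0]; // assumes V is non-empty
    for (std::size_t I = 1, E = V.size(); I != E; ++I)
      Acc = std::fmax(Acc, V[I]); // returns the non-NaN operand if one is NaN
    return Acc;
  }

  int main() {
    std::vector<float> V = {1.0f, NAN, 3.0f};
    std::printf("%f\n", reduceFMax(V)); // prints 3.000000; the NaN is dropped
    return 0;
  }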
The majority of the ORCv1 layers and utilities were renamed with a 'Legacy' prefix in LLVM 8.0, and have deprecation warnings attached in LLVM 9.0. In LLVM -10.0 ORCv1 will be removed entirely. +12.0 ORCv1 will be removed entirely. Transitioning from ORCv1 to ORCv2 should be easy for most clients. Most of the ORCv1 layers and utilities have ORCv2 counterparts [2]_ that can be directly diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 59897806c37a5..47ce9fa10d908 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -69,10 +69,13 @@ Changes to building LLVM Changes to TableGen ------------------- +* The new "TableGen Programmer's Reference" replaces the "TableGen Language + Introduction" and "TableGen Language Reference" documents. + * The syntax for specifying an integer range in a range list has changed. The old syntax used a hyphen in the range (e.g., ``{0-9}``). The new syntax - uses the "`...`" range punctuator (e.g., ``{0...9}``). The hyphen syntax - is deprecated. The "TableGen Language Reference" document has been updated. + uses the "`...`" range punctuation (e.g., ``{0...9}``). The hyphen syntax + is deprecated. Changes to the ARM Backend -------------------------- diff --git a/llvm/docs/TableGen/BackEnds.rst b/llvm/docs/TableGen/BackEnds.rst index 8b31338356689..a93f2ace78808 100644 --- a/llvm/docs/TableGen/BackEnds.rst +++ b/llvm/docs/TableGen/BackEnds.rst @@ -226,16 +226,14 @@ SearchableTables **Purpose**: Generate custom searchable tables. -**Output**: Enums, global tables and lookup helper functions. +**Output**: Enums, global tables, and lookup helper functions. **Usage**: This backend allows generating free-form, target-specific tables from TableGen records. The ARM and AArch64 targets use this backend to generate tables of system registers; the AMDGPU target uses it to generate meta-data about complex image and memory buffer instructions. -More documentation is available in ``include/llvm/TableGen/SearchableTable.td``, -which also contains the definitions of TableGen classes which must be -instantiated in order to define the enums and tables emitted by this backend. +See `SearchableTables Reference`_ for a detailed description. CTags ----- @@ -438,6 +436,381 @@ used for documenting user-facing attributes. General BackEnds ================ +SearchableTables Reference +-------------------------- + +A TableGen include file, ``SearchableTable.td``, provides classes for +generating C++ searchable tables. These tables are described in the +following sections. To generate the C++ code, run ``llvm-tblgen`` with the +``--gen-searchable-tables`` option, which invokes the backend that generates +the tables from the records you provide. + +Each of the data structures generated for searchable tables is guarded by an +``#ifdef``. This allows you to include the generated ``.inc`` file and select only +certain data structures for inclusion. The examples below show the macro +names used in these guards. + +Generic Enumerated Types +~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``GenericEnum`` class makes it easy to define a C++ enumerated type and +the enumerated *elements* of that type. To define the type, define a record +whose parent class is ``GenericEnum`` and whose name is the desired enum +type. This class provides three fields, which you can set in the record +using the ``let`` statement. + +* ``string FilterClass``. The enum type will have one element for each record + that derives from this class. 
These records are collected to assemble the
+  complete set of elements.
+
+* ``string NameField``. The name of a field *in the collected records* that specifies
+  the name of the element. If a record has no such field, the record's
+  name will be used.
+
+* ``string ValueField``. The name of a field *in the collected records* that
+  specifies the numerical value of the element. If a record has no such
+  field, it will be assigned an integer value. Values are assigned in
+  alphabetical order starting with 0.
+
+Here is an example where the values of the elements are specified
+explicitly, as a template argument to the ``BEntry`` class. The resulting
+C++ code is shown.
+
+.. code-block:: text
+
+  def BValues : GenericEnum {
+    let FilterClass = "BEntry";
+    let NameField = "Name";
+    let ValueField = "Encoding";
+  }
+
+  class BEntry<bits<16> enc> {
+    string Name = NAME;
+    bits<16> Encoding = enc;
+  }
+
+  def BFoo : BEntry<0xac>;
+  def BBar : BEntry<0x14>;
+  def BZoo : BEntry<0x80>;
+  def BSnork : BEntry<0x4c>;
+
+.. code-block:: text
+
+  #ifdef GET_BValues_DECL
+  enum BValues {
+    BBar = 20,
+    BFoo = 172,
+    BSnork = 76,
+    BZoo = 128,
+  };
+  #endif
+
+In the following example, the values of the elements are assigned
+automatically. Note that values are assigned from 0, in alphabetical order
+by element name.
+
+.. code-block:: text
+
+  def CEnum : GenericEnum {
+    let FilterClass = "CEnum";
+  }
+
+  class CEnum;
+
+  def CFoo : CEnum;
+  def CBar : CEnum;
+  def CBaz : CEnum;
+
+.. code-block:: text
+
+  #ifdef GET_CEnum_DECL
+  enum CEnum {
+    CBar = 0,
+    CBaz = 1,
+    CFoo = 2,
+  };
+  #endif
+
+
+Generic Tables
+~~~~~~~~~~~~~~
+
+The ``GenericTable`` class is used to define a searchable generic table.
+TableGen produces C++ code to define the table entries and also produces
+the declaration and definition of a function to search the table based on a
+primary key. To define the table, define a record whose parent class is
+``GenericTable`` and whose name is the name of the global table of entries.
+This class provides six fields.
+
+* ``string FilterClass``. The table will have one entry for each record
+  that derives from this class.
+
+* ``string CppTypeName``. The name of the C++ struct/class type of the
+  table that holds the entries. If unspecified, the ``FilterClass`` name is
+  used.
+
+* ``list<string> Fields``. A list of the names of the fields in the
+  collected records that contain the data for the table entries. The order of
+  this list determines the order of the values in the C++ initializers. See
+  below for information about the types of these fields.
+
+* ``list<string> PrimaryKey``. The list of fields that make up the
+  primary key.
+
+* ``string PrimaryKeyName``. The name of the generated C++ function
+  that performs a lookup on the primary key.
+
+* ``bit PrimaryKeyEarlyOut``. See the third example below.
+
+TableGen attempts to deduce the type of each of the table fields. It can
+deduce ``bit``, ``bits<n>``, ``string``, ``Intrinsic``, and ``Instruction``.
+These can be used in the primary key. TableGen also deduces ``code``, but it
+cannot be used in the primary key. Any other field types must be specified
+explicitly; this is done as shown in the second example below. Such fields
+cannot be used in the primary key.
+
+Here is an example where TableGen can deduce the field types. Note that the
+table entry records are anonymous; the names of entry records are
+irrelevant.
+
+.. code-block:: text
+
+  def ATable : GenericTable {
+    let FilterClass = "AEntry";
+    let Fields = ["Str", "Val1", "Val2"];
+    let PrimaryKey = ["Val1", "Val2"];
+    let PrimaryKeyName = "lookupATableByValues";
+  }
+
+  class AEntry<string str, bits<8> val1, bits<10> val2> {
+    string Str = str;
+    bits<8> Val1 = val1;
+    bits<10> Val2 = val2;
+  }
+
+  def : AEntry<"Bob", 5, 3>;
+  def : AEntry<"Carol", 2, 6>;
+  def : AEntry<"Ted", 4, 4>;
+  def : AEntry<"Alice", 4, 5>;
+  def : AEntry<"Costa", 2, 1>;
+
+Here is the generated C++ code. The declaration of ``lookupATableByValues``
+is guarded by ``GET_ATable_DECL``, while the definitions are guarded by
+``GET_ATable_IMPL``.
+
+.. code-block:: text
+
+  #ifdef GET_ATable_DECL
+  const AEntry *lookupATableByValues(uint8_t Val1, uint16_t Val2);
+  #endif
+
+  #ifdef GET_ATable_IMPL
+  constexpr AEntry ATable[] = {
+    { "Costa", 0x2, 0x1 }, // 0
+    { "Carol", 0x2, 0x6 }, // 1
+    { "Ted", 0x4, 0x4 }, // 2
+    { "Alice", 0x4, 0x5 }, // 3
+    { "Bob", 0x5, 0x3 }, // 4
+  };
+
+  const AEntry *lookupATableByValues(uint8_t Val1, uint16_t Val2) {
+    struct KeyType {
+      uint8_t Val1;
+      uint16_t Val2;
+    };
+    KeyType Key = { Val1, Val2 };
+    auto Table = makeArrayRef(ATable);
+    auto Idx = std::lower_bound(Table.begin(), Table.end(), Key,
+      [](const AEntry &LHS, const KeyType &RHS) {
+        if (LHS.Val1 < RHS.Val1)
+          return true;
+        if (LHS.Val1 > RHS.Val1)
+          return false;
+        if (LHS.Val2 < RHS.Val2)
+          return true;
+        if (LHS.Val2 > RHS.Val2)
+          return false;
+        return false;
+      });
+
+    if (Idx == Table.end() ||
+        Key.Val1 != Idx->Val1 ||
+        Key.Val2 != Idx->Val2)
+      return nullptr;
+    return &*Idx;
+  }
+  #endif
+
+The table entries in ``ATable`` are sorted in order by ``Val1``, and within
+each of those values, by ``Val2``. This allows a binary search of the table,
+which is performed in the lookup function by ``std::lower_bound``. The
+lookup function returns a reference to the found table entry, or the null
+pointer if no entry is found.
+
+This example includes a field whose type TableGen cannot deduce. The ``Kind``
+field uses the enumerated type ``CEnum`` defined above. To inform TableGen
+of the type, the class derived from ``GenericTable`` must include a field
+named ``TypeOf_``\ *field*, where *field* is the name of the field whose type
+is required.
+
+.. code-block:: text
+
+  def CTable : GenericTable {
+    let FilterClass = "CEntry";
+    let Fields = ["Name", "Kind", "Encoding"];
+    GenericEnum TypeOf_Kind = CEnum;
+    let PrimaryKey = ["Encoding"];
+    let PrimaryKeyName = "lookupCEntryByEncoding";
+  }
+
+  class CEntry<string name, CEnum kind, bits<16> enc> {
+    string Name = name;
+    CEnum Kind = kind;
+    bits<16> Encoding = enc;
+  }
+
+  def : CEntry<"Apple", CFoo, 10>;
+  def : CEntry<"Pear", CBaz, 15>;
+  def : CEntry<"Apple", CBar, 13>;
+
+Here is the generated C++ code.
+
+..
code-block:: text + + #ifdef GET_CTable_DECL + const CEntry *lookupCEntryByEncoding(uint16_t Encoding); + #endif + + #ifdef GET_CTable_IMPL + constexpr CEntry CTable[] = { + { "Apple", CFoo, 0xA }, // 0 + { "Apple", CBar, 0xD }, // 1 + { "Pear", CBaz, 0xF }, // 2 + }; + + const CEntry *lookupCEntryByEncoding(uint16_t Encoding) { + struct KeyType { + uint16_t Encoding; + }; + KeyType Key = { Encoding }; + auto Table = makeArrayRef(CTable); + auto Idx = std::lower_bound(Table.begin(), Table.end(), Key, + [](const CEntry &LHS, const KeyType &RHS) { + if (LHS.Encoding < RHS.Encoding) + return true; + if (LHS.Encoding > RHS.Encoding) + return false; + return false; + }); + + if (Idx == Table.end() || + Key.Encoding != Idx->Encoding) + return nullptr; + return &*Idx; + } + +The ``PrimaryKeyEarlyOut`` field, when set to 1, modifies the lookup +function so that it tests the first field of the primary key to determine +whether it is within the range of the collected records' primary keys. If +not, the function returns the null pointer without performing the binary +search. This is useful for tables that provide data for only some of the +elements of a larger enum-based space. The first field of the primary key +must be an integral type; it cannot be a string. + +Adding ``let PrimaryKeyEarlyOut = 1`` to the ``ATable`` above: + +.. code-block:: text + + def ATable : GenericTable { + let FilterClass = "AEntry"; + let Fields = ["Str", "Val1", "Val2"]; + let PrimaryKey = ["Val1", "Val2"]; + let PrimaryKeyName = "lookupATableByValues"; + let PrimaryKeyEarlyOut = 1; + } + +causes the lookup function to change as follows: + +.. code-block:: text + + const AEntry *lookupATableByValues(uint8_t Val1, uint16_t Val2) { + if ((Val1 < 0x2) || + (Val1 > 0x5)) + return nullptr; + + struct KeyType { + ... + +Search Indexes +~~~~~~~~~~~~~~ + +The ``SearchIndex`` class is used to define additional lookup functions for +generic tables. To define an additional function, define a record whose parent +class is ``SearchIndex`` and whose name is the name of the desired lookup +function. This class provides three fields. + +* ``GenericTable Table``. The name of the table that is to receive another + lookup function. + +* ``list Key``. The list of fields that make up the secondary key. + +* ``bit EarlyOut``. See the third example in `Generic Tables`_. + +Here is an example of a secondary key added to the ``CTable`` above. The +generated function looks up entries based on the ``Name`` and ``Kind`` fields. + +.. code-block:: text + + def lookupCEntry : SearchIndex { + let Table = CTable; + let Key = ["Name", "Kind"]; + } + +This use of ``SearchIndex`` generates the following additional C++ code. + +.. code-block:: text + + const CEntry *lookupCEntry(StringRef Name, unsigned Kind); + + ... 
+ + const CEntry *lookupCEntryByName(StringRef Name, unsigned Kind) { + struct IndexType { + const char * Name; + unsigned Kind; + unsigned _index; + }; + static const struct IndexType Index[] = { + { "APPLE", CBar, 1 }, + { "APPLE", CFoo, 0 }, + { "PEAR", CBaz, 2 }, + }; + + struct KeyType { + std::string Name; + unsigned Kind; + }; + KeyType Key = { Name.upper(), Kind }; + auto Table = makeArrayRef(Index); + auto Idx = std::lower_bound(Table.begin(), Table.end(), Key, + [](const IndexType &LHS, const KeyType &RHS) { + int CmpName = StringRef(LHS.Name).compare(RHS.Name); + if (CmpName < 0) return true; + if (CmpName > 0) return false; + if ((unsigned)LHS.Kind < (unsigned)RHS.Kind) + return true; + if ((unsigned)LHS.Kind > (unsigned)RHS.Kind) + return false; + return false; + }); + + if (Idx == Table.end() || + Key.Name != Idx->Name || + Key.Kind != Idx->Kind) + return nullptr; + return &CTable[Idx->_index]; + } + JSON ---- diff --git a/llvm/docs/TableGen/ProgRef.rst b/llvm/docs/TableGen/ProgRef.rst index 83684ab41c280..7bc70c8f89e6d 100644 --- a/llvm/docs/TableGen/ProgRef.rst +++ b/llvm/docs/TableGen/ProgRef.rst @@ -140,7 +140,7 @@ the file is printed for review. The following are the basic punctuation tokens:: - - + [ ] { } ( ) < > : ; . = ? # + - + [ ] { } ( ) < > : ; . ... = ? # Literals -------- @@ -285,10 +285,11 @@ wide range of records conveniently and compactly. ``dag`` This type represents a nestable directed acyclic graph (DAG) of nodes. - Each node has an operator and one or more operands. A operand can be + Each node has an operator and zero or more operands. A operand can be another ``dag`` object, allowing an arbitrary tree of nodes and edges. - As an example, DAGs are used to represent code and patterns for use by - the code generator instruction selection algorithms. + As an example, DAGs are used to represent code patterns for use by + the code generator instruction selection algorithms. See `Directed + acyclic graphs (DAGs)`_ for more details; :token:`ClassID` Specifying a class name in a type context indicates @@ -328,8 +329,8 @@ to an entity of type ``bits<4>``. .. warning:: The peculiar last form of :token:`RangePiece` is due to the fact that the "``-``" is included in the :token:`TokInteger`, hence ``1-5`` gets lexed as - two consecutive tokens, with values ``1`` and ``-5``, - instead of "1", "-", and "5". + two consecutive tokens, with values ``1`` and ``-5``, instead of "1", "-", + and "5". The use of hyphen as the range punctuation is deprecated. Simple values ------------- @@ -374,6 +375,7 @@ sometimes not when the value is the empty list (``[]``). This represents a DAG initializer (note the parentheses). The first :token:`DagArg` is called the "operator" of the DAG and must be a record. +See `Directed acyclic graphs (DAGs)`_ for more details. .. productionlist:: SimpleValue6: `TokIdentifier` @@ -431,7 +433,7 @@ sense after reading the remainder of this guide. * The iteration variable of a ``foreach``, such as the use of ``i`` in:: - foreach i = 0..5 in + foreach i = 0...5 in def Foo#i; .. productionlist:: @@ -466,11 +468,11 @@ primary value. Here are the possible suffixes for some primary *value*. *value*\ ``{17}`` The final value is bit 17 of the integer *value* (note the braces). -*value*\ ``{8..15}`` +*value*\ ``{8...15}`` The final value is bits 8--15 of the integer *value*. The order of the - bits can be reversed by specifying ``{15..8}``. + bits can be reversed by specifying ``{15...8}``. 
-*value*\ ``[4..7,17,2..3,4]`` +*value*\ ``[4...7,17,2...3,4]`` The final value is a new list that is a slice of the list *value* (note the brackets). The new list contains elements 4, 5, 6, 7, 17, 2, 3, and 4. Elements may be @@ -582,7 +584,7 @@ in a ``bit`` field. The ``defvar`` form defines a variable whose value can be used in other value expressions within the body. The variable is not a field: it does not become a field of the class or record being defined. Variables are provided -to hold temporary values while processing the body. See `Defvar in Record +to hold temporary values while processing the body. See `Defvar in a Record Body`_ for more details. When class ``C2`` inherits from class ``C1``, it acquires all the field @@ -827,10 +829,13 @@ template that expands into multiple records. MultiClassID: `TokIdentifier` As with regular classes, the multiclass has a name and can accept template -arguments. The body of the multiclass contains a series of statements that -define records, using :token:`Def` and :token:`Defm`. In addition, -:token:`Defvar`, :token:`Foreach`, and :token:`Let` -statements can be used to factor out even more common elements. +arguments. A multiclass can inherit from other multiclasses, which causes +the other multiclasses to be expanded and contribute to the record +definitions in the inheriting multiclass. The body of the multiclass +contains a series of statements that define records, using :token:`Def` and +:token:`Defm`. In addition, :token:`Defvar`, :token:`Foreach`, and +:token:`Let` statements can be used to factor out even more common elements. +The :token:`If` statement can also be used. Also as with regular classes, the multiclass has the implicit template argument ``NAME`` (see NAME_). When a named (non-anonymous) record is @@ -1126,10 +1131,10 @@ the next iteration. The following ``defvar`` will not work:: defvar i = !add(i, 1) Variables can also be defined with ``defvar`` in a record body. See -`Defvar in Record Body`_ for more details. +`Defvar in a Record Body`_ for more details. -``foreach`` --- iterate over a sequence ---------------------------------------- +``foreach`` --- iterate over a sequence of statements +----------------------------------------------------- The ``foreach`` statement iterates over a series of statements, varying a variable over a sequence of values. @@ -1190,7 +1195,7 @@ the usual way: in a case like ``if v1 then if v2 then {...} else {...}``, the The :token:`IfBody` of the then and else arms of the ``if`` establish an inner scope. Any ``defvar`` variables defined in the bodies go out of scope -when the bodies are finished (see `Defvar in Record Body`_ for more details). +when the bodies are finished (see `Defvar in a Record Body`_ for more details). The ``if`` statement can also be used in a record :token:`Body`. @@ -1198,8 +1203,41 @@ The ``if`` statement can also be used in a record :token:`Body`. Additional Details ================== -Defvar in record body ---------------------- +Directed acyclic graphs (DAGs) +------------------------------ + +A directed acyclic graph can be represented directly in TableGen using the +``dag`` datatype. A DAG node consists of an operator and zero or more +operands. Each operand can be of any desired type. By using another DAG node +as an operand, an arbitrary graph of DAG nodes can be built. + +The syntax of a ``dag`` instance is: + + ``(`` *operator* *operand1*\ ``,`` *operand2*\ ``,`` ... ``)`` + +The operator must be present and must be a record. 
There can be zero or more +operands, separated by commas. The operator and operands can have three +formats. + +====================== ============================================= +Format Meaning +====================== ============================================= +*value* operand value +*value*\ ``:``\ *name* operand value and associated name +*name* operand name with unset (uninitialized) value +====================== ============================================= + +The *value* can be any TableGen value. The *name*, if present, must be a +:token:`TokVarName`, which starts with a dollar sign (``$``). The purpose of +a name is to tag an operator or operand in a DAG with a particular meaning, +or to associate an operand in one DAG with a like-named operand in another +DAG. + +The following bang operators manipulate DAGs: ``!con``, ``!dag``, ``!foreach``, +``!getop``, ``!setop``. + +Defvar in a record body +----------------------- In addition to defining global variables, the ``defvar`` statement can be used inside the :token:`Body` of a class or record definition to define @@ -1529,7 +1567,7 @@ and non-0 as true. ``!shl(``\ *a*\ ``,`` *count*\ ``)`` This operator shifts *a* left logically by *count* bits and produces the resulting value. The operation is performed on a 64-bit integer; the result - is undefined for shift counts outside 0..63. + is undefined for shift counts outside 0...63. ``!size(``\ *a*\ ``)`` This operator produces the number of elements in the list *a*. @@ -1537,12 +1575,12 @@ and non-0 as true. ``!sra(``\ *a*\ ``,`` *count*\ ``)`` This operator shifts *a* right arithmetically by *count* bits and produces the resulting value. The operation is performed on a 64-bit integer; the result - is undefined for shift counts outside 0..63. + is undefined for shift counts outside 0...63. ``!srl(``\ *a*\ ``,`` *count*\ ``)`` This operator shifts *a* right logically by *count* bits and produces the resulting value. The operation is performed on a 64-bit integer; the result - is undefined for shift counts outside 0..63. + is undefined for shift counts outside 0...63. ``!strconcat(``\ *str1*\ ``,`` *str2*\ ``, ...)`` This operator concatenates the string arguments *str1*, *str2*, etc., and diff --git a/llvm/docs/TransformMetadata.rst b/llvm/docs/TransformMetadata.rst index 817b41b43711d..3c0e10b3eb7a5 100644 --- a/llvm/docs/TransformMetadata.rst +++ b/llvm/docs/TransformMetadata.rst @@ -196,7 +196,7 @@ is transformed into (using an unroll factor of 4): .. code-block:: c int i = 0; - for (; i + 3 < n; i+=4) // unrolled loop + for (; i + 3 < n; i+=4) { // unrolled loop Stmt(i); Stmt(i+1); Stmt(i+2); diff --git a/llvm/docs/UserGuides.rst b/llvm/docs/UserGuides.rst index 2e0cffb711ef9..00e99db297f78 100644 --- a/llvm/docs/UserGuides.rst +++ b/llvm/docs/UserGuides.rst @@ -54,6 +54,7 @@ intermediate LLVM representation. TableGenFundamentals Vectorizers WritingAnLLVMPass + WritingAnLLVMNewPMPass WritingAnLLVMBackend yaml2obj @@ -107,6 +108,10 @@ Optimizations :doc:`WritingAnLLVMPass` Information on how to write LLVM transformations and analyses. +:doc:`WritingAnLLVMNewPMPass` + Information on how to write LLVM transformations under the new pass + manager. + :doc:`Passes` A list of optimizations and analyses implemented in LLVM. 
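The brace fix in the ``TransformMetadata.rst`` hunk above completes an unroll-by-4 example; spelled out with the remainder loop that such a transformation needs, the shape is as follows (a sketch; ``stmt`` stands in for the loop body):

.. code-block:: c++

  #include <cstdio>

  void stmt(int I) { std::printf("%d\n", I); }

  void unrolledBy4(int N) {
    int I = 0;
    for (; I + 3 < N; I += 4) { // unrolled loop: four iterations per trip
      stmt(I);
      stmt(I + 1);
      stmt(I + 2);
      stmt(I + 3);
    }
    for (; I < N; ++I) // epilogue handles the 0-3 leftover iterations
      stmt(I);
  }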
diff --git a/llvm/docs/WritingAnLLVMNewPMPass.rst b/llvm/docs/WritingAnLLVMNewPMPass.rst new file mode 100644 index 0000000000000..a876ec4ceb005 --- /dev/null +++ b/llvm/docs/WritingAnLLVMNewPMPass.rst @@ -0,0 +1,209 @@ +==================== +Writing an LLVM Pass +==================== + +.. program:: opt + +.. contents:: + :local: + +Introduction --- What is a pass? +================================ + +The LLVM pass framework is an important part of the LLVM system, because LLVM +passes are where most of the interesting parts of the compiler exist. Passes +perform the transformations and optimizations that make up the compiler; they +build the analysis results that are used by these transformations; and they +are, above all, a structuring technique for compiler code. + +Unlike passes under the legacy pass manager where the pass interface is +defined via inheritance, passes under the new pass manager rely on +concept-based polymorphism, meaning there is no explicit interface (see +comments in ``PassManager.h`` for more details). All LLVM passes inherit from +the CRTP mix-in ``PassInfoMixin<PassT>``. The pass should have a ``run()`` +method which returns a ``PreservedAnalyses`` and takes in some unit of IR +along with an analysis manager. For example, a function pass would have a +``PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);`` method. + +We start by showing you how to construct a pass, from setting up the build +and creating the pass to executing and testing it. Looking at existing passes +is always a great way to learn details. + +Quick Start --- Writing hello world +=================================== + +Here we describe how to write the "hello world" of passes. The "HelloWorld" +pass is designed to simply print out the name of non-external functions that +exist in the program being compiled. It does not modify the program at all; +it just inspects it. + +The code below already exists; feel free to create a pass with a different +name alongside the HelloWorld source files. + +.. _writing-an-llvm-npm-pass-build: + +Setting up the build +-------------------- + +First, configure and build LLVM as described in :doc:`GettingStarted`. + +Next, we will reuse an existing directory (creating a new directory involves +modifying more ``CMakeLists.txt``s and ``LLVMBuild.txt``s than we want). For +this example, we'll use ``llvm/lib/Transforms/HelloNew/HelloWorld.cpp``, +which has already been created. If you'd like to create your own pass, add a +new source file into ``llvm/lib/Transforms/HelloNew/CMakeLists.txt`` under +``HelloWorld.cpp``: + +.. code-block:: cmake + + add_llvm_component_library(LLVMHelloWorld + HelloWorld.cpp + + DEPENDS + intrinsics_gen + ) + +Now that we have the build set up for a new pass, we need to write the code +for the pass itself. + +.. _writing-an-llvm-npm-pass-basiccode: + +Basic code required +------------------- + +Now that the build is set up for a new pass, we just have to write it. + +First we need to define the pass in a header file. We'll create +``llvm/include/llvm/Transforms/HelloNew/HelloWorld.h``. The file should +contain the following boilerplate: + +..
code-block:: c++ + + #ifndef LLVM_TRANSFORMS_HELLONEW_HELLOWORLD_H + #define LLVM_TRANSFORMS_HELLONEW_HELLOWORLD_H + + #include "llvm/IR/PassManager.h" + + namespace llvm { + + class HelloWorldPass : public PassInfoMixin<HelloWorldPass> { + public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + }; + + } // namespace llvm + + #endif // LLVM_TRANSFORMS_HELLONEW_HELLOWORLD_H + +This creates the class for the pass with a declaration of the ``run()`` +method which actually runs the pass. Inheriting from +``PassInfoMixin<HelloWorldPass>`` sets up some more boilerplate so that we +don't have to write it ourselves. + +Our class is in the ``llvm`` namespace so that we don't pollute the global +namespace. + +Next we'll create ``llvm/lib/Transforms/HelloNew/HelloWorld.cpp``, starting +with + +.. code-block:: c++ + + #include "llvm/Transforms/HelloNew/HelloWorld.h" + +... to include the header file we just created. + +.. code-block:: c++ + + using namespace llvm; + +... is required because the functions from the include files live in the llvm +namespace. This should only be done in non-header files. + +Next we have the pass's ``run()`` definition: + +.. code-block:: c++ + + PreservedAnalyses HelloWorldPass::run(Function &F, + FunctionAnalysisManager &AM) { + errs() << F.getName() << "\n"; + return PreservedAnalyses::all(); + } + +... which simply prints out the name of the function to stderr. The pass +manager will ensure that the pass will be run on every function in a module. +The ``PreservedAnalyses`` return value says that all analyses (e.g. dominator +tree) are still valid after this pass since we didn't modify any functions. + +That's it for the pass itself. Now in order to "register" the pass, we need +to add it to a couple of places. Add the following to +``llvm/lib/Passes/PassRegistry.def`` in the ``FUNCTION_PASS`` section: + +.. code-block:: c++ + + FUNCTION_PASS("helloworld", HelloWorldPass()) + +... which adds the pass under the name "helloworld". + +``llvm/lib/Passes/PassRegistry.def`` is #include'd into +``llvm/lib/Passes/PassBuilder.cpp`` multiple times for various reasons. Since +it constructs our pass, we need to also add the proper #include in +``llvm/lib/Passes/PassBuilder.cpp``: + +.. code-block:: c++ + + #include "llvm/Transforms/HelloNew/HelloWorld.h" + +This should be all the code necessary for our pass; now it's time to compile +and run it. + +Running a pass with ``opt`` +--------------------------- + +Now that we have a brand new shiny pass, we can build :program:`opt` and use +it to run some LLVM IR through the pass. + +.. code-block:: console + + $ ninja -C build/ opt + # or whatever build system/build directory you are using + + $ cat /tmp/a.ll + define i32 @foo() { + %a = add i32 2, 3 + ret i32 %a + } + + define void @bar() { + ret void + } + + $ build/bin/opt -disable-output /tmp/a.ll -passes=helloworld + foo + bar + +Our pass ran and printed the names of functions as expected! + +Testing a pass +-------------- + +Testing our pass is important to prevent future regressions. We'll add a lit +test at ``llvm/test/Transforms/HelloNew/helloworld.ll``. See +:doc:`TestingGuide` for more information on testing. + +..
code-block:: llvm + + $ cat llvm/test/Transforms/HelloNew/helloworld.ll + ; RUN: opt -disable-output -passes=helloworld %s 2>&1 | FileCheck %s + + ; CHECK: {{^}}foo{{$}} + define i32 @foo() { + %a = add i32 2, 3 + ret i32 %a + } + + ; CHECK-NEXT: {{^}}bar{{$}} + define void @bar() { + ret void + } + + $ ninja -C build check-llvm + # runs our new test alongside all other llvm lit tests diff --git a/llvm/docs/WritingAnLLVMPass.rst b/llvm/docs/WritingAnLLVMPass.rst index 88f481ba6b076..7a24659e62942 100644 --- a/llvm/docs/WritingAnLLVMPass.rst +++ b/llvm/docs/WritingAnLLVMPass.rst @@ -34,6 +34,10 @@ We start by showing you how to construct a pass, everything from setting up the code, to compiling, loading, and executing it. After the basics are down, more advanced features are discussed. +This document deals with the legacy pass manager. LLVM is transitioning to +the new pass manager, which has its own way of defining passes. For more +details, see :doc:`WritingAnLLVMNewPMPass`. + Quick Start --- Writing hello world =================================== diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst index c37c9600f51e7..7170b0fb25de0 100644 --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst @@ -90,7 +90,7 @@ detail, we just need a single instance to pass into APIs that require it. The ``Builder`` object is a helper object that makes it easy to generate LLVM instructions. Instances of the -`IRBuilder `_ +`IRBuilder `_ class template keep track of the current place to insert instructions and has methods to create new instructions. diff --git a/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp b/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp index 4de4897053c1b..24cf0847558f9 100644 --- a/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp +++ b/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp @@ -113,14 +113,13 @@ class SpeculativeJIT { this->CODLayer.setImplMap(&Imps); this->ES->setDispatchMaterialization( [this](std::unique_ptr MU, - MaterializationResponsibility MR) { - // FIXME: Switch to move capture once we have C++14. 
- auto SharedMU = std::shared_ptr<MaterializationUnit>(std::move(MU)); - auto SharedMR = - std::make_shared<MaterializationResponsibility>(std::move(MR)); - CompileThreads.async([SharedMU, SharedMR]() { - SharedMU->materialize(std::move(*SharedMR)); - }); + std::unique_ptr<MaterializationResponsibility> MR) { + CompileThreads.async( + [UnownedMU = MU.release(), UnownedMR = MR.release()]() { + std::unique_ptr<MaterializationUnit> MU(UnownedMU); + std::unique_ptr<MaterializationResponsibility> MR(UnownedMR); + MU->materialize(std::move(MR)); + }); }); ExitOnErr(S.addSpeculationRuntime(MainJD, Mangle)); LocalCXXRuntimeOverrides CXXRuntimeoverrides; diff --git a/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp index 345bfd8dd8705..df844bf19b9cc 100644 --- a/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp +++ b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp @@ -120,8 +120,8 @@ void ThinLtoInstrumentationLayer::nudgeIntoDiscovery( LLVM_DEBUG(dbgs() << "Nudged " << Count << " new functions into discovery\n"); } -void ThinLtoInstrumentationLayer::emit(MaterializationResponsibility R, - ThreadSafeModule TSM) { +void ThinLtoInstrumentationLayer::emit( + std::unique_ptr<MaterializationResponsibility> R, ThreadSafeModule TSM) { TSM.withModuleDo([this](Module &M) { std::vector<Function *> FunctionsToInstrument; diff --git a/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h index cd87207894745..25006b40607fe 100644 --- a/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h +++ b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h @@ -34,7 +34,8 @@ class ThinLtoInstrumentationLayer : public IRLayer { ~ThinLtoInstrumentationLayer() override; - void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override; + void emit(std::unique_ptr<MaterializationResponsibility> R, + ThreadSafeModule TSM) override; unsigned reserveDiscoveryFlags(unsigned Count); void registerDiscoveryFlagOwners(std::vector<GlobalValue::GUID> Guids, diff --git a/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp b/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp index f5c2b0696f55c..e668be7d11b7e 100644 --- a/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp +++ b/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp @@ -267,19 +267,18 @@ void ThinLtoJIT::setupLayers(JITTargetMachineBuilder JTMB, llvm::hardware_concurrency(NumCompileThreads)); ES.setDispatchMaterialization( [this](std::unique_ptr<MaterializationUnit> MU, - MaterializationResponsibility MR) { + std::unique_ptr<MaterializationResponsibility> MR) { if (IsTrivialModule(MU.get())) { // This should be quick and we may save a few session locks. MU->materialize(std::move(MR)); } else { // FIXME: Drop the std::shared_ptr workaround once ThreadPool::async() // accepts llvm::unique_function to define jobs. - auto SharedMU = std::shared_ptr<MaterializationUnit>(std::move(MU)); - auto SharedMR = - std::make_shared<MaterializationResponsibility>(std::move(MR)); CompileThreads->async( - [MU = std::move(SharedMU), MR = std::move(SharedMR)]() { - MU->materialize(std::move(*MR)); + [UnownedMU = MU.release(), UnownedMR = MR.release()]() { + std::unique_ptr<MaterializationUnit> MU(UnownedMU); + std::unique_ptr<MaterializationResponsibility> MR(UnownedMR); + MU->materialize(std::move(MR)); }); } }); diff --git a/llvm/include/llvm-c/Orc.h b/llvm/include/llvm-c/Orc.h index 09a058846108a..6271ab689c8b1 100644 --- a/llvm/include/llvm-c/Orc.h +++ b/llvm/include/llvm-c/Orc.h @@ -112,6 +112,42 @@ LLVMOrcExecutionSessionIntern(LLVMOrcExecutionSessionRef ES, const char *Name); */ void LLVMOrcReleaseSymbolStringPoolEntry(LLVMOrcSymbolStringPoolEntryRef S); +/** + * Create a "bare" JITDylib. + * + * The client is responsible for ensuring that the JITDylib's name is unique, + * e.g. by calling LLVMOrcExecutionSessionGetJITDylibByName first. + * + * This call does not install any library code or symbols into the newly + * created JITDylib. The client is responsible for all configuration. + */ +LLVMOrcJITDylibRef +LLVMOrcExecutionSessionCreateBareJITDylib(LLVMOrcExecutionSessionRef ES, + const char *Name); + +/** + * Create a JITDylib. + * + * The client is responsible for ensuring that the JITDylib's name is unique, + * e.g. by calling LLVMOrcExecutionSessionGetJITDylibByName first. + * + * If a Platform is attached to the ExecutionSession then + * Platform::setupJITDylib will be called to install standard platform symbols + * (e.g. standard library interposes). If no Platform is installed then this + * call is equivalent to LLVMOrcExecutionSessionCreateBareJITDylib and will + * always return success. + */ +LLVMErrorRef +LLVMOrcExecutionSessionCreateJITDylib(LLVMOrcExecutionSessionRef ES, + LLVMOrcJITDylibRef *Result, + const char *Name); + +/** + * Returns the JITDylib with the given name, or NULL if no such JITDylib + * exists. + */ +LLVMOrcJITDylibRef LLVMOrcExecutionSessionGetJITDylibByName(const char *Name); + /** * Dispose of a JITDylib::DefinitionGenerator. This should only be called if * ownership has not been passed to a JITDylib (e.g. because some error diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index 876e52c150a05..1f9ac22621a6d 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -249,7 +249,7 @@ class IEEEFloat final : public APFloatBase { /// \name Constructors /// @{ - IEEEFloat(const fltSemantics &); // Default construct to 0.0 + IEEEFloat(const fltSemantics &); // Default construct to +0.0 IEEEFloat(const fltSemantics &, integerPart); IEEEFloat(const fltSemantics &, uninitializedTag); IEEEFloat(const fltSemantics &, const APInt &); @@ -539,6 +539,9 @@ class IEEEFloat final : public APFloatBase { roundingMode) const; opStatus roundSignificandWithExponent(const integerPart *, unsigned int, int, roundingMode); + ExponentType exponentNaN() const; + ExponentType exponentInf() const; + ExponentType exponentZero() const; /// @} diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index 5e4206732f4df..fdc0850d21eb0 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -794,11 +794,10 @@ class LLVM_NODISCARD APInt { APInt &operator=(uint64_t RHS) { if (isSingleWord()) { U.VAL = RHS; - clearUnusedBits(); - } else { - U.pVal[0] = RHS; - memset(U.pVal+1, 0, (getNumWords() - 1) * APINT_WORD_SIZE); + return clearUnusedBits(); } + U.pVal[0] = RHS; + memset(U.pVal + 1, 0, (getNumWords() - 1) * APINT_WORD_SIZE); return *this; } @@ -855,10 +854,9 @@ class LLVM_NODISCARD APInt { APInt &operator|=(uint64_t RHS) { if (isSingleWord()) { U.VAL |= RHS; - clearUnusedBits(); - } else { - U.pVal[0] |= RHS; + return clearUnusedBits(); } + U.pVal[0] |= RHS; return *this; } @@ -885,10 +883,9 @@ class LLVM_NODISCARD APInt { APInt &operator^=(uint64_t RHS) { if (isSingleWord()) { U.VAL ^= RHS; - clearUnusedBits(); - } else { - U.pVal[0] ^= RHS; + return clearUnusedBits(); } + U.pVal[0] ^= RHS; return *this; } diff --git a/llvm/include/llvm/Analysis/AliasSetTracker.h b/llvm/include/llvm/Analysis/AliasSetTracker.h index 690a94d9cf2ce..1db657528d194 100644 --- a/llvm/include/llvm/Analysis/AliasSetTracker.h +++ b/llvm/include/llvm/Analysis/AliasSetTracker.h @@ -23,6 +23,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/PassManager.h" #include
"llvm/IR/ValueHandle.h" #include "llvm/Support/Casting.h" #include @@ -457,6 +458,14 @@ inline raw_ostream& operator<<(raw_ostream &OS, const AliasSetTracker &AST) { return OS; } +class AliasSetsPrinterPass : public PassInfoMixin { + raw_ostream &OS; + +public: + explicit AliasSetsPrinterPass(raw_ostream &OS); + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + } // end namespace llvm #endif // LLVM_ANALYSIS_ALIASSETTRACKER_H diff --git a/llvm/include/llvm/Analysis/ConstraintSystem.h b/llvm/include/llvm/Analysis/ConstraintSystem.h new file mode 100644 index 0000000000000..f4e6dfbefc82b --- /dev/null +++ b/llvm/include/llvm/Analysis/ConstraintSystem.h @@ -0,0 +1,78 @@ +//===- ConstraintSystem.h - A system of linear constraints. --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_CONSTRAINTSYSTEM_H +#define LLVM_ANALYSIS_CONSTRAINTSYSTEM_H + +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" + +#include + +namespace llvm { + +class ConstraintSystem { + /// Current linear constraints in the system. + /// An entry of the form c0, c1, ... cn represents the following constraint: + /// c0 >= v0 * c1 + .... + v{n-1} * cn + SmallVector, 4> Constraints; + + /// Current greatest common divisor for all coefficients in the system. + uint32_t GCD = 1; + + // Eliminate constraints from the system using Fourier–Motzkin elimination. + bool eliminateUsingFM(); + + /// Print the constraints in the system, using \p Names as variable names. + void dump(ArrayRef Names) const; + + /// Print the constraints in the system, using x0...xn as variable names. + void dump() const; + + /// Returns true if there may be a solution for the constraints in the system. + bool mayHaveSolutionImpl(); + +public: + void addVariableRow(const SmallVector &R) { + assert(Constraints.empty() || R.size() == Constraints.back().size()); + for (const auto &C : R) { + auto A = std::abs(C); + GCD = APIntOps::GreatestCommonDivisor({32, (uint32_t)A}, {32, GCD}) + .getZExtValue(); + } + Constraints.push_back(R); + } + + void addVariableRowFill(const SmallVector &R) { + for (auto &CR : Constraints) { + while (CR.size() != R.size()) + CR.push_back(0); + } + addVariableRow(R); + } + + /// Returns true if there may be a solution for the constraints in the system. + bool mayHaveSolution(); + + static SmallVector negate(SmallVector R) { + // The negated constraint R is obtained by multiplying by -1 and adding 1 to + // the constant. + R[0] += 1; + for (auto &C : R) + C *= -1; + return R; + } + + bool isConditionImplied(SmallVector R); + + void popLastConstraint() { Constraints.pop_back(); } +}; +} // namespace llvm + +#endif // LLVM_ANALYSIS_CONSTRAINTSYSTEM_H diff --git a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h new file mode 100644 index 0000000000000..072c45a600d96 --- /dev/null +++ b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h @@ -0,0 +1,367 @@ +//===- IRSimilarityIdentifier.h - Find similarity in a module --------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// \file +// Interface file for the IRSimilarityIdentifier for identifying similarities in +// IR including the IRInstructionMapper, which maps an Instruction to unsigned +// integers. +// +// Two sequences of instructions are called "similar" if they perform the same +// series of operations for all inputs. +// +// \code +// %1 = add i32 %a, 10 +// %2 = add i32 %a, %1 +// %3 = icmp slt i32 %1, %2 +// \endcode +// +// and +// +// \code +// %1 = add i32 11, %a +// %2 = sub i32 %a, %1 +// %3 = icmp sgt i32 %2, %1 +// \endcode +// +// ultimately have the same result, even if the inputs and structure are +// slightly different. +// +// For instructions, we do not worry about operands that do not have fixed +// semantic meaning to the program. We consider the opcode that the instruction +// has, the types, parameters, and extra information such as the function name, +// or comparison predicate. These are used to create a hash to map instructions +// to integers to be used in similarity matching in sequences of instructions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H +#define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H + +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Allocator.h" + +namespace llvm { +namespace IRSimilarity { + +/// This represents what is and is not supported when finding similarity in +/// Instructions. +/// +/// Legal Instructions are considered when looking at similarity between +/// Instructions. +/// +/// Illegal Instructions cannot be considered when looking for similarity +/// between Instructions. They act as boundaries between similarity regions. +/// +/// Invisible Instructions are skipped over during analysis. +// TODO: Shared with MachineOutliner +enum InstrType { Legal, Illegal, Invisible }; + +/// This provides the utilities for hashing an Instruction to an unsigned +/// integer. Two IRInstructionDatas produce the same hash value when their +/// underlying Instructions perform the same operation (even if they don't have +/// the same input operands). +/// As a more concrete example, consider the following: +/// +/// \code +/// %add1 = add i32 %a, %b +/// %add2 = add i32 %c, %d +/// %add3 = add i64 %e, %f +/// \endcode +/// +/// Then the IRInstructionData wrappers for these Instructions may be hashed like +/// so: +/// +/// \code +/// ; These two adds have the same types and operand types, so they hash to the +/// ; same number. +/// %add1 = add i32 %a, %b ; Hash: 1 +/// %add2 = add i32 %c, %d ; Hash: 1 +/// ; This add produces an i64. This differentiates it from %add1 and %add2. So, +/// ; it hashes to a different number. +/// %add3 = add i64 %e, %f ; Hash: 2 +/// \endcode +/// +/// +/// This hashing scheme will be used to represent the program as a very long +/// string. This string can then be placed in a data structure which can be used +/// for similarity queries. +/// +/// TODO: Handle types of Instructions which can be equal even with different +/// operands. (E.g. comparisons with swapped predicates.) +/// TODO: Handle CallInsts, which are only checked for function type +/// by \ref isSameOperationAs. +/// TODO: Handle GetElementPtrInsts, as some of the operands have to be the +/// exact same, and some do not.
+struct IRInstructionData : ilist_node<IRInstructionData> { + + /// The source Instruction that is being wrapped. + Instruction *Inst = nullptr; + /// The values of the operands in the Instruction. + SmallVector<Value *, 4> OperVals; + /// The legality of the wrapped instruction. This is informed by InstrType, + /// and is used when checking when two instructions are considered similar. + /// If either instruction is not legal, the instructions are automatically not + /// considered similar. + bool Legal; + + /// Gather the information that is difficult to gather for an Instruction, or + /// is changed, i.e. the operands of an Instruction and the Types of those + /// operands. This extra information allows for similarity matching to make + /// assertions that allow for more flexibility when checking for whether an + /// Instruction performs the same operation. + IRInstructionData(Instruction &I, bool Legality); + + /// Hashes \p ID based on its opcode, types, and operand types. + /// Two IRInstructionData instances produce the same hash when they perform + /// the same operation. + /// + /// As a simple example, consider the following instructions. + /// + /// \code + /// %add1 = add i32 %x1, %y1 + /// %add2 = add i32 %x2, %y2 + /// + /// %sub = sub i32 %x1, %y1 + /// + /// %add_i64 = add i64 %x2, %y2 + /// \endcode + /// + /// Because the first two adds operate on the same types, and are performing + /// the same action, they will be hashed to the same value. + /// + /// However, the subtraction instruction is not the same as an addition, and + /// will be hashed to a different value. + /// + /// Finally, the last add has a different type compared to the first two add + /// instructions, so it will also be hashed to a different value than any of + /// the previous instructions. + /// + /// \param [in] ID - The IRInstructionData instance to be hashed. + /// \returns A hash_value of the IRInstructionData. + friend hash_code hash_value(const IRInstructionData &ID) { + SmallVector<Type *, 4> OperTypes; + for (Value *V : ID.OperVals) + OperTypes.push_back(V->getType()); + + return llvm::hash_combine( + llvm::hash_value(ID.Inst->getOpcode()), + llvm::hash_value(ID.Inst->getType()), + llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); + } +}; + +/// Compare one IRInstructionData class to another IRInstructionData class for +/// whether they are performing the same operation, and can be mapped to the +/// same value. For regular instructions, if the hash value is the same, then +/// they will also be close. +/// +/// \param A - The first IRInstructionData class to compare +/// \param B - The second IRInstructionData class to compare +/// \returns true if \p A and \p B are similar enough to be mapped to the same +/// value.
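+///
+/// For example (an illustrative sketch based on the hashing scheme above),
+/// the following two instructions are close even though their operands
+/// differ, because they perform the same operation on the same types:
+/// \code
+/// %add1 = add i32 %a, %b
+/// %add2 = add i32 %c, %d
+/// \endcode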
+bool isClose(const IRInstructionData &A, const IRInstructionData &B); + +struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> { + static inline IRInstructionData *getEmptyKey() { return nullptr; } + static inline IRInstructionData *getTombstoneKey() { + return reinterpret_cast<IRInstructionData *>(-1); + } + + static unsigned getHashValue(const IRInstructionData *E) { + using llvm::hash_value; + assert(E && "IRInstructionData is a nullptr?"); + return hash_value(*E); + } + + static bool isEqual(const IRInstructionData *LHS, + const IRInstructionData *RHS) { + if (RHS == getEmptyKey() || RHS == getTombstoneKey() || + LHS == getEmptyKey() || LHS == getTombstoneKey()) + return LHS == RHS; + + assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?"); + return isClose(*LHS, *RHS); + } +}; + +/// Helper struct for converting the Instructions in a Module into a vector of +/// unsigned integers. This vector of unsigned integers can be thought of as a +/// "numeric string". This numeric string can then be queried by, for example, +/// data structures that find repeated substrings. +/// +/// This hashing is done per BasicBlock in the module. To hash Instructions +/// based off of their operations, each Instruction is wrapped in an +/// IRInstructionData struct. The unsigned integer for an IRInstructionData +/// depends on: +/// - The hash provided by the IRInstructionData. +/// - Which member of InstrType the IRInstructionData is classified as. +/// See InstrType for more details on the possible classifications, and how they +/// manifest in the numeric string. +/// +/// The numeric string for an individual BasicBlock is terminated by a unique +/// unsigned integer. This prevents data structures which rely on repetition +/// from matching across BasicBlocks. (For example, the SuffixTree.) +/// As a concrete example, if we have the following two BasicBlocks: +/// \code +/// bb0: +/// %add1 = add i32 %a, %b +/// %add2 = add i32 %c, %d +/// %add3 = add i64 %e, %f +/// bb1: +/// %sub = sub i32 %c, %d +/// %add4 = add i32 %c, %d +/// \endcode +/// We may hash the Instructions like this (via IRInstructionData): +/// \code +/// bb0: +/// %add1 = add i32 %a, %b ; Hash: 1 +/// %add2 = add i32 %c, %d ; Hash: 1 +/// %add3 = add i64 %e, %f ; Hash: 2 +/// bb1: +/// %sub = sub i32 %c, %d ; Hash: 3 +/// %add4 = add i32 %c, %d ; Hash: 1 +/// \endcode +/// And produce a "numeric string representation" like so: +/// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2 +/// +/// TODO: This is very similar to the MachineOutliner, and should be +/// consolidated into the same interface. +struct IRInstructionMapper { + /// The starting illegal instruction number to map to. + /// + /// Set to -3 for compatibility with DenseMapInfo<unsigned>. + unsigned IllegalInstrNumber = static_cast<unsigned>(-3); + + /// The next available integer to assign to a legal Instruction. + unsigned LegalInstrNumber = 0; + + /// Correspondence from IRInstructionData to unsigned integers. + DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits> + InstructionIntegerMap; + + /// Set if we added an illegal number in the previous step. + /// Since each illegal number is unique, we only need one of them between + /// each range of legal numbers. This lets us make sure we don't add more + /// than one illegal number per range. + bool AddedIllegalLastTime = false; + + /// Marks whether we found an illegal instruction in the previous step. + bool CanCombineWithPrevInstr = false; + + /// Marks whether we have found a set of instructions that is long enough + /// to be considered for similarity.
+ bool HaveLegalRange = false; + + /// This allocator pointer is in charge of holding on to the IRInstructionData + /// so it is not deallocated until whatever external tool is using it is done + /// with the information. + SpecificBumpPtrAllocator<IRInstructionData> *InstDataAllocator = nullptr; + + /// Get an allocated IRInstructionData struct using the InstDataAllocator. + /// + /// \param I - The Instruction to wrap with IRInstructionData. + /// \param Legality - A boolean value that is true if the instruction is to + /// be considered for similarity, and false if not. + /// \returns An allocated IRInstructionData struct. + IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality); + + /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers + /// determined by \p InstrType. Two Instructions are mapped to the same value + /// if they are close as defined by the IRInstructionData class above. + /// + /// \param [in] BB - The BasicBlock to be mapped to integers. + /// \param [in,out] InstrList - Vector of IRInstructionData to append to. + /// \param [in,out] IntegerMapping - Vector of unsigned integers to append to. + void convertToUnsignedVec(BasicBlock &BB, + std::vector<IRInstructionData *> &InstrList, + std::vector<unsigned> &IntegerMapping); + + /// Maps an Instruction to a legal integer. + /// + /// \param [in] It - The Instruction to be mapped to an integer. + /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to + /// append to. + /// \param [in,out] InstrListForBB - Vector of IRInstructionData to append + /// to. + /// \returns The integer \p It was mapped to. + unsigned mapToLegalUnsigned(BasicBlock::iterator &It, + std::vector<unsigned> &IntegerMappingForBB, + std::vector<IRInstructionData *> &InstrListForBB); + + /// Maps an Instruction to an illegal integer. + /// + /// \param [in] It - The \p Instruction to be mapped to an integer. + /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to + /// append to. + /// \param [in,out] InstrListForBB - Vector of IRInstructionData to append to. + /// \param End - true if creating a dummy IRInstructionData at the end of a + /// basic block. + /// \returns The integer \p It was mapped to. + unsigned mapToIllegalUnsigned( + BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB, + std::vector<IRInstructionData *> &InstrListForBB, bool End = false); + + IRInstructionMapper(SpecificBumpPtrAllocator<IRInstructionData> *IDA) + : InstDataAllocator(IDA) { + // Make sure that the implementation of DenseMapInfo<unsigned> hasn't + // changed. + assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) && + "DenseMapInfo<unsigned>'s empty key isn't -1!"); + assert(DenseMapInfo<unsigned>::getTombstoneKey() == + static_cast<unsigned>(-2) && + "DenseMapInfo<unsigned>'s tombstone key isn't -2!"); + } + + /// Custom InstVisitor to classify different instructions for whether they + /// can be analyzed for similarity. + struct InstructionClassification + : public InstVisitor<InstructionClassification, InstrType> { + InstructionClassification() {} + + // TODO: Determine a scheme to resolve when the label is similar enough. + InstrType visitBranchInst(BranchInst &BI) { return Illegal; } + // TODO: Determine a scheme to resolve when the labels are similar enough. + InstrType visitPHINode(PHINode &PN) { return Illegal; } + // TODO: Handle allocas. + InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; } + // We exclude variable argument instructions since variable arguments + // require extra checking of the argument list. + InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; } + // We exclude all exception handling cases since they are so context + // dependent.
+ InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; } + InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; } + // DebugInfo should be included in the regions, but should not be + // analyzed for similarity as it has no bearing on the outcome of the + // program. + InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; } + // TODO: Handle GetElementPtrInsts + InstrType visitGetElementPtrInst(GetElementPtrInst &GEPI) { + return Illegal; + } + // TODO: Handle specific intrinsics. + InstrType visitIntrinsicInst(IntrinsicInst &II) { return Illegal; } + // TODO: Handle CallInsts. + InstrType visitCallInst(CallInst &CI) { return Illegal; } + // TODO: We do not currently handle similarity that changes the control flow. + InstrType visitInvokeInst(InvokeInst &II) { return Illegal; } + // TODO: We do not currently handle similarity that changes the control flow. + InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; } + // TODO: Handle interblock similarity. + InstrType visitTerminator(Instruction &I) { return Illegal; } + InstrType visitInstruction(Instruction &I) { return Legal; } + }; + + /// Maps an Instruction to a member of InstrType. + InstructionClassification InstClassifier; +}; + +} // end namespace IRSimilarity +} // end namespace llvm + +#endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H diff --git a/llvm/include/llvm/Analysis/InstructionSimplify.h b/llvm/include/llvm/Analysis/InstructionSimplify.h index 6f3d168466217..a4cee8b29d9e8 100644 --- a/llvm/include/llvm/Analysis/InstructionSimplify.h +++ b/llvm/include/llvm/Analysis/InstructionSimplify.h @@ -292,6 +292,13 @@ Value *SimplifyFreezeInst(Value *Op, const SimplifyQuery &Q); Value *SimplifyInstruction(Instruction *I, const SimplifyQuery &Q, OptimizationRemarkEmitter *ORE = nullptr); +/// See if V simplifies when its operand Op is replaced with RepOp. If not, +/// return null. +/// AllowRefinement specifies whether the simplification can be a refinement, +/// or whether it needs to be strictly identical. +Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, + const SimplifyQuery &Q, bool AllowRefinement); + /// Replace all uses of 'I' with 'SimpleV' and simplify the uses recursively. /// /// This first performs a normal RAUW of I with SimpleV.
It then recursively diff --git a/llvm/include/llvm/Analysis/LoopAnalysisManager.h b/llvm/include/llvm/Analysis/LoopAnalysisManager.h index 0e162e03bde14..11dbd15c86783 100644 --- a/llvm/include/llvm/Analysis/LoopAnalysisManager.h +++ b/llvm/include/llvm/Analysis/LoopAnalysisManager.h @@ -57,6 +57,7 @@ struct LoopStandardAnalysisResults { ScalarEvolution &SE; TargetLibraryInfo &TLI; TargetTransformInfo &TTI; + BlockFrequencyInfo *BFI; MemorySSA *MSSA; }; diff --git a/llvm/include/llvm/Analysis/LoopCacheAnalysis.h b/llvm/include/llvm/Analysis/LoopCacheAnalysis.h index ffec78b6db2c7..832122e8a97ae 100644 --- a/llvm/include/llvm/Analysis/LoopCacheAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopCacheAnalysis.h @@ -14,19 +14,20 @@ #ifndef LLVM_ANALYSIS_LOOPCACHEANALYSIS_H #define LLVM_ANALYSIS_LOOPCACHEANALYSIS_H -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/LoopAnalysisManager.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Instructions.h" -#include "llvm/Pass.h" +#include "llvm/IR/PassManager.h" #include "llvm/Support/raw_ostream.h" namespace llvm { +class AAResults; +class DependenceInfo; class LPMUpdater; +class ScalarEvolution; +class SCEV; +class TargetTransformInfo; + using CacheCostTy = int64_t; using LoopVectorTy = SmallVector<Loop *, 8>; @@ -70,7 +71,7 @@ class IndexedReference { /// the same cache line iff the distance between them in the innermost /// dimension is less than the cache line size. Return None if unsure. Optional<bool> hasSpacialReuse(const IndexedReference &Other, unsigned CLS, - AliasAnalysis &AA) const; + AAResults &AA) const; /// Return true if the current object and the indexed reference \p Other /// have distance smaller than \p MaxDistance in the dimension associated with /// the innermost loop. Return false if the distance is not smaller than \p /// MaxDistance and None if unsure. Optional<bool> hasTemporalReuse(const IndexedReference &Other, unsigned MaxDistance, const Loop &L, - DependenceInfo &DI, AliasAnalysis &AA) const; + DependenceInfo &DI, AAResults &AA) const; /// Compute the cost of the reference w.r.t. the given loop \p L when it is /// considered in the innermost position in the loop nest. @@ -118,7 +119,7 @@ class IndexedReference { /// Return true if the given reference \p Other is definitely aliased with /// the indexed reference represented by this class. - bool isAliased(const IndexedReference &Other, AliasAnalysis &AA) const; + bool isAliased(const IndexedReference &Other, AAResults &AA) const; private: /// True if the reference can be delinearized, false otherwise. @@ -183,7 +184,7 @@ class CacheCost { /// between array elements accessed in a loop so that the elements are /// classified to have temporal reuse. CacheCost(const LoopVectorTy &Loops, const LoopInfo &LI, ScalarEvolution &SE, - TargetTransformInfo &TTI, AliasAnalysis &AA, DependenceInfo &DI, + TargetTransformInfo &TTI, AAResults &AA, DependenceInfo &DI, Optional<unsigned> TRT = None); /// Create a CacheCost for the loop nest rooted by \p Root.
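+  /// A minimal construction sketch using the constructor above (variable
+  /// names are illustrative; the referenced analysis results must outlive
+  /// the CacheCost object, which stores references to them):
+  /// \code
+  /// LoopVectorTy Loops = {&OuterLoop, &InnerLoop};
+  /// CacheCost CC(Loops, LI, SE, TTI, AA, DI);
+  /// \endcode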
@@ -258,7 +259,7 @@ class CacheCost { const LoopInfo &LI; ScalarEvolution &SE; TargetTransformInfo &TTI; - AliasAnalysis &AA; + AAResults &AA; DependenceInfo &DI; }; diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h index 3ec09e8c0a45e..d91b676d2e5a8 100644 --- a/llvm/include/llvm/Analysis/MemorySSA.h +++ b/llvm/include/llvm/Analysis/MemorySSA.h @@ -88,6 +88,7 @@ #include "llvm/IR/DerivedUser.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" @@ -270,7 +271,7 @@ class MemoryUseOrDef : public MemoryAccess { // Retrieve AliasResult type of the optimized access. Ideally this would be // returned by the caching walker and may go away in the future. Optional getOptimizedAccessType() const { - return OptimizedAccessAlias; + return isOptimized() ? OptimizedAccessAlias : None; } /// Reset the ID of what this MemoryUse was optimized to, causing it to @@ -1181,9 +1182,11 @@ class upward_defs_iterator using BaseT = upward_defs_iterator::iterator_facade_base; public: - upward_defs_iterator(const MemoryAccessPair &Info, DominatorTree *DT) + upward_defs_iterator(const MemoryAccessPair &Info, DominatorTree *DT, + bool *PerformedPhiTranslation = nullptr) : DefIterator(Info.first), Location(Info.second), - OriginalAccess(Info.first), DT(DT) { + OriginalAccess(Info.first), DT(DT), + PerformedPhiTranslation(PerformedPhiTranslation) { CurrentPair.first = nullptr; WalkingPhi = Info.first && isa(Info.first); @@ -1214,29 +1217,59 @@ class upward_defs_iterator BasicBlock *getPhiArgBlock() const { return DefIterator.getPhiArgBlock(); } - bool performedPhiTranslation() const { return PerformedPhiTranslation; } - private: + /// Returns true if \p Ptr is guaranteed to be loop invariant for any possible + /// loop. In particular, this guarantees that it only references a single + /// MemoryLocation during execution of the containing function. + bool IsGuaranteedLoopInvariant(Value *Ptr) const { + auto IsGuaranteedLoopInvariantBase = [](Value *Ptr) { + Ptr = Ptr->stripPointerCasts(); + if (!isa(Ptr)) + return true; + return isa(Ptr); + }; + + Ptr = Ptr->stripPointerCasts(); + if (auto *GEP = dyn_cast(Ptr)) { + return IsGuaranteedLoopInvariantBase(GEP->getPointerOperand()) && + GEP->hasAllConstantIndices(); + } + return IsGuaranteedLoopInvariantBase(Ptr); + } + void fillInCurrentPair() { CurrentPair.first = *DefIterator; + CurrentPair.second = Location; if (WalkingPhi && Location.Ptr) { + // Mark size as unknown, if the location is not guaranteed to be + // loop-invariant for any possible loop in the function. Setting the size + // to unknown guarantees that any memory accesses that access locations + // after the pointer are considered as clobbers, which is important to + // catch loop carried dependences. 
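+      // Illustrative example of such a dependence: if the phi-translated
+      // pointer advances on every iteration (e.g. a GEP moved forward along
+      // the loop backedge), a store past the current offset in one iteration
+      // clobbers a load from a later iteration; this is only detected with
+      // the widened (unknown) size.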
+ if (Location.Ptr && + !IsGuaranteedLoopInvariant(const_cast<Value *>(Location.Ptr))) + CurrentPair.second = Location.getWithNewSize(LocationSize::unknown()); PHITransAddr Translator( const_cast<Value *>(Location.Ptr), OriginalAccess->getBlock()->getModule()->getDataLayout(), nullptr); + if (!Translator.PHITranslateValue(OriginalAccess->getBlock(), DefIterator.getPhiArgBlock(), DT, - false)) { - if (Translator.getAddr() != Location.Ptr) { - CurrentPair.second = Location.getWithNewPtr(Translator.getAddr()); - PerformedPhiTranslation = true; - return; + true)) { + Value *TransAddr = Translator.getAddr(); + if (TransAddr != Location.Ptr) { + CurrentPair.second = CurrentPair.second.getWithNewPtr(TransAddr); + + if (TransAddr && + !IsGuaranteedLoopInvariant(const_cast<Value *>(TransAddr))) + CurrentPair.second = + CurrentPair.second.getWithNewSize(LocationSize::unknown()); + + if (PerformedPhiTranslation) + *PerformedPhiTranslation = true; } - } else { - CurrentPair.second = Location.getWithNewSize(LocationSize::unknown()); - return; } } - CurrentPair.second = Location; } MemoryAccessPair CurrentPair; @@ -1245,12 +1278,13 @@ class upward_defs_iterator MemoryAccess *OriginalAccess = nullptr; DominatorTree *DT = nullptr; bool WalkingPhi = false; - bool PerformedPhiTranslation = false; + bool *PerformedPhiTranslation = nullptr; }; -inline upward_defs_iterator upward_defs_begin(const MemoryAccessPair &Pair, - DominatorTree &DT) { - return upward_defs_iterator(Pair, &DT); +inline upward_defs_iterator +upward_defs_begin(const MemoryAccessPair &Pair, DominatorTree &DT, + bool *PerformedPhiTranslation = nullptr) { + return upward_defs_iterator(Pair, &DT, PerformedPhiTranslation); } inline upward_defs_iterator upward_defs_end() { return upward_defs_iterator(); } diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 81c5fc9325884..82dbe380b947a 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -696,7 +696,8 @@ class ScalarEvolution { /// before taking the branch. For loops with multiple exits, it may not be /// the number of times that the loop header executes if the loop exits /// prematurely via another branch. - unsigned getSmallConstantTripCount(const Loop *L, BasicBlock *ExitingBlock); + unsigned getSmallConstantTripCount(const Loop *L, + const BasicBlock *ExitingBlock); /// Returns the upper bound of the loop trip count as a normal unsigned /// value. @@ -718,8 +719,7 @@ class ScalarEvolution { /// for getSmallConstantTripCount, this assumes that control exits the loop /// via ExitingBlock. unsigned getSmallConstantTripMultiple(const Loop *L, - BasicBlock *ExitingBlock); - + const BasicBlock *ExitingBlock); /// The terms "backedge taken count" and "exit count" are used /// interchangeably to refer to the number of times the backedge of a loop @@ -737,8 +737,8 @@ class ScalarEvolution { /// For a single exit loop, this value is equivalent to the result of /// getBackedgeTakenCount. The loop is guaranteed to exit (via *some* exit) /// before the backedge is executed (ExitCount + 1) times. Note that there - /// is no guarantee about *which* exit is taken on the exiting iteration.
+ const SCEV *getExitCount(const Loop *L, const BasicBlock *ExitingBlock, ExitCountKind Kind = Exact); /// If the specified loop has a predictable backedge-taken count, return it, @@ -768,6 +768,11 @@ class ScalarEvolution { return getBackedgeTakenCount(L, ConstantMaximum); } + /// Return a symbolic upper bound for the backedge taken count of the loop. + /// This is more general than getConstantMaxBackedgeTakenCount as it returns + /// an arbitrary expression as opposed to only constants. + const SCEV* computeMaxBackedgeTakenCount(const Loop *L); + /// Return true if the backedge taken count is either the value returned by /// getConstantMaxBackedgeTakenCount or zero. bool isBackedgeTakenCountMaxOrZero(const Loop *L); @@ -1181,7 +1186,7 @@ class ScalarEvolution { ValueExprMapType ValueExprMap; /// Mark predicate values currently being processed by isImpliedCond. - SmallPtrSet PendingLoopPredicates; + SmallPtrSet PendingLoopPredicates; /// Mark SCEVUnknown Phis currently being processed by getRangeRef. SmallPtrSet PendingPhiRanges; @@ -1347,13 +1352,15 @@ class ScalarEvolution { /// edge, or SCEVCouldNotCompute. The loop is guaranteed not to exit via /// this block before this number of iterations, but may exit via another /// block. - const SCEV *getExact(BasicBlock *ExitingBlock, ScalarEvolution *SE) const; + const SCEV *getExact(const BasicBlock *ExitingBlock, + ScalarEvolution *SE) const; /// Get the max backedge taken count for the loop. const SCEV *getMax(ScalarEvolution *SE) const; /// Get the max backedge taken count for the particular loop exit. - const SCEV *getMax(BasicBlock *ExitingBlock, ScalarEvolution *SE) const; + const SCEV *getMax(const BasicBlock *ExitingBlock, + ScalarEvolution *SE) const; /// Return true if the number of times this backedge is taken is either the /// value returned by getMax or zero. @@ -1655,13 +1662,13 @@ class ScalarEvolution { /// Return a predecessor of BB (which may not be an immediate predecessor) /// which has exactly one successor from which BB is reachable, or null if /// no such block is found. - std::pair - getPredecessorWithUniqueSuccessorForBB(BasicBlock *BB); + std::pair + getPredecessorWithUniqueSuccessorForBB(const BasicBlock *BB) const; /// Test whether the condition described by Pred, LHS, and RHS is true /// whenever the given FoundCondValue value evaluates to true. bool isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, - Value *FoundCondValue, bool Inverse); + const Value *FoundCondValue, bool Inverse); /// Test whether the condition described by Pred, LHS, and RHS is true /// whenever the condition described by FoundPred, FoundLHS, FoundRHS is @@ -1708,7 +1715,7 @@ class ScalarEvolution { /// Return true if the condition denoted by \p LHS \p Pred \p RHS is implied /// by a call to @llvm.experimental.guard in \p BB. - bool isImpliedViaGuard(BasicBlock *BB, ICmpInst::Predicate Pred, + bool isImpliedViaGuard(const BasicBlock *BB, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS); /// Test whether the condition described by Pred, LHS, and RHS is true diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index ffbec74c61d02..9bf821fa1e3b8 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1288,6 +1288,10 @@ class TargetTransformInfo { bool useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags Flags) const; + /// \returns True if the target prefers reductions in loop. 
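+  /// An in-loop reduction accumulates with a scalar (horizontal) reduction
+  /// step inside the vectorized loop body, e.g. (illustrative pseudo IR):
+  /// \code
+  /// loop:
+  ///   %acc = phi i32 [ 0, %entry ], [ %acc.next, %loop ]
+  ///   %v = load <4 x i32>, <4 x i32>* %p
+  ///   %r = reduce.add(%v)        ; horizontal add of the lanes
+  ///   %acc.next = add i32 %acc, %r
+  /// \endcode
+  /// rather than keeping a <4 x i32> accumulator and reducing it to a scalar
+  /// only after the loop.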
+ bool preferInLoopReduction(unsigned Opcode, Type *Ty, + ReductionFlags Flags) const; + /// \returns True if the target prefers reductions select kept in the loop /// when tail folding. i.e. /// loop: @@ -1592,6 +1596,8 @@ class TargetTransformInfo::Concept { VectorType *VecTy) const = 0; virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags) const = 0; + virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty, + ReductionFlags) const = 0; virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags) const = 0; virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; @@ -2094,6 +2100,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { ReductionFlags Flags) const override { return Impl.useReductionIntrinsic(Opcode, Ty, Flags); } + bool preferInLoopReduction(unsigned Opcode, Type *Ty, + ReductionFlags Flags) const override { + return Impl.preferInLoopReduction(Opcode, Ty, Flags); + } bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags Flags) const override { return Impl.preferPredicatedReductionSelect(Opcode, Ty, Flags); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index bb70b97870804..7f42074119667 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -660,6 +660,11 @@ class TargetTransformInfoImplBase { return false; } + bool preferInLoopReduction(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const { + return false; + } + bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { return false; diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index f9a27a8ec4b09..8ddbcbf4d6433 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -584,25 +584,27 @@ constexpr unsigned MaxAnalysisRecursionDepth = 6; /// if, for all i, r is evaluated to poison or op raises UB if vi = poison. /// To filter out operands that raise UB on poison, you can use /// getGuaranteedNonPoisonOp. - bool propagatesPoison(const Instruction *I); + bool propagatesPoison(const Operator *I); /// Insert operands of I into Ops such that I will trigger undefined behavior /// if I is executed and that operand has a poison value. void getGuaranteedNonPoisonOps(const Instruction *I, SmallPtrSetImpl &Ops); - /// Return true if the given instruction must trigger undefined behavior. + /// Return true if the given instruction must trigger undefined behavior /// when I is executed with any operands which appear in KnownPoison holding /// a poison value at the point of execution. bool mustTriggerUB(const Instruction *I, const SmallSet& KnownPoison); - /// Return true if this function can prove that if PoisonI is executed - /// and yields a poison value, then that will trigger undefined behavior. + /// Return true if this function can prove that if Inst is executed + /// and yields a poison value or undef bits, then that will trigger + /// undefined behavior. /// /// Note that this currently only considers the basic block that is - /// the parent of I. - bool programUndefinedIfPoison(const Instruction *PoisonI); + /// the parent of Inst. 
+ bool programUndefinedIfUndefOrPoison(const Instruction *Inst); + bool programUndefinedIfPoison(const Instruction *Inst); /// canCreateUndefOrPoison returns true if Op can create undef or poison from /// non-undef & non-poison operands. @@ -618,9 +620,9 @@ constexpr unsigned MaxAnalysisRecursionDepth = 6; bool canCreateUndefOrPoison(const Operator *Op); bool canCreatePoison(const Operator *Op); - /// Return true if this function can prove that V is never undef value - /// or poison value. If V is an aggregate value or vector, check whether all - /// elements (except padding) are not undef or poison. + /// Return true if this function can prove that V does not have undef bits + /// and is never poison. If V is an aggregate value or vector, check whether + /// all elements (except padding) are not undef or poison. /// Note that this is different from canCreateUndefOrPoison because the /// function assumes Op's operands are not poison/undef. /// @@ -631,6 +633,10 @@ constexpr unsigned MaxAnalysisRecursionDepth = 6; const Instruction *CtxI = nullptr, const DominatorTree *DT = nullptr, unsigned Depth = 0); + bool isGuaranteedNotToBePoison(const Value *V, + const Instruction *CtxI = nullptr, + const DominatorTree *DT = nullptr, + unsigned Depth = 0); /// Specific patterns of select instructions we can match. enum SelectPatternFlavor { diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def index 9fdbf638078f4..a47ee3c147252 100644 --- a/llvm/include/llvm/Analysis/VecFuncs.def +++ b/llvm/include/llvm/Analysis/VecFuncs.def @@ -269,6 +269,54 @@ TLI_DEFINE_VECFUNC("llvm.log2.f32", "__svml_log2f4", 4) TLI_DEFINE_VECFUNC("llvm.log2.f32", "__svml_log2f8", 8) TLI_DEFINE_VECFUNC("llvm.log2.f32", "__svml_log2f16", 16) +TLI_DEFINE_VECFUNC("log10", "__svml_log102", 2) +TLI_DEFINE_VECFUNC("log10", "__svml_log104", 4) +TLI_DEFINE_VECFUNC("log10", "__svml_log108", 8) + +TLI_DEFINE_VECFUNC("log10f", "__svml_log10f4", 4) +TLI_DEFINE_VECFUNC("log10f", "__svml_log10f8", 8) +TLI_DEFINE_VECFUNC("log10f", "__svml_log10f16", 16) + +TLI_DEFINE_VECFUNC("__log10_finite", "__svml_log102", 2) +TLI_DEFINE_VECFUNC("__log10_finite", "__svml_log104", 4) +TLI_DEFINE_VECFUNC("__log10_finite", "__svml_log108", 8) + +TLI_DEFINE_VECFUNC("__log10f_finite", "__svml_log10f4", 4) +TLI_DEFINE_VECFUNC("__log10f_finite", "__svml_log10f8", 8) +TLI_DEFINE_VECFUNC("__log10f_finite", "__svml_log10f16", 16) + +TLI_DEFINE_VECFUNC("llvm.log10.f64", "__svml_log102", 2) +TLI_DEFINE_VECFUNC("llvm.log10.f64", "__svml_log104", 4) +TLI_DEFINE_VECFUNC("llvm.log10.f64", "__svml_log108", 8) + +TLI_DEFINE_VECFUNC("llvm.log10.f32", "__svml_log10f4", 4) +TLI_DEFINE_VECFUNC("llvm.log10.f32", "__svml_log10f8", 8) +TLI_DEFINE_VECFUNC("llvm.log10.f32", "__svml_log10f16", 16) + +TLI_DEFINE_VECFUNC("sqrt", "__svml_sqrt2", 2) +TLI_DEFINE_VECFUNC("sqrt", "__svml_sqrt4", 4) +TLI_DEFINE_VECFUNC("sqrt", "__svml_sqrt8", 8) + +TLI_DEFINE_VECFUNC("sqrtf", "__svml_sqrtf4", 4) +TLI_DEFINE_VECFUNC("sqrtf", "__svml_sqrtf8", 8) +TLI_DEFINE_VECFUNC("sqrtf", "__svml_sqrtf16", 16) + +TLI_DEFINE_VECFUNC("__sqrt_finite", "__svml_sqrt2", 2) +TLI_DEFINE_VECFUNC("__sqrt_finite", "__svml_sqrt4", 4) +TLI_DEFINE_VECFUNC("__sqrt_finite", "__svml_sqrt8", 8) + +TLI_DEFINE_VECFUNC("__sqrtf_finite", "__svml_sqrtf4", 4) +TLI_DEFINE_VECFUNC("__sqrtf_finite", "__svml_sqrtf8", 8) +TLI_DEFINE_VECFUNC("__sqrtf_finite", "__svml_sqrtf16", 16) + +TLI_DEFINE_VECFUNC("llvm.sqrt.f64", "__svml_sqrt2", 2) +TLI_DEFINE_VECFUNC("llvm.sqrt.f64", "__svml_sqrt4", 4) 
+TLI_DEFINE_VECFUNC("llvm.sqrt.f64", "__svml_sqrt8", 8) + +TLI_DEFINE_VECFUNC("llvm.sqrt.f32", "__svml_sqrtf4", 4) +TLI_DEFINE_VECFUNC("llvm.sqrt.f32", "__svml_sqrtf8", 8) +TLI_DEFINE_VECFUNC("llvm.sqrt.f32", "__svml_sqrtf16", 16) + TLI_DEFINE_VECFUNC("exp2", "__svml_exp22", 2) TLI_DEFINE_VECFUNC("exp2", "__svml_exp24", 4) TLI_DEFINE_VECFUNC("exp2", "__svml_exp28", 8) diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index 8498335bf78e6..c570bf25e92b5 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -544,20 +544,20 @@ createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs); /// elements, it will be padded with undefs. Value *concatenateVectors(IRBuilderBase &Builder, ArrayRef Vecs); -/// Given a mask vector of the form , Return true if all of the -/// elements of this predicate mask are false or undef. That is, return true -/// if all lanes can be assumed inactive. +/// Given a mask vector of i1, Return true if all of the elements of this +/// predicate mask are known to be false or undef. That is, return true if all +/// lanes can be assumed inactive. bool maskIsAllZeroOrUndef(Value *Mask); -/// Given a mask vector of the form , Return true if all of the -/// elements of this predicate mask are true or undef. That is, return true -/// if all lanes can be assumed active. +/// Given a mask vector of i1, Return true if all of the elements of this +/// predicate mask are known to be true or undef. That is, return true if all +/// lanes can be assumed active. bool maskIsAllOneOrUndef(Value *Mask); /// Given a mask vector of the form , return an APInt (of bitwidth Y) /// for each lane which may be active. APInt possiblyDemandedEltsInMask(Value *Mask); - + /// The group of interleaved loads/stores sharing the same stride and /// close to each other. /// diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.h b/llvm/include/llvm/BinaryFormat/Dwarf.h index bcc447a84a4dc..28cbc2c6a0e4b 100644 --- a/llvm/include/llvm/BinaryFormat/Dwarf.h +++ b/llvm/include/llvm/BinaryFormat/Dwarf.h @@ -183,6 +183,7 @@ enum SourceLanguage { }; inline bool isCPlusPlus(SourceLanguage S) { + bool result = false; // Deliberately enumerate all the language options so we get a warning when // new language options are added (-Wswitch) that'll hopefully help keep this // switch up-to-date when new C++ versions are added. @@ -191,7 +192,8 @@ inline bool isCPlusPlus(SourceLanguage S) { case DW_LANG_C_plus_plus_03: case DW_LANG_C_plus_plus_11: case DW_LANG_C_plus_plus_14: - return true; + result = true; + break; case DW_LANG_C89: case DW_LANG_C: case DW_LANG_Ada83: @@ -230,9 +232,68 @@ inline bool isCPlusPlus(SourceLanguage S) { case DW_LANG_BORLAND_Delphi: case DW_LANG_lo_user: case DW_LANG_hi_user: - return false; + result = false; + break; + } + + return result; +} + +inline bool isFortran(SourceLanguage S) { + bool result = false; + // Deliberately enumerate all the language options so we get a warning when + // new language options are added (-Wswitch) that'll hopefully help keep this + // switch up-to-date when new Fortran versions are added. 
+ switch (S) { + case DW_LANG_Fortran77: + case DW_LANG_Fortran90: + case DW_LANG_Fortran95: + case DW_LANG_Fortran03: + case DW_LANG_Fortran08: + result = true; + break; + case DW_LANG_C89: + case DW_LANG_C: + case DW_LANG_Ada83: + case DW_LANG_C_plus_plus: + case DW_LANG_Cobol74: + case DW_LANG_Cobol85: + case DW_LANG_Pascal83: + case DW_LANG_Modula2: + case DW_LANG_Java: + case DW_LANG_C99: + case DW_LANG_Ada95: + case DW_LANG_PLI: + case DW_LANG_ObjC: + case DW_LANG_ObjC_plus_plus: + case DW_LANG_UPC: + case DW_LANG_D: + case DW_LANG_Python: + case DW_LANG_OpenCL: + case DW_LANG_Go: + case DW_LANG_Modula3: + case DW_LANG_Haskell: + case DW_LANG_C_plus_plus_03: + case DW_LANG_C_plus_plus_11: + case DW_LANG_OCaml: + case DW_LANG_Rust: + case DW_LANG_C11: + case DW_LANG_Swift: + case DW_LANG_Julia: + case DW_LANG_Dylan: + case DW_LANG_C_plus_plus_14: + case DW_LANG_RenderScript: + case DW_LANG_BLISS: + case DW_LANG_Mips_Assembler: + case DW_LANG_GOOGLE_RenderScript: + case DW_LANG_BORLAND_Delphi: + case DW_LANG_lo_user: + case DW_LANG_hi_user: + result = false; + break; } - llvm_unreachable("Invalid source language"); + + return result; } enum CaseSensitivity { diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/PowerPC64.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/PowerPC64.def index 2cf021a4cf6f2..901af679b9150 100644 --- a/llvm/include/llvm/BinaryFormat/ELFRelocs/PowerPC64.def +++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/PowerPC64.def @@ -100,6 +100,7 @@ #undef R_PPC64_PCREL_OPT #undef R_PPC64_PCREL34 #undef R_PPC64_GOT_PCREL34 +#undef R_PPC64_TPREL34 #undef R_PPC64_GOT_TLSGD_PCREL34 #undef R_PPC64_GOT_TPREL_PCREL34 #undef R_PPC64_IRELATIVE @@ -200,6 +201,7 @@ ELF_RELOC(R_PPC64_REL24_NOTOC, 116) ELF_RELOC(R_PPC64_PCREL_OPT, 123) ELF_RELOC(R_PPC64_PCREL34, 132) ELF_RELOC(R_PPC64_GOT_PCREL34, 133) +ELF_RELOC(R_PPC64_TPREL34, 146) ELF_RELOC(R_PPC64_GOT_TLSGD_PCREL34, 148) ELF_RELOC(R_PPC64_GOT_TPREL_PCREL34, 150) ELF_RELOC(R_PPC64_IRELATIVE, 248) diff --git a/llvm/include/llvm/BinaryFormat/MachO.h b/llvm/include/llvm/BinaryFormat/MachO.h index e84ed8b643cbb..f5d5ec328b5e7 100644 --- a/llvm/include/llvm/BinaryFormat/MachO.h +++ b/llvm/include/llvm/BinaryFormat/MachO.h @@ -83,6 +83,7 @@ enum { MH_NO_HEAP_EXECUTION = 0x01000000u, MH_APP_EXTENSION_SAFE = 0x02000000u, MH_NLIST_OUTOFSYNC_WITH_DYLDINFO = 0x04000000u, + MH_SIM_SUPPORT = 0x08000000u, MH_DYLIB_IN_CACHE = 0x80000000u, }; diff --git a/llvm/include/llvm/Bitcode/BitcodeWriter.h b/llvm/include/llvm/Bitcode/BitcodeWriter.h index 4beb89d30e008..74e9d103b7f3b 100644 --- a/llvm/include/llvm/Bitcode/BitcodeWriter.h +++ b/llvm/include/llvm/Bitcode/BitcodeWriter.h @@ -47,7 +47,7 @@ class raw_ostream; public: /// Create a BitcodeWriter that writes to Buffer. - BitcodeWriter(SmallVectorImpl &Buffer); + BitcodeWriter(SmallVectorImpl &Buffer, raw_fd_stream *FS = nullptr); ~BitcodeWriter(); @@ -153,6 +153,10 @@ class raw_ostream; *ModuleToSummariesForIndex = nullptr); /// Save a copy of the llvm IR as data in the __LLVM,__bitcode section. + /// If available, pass the serialized module via the Buf parameter. If not, + /// pass an empty (default-initialized) MemoryBufferRef, and the serialization + /// will be handled by this API. The same behavior happens if the provided Buf + /// is not bitcode (i.e. if it's invalid data or even textual LLVM assembly). 
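+  /// A possible call shape (sketch only; the surrounding setup is assumed):
+  /// \code
+  ///   // Pass an empty buffer so the API serializes M itself:
+  ///   EmbedBitcodeInModule(M, MemoryBufferRef(), /*EmbedBitcode=*/true,
+  ///                        /*EmbedMarker=*/true, /*CmdArgs=*/nullptr);
+  /// \endcode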
 void EmbedBitcodeInModule(Module &M, MemoryBufferRef Buf, bool EmbedBitcode,
                           bool EmbedMarker,
                           const std::vector<uint8_t> *CmdArgs);
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 613391ad05ede..d81f61c59c852 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -539,8 +539,9 @@ enum FunctionCodes {
   FUNC_CODE_DEBUG_LOC = 35,        // DEBUG_LOC: [Line,Col,ScopeVal, IAVal]
   FUNC_CODE_INST_FENCE = 36,       // FENCE: [ordering, synchscope]
-  FUNC_CODE_INST_CMPXCHG_OLD = 37, // CMPXCHG: [ptrty,ptr,cmp,new, align, vol,
-                                   //           ordering, synchscope]
+  FUNC_CODE_INST_CMPXCHG_OLD = 37, // CMPXCHG: [ptrty, ptr, cmp, val, vol,
+                                   //           ordering, synchscope,
+                                   //           failure_ordering?, weak?]
   FUNC_CODE_INST_ATOMICRMW = 38,   // ATOMICRMW: [ptrty,ptr,val, operation,
                                    //             align, vol,
                                    //             ordering, synchscope]
@@ -554,8 +555,9 @@ enum FunctionCodes {
   FUNC_CODE_INST_GEP = 43,         // GEP: [inbounds, n x operands]
   FUNC_CODE_INST_STORE = 44,       // STORE: [ptrty,ptr,valty,val, align, vol]
   FUNC_CODE_INST_STOREATOMIC = 45, // STORE: [ptrty,ptr,val, align, vol
-  FUNC_CODE_INST_CMPXCHG = 46,     // CMPXCHG: [ptrty,ptr,valty,cmp,new, align,
-                                   //           vol,ordering,synchscope]
+  FUNC_CODE_INST_CMPXCHG = 46,     // CMPXCHG: [ptrty, ptr, cmp, val, vol,
+                                   //           success_ordering, synchscope,
+                                   //           failure_ordering, weak]
   FUNC_CODE_INST_LANDINGPAD = 47,  // LANDINGPAD: [ty,val,num,id0,val0...]
   FUNC_CODE_INST_CLEANUPRET = 48,  // CLEANUPRET: [val] or [val,bb#]
   FUNC_CODE_INST_CATCHRET = 49,    // CATCHRET: [val,bb#]
diff --git a/llvm/include/llvm/Bitstream/BitstreamWriter.h b/llvm/include/llvm/Bitstream/BitstreamWriter.h
index 162a0fea09132..8dc135e6404da 100644
--- a/llvm/include/llvm/Bitstream/BitstreamWriter.h
+++ b/llvm/include/llvm/Bitstream/BitstreamWriter.h
@@ -20,17 +20,28 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Bitstream/BitCodes.h"
 #include "llvm/Support/Endian.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
 #include <vector>
 namespace llvm {
 class BitstreamWriter {
+  /// Out - The buffer that keeps unflushed bytes.
   SmallVectorImpl<char> &Out;
+  /// FS - The file stream that Out flushes to. If FS is nullptr, or it does
+  /// not support reading and seeking, Out cannot be flushed until all data
+  /// have been written.
+  raw_fd_stream *FS;
+
+  /// FlushThreshold - If FS is valid, this is the threshold (in bytes) at
+  /// which Out is flushed to FS.
+  const uint64_t FlushThreshold;
+
   /// CurBit - Always between 0 and 31 inclusive, specifies the next bit to use.
   unsigned CurBit;
-  /// CurValue - The current value.  Only bits < CurBit are valid.
+  /// CurValue - The current value. Only bits < CurBit are valid.
   uint32_t CurValue;
   /// CurCodeSize - This is the declared size of code values used for the
@@ -64,15 +75,19 @@ class BitstreamWriter {
   void WriteByte(unsigned char Value) {
     Out.push_back(Value);
+    FlushToFile();
   }
   void WriteWord(unsigned Value) {
     Value = support::endian::byte_swap<unsigned, support::little>(Value);
     Out.append(reinterpret_cast<const char *>(&Value),
                reinterpret_cast<const char *>(&Value + 1));
+    FlushToFile();
   }
-  size_t GetBufferOffset() const { return Out.size(); }
+  uint64_t GetNumOfFlushedBytes() const { return FS ? FS->tell() : 0; }
+
+  size_t GetBufferOffset() const { return Out.size() + GetNumOfFlushedBytes(); }
   size_t GetWordIndex() const {
     size_t Offset = GetBufferOffset();
@@ -80,9 +95,29 @@
     return Offset / 4;
   }
+  /// If the related file stream supports reading, seeking and writing, flush
+  /// the buffer if its size is above a threshold.
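+  // Sketch of the intended setup (illustrative; the caller owns the stream):
+  //
+  //   std::error_code EC;
+  //   raw_fd_stream OS("out.bc", EC);  // read/seek-capable file stream
+  //   SmallVector<char, 0> Buffer;
+  //   BitstreamWriter W(Buffer, &OS);  // spills Buffer to OS past the threshold
+  //
+  // Writes accumulate in Buffer; once it grows past FlushThreshold, the
+  // contents are written to OS and Buffer is cleared.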
+  void FlushToFile() {
+    if (!FS)
+      return;
+    if (Out.size() < FlushThreshold)
+      return;
+    FS->write((char *)&Out.front(), Out.size());
+    Out.clear();
+  }
+
 public:
-  explicit BitstreamWriter(SmallVectorImpl<char> &O)
-      : Out(O), CurBit(0), CurValue(0), CurCodeSize(2) {}
+  /// Create a BitstreamWriter that writes to Buffer \p O.
+  ///
+  /// \p FS is the file stream that \p O flushes to incrementally. If \p FS is
+  /// null, \p O does not flush incrementally, but writes to disk at the end.
+  ///
+  /// \p FlushThreshold is the threshold (in MiB) at which to flush \p O if
+  /// \p FS is valid.
+  BitstreamWriter(SmallVectorImpl<char> &O, raw_fd_stream *FS = nullptr,
+                  uint32_t FlushThreshold = 512)
+      : Out(O), FS(FS), FlushThreshold(FlushThreshold << 20), CurBit(0),
+        CurValue(0), CurCodeSize(2) {}
   ~BitstreamWriter() {
     assert(CurBit == 0 && "Unflushed data remaining");
@@ -104,11 +139,59 @@ class BitstreamWriter {
   void BackpatchWord(uint64_t BitNo, unsigned NewWord) {
     using namespace llvm::support;
     uint64_t ByteNo = BitNo / 8;
-    assert((!endian::readAtBitAlignment<uint32_t, little, unaligned>(
-               &Out[ByteNo], BitNo & 7)) &&
-           "Expected to be patching over 0-value placeholders");
-    endian::writeAtBitAlignment<uint32_t, little, unaligned>(
-        &Out[ByteNo], NewWord, BitNo & 7);
+    uint64_t StartBit = BitNo & 7;
+    uint64_t NumOfFlushedBytes = GetNumOfFlushedBytes();
+
+    if (ByteNo >= NumOfFlushedBytes) {
+      assert((!endian::readAtBitAlignment<uint32_t, little, unaligned>(
+                 &Out[ByteNo - NumOfFlushedBytes], StartBit)) &&
+             "Expected to be patching over 0-value placeholders");
+      endian::writeAtBitAlignment<uint32_t, little, unaligned>(
+          &Out[ByteNo - NumOfFlushedBytes], NewWord, StartBit);
+      return;
+    }
+
+    // If the byte offset to backpatch is flushed, use seek to backfill data.
+    // First, save the file position to restore later.
+    uint64_t CurPos = FS->tell();
+
+    // Copy data to update into Bytes from the file FS and the buffer Out.
+    char Bytes[8];
+    size_t BytesNum = StartBit ? 8 : 4;
+    size_t BytesFromDisk =
+        std::min(static_cast<uint64_t>(BytesNum), NumOfFlushedBytes - ByteNo);
+    size_t BytesFromBuffer = BytesNum - BytesFromDisk;
+
+    // When unaligned, copy existing data into Bytes from the file FS and the
+    // buffer Out so that it can be updated before writing. For debug builds,
+    // read the bytes unconditionally in order to check that the existing
+    // value is 0 as expected.
+#ifdef NDEBUG
+    if (StartBit)
+#endif
+    {
+      FS->seek(ByteNo);
+      ssize_t BytesRead = FS->read(Bytes, BytesFromDisk);
+      (void)BytesRead; // silence warning
+      assert(BytesRead >= 0 && static_cast<size_t>(BytesRead) == BytesFromDisk);
+      for (size_t i = 0; i < BytesFromBuffer; ++i)
+        Bytes[BytesFromDisk + i] = Out[i];
+      assert((!endian::readAtBitAlignment<uint32_t, little, unaligned>(
+                 Bytes, StartBit)) &&
+             "Expected to be patching over 0-value placeholders");
+    }
+
+    // Update Bytes in terms of bit offset and value.
+    endian::writeAtBitAlignment<uint32_t, little, unaligned>(Bytes, NewWord,
+                                                             StartBit);
+
+    // Copy updated data back to the file FS and the buffer Out.
+    FS->seek(ByteNo);
+    FS->write(Bytes, BytesFromDisk);
+    for (size_t i = 0; i < BytesFromBuffer; ++i)
+      Out[i] = Bytes[BytesFromDisk + i];
+
+    // Restore the file position.
+    FS->seek(CurPos);
+  }
   void BackpatchWord64(uint64_t BitNo, uint64_t Val) {
diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
index eab6eb52b86cf..11ba36aee5a80 100644
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -216,6 +216,14 @@ class AsmPrinter : public MachineFunctionPass {
   uint16_t getDwarfVersion() const;
   void setDwarfVersion(uint16_t Version);
+  bool isDwarf64() const;
+
+  /// Returns 4 for DWARF32 and 8 for DWARF64.
+  unsigned int getDwarfOffsetByteSize() const;
+
+  /// Returns 4 for DWARF32 and 12 for DWARF64.
+  unsigned int getUnitLengthFieldByteSize() const;
+
   bool isPositionIndependent() const;
   /// Return true if assembly output should contain comments.
@@ -342,6 +350,8 @@ class AsmPrinter : public MachineFunctionPass {
   void emitStackSizeSection(const MachineFunction &MF);
+  void emitBBAddrMapSection(const MachineFunction &MF);
+
   void emitRemarksSection(remarks::RemarkStreamer &RS);
   enum CFIMoveType { CFI_M_None, CFI_M_EH, CFI_M_Debug };
@@ -560,9 +570,6 @@ class AsmPrinter : public MachineFunctionPass {
     emitLabelPlusOffset(Label, 0, Size, IsSectionRelative);
   }
-  /// Emit something like ".long Label + Offset".
-  void emitDwarfOffset(const MCSymbol *Label, uint64_t Offset) const;
-
   //===------------------------------------------------------------------===//
   // Dwarf Emission Helper Routines
   //===------------------------------------------------------------------===//
@@ -591,18 +598,39 @@ class AsmPrinter : public MachineFunctionPass {
   void emitDwarfSymbolReference(const MCSymbol *Label,
                                 bool ForceOffset = false) const;
-  /// Emit the 4-byte offset of a string from the start of its section.
+  /// Emit the 4- or 8-byte offset of a string from the start of its section.
   ///
   /// When possible, emit a DwarfStringPool section offset without any
   /// relocations, and without using the symbol. Otherwise, defers to \a
   /// emitDwarfSymbolReference().
+  ///
+  /// The length of the emitted value depends on the DWARF format.
   void emitDwarfStringOffset(DwarfStringPoolEntry S) const;
-  /// Emit the 4-byte offset of a string from the start of its section.
+  /// Emit the 4- or 8-byte offset of a string from the start of its section.
   void emitDwarfStringOffset(DwarfStringPoolEntryRef S) const {
     emitDwarfStringOffset(S.getEntry());
   }
+  /// Emit something like ".long Label + Offset" or ".quad Label + Offset"
+  /// depending on the DWARF format.
+  void emitDwarfOffset(const MCSymbol *Label, uint64_t Offset) const;
+
+  /// Emit a 32- or 64-bit value depending on the DWARF format.
+  void emitDwarfLengthOrOffset(uint64_t Value) const;
+
+  /// Emit a special value of 0xffffffff if producing 64-bit debugging info.
+  void maybeEmitDwarf64Mark() const;
+
+  /// Emit a unit length field. The actual format, DWARF32 or DWARF64, is
+  /// chosen according to the settings.
+  void emitDwarfUnitLength(uint64_t Length, const Twine &Comment) const;
+
+  /// Emit a unit length field. The actual format, DWARF32 or DWARF64, is
+  /// chosen according to the settings.
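+  /// (On-disk sketch of the unit length field in the two formats, for an
+  /// illustrative length Len:
+  ///   DWARF32:  .long Len        // 4 bytes, Len < 0xfffffff0
+  ///   DWARF64:  .long 0xffffffff // escape mark, see maybeEmitDwarf64Mark()
+  ///             .quad Len        // 8-byte length
+  /// )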
+ void emitDwarfUnitLength(const MCSymbol *Hi, const MCSymbol *Lo, + const Twine &Comment) const; + /// Emit reference to a call site with a specified encoding void emitCallSiteOffset(const MCSymbol *Hi, const MCSymbol *Lo, unsigned Encoding) const; diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 9e5c45084c599..d5c0b83ea6f7b 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -40,7 +40,6 @@ #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" -#include "llvm/MC/MCSchedule.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" @@ -1350,13 +1349,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { break; case Intrinsic::minnum: ISDs.push_back(ISD::FMINNUM); - if (FMF.noNaNs()) - ISDs.push_back(ISD::FMINIMUM); break; case Intrinsic::maxnum: ISDs.push_back(ISD::FMAXNUM); - if (FMF.noNaNs()) - ISDs.push_back(ISD::FMAXIMUM); break; case Intrinsic::copysign: ISDs.push_back(ISD::FCOPYSIGN); diff --git a/llvm/include/llvm/CodeGen/DIE.h b/llvm/include/llvm/CodeGen/DIE.h index 43ba859fdc79c..fa554be64e79f 100644 --- a/llvm/include/llvm/CodeGen/DIE.h +++ b/llvm/include/llvm/CodeGen/DIE.h @@ -788,7 +788,7 @@ class DIE : IntrusiveBackListNode, public DIEValueList { /// Get the absolute offset within the .debug_info or .debug_types section /// for this DIE. - unsigned getDebugSectionOffset() const; + uint64_t getDebugSectionOffset() const; /// Compute the offset of this DIE and all its children. /// @@ -890,8 +890,8 @@ class DIEUnit { /// /// \returns Section pointer which can be NULL. MCSection *getSection() const { return Section; } - void setDebugSectionOffset(unsigned O) { Offset = O; } - unsigned getDebugSectionOffset() const { return Offset; } + void setDebugSectionOffset(uint64_t O) { Offset = O; } + uint64_t getDebugSectionOffset() const { return Offset; } DIE &getUnitDie() { return Die; } const DIE &getUnitDie() const { return Die; } }; diff --git a/llvm/include/llvm/CodeGen/DwarfStringPoolEntry.h b/llvm/include/llvm/CodeGen/DwarfStringPoolEntry.h index e189352a7b2d8..abeba62707c1d 100644 --- a/llvm/include/llvm/CodeGen/DwarfStringPoolEntry.h +++ b/llvm/include/llvm/CodeGen/DwarfStringPoolEntry.h @@ -21,7 +21,7 @@ struct DwarfStringPoolEntry { static constexpr unsigned NotIndexed = -1; MCSymbol *Symbol; - unsigned Offset; + uint64_t Offset; unsigned Index; bool isIndexed() const { return Index != NotIndexed; } @@ -47,7 +47,7 @@ class DwarfStringPoolEntryRef { assert(getMapEntry()->second.Symbol && "No symbol available!"); return getMapEntry()->second.Symbol; } - unsigned getOffset() const { return getMapEntry()->second.Offset; } + uint64_t getOffset() const { return getMapEntry()->second.Offset; } bool isIndexed() const { return MapEntryAndIndexed.getInt(); } unsigned getIndex() const { assert(isIndexed()); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 8607ad02d5063..8ee3b545815b2 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -17,6 +17,7 @@ #ifndef LLVM_CODEGEN_GLOBALISEL_COMBINER_HELPER_H #define LLVM_CODEGEN_GLOBALISEL_COMBINER_HELPER_H +#include "llvm/ADT/APFloat.h" #include "llvm/CodeGen/LowLevelType.h" #include "llvm/CodeGen/Register.h" #include "llvm/Support/Alignment.h" @@ -147,9 +148,10 @@ class CombinerHelper { 
   bool matchSextInRegOfLoad(MachineInstr &MI,
                             std::tuple<Register, unsigned> &MatchInfo);
   bool applySextInRegOfLoad(MachineInstr &MI,
                             std::tuple<Register, unsigned> &MatchInfo);
-  bool matchElideBrByInvertingCond(MachineInstr &MI);
-  void applyElideBrByInvertingCond(MachineInstr &MI);
-  bool tryElideBrByInvertingCond(MachineInstr &MI);
+  /// If a brcond's true block is not the fallthrough, make it so by inverting
+  /// the condition and swapping operands.
+  bool matchOptBrCondByInvertingCond(MachineInstr &MI);
+  void applyOptBrCondByInvertingCond(MachineInstr &MI);
   /// If \p MI is G_CONCAT_VECTORS, try to combine it.
   /// Returns true if MI changed.
@@ -243,6 +245,34 @@ class CombinerHelper {
   bool applyCombineShiftToUnmerge(MachineInstr &MI, const unsigned &ShiftVal);
   bool tryCombineShiftToUnmerge(MachineInstr &MI, unsigned TargetShiftAmount);
+  /// Transform G_UNMERGE(G_MERGE ty X, Y, Z) -> ty X, Y, Z.
+  bool
+  matchCombineUnmergeMergeToPlainValues(MachineInstr &MI,
+                                        SmallVectorImpl<Register> &Operands);
+  bool
+  applyCombineUnmergeMergeToPlainValues(MachineInstr &MI,
+                                        SmallVectorImpl<Register> &Operands);
+
+  /// Transform G_UNMERGE Constant -> Constant1, Constant2, ...
+  bool matchCombineUnmergeConstant(MachineInstr &MI,
+                                   SmallVectorImpl<APInt> &Csts);
+  bool applyCombineUnmergeConstant(MachineInstr &MI,
+                                   SmallVectorImpl<APInt> &Csts);
+
+  /// Transform X, Y = G_UNMERGE Z -> X = G_TRUNC Z.
+  bool matchCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI);
+  bool applyCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI);
+
+  /// Transform X, Y = G_UNMERGE(G_ZEXT(Z)) -> X = G_ZEXT(Z); Y = G_CONSTANT 0
+  bool matchCombineUnmergeZExtToZExt(MachineInstr &MI);
+  bool applyCombineUnmergeZExtToZExt(MachineInstr &MI);
+
+  /// Transform fp_instr(cst) to constant result of the fp operation.
+  bool matchCombineConstantFoldFpUnary(MachineInstr &MI,
+                                       Optional<APFloat> &Cst);
+  bool applyCombineConstantFoldFpUnary(MachineInstr &MI,
+                                       Optional<APFloat> &Cst);
+
   /// Transform IntToPtr(PtrToInt(x)) to x if cast is in the same address space.
   bool matchCombineI2PToP2I(MachineInstr &MI, Register &Reg);
   bool applyCombineI2PToP2I(MachineInstr &MI, Register &Reg);
@@ -268,6 +298,29 @@ class CombinerHelper {
   bool applyCombineExtOfExt(MachineInstr &MI,
                             std::tuple<Register, unsigned> &MatchInfo);
+  /// Transform fneg(fneg(x)) to x.
+  bool matchCombineFNegOfFNeg(MachineInstr &MI, Register &Reg);
+
+  /// Match fabs(fabs(x)) to fabs(x).
+  bool matchCombineFAbsOfFAbs(MachineInstr &MI, Register &Src);
+  bool applyCombineFAbsOfFAbs(MachineInstr &MI, Register &Src);
+
+  /// Transform trunc ([asz]ext x) to x or ([asz]ext x) or (trunc x).
+  bool matchCombineTruncOfExt(MachineInstr &MI,
+                              std::pair<Register, unsigned> &MatchInfo);
+  bool applyCombineTruncOfExt(MachineInstr &MI,
+                              std::pair<Register, unsigned> &MatchInfo);
+
+  /// Transform trunc (shl x, K) to shl (trunc x),
+  /// K => K < VT.getScalarSizeInBits().
+  bool matchCombineTruncOfShl(MachineInstr &MI,
+                              std::pair<Register, Register> &MatchInfo);
+  bool applyCombineTruncOfShl(MachineInstr &MI,
+                              std::pair<Register, Register> &MatchInfo);
+
+  /// Transform G_MUL(x, -1) to G_SUB(0, x).
+  bool applyCombineMulByNegativeOne(MachineInstr &MI);
+
   /// Return true if any explicit use operand on \p MI is defined by a
   /// G_IMPLICIT_DEF.
   bool matchAnyExplicitUseIsUndef(MachineInstr &MI);
@@ -321,6 +374,9 @@ class CombinerHelper {
   /// Check if operand \p OpIdx is zero.
   bool matchOperandIsZero(MachineInstr &MI, unsigned OpIdx);
+  /// Check if operand \p OpIdx is undef.
+  bool matchOperandIsUndef(MachineInstr &MI, unsigned OpIdx);
+
   /// Erase \p MI.
   bool eraseInst(MachineInstr &MI);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 38eb0e4bebe74..37c94ccbbd20d 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -27,6 +27,7 @@
 #include "llvm/CodeGen/SwitchLoweringUtils.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/CodeGen.h"
 #include <memory>
 #include <utility>
@@ -37,6 +38,7 @@ class BasicBlock;
 class CallInst;
 class CallLowering;
 class Constant;
+class ConstrainedFPIntrinsic;
 class DataLayout;
 class Instruction;
 class MachineBasicBlock;
@@ -299,6 +301,27 @@ class IRTranslator : public MachineFunctionPass {
   bool translateBinaryOp(unsigned Opcode, const User &U,
                          MachineIRBuilder &MIRBuilder);
+  /// If the set of cases should be emitted as a series of branches, return
+  /// true. If we should emit this as a bunch of and/or'd together conditions,
+  /// return false.
+  bool shouldEmitAsBranches(const std::vector<SwitchCG::CaseBlock> &Cases);
+  /// Helper method for findMergedConditions.
+  /// This function emits a branch and is used at the leaves of an OR or an
+  /// AND operator tree.
+  void emitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *TBB,
+                                    MachineBasicBlock *FBB,
+                                    MachineBasicBlock *CurBB,
+                                    MachineBasicBlock *SwitchBB,
+                                    BranchProbability TProb,
+                                    BranchProbability FProb, bool InvertCond);
+  /// Used during condbr translation to find trees of conditions that can be
+  /// optimized.
+  void findMergedConditions(const Value *Cond, MachineBasicBlock *TBB,
+                            MachineBasicBlock *FBB, MachineBasicBlock *CurBB,
+                            MachineBasicBlock *SwitchBB,
+                            Instruction::BinaryOps Opc, BranchProbability TProb,
+                            BranchProbability FProb, bool InvertCond);
+
   /// Translate branch (br) instruction.
   /// \pre \p U is a branch instruction.
   bool translateBr(const User &U, MachineIRBuilder &MIRBuilder);
@@ -535,6 +558,8 @@ class IRTranslator : public MachineFunctionPass {
   /// Current target configuration. Controls how the pass handles errors.
   const TargetPassConfig *TPC;
+  CodeGenOpt::Level OptLevel;
+
   /// Current optimization remark emitter. Used to report failures.
   std::unique_ptr<OptimizationRemarkEmitter> ORE;
@@ -638,8 +663,7 @@ class IRTranslator : public MachineFunctionPass {
                         BranchProbability Prob);
 public:
-  // Ctor, nothing fancy.
-  IRTranslator();
+  IRTranslator(CodeGenOpt::Level OptLevel = CodeGenOpt::None);
   StringRef getPassName() const override { return "IRTranslator"; }
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
index 17c1ec36c24fe..bf9991eb08de1 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
@@ -254,6 +254,15 @@ enum {
   /// - OtherOpIdx - Other operand index
   GIM_CheckIsSameOperand,
+  /// Predicates with 'let PredicateCodeUsesOperands = 1' need to examine some
+  /// named operands that will be recorded in RecordedOperands. Names of these
+  /// operands are referenced in the predicate's argument list. The emitter
+  /// determines StoreIdx (it corresponds to the order in which names appear in
+  /// the argument list).
+  /// - InsnID - Instruction ID
+  /// - OpIdx - Operand index
+  /// - StoreIdx - Store location in RecordedOperands.
+  GIM_RecordNamedOperand,
+
   /// Fail the current try-block, or completely fail to match if there is no
   /// current try-block.
   GIM_Reject,
@@ -446,6 +455,11 @@ class InstructionSelector {
     std::vector<ComplexRendererFns::value_type> Renderers;
     RecordedMIVector MIs;
     DenseMap<unsigned, unsigned> TempRegisters;
+    /// Named operands that a predicate with 'let PredicateCodeUsesOperands =
+    /// 1' references in its argument list. Operands are inserted at the index
+    /// set by the emitter, which corresponds to the order in which names
+    /// appear in the argument list. Currently such predicates don't have more
+    /// than 3 arguments.
+    std::array<const MachineOperand *, 3> RecordedOperands;
     MatcherState(unsigned MaxRenderers);
   };
@@ -506,7 +520,9 @@ class InstructionSelector {
     llvm_unreachable(
         "Subclasses must override this with a tablegen-erated function");
   }
-  virtual bool testMIPredicate_MI(unsigned, const MachineInstr &) const {
+  virtual bool testMIPredicate_MI(
+      unsigned, const MachineInstr &,
+      const std::array<const MachineOperand *, 3> &Operands) const {
     llvm_unreachable(
         "Subclasses must override this with a tablegen-erated function");
   }
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h
index 1f1fb5aca8757..bcb84c337f5e9 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h
@@ -367,7 +367,8 @@ bool InstructionSelector::executeMatchTable(
       assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
       assert(Predicate > GIPFP_MI_Invalid && "Expected a valid predicate");
-      if (!testMIPredicate_MI(Predicate, *State.MIs[InsnID]))
+      if (!testMIPredicate_MI(Predicate, *State.MIs[InsnID],
+                              State.RecordedOperands))
         if (handleReject() == RejectAndGiveUp)
           return false;
       break;
@@ -617,6 +618,20 @@ bool InstructionSelector::executeMatchTable(
       break;
     }
+    case GIM_RecordNamedOperand: {
+      int64_t InsnID = MatchTable[CurrentIdx++];
+      int64_t OpIdx = MatchTable[CurrentIdx++];
+      uint64_t StoreIdx = MatchTable[CurrentIdx++];
+
+      DEBUG_WITH_TYPE(TgtInstructionSelector::getName(),
+                      dbgs() << CurrentIdx << ": GIM_RecordNamedOperand(MIs["
+                             << InsnID << "]->getOperand(" << OpIdx
+                             << "), StoreIdx=" << StoreIdx << ")\n");
+      assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
+      assert(StoreIdx < State.RecordedOperands.size() && "Index out of range");
+      State.RecordedOperands[StoreIdx] = &State.MIs[InsnID]->getOperand(OpIdx);
+      break;
+    }
     case GIM_CheckRegBankForClass: {
       int64_t InsnID = MatchTable[CurrentIdx++];
      int64_t OpIdx = MatchTable[CurrentIdx++];
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
index 50534860bec16..a230f5adfe88f 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
@@ -245,5 +245,9 @@ bool isBuildVectorAllOnes(const MachineInstr &MI,
 /// the value \p Val contains a true value.
 bool isConstTrueVal(const TargetLowering &TLI, int64_t Val, bool IsVector,
                     bool IsFP);
+
+/// Returns an integer representing true, as defined by the
+/// TargetBooleanContents.
+int64_t getICmpTrueVal(const TargetLowering &TLI, bool IsVector, bool IsFP);
 } // End namespace llvm.
 #endif
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index ae08d6e9313d6..ba5a5d6e87519 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -598,6 +598,7 @@ enum NodeType {
   CTLZ,
   CTPOP,
   BITREVERSE,
+  PARITY,
   /// Bit counting operators with an undefined result for zero inputs.
CTTZ_ZERO_UNDEF, diff --git a/llvm/include/llvm/CodeGen/LiveInterval.h b/llvm/include/llvm/CodeGen/LiveInterval.h index 0764257125e6e..4fa7afaefc64f 100644 --- a/llvm/include/llvm/CodeGen/LiveInterval.h +++ b/llvm/include/llvm/CodeGen/LiveInterval.h @@ -25,6 +25,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/Support/Allocator.h" @@ -704,12 +705,16 @@ namespace llvm { private: SubRange *SubRanges = nullptr; ///< Single linked list of subregister live /// ranges. + const Register Reg; // the register or stack slot of this interval. + float Weight = 0.0; // weight of this interval public: - const unsigned reg; // the register or stack slot of this interval. - float weight; // weight of this interval + Register reg() const { return Reg; } + float weight() const { return Weight; } + void incrementWeight(float Inc) { Weight += Inc; } + void setWeight(float Value) { Weight = Value; } - LiveInterval(unsigned Reg, float Weight) : reg(Reg), weight(Weight) {} + LiveInterval(unsigned Reg, float Weight) : Reg(Reg), Weight(Weight) {} ~LiveInterval() { clearSubRanges(); @@ -806,14 +811,10 @@ namespace llvm { unsigned getSize() const; /// isSpillable - Can this interval be spilled? - bool isSpillable() const { - return weight != huge_valf; - } + bool isSpillable() const { return Weight != huge_valf; } /// markNotSpillable - Mark interval as not spillable - void markNotSpillable() { - weight = huge_valf; - } + void markNotSpillable() { Weight = huge_valf; } /// For a given lane mask @p LaneMask, compute indexes at which the /// lane is marked undefined by subregister definitions. @@ -870,7 +871,7 @@ namespace llvm { bool operator<(const LiveInterval& other) const { const SlotIndex &thisIndex = beginIndex(); const SlotIndex &otherIndex = other.beginIndex(); - return std::tie(thisIndex, reg) < std::tie(otherIndex, other.reg); + return std::tie(thisIndex, Reg) < std::tie(otherIndex, other.Reg); } void print(raw_ostream &OS) const; diff --git a/llvm/include/llvm/CodeGen/LiveRangeEdit.h b/llvm/include/llvm/CodeGen/LiveRangeEdit.h index 3c4273130ab2b..af8fe91431c88 100644 --- a/llvm/include/llvm/CodeGen/LiveRangeEdit.h +++ b/llvm/include/llvm/CodeGen/LiveRangeEdit.h @@ -152,7 +152,7 @@ class LiveRangeEdit : private MachineRegisterInfo::Delegate { return *Parent; } - Register getReg() const { return getParent().reg; } + Register getReg() const { return getParent().reg(); } /// Iterator for accessing the new registers added by this edit. 
using iterator = SmallVectorImpl::const_iterator; diff --git a/llvm/include/llvm/CodeGen/LiveRegUnits.h b/llvm/include/llvm/CodeGen/LiveRegUnits.h index 1ed091e3bb5e9..e20e04cad35cc 100644 --- a/llvm/include/llvm/CodeGen/LiveRegUnits.h +++ b/llvm/include/llvm/CodeGen/LiveRegUnits.h @@ -15,7 +15,7 @@ #define LLVM_CODEGEN_LIVEREGUNITS_H #include "llvm/ADT/BitVector.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/MC/MCRegisterInfo.h" diff --git a/llvm/include/llvm/CodeGen/LowLevelType.h b/llvm/include/llvm/CodeGen/LowLevelType.h index 6295d86f749cb..402fa2ce61e74 100644 --- a/llvm/include/llvm/CodeGen/LowLevelType.h +++ b/llvm/include/llvm/CodeGen/LowLevelType.h @@ -23,6 +23,7 @@ namespace llvm { class DataLayout; class Type; +struct fltSemantics; /// Construct a low-level type based on an LLVM type. LLT getLLTForType(Type &Ty, const DataLayout &DL); @@ -35,6 +36,9 @@ MVT getMVTForLLT(LLT Ty); /// scalarable vector types, and will assert if used. LLT getLLTForMVT(MVT Ty); +/// Get the appropriate floating point arithmetic semantic based on the bit size +/// of the given scalar LLT. +const llvm::fltSemantics &getFltSemanticForLLT(LLT Ty); } #endif // LLVM_CODEGEN_LOWLEVELTYPE_H diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h index 0ea2da9910f39..8f80eca939fd4 100644 --- a/llvm/include/llvm/CodeGen/MachineFunction.h +++ b/llvm/include/llvm/CodeGen/MachineFunction.h @@ -431,6 +431,11 @@ class MachineFunction { using VariableDbgInfoMapTy = SmallVector; VariableDbgInfoMapTy VariableDbgInfos; + /// A count of how many instructions in the function have had numbers + /// assigned to them. Used for debug value tracking, to determine the + /// next instruction number. + unsigned DebugInstrNumberingCount = 0; + MachineFunction(Function &F, const LLVMTargetMachine &Target, const TargetSubtargetInfo &STI, unsigned FunctionNum, MachineModuleInfo &MMI); @@ -505,9 +510,6 @@ class MachineFunction { void setBBSectionsType(BasicBlockSection V) { BBSectionsType = V; } - /// Creates basic block Labels for this function. - void createBBLabels(); - /// Assign IsBeginSection IsEndSection fields for basic blocks in this /// function. void assignBeginEndSections(); @@ -1076,6 +1078,10 @@ class MachineFunction { /// the same callee. void moveCallSiteInfo(const MachineInstr *Old, const MachineInstr *New); + + unsigned getNewDebugInstrNum() { + return ++DebugInstrNumberingCount; + } }; //===--------------------------------------------------------------------===// diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h index 2c912b177384b..957ec2124e0ae 100644 --- a/llvm/include/llvm/CodeGen/MachineInstr.h +++ b/llvm/include/llvm/CodeGen/MachineInstr.h @@ -249,6 +249,10 @@ class MachineInstr DebugLoc debugLoc; // Source line information. + /// Unique instruction number. Used by DBG_INSTR_REFs to refer to the values + /// defined by this instruction. + unsigned DebugInstrNum; + // Intrusive list support friend struct ilist_traits; friend struct ilist_callback_traits; @@ -444,6 +448,14 @@ class MachineInstr /// this DBG_LABEL instruction. const DILabel *getDebugLabel() const; + /// Fetch the instruction number of this MachineInstr. If it does not have + /// one already, a new and unique number will be assigned. 
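+  /// A usage sketch, for some MachineInstr &MI:
+  /// \code
+  ///   unsigned Num = MI.getDebugInstrNum();  // assigns a number if needed
+  ///   assert(MI.peekDebugInstrNum() == Num); // peeking never assigns
+  /// \endcode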
+ unsigned getDebugInstrNum(); + + /// Examine the instruction number of this MachineInstr. May be zero if + /// it hasn't been assigned a number yet. + unsigned peekDebugInstrNum() const { return DebugInstrNum; } + /// Emit an error referring to the source location of this instruction. /// This should only be used for inline assembly that is somehow /// impossible to compile. Other errors should have been handled much @@ -1145,7 +1157,10 @@ class MachineInstr bool isDebugValue() const { return getOpcode() == TargetOpcode::DBG_VALUE; } bool isDebugLabel() const { return getOpcode() == TargetOpcode::DBG_LABEL; } - bool isDebugInstr() const { return isDebugValue() || isDebugLabel(); } + bool isDebugRef() const { return getOpcode() == TargetOpcode::DBG_INSTR_REF; } + bool isDebugInstr() const { + return isDebugValue() || isDebugLabel() || isDebugRef(); + } bool isDebugOffsetImm() const { return getDebugOffset().isImm(); } @@ -1238,6 +1253,7 @@ class MachineInstr case TargetOpcode::EH_LABEL: case TargetOpcode::GC_LABEL: case TargetOpcode::DBG_VALUE: + case TargetOpcode::DBG_INSTR_REF: case TargetOpcode::DBG_LABEL: case TargetOpcode::LIFETIME_START: case TargetOpcode::LIFETIME_END: diff --git a/llvm/include/llvm/CodeGen/MachineOutliner.h b/llvm/include/llvm/CodeGen/MachineOutliner.h index 4a1b04ab3e886..a5dbbdb4fdcd2 100644 --- a/llvm/include/llvm/CodeGen/MachineOutliner.h +++ b/llvm/include/llvm/CodeGen/MachineOutliner.h @@ -15,10 +15,11 @@ #ifndef LLVM_MACHINEOUTLINER_H #define LLVM_MACHINEOUTLINER_H +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/CodeGen/LivePhysRegs.h" namespace llvm { namespace outliner { diff --git a/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h b/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h index 56db30ff7d6de..fe07c70d85c59 100644 --- a/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h +++ b/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h @@ -39,7 +39,7 @@ class NonRelocatableStringpool { /// Get the offset of string \p S in the string table. This can insert a new /// element or return the offset of a pre-existing one. - uint32_t getStringOffset(StringRef S) { return getEntry(S).getOffset(); } + uint64_t getStringOffset(StringRef S) { return getEntry(S).getOffset(); } /// Get permanent storage for \p S (but do not necessarily emit \p S in the /// output section). A latter call to getStringOffset() with the same string @@ -57,7 +57,7 @@ class NonRelocatableStringpool { private: MapTy Strings; - uint32_t CurrentEndOffset = 0; + uint64_t CurrentEndOffset = 0; unsigned NumEntries = 0; DwarfStringPoolEntryRef EmptyString; std::function Translator; diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 5607e785e349a..b5b18f49e104f 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1049,8 +1049,8 @@ class SelectionDAG { /// Helper function to make it easier to build SetCC's if you just have an /// ISD::CondCode instead of an SDValue. 
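  /// A sketch of the extended form below (DAG, DL, VT, LHS and RHS are
  /// assumed to be in scope):
  /// \code
  ///   SDNodeFlags Flags;
  ///   Flags.setNoNaNs(true);
  ///   SDValue CC = DAG.getSetCC(DL, VT, LHS, RHS, ISD::SETLT, Flags);
  /// \endcode
  /// Chain and IsSignaling keep their defaults for ordinary (non-strict)
  /// comparisons.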
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, - ISD::CondCode Cond, SDValue Chain = SDValue(), - bool IsSignaling = false) { + ISD::CondCode Cond, SDNodeFlags Flags = SDNodeFlags(), + SDValue Chain = SDValue(), bool IsSignaling = false) { assert(LHS.getValueType().isVector() == RHS.getValueType().isVector() && "Cannot compare scalars to vectors"); assert(LHS.getValueType().isVector() == VT.isVector() && @@ -1060,7 +1060,7 @@ class SelectionDAG { if (Chain) return getNode(IsSignaling ? ISD::STRICT_FSETCCS : ISD::STRICT_FSETCC, DL, {VT, MVT::Other}, {Chain, LHS, RHS, getCondCode(Cond)}); - return getNode(ISD::SETCC, DL, VT, LHS, RHS, getCondCode(Cond)); + return getNode(ISD::SETCC, DL, VT, LHS, RHS, getCondCode(Cond), Flags); } /// Helper function to make it easier to build Select's if you just have @@ -1178,14 +1178,15 @@ class SelectionDAG { /// This function will set the MOLoad flag on MMOFlags, but you can set it if /// you want. The MOStore flag must not be set. SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, - MachinePointerInfo PtrInfo, MaybeAlign Alignment, + MachinePointerInfo PtrInfo, + MaybeAlign Alignment = MaybeAlign(), MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes(), const MDNode *Ranges = nullptr); /// FIXME: Remove once transition to Align is over. inline SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, - MachinePointerInfo PtrInfo, unsigned Alignment = 0, + MachinePointerInfo PtrInfo, unsigned Alignment, MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes(), const MDNode *Ranges = nullptr) { @@ -1197,14 +1198,14 @@ class SelectionDAG { SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, - MaybeAlign Alignment, + MaybeAlign Alignment = MaybeAlign(), MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes()); /// FIXME: Remove once transition to Align is over. inline SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, - unsigned Alignment = 0, + unsigned Alignment, MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes()) { return getExtLoad(ExtType, dl, VT, Chain, Ptr, PtrInfo, MemVT, @@ -1221,13 +1222,12 @@ class SelectionDAG { MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes(), const MDNode *Ranges = nullptr); - inline SDValue - getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, - const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue Offset, - MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment, - MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, - const AAMDNodes &AAInfo = AAMDNodes(), - const MDNode *Ranges = nullptr) { + inline SDValue getLoad( + ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &dl, + SDValue Chain, SDValue Ptr, SDValue Offset, MachinePointerInfo PtrInfo, + EVT MemVT, MaybeAlign Alignment = MaybeAlign(), + MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, + const AAMDNodes &AAInfo = AAMDNodes(), const MDNode *Ranges = nullptr) { // Ensures that codegen never sees a None Alignment. 
return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, PtrInfo, MemVT, Alignment.getValueOr(getEVTAlign(MemVT)), MMOFlags, AAInfo, @@ -1237,7 +1237,7 @@ class SelectionDAG { inline SDValue getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue Offset, - MachinePointerInfo PtrInfo, EVT MemVT, unsigned Alignment = 0, + MachinePointerInfo PtrInfo, EVT MemVT, unsigned Alignment, MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes(), const MDNode *Ranges = nullptr) { @@ -1260,7 +1260,7 @@ class SelectionDAG { const AAMDNodes &AAInfo = AAMDNodes()); inline SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, - MachinePointerInfo PtrInfo, MaybeAlign Alignment, + MachinePointerInfo PtrInfo, MaybeAlign Alignment = MaybeAlign(), MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes()) { return getStore(Chain, dl, Val, Ptr, PtrInfo, @@ -1270,7 +1270,7 @@ class SelectionDAG { /// FIXME: Remove once transition to Align is over. inline SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, - MachinePointerInfo PtrInfo, unsigned Alignment = 0, + MachinePointerInfo PtrInfo, unsigned Alignment, MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes()) { return getStore(Chain, dl, Val, Ptr, PtrInfo, MaybeAlign(Alignment), @@ -1285,7 +1285,8 @@ class SelectionDAG { const AAMDNodes &AAInfo = AAMDNodes()); inline SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, - MachinePointerInfo PtrInfo, EVT SVT, MaybeAlign Alignment, + MachinePointerInfo PtrInfo, EVT SVT, + MaybeAlign Alignment = MaybeAlign(), MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes()) { return getTruncStore(Chain, dl, Val, Ptr, PtrInfo, SVT, @@ -1295,7 +1296,7 @@ class SelectionDAG { /// FIXME: Remove once transition to Align is over. inline SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, - MachinePointerInfo PtrInfo, EVT SVT, unsigned Alignment = 0, + MachinePointerInfo PtrInfo, EVT SVT, unsigned Alignment, MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes()) { return getTruncStore(Chain, dl, Val, Ptr, PtrInfo, SVT, diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 6eef79162f8a7..fa150831bdbd0 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -357,10 +357,6 @@ template<> struct simplify_type { /// the backend. struct SDNodeFlags { private: - // This bit is used to determine if the flags are in a defined state. It is - // only used by SelectionDAGBuilder. - bool AnyDefined : 1; - bool NoUnsignedWrap : 1; bool NoSignedWrap : 1; bool Exact : 1; @@ -382,9 +378,8 @@ struct SDNodeFlags { public: /// Default constructor turns off all optimization flags. 
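  /// With the AnyDefined bit gone (below), a default-constructed object simply
  /// means "all flags off". A sketch, assuming an FPMathOperator &FPOp and an
  /// SDNode *Node in scope:
  /// \code
  ///   SDNodeFlags Flags;
  ///   Flags.copyFMF(FPOp);    // take fast-math flags from the IR operator
  ///   Node->setFlags(Flags);  // no isDefined() gate needed any more
  /// \endcode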
   SDNodeFlags()
-      : AnyDefined(false), NoUnsignedWrap(false), NoSignedWrap(false),
-        Exact(false), NoNaNs(false), NoInfs(false),
-        NoSignedZeros(false), AllowReciprocal(false),
+      : NoUnsignedWrap(false), NoSignedWrap(false), Exact(false), NoNaNs(false),
+        NoInfs(false), NoSignedZeros(false), AllowReciprocal(false),
         AllowContract(false), ApproximateFuncs(false),
         AllowReassociation(false), NoFPExcept(false) {}
@@ -399,56 +394,18 @@ struct SDNodeFlags {
     setAllowReassociation(FPMO.hasAllowReassoc());
   }
-  /// Sets the state of the flags to the defined state.
-  void setDefined() { AnyDefined = true; }
-  /// Returns true if the flags are in a defined state.
-  bool isDefined() const { return AnyDefined; }
-
   // These are mutators for each flag.
-  void setNoUnsignedWrap(bool b) {
-    setDefined();
-    NoUnsignedWrap = b;
-  }
-  void setNoSignedWrap(bool b) {
-    setDefined();
-    NoSignedWrap = b;
-  }
-  void setExact(bool b) {
-    setDefined();
-    Exact = b;
-  }
-  void setNoNaNs(bool b) {
-    setDefined();
-    NoNaNs = b;
-  }
-  void setNoInfs(bool b) {
-    setDefined();
-    NoInfs = b;
-  }
-  void setNoSignedZeros(bool b) {
-    setDefined();
-    NoSignedZeros = b;
-  }
-  void setAllowReciprocal(bool b) {
-    setDefined();
-    AllowReciprocal = b;
-  }
-  void setAllowContract(bool b) {
-    setDefined();
-    AllowContract = b;
-  }
-  void setApproximateFuncs(bool b) {
-    setDefined();
-    ApproximateFuncs = b;
-  }
-  void setAllowReassociation(bool b) {
-    setDefined();
-    AllowReassociation = b;
-  }
-  void setNoFPExcept(bool b) {
-    setDefined();
-    NoFPExcept = b;
-  }
+  void setNoUnsignedWrap(bool b) { NoUnsignedWrap = b; }
+  void setNoSignedWrap(bool b) { NoSignedWrap = b; }
+  void setExact(bool b) { Exact = b; }
+  void setNoNaNs(bool b) { NoNaNs = b; }
+  void setNoInfs(bool b) { NoInfs = b; }
+  void setNoSignedZeros(bool b) { NoSignedZeros = b; }
+  void setAllowReciprocal(bool b) { AllowReciprocal = b; }
+  void setAllowContract(bool b) { AllowContract = b; }
+  void setApproximateFuncs(bool b) { ApproximateFuncs = b; }
+  void setAllowReassociation(bool b) { AllowReassociation = b; }
+  void setNoFPExcept(bool b) { NoFPExcept = b; }
   // These are accessors for each flag.
   bool hasNoUnsignedWrap() const { return NoUnsignedWrap; }
diff --git a/llvm/include/llvm/CodeGen/StableHashing.h b/llvm/include/llvm/CodeGen/StableHashing.h
index c6113aa93c800..caf27e152e78f 100644
--- a/llvm/include/llvm/CodeGen/StableHashing.h
+++ b/llvm/include/llvm/CodeGen/StableHashing.h
@@ -40,7 +40,7 @@ inline void stable_hash_append(stable_hash &Hash, const char Value) {
 inline void stable_hash_append(stable_hash &Hash, stable_hash Value) {
   for (unsigned I = 0; I < 8; ++I) {
-    stable_hash_append(Hash, (const char)Value);
+    stable_hash_append(Hash, static_cast<const char>(Value));
     Value >>= 8;
   }
 }
diff --git a/llvm/include/llvm/CodeGen/StackMaps.h b/llvm/include/llvm/CodeGen/StackMaps.h
index ce4eb85d64525..578bc0e161a64 100644
--- a/llvm/include/llvm/CodeGen/StackMaps.h
+++ b/llvm/include/llvm/CodeGen/StackMaps.h
@@ -261,6 +261,10 @@ class StackMaps {
   StackMaps(AsmPrinter &AP);
+  /// Get the index of the next meta operand.
+  /// Similar to parseOperand, but does not actually parse the operand's
+  /// meaning.
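+  /// An iteration sketch (MI points at a stackmap-like instruction and
+  /// FirstMetaIdx is an assumed starting index):
+  /// \code
+  ///   for (unsigned I = FirstMetaIdx; I < MI->getNumOperands();
+  ///        I = StackMaps::getNextMetaArgIdx(MI, I)) {
+  ///     const MachineOperand &MO = MI->getOperand(I);
+  ///     // ... inspect MO without decoding its meaning ...
+  ///   }
+  /// \endcode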
+  static unsigned getNextMetaArgIdx(MachineInstr *MI, unsigned CurIdx);
+
   void reset() {
     CSInfos.clear();
     ConstPool.clear();
diff --git a/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h b/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h
index 4d6afa617d3a2..51f1d7d6fd218 100644
--- a/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h
+++ b/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h
@@ -10,16 +10,21 @@
 #define LLVM_CODEGEN_SWITCHLOWERINGUTILS_H
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstrTypes.h"
 #include "llvm/Support/BranchProbability.h"
+#include <vector>
 namespace llvm {
+class BlockFrequencyInfo;
+class ConstantInt;
 class FunctionLoweringInfo;
 class MachineBasicBlock;
-class BlockFrequencyInfo;
+class ProfileSummaryInfo;
+class TargetLowering;
+class TargetMachine;
 namespace SwitchCG {
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index f9f9ce41e329b..0629c81d4f4f8 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1270,6 +1270,17 @@ class TargetInstrInfo : public MCInstrInfo {
     return false;
   }
+  /// Returns true if MI's Def is NullValueReg, and the MI does not change the
+  /// zero value, i.e. cases such as rax = shr rax, X where NullValueReg = rax.
+  /// Note that if NullValueReg holds a non-zero value, this function can
+  /// return true even if that value becomes zero, e.g.
+  /// NullValueReg = shl NullValueReg, 63.
+  virtual bool preservesZeroValueInReg(const MachineInstr *MI,
+                                       const Register NullValueReg,
+                                       const TargetRegisterInfo *TRI) const {
+    return false;
+  }
+
   /// If the instruction is an increment of a constant value, return the amount.
   virtual bool getIncrementValue(const MachineInstr &MI, int &Value) const {
     return false;
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index e4e92581b893d..af6a5fa171a62 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -977,6 +977,36 @@ class TargetRegisterInfo : public MCRegisterInfo {
   virtual bool shouldRegionSplitForVirtReg(const MachineFunction &MF,
                                            const LiveInterval &VirtReg) const;
+  /// Last chance recoloring has a high compile time cost especially for
+  /// targets with a lot of registers.
+  /// This method is used to decide whether or not \p VirtReg should
+  /// go through this expensive heuristic.
+  /// When this target hook returns false, there is a high chance that the
+  /// register allocation will fail altogether (usually with a
+  /// "ran out of registers" error).
+  /// That said, this error usually points to another problem in the
+  /// optimization pipeline.
+  virtual bool
+  shouldUseLastChanceRecoloringForVirtReg(const MachineFunction &MF,
+                                          const LiveInterval &VirtReg) const {
+    return true;
+  }
+
+  /// Deferred spilling delays the spill insertion of a virtual register
+  /// until after every other allocation. By deferring the spilling, it is
+  /// sometimes possible to eliminate that spilling altogether because
+  /// something else could have been eliminated, thus leaving some space
+  /// for the virtual register.
+  /// However, this comes with a compile time impact because it adds one
+  /// more stage to the greedy register allocator.
+  /// This method is used to decide whether \p VirtReg should use the deferred
+  /// spilling stage instead of being spilled right away.
+  virtual bool
+  shouldUseDeferredSpillingForVirtReg(const MachineFunction &MF,
+                                      const LiveInterval &VirtReg) const {
+    return false;
+  }
+
   //===--------------------------------------------------------------------===//
   /// Debug information queries.
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index aec8d08f30e74..9ad0d827dfd8d 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -306,7 +306,7 @@
 #cmakedefine01 LLVM_VERSION_PRINTER_SHOW_HOST_TARGET_INFO
 /* Define if libxml2 is supported on this platform. */
-#cmakedefine LLVM_LIBXML2_ENABLED ${LLVM_LIBXML2_ENABLED}
+#cmakedefine LLVM_ENABLE_LIBXML2 ${LLVM_ENABLE_LIBXML2}
 /* Define to the extension used for shared libraries, say, ".so". */
 #cmakedefine LTDL_SHLIB_EXT "${LTDL_SHLIB_EXT}"
diff --git a/llvm/include/llvm/DebugInfo/CodeView/CVRecord.h b/llvm/include/llvm/DebugInfo/CodeView/CVRecord.h
index 784c47e3bf5dc..bb29ef5f2ce82 100644
--- a/llvm/include/llvm/DebugInfo/CodeView/CVRecord.h
+++ b/llvm/include/llvm/DebugInfo/CodeView/CVRecord.h
@@ -11,9 +11,9 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/Optional.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/CodeViewError.h"
 #include "llvm/DebugInfo/CodeView/RecordSerialization.h"
-#include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Endian.h"
@@ -61,12 +61,9 @@ template <typename Kind> class CVRecord {
   ArrayRef<uint8_t> RecordData;
 };
-template <typename Kind> struct RemappedRecord {
-  explicit RemappedRecord(const CVRecord<Kind> &R) : OriginalRecord(R) {}
-
-  CVRecord<Kind> OriginalRecord;
-  SmallVector<std::pair<uint32_t, TypeIndex>, 8> Mappings;
-};
+// There are two kinds of codeview records: type and symbol records.
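+// A reading sketch: either specialization carries only a kind plus the raw
+// record bytes, e.g.
+//   CVType T = ...;                      // CVRecord<TypeLeafKind>
+//   TypeLeafKind K = T.kind();           // record kind
+//   ArrayRef<uint8_t> Bytes = T.data();  // whole record, header included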
+using CVType = CVRecord<TypeLeafKind>;
+using CVSymbol = CVRecord<SymbolKind>;
 template <typename Record, typename Func>
 Error forEachCodeViewRecord(ArrayRef<uint8_t> StreamBuffer, Func F) {
@@ -126,6 +123,12 @@ struct VarStreamArrayExtractor<codeview::CVRecord<Kind>> {
   }
 };
+namespace codeview {
+using CVSymbolArray = VarStreamArray<CVSymbol>;
+using CVTypeArray = VarStreamArray<CVType>;
+using CVTypeRange = iterator_range<CVTypeArray::Iterator>;
+} // namespace codeview
+
 } // end namespace llvm
 #endif // LLVM_DEBUGINFO_CODEVIEW_RECORDITERATOR_H
diff --git a/llvm/include/llvm/DebugInfo/CodeView/CVSymbolVisitor.h b/llvm/include/llvm/DebugInfo/CodeView/CVSymbolVisitor.h
index 1615ff41df125..82ef8c173beec 100644
--- a/llvm/include/llvm/DebugInfo/CodeView/CVSymbolVisitor.h
+++ b/llvm/include/llvm/DebugInfo/CodeView/CVSymbolVisitor.h
@@ -10,9 +10,6 @@
 #define LLVM_DEBUGINFO_CODEVIEW_CVSYMBOLVISITOR_H
 #include "llvm/DebugInfo/CodeView/CVRecord.h"
-#include "llvm/DebugInfo/CodeView/CodeView.h"
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
-#include "llvm/DebugInfo/CodeView/SymbolVisitorDelegate.h"
 #include "llvm/Support/ErrorOr.h"
 namespace llvm {
diff --git a/llvm/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h b/llvm/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h
index f26e80ebe2a94..d851dea0a27f4 100644
--- a/llvm/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h
+++ b/llvm/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h
@@ -15,7 +15,8 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/DebugInfo/CodeView/CodeViewError.h"
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/GUID.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/BinaryStreamWriter.h"
 #include "llvm/Support/Error.h"
diff --git a/llvm/include/llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h b/llvm/include/llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h
index 784fc59484b96..51b8523ed9697 100644
--- a/llvm/include/llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h
+++ b/llvm/include/llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_DEBUGINFO_CODEVIEW_DEBUGSYMBOLSSUBSECTION_H
 #define LLVM_DEBUGINFO_CODEVIEW_DEBUGSYMBOLSSUBSECTION_H
+#include "llvm/DebugInfo/CodeView/CVRecord.h"
 #include "llvm/DebugInfo/CodeView/DebugSubsection.h"
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/Support/Error.h"
 namespace llvm {
diff --git a/llvm/include/llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h b/llvm/include/llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h
index 35eeef5a327e0..ddbb4e3c5e6c8 100644
--- a/llvm/include/llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h
+++ b/llvm/include/llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h
@@ -14,7 +14,6 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/DebugInfo/CodeView/TypeCollection.h"
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/BinaryStreamArray.h"
 #include "llvm/Support/Error.h"
diff --git a/llvm/include/llvm/DebugInfo/CodeView/RecordName.h b/llvm/include/llvm/DebugInfo/CodeView/RecordName.h
index cc09db8933bdb..8e06be9e41e8f 100644
--- a/llvm/include/llvm/DebugInfo/CodeView/RecordName.h
+++ b/llvm/include/llvm/DebugInfo/CodeView/RecordName.h
@@ -9,7 +9,6 @@
 #ifndef LLVM_DEBUGINFO_CODEVIEW_RECORDNAME_H
 #define LLVM_DEBUGINFO_CODEVIEW_RECORDNAME_H
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeCollection.h"
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
diff --git a/llvm/include/llvm/DebugInfo/CodeView/SymbolDumper.h b/llvm/include/llvm/DebugInfo/CodeView/SymbolDumper.h
index d832a48b12653..aaeffb2446ad8 100644
--- a/llvm/include/llvm/DebugInfo/CodeView/SymbolDumper.h
+++ b/llvm/include/llvm/DebugInfo/CodeView/SymbolDumper.h
@@ -11,8 +11,8 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringSet.h"
+#include "llvm/DebugInfo/CodeView/CVRecord.h"
 #include "llvm/DebugInfo/CodeView/SymbolDumpDelegate.h"
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
 namespace llvm {
diff --git a/llvm/include/llvm/DebugInfo/CodeView/SymbolRecord.h b/llvm/include/llvm/DebugInfo/CodeView/SymbolRecord.h
index 4383534b0db28..c37f6b4d5fa77 100644
--- a/llvm/include/llvm/DebugInfo/CodeView/SymbolRecord.h
+++ b/llvm/include/llvm/DebugInfo/CodeView/SymbolRecord.h
@@ -1003,9 +1003,6 @@ class AnnotationSym : public SymbolRecord {
   uint32_t RecordOffset = 0;
 };
-using CVSymbol = CVRecord<SymbolKind>;
-using CVSymbolArray = VarStreamArray<CVSymbol>;
-
 Expected<CVSymbol> readSymbolFromStream(BinaryStreamRef Stream,
                                         uint32_t Offset);
diff --git a/llvm/include/llvm/DebugInfo/CodeView/SymbolRecordHelpers.h b/llvm/include/llvm/DebugInfo/CodeView/SymbolRecordHelpers.h
index 57dbc56c0769d..71bc70dde6ed1 100644
--- a/llvm/include/llvm/DebugInfo/CodeView/SymbolRecordHelpers.h
+++ b/llvm/include/llvm/DebugInfo/CodeView/SymbolRecordHelpers.h
@@ -9,7 +9,8 @@
 #ifndef LLVM_DEBUGINFO_CODEVIEW_SYMBOLRECORDHELPERS_H
 #define LLVM_DEBUGINFO_CODEVIEW_SYMBOLRECORDHELPERS_H
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/CodeView/CVRecord.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
 namespace llvm {
 namespace codeview {
diff --git a/llvm/include/llvm/DebugInfo/CodeView/TypeCollection.h b/llvm/include/llvm/DebugInfo/CodeView/TypeCollection.h
index 102d68c3fb2a9..bde5a8b3ab2fa 100644
--- a/llvm/include/llvm/DebugInfo/CodeView/TypeCollection.h
+++ b/llvm/include/llvm/DebugInfo/CodeView/TypeCollection.h
@@ -10,9 +10,8 @@
 #define LLVM_DEBUGINFO_CODEVIEW_TYPECOLLECTION_H
 #include "llvm/ADT/StringRef.h"
-
+#include "llvm/DebugInfo/CodeView/CVRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
 namespace llvm {
 namespace codeview {
diff --git a/llvm/include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h b/llvm/include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h
index 469768787274d..f4f5835d8b57a 100644
--- a/llvm/include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h
+++ b/llvm/include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h
@@ -10,8 +10,8 @@
 #define LLVM_DEBUGINFO_CODEVIEW_TYPEINDEXDISCOVERY_H
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/CVRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/Support/Error.h"
 namespace llvm {
diff --git a/llvm/include/llvm/DebugInfo/CodeView/TypeRecord.h b/llvm/include/llvm/DebugInfo/CodeView/TypeRecord.h
index 35f5c05611385..59bdd2a7c9f2c 100644
--- a/llvm/include/llvm/DebugInfo/CodeView/TypeRecord.h
+++ b/llvm/include/llvm/DebugInfo/CodeView/TypeRecord.h
@@ -14,7 +14,6 @@
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/iterator_range.h"
 #include "llvm/DebugInfo/CodeView/CVRecord.h"
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/GUID.h"
@@ -32,15 +31,10 @@
 using support::little32_t;
 using support::ulittle16_t;
 using support::ulittle32_t;
support::ulittle32_t; -using CVType = CVRecord; -using RemappedType = RemappedRecord; - struct CVMemberRecord { TypeLeafKind Kind; ArrayRef Data; }; -using CVTypeArray = VarStreamArray; -using CVTypeRange = iterator_range; /// Equvalent to CV_fldattr_t in cvinfo.h. struct MemberAttributes { diff --git a/llvm/include/llvm/DebugInfo/CodeView/TypeRecordHelpers.h b/llvm/include/llvm/DebugInfo/CodeView/TypeRecordHelpers.h index 19492b93681cc..041f5214967c6 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/TypeRecordHelpers.h +++ b/llvm/include/llvm/DebugInfo/CodeView/TypeRecordHelpers.h @@ -9,7 +9,8 @@ #ifndef LLVM_DEBUGINFO_CODEVIEW_TYPERECORDHELPERS_H #define LLVM_DEBUGINFO_CODEVIEW_TYPERECORDHELPERS_H -#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" namespace llvm { namespace codeview { diff --git a/llvm/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h b/llvm/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h index d0506cce81762..04d7c7b0420a8 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h +++ b/llvm/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h @@ -11,7 +11,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/Support/Error.h" namespace llvm { diff --git a/llvm/include/llvm/DebugInfo/DIContext.h b/llvm/include/llvm/DebugInfo/DIContext.h index 661d30d04c94e..ae78fe912188d 100644 --- a/llvm/include/llvm/DebugInfo/DIContext.h +++ b/llvm/include/llvm/DebugInfo/DIContext.h @@ -35,6 +35,7 @@ struct DILineInfo { static constexpr const char *const Addr2LineBadString = "??"; std::string FileName; std::string FunctionName; + std::string StartFileName; Optional Source; uint32_t Line = 0; uint32_t Column = 0; @@ -43,12 +44,15 @@ struct DILineInfo { // DWARF-specific. 
uint32_t Discriminator = 0; - DILineInfo() : FileName(BadString), FunctionName(BadString) {} + DILineInfo() + : FileName(BadString), FunctionName(BadString), StartFileName(BadString) { + } bool operator==(const DILineInfo &RHS) const { return Line == RHS.Line && Column == RHS.Column && FileName == RHS.FileName && FunctionName == RHS.FunctionName && - StartLine == RHS.StartLine && Discriminator == RHS.Discriminator; + StartFileName == RHS.StartFileName && StartLine == RHS.StartLine && + Discriminator == RHS.Discriminator; } bool operator!=(const DILineInfo &RHS) const { @@ -56,10 +60,10 @@ struct DILineInfo { } bool operator<(const DILineInfo &RHS) const { - return std::tie(FileName, FunctionName, Line, Column, StartLine, - Discriminator) < - std::tie(RHS.FileName, RHS.FunctionName, RHS.Line, RHS.Column, - RHS.StartLine, RHS.Discriminator); + return std::tie(FileName, FunctionName, StartFileName, Line, Column, + StartLine, Discriminator) < + std::tie(RHS.FileName, RHS.FunctionName, RHS.StartFileName, RHS.Line, + RHS.Column, RHS.StartLine, RHS.Discriminator); } explicit operator bool() const { return *this != DILineInfo(); } @@ -72,6 +76,8 @@ struct DILineInfo { OS << "function '" << FunctionName << "', "; OS << "line " << Line << ", "; OS << "column " << Column << ", "; + if (StartFileName != BadString) + OS << "start file '" << StartFileName << "', "; OS << "start line " << StartLine << '\n'; } }; diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugAddr.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugAddr.h index 32844ffd570ff..69e67866946ce 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugAddr.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugAddr.h @@ -74,6 +74,24 @@ class DWARFDebugAddrTable { /// Return the full length of this table, including the length field. /// Return None if the length cannot be identified reliably. Optional getFullLength() const; + + /// Return the DWARF format of this table. + dwarf::DwarfFormat getFormat() const { return Format; } + + /// Return the length of this table. + uint64_t getLength() const { return Length; } + + /// Return the version of this table. + uint16_t getVersion() const { return Version; } + + /// Return the address size of this table. + uint8_t getAddressSize() const { return AddrSize; } + + /// Return the segment selector size of this table. + uint8_t getSegmentSelectorSize() const { return SegSize; } + + /// Return the parsed addresses of this table. + ArrayRef getAddressEntries() const { return Addrs; } }; } // end namespace llvm diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h index 05a6056e8e21f..5789421e53044 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h @@ -262,6 +262,7 @@ class DWARFDie { /// for this subprogram by resolving DW_AT_sepcification or /// DW_AT_abstract_origin references if necessary. uint64_t getDeclLine() const; + std::string getDeclFile(DILineInfoSpecifier::FileLineInfoKind Kind) const; /// Retrieves values of DW_AT_call_file, DW_AT_call_line and DW_AT_call_column /// from DIE (or zeroes if they are missing). 
This function looks for diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h index bcfc71381aeee..e54bed2d65d67 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h @@ -270,19 +270,13 @@ template Expected DWARFListTableBase::findList(DWARFDataExtractor Data, uint64_t Offset) { - auto Entry = ListMap.find(Offset); - if (Entry != ListMap.end()) - return Entry->second; - // Extract the list from the section and enter it into the list map. DWARFListType List; uint64_t End = getHeaderOffset() + Header.length(); - uint64_t StartingOffset = Offset; if (Error E = List.extract(Data, getHeaderOffset(), End, &Offset, Header.getSectionName(), Header.getListTypeString())) return std::move(E); - ListMap[StartingOffset] = List; return List; } diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/TpiStream.h b/llvm/include/llvm/DebugInfo/PDB/Native/TpiStream.h index 1b7fd2d54cb22..70288868ca21c 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/TpiStream.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/TpiStream.h @@ -9,7 +9,7 @@ #ifndef LLVM_DEBUGINFO_PDB_RAW_PDBTPISTREAM_H #define LLVM_DEBUGINFO_PDB_RAW_PDBTPISTREAM_H -#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/DebugInfo/PDB/Native/HashTable.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" #include "llvm/DebugInfo/PDB/Native/RawTypes.h" diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbol.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbol.h index 2982146f960c9..88849d024c233 100644 --- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbol.h +++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbol.h @@ -42,7 +42,6 @@ class StringRef; class raw_ostream; namespace pdb { -class IPDBRawSymbol; class IPDBSession; #define DECLARE_PDB_SYMBOL_CONCRETE_TYPE(TagValue) \ diff --git a/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h index 9ecc0464dec1b..3a2f8b54ad22b 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h @@ -96,7 +96,8 @@ class CompileOnDemandLayer : public IRLayer { /// Emits the given module. This should not be called by clients: it will be /// called by the JIT when a definition added via the add method is requested. 
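[Editor's note] The `DWARFListTableBase::findList` hunk above deletes the `ListMap` memoization: every call now re-extracts the list from the section instead of returning a cached copy, presumably trading repeat-lookup speed for not retaining every parsed list for the table's lifetime. A minimal standalone sketch of the before/after pattern (all names here are hypothetical stand-ins, not code from the patch):

```cpp
#include <cstdint>
#include <map>

struct List { /* parsed entries would live here */ };

// Stand-in for List.extract(...) in the real code; hypothetical.
static List extractListAt(uint64_t Offset) { (void)Offset; return List{}; }

// Before the patch: memoized lookup. Repeat queries hit the map, but the
// table retains every list it ever parsed.
struct CachedTable {
  std::map<uint64_t, List> ListMap;
  List findList(uint64_t Offset) {
    auto It = ListMap.find(Offset);
    if (It != ListMap.end())
      return It->second;
    List L = extractListAt(Offset);
    ListMap[Offset] = L;
    return L;
  }
};

// After the patch: extract on every call; a caller that needs the same
// list repeatedly is expected to keep the returned value itself.
struct UncachedTable {
  List findList(uint64_t Offset) { return extractListAt(Offset); }
};
```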
- void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override; + void emit(std::unique_ptr R, + ThreadSafeModule TSM) override; private: struct PerDylibResources { @@ -120,7 +121,8 @@ class CompileOnDemandLayer : public IRLayer { void expandPartition(GlobalValueSet &Partition); - void emitPartition(MaterializationResponsibility R, ThreadSafeModule TSM, + void emitPartition(std::unique_ptr R, + ThreadSafeModule TSM, IRMaterializationUnit::SymbolNameToDefinitionMap Defs); mutable std::mutex CODLayerMutex; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/CompileUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/CompileUtils.h index 8376d163d57a5..c7ba57228ab71 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/CompileUtils.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/CompileUtils.h @@ -28,8 +28,6 @@ class TargetMachine; namespace orc { -class JITTargetMachineBuilder; - IRSymbolMapper::ManglingOptions irManglingOptionsFromTargetOptions(const TargetOptions &Opts); diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/llvm/include/llvm/ExecutionEngine/Orc/Core.h index 6951df3f2d3f2..70bd983c40ce0 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Core.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Core.h @@ -410,7 +410,7 @@ class UnexpectedSymbolDefinitions : public ErrorInfo + delegate(const SymbolNameSet &Symbols, VModuleKey NewKey = VModuleKey()); void addDependencies(const SymbolStringPtr &Name, const SymbolDependenceMap &Dependencies); @@ -577,7 +577,8 @@ class MaterializationUnit { /// Implementations of this method should materialize all symbols /// in the materialzation unit, except for those that have been /// previously discarded. - virtual void materialize(MaterializationResponsibility R) = 0; + virtual void + materialize(std::unique_ptr R) = 0; /// Called by JITDylibs to notify MaterializationUnits that the given symbol /// has been overridden. @@ -594,10 +595,11 @@ class MaterializationUnit { private: virtual void anchor(); - MaterializationResponsibility + std::unique_ptr createMaterializationResponsibility(std::shared_ptr JD) { - return MaterializationResponsibility(std::move(JD), std::move(SymbolFlags), - std::move(InitSymbol), K); + return std::unique_ptr( + new MaterializationResponsibility(std::move(JD), std::move(SymbolFlags), + std::move(InitSymbol), K)); } /// Implementations of this method should discard the given symbol @@ -621,7 +623,7 @@ class AbsoluteSymbolsMaterializationUnit : public MaterializationUnit { StringRef getName() const override; private: - void materialize(MaterializationResponsibility R) override; + void materialize(std::unique_ptr R) override; void discard(const JITDylib &JD, const SymbolStringPtr &Name) override; static SymbolFlagsMap extractFlags(const SymbolMap &Symbols); @@ -663,7 +665,7 @@ class ReExportsMaterializationUnit : public MaterializationUnit { StringRef getName() const override; private: - void materialize(MaterializationResponsibility R) override; + void materialize(std::unique_ptr R) override; void discard(const JITDylib &JD, const SymbolStringPtr &Name) override; static SymbolFlagsMap extractFlags(const SymbolAliasMap &Aliases); @@ -1116,7 +1118,7 @@ class ExecutionSession { /// For dispatching MaterializationUnit::materialize calls. using DispatchMaterializationFunction = std::function MU, - MaterializationResponsibility MR)>; + std::unique_ptr MR)>; /// Construct an ExecutionSession. 
/// @@ -1268,10 +1270,11 @@ class ExecutionSession { SymbolState RequiredState = SymbolState::Ready); /// Materialize the given unit. - void dispatchMaterialization(std::unique_ptr MU, - MaterializationResponsibility MR) { + void + dispatchMaterialization(std::unique_ptr MU, + std::unique_ptr MR) { assert(MU && "MU must be non-null"); - DEBUG_WITH_TYPE("orc", dumpDispatchInfo(MR.getTargetJITDylib(), *MU)); + DEBUG_WITH_TYPE("orc", dumpDispatchInfo(MR->getTargetJITDylib(), *MU)); DispatchMaterialization(std::move(MU), std::move(MR)); } @@ -1283,9 +1286,9 @@ class ExecutionSession { logAllUnhandledErrors(std::move(Err), errs(), "JIT session error: "); } - static void - materializeOnCurrentThread(std::unique_ptr MU, - MaterializationResponsibility MR) { + static void materializeOnCurrentThread( + std::unique_ptr MU, + std::unique_ptr MR) { MU->materialize(std::move(MR)); } @@ -1309,7 +1312,7 @@ class ExecutionSession { // with callbacks from asynchronous queries. mutable std::recursive_mutex OutstandingMUsMutex; std::vector, - MaterializationResponsibility>> + std::unique_ptr>> OutstandingMUs; }; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/GlobalMappingLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/GlobalMappingLayer.h index a4e43d4e1c9c2..943404262bd04 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/GlobalMappingLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/GlobalMappingLayer.h @@ -22,7 +22,6 @@ namespace llvm { class Module; -class JITSymbolResolver; namespace orc { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h index eb74d283f0435..2c53e2f66e851 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h @@ -55,7 +55,8 @@ class IRCompileLayer : public IRLayer { void setNotifyCompiled(NotifyCompiledFunction NotifyCompiled); - void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override; + void emit(std::unique_ptr R, + ThreadSafeModule TSM) override; private: mutable std::mutex IRLayerMutex; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h index 296d74ae6b865..ee4ee3437fa6d 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h @@ -37,7 +37,8 @@ class IRTransformLayer : public IRLayer { this->Transform = std::move(Transform); } - void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override; + void emit(std::unique_ptr R, + ThreadSafeModule TSM) override; static ThreadSafeModule identityTransform(ThreadSafeModule TSM, MaterializationResponsibility &R) { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Layer.h b/llvm/include/llvm/ExecutionEngine/Orc/Layer.h index e843d0f562455..c8a41199760da 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Layer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Layer.h @@ -100,7 +100,8 @@ class IRLayer { VModuleKey K = VModuleKey()); /// Emit should materialize the given IR. 
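[Editor's note] The `ExecutionSession` hunks above complete the ORC-wide migration: `MaterializationResponsibility` now travels as a `std::unique_ptr`, which makes its move-only ownership explicit when a dispatcher hands work to another thread. A hedged sketch of a custom dispatcher under the new shape, assuming the existing `setDispatchMaterialization` hook; the detached-thread strategy is illustrative, not from the patch:

```cpp
#include "llvm/ExecutionEngine/Orc/Core.h"
#include <memory>
#include <thread>

using namespace llvm::orc;

// Hand each materialization to a fresh detached thread. The outer lambda
// captures nothing, so it stays copyable as std::function requires; the
// move-only unique_ptrs are captured by move only in the thread body.
void installThreadedDispatcher(ExecutionSession &ES) {
  ES.setDispatchMaterialization(
      [](std::unique_ptr<MaterializationUnit> MU,
         std::unique_ptr<MaterializationResponsibility> MR) {
        std::thread([MU = std::move(MU), MR = std::move(MR)]() mutable {
          MU->materialize(std::move(MR));
        }).detach();
      });
}
```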
- virtual void emit(MaterializationResponsibility R, ThreadSafeModule TSM) = 0; + virtual void emit(std::unique_ptr R, + ThreadSafeModule TSM) = 0; private: bool CloneToNewContextOnEmit = false; @@ -117,8 +118,7 @@ class BasicIRLayerMaterializationUnit : public IRMaterializationUnit { ThreadSafeModule TSM, VModuleKey K); private: - - void materialize(MaterializationResponsibility R) override; + void materialize(std::unique_ptr R) override; IRLayer &L; VModuleKey K; @@ -139,7 +139,7 @@ class ObjectLayer { VModuleKey K = VModuleKey()); /// Emit should materialize the given IR. - virtual void emit(MaterializationResponsibility R, + virtual void emit(std::unique_ptr R, std::unique_ptr O) = 0; private: @@ -162,8 +162,7 @@ class BasicObjectLayerMaterializationUnit : public MaterializationUnit { StringRef getName() const override; private: - - void materialize(MaterializationResponsibility R) override; + void materialize(std::unique_ptr R) override; void discard(const JITDylib &JD, const SymbolStringPtr &Name) override; ObjectLayer &L; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/LazyReexports.h b/llvm/include/llvm/ExecutionEngine/Orc/LazyReexports.h index 9206e40fffb1c..63e3a80d87d86 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/LazyReexports.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/LazyReexports.h @@ -149,7 +149,7 @@ class LazyReexportsMaterializationUnit : public MaterializationUnit { StringRef getName() const override; private: - void materialize(MaterializationResponsibility R) override; + void materialize(std::unique_ptr R) override; void discard(const JITDylib &JD, const SymbolStringPtr &Name) override; static SymbolFlagsMap extractFlags(const SymbolAliasMap &Aliases); diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h index cb8ee130ab614..cbcf3928be3df 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h @@ -119,7 +119,7 @@ class ObjectLinkingLayer : public ObjectLayer { } /// Emit the object. - void emit(MaterializationResponsibility R, + void emit(std::unique_ptr R, std::unique_ptr O) override; /// Instructs this ObjectLinkingLayer instance to override the symbol flags diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h index bf989cc8677cf..c77649f19fc74 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h @@ -31,7 +31,7 @@ class ObjectTransformLayer : public ObjectLayer { ObjectTransformLayer(ExecutionSession &ES, ObjectLayer &BaseLayer, TransformFunction Transform = TransformFunction()); - void emit(MaterializationResponsibility R, + void emit(std::unique_ptr R, std::unique_ptr O) override; void setTransform(TransformFunction Transform) { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h index 9ada0871cf0cb..9cd3c57a19c6a 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h @@ -58,7 +58,7 @@ class RTDyldObjectLinkingLayer : public ObjectLayer { ~RTDyldObjectLinkingLayer(); /// Emit the object. - void emit(MaterializationResponsibility R, + void emit(std::unique_ptr R, std::unique_ptr O) override; /// Set the NotifyLoaded callback. 
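[Editor's note] Taken together, the `IRLayer`/`ObjectLayer` hunks in this stretch change every `emit` override in the tree. For a downstream layer the mechanical update looks like the sketch below: a hypothetical pass-through layer (not code from the patch) that receives the responsibility by `unique_ptr` and forwards it with `std::move`. The constructor pattern mirrors the `Speculation.h` hunk that follows.

```cpp
#include "llvm/ExecutionEngine/Orc/Layer.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::orc;

// Hypothetical layer that dumps each module before delegating downstream.
class DumpingIRLayer : public IRLayer {
public:
  DumpingIRLayer(ExecutionSession &ES, IRLayer &Base)
      : IRLayer(ES, Base.getManglingOptions()), Base(Base) {}

  void emit(std::unique_ptr<MaterializationResponsibility> R,
            ThreadSafeModule TSM) override {
    TSM.withModuleDo([](Module &M) { M.print(errs(), nullptr); });
    Base.emit(std::move(R), std::move(TSM)); // ownership moves downstream
  }

private:
  IRLayer &Base;
};
```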
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h b/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h index 10f78c8bc6beb..a138f60a77564 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h @@ -181,7 +181,8 @@ class IRSpeculationLayer : public IRLayer { : IRLayer(ES, BaseLayer.getManglingOptions()), NextLayer(BaseLayer), S(Spec), Mangle(Mangle), QueryAnalysis(Interpreter) {} - void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override; + void emit(std::unique_ptr R, + ThreadSafeModule TSM) override; private: TargetAndLikelies diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index 9ad7efff6ef56..1b39fff3edec4 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -1118,6 +1118,8 @@ __OMP_TRAIT_SELECTOR(implementation, extension, true) __OMP_TRAIT_PROPERTY(implementation, extension, match_all) __OMP_TRAIT_PROPERTY(implementation, extension, match_any) __OMP_TRAIT_PROPERTY(implementation, extension, match_none) +__OMP_TRAIT_PROPERTY(implementation, extension, disable_implicit_base) +__OMP_TRAIT_PROPERTY(implementation, extension, allow_templates) __OMP_TRAIT_SET(user) diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index f223fadcce23f..5fa3620791856 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -785,7 +785,11 @@ class IRBuilderBase { /// Create an assume intrinsic call that allows the optimizer to /// assume that the provided condition will be true. - CallInst *CreateAssumption(Value *Cond); + /// + /// The optional argument \p OpBundles specifies operand bundles that are + /// added to the call instruction. + CallInst *CreateAssumption(Value *Cond, + ArrayRef OpBundles = llvm::None); /// Create a call to the experimental.gc.statepoint intrinsic to /// start a new statepoint sequence. @@ -2513,13 +2517,11 @@ class IRBuilderBase { private: /// Helper function that creates an assume intrinsic call that - /// represents an alignment assumption on the provided Ptr, Mask, Type - /// and Offset. It may be sometimes useful to do some other logic - /// based on this alignment check, thus it can be stored into 'TheCheck'. + /// represents an alignment assumption on the provided pointer \p PtrValue + /// with offset \p OffsetValue and alignment value \p AlignValue. CallInst *CreateAlignmentAssumptionHelper(const DataLayout &DL, - Value *PtrValue, Value *Mask, - Type *IntPtrTy, Value *OffsetValue, - Value **TheCheck); + Value *PtrValue, Value *AlignValue, + Value *OffsetValue); public: /// Create an assume intrinsic call that represents an alignment @@ -2528,13 +2530,9 @@ class IRBuilderBase { /// An optional offset can be provided, and if it is provided, the offset /// must be subtracted from the provided pointer to get the pointer with the /// specified alignment. - /// - /// It may be sometimes useful to do some other logic - /// based on this alignment check, thus it can be stored into 'TheCheck'. CallInst *CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, unsigned Alignment, - Value *OffsetValue = nullptr, - Value **TheCheck = nullptr); + Value *OffsetValue = nullptr); /// Create an assume intrinsic call that represents an alignment /// assumption on the provided pointer. 
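[Editor's note] The `IRBuilder` hunk above folds alignment assumptions into operand bundles on `llvm.assume` and deletes the `TheCheck` out-parameter, so callers no longer receive the synthesized alignment check. A hedged usage sketch under the new signatures shown in the hunk; `B`, `DL`, and `Ptr` are assumed to exist in the caller, and the lowered IR shown in comments is an expectation, not quoted output:

```cpp
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Assume Ptr is 16-byte aligned at the current insertion point. After this
// patch the helper should emit roughly
//   call void @llvm.assume(i1 true) ["align"(i8* %ptr, i64 16)]
// rather than explicit ptrtoint/and/icmp check IR.
void emitAlignAssume(IRBuilder<> &B, const DataLayout &DL, Value *Ptr) {
  B.CreateAlignmentAssumption(DL, Ptr, /*Alignment=*/16);
}

// The same idea spelled through the new OpBundles parameter directly.
void emitAlignAssumeAsBundle(IRBuilder<> &B, Value *Ptr) {
  B.CreateAssumption(B.getTrue(),
                     {OperandBundleDef("align", {Ptr, B.getInt64(16)})});
}
```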
@@ -2543,15 +2541,11 @@ class IRBuilderBase { /// must be subtracted from the provided pointer to get the pointer with the /// specified alignment. /// - /// It may be sometimes useful to do some other logic - /// based on this alignment check, thus it can be stored into 'TheCheck'. - /// /// This overload handles the condition where the Alignment is dependent /// on an existing value rather than a static value. CallInst *CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, Value *Alignment, - Value *OffsetValue = nullptr, - Value **TheCheck = nullptr); + Value *OffsetValue = nullptr); }; /// This provides a uniform API for creating instructions and inserting diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index d42d576dc2030..20c6d3b8cb1c4 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1349,42 +1349,42 @@ def int_get_active_lane_mask: //===-------------------------- Masked Intrinsics -------------------------===// // -def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, - LLVMAnyPointerType>, - llvm_i32_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [IntrArgMemOnly, IntrWillReturn, ImmArg>]>; - -def int_masked_load : Intrinsic<[llvm_anyvector_ty], - [LLVMAnyPointerType>, llvm_i32_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], - [IntrReadMem, IntrArgMemOnly, IntrWillReturn, - ImmArg>]>; - -def int_masked_gather: Intrinsic<[llvm_anyvector_ty], - [LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - LLVMMatchType<0>], - [IntrReadMem, IntrWillReturn, - ImmArg>]>; - -def int_masked_scatter: Intrinsic<[], - [llvm_anyvector_ty, - LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [IntrWillReturn, ImmArg>]>; - -def int_masked_expandload: Intrinsic<[llvm_anyvector_ty], - [LLVMPointerToElt<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - LLVMMatchType<0>], - [IntrReadMem, IntrWillReturn]>; - -def int_masked_compressstore: Intrinsic<[], - [llvm_anyvector_ty, - LLVMPointerToElt<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [IntrArgMemOnly, IntrWillReturn]>; +def int_masked_load: + Intrinsic<[llvm_anyvector_ty], + [LLVMAnyPointerType>, llvm_i32_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], + [IntrReadMem, IntrArgMemOnly, IntrWillReturn, ImmArg>]>; + +def int_masked_store: + Intrinsic<[], + [llvm_anyvector_ty, LLVMAnyPointerType>, + llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [IntrWriteMem, IntrArgMemOnly, IntrWillReturn, + ImmArg>]>; + +def int_masked_gather: + Intrinsic<[llvm_anyvector_ty], + [LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], + [IntrReadMem, IntrWillReturn, ImmArg>]>; + +def int_masked_scatter: + Intrinsic<[], + [llvm_anyvector_ty, LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [IntrWriteMem, IntrWillReturn, ImmArg>]>; + +def int_masked_expandload: + Intrinsic<[llvm_anyvector_ty], + [LLVMPointerToElt<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMMatchType<0>], + [IntrReadMem, IntrWillReturn]>; + +def int_masked_compressstore: + Intrinsic<[], + [llvm_anyvector_ty, LLVMPointerToElt<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [IntrWriteMem, IntrArgMemOnly, IntrWillReturn]>; // Test whether a pointer is associated with a type metadata identifier. 
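[Editor's note] Beyond the reflow, the masked-intrinsic hunk below tightens memory attributes: `llvm.masked.store`, `llvm.masked.scatter`, and `llvm.masked.compressstore` now carry `IntrWriteMem`. Frontends usually reach these intrinsics through `IRBuilder`; a small sketch, assuming the `Align`-based builder signatures of this era (the helper names and the 16-byte alignment are illustrative):

```cpp
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Masked copy of a vector: load lanes where Mask is true (other lanes take
// Passthru), then store the result back under the same mask.
void maskedCopy(IRBuilder<> &B, Value *Ptr, Value *Mask, Value *Passthru) {
  Value *V = B.CreateMaskedLoad(Ptr, Align(16), Mask, Passthru, "mload");
  B.CreateMaskedStore(V, Ptr, Align(16), Mask);
}
```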
def int_type_test : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty], diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 3536facfa9aea..62f009b666d08 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1012,7 +1012,7 @@ def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic< AMDGPURsrcIntrinsic<2, 0>; // gfx908 intrinsic -def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic; +def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic; class AMDGPUStructBufferAtomic : Intrinsic < !if(NoRtn, [], [data_ty]), @@ -1049,7 +1049,7 @@ def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic< AMDGPURsrcIntrinsic<2, 0>; // gfx908 intrinsic -def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic; +def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic; // Obsolescent tbuffer intrinsics. @@ -1181,6 +1181,19 @@ def int_amdgcn_buffer_atomic_cmpswap : Intrinsic< AMDGPURsrcIntrinsic<2, 0>; def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic; + +class AMDGPUBufferAtomicFP : Intrinsic < + [llvm_anyfloat_ty], + [LLVMMatchType<0>, // vdata(VGPR) + llvm_v4i32_ty, // rsrc(SGPR) + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // offset(SGPR/VGPR/imm) + llvm_i1_ty], // slc(imm) + [ImmArg>], "", [SDNPMemOperand]>, + AMDGPURsrcIntrinsic<1, 0>; + +// Legacy form of the intrinsic. raw and struct forms should be preferred. +def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP; } // defset AMDGPUBufferIntrinsics // Uses that do not set the done bit should set IntrWriteMem on the @@ -1685,6 +1698,14 @@ class AMDGPUGlobalAtomicRtn : Intrinsic < def int_amdgcn_global_atomic_csub : AMDGPUGlobalAtomicRtn; +// uint4 llvm.amdgcn.image.bvh.intersect.ray , , , +// , ray_inv_dir>, +def int_amdgcn_image_bvh_intersect_ray : + Intrinsic<[llvm_v4i32_ty], + [llvm_anyint_ty, llvm_float_ty, llvm_v4f32_ty, llvm_anyvector_ty, + LLVMMatchType<1>, llvm_v4i32_ty], + [IntrReadMem]>; + //===----------------------------------------------------------------------===// // Deep learning intrinsics. //===----------------------------------------------------------------------===// @@ -1800,27 +1821,7 @@ def int_amdgcn_udot8 : // gfx908 intrinsics // ===----------------------------------------------------------------------===// -class AMDGPUBufferAtomicNoRtn : Intrinsic < - [], - [llvm_anyfloat_ty, // vdata(VGPR) - llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // vindex(VGPR) - llvm_i32_ty, // offset(SGPR/VGPR/imm) - llvm_i1_ty], // slc(imm) - [ImmArg>, IntrWillReturn], "", [SDNPMemOperand]>, - AMDGPURsrcIntrinsic<1, 0>; - -class AMDGPUGlobalAtomicNoRtn : Intrinsic < - [], - [llvm_anyptr_ty, // vaddr - llvm_anyfloat_ty], // vdata(VGPR) - [IntrArgMemOnly, IntrWillReturn, NoCapture>], "", - [SDNPMemOperand]>; - -def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicNoRtn; - -// Legacy form of the intrinsic. raw and struct forms should be preferred. 
-def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicNoRtn; +def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicRtn; // llvm.amdgcn.mfma.f32.* vdst, srcA, srcB, srcC, cbsz, abid, blgp def int_amdgcn_mfma_f32_32x32x1f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x1f32">, diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td index 73a49ec77f8b4..34ef4b768e3b7 100644 --- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td +++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td @@ -467,6 +467,20 @@ let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.". def int_ppc_altivec_vexpandqm : GCCBuiltin<"__builtin_altivec_vexpandqm">, Intrinsic<[llvm_v1i128_ty], [llvm_v1i128_ty], [IntrNoMem]>; + // P10 Vector Count with Mask intrinsics. + def int_ppc_altivec_vcntmbb : GCCBuiltin<"__builtin_altivec_vcntmbb">, + Intrinsic<[llvm_i64_ty], [llvm_v16i8_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + def int_ppc_altivec_vcntmbh : GCCBuiltin<"__builtin_altivec_vcntmbh">, + Intrinsic<[llvm_i64_ty], [llvm_v8i16_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + def int_ppc_altivec_vcntmbw : GCCBuiltin<"__builtin_altivec_vcntmbw">, + Intrinsic<[llvm_i64_ty], [llvm_v4i32_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + def int_ppc_altivec_vcntmbd : GCCBuiltin<"__builtin_altivec_vcntmbd">, + Intrinsic<[llvm_i64_ty], [llvm_v2i64_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + // P10 Vector Parallel Bits Deposit/Extract Doubleword Builtins. def int_ppc_altivec_vpdepd : GCCBuiltin<"__builtin_altivec_vpdepd">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], diff --git a/llvm/include/llvm/IR/LegacyPassManagers.h b/llvm/include/llvm/IR/LegacyPassManagers.h index 6b1ddd4d79f8f..498e736a0100c 100644 --- a/llvm/include/llvm/IR/LegacyPassManagers.h +++ b/llvm/include/llvm/IR/LegacyPassManagers.h @@ -88,7 +88,6 @@ namespace llvm { template class ArrayRef; class Module; -class Pass; class StringRef; class Value; class Timer; diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 5d11a25c7a9ad..0e75f2fa98014 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -113,6 +113,7 @@ void initializeCalledValuePropagationLegacyPassPass(PassRegistry &); void initializeCodeGenPreparePass(PassRegistry&); void initializeConstantHoistingLegacyPassPass(PassRegistry&); void initializeConstantMergeLegacyPassPass(PassRegistry&); +void initializeConstraintEliminationPass(PassRegistry &); void initializeControlHeightReductionLegacyPassPass(PassRegistry&); void initializeCorrelatedValuePropagationPass(PassRegistry&); void initializeCostModelAnalysisPass(PassRegistry&); @@ -177,7 +178,7 @@ void initializeGlobalSplitPass(PassRegistry&); void initializeGlobalsAAWrapperPassPass(PassRegistry&); void initializeGuardWideningLegacyPassPass(PassRegistry&); void initializeHardwareLoopsPass(PassRegistry&); -void initializeHeapProfilerLegacyPassPass(PassRegistry &); +void initializeMemProfilerLegacyPassPass(PassRegistry &); void initializeHotColdSplittingLegacyPassPass(PassRegistry&); void initializeHWAddressSanitizerLegacyPassPass(PassRegistry &); void initializeIPSCCPLegacyPassPass(PassRegistry&); @@ -264,7 +265,7 @@ void initializeLowerGuardIntrinsicLegacyPassPass(PassRegistry&); void initializeLowerWidenableConditionLegacyPassPass(PassRegistry&); void initializeLowerIntrinsicsPass(PassRegistry&); void initializeLowerInvokeLegacyPassPass(PassRegistry&); -void initializeLowerSwitchPass(PassRegistry&); +void 
initializeLowerSwitchLegacyPassPass(PassRegistry &); void initializeLowerTypeTestsPass(PassRegistry&); void initializeLowerMatrixIntrinsicsLegacyPassPass(PassRegistry &); void initializeLowerMatrixIntrinsicsMinimalLegacyPassPass(PassRegistry &); @@ -306,7 +307,7 @@ void initializeMergeICmpsLegacyPassPass(PassRegistry &); void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry&); void initializeMetaRenamerPass(PassRegistry&); void initializeModuleDebugInfoPrinterPass(PassRegistry&); -void initializeModuleHeapProfilerLegacyPassPass(PassRegistry &); +void initializeModuleMemProfilerLegacyPassPass(PassRegistry &); void initializeModuleSummaryIndexWrapperPassPass(PassRegistry&); void initializeModuloScheduleTestPass(PassRegistry&); void initializeMustExecutePrinterPass(PassRegistry&); diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h index 0226e4a3fbf56..735969c47039b 100644 --- a/llvm/include/llvm/LTO/LTOBackend.h +++ b/llvm/include/llvm/LTO/LTOBackend.h @@ -44,7 +44,8 @@ Error thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream, Module &M, const ModuleSummaryIndex &CombinedIndex, const FunctionImporter::ImportMapTy &ImportList, const GVSummaryMapTy &DefinedGlobals, - MapVector &ModuleMap); + MapVector &ModuleMap, + const std::vector *CmdArgs = nullptr); Error finalizeOptimizationRemarks( std::unique_ptr DiagOutputFile); diff --git a/llvm/include/llvm/MC/MCELFObjectWriter.h b/llvm/include/llvm/MC/MCELFObjectWriter.h index 8f78b99d37949..5d99c494b11eb 100644 --- a/llvm/include/llvm/MC/MCELFObjectWriter.h +++ b/llvm/include/llvm/MC/MCELFObjectWriter.h @@ -23,7 +23,6 @@ namespace llvm { class MCAssembler; class MCContext; class MCFixup; -class MCObjectWriter; class MCSymbol; class MCSymbolELF; class MCValue; diff --git a/llvm/include/llvm/MC/MCFragment.h b/llvm/include/llvm/MC/MCFragment.h index 87338ab46cc2a..0e5a5976cc8e4 100644 --- a/llvm/include/llvm/MC/MCFragment.h +++ b/llvm/include/llvm/MC/MCFragment.h @@ -64,6 +64,10 @@ class MCFragment : public ilist_node_with_parent { /// The layout order of this fragment. unsigned LayoutOrder; + /// The subsection this fragment belongs to. This is 0 if the fragment is not + // in any subsection. + unsigned SubsectionNumber = 0; + FragmentType Kind; /// Whether fragment is being laid out. @@ -102,6 +106,9 @@ class MCFragment : public ilist_node_with_parent { bool hasInstructions() const { return HasInstructions; } void dump() const; + + void setSubsectionNumber(unsigned Value) { SubsectionNumber = Value; } + unsigned getSubsectionNumber() const { return SubsectionNumber; } }; class MCDummyFragment : public MCFragment { diff --git a/llvm/include/llvm/MC/MCObjectFileInfo.h b/llvm/include/llvm/MC/MCObjectFileInfo.h index ca04d8e8d3b68..8c6bcba2332b1 100644 --- a/llvm/include/llvm/MC/MCObjectFileInfo.h +++ b/llvm/include/llvm/MC/MCObjectFileInfo.h @@ -338,6 +338,8 @@ class MCObjectFileInfo { MCSection *getStackSizesSection(const MCSection &TextSec) const; + MCSection *getBBAddrMapSection(const MCSection &TextSec) const; + // ELF specific sections. 
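[Editor's note] The `MCFragment` hunk above attaches a subsection number to every fragment (0 meaning "not in any subsection") with a trivial setter/getter pair. A hedged sketch of a reader-side consumer, assuming the usual fragment iteration over an `MCSection`; the counting function is invented for illustration:

```cpp
#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCSection.h"

using namespace llvm;

// Count the fragments that were emitted into a nonzero subsection.
unsigned countSubsectionFragments(const MCSection &Sec) {
  unsigned N = 0;
  for (const MCFragment &F : Sec)
    if (F.getSubsectionNumber() != 0)
      ++N;
  return N;
}
```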
MCSection *getDataRelROSection() const { return DataRelROSection; } const MCSection *getMergeableConst4Section() const { diff --git a/llvm/include/llvm/MC/MCParser/MCAsmParser.h b/llvm/include/llvm/MC/MCParser/MCAsmParser.h index a68066e0f50b5..2040810eac141 100644 --- a/llvm/include/llvm/MC/MCParser/MCAsmParser.h +++ b/llvm/include/llvm/MC/MCParser/MCAsmParser.h @@ -90,6 +90,20 @@ struct InlineAsmIdentifierInfo { IdKind Kind; }; +// Generic type information for an assembly object. +// All sizes measured in bytes. +struct AsmTypeInfo { + StringRef Name; + unsigned Size = 0; + unsigned ElementSize = 0; + unsigned Length = 0; +}; + +struct AsmFieldInfo { + AsmTypeInfo Type; + unsigned Offset = 0; +}; + /// Generic Sema callback for assembly parser. class MCAsmParserSemaCallback { public: @@ -170,12 +184,15 @@ class MCAsmParser { virtual bool isParsingMasm() const { return false; } - virtual bool lookUpField(StringRef Name, StringRef &Type, - unsigned &Offset) const { + virtual bool lookUpField(StringRef Name, AsmFieldInfo &Info) const { return true; } - virtual bool lookUpField(StringRef Base, StringRef Member, StringRef &Type, - unsigned &Offset) const { + virtual bool lookUpField(StringRef Base, StringRef Member, + AsmFieldInfo &Info) const { + return true; + } + + virtual bool lookUpType(StringRef Name, AsmTypeInfo &Info) const { return true; } @@ -281,7 +298,8 @@ class MCAsmParser { /// \param Res - The value of the expression. The result is undefined /// on error. /// \return - False on success. - virtual bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) = 0; + virtual bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, + AsmTypeInfo *TypeInfo) = 0; /// Parse an arbitrary expression, assuming that an initial '(' has /// already been consumed. diff --git a/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h b/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h index 1d10c66b4201f..0a1e50d501e93 100644 --- a/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h +++ b/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h @@ -24,7 +24,6 @@ namespace llvm { class MCInst; -class MCParsedAsmOperand; class MCStreamer; class MCSubtargetInfo; template class SmallVectorImpl; @@ -370,7 +369,7 @@ class MCTargetAsmParser : public MCAsmParserExtension { // Target-specific parsing of expression. 
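[Editor's note] The new `AsmTypeInfo`/`AsmFieldInfo` structs above let MASM-aware parsers report the size, element size, and length of a named type through `lookUpType`, whose default implementation fails. Note the inverted convention carried over from the surrounding interface: returning true signals an error. A purely hypothetical override (the type table and class are invented; inheritance is elided to keep the sketch short):

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCParser/MCAsmParser.h"

using namespace llvm;

// Hypothetical parser resolving a single built-in type.
struct ToyMasmParser /* : public MCAsmParser */ {
  bool lookUpType(StringRef Name, AsmTypeInfo &Info) const {
    if (!Name.equals_lower("dword"))
      return true;          // unknown type -> error, per the convention
    Info.Name = "dword";
    Info.Size = 4;          // total size in bytes
    Info.ElementSize = 4;   // scalar, so element size equals size
    Info.Length = 1;        // one element
    return false;           // success
  }
};
```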
virtual bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { - return getParser().parsePrimaryExpr(Res, EndLoc); + return getParser().parsePrimaryExpr(Res, EndLoc, nullptr); } virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, diff --git a/llvm/include/llvm/MC/MCWasmObjectWriter.h b/llvm/include/llvm/MC/MCWasmObjectWriter.h index 382818ad6867a..00da632bbcc61 100644 --- a/llvm/include/llvm/MC/MCWasmObjectWriter.h +++ b/llvm/include/llvm/MC/MCWasmObjectWriter.h @@ -52,6 +52,10 @@ std::unique_ptr createWasmObjectWriter(std::unique_ptr MOTW, raw_pwrite_stream &OS); +std::unique_ptr +createWasmDwoObjectWriter(std::unique_ptr MOTW, + raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS); + } // namespace llvm #endif diff --git a/llvm/include/llvm/MC/MCWinEH.h b/llvm/include/llvm/MC/MCWinEH.h index 53cffccce8c1a..f05f5f1641cd0 100644 --- a/llvm/include/llvm/MC/MCWinEH.h +++ b/llvm/include/llvm/MC/MCWinEH.h @@ -26,6 +26,14 @@ struct Instruction { Instruction(unsigned Op, MCSymbol *L, unsigned Reg, unsigned Off) : Label(L), Offset(Off), Register(Reg), Operation(Op) {} + + bool operator==(const Instruction &I) const { + // Check whether two instructions refer to the same operation + // applied at a different spot (i.e. pointing at a different label). + return Offset == I.Offset && Register == I.Register && + Operation == I.Operation; + } + bool operator!=(const Instruction &I) const { return !(*this == I); } }; struct FrameInfo { diff --git a/llvm/include/llvm/Object/ELF.h b/llvm/include/llvm/Object/ELF.h index 35d2456f7ce20..f4ba2cf66d9f3 100644 --- a/llvm/include/llvm/Object/ELF.h +++ b/llvm/include/llvm/Object/ELF.h @@ -58,11 +58,11 @@ enum PPCInstrMasks : uint64_t { template class ELFFile; template -std::string getSecIndexForError(const ELFFile *Obj, - const typename ELFT::Shdr *Sec) { - auto TableOrErr = Obj->sections(); +std::string getSecIndexForError(const ELFFile &Obj, + const typename ELFT::Shdr &Sec) { + auto TableOrErr = Obj.sections(); if (TableOrErr) - return "[index " + std::to_string(Sec - &TableOrErr->front()) + "]"; + return "[index " + std::to_string(&Sec - &TableOrErr->front()) + "]"; // To make this helper be more convenient for error reporting purposes we // drop the error. But really it should never be triggered. Before this point, // our code should have called 'sections()' and reported a proper error on @@ -72,11 +72,11 @@ std::string getSecIndexForError(const ELFFile *Obj, } template -std::string getPhdrIndexForError(const ELFFile *Obj, - const typename ELFT::Phdr *Phdr) { - auto Headers = Obj->program_headers(); +std::string getPhdrIndexForError(const ELFFile &Obj, + const typename ELFT::Phdr &Phdr) { + auto Headers = Obj.program_headers(); if (Headers) - return ("[index " + Twine(Phdr - &Headers->front()) + "]").str(); + return ("[index " + Twine(&Phdr - &Headers->front()) + "]").str(); // See comment in the getSecIndexForError() above. 
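[Editor's note] Both ELF error helpers above now take `ELFFile` and header references instead of pointers, and the element index is recovered with pointer arithmetic on the address of the reference and the table's first element. The same idiom recurs throughout the rest of this patch. In miniature (a standalone sketch, not LLVM code):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Given a reference to an element of `table`, recover its index exactly as
// the patched getSecIndexForError/getPhdrIndexForError do: &Sec - &front().
template <typename T>
size_t indexOf(const std::vector<T> &table, const T &elem) {
  size_t idx = &elem - &table.front();
  assert(idx < table.size() && "reference must point into the table");
  return idx;
}

int main() {
  std::vector<int> sections{10, 20, 30, 40};
  return indexOf(sections, sections[2]) == 2 ? 0 : 1;
}
```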
llvm::consumeError(Headers.takeError()); return "[unknown index]"; @@ -134,17 +134,17 @@ class ELFFile { ELFFile(StringRef Object); public: - const Elf_Ehdr *getHeader() const { - return reinterpret_cast(base()); + const Elf_Ehdr &getHeader() const { + return *reinterpret_cast(base()); } template Expected getEntry(uint32_t Section, uint32_t Entry) const; template - Expected getEntry(const Elf_Shdr *Section, uint32_t Entry) const; + Expected getEntry(const Elf_Shdr &Section, uint32_t Entry) const; Expected - getStringTable(const Elf_Shdr *Section, + getStringTable(const Elf_Shdr &Section, WarningHandler WarnHandler = &defaultWarningHandler) const; Expected getStringTableForSymtab(const Elf_Shdr &Section) const; Expected getStringTableForSymtab(const Elf_Shdr &Section, @@ -163,18 +163,18 @@ class ELFFile { std::string getDynamicTagAsString(uint64_t Type) const; /// Get the symbol for a given relocation. - Expected getRelocationSymbol(const Elf_Rel *Rel, + Expected getRelocationSymbol(const Elf_Rel &Rel, const Elf_Shdr *SymTab) const; static Expected create(StringRef Object); bool isLE() const { - return getHeader()->getDataEncoding() == ELF::ELFDATA2LSB; + return getHeader().getDataEncoding() == ELF::ELFDATA2LSB; } bool isMipsELF64() const { - return getHeader()->e_machine == ELF::EM_MIPS && - getHeader()->getFileClass() == ELF::ELFCLASS64; + return getHeader().e_machine == ELF::EM_MIPS && + getHeader().getFileClass() == ELF::ELFCLASS64; } bool isMips64EL() const { return isMipsELF64() && isLE(); } @@ -188,43 +188,43 @@ class ELFFile { Expected symbols(const Elf_Shdr *Sec) const { if (!Sec) return makeArrayRef(nullptr, nullptr); - return getSectionContentsAsArray(Sec); + return getSectionContentsAsArray(*Sec); } - Expected relas(const Elf_Shdr *Sec) const { + Expected relas(const Elf_Shdr &Sec) const { return getSectionContentsAsArray(Sec); } - Expected rels(const Elf_Shdr *Sec) const { + Expected rels(const Elf_Shdr &Sec) const { return getSectionContentsAsArray(Sec); } - Expected relrs(const Elf_Shdr *Sec) const { + Expected relrs(const Elf_Shdr &Sec) const { return getSectionContentsAsArray(Sec); } std::vector decode_relrs(Elf_Relr_Range relrs) const; - Expected> android_relas(const Elf_Shdr *Sec) const; + Expected> android_relas(const Elf_Shdr &Sec) const; /// Iterate over program header table. 
Expected program_headers() const { - if (getHeader()->e_phnum && getHeader()->e_phentsize != sizeof(Elf_Phdr)) + if (getHeader().e_phnum && getHeader().e_phentsize != sizeof(Elf_Phdr)) return createError("invalid e_phentsize: " + - Twine(getHeader()->e_phentsize)); + Twine(getHeader().e_phentsize)); uint64_t HeadersSize = - (uint64_t)getHeader()->e_phnum * getHeader()->e_phentsize; - uint64_t PhOff = getHeader()->e_phoff; + (uint64_t)getHeader().e_phnum * getHeader().e_phentsize; + uint64_t PhOff = getHeader().e_phoff; if (PhOff + HeadersSize < PhOff || PhOff + HeadersSize > getBufSize()) return createError("program headers are longer than binary of size " + Twine(getBufSize()) + ": e_phoff = 0x" + - Twine::utohexstr(getHeader()->e_phoff) + - ", e_phnum = " + Twine(getHeader()->e_phnum) + - ", e_phentsize = " + Twine(getHeader()->e_phentsize)); + Twine::utohexstr(getHeader().e_phoff) + + ", e_phnum = " + Twine(getHeader().e_phnum) + + ", e_phentsize = " + Twine(getHeader().e_phentsize)); auto *Begin = reinterpret_cast(base() + PhOff); - return makeArrayRef(Begin, Begin + getHeader()->e_phnum); + return makeArrayRef(Begin, Begin + getHeader().e_phnum); } /// Get an iterator over notes in a program header. @@ -257,7 +257,7 @@ class ELFFile { assert(Shdr.sh_type == ELF::SHT_NOTE && "Shdr is not of type SHT_NOTE"); ErrorAsOutParameter ErrAsOutParam(&Err); if (Shdr.sh_offset + Shdr.sh_size > getBufSize()) { - Err = createError("SHT_NOTE section " + getSecIndexForError(this, &Shdr) + + Err = createError("SHT_NOTE section " + getSecIndexForError(*this, Shdr) + " has invalid offset (0x" + Twine::utohexstr(Shdr.sh_offset) + ") or size (0x" + Twine::utohexstr(Shdr.sh_size) + ")"); @@ -298,12 +298,12 @@ class ELFFile { Expected getSectionStringTable( Elf_Shdr_Range Sections, WarningHandler WarnHandler = &defaultWarningHandler) const; - Expected getSectionIndex(const Elf_Sym *Sym, Elf_Sym_Range Syms, + Expected getSectionIndex(const Elf_Sym &Sym, Elf_Sym_Range Syms, ArrayRef ShndxTable) const; - Expected getSection(const Elf_Sym *Sym, + Expected getSection(const Elf_Sym &Sym, const Elf_Shdr *SymTab, ArrayRef ShndxTable) const; - Expected getSection(const Elf_Sym *Sym, + Expected getSection(const Elf_Sym &Sym, Elf_Sym_Range Symtab, ArrayRef ShndxTable) const; Expected getSection(uint32_t Index) const; @@ -312,14 +312,14 @@ class ELFFile { uint32_t Index) const; Expected - getSectionName(const Elf_Shdr *Section, + getSectionName(const Elf_Shdr &Section, WarningHandler WarnHandler = &defaultWarningHandler) const; - Expected getSectionName(const Elf_Shdr *Section, + Expected getSectionName(const Elf_Shdr &Section, StringRef DotShstrtab) const; template - Expected> getSectionContentsAsArray(const Elf_Shdr *Sec) const; - Expected> getSectionContents(const Elf_Shdr *Sec) const; - Expected> getSegmentContents(const Elf_Phdr *Phdr) const; + Expected> getSectionContentsAsArray(const Elf_Shdr &Sec) const; + Expected> getSectionContents(const Elf_Shdr &Sec) const; + Expected> getSegmentContents(const Elf_Phdr &Phdr) const; }; using ELF32LEFile = ELFFile; @@ -337,11 +337,11 @@ getSection(typename ELFT::ShdrRange Sections, uint32_t Index) { template inline Expected -getExtendedSymbolTableIndex(const typename ELFT::Sym *Sym, - const typename ELFT::Sym *FirstSym, +getExtendedSymbolTableIndex(const typename ELFT::Sym &Sym, + const typename ELFT::Sym &FirstSym, ArrayRef ShndxTable) { - assert(Sym->st_shndx == ELF::SHN_XINDEX); - unsigned Index = Sym - FirstSym; + assert(Sym.st_shndx == ELF::SHN_XINDEX); + unsigned 
Index = &Sym - &FirstSym; if (Index >= ShndxTable.size()) return createError( "extended symbol index (" + Twine(Index) + @@ -354,12 +354,12 @@ getExtendedSymbolTableIndex(const typename ELFT::Sym *Sym, template Expected -ELFFile::getSectionIndex(const Elf_Sym *Sym, Elf_Sym_Range Syms, +ELFFile::getSectionIndex(const Elf_Sym &Sym, Elf_Sym_Range Syms, ArrayRef ShndxTable) const { - uint32_t Index = Sym->st_shndx; + uint32_t Index = Sym.st_shndx; if (Index == ELF::SHN_XINDEX) { - auto ErrorOrIndex = getExtendedSymbolTableIndex( - Sym, Syms.begin(), ShndxTable); + Expected ErrorOrIndex = + getExtendedSymbolTableIndex(Sym, *Syms.begin(), ShndxTable); if (!ErrorOrIndex) return ErrorOrIndex.takeError(); return *ErrorOrIndex; @@ -371,7 +371,7 @@ ELFFile::getSectionIndex(const Elf_Sym *Sym, Elf_Sym_Range Syms, template Expected -ELFFile::getSection(const Elf_Sym *Sym, const Elf_Shdr *SymTab, +ELFFile::getSection(const Elf_Sym &Sym, const Elf_Shdr *SymTab, ArrayRef ShndxTable) const { auto SymsOrErr = symbols(SymTab); if (!SymsOrErr) @@ -381,7 +381,7 @@ ELFFile::getSection(const Elf_Sym *Sym, const Elf_Shdr *SymTab, template Expected -ELFFile::getSection(const Elf_Sym *Sym, Elf_Sym_Range Symbols, +ELFFile::getSection(const Elf_Sym &Sym, Elf_Sym_Range Symbols, ArrayRef ShndxTable) const { auto IndexOrErr = getSectionIndex(Sym, Symbols, ShndxTable); if (!IndexOrErr) @@ -402,7 +402,7 @@ ELFFile::getSymbol(const Elf_Shdr *Sec, uint32_t Index) const { Elf_Sym_Range Symbols = *SymsOrErr; if (Index >= Symbols.size()) return createError("unable to get symbol from section " + - getSecIndexForError(this, Sec) + + getSecIndexForError(*this, *Sec) + ": invalid symbol index (" + Twine(Index) + ")"); return &Symbols[Index]; } @@ -410,26 +410,26 @@ ELFFile::getSymbol(const Elf_Shdr *Sec, uint32_t Index) const { template template Expected> -ELFFile::getSectionContentsAsArray(const Elf_Shdr *Sec) const { - if (Sec->sh_entsize != sizeof(T) && sizeof(T) != 1) - return createError("section " + getSecIndexForError(this, Sec) + - " has an invalid sh_entsize: " + Twine(Sec->sh_entsize)); +ELFFile::getSectionContentsAsArray(const Elf_Shdr &Sec) const { + if (Sec.sh_entsize != sizeof(T) && sizeof(T) != 1) + return createError("section " + getSecIndexForError(*this, Sec) + + " has an invalid sh_entsize: " + Twine(Sec.sh_entsize)); - uintX_t Offset = Sec->sh_offset; - uintX_t Size = Sec->sh_size; + uintX_t Offset = Sec.sh_offset; + uintX_t Size = Sec.sh_size; if (Size % sizeof(T)) - return createError("section " + getSecIndexForError(this, Sec) + + return createError("section " + getSecIndexForError(*this, Sec) + " has an invalid sh_size (" + Twine(Size) + ") which is not a multiple of its sh_entsize (" + - Twine(Sec->sh_entsize) + ")"); + Twine(Sec.sh_entsize) + ")"); if (std::numeric_limits::max() - Offset < Size) - return createError("section " + getSecIndexForError(this, Sec) + + return createError("section " + getSecIndexForError(*this, Sec) + " has a sh_offset (0x" + Twine::utohexstr(Offset) + ") + sh_size (0x" + Twine::utohexstr(Size) + ") that cannot be represented"); if (Offset + Size > Buf.size()) - return createError("section " + getSecIndexForError(this, Sec) + + return createError("section " + getSecIndexForError(*this, Sec) + " has a sh_offset (0x" + Twine::utohexstr(Offset) + ") + sh_size (0x" + Twine::utohexstr(Size) + ") that is greater than the file size (0x" + @@ -445,17 +445,17 @@ ELFFile::getSectionContentsAsArray(const Elf_Shdr *Sec) const { template Expected> -ELFFile::getSegmentContents(const Elf_Phdr 
*Phdr) const { - uintX_t Offset = Phdr->p_offset; - uintX_t Size = Phdr->p_filesz; +ELFFile::getSegmentContents(const Elf_Phdr &Phdr) const { + uintX_t Offset = Phdr.p_offset; + uintX_t Size = Phdr.p_filesz; if (std::numeric_limits::max() - Offset < Size) - return createError("program header " + getPhdrIndexForError(this, Phdr) + + return createError("program header " + getPhdrIndexForError(*this, Phdr) + " has a p_offset (0x" + Twine::utohexstr(Offset) + ") + p_filesz (0x" + Twine::utohexstr(Size) + ") that cannot be represented"); if (Offset + Size > Buf.size()) - return createError("program header " + getPhdrIndexForError(this, Phdr) + + return createError("program header " + getPhdrIndexForError(*this, Phdr) + " has a p_offset (0x" + Twine::utohexstr(Offset) + ") + p_filesz (0x" + Twine::utohexstr(Size) + ") that is greater than the file size (0x" + @@ -465,13 +465,13 @@ ELFFile::getSegmentContents(const Elf_Phdr *Phdr) const { template Expected> -ELFFile::getSectionContents(const Elf_Shdr *Sec) const { +ELFFile::getSectionContents(const Elf_Shdr &Sec) const { return getSectionContentsAsArray(Sec); } template StringRef ELFFile::getRelocationTypeName(uint32_t Type) const { - return getELFRelocationTypeName(getHeader()->e_machine, Type); + return getELFRelocationTypeName(getHeader().e_machine, Type); } template @@ -507,24 +507,24 @@ void ELFFile::getRelocationTypeName(uint32_t Type, template uint32_t ELFFile::getRelativeRelocationType() const { - return getELFRelativeRelocationType(getHeader()->e_machine); + return getELFRelativeRelocationType(getHeader().e_machine); } template Expected -ELFFile::getRelocationSymbol(const Elf_Rel *Rel, +ELFFile::getRelocationSymbol(const Elf_Rel &Rel, const Elf_Shdr *SymTab) const { - uint32_t Index = Rel->getSymbol(isMips64EL()); + uint32_t Index = Rel.getSymbol(isMips64EL()); if (Index == 0) return nullptr; - return getEntry(SymTab, Index); + return getEntry(*SymTab, Index); } template Expected ELFFile::getSectionStringTable(Elf_Shdr_Range Sections, WarningHandler WarnHandler) const { - uint32_t Index = getHeader()->e_shstrndx; + uint32_t Index = getHeader().e_shstrndx; if (Index == ELF::SHN_XINDEX) { // If the section name string table section index is greater than // or equal to SHN_LORESERVE, then the actual index of the section name @@ -542,7 +542,7 @@ ELFFile::getSectionStringTable(Elf_Shdr_Range Sections, if (Index >= Sections.size()) return createError("section header string table index " + Twine(Index) + " does not exist"); - return getStringTable(&Sections[Index], WarnHandler); + return getStringTable(Sections[Index], WarnHandler); } template ELFFile::ELFFile(StringRef Object) : Buf(Object) {} @@ -558,13 +558,13 @@ Expected> ELFFile::create(StringRef Object) { template Expected ELFFile::sections() const { - const uintX_t SectionTableOffset = getHeader()->e_shoff; + const uintX_t SectionTableOffset = getHeader().e_shoff; if (SectionTableOffset == 0) return ArrayRef(); - if (getHeader()->e_shentsize != sizeof(Elf_Shdr)) + if (getHeader().e_shentsize != sizeof(Elf_Shdr)) return createError("invalid e_shentsize in ELF header: " + - Twine(getHeader()->e_shentsize)); + Twine(getHeader().e_shentsize)); const uint64_t FileSize = Buf.size(); if (SectionTableOffset + sizeof(Elf_Shdr) > FileSize || @@ -581,7 +581,7 @@ Expected ELFFile::sections() const { const Elf_Shdr *First = reinterpret_cast(base() + SectionTableOffset); - uintX_t NumSections = getHeader()->e_shnum; + uintX_t NumSections = getHeader().e_shnum; if (NumSections == 0) NumSections = 
First->sh_size; @@ -612,21 +612,21 @@ Expected ELFFile::getEntry(uint32_t Section, auto SecOrErr = getSection(Section); if (!SecOrErr) return SecOrErr.takeError(); - return getEntry(*SecOrErr, Entry); + return getEntry(**SecOrErr, Entry); } template template -Expected ELFFile::getEntry(const Elf_Shdr *Section, +Expected ELFFile::getEntry(const Elf_Shdr &Section, uint32_t Entry) const { - if (sizeof(T) != Section->sh_entsize) - return createError("section " + getSecIndexForError(this, Section) + + if (sizeof(T) != Section.sh_entsize) + return createError("section " + getSecIndexForError(*this, Section) + " has invalid sh_entsize: expected " + Twine(sizeof(T)) + - ", but got " + Twine(Section->sh_entsize)); - uint64_t Pos = Section->sh_offset + (uint64_t)Entry * sizeof(T); + ", but got " + Twine(Section.sh_entsize)); + uint64_t Pos = Section.sh_offset + (uint64_t)Entry * sizeof(T); if (Pos + sizeof(T) > Buf.size()) return createError("unable to access section " + - getSecIndexForError(this, Section) + " data at 0x" + + getSecIndexForError(*this, Section) + " data at 0x" + Twine::utohexstr(Pos) + ": offset goes past the end of file"); return reinterpret_cast(base() + Pos); @@ -643,14 +643,14 @@ ELFFile::getSection(uint32_t Index) const { template Expected -ELFFile::getStringTable(const Elf_Shdr *Section, +ELFFile::getStringTable(const Elf_Shdr &Section, WarningHandler WarnHandler) const { - if (Section->sh_type != ELF::SHT_STRTAB) + if (Section.sh_type != ELF::SHT_STRTAB) if (Error E = WarnHandler("invalid sh_type for string table section " + - getSecIndexForError(this, Section) + + getSecIndexForError(*this, Section) + ": expected SHT_STRTAB, but got " + object::getELFSectionTypeName( - getHeader()->e_machine, Section->sh_type))) + getHeader().e_machine, Section.sh_type))) return std::move(E); auto V = getSectionContentsAsArray(Section); @@ -659,10 +659,10 @@ ELFFile::getStringTable(const Elf_Shdr *Section, ArrayRef Data = *V; if (Data.empty()) return createError("SHT_STRTAB string table section " + - getSecIndexForError(this, Section) + " is empty"); + getSecIndexForError(*this, Section) + " is empty"); if (Data.back() != '\0') return createError("SHT_STRTAB string table section " + - getSecIndexForError(this, Section) + + getSecIndexForError(*this, Section) + " is non-null terminated"); return StringRef(Data.begin(), Data.size()); } @@ -681,7 +681,7 @@ Expected> ELFFile::getSHNDXTable(const Elf_Shdr &Section, Elf_Shdr_Range Sections) const { assert(Section.sh_type == ELF::SHT_SYMTAB_SHNDX); - auto VOrErr = getSectionContentsAsArray(&Section); + auto VOrErr = getSectionContentsAsArray(Section); if (!VOrErr) return VOrErr.takeError(); ArrayRef V = *VOrErr; @@ -691,10 +691,10 @@ ELFFile::getSHNDXTable(const Elf_Shdr &Section, const Elf_Shdr &SymTable = **SymTableOrErr; if (SymTable.sh_type != ELF::SHT_SYMTAB && SymTable.sh_type != ELF::SHT_DYNSYM) - return createError("SHT_SYMTAB_SHNDX section is linked with " + - object::getELFSectionTypeName(getHeader()->e_machine, - SymTable.sh_type) + - " section (expected SHT_SYMTAB/SHT_DYNSYM)"); + return createError( + "SHT_SYMTAB_SHNDX section is linked with " + + object::getELFSectionTypeName(getHeader().e_machine, SymTable.sh_type) + + " section (expected SHT_SYMTAB/SHT_DYNSYM)"); uint64_t Syms = SymTable.sh_size / sizeof(Elf_Sym); if (V.size() != Syms) @@ -722,15 +722,16 @@ ELFFile::getStringTableForSymtab(const Elf_Shdr &Sec, if (Sec.sh_type != ELF::SHT_SYMTAB && Sec.sh_type != ELF::SHT_DYNSYM) return createError( "invalid sh_type for symbol 
table, expected SHT_SYMTAB or SHT_DYNSYM"); - auto SectionOrErr = object::getSection(Sections, Sec.sh_link); + Expected SectionOrErr = + object::getSection(Sections, Sec.sh_link); if (!SectionOrErr) return SectionOrErr.takeError(); - return getStringTable(*SectionOrErr); + return getStringTable(**SectionOrErr); } template Expected -ELFFile::getSectionName(const Elf_Shdr *Section, +ELFFile::getSectionName(const Elf_Shdr &Section, WarningHandler WarnHandler) const { auto SectionsOrErr = sections(); if (!SectionsOrErr) @@ -742,13 +743,13 @@ ELFFile::getSectionName(const Elf_Shdr *Section, } template -Expected ELFFile::getSectionName(const Elf_Shdr *Section, +Expected ELFFile::getSectionName(const Elf_Shdr &Section, StringRef DotShstrtab) const { - uint32_t Offset = Section->sh_name; + uint32_t Offset = Section.sh_name; if (Offset == 0) return StringRef(); if (Offset >= DotShstrtab.size()) - return createError("a section " + getSecIndexForError(this, Section) + + return createError("a section " + getSecIndexForError(*this, Section) + " has an invalid sh_name (0x" + Twine::utohexstr(Offset) + ") offset which goes past the end of the " diff --git a/llvm/include/llvm/Object/ELFObjectFile.h b/llvm/include/llvm/Object/ELFObjectFile.h index 74d4745c1034f..5c12231331be8 100644 --- a/llvm/include/llvm/Object/ELFObjectFile.h +++ b/llvm/include/llvm/Object/ELFObjectFile.h @@ -377,7 +377,7 @@ template class ELFObjectFile : public ELFObjectFileBase { for (const Elf_Shdr &Sec : *SectionsOrErr) { if (Sec.sh_type == ELF::SHT_ARM_ATTRIBUTES || Sec.sh_type == ELF::SHT_RISCV_ATTRIBUTES) { - auto ErrorOrContents = EF.getSectionContents(&Sec); + auto ErrorOrContents = EF.getSectionContents(Sec); if (!ErrorOrContents) return ErrorOrContents.takeError(); @@ -432,7 +432,7 @@ template class ELFObjectFile : public ELFObjectFileBase { Triple::ArchType getArch() const override; Expected getStartAddress() const override; - unsigned getPlatformFlags() const override { return EF.getHeader()->e_flags; } + unsigned getPlatformFlags() const override { return EF.getHeader().e_flags; } const ELFFile *getELFFile() const { return &EF; } @@ -468,7 +468,7 @@ Expected ELFObjectFile::getSymbolName(DataRefImpl Sym) const { if (!StrTabOrErr) return StrTabOrErr.takeError(); const Elf_Shdr *StringTableSec = *StrTabOrErr; - auto SymStrTabOrErr = EF.getStringTable(StringTableSec); + auto SymStrTabOrErr = EF.getStringTable(*StringTableSec); if (!SymStrTabOrErr) return SymStrTabOrErr.takeError(); Expected Name = ESym->getName(*SymStrTabOrErr); @@ -507,9 +507,9 @@ uint64_t ELFObjectFile::getSymbolValueImpl(DataRefImpl Symb) const { if (ESym->st_shndx == ELF::SHN_ABS) return Ret; - const Elf_Ehdr *Header = EF.getHeader(); + const Elf_Ehdr &Header = EF.getHeader(); // Clear the ARM/Thumb or microMIPS indicator flag. 
- if ((Header->e_machine == ELF::EM_ARM || Header->e_machine == ELF::EM_MIPS) && + if ((Header.e_machine == ELF::EM_ARM || Header.e_machine == ELF::EM_MIPS) && ESym->getType() == ELF::STT_FUNC) Ret &= ~1; @@ -533,14 +533,13 @@ ELFObjectFile::getSymbolAddress(DataRefImpl Symb) const { return Result; } - const Elf_Ehdr *Header = EF.getHeader(); auto SymTabOrErr = EF.getSection(Symb.d.a); if (!SymTabOrErr) return SymTabOrErr.takeError(); - const Elf_Shdr *SymTab = *SymTabOrErr; - if (Header->e_type == ELF::ET_REL) { - auto SectionOrErr = EF.getSection(ESym, SymTab, ShndxTable); + if (EF.getHeader().e_type == ELF::ET_REL) { + Expected SectionOrErr = + EF.getSection(*ESym, *SymTabOrErr, ShndxTable); if (!SectionOrErr) return SectionOrErr.takeError(); const Elf_Shdr *Section = *SectionOrErr; @@ -561,11 +560,11 @@ uint32_t ELFObjectFile::getSymbolAlignment(DataRefImpl Symb) const { template uint16_t ELFObjectFile::getEMachine() const { - return EF.getHeader()->e_machine; + return EF.getHeader().e_machine; } template uint16_t ELFObjectFile::getEType() const { - return EF.getHeader()->e_type; + return EF.getHeader().e_type; } template @@ -652,7 +651,7 @@ Expected ELFObjectFile::getSymbolFlags(DataRefImpl Sym) const { // TODO: Test this error. return SymbolsOrErr.takeError(); - if (EF.getHeader()->e_machine == ELF::EM_ARM) { + if (EF.getHeader().e_machine == ELF::EM_ARM) { if (Expected NameOrErr = getSymbolName(Sym)) { StringRef Name = *NameOrErr; if (Name.startswith("$d") || Name.startswith("$t") || @@ -685,7 +684,7 @@ template Expected ELFObjectFile::getSymbolSection(const Elf_Sym *ESym, const Elf_Shdr *SymTab) const { - auto ESecOrErr = EF.getSection(ESym, SymTab, ShndxTable); + auto ESecOrErr = EF.getSection(*ESym, SymTab, ShndxTable); if (!ESecOrErr) return ESecOrErr.takeError(); @@ -717,7 +716,7 @@ void ELFObjectFile::moveSectionNext(DataRefImpl &Sec) const { template Expected ELFObjectFile::getSectionName(DataRefImpl Sec) const { - return EF.getSectionName(&*getSection(Sec)); + return EF.getSectionName(*getSection(Sec)); } template @@ -847,7 +846,7 @@ ELFObjectFile::section_rel_begin(DataRefImpl Sec) const { if (!SectionsOrErr) return relocation_iterator(RelocationRef()); uintptr_t SHT = reinterpret_cast((*SectionsOrErr).begin()); - RelData.d.a = (Sec.p - SHT) / EF.getHeader()->e_shentsize; + RelData.d.a = (Sec.p - SHT) / EF.getHeader().e_shentsize; RelData.d.b = 0; return relocation_iterator(RelocationRef(RelData, this)); } @@ -874,7 +873,7 @@ ELFObjectFile::section_rel_end(DataRefImpl Sec) const { template Expected ELFObjectFile::getRelocatedSection(DataRefImpl Sec) const { - if (EF.getHeader()->e_type != ELF::ET_REL) + if (EF.getHeader().e_type != ELF::ET_REL) return section_end(); const Elf_Shdr *EShdr = getSection(Sec); @@ -933,7 +932,7 @@ uint64_t ELFObjectFile::getRelocationType(DataRefImpl Rel) const { template StringRef ELFObjectFile::getRelocationTypeName(uint32_t Type) const { - return getELFRelocationTypeName(EF.getHeader()->e_machine, Type); + return getELFRelocationTypeName(EF.getHeader().e_machine, Type); } template @@ -1087,9 +1086,9 @@ uint8_t ELFObjectFile::getBytesInAddress() const { template StringRef ELFObjectFile::getFileFormatName() const { bool IsLittleEndian = ELFT::TargetEndianness == support::little; - switch (EF.getHeader()->e_ident[ELF::EI_CLASS]) { + switch (EF.getHeader().e_ident[ELF::EI_CLASS]) { case ELF::ELFCLASS32: - switch (EF.getHeader()->e_machine) { + switch (EF.getHeader().e_machine) { case ELF::EM_386: return "elf32-i386"; case ELF::EM_IAMCU: @@ 
-1123,7 +1122,7 @@ StringRef ELFObjectFile::getFileFormatName() const { return "elf32-unknown"; } case ELF::ELFCLASS64: - switch (EF.getHeader()->e_machine) { + switch (EF.getHeader().e_machine) { case ELF::EM_386: return "elf64-i386"; case ELF::EM_X86_64: @@ -1157,7 +1156,7 @@ StringRef ELFObjectFile::getFileFormatName() const { template Triple::ArchType ELFObjectFile::getArch() const { bool IsLittleEndian = ELFT::TargetEndianness == support::little; - switch (EF.getHeader()->e_machine) { + switch (EF.getHeader().e_machine) { case ELF::EM_386: case ELF::EM_IAMCU: return Triple::x86; @@ -1174,7 +1173,7 @@ template Triple::ArchType ELFObjectFile::getArch() const { case ELF::EM_LANAI: return Triple::lanai; case ELF::EM_MIPS: - switch (EF.getHeader()->e_ident[ELF::EI_CLASS]) { + switch (EF.getHeader().e_ident[ELF::EI_CLASS]) { case ELF::ELFCLASS32: return IsLittleEndian ? Triple::mipsel : Triple::mips; case ELF::ELFCLASS64: @@ -1189,7 +1188,7 @@ template Triple::ArchType ELFObjectFile::getArch() const { case ELF::EM_PPC64: return IsLittleEndian ? Triple::ppc64le : Triple::ppc64; case ELF::EM_RISCV: - switch (EF.getHeader()->e_ident[ELF::EI_CLASS]) { + switch (EF.getHeader().e_ident[ELF::EI_CLASS]) { case ELF::ELFCLASS32: return Triple::riscv32; case ELF::ELFCLASS64: @@ -1210,7 +1209,7 @@ template Triple::ArchType ELFObjectFile::getArch() const { if (!IsLittleEndian) return Triple::UnknownArch; - unsigned MACH = EF.getHeader()->e_flags & ELF::EF_AMDGPU_MACH; + unsigned MACH = EF.getHeader().e_flags & ELF::EF_AMDGPU_MACH; if (MACH >= ELF::EF_AMDGPU_MACH_R600_FIRST && MACH <= ELF::EF_AMDGPU_MACH_R600_LAST) return Triple::r600; @@ -1235,7 +1234,7 @@ template Triple::ArchType ELFObjectFile::getArch() const { template Expected ELFObjectFile::getStartAddress() const { - return EF.getHeader()->e_entry; + return EF.getHeader().e_entry; } template @@ -1245,7 +1244,7 @@ ELFObjectFile::getDynamicSymbolIterators() const { } template bool ELFObjectFile::isRelocatableObject() const { - return EF.getHeader()->e_type == ELF::ET_REL; + return EF.getHeader().e_type == ELF::ET_REL; } } // end namespace object diff --git a/llvm/include/llvm/ObjectYAML/DWARFYAML.h b/llvm/include/llvm/ObjectYAML/DWARFYAML.h index 99a7af87d2c78..3e5be41b8fa3b 100644 --- a/llvm/include/llvm/ObjectYAML/DWARFYAML.h +++ b/llvm/include/llvm/ObjectYAML/DWARFYAML.h @@ -214,7 +214,7 @@ struct Data { Optional> DebugStrings; Optional> DebugStrOffsets; Optional> DebugAranges; - std::vector DebugRanges; + Optional> DebugRanges; Optional> DebugAddr; Optional PubNames; Optional PubTypes; diff --git a/llvm/include/llvm/Option/OptTable.h b/llvm/include/llvm/Option/OptTable.h index 1aabff0fd6591..58c09b23d237c 100644 --- a/llvm/include/llvm/Option/OptTable.h +++ b/llvm/include/llvm/Option/OptTable.h @@ -50,7 +50,7 @@ class OptTable { unsigned ID; unsigned char Kind; unsigned char Param; - unsigned short Flags; + unsigned int Flags; unsigned short GroupID; unsigned short AliasID; const char *AliasArgs; @@ -152,7 +152,7 @@ class OptTable { /// /// \return The vector of flags which start with Cur. std::vector findByPrefix(StringRef Cur, - unsigned short DisableFlags) const; + unsigned int DisableFlags) const; /// Find the OptTable option that most closely matches the given string. 
/// diff --git a/llvm/include/llvm/Passes/StandardInstrumentations.h b/llvm/include/llvm/Passes/StandardInstrumentations.h index 795e2770bbe18..76e217c899745 100644 --- a/llvm/include/llvm/Passes/StandardInstrumentations.h +++ b/llvm/include/llvm/Passes/StandardInstrumentations.h @@ -17,8 +17,11 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/PassInstrumentation.h" #include "llvm/IR/PassTimingInfo.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Support/CommandLine.h" #include <string> #include <utility> @@ -26,6 +29,7 @@ namespace llvm { class Module; +class Function; /// Instrumentation to print IR before/after passes. /// @@ -73,6 +77,53 @@ class PrintPassInstrumentation { bool DebugLogging; }; +class PreservedCFGCheckerInstrumentation { +private: + // CFG is a map BB -> {(Succ, Multiplicity)}, where BB is a non-leaf basic + // block and {(Succ, Multiplicity)} is the set of all pairs of the block's + // successors and the multiplicity of the edge (BB->Succ). As the mapped sets + // are unordered, the order of successors is not tracked by the CFG; in other + // words, a pass may swap basic block successors without reporting a CFG + // change. The CFG can be guarded by basic block tracking pointers in the + // Graph (BBGuard). That is, if any of the blocks is deleted or RAUWed, the + // CFG is treated as poisoned and no block pointer of the Graph is used. + struct CFG { + struct BBGuard final : public CallbackVH { + BBGuard(const BasicBlock *BB) : CallbackVH(BB) {} + void deleted() override { CallbackVH::deleted(); } + void allUsesReplacedWith(Value *) override { CallbackVH::deleted(); } + bool isPoisoned() const { return !getValPtr(); } + }; + + Optional<DenseMap<intptr_t, BBGuard>> BBGuards; + DenseMap<const BasicBlock *, DenseMap<const BasicBlock *, unsigned>> Graph; + + CFG(const Function *F, bool TrackBBLifetime = false); + + bool operator==(const CFG &G) const { + return !isPoisoned() && !G.isPoisoned() && Graph == G.Graph; + } + + bool isPoisoned() const { + if (BBGuards) + for (auto &BB : *BBGuards) { + if (BB.second.isPoisoned()) + return true; + } + return false; + } + + static void printDiff(raw_ostream &out, const CFG &Before, + const CFG &After); + }; + + SmallVector<std::pair<StringRef, Optional<CFG>>, 8> GraphStackBefore; + +public: + static cl::opt<bool> VerifyPreservedCFG; + void registerCallbacks(PassInstrumentationCallbacks &PIC); +}; + /// This class provides an interface to register all the standard pass /// instrumentations and manages their state (if any). class StandardInstrumentations { @@ -80,6 +131,7 @@ class StandardInstrumentations { PrintPassInstrumentation PrintPass; TimePassesHandler TimePasses; OptNoneInstrumentation OptNone; + PreservedCFGCheckerInstrumentation PreservedCFGChecker; public: StandardInstrumentations(bool DebugLogging) : PrintPass(DebugLogging) {} diff --git a/llvm/include/llvm/ProfileData/GCOV.h b/llvm/include/llvm/ProfileData/GCOV.h index 7b9ba4410b654..2766ff52e4a09 100644 --- a/llvm/include/llvm/ProfileData/GCOV.h +++ b/llvm/include/llvm/ProfileData/GCOV.h @@ -15,6 +15,7 @@ #define LLVM_PROFILEDATA_GCOV_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" @@ -38,7 +39,6 @@ namespace llvm { class GCOVFunction; class GCOVBlock; -class FileInfo; namespace GCOV { @@ -47,10 +47,11 @@ enum GCOVVersion { V304, V407, V408, V800, V900 }; /// A struct for passing gcov options between functions. 
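/// A construction sketch that matches the updated parameter order below; the
/// flag values are hypothetical and only mirror llvm-cov gcov's options:
///   GCOV::Options Opts(/*AllBlocks=*/true, /*BranchInfo=*/true,
///                      /*BranchCount=*/false, /*FuncCoverage=*/false,
///                      /*PreservePaths=*/false, /*UncondBranch=*/false,
///                      /*Intermediate=*/false, /*LongFileNames=*/false,
///                      /*Demangle=*/true, /*NoOutput=*/false,
///                      /*RelativeOnly=*/false, /*UseStdout=*/false,
///                      /*HashFilenames=*/false, /*SourcePrefix=*/"");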
struct Options { Options(bool A, bool B, bool C, bool F, bool P, bool U, bool I, bool L, - bool N, bool T, bool X) + bool M, bool N, bool R, bool T, bool X, std::string SourcePrefix) : AllBlocks(A), BranchInfo(B), BranchCount(C), FuncCoverage(F), PreservePaths(P), UncondBranch(U), Intermediate(I), LongFileNames(L), - NoOutput(N), UseStdout(T), HashFilenames(X) {} + Demangle(M), NoOutput(N), RelativeOnly(R), UseStdout(T), + HashFilenames(X), SourcePrefix(std::move(SourcePrefix)) {} bool AllBlocks; bool BranchInfo; @@ -60,9 +61,12 @@ struct Options { bool UncondBranch; bool Intermediate; bool LongFileNames; + bool Demangle; bool NoOutput; + bool RelativeOnly; bool UseStdout; bool HashFilenames; + std::string SourcePrefix; }; } // end namespace GCOV @@ -187,39 +191,38 @@ class GCOVFile { bool readGCNO(GCOVBuffer &Buffer); bool readGCDA(GCOVBuffer &Buffer); GCOV::GCOVVersion getVersion() const { return Version; } - uint32_t getChecksum() const { return Checksum; } void print(raw_ostream &OS) const; void dump() const; - void collectLineCounts(FileInfo &FI); std::vector filenames; StringMap filenameToIdx; -private: +public: bool GCNOInitialized = false; GCOV::GCOVVersion Version; uint32_t Checksum = 0; StringRef cwd; - SmallVector, 16> Functions; + SmallVector, 16> functions; std::map IdentToFunction; uint32_t RunCount = 0; uint32_t ProgramCount = 0; using iterator = pointee_iterator< SmallVectorImpl>::const_iterator>; - iterator begin() const { return iterator(Functions.begin()); } - iterator end() const { return iterator(Functions.end()); } + iterator begin() const { return iterator(functions.begin()); } + iterator end() const { return iterator(functions.end()); } }; struct GCOVArc { - GCOVArc(GCOVBlock &src, GCOVBlock &dst, bool fallthrough) - : src(src), dst(dst), fallthrough(fallthrough) {} + GCOVArc(GCOVBlock &src, GCOVBlock &dst, uint32_t flags) + : src(src), dst(dst), flags(flags) {} + bool onTree() const; GCOVBlock &src; GCOVBlock &dst; - bool fallthrough; - uint64_t Count = 0; - uint64_t CyclesCount = 0; + uint32_t flags; + uint64_t count = 0; + uint64_t cycleCount = 0; }; /// GCOVFunction - Collects function information. @@ -230,21 +233,18 @@ class GCOVFunction { GCOVFunction(GCOVFile &file) : file(file) {} - StringRef getName() const { return Name; } + StringRef getName(bool demangle) const; StringRef getFilename() const; - size_t getNumBlocks() const { return Blocks.size(); } uint64_t getEntryCount() const; - uint64_t getExitCount() const; + GCOVBlock &getExitBlock() const; - BlockIterator block_begin() const { return Blocks.begin(); } - BlockIterator block_end() const { return Blocks.end(); } - iterator_range blocks() const { - return make_range(block_begin(), block_end()); + iterator_range blocksRange() const { + return make_range(blocks.begin(), blocks.end()); } + uint64_t propagateCounts(const GCOVBlock &v, GCOVArc *pred); void print(raw_ostream &OS) const; void dump() const; - void collectLineCounts(FileInfo &FI); GCOVFile &file; uint32_t ident = 0; @@ -256,40 +256,31 @@ class GCOVFunction { uint32_t endColumn = 0; uint8_t artificial = 0; StringRef Name; + mutable SmallString<0> demangled; unsigned srcIdx; - SmallVector, 0> Blocks; + SmallVector, 0> blocks; SmallVector, 0> arcs, treeArcs; + DenseSet visited; }; /// GCOVBlock - Collects block information. 
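/// Counts attach to arcs, and block counts follow from flow conservation; a
/// sketch of the invariant that propagateCounts above reconstructs (inferred
/// from the interfaces, not stated in this patch) is
///   count(v) == sum(count(a) for a in pred) == sum(count(a) for a in succ),
/// with .gcda supplying counts only for arcs that are not on the spanning
/// tree (GCOVArc::onTree).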
class GCOVBlock { - struct EdgeWeight { - EdgeWeight(GCOVBlock *D) : Dst(D) {} - - GCOVBlock *Dst; - uint64_t Count = 0; - }; - public: using EdgeIterator = SmallVectorImpl::const_iterator; - using BlockVector = SmallVector; + using BlockVector = SmallVector; using BlockVectorLists = SmallVector; using Edges = SmallVector; - GCOVBlock(GCOVFunction &P, uint32_t N) : Parent(P), Number(N) {} + GCOVBlock(uint32_t N) : number(N) {} - const GCOVFunction &getParent() const { return Parent; } - void addLine(uint32_t N) { Lines.push_back(N); } - uint32_t getLastLine() const { return Lines.back(); } - uint64_t getCount() const { return Counter; } + void addLine(uint32_t N) { lines.push_back(N); } + uint32_t getLastLine() const { return lines.back(); } + uint64_t getCount() const { return count; } void addSrcEdge(GCOVArc *Edge) { pred.push_back(Edge); } void addDstEdge(GCOVArc *Edge) { succ.push_back(Edge); } - size_t getNumSrcEdges() const { return pred.size(); } - size_t getNumDstEdges() const { return succ.size(); } - iterator_range srcs() const { return make_range(pred.begin(), pred.end()); } @@ -300,7 +291,6 @@ class GCOVBlock { void print(raw_ostream &OS) const; void dump() const; - void collectLineCounts(FileInfo &FI); static uint64_t getCycleCount(const Edges &Path); static void unblock(const GCOVBlock *U, BlockVector &Blocked, @@ -313,103 +303,15 @@ class GCOVBlock { static uint64_t getLineCount(const BlockVector &Blocks); public: - GCOVFunction &Parent; - uint32_t Number; - uint64_t Counter = 0; + uint32_t number; + uint64_t count = 0; SmallVector pred; SmallVector succ; - SmallVector Lines; -}; - -struct GCOVCoverage { - GCOVCoverage() = default; - GCOVCoverage(StringRef Name) : Name(Name) {} - - StringRef Name; - - uint32_t LogicalLines = 0; - uint32_t LinesExec = 0; - - uint32_t Branches = 0; - uint32_t BranchesExec = 0; - uint32_t BranchesTaken = 0; -}; - -struct SourceInfo { - StringRef filename; - std::string name; - std::vector functions; - GCOVCoverage coverage; - SourceInfo(StringRef filename) : filename(filename) {} + SmallVector lines; }; -class FileInfo { -protected: - // It is unlikely--but possible--for multiple functions to be on the same - // line. - // Therefore this typedef allows LineData.Functions to store multiple - // functions - // per instance. This is rare, however, so optimize for the common case. 
- using FunctionVector = SmallVector; - using FunctionLines = DenseMap; - using BlockVector = SmallVector; - using BlockLines = DenseMap; - - struct LineData { - LineData() = default; - - BlockLines Blocks; - FunctionLines Functions; - uint32_t LastLine = 0; - }; - -public: - friend class GCOVFile; - FileInfo(const GCOV::Options &Options) : Options(Options) {} - - void addBlockLine(StringRef Filename, uint32_t Line, const GCOVBlock *Block) { - if (Line > LineInfo[Filename].LastLine) - LineInfo[Filename].LastLine = Line; - LineInfo[Filename].Blocks[Line - 1].push_back(Block); - } - - void addFunctionLine(StringRef Filename, uint32_t Line, - const GCOVFunction *Function) { - if (Line > LineInfo[Filename].LastLine) - LineInfo[Filename].LastLine = Line; - LineInfo[Filename].Functions[Line - 1].push_back(Function); - } - - void setRunCount(uint32_t Runs) { RunCount = Runs; } - void setProgramCount(uint32_t Programs) { ProgramCount = Programs; } - void print(raw_ostream &OS, StringRef MainFilename, StringRef GCNOFile, - StringRef GCDAFile, GCOVFile &file); - -protected: - std::string getCoveragePath(StringRef Filename, StringRef MainFilename); - std::unique_ptr openCoveragePath(StringRef CoveragePath); - void printFunctionSummary(raw_ostream &OS, const FunctionVector &Funcs) const; - void printBlockInfo(raw_ostream &OS, const GCOVBlock &Block, - uint32_t LineIndex, uint32_t &BlockNo) const; - void printBranchInfo(raw_ostream &OS, const GCOVBlock &Block, - GCOVCoverage &Coverage, uint32_t &EdgeNo); - void printUncondBranchInfo(raw_ostream &OS, uint32_t &EdgeNo, - uint64_t Count) const; - - void printCoverage(raw_ostream &OS, const GCOVCoverage &Coverage) const; - void printFuncCoverage(raw_ostream &OS) const; - void printFileCoverage(raw_ostream &OS) const; - - const GCOV::Options &Options; - StringMap LineInfo; - uint32_t RunCount = 0; - uint32_t ProgramCount = 0; - - using FuncCoverageMap = MapVector; - - FuncCoverageMap FuncCoverages; - std::vector sources; -}; +void gcovOneInput(const GCOV::Options &options, StringRef filename, + StringRef gcno, StringRef gcda, GCOVFile &file); } // end namespace llvm diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h index aca941b2da15a..3707f980ccca0 100644 --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -37,8 +37,6 @@ namespace llvm { -class raw_ostream; - const std::error_category &sampleprof_category(); enum class sampleprof_error { diff --git a/llvm/include/llvm/Support/AArch64TargetParser.h b/llvm/include/llvm/Support/AArch64TargetParser.h index b045e31bc92aa..007a9ed867cee 100644 --- a/llvm/include/llvm/Support/AArch64TargetParser.h +++ b/llvm/include/llvm/Support/AArch64TargetParser.h @@ -104,7 +104,7 @@ const ArchKind ArchKinds[] = { }; // FIXME: These should be moved to TargetTuple once it exists -bool getExtensionFeatures(unsigned Extensions, +bool getExtensionFeatures(uint64_t Extensions, std::vector &Features); bool getArchFeatures(ArchKind AK, std::vector &Features); @@ -117,7 +117,7 @@ StringRef getArchExtFeature(StringRef ArchExt); // Information by Name unsigned getDefaultFPU(StringRef CPU, ArchKind AK); -unsigned getDefaultExtensions(StringRef CPU, ArchKind AK); +uint64_t getDefaultExtensions(StringRef CPU, ArchKind AK); StringRef getDefaultCPU(StringRef Arch); ArchKind getCPUArchKind(StringRef CPU); diff --git a/llvm/include/llvm/Support/ARMWinEH.h b/llvm/include/llvm/Support/ARMWinEH.h index 857a0d3814a8f..327aa9804849f 100644 --- 
a/llvm/include/llvm/Support/ARMWinEH.h +++ b/llvm/include/llvm/Support/ARMWinEH.h @@ -31,6 +31,9 @@ enum class ReturnType { /// RuntimeFunction - An entry in the table of procedure data (.pdata) /// +/// This is ARM specific, but the Function Start RVA, Flag and +/// ExceptionInformationRVA fields work identically for ARM64. +/// /// 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 /// 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 /// +---------------------------------------------------------------+ @@ -204,6 +207,85 @@ inline uint16_t StackAdjustment(const RuntimeFunction &RF) { /// purpose (r0-r15) and VFP (d0-d31) registers. std::pair SavedRegisterMask(const RuntimeFunction &RF); +/// RuntimeFunctionARM64 - An entry in the table of procedure data (.pdata) +/// +/// 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 +/// 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 +/// +---------------------------------------------------------------+ +/// | Function Start RVA | +/// +-----------------+---+-+-------+-----+---------------------+---+ +/// | Frame Size |CR |H| RegI |RegF | Function Length |Flg| +/// +-----------------+---+-+-------+-----+---------------------+---+ +/// +/// See https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling +/// for the full reference for this struct. + +class RuntimeFunctionARM64 { +public: + const support::ulittle32_t BeginAddress; + const support::ulittle32_t UnwindData; + + RuntimeFunctionARM64(const support::ulittle32_t *Data) + : BeginAddress(Data[0]), UnwindData(Data[1]) {} + + RuntimeFunctionARM64(const support::ulittle32_t BeginAddress, + const support::ulittle32_t UnwindData) + : BeginAddress(BeginAddress), UnwindData(UnwindData) {} + + RuntimeFunctionFlag Flag() const { + return RuntimeFunctionFlag(UnwindData & 0x3); + } + + uint32_t ExceptionInformationRVA() const { + assert(Flag() == RuntimeFunctionFlag::RFF_Unpacked && + "unpacked form required for this operation"); + return (UnwindData & ~0x3); + } + + uint32_t PackedUnwindData() const { + assert((Flag() == RuntimeFunctionFlag::RFF_Packed || + Flag() == RuntimeFunctionFlag::RFF_PackedFragment) && + "packed form required for this operation"); + return (UnwindData & ~0x3); + } + uint32_t FunctionLength() const { + assert((Flag() == RuntimeFunctionFlag::RFF_Packed || + Flag() == RuntimeFunctionFlag::RFF_PackedFragment) && + "packed form required for this operation"); + return (((UnwindData & 0x00001ffc) >> 2) << 2); + } + uint8_t RegF() const { + assert((Flag() == RuntimeFunctionFlag::RFF_Packed || + Flag() == RuntimeFunctionFlag::RFF_PackedFragment) && + "packed form required for this operation"); + return ((UnwindData & 0x0000e000) >> 13); + } + uint8_t RegI() const { + assert((Flag() == RuntimeFunctionFlag::RFF_Packed || + Flag() == RuntimeFunctionFlag::RFF_PackedFragment) && + "packed form required for this operation"); + return ((UnwindData & 0x000f0000) >> 16); + } + bool H() const { + assert((Flag() == RuntimeFunctionFlag::RFF_Packed || + Flag() == RuntimeFunctionFlag::RFF_PackedFragment) && + "packed form required for this operation"); + return ((UnwindData & 0x00100000) >> 20); + } + uint8_t CR() const { + assert((Flag() == RuntimeFunctionFlag::RFF_Packed || + Flag() == RuntimeFunctionFlag::RFF_PackedFragment) && + "packed form required for this operation"); + return ((UnwindData & 0x600000) >> 21); + } + uint16_t FrameSize() const { + assert((Flag() == RuntimeFunctionFlag::RFF_Packed || + Flag() == 
RuntimeFunctionFlag::RFF_PackedFragment) && + "packed form required for this operation"); + return ((UnwindData & 0xff800000) >> 23); + } +}; + /// ExceptionDataRecord - An entry in the table of exception data (.xdata) /// /// The format on ARM is: @@ -416,12 +498,13 @@ struct ExceptionDataRecord { uint32_t ExceptionHandlerRVA() const { assert(X() && "Exception Handler RVA is only valid if the X bit is set"); - return Data[HeaderWords(*this) + EpilogueCount() + CodeWords()]; + return Data[HeaderWords(*this) + (E() ? 0 : EpilogueCount()) + CodeWords()]; } uint32_t ExceptionHandlerParameter() const { assert(X() && "Exception Handler RVA is only valid if the X bit is set"); - return Data[HeaderWords(*this) + EpilogueCount() + CodeWords() + 1]; + return Data[HeaderWords(*this) + (E() ? 0 : EpilogueCount()) + CodeWords() + + 1]; } }; diff --git a/llvm/include/llvm/Support/CFGUpdate.h b/llvm/include/llvm/Support/CFGUpdate.h index af4cd6ed1f1df..3a12b9d86c18a 100644 --- a/llvm/include/llvm/Support/CFGUpdate.h +++ b/llvm/include/llvm/Support/CFGUpdate.h @@ -14,7 +14,6 @@ #ifndef LLVM_SUPPORT_CFGUPDATE_H #define LLVM_SUPPORT_CFGUPDATE_H -#include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/Support/Compiler.h" diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h index 38c588080069c..a367387510e9e 100644 --- a/llvm/include/llvm/Support/CommandLine.h +++ b/llvm/include/llvm/Support/CommandLine.h @@ -672,7 +672,7 @@ class ValuesClass { : Values(Options) {} template void apply(Opt &O) const { - for (auto Value : Values) + for (const auto &Value : Values) O.getParser().addLiteralOption(Value.Name, Value.Value, Value.Description); } diff --git a/llvm/include/llvm/Support/GenericDomTree.h b/llvm/include/llvm/Support/GenericDomTree.h index 76973f521042c..c77168432058a 100644 --- a/llvm/include/llvm/Support/GenericDomTree.h +++ b/llvm/include/llvm/Support/GenericDomTree.h @@ -38,7 +38,6 @@ #include #include #include -#include namespace llvm { @@ -61,7 +60,7 @@ template class DomTreeNodeBase { NodeT *TheBB; DomTreeNodeBase *IDom; unsigned Level; - std::vector Children; + SmallVector Children; mutable unsigned DFSNumIn = ~0; mutable unsigned DFSNumOut = ~0; @@ -69,9 +68,9 @@ template class DomTreeNodeBase { DomTreeNodeBase(NodeT *BB, DomTreeNodeBase *iDom) : TheBB(BB), IDom(iDom), Level(IDom ? IDom->Level + 1 : 0) {} - using iterator = typename std::vector::iterator; + using iterator = typename SmallVector::iterator; using const_iterator = - typename std::vector::const_iterator; + typename SmallVector::const_iterator; iterator begin() { return Children.begin(); } iterator end() { return Children.end(); } @@ -837,7 +836,7 @@ class DominatorTreeBase { "NewBB should have a single successor!"); NodeRef NewBBSucc = *GraphT::child_begin(NewBB); - std::vector PredBlocks; + SmallVector PredBlocks; for (auto Pred : children>(NewBB)) PredBlocks.push_back(Pred); diff --git a/llvm/include/llvm/Support/GlobPattern.h b/llvm/include/llvm/Support/GlobPattern.h index 3e5989d025007..b79de6f41c494 100644 --- a/llvm/include/llvm/Support/GlobPattern.h +++ b/llvm/include/llvm/Support/GlobPattern.h @@ -31,6 +31,16 @@ class GlobPattern { static Expected create(StringRef Pat); bool match(StringRef S) const; + // Returns true for glob pattern "*". Can be used to avoid expensive + // preparation/acquisition of the input for match(). 
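+  // A caller-side sketch (caller names hypothetical, not part of this
+  // header):
+  //   GlobPattern Pat = cantFail(GlobPattern::create("*"));
+  //   if (Pat.isTrivialMatchAll())
+  //     return true; // everything matches; skip loading the input
+  //   return Pat.match(expensiveLoad());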
+ bool isTrivialMatchAll() const { + if (Prefix && Prefix->empty()) { + assert(!Suffix); + return true; + } + return false; + } + private: bool matchOne(ArrayRef Pat, StringRef S) const; diff --git a/llvm/include/llvm/Support/KnownBits.h b/llvm/include/llvm/Support/KnownBits.h index a29e150b904a3..8da6c7d98ba5f 100644 --- a/llvm/include/llvm/Support/KnownBits.h +++ b/llvm/include/llvm/Support/KnownBits.h @@ -278,6 +278,9 @@ struct KnownBits { /// Update known bits based on XORing with RHS. KnownBits &operator^=(const KnownBits &RHS); + /// Compute known bits for the absolute value. + KnownBits abs() const; + KnownBits byteSwap() { return KnownBits(Zero.byteSwap(), One.byteSwap()); } diff --git a/llvm/include/llvm/Support/PluginLoader.h b/llvm/include/llvm/Support/PluginLoader.h index c0c516bdae03e..95c087f03d9bf 100644 --- a/llvm/include/llvm/Support/PluginLoader.h +++ b/llvm/include/llvm/Support/PluginLoader.h @@ -16,7 +16,11 @@ #ifndef LLVM_SUPPORT_PLUGINLOADER_H #define LLVM_SUPPORT_PLUGINLOADER_H +#ifndef DONT_GET_PLUGIN_LOADER_OPTION #include "llvm/Support/CommandLine.h" +#endif + +#include namespace llvm { struct PluginLoader { diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index db36fc42aa2a2..2e464b395d7d9 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -77,6 +77,10 @@ HANDLE_TARGET_OPCODE(SUBREG_TO_REG) /// DBG_VALUE - a mapping of the llvm.dbg.value intrinsic HANDLE_TARGET_OPCODE(DBG_VALUE) +/// DBG_INSTR_REF - A mapping of llvm.dbg.value referring to the instruction +/// that defines the value, rather than a virtual register. +HANDLE_TARGET_OPCODE(DBG_INSTR_REF) + /// DBG_LABEL - a mapping of the llvm.dbg.label intrinsic HANDLE_TARGET_OPCODE(DBG_LABEL) diff --git a/llvm/include/llvm/Support/TrigramIndex.h b/llvm/include/llvm/Support/TrigramIndex.h index d635694eb5fd3..360ab94597902 100644 --- a/llvm/include/llvm/Support/TrigramIndex.h +++ b/llvm/include/llvm/Support/TrigramIndex.h @@ -27,7 +27,7 @@ #define LLVM_SUPPORT_TRIGRAMINDEX_H #include "llvm/ADT/SmallVector.h" - +#include "llvm/ADT/StringRef.h" #include #include #include diff --git a/llvm/include/llvm/Support/raw_ostream.h b/llvm/include/llvm/Support/raw_ostream.h index cae57430baffb..bd15f97a13a1b 100644 --- a/llvm/include/llvm/Support/raw_ostream.h +++ b/llvm/include/llvm/Support/raw_ostream.h @@ -15,6 +15,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/DataTypes.h" #include #include #include @@ -47,7 +48,16 @@ class FileLocker; /// buffered disciplines etc. It is a simple buffer that outputs /// a chunk at a time. class raw_ostream { +public: + // Class kinds to support LLVM-style RTTI. + enum class OStreamKind { + OK_OStream, + OK_FDStream, + }; + private: + OStreamKind Kind; + /// The buffer is handled in such a way that the buffer is /// uninitialized, unbuffered, or out of space when OutBufCur >= /// OutBufEnd. Thus a single comparison suffices to determine if we @@ -105,9 +115,10 @@ class raw_ostream { static constexpr Colors SAVEDCOLOR = Colors::SAVEDCOLOR; static constexpr Colors RESET = Colors::RESET; - explicit raw_ostream(bool unbuffered = false) - : BufferMode(unbuffered ? BufferKind::Unbuffered - : BufferKind::InternalBuffer) { + explicit raw_ostream(bool unbuffered = false, + OStreamKind K = OStreamKind::OK_OStream) + : Kind(K), BufferMode(unbuffered ? BufferKind::Unbuffered + : BufferKind::InternalBuffer) { // Start out ready to flush. 
OutBufStart = OutBufEnd = OutBufCur = nullptr; } @@ -120,6 +131,8 @@ class raw_ostream { /// tell - Return the current offset with the file. uint64_t tell() const { return current_pos() + GetNumBytesInBuffer(); } + OStreamKind get_kind() const { return Kind; } + //===--------------------------------------------------------------------===// // Configuration Interface //===--------------------------------------------------------------------===// @@ -388,8 +401,9 @@ class raw_pwrite_stream : public raw_ostream { void anchor() override; public: - explicit raw_pwrite_stream(bool Unbuffered = false) - : raw_ostream(Unbuffered) {} + explicit raw_pwrite_stream(bool Unbuffered = false, + OStreamKind K = OStreamKind::OK_OStream) + : raw_ostream(Unbuffered, K) {} void pwrite(const char *Ptr, size_t Size, uint64_t Offset) { #ifndef NDEBUG uint64_t Pos = tell(); @@ -436,10 +450,17 @@ class raw_fd_ostream : public raw_pwrite_stream { /// Determine an efficient buffer size. size_t preferred_buffer_size() const override; + void anchor() override; + +protected: /// Set the flag indicating that an output error has been encountered. void error_detected(std::error_code EC) { this->EC = EC; } - void anchor() override; + /// Return the file descriptor. + int get_fd() const { return FD; } + + // Update the file position by increasing \p Delta. + void inc_pos(uint64_t Delta) { pos += Delta; } public: /// Open the specified file for writing. If an error occurs, information @@ -464,7 +485,8 @@ class raw_fd_ostream : public raw_pwrite_stream { /// FD is the file descriptor that this writes to. If ShouldClose is true, /// this closes the file when the stream is destroyed. If FD is for stdout or /// stderr, it will not be closed. - raw_fd_ostream(int fd, bool shouldClose, bool unbuffered=false); + raw_fd_ostream(int fd, bool shouldClose, bool unbuffered = false, + OStreamKind K = OStreamKind::OK_OStream); ~raw_fd_ostream() override; @@ -548,6 +570,34 @@ raw_fd_ostream &errs(); /// This returns a reference to a raw_ostream which simply discards output. raw_ostream &nulls(); +//===----------------------------------------------------------------------===// +// File Streams +//===----------------------------------------------------------------------===// + +/// A raw_ostream of a file for reading/writing/seeking. +/// +class raw_fd_stream : public raw_fd_ostream { +public: + /// Open the specified file for reading/writing/seeking. If an error occurs, + /// information about the error is put into EC, and the stream should be + /// immediately destroyed. + raw_fd_stream(StringRef Filename, std::error_code &EC); + + /// This reads the \p Size bytes into a buffer pointed by \p Ptr. + /// + /// \param Ptr The start of the buffer to hold data to be read. + /// + /// \param Size The number of bytes to be read. + /// + /// On success, the number of bytes read is returned, and the file position is + /// advanced by this number. On error, -1 is returned, use error() to get the + /// error code. + ssize_t read(char *Ptr, size_t Size); + + /// Check if \p OS is a pointer of type raw_fd_stream*. 
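+  /// With classof in place, the usual LLVM-style RTTI applies; a minimal
+  /// sketch (stream origin and error handling hypothetical):
+  ///   raw_ostream &OS = getSomeStream();
+  ///   if (auto *FDS = dyn_cast<raw_fd_stream>(&OS))
+  ///     if (FDS->read(Buf, Size) < 0)
+  ///       report_fatal_error("read failed: " + FDS->error().message());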
+ static bool classof(const raw_ostream *OS); +}; + //===----------------------------------------------------------------------===// // Output Stream Adaptors //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h index a082fe5d74a1f..5d67ef4455cf6 100644 --- a/llvm/include/llvm/TableGen/Record.h +++ b/llvm/include/llvm/TableGen/Record.h @@ -67,6 +67,7 @@ class RecTy { private: RecTyKind Kind; + /// ListRecTy of the list that has elements of this type. ListRecTy *ListTy = nullptr; public: @@ -190,14 +191,14 @@ class StringRecTy : public RecTy { bool typeIsConvertibleTo(const RecTy *RHS) const override; }; -/// 'list' - Represent a list of values, all of which must be of -/// the specified type. +/// 'list' - Represent a list of element values, all of which must be of +/// the specified type. The type is stored in ElementTy. class ListRecTy : public RecTy { friend ListRecTy *RecTy::getListTy(); - RecTy *Ty; + RecTy *ElementTy; - explicit ListRecTy(RecTy *T) : RecTy(ListRecTyKind), Ty(T) {} + explicit ListRecTy(RecTy *T) : RecTy(ListRecTyKind), ElementTy(T) {} public: static bool classof(const RecTy *RT) { @@ -205,7 +206,7 @@ class ListRecTy : public RecTy { } static ListRecTy *get(RecTy *T) { return T->getListTy(); } - RecTy *getElementType() const { return Ty; } + RecTy *getElementType() const { return ElementTy; } std::string getAsString() const override; @@ -420,14 +421,14 @@ inline raw_ostream &operator<<(raw_ostream &OS, const Init &I) { I.print(OS); return OS; } -/// This is the common super-class of types that have a specific, -/// explicit, type. +/// This is the common superclass of types that have a specific, +/// explicit, type, stored in ValueTy. class TypedInit : public Init { - RecTy *Ty; + RecTy *ValueTy; protected: explicit TypedInit(InitKind K, RecTy *T, uint8_t Opc = 0) - : Init(K, Opc), Ty(T) {} + : Init(K, Opc), ValueTy(T) {} public: TypedInit(const TypedInit &) = delete; @@ -438,7 +439,7 @@ class TypedInit : public Init { I->getKind() <= IK_LastTypedInit; } - RecTy *getType() const { return Ty; } + RecTy *getType() const { return ValueTy; } Init *getCastTo(RecTy *Ty) const override; Init *convertInitializerTo(RecTy *Ty) const override; diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 6a6f97ae78b04..d3ccbb4049496 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -145,13 +145,11 @@ def combine_indexed_load_store : GICombineRule< [{ return Helper.matchCombineIndexedLoadStore(*${root}, ${matchinfo}); }]), (apply [{ Helper.applyCombineIndexedLoadStore(*${root}, ${matchinfo}); }])>; -// FIXME: Is there a reason this wasn't in tryCombine? I've left it out of -// all_combines because it wasn't there. 
-def elide_br_by_inverting_cond : GICombineRule< +def opt_brcond_by_inverting_cond : GICombineRule< (defs root:$root), (match (wip_match_opcode G_BR):$root, - [{ return Helper.matchElideBrByInvertingCond(*${root}); }]), - (apply [{ Helper.applyElideBrByInvertingCond(*${root}); }])>; + [{ return Helper.matchOptBrCondByInvertingCond(*${root}); }]), + (apply [{ Helper.applyOptBrCondByInvertingCond(*${root}); }])>; def ptr_add_immed_matchdata : GIDefMatchData<"PtrAddChain">; def ptr_add_immed_chain : GICombineRule< @@ -194,11 +192,17 @@ def undef_to_negative_one: GICombineRule< [{ return Helper.matchAnyExplicitUseIsUndef(*${root}); }]), (apply [{ Helper.replaceInstWithConstant(*${root}, -1); }])>; +def binop_left_undef_to_zero: GICombineRule< + (defs root:$root), + (match (wip_match_opcode G_SHL):$root, + [{ return Helper.matchOperandIsUndef(*${root}, 1); }]), + (apply [{ Helper.replaceInstWithConstant(*${root}, 0); }])>; + // Instructions where if any source operand is undef, the instruction can be // replaced with undef. def propagate_undef_any_op: GICombineRule< (defs root:$root), - (match (wip_match_opcode G_ADD, G_FPTOSI, G_FPTOUI, G_SUB, G_XOR):$root, + (match (wip_match_opcode G_ADD, G_FPTOSI, G_FPTOUI, G_SUB, G_XOR, G_TRUNC):$root, [{ return Helper.matchAnyExplicitUseIsUndef(*${root}); }]), (apply [{ Helper.replaceInstWithUndef(*${root}); }])>; @@ -251,6 +255,14 @@ def right_identity_zero: GICombineRule< (apply [{ return Helper.replaceSingleDefInstWithOperand(*${root}, 1); }]) >; +// Fold x op 1 -> x +def right_identity_one: GICombineRule< + (defs root:$root), + (match (wip_match_opcode G_MUL):$root, + [{ return Helper.matchConstantOp(${root}->getOperand(2), 1); }]), + (apply [{ return Helper.replaceSingleDefInstWithOperand(*${root}, 1); }]) +>; + // Fold (x op x) -> x def binop_same_val: GICombineRule< (defs root:$root), @@ -291,6 +303,15 @@ def simplify_add_to_sub: GICombineRule < (apply [{ return Helper.applySimplifyAddToSub(*${root}, ${info});}]) >; +// Fold fp_op(cst) to the constant result of the floating point operation. +def constant_fp_op_matchinfo: GIDefMatchData<"Optional<APFloat>">; +def constant_fp_op: GICombineRule < + (defs root:$root, constant_fp_op_matchinfo:$info), + (match (wip_match_opcode G_FNEG, G_FABS, G_FPTRUNC, G_FSQRT, G_FLOG2):$root, + [{ return Helper.matchCombineConstantFoldFpUnary(*${root}, ${info}); }]), + (apply [{ return Helper.applyCombineConstantFoldFpUnary(*${root}, ${info}); }]) +>; + // Fold int2ptr(ptr2int(x)) -> x def p2i_to_i2p_matchinfo: GIDefMatchData<"Register">; def p2i_to_i2p: GICombineRule< @@ -381,9 +402,88 @@ def not_cmp_fold : GICombineRule< (apply [{ return Helper.applyNotCmp(*${d}, ${info}); }]) >; +// Fold (fneg (fneg x)) -> x. +def fneg_fneg_fold_matchinfo : GIDefMatchData<"Register">; +def fneg_fneg_fold: GICombineRule < + (defs root:$root, fneg_fneg_fold_matchinfo:$matchinfo), + (match (wip_match_opcode G_FNEG):$root, + [{ return Helper.matchCombineFNegOfFNeg(*${root}, ${matchinfo}); }]), + (apply [{ return Helper.replaceSingleDefInstWithReg(*${root}, ${matchinfo}); }]) +>; + +// Fold (unmerge(merge x, y, z)) -> x, y, z. +def unmerge_merge_matchinfo : GIDefMatchData<"SmallVector<Register, 8>">; +def unmerge_merge : GICombineRule< + (defs root:$d, unmerge_merge_matchinfo:$info), + (match (wip_match_opcode G_UNMERGE_VALUES): $d, + [{ return Helper.matchCombineUnmergeMergeToPlainValues(*${d}, ${info}); }]), + (apply [{ return Helper.applyCombineUnmergeMergeToPlainValues(*${d}, ${info}); }]) +>; + +// Fold (fabs (fabs x)) -> (fabs x). 
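+// An illustrative MIR shape for this fold (not from the patch): given
+//   %1:_(s32) = G_FABS %0
+//   %2:_(s32) = G_FABS %1
+// the inner result is already non-negative, so uses of %2 are rewritten to
+// use %1.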
+def fabs_fabs_fold_matchinfo : GIDefMatchData<"Register">; +def fabs_fabs_fold: GICombineRule< + (defs root:$root, fabs_fabs_fold_matchinfo:$matchinfo), + (match (wip_match_opcode G_FABS):$root, + [{ return Helper.matchCombineFAbsOfFAbs(*${root}, ${matchinfo}); }]), + (apply [{ return Helper.applyCombineFAbsOfFAbs(*${root}, ${matchinfo}); }]) +>; + +// Fold (unmerge cst) -> cst1, cst2, ... +def unmerge_cst_matchinfo : GIDefMatchData<"SmallVector">; +def unmerge_cst : GICombineRule< + (defs root:$d, unmerge_cst_matchinfo:$info), + (match (wip_match_opcode G_UNMERGE_VALUES): $d, + [{ return Helper.matchCombineUnmergeConstant(*${d}, ${info}); }]), + (apply [{ return Helper.applyCombineUnmergeConstant(*${d}, ${info}); }]) +>; + +// Transform x,y = unmerge z -> x = trunc z. +def unmerge_dead_to_trunc : GICombineRule< + (defs root:$d), + (match (wip_match_opcode G_UNMERGE_VALUES): $d, + [{ return Helper.matchCombineUnmergeWithDeadLanesToTrunc(*${d}); }]), + (apply [{ return Helper.applyCombineUnmergeWithDeadLanesToTrunc(*${d}); }]) +>; + +// Transform x,y = unmerge(zext(z)) -> x = zext z; y = 0. +def unmerge_zext_to_zext : GICombineRule< + (defs root:$d), + (match (wip_match_opcode G_UNMERGE_VALUES): $d, + [{ return Helper.matchCombineUnmergeZExtToZExt(*${d}); }]), + (apply [{ return Helper.applyCombineUnmergeZExtToZExt(*${d}); }]) +>; + +// Fold trunc ([asz]ext x) -> x or ([asz]ext x) or (trunc x). +def trunc_ext_fold_matchinfo : GIDefMatchData<"std::pair">; +def trunc_ext_fold: GICombineRule < + (defs root:$root, trunc_ext_fold_matchinfo:$matchinfo), + (match (wip_match_opcode G_TRUNC):$root, + [{ return Helper.matchCombineTruncOfExt(*${root}, ${matchinfo}); }]), + (apply [{ return Helper.applyCombineTruncOfExt(*${root}, ${matchinfo}); }]) +>; + +// Fold trunc (shl x, K) -> shl (trunc x), K => K < VT.getScalarSizeInBits(). +def trunc_shl_matchinfo : GIDefMatchData<"std::pair">; +def trunc_shl: GICombineRule < + (defs root:$root, trunc_shl_matchinfo:$matchinfo), + (match (wip_match_opcode G_TRUNC):$root, + [{ return Helper.matchCombineTruncOfShl(*${root}, ${matchinfo}); }]), + (apply [{ return Helper.applyCombineTruncOfShl(*${root}, ${matchinfo}); }]) +>; + +// Transform (mul x, -1) -> (sub 0, x) +def mul_by_neg_one: GICombineRule < + (defs root:$root), + (match (wip_match_opcode G_MUL):$root, + [{ return Helper.matchConstantOp(${root}->getOperand(2), -1); }]), + (apply [{ return Helper.applyCombineMulByNegativeOne(*${root}); }]) +>; + // FIXME: These should use the custom predicate feature once it lands. 
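// For mul_by_neg_one above, the identity x * -1 == 0 - x gives, in MIR
// (illustrative):
//   %m:_(s32) = G_MUL %x, %neg1   becomes   %m:_(s32) = G_SUB %zero, %x
// where %neg1 and %zero are G_CONSTANTs of -1 and 0.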
def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero, undef_to_negative_one, + binop_left_undef_to_zero, propagate_undef_any_op, propagate_undef_all_ops, propagate_undef_shuffle_mask, @@ -392,7 +492,8 @@ def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero, def identity_combines : GICombineGroup<[select_same_val, right_identity_zero, binop_same_val, binop_left_to_zero, binop_right_to_zero, p2i_to_i2p, - i2p_to_p2i, anyext_trunc_fold]>; + i2p_to_p2i, anyext_trunc_fold, + fneg_fneg_fold, right_identity_one]>; def known_bits_simplifications : GICombineGroup<[ and_trivial_mask, redundant_sext_inreg]>; @@ -401,7 +502,9 @@ def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend]>; def select_combines : GICombineGroup<[select_undef_cmp, select_constant_cmp]>; -def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, add_p2i_to_ptradd]>; +def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, add_p2i_to_ptradd, + mul_by_neg_one]>; + def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain, combines_for_extload, combine_indexed_load_store, undef_combines, identity_combines, simplify_add_to_sub, @@ -409,4 +512,7 @@ def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain, shl_ashr_to_sext_inreg, sext_inreg_of_load, width_reduction_combines, select_combines, known_bits_simplifications, ext_ext_fold, - not_cmp_fold]>; + not_cmp_fold, opt_brcond_by_inverting_cond, + unmerge_merge, fabs_fabs_fold, unmerge_cst, unmerge_dead_to_trunc, + unmerge_zext_to_zext, trunc_ext_fold, trunc_shl, + constant_fp_op]>; diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index e56927540f51c..8fba826f21874 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -1100,6 +1100,12 @@ def DBG_VALUE : StandardPseudoInstruction { let AsmString = "DBG_VALUE"; let hasSideEffects = 0; } +def DBG_INSTR_REF : StandardPseudoInstruction { + let OutOperandList = (outs); + let InOperandList = (ins variable_ops); + let AsmString = "DBG_INSTR_REF"; + let hasSideEffects = 0; +} def DBG_LABEL : StandardPseudoInstruction { let OutOperandList = (outs); let InOperandList = (ins unknown:$label); diff --git a/llvm/include/llvm/Testing/Support/SupportHelpers.h b/llvm/include/llvm/Testing/Support/SupportHelpers.h index 3517361041b94..2419fc95d8178 100644 --- a/llvm/include/llvm/Testing/Support/SupportHelpers.h +++ b/llvm/include/llvm/Testing/Support/SupportHelpers.h @@ -152,6 +152,9 @@ class TempDir { /// The path to the temporary directory. StringRef path() const { return Path; } + /// The null-terminated C string pointing to the path. + const char *c_str() { return Path.c_str(); } + /// Creates a new path by appending the argument to the path of the managed /// directory using the native path separator. SmallString<128> path(StringRef component) const { diff --git a/llvm/include/llvm/Transforms/HelloNew/HelloWorld.h b/llvm/include/llvm/Transforms/HelloNew/HelloWorld.h new file mode 100644 index 0000000000000..6c753032f913c --- /dev/null +++ b/llvm/include/llvm/Transforms/HelloNew/HelloWorld.h @@ -0,0 +1,23 @@ +//===-- HelloWorld.h - Example Transformations ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_HELLONEW_HELLOWORLD_H +#define LLVM_TRANSFORMS_HELLONEW_HELLOWORLD_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class HelloWorldPass : public PassInfoMixin<HelloWorldPass> { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_HELLONEW_HELLOWORLD_H diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index 75e7ccde4dba7..e73dc637117b1 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -116,9 +116,6 @@ #include "llvm/IR/PassManager.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/DOTGraphTraits.h" -#include "llvm/Support/GraphWriter.h" #include "llvm/Support/TimeProfiler.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" @@ -133,8 +130,10 @@ struct AAIsDead; class Function; -/// Simple enum classes that forces properties to be spelled out explicitly. -/// +/// The value passed to the command line option that defines the maximal +/// initialization chain length. +extern unsigned MaxInitializationChainLength; + ///{ enum class ChangeStatus { CHANGED, @@ -337,8 +336,14 @@ struct IRPosition { /// Return the associated function, if any. Function *getAssociatedFunction() const { - if (auto *CB = dyn_cast<CallBase>(&getAnchorValue())) + if (auto *CB = dyn_cast<CallBase>(&getAnchorValue())) { + // We reuse the logic that associates callback callees to arguments of a + // call site here to identify the callback callee as the associated + // function. + if (Argument *Arg = getAssociatedArgument()) + return Arg->getParent(); return CB->getCalledFunction(); + } return getAnchorScope(); } @@ -386,10 +391,11 @@ struct IRPosition { /// Return the value this abstract attribute is associated with. Value &getAssociatedValue() const { - if (getArgNo() < 0 || isa<Argument>(&getAnchorValue())) + if (getCallSiteArgNo() < 0 || isa<Argument>(&getAnchorValue())) return getAnchorValue(); assert(isa<CallBase>(&getAnchorValue()) && "Expected a call base!"); - return *cast<CallBase>(&getAnchorValue())->getArgOperand(getArgNo()); + return *cast<CallBase>(&getAnchorValue()) + ->getArgOperand(getCallSiteArgNo()); } /// Return the type this abstract attribute is associated with. @@ -399,19 +405,22 @@ struct IRPosition { return getAssociatedValue().getType(); } - /// Return the argument number of the associated value if it is an argument or - /// call site argument, otherwise a negative value. - int getArgNo() const { - switch (getPositionKind()) { - case IRPosition::IRP_ARGUMENT: - return cast<Argument>(getAsValuePtr())->getArgNo(); - case IRPosition::IRP_CALL_SITE_ARGUMENT: { - Use &U = *getAsUsePtr(); - return cast<CallBase>(U.getUser())->getArgOperandNo(&U); - } - default: - return -1; - } + /// Return the callee argument number of the associated value if it is an + /// argument or call site argument, otherwise a negative value. In contrast to + /// `getCallSiteArgNo` this method will always return the "argument number" + /// from the perspective of the callee. This may not be the same as the call + /// site number if this is a callback call. 
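+  /// For instance, with a callback through a broker call such as
+  /// pthread_create (one of the callback encodings the Attributor
+  /// understands; sketch only):
+  ///   pthread_create(&Handle, nullptr, StartRoutine, Payload);
+  /// Payload is operand 3 of the call site but argument 0 of StartRoutine,
+  /// so the callee and call site numbers differ.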
+ int getCalleeArgNo() const { + return getArgNo(/* CallbackCalleeArgIfApplicable */ true); + } + + /// Return the call site argument number of the associated value if it is an + /// argument or call site argument, otherwise a negative value. In contrast to + /// `getCalleeArgNo` this method will always return the "operand number" from + /// the perspective of the call site. This may not be the same as the callee + /// perspective if this is a callback call. + int getCallSiteArgNo() const { + return getArgNo(/* CallbackCalleeArgIfApplicable */ false); } /// Return the index in the attribute list for this position. @@ -428,7 +437,7 @@ struct IRPosition { return AttributeList::ReturnIndex; case IRPosition::IRP_ARGUMENT: case IRPosition::IRP_CALL_SITE_ARGUMENT: - return getArgNo() + AttributeList::FirstArgIndex; + return getCallSiteArgNo() + AttributeList::FirstArgIndex; } llvm_unreachable( "There is no attribute index for a floating or invalid position!"); @@ -513,6 +522,17 @@ struct IRPosition { } + /// Return true if the position is an argument or call site argument. + bool isArgumentPosition() const { + switch (getPositionKind()) { + case IRPosition::IRP_ARGUMENT: + case IRPosition::IRP_CALL_SITE_ARGUMENT: + return true; + default: + return false; + } + } + /// Special DenseMap key values. /// ///{ @@ -559,6 +579,25 @@ struct IRPosition { verify(); } + /// Return the callee argument number of the associated value if it is an + /// argument or call site argument. See also `getCalleeArgNo` and + /// `getCallSiteArgNo`. + int getArgNo(bool CallbackCalleeArgIfApplicable) const { + if (CallbackCalleeArgIfApplicable) + if (Argument *Arg = getAssociatedArgument()) + return Arg->getArgNo(); + switch (getPositionKind()) { + case IRPosition::IRP_ARGUMENT: + return cast<Argument>(getAsValuePtr())->getArgNo(); + case IRPosition::IRP_CALL_SITE_ARGUMENT: { + Use &U = *getAsUsePtr(); + return cast<CallBase>(U.getUser())->getArgOperandNo(&U); + } + default: + return -1; + } + } + /// IRPosition for the use \p U. The position kind \p PK needs to be /// IRP_CALL_SITE_ARGUMENT, the anchor value is the user, the associated value /// the used value. @@ -1071,6 +1110,9 @@ struct Attributor { Invalidate |= FnScope->hasFnAttribute(Attribute::Naked) || FnScope->hasFnAttribute(Attribute::OptimizeNone); + // Avoid too many nested initializations to prevent a stack overflow. + Invalidate |= InitializationChainLength > MaxInitializationChainLength; + // Bootstrap the new attribute with an initial update to propagate // information, e.g., function -> call site. If it is not on a given // Allowed we will not perform updates at all. @@ -1081,7 +1123,9 @@ { TimeTraceScope TimeScope(AA.getName() + "::initialize"); + ++InitializationChainLength; AA.initialize(*this); + --InitializationChainLength; } // Initialize and update is allowed for code outside of the current function @@ -1615,6 +1659,9 @@ CLEANUP, } Phase = AttributorPhase::SEEDING; + /// The current initialization chain length. Tracked to avoid stack overflows. + unsigned InitializationChainLength = 0; + /// Functions, blocks, and instructions we delete after manifest is done. 
/// ///{ diff --git a/llvm/include/llvm/Transforms/IPO/CalledValuePropagation.h b/llvm/include/llvm/Transforms/IPO/CalledValuePropagation.h index c2626d0867b4d..782633799ede6 100644 --- a/llvm/include/llvm/Transforms/IPO/CalledValuePropagation.h +++ b/llvm/include/llvm/Transforms/IPO/CalledValuePropagation.h @@ -19,7 +19,6 @@ #ifndef LLVM_TRANSFORMS_IPO_CALLEDVALUEPROPAGATION_H #define LLVM_TRANSFORMS_IPO_CALLEDVALUEPROPAGATION_H -#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" namespace llvm { diff --git a/llvm/include/llvm/Transforms/IPO/CrossDSOCFI.h b/llvm/include/llvm/Transforms/IPO/CrossDSOCFI.h index 8440df6397299..d34a510811018 100644 --- a/llvm/include/llvm/Transforms/IPO/CrossDSOCFI.h +++ b/llvm/include/llvm/Transforms/IPO/CrossDSOCFI.h @@ -14,7 +14,6 @@ #ifndef LLVM_TRANSFORMS_IPO_CROSSDSOCFI_H #define LLVM_TRANSFORMS_IPO_CROSSDSOCFI_H -#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" namespace llvm { diff --git a/llvm/include/llvm/Transforms/IPO/ForceFunctionAttrs.h b/llvm/include/llvm/Transforms/IPO/ForceFunctionAttrs.h index 7379009b2592c..fd99843d0449b 100644 --- a/llvm/include/llvm/Transforms/IPO/ForceFunctionAttrs.h +++ b/llvm/include/llvm/Transforms/IPO/ForceFunctionAttrs.h @@ -13,7 +13,6 @@ #ifndef LLVM_TRANSFORMS_IPO_FORCEFUNCTIONATTRS_H #define LLVM_TRANSFORMS_IPO_FORCEFUNCTIONATTRS_H -#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" namespace llvm { diff --git a/llvm/include/llvm/Transforms/IPO/StripSymbols.h b/llvm/include/llvm/Transforms/IPO/StripSymbols.h new file mode 100644 index 0000000000000..dd76d481d668c --- /dev/null +++ b/llvm/include/llvm/Transforms/IPO/StripSymbols.h @@ -0,0 +1,47 @@ +//===- StripSymbols.h - Strip symbols and debug info from a module --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The StripSymbols transformation implements code stripping. Specifically, it +// can delete: +// +// * names for virtual registers +// * symbols for internal globals and functions +// * debug information +// +// Note that this transformation makes code much less readable, so it should +// only be used in situations where the 'strip' utility would be used, such as +// reducing code size or making it harder to reverse engineer code. 
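+// A hypothetical invocation of one of the new-PM wrappers declared below
+// (pass name assumed from its expected registration):
+//   opt -passes=strip-dead-debug-info input.ll -S -o stripped.ll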
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_IPO_STRIPSYMBOLS_H +#define LLVM_TRANSFORMS_IPO_STRIPSYMBOLS_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +struct StripSymbolsPass : PassInfoMixin { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + +struct StripNonDebugSymbolsPass : PassInfoMixin { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + +struct StripDebugDeclarePass : PassInfoMixin { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + +struct StripDeadDebugInfoPass : PassInfoMixin { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_IPO_STRIPSYMBOLS_H diff --git a/llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h b/llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h index b3971e49754ea..2766cc5e6263b 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h +++ b/llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h @@ -26,5 +26,5 @@ class GCOVProfilerPass : public PassInfoMixin { GCOVOptions GCOVOpts; }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/llvm/include/llvm/Transforms/Instrumentation/HeapProfiler.h b/llvm/include/llvm/Transforms/Instrumentation/HeapProfiler.h deleted file mode 100644 index 21943616c5e1b..0000000000000 --- a/llvm/include/llvm/Transforms/Instrumentation/HeapProfiler.h +++ /dev/null @@ -1,49 +0,0 @@ -//===--------- Definition of the HeapProfiler class -------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares the HeapProfiler class. -// -//===----------------------------------------------------------------------===// -#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_HEAPPROFILER_H -#define LLVM_TRANSFORMS_INSTRUMENTATION_HEAPPROFILER_H - -#include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/PassManager.h" - -namespace llvm { - -/// Public interface to the heap profiler pass for instrumenting code to -/// profile heap memory accesses. -/// -/// The profiler itself is a function pass that works by inserting various -/// calls to the HeapProfiler runtime library functions. The runtime library -/// essentially replaces malloc() and free() with custom implementations that -/// record data about the allocations. -class HeapProfilerPass : public PassInfoMixin { -public: - explicit HeapProfilerPass(); - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); -}; - -/// Public interface to the heap profiler module pass for instrumenting code -/// to profile heap memory allocations and accesses. 
-class ModuleHeapProfilerPass : public PassInfoMixin<ModuleHeapProfilerPass> {
-public:
-  explicit ModuleHeapProfilerPass();
-  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
-};
-
-// Insert HeapProfiler instrumentation
-FunctionPass *createHeapProfilerFunctionPass();
-ModulePass *createModuleHeapProfilerLegacyPassPass();
-
-} // namespace llvm
-
-#endif
diff --git a/llvm/include/llvm/Transforms/Instrumentation/MemProfiler.h b/llvm/include/llvm/Transforms/Instrumentation/MemProfiler.h
new file mode 100644
index 0000000000000..6918a24183b0d
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Instrumentation/MemProfiler.h
@@ -0,0 +1,49 @@
+//===--------- Definition of the MemProfiler class --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MemProfiler class.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_MEMPROFILER_H
+#define LLVM_TRANSFORMS_INSTRUMENTATION_MEMPROFILER_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+/// Public interface to the memory profiler pass for instrumenting code to
+/// profile memory accesses.
+///
+/// The profiler itself is a function pass that works by inserting various
+/// calls to the MemProfiler runtime library functions. The runtime library
+/// essentially replaces malloc() and free() with custom implementations that
+/// record data about the allocations.
+class MemProfilerPass : public PassInfoMixin<MemProfilerPass> {
+public:
+  explicit MemProfilerPass();
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+/// Public interface to the memory profiler module pass for instrumenting code
+/// to profile memory allocations and accesses.
+class ModuleMemProfilerPass : public PassInfoMixin<ModuleMemProfilerPass> {
+public:
+  explicit ModuleMemProfilerPass();
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
+// Insert MemProfiler instrumentation
+FunctionPass *createMemProfilerFunctionPass();
+ModulePass *createModuleMemProfilerLegacyPassPass();
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h
index 242ffa0ede09d..8c525c6895690 100644
--- a/llvm/include/llvm/Transforms/Scalar.h
+++ b/llvm/include/llvm/Transforms/Scalar.h
@@ -240,10 +240,12 @@ FunctionPass *createReassociatePass();
 //===----------------------------------------------------------------------===//
 //
 // JumpThreading - Thread control through multi-pred/multi-succ blocks where some
-// preds always go to some succ. Thresholds other than minus one override the
-// internal BB duplication default threshold.
+// preds always go to some succ. If FreezeSelectCond is true, the condition of
+// a select that is unfolded into branches is frozen. Thresholds other than
+// minus one override the internal BB duplication default threshold.
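+//
+// A hypothetical call site (illustrative, not from this patch), requesting
+// select-condition freezing with the default duplication threshold:
+//
+//   FunctionPass *JT = createJumpThreadingPass(/*FreezeSelectCond=*/true);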
// -FunctionPass *createJumpThreadingPass(int Threshold = -1); +FunctionPass *createJumpThreadingPass(bool FreezeSelectCond = false, + int Threshold = -1); //===----------------------------------------------------------------------===// // @@ -338,6 +340,13 @@ Pass *createLoopDeletionPass(); // FunctionPass *createConstantHoistingPass(); +//===----------------------------------------------------------------------===// +// +// ConstraintElimination - This pass eliminates conditions based on found +// constraints. +// +FunctionPass *createConstraintEliminationPass(); + //===----------------------------------------------------------------------===// // // Sink - Code Sinking diff --git a/llvm/include/llvm/Transforms/Scalar/AlignmentFromAssumptions.h b/llvm/include/llvm/Transforms/Scalar/AlignmentFromAssumptions.h index be119b8ab8552..10b6e1c6a21b6 100644 --- a/llvm/include/llvm/Transforms/Scalar/AlignmentFromAssumptions.h +++ b/llvm/include/llvm/Transforms/Scalar/AlignmentFromAssumptions.h @@ -37,9 +37,9 @@ struct AlignmentFromAssumptionsPass ScalarEvolution *SE = nullptr; DominatorTree *DT = nullptr; - bool extractAlignmentInfo(CallInst *I, Value *&AAPtr, const SCEV *&AlignSCEV, - const SCEV *&OffSCEV); - bool processAssumption(CallInst *I); + bool extractAlignmentInfo(CallInst *I, unsigned Idx, Value *&AAPtr, + const SCEV *&AlignSCEV, const SCEV *&OffSCEV); + bool processAssumption(CallInst *I, unsigned Idx); }; } diff --git a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h index 327bf6d00c479..b5b907471cd72 100644 --- a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h +++ b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h @@ -91,9 +91,10 @@ class JumpThreadingPass : public PassInfoMixin { unsigned BBDupThreshold; unsigned DefaultBBDupThreshold; + bool InsertFreezeWhenUnfoldingSelect; public: - JumpThreadingPass(int T = -1); + JumpThreadingPass(bool InsertFreezeWhenUnfoldingSelect = false, int T = -1); // Glue for old PM. bool runImpl(Function &F, TargetLibraryInfo *TLI_, LazyValueInfo *LVI_, diff --git a/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h b/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h index 751c1832ba6c3..821de6c70aa01 100644 --- a/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h @@ -41,6 +41,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" @@ -233,9 +234,11 @@ class FunctionToLoopPassAdaptor : public PassInfoMixin> { public: explicit FunctionToLoopPassAdaptor(LoopPassT Pass, bool UseMemorySSA = false, + bool UseBlockFrequencyInfo = false, bool DebugLogging = false) : Pass(std::move(Pass)), LoopCanonicalizationFPM(DebugLogging), - UseMemorySSA(UseMemorySSA) { + UseMemorySSA(UseMemorySSA), + UseBlockFrequencyInfo(UseBlockFrequencyInfo) { LoopCanonicalizationFPM.addPass(LoopSimplifyPass()); LoopCanonicalizationFPM.addPass(LCSSAPass()); } @@ -267,6 +270,9 @@ class FunctionToLoopPassAdaptor MemorySSA *MSSA = UseMemorySSA ? (&AM.getResult(F).getMSSA()) : nullptr; + BlockFrequencyInfo *BFI = UseBlockFrequencyInfo && F.hasProfileData() + ? 
(&AM.getResult(F)) + : nullptr; LoopStandardAnalysisResults LAR = {AM.getResult(F), AM.getResult(F), AM.getResult(F), @@ -274,6 +280,7 @@ class FunctionToLoopPassAdaptor AM.getResult(F), AM.getResult(F), AM.getResult(F), + BFI, MSSA}; // Setup the loop analysis manager from its proxy. It is important that @@ -370,6 +377,8 @@ class FunctionToLoopPassAdaptor PA.preserve(); PA.preserve(); PA.preserve(); + if (UseBlockFrequencyInfo && F.hasProfileData()) + PA.preserve(); if (UseMemorySSA) PA.preserve(); // FIXME: What we really want to do here is preserve an AA category, but @@ -389,6 +398,7 @@ class FunctionToLoopPassAdaptor FunctionPassManager LoopCanonicalizationFPM; bool UseMemorySSA = false; + bool UseBlockFrequencyInfo = false; }; /// A function to deduce a loop pass type and wrap it in the templated @@ -396,9 +406,10 @@ class FunctionToLoopPassAdaptor template FunctionToLoopPassAdaptor createFunctionToLoopPassAdaptor(LoopPassT Pass, bool UseMemorySSA = false, + bool UseBlockFrequencyInfo = false, bool DebugLogging = false) { - return FunctionToLoopPassAdaptor(std::move(Pass), UseMemorySSA, - DebugLogging); + return FunctionToLoopPassAdaptor( + std::move(Pass), UseMemorySSA, UseBlockFrequencyInfo, DebugLogging); } /// Pass for printing a loop's contents as textual IR. diff --git a/llvm/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h b/llvm/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h index 4e47ff70d5574..22b2e649e4d48 100644 --- a/llvm/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h +++ b/llvm/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h @@ -17,6 +17,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/CommandLine.h" namespace llvm { @@ -31,6 +32,8 @@ struct LowerExpectIntrinsicPass : PassInfoMixin { PreservedAnalyses run(Function &F, FunctionAnalysisManager &); }; +extern cl::opt LikelyBranchWeight; +extern cl::opt UnlikelyBranchWeight; } #endif diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 70c8c84c857bf..d741b5142e5bf 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -26,6 +26,7 @@ class AAResults; class AliasSet; class AliasSetTracker; class BasicBlock; +class BlockFrequencyInfo; class IRBuilderBase; class Loop; class LoopInfo; @@ -38,7 +39,6 @@ class ScalarEvolution; class SCEV; class SCEVExpander; class TargetLibraryInfo; -class TargetTransformInfo; class LPPassManager; class Instruction; struct RuntimeCheckingPtrGroup; @@ -123,12 +123,13 @@ struct SinkAndHoistLICMFlags { /// reverse depth first order w.r.t the DominatorTree. This allows us to visit /// uses before definitions, allowing us to sink a loop body in one pass without /// iteration. Takes DomTreeNode, AAResults, LoopInfo, DominatorTree, -/// TargetLibraryInfo, Loop, AliasSet information for all +/// BlockFrequencyInfo, TargetLibraryInfo, Loop, AliasSet information for all /// instructions of the loop and loop safety information as /// arguments. Diagnostics is emitted via \p ORE. It returns changed status. 
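/// (Note: the BlockFrequencyInfo pointer may be null; the loop pass adaptor
/// above only provides it when UseBlockFrequencyInfo is set and the function
/// has profile data.)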
bool sinkRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *,
-                TargetLibraryInfo *, TargetTransformInfo *, Loop *,
-                AliasSetTracker *, MemorySSAUpdater *, ICFLoopSafetyInfo *,
+                BlockFrequencyInfo *, TargetLibraryInfo *,
+                TargetTransformInfo *, Loop *, AliasSetTracker *,
+                MemorySSAUpdater *, ICFLoopSafetyInfo *,
                 SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *);

 /// Walk the specified region of the CFG (defined by all blocks
@@ -136,13 +137,14 @@ bool sinkRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *,
 /// first order w.r.t the DominatorTree. This allows us to visit definitions
 /// before uses, allowing us to hoist a loop body in one pass without iteration.
 /// Takes DomTreeNode, AAResults, LoopInfo, DominatorTree,
-/// TargetLibraryInfo, Loop, AliasSet information for all instructions of the
-/// loop and loop safety information as arguments. Diagnostics is emitted via \p
-/// ORE. It returns changed status.
+/// BlockFrequencyInfo, TargetLibraryInfo, Loop, AliasSet information for all
+/// instructions of the loop and loop safety information as arguments.
+/// Diagnostics are emitted via \p ORE. It returns changed status.
 bool hoistRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *,
-                 TargetLibraryInfo *, Loop *, AliasSetTracker *,
-                 MemorySSAUpdater *, ScalarEvolution *, ICFLoopSafetyInfo *,
-                 SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *);
+                 BlockFrequencyInfo *, TargetLibraryInfo *, Loop *,
+                 AliasSetTracker *, MemorySSAUpdater *, ScalarEvolution *,
+                 ICFLoopSafetyInfo *, SinkAndHoistLICMFlags &,
+                 OptimizationRemarkEmitter *);

 /// This function deletes dead loops. The caller of this function needs to
 /// guarantee that the loop is in fact dead.
diff --git a/llvm/include/llvm/Transforms/Utils/LoopVersioning.h b/llvm/include/llvm/Transforms/Utils/LoopVersioning.h
index ac6cee637a46d..13321e498c97f 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopVersioning.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopVersioning.h
@@ -25,7 +25,6 @@ namespace llvm {
 class Loop;
 class LoopAccessInfo;
 class LoopInfo;
-class ScalarEvolution;
 struct RuntimeCheckingPtrGroup;
 typedef std::pair<const RuntimeCheckingPtrGroup *,
                   const RuntimeCheckingPtrGroup *>
     RuntimePointerCheck;
diff --git a/llvm/include/llvm/Transforms/Utils/LowerSwitch.h b/llvm/include/llvm/Transforms/Utils/LowerSwitch.h
new file mode 100644
index 0000000000000..97086987ffcbd
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Utils/LowerSwitch.h
@@ -0,0 +1,26 @@
+//===- LowerSwitch.h - Eliminate Switch instructions ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The LowerSwitch transformation rewrites switch instructions with a sequence
+// of branches, which allows targets to get away with not implementing the
+// switch instruction until it is convenient.
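+//
+// For example (illustrative IR), a switch such as
+//   switch i32 %v, label %default [ i32 0, label %a
+//                                   i32 1, label %b ]
+// becomes a chain of compare-and-branch pairs along the lines of
+//   %c0 = icmp eq i32 %v, 0
+//   br i1 %c0, label %a, label %next0
+// with the last comparison branching to %default on failure.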
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_LOWERSWITCH_H +#define LLVM_TRANSFORMS_UTILS_LOWERSWITCH_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { +struct LowerSwitchPass : public PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; +} // namespace llvm + +#endif // LLVM_TRANSFORMS_UTILS_LOWERSWITCH_H diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h index 46f6ca0462f8b..fb3a7490346f4 100644 --- a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h +++ b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h @@ -25,7 +25,7 @@ struct SimplifyCFGOptions { bool ForwardSwitchCondToPhi = false; bool ConvertSwitchToLookupTable = false; bool NeedCanonicalLoop = true; - bool HoistCommonInsts = true; + bool HoistCommonInsts = false; bool SinkCommonInsts = false; bool SimplifyCondBranch = true; bool FoldTwoEntryPHINode = true; diff --git a/llvm/include/llvm/Transforms/Utils/UnifyFunctionExitNodes.h b/llvm/include/llvm/Transforms/Utils/UnifyFunctionExitNodes.h index ff70446e163d4..a9fe808cb4552 100644 --- a/llvm/include/llvm/Transforms/Utils/UnifyFunctionExitNodes.h +++ b/llvm/include/llvm/Transforms/Utils/UnifyFunctionExitNodes.h @@ -7,10 +7,7 @@ //===----------------------------------------------------------------------===// // // This pass is used to ensure that functions have at most one return and one -// unwind instruction in them. Additionally, it keeps track of which node is -// the new exit node of the CFG. If there are no return or unwind instructions -// in the function, the getReturnBlock/getUnwindBlock methods will return a null -// pointer. +// unreachable instruction in them. // //===----------------------------------------------------------------------===// @@ -23,10 +20,9 @@ namespace llvm { class BasicBlock; -struct UnifyFunctionExitNodes : public FunctionPass { - BasicBlock *ReturnBlock = nullptr; - BasicBlock *UnwindBlock = nullptr; - BasicBlock *UnreachableBlock; +class UnifyFunctionExitNodes : public FunctionPass { + bool unifyUnreachableBlocks(Function &F); + bool unifyReturnBlocks(Function &F); public: static char ID; // Pass identification, replacement for typeid @@ -35,13 +31,6 @@ struct UnifyFunctionExitNodes : public FunctionPass { // We can preserve non-critical-edgeness when we unify function exit nodes void getAnalysisUsage(AnalysisUsage &AU) const override; - // getReturn|Unwind|UnreachableBlock - Return the new single (or nonexistent) - // return, unwind, or unreachable basic blocks in the CFG. 
- // - BasicBlock *getReturnBlock() const { return ReturnBlock; } - BasicBlock *getUnwindBlock() const { return UnwindBlock; } - BasicBlock *getUnreachableBlock() const { return UnreachableBlock; } - bool runOnFunction(Function &F) override; }; diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h index 77236dec75dc2..52a57939209cc 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -22,11 +22,11 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/None.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/IR/PassManager.h" namespace llvm { +class AAResults; class AssumptionCache; class BasicBlock; class CmpInst; @@ -34,6 +34,7 @@ class DataLayout; class DemandedBits; class DominatorTree; class Function; +class GetElementPtrInst; class InsertElementInst; class InsertValueInst; class Instruction; @@ -63,7 +64,7 @@ struct SLPVectorizerPass : public PassInfoMixin { ScalarEvolution *SE = nullptr; TargetTransformInfo *TTI = nullptr; TargetLibraryInfo *TLI = nullptr; - AliasAnalysis *AA = nullptr; + AAResults *AA = nullptr; LoopInfo *LI = nullptr; DominatorTree *DT = nullptr; AssumptionCache *AC = nullptr; @@ -75,7 +76,7 @@ struct SLPVectorizerPass : public PassInfoMixin { // Glue for old PM. bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, - TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_, + TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_); diff --git a/llvm/lib/Analysis/AliasSetTracker.cpp b/llvm/lib/Analysis/AliasSetTracker.cpp index 5cc68f05dc0ec..6f8f192d0d968 100644 --- a/llvm/lib/Analysis/AliasSetTracker.cpp +++ b/llvm/lib/Analysis/AliasSetTracker.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/GuardUtils.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryLocation.h" @@ -21,24 +20,20 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" -#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/AtomicOrdering.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include -#include -#include using namespace llvm; @@ -740,8 +735,6 @@ AliasSetTracker::ASTCallbackVH::operator=(Value *V) { namespace { class AliasSetPrinter : public FunctionPass { - AliasSetTracker *Tracker; - public: static char ID; // Pass identification, replacement for typeid @@ -756,12 +749,11 @@ namespace { bool runOnFunction(Function &F) override { auto &AAWP = getAnalysis(); - Tracker = new AliasSetTracker(AAWP.getAAResults()); + AliasSetTracker Tracker(AAWP.getAAResults()); errs() << "Alias sets for function '" << F.getName() << "':\n"; for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) - Tracker->add(&*I); - Tracker->print(errs()); - delete 
Tracker;
+      Tracker.add(&*I);
+    Tracker.print(errs());
     return false;
   }
 };
@@ -775,3 +767,16 @@ INITIALIZE_PASS_BEGIN(AliasSetPrinter, "print-alias-sets",
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_END(AliasSetPrinter, "print-alias-sets",
                     "Alias Set Printer", false, true)
+
+AliasSetsPrinterPass::AliasSetsPrinterPass(raw_ostream &OS) : OS(OS) {}
+
+PreservedAnalyses AliasSetsPrinterPass::run(Function &F,
+                                            FunctionAnalysisManager &AM) {
+  auto &AA = AM.getResult<AAManager>(F);
+  AliasSetTracker Tracker(AA);
+  OS << "Alias sets for function '" << F.getName() << "':\n";
+  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+    Tracker.add(&*I);
+  Tracker.print(OS);
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Analysis/AssumeBundleQueries.cpp b/llvm/lib/Analysis/AssumeBundleQueries.cpp
index 9539af6d9d457..0084e2f13f5f9 100644
--- a/llvm/lib/Analysis/AssumeBundleQueries.cpp
+++ b/llvm/lib/Analysis/AssumeBundleQueries.cpp
@@ -108,10 +108,17 @@ llvm::getKnowledgeFromBundle(CallInst &Assume,
   Result.AttrKind = Attribute::getAttrKindFromName(BOI.Tag->getKey());
   if (bundleHasArgument(BOI, ABA_WasOn))
     Result.WasOn = getValueFromBundleOpInfo(Assume, BOI, ABA_WasOn);
+  auto GetArgOr1 = [&](unsigned Idx) -> unsigned {
+    if (auto *ConstInt = dyn_cast<ConstantInt>(
+            getValueFromBundleOpInfo(Assume, BOI, ABA_Argument + Idx)))
+      return ConstInt->getZExtValue();
+    return 1;
+  };
   if (BOI.End - BOI.Begin > ABA_Argument)
-    Result.ArgValue =
-        cast<ConstantInt>(getValueFromBundleOpInfo(Assume, BOI, ABA_Argument))
-            ->getZExtValue();
+    Result.ArgValue = GetArgOr1(0);
+  if (Result.AttrKind == Attribute::Alignment)
+    if (BOI.End - BOI.Begin > ABA_Argument + 1)
+      Result.ArgValue = MinAlign(Result.ArgValue, GetArgOr1(1));
   return Result;
 }
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
index f50439bc87627..4bd45ead30d35 100644
--- a/llvm/lib/Analysis/CMakeLists.txt
+++ b/llvm/lib/Analysis/CMakeLists.txt
@@ -39,6 +39,7 @@ add_llvm_component_library(LLVMAnalysis
   CodeMetrics.cpp
   ConstantFolding.cpp
   DDG.cpp
+  ConstraintSystem.cpp
   Delinearization.cpp
   DemandedBits.cpp
   DependenceAnalysis.cpp
@@ -53,6 +54,7 @@ add_llvm_component_library(LLVMAnalysis
   GlobalsModRef.cpp
   GuardUtils.cpp
   HeatUtils.cpp
+  IRSimilarityIdentifier.cpp
   IVDescriptors.cpp
   IVUsers.cpp
   IndirectCallPromotionAnalysis.cpp
diff --git a/llvm/lib/Analysis/ConstraintSystem.cpp b/llvm/lib/Analysis/ConstraintSystem.cpp
new file mode 100644
index 0000000000000..d5b15e7587b37
--- /dev/null
+++ b/llvm/lib/Analysis/ConstraintSystem.cpp
@@ -0,0 +1,150 @@
+//===- ConstraintSystem.cpp - A system of linear constraints. ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/ConstraintSystem.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Debug.h"
+
+#include <algorithm>
+#include <string>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "constraint-system"
+
+bool ConstraintSystem::eliminateUsingFM() {
+  // Implementation of Fourier–Motzkin elimination, with some tricks from the
+  // paper Pugh, William. "The Omega test: a fast and practical integer
+  // programming algorithm for dependence analysis."
+  // Supercomputing'91: Proceedings of the 1991 ACM/
+  // IEEE conference on Supercomputing. IEEE, 1991.
+  assert(!Constraints.empty() &&
+         "should only be called for non-empty constraint systems");
+  unsigned NumVariables = Constraints[0].size();
+  SmallVector<SmallVector<int64_t, 8>, 4> NewSystem;
+
+  unsigned NumConstraints = Constraints.size();
+  uint32_t NewGCD = 1;
+  // FIXME do not use copy
+  for (unsigned R1 = 0; R1 < NumConstraints; R1++) {
+    if (Constraints[R1][1] == 0) {
+      SmallVector<int64_t, 8> NR;
+      NR.push_back(Constraints[R1][0]);
+      for (unsigned i = 2; i < NumVariables; i++) {
+        NR.push_back(Constraints[R1][i]);
+      }
+      NewSystem.push_back(std::move(NR));
+      continue;
+    }
+
+    // FIXME do not use copy
+    for (unsigned R2 = R1 + 1; R2 < NumConstraints; R2++) {
+      if (R1 == R2)
+        continue;
+
+      // FIXME: can we do better than just dropping things here?
+      if (Constraints[R2][1] == 0)
+        continue;
+
+      if ((Constraints[R1][1] < 0 && Constraints[R2][1] < 0) ||
+          (Constraints[R1][1] > 0 && Constraints[R2][1] > 0))
+        continue;
+
+      unsigned LowerR = R1;
+      unsigned UpperR = R2;
+      if (Constraints[UpperR][1] < 0)
+        std::swap(LowerR, UpperR);
+
+      SmallVector<int64_t, 8> NR;
+      for (unsigned I = 0; I < NumVariables; I++) {
+        if (I == 1)
+          continue;
+
+        int64_t M1, M2, N;
+        if (MulOverflow(Constraints[UpperR][I],
+                        ((-1) * Constraints[LowerR][1] / GCD), M1))
+          return false;
+        if (MulOverflow(Constraints[LowerR][I],
+                        (Constraints[UpperR][1] / GCD), M2))
+          return false;
+        if (AddOverflow(M1, M2, N))
+          return false;
+        NR.push_back(N);
+
+        NewGCD = APIntOps::GreatestCommonDivisor({32, (uint32_t)NR.back()},
+                                                 {32, NewGCD})
+                     .getZExtValue();
+      }
+      NewSystem.push_back(std::move(NR));
+    }
+  }
+  Constraints = std::move(NewSystem);
+  GCD = NewGCD;
+
+  return true;
+}
+
+bool ConstraintSystem::mayHaveSolutionImpl() {
+  while (!Constraints.empty() && Constraints[0].size() > 1) {
+    if (!eliminateUsingFM())
+      return true;
+  }
+
+  if (Constraints.empty() || Constraints[0].size() > 1)
+    return true;
+
+  return all_of(Constraints, [](auto &R) { return R[0] >= 0; });
+}
+
+void ConstraintSystem::dump(ArrayRef<std::string> Names) const {
+  if (Constraints.empty())
+    return;
+
+  for (auto &Row : Constraints) {
+    SmallVector<std::string, 16> Parts;
+    for (unsigned I = 1, S = Row.size(); I < S; ++I) {
+      if (Row[I] == 0)
+        continue;
+      std::string Coefficient = "";
+      if (Row[I] != 1)
+        Coefficient = std::to_string(Row[I]) + " * ";
+      Parts.push_back(Coefficient + Names[I - 1]);
+    }
+    assert(!Parts.empty() && "need to have at least some parts");
+    LLVM_DEBUG(dbgs() << join(Parts, std::string(" + "))
+                      << " <= " << std::to_string(Row[0]) << "\n");
+  }
+}
+
+void ConstraintSystem::dump() const {
+  SmallVector<std::string, 16> Names;
+  for (unsigned i = 1; i < Constraints.back().size(); ++i)
+    Names.push_back("x" + std::to_string(i));
+  LLVM_DEBUG(dbgs() << "---\n");
+  dump(Names);
+}
+
+bool ConstraintSystem::mayHaveSolution() {
+  dump();
+  bool HasSolution = mayHaveSolutionImpl();
+  LLVM_DEBUG(dbgs() << (HasSolution ? "sat" : "unsat") << "\n");
+  return HasSolution;
+}
+
+bool ConstraintSystem::isConditionImplied(SmallVector<int64_t, 8> R) {
+  // If there is no solution with the negation of R added to the system, the
+  // condition must hold based on the existing constraints.
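+  // For example (illustrative): if the system already contains x <= 5 and we
+  // ask whether x <= 10 is implied, the negated condition x >= 11 (i.e.
+  // -x <= -11) is added; { x <= 5, -x <= -11 } has no solution, so x <= 10 is
+  // implied.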
+ R = ConstraintSystem::negate(R); + + auto NewSystem = *this; + NewSystem.addVariableRow(R); + return !NewSystem.mayHaveSolution(); +} diff --git a/llvm/lib/Analysis/DemandedBits.cpp b/llvm/lib/Analysis/DemandedBits.cpp index 62e08f3f8a8ba..461fd7239905b 100644 --- a/llvm/lib/Analysis/DemandedBits.cpp +++ b/llvm/lib/Analysis/DemandedBits.cpp @@ -115,7 +115,7 @@ void DemandedBits::determineLiveOperandBits( default: break; case Instruction::Call: case Instruction::Invoke: - if (const IntrinsicInst *II = dyn_cast(UserI)) + if (const IntrinsicInst *II = dyn_cast(UserI)) { switch (II->getIntrinsicID()) { default: break; case Intrinsic::bswap: @@ -170,7 +170,16 @@ void DemandedBits::determineLiveOperandBits( } break; } + case Intrinsic::umax: + case Intrinsic::umin: + case Intrinsic::smax: + case Intrinsic::smin: + // If low bits of result are not demanded, they are also not demanded + // for the min/max operands. + AB = APInt::getBitsSetFrom(BitWidth, AOut.countTrailingZeros()); + break; } + } break; case Instruction::Add: if (AOut.isMask()) { diff --git a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp new file mode 100644 index 0000000000000..edefb4499d165 --- /dev/null +++ b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp @@ -0,0 +1,156 @@ +//===- IRSimilarityIdentifier.cpp - Find similarity in a module -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// \file +// Implementation file for the IRSimilarityIdentifier for identifying +// similarities in IR including the IRInstructionMapper. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/IRSimilarityIdentifier.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/User.h" + +using namespace llvm; +using namespace IRSimilarity; + +IRInstructionData::IRInstructionData(Instruction &I, bool Legality) + : Inst(&I), Legal(Legality) { + // Here we collect the operands to be used to determine whether two + // instructions are similar to one another. + for (Use &OI : I.operands()) + OperVals.push_back(OI.get()); +} + +bool IRSimilarity::isClose(const IRInstructionData &A, + const IRInstructionData &B) { + return A.Legal && A.Inst->isSameOperationAs(B.Inst); +} + +// TODO: This is the same as the MachineOutliner, and should be consolidated +// into the same interface. 
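+// For instance (illustrative), for a block containing
+//   %a = add i32 %x, %y
+//   %b = add i32 %a, %y
+//   call void @f()
+// where the call is classified as Illegal, the mapping would be roughly
+// [L, L, I]: the two similar adds share one legal ID L, and I is the current
+// illegal number.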
+void IRInstructionMapper::convertToUnsignedVec( + BasicBlock &BB, std::vector &InstrList, + std::vector &IntegerMapping) { + BasicBlock::iterator It = BB.begin(); + + std::vector IntegerMappingForBB; + std::vector InstrListForBB; + + HaveLegalRange = false; + CanCombineWithPrevInstr = false; + AddedIllegalLastTime = true; + + for (BasicBlock::iterator Et = BB.end(); It != Et; ++It) { + switch (InstClassifier.visit(*It)) { + case InstrType::Legal: + mapToLegalUnsigned(It, IntegerMappingForBB, InstrListForBB); + break; + case InstrType::Illegal: + mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB); + break; + case InstrType::Invisible: + AddedIllegalLastTime = false; + break; + } + } + + if (HaveLegalRange) { + mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB, true); + InstrList.insert(InstrList.end(), InstrListForBB.begin(), + InstrListForBB.end()); + IntegerMapping.insert(IntegerMapping.end(), IntegerMappingForBB.begin(), + IntegerMappingForBB.end()); + } +} + +// TODO: This is the same as the MachineOutliner, and should be consolidated +// into the same interface. +unsigned IRInstructionMapper::mapToLegalUnsigned( + BasicBlock::iterator &It, std::vector &IntegerMappingForBB, + std::vector &InstrListForBB) { + // We added something legal, so we should unset the AddedLegalLastTime + // flag. + AddedIllegalLastTime = false; + + // If we have at least two adjacent legal instructions (which may have + // invisible instructions in between), remember that. + if (CanCombineWithPrevInstr) + HaveLegalRange = true; + CanCombineWithPrevInstr = true; + + // Get the integer for this instruction or give it the current + // LegalInstrNumber. + IRInstructionData *ID = allocateIRInstructionData(*It, true); + InstrListForBB.push_back(ID); + + // Add to the instruction list + bool WasInserted; + DenseMap::iterator + ResultIt; + std::tie(ResultIt, WasInserted) = + InstructionIntegerMap.insert(std::make_pair(ID, LegalInstrNumber)); + unsigned INumber = ResultIt->second; + + // There was an insertion. + if (WasInserted) + LegalInstrNumber++; + + IntegerMappingForBB.push_back(INumber); + + // Make sure we don't overflow or use any integers reserved by the DenseMap. + assert(LegalInstrNumber < IllegalInstrNumber && + "Instruction mapping overflow!"); + + assert(LegalInstrNumber != DenseMapInfo::getEmptyKey() && + "Tried to assign DenseMap tombstone or empty key to instruction."); + assert(LegalInstrNumber != DenseMapInfo::getTombstoneKey() && + "Tried to assign DenseMap tombstone or empty key to instruction."); + + return INumber; +} + +IRInstructionData * +IRInstructionMapper::allocateIRInstructionData(Instruction &I, bool Legality) { + return new (InstDataAllocator->Allocate()) IRInstructionData(I, Legality); +} + +// TODO: This is the same as the MachineOutliner, and should be consolidated +// into the same interface. +unsigned IRInstructionMapper::mapToIllegalUnsigned( + BasicBlock::iterator &It, std::vector &IntegerMappingForBB, + std::vector &InstrListForBB, bool End) { + // Can't combine an illegal instruction. Set the flag. + CanCombineWithPrevInstr = false; + + // Only add one illegal number per range of legal numbers. + if (AddedIllegalLastTime) + return IllegalInstrNumber; + + IRInstructionData *ID = nullptr; + if (!End) + ID = allocateIRInstructionData(*It, false); + InstrListForBB.push_back(ID); + + // Remember that we added an illegal number last time. 
+ AddedIllegalLastTime = true; + unsigned INumber = IllegalInstrNumber; + IntegerMappingForBB.push_back(IllegalInstrNumber--); + + assert(LegalInstrNumber < IllegalInstrNumber && + "Instruction mapping overflow!"); + + assert(IllegalInstrNumber != DenseMapInfo::getEmptyKey() && + "IllegalInstrNumber cannot be DenseMap tombstone or empty key!"); + + assert(IllegalInstrNumber != DenseMapInfo::getTombstoneKey() && + "IllegalInstrNumber cannot be DenseMap tombstone or empty key!"); + + return INumber; +} diff --git a/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp b/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp index dc426aaccb22a..2213cd8598b0a 100644 --- a/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp +++ b/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp @@ -67,8 +67,6 @@ class IRToNativeSizeLearning { static const size_t NumNamedFeatures = static_cast(NamedFeatureIndex::NumNamedFeatures); struct FunctionFeatures { - static std::vector> - ImportantInstructionSuccessions; static const size_t FeatureCount; std::array NamedFeatures = {0}; @@ -84,53 +82,38 @@ class IRToNativeSizeLearning { static FunctionFeatures getFunctionFeatures(Function &F, FunctionAnalysisManager &FAM); - -private: - /// Sort once the feature tuples. - struct SortFeatureTuples { - bool IsSorted = false; - SortFeatureTuples() { - std::sort(FunctionFeatures::ImportantInstructionSuccessions.begin(), - FunctionFeatures::ImportantInstructionSuccessions.end()); - IsSorted = true; - } - }; - - static llvm::ManagedStatic TupleSorter; - - static bool ensureSortedTuples() { return TupleSorter->IsSorted; } }; -llvm::ManagedStatic - IRToNativeSizeLearning::TupleSorter; // This is a point in time - we determined including these pairs of // consecutive instructions (in the IR layout available at inline time) as // features improves the model performance. We want to move away from manual // feature selection. 
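// (Each pair below is a pair of llvm::Instruction opcode numbers: {A, B}
// denotes an instruction with opcode A immediately followed, ignoring debug
// instructions, by one with opcode B.)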
-// The vector is given in opcode pairs rather than labels because 1) labels -// weren't readily available, and 2) the successions were hand - extracted -std::vector> - IRToNativeSizeLearning::FunctionFeatures::ImportantInstructionSuccessions = - {{1, 34}, {15, 27}, {53, 53}, {53, 34}, {1, 11}, {32, 2}, {2, 48}, - {28, 48}, {1, 45}, {49, 32}, {57, 56}, {55, 53}, {1, 28}, {57, 34}, - {1, 1}, {32, 28}, {32, 15}, {49, 28}, {53, 1}, {2, 53}, {48, 34}, - {28, 53}, {2, 32}, {1, 40}, {32, 48}, {29, 56}, {56, 32}, {55, 56}, - {48, 56}, {1, 31}, {33, 34}, {2, 28}, {1, 12}, {55, 1}, {31, 31}, - {65, 1}, {33, 56}, {32, 32}, {13, 13}, {1, 26}, {13, 26}, {2, 1}, - {1, 33}, {47, 49}, {64, 1}, {2, 38}, {34, 53}, {48, 2}, {55, 34}, - {34, 32}, {1, 5}, {56, 13}, {2, 2}, {2, 49}, {33, 2}, {49, 39}, - {56, 49}, {33, 49}, {32, 39}, {39, 57}, {29, 33}, {31, 34}, {32, 29}, - {47, 15}, {13, 34}, {2, 33}, {32, 49}, {49, 34}, {56, 33}, {1, 30}, - {33, 33}, {31, 33}, {2, 29}, {56, 7}, {32, 13}, {2, 55}, {56, 56}, - {2, 34}, {1, 42}, {34, 49}, {1, 20}, {32, 33}, {1, 25}, {53, 28}, - {1, 14}, {31, 49}, {28, 2}, {2, 13}, {2, 56}, {1, 32}, {56, 53}, - {65, 65}, {33, 53}, {64, 64}, {13, 2}, {34, 33}, {1, 4}, {49, 2}, - {1, 9}, {56, 1}, {33, 1}, {53, 57}, {32, 53}, {13, 56}, {32, 56}, - {55, 55}, {1, 18}, {49, 56}, {34, 34}, {1, 7}, {56, 64}, {32, 1}, - {13, 33}, {55, 28}, {49, 33}, {57, 57}, {56, 34}, {34, 56}, {33, 32}, - {32, 40}, {1, 29}, {53, 2}, {34, 1}, {32, 34}, {49, 49}, {1, 24}, - {40, 34}, {1, 13}, {38, 34}, {29, 2}, {34, 2}, {1, 39}, {1, 22}, - {1, 27}, {49, 1}, {1, 8}, {56, 2}}; +// The array is given in opcode pairs rather than labels because 1) labels +// weren't readily available, and 2) the successions were hand - extracted. +// +// This array must be sorted. +static const std::array, 137> + ImportantInstructionSuccessions{ + {{1, 1}, {1, 4}, {1, 5}, {1, 7}, {1, 8}, {1, 9}, {1, 11}, + {1, 12}, {1, 13}, {1, 14}, {1, 18}, {1, 20}, {1, 22}, {1, 24}, + {1, 25}, {1, 26}, {1, 27}, {1, 28}, {1, 29}, {1, 30}, {1, 31}, + {1, 32}, {1, 33}, {1, 34}, {1, 39}, {1, 40}, {1, 42}, {1, 45}, + {2, 1}, {2, 2}, {2, 13}, {2, 28}, {2, 29}, {2, 32}, {2, 33}, + {2, 34}, {2, 38}, {2, 48}, {2, 49}, {2, 53}, {2, 55}, {2, 56}, + {13, 2}, {13, 13}, {13, 26}, {13, 33}, {13, 34}, {13, 56}, {15, 27}, + {28, 2}, {28, 48}, {28, 53}, {29, 2}, {29, 33}, {29, 56}, {31, 31}, + {31, 33}, {31, 34}, {31, 49}, {32, 1}, {32, 2}, {32, 13}, {32, 15}, + {32, 28}, {32, 29}, {32, 32}, {32, 33}, {32, 34}, {32, 39}, {32, 40}, + {32, 48}, {32, 49}, {32, 53}, {32, 56}, {33, 1}, {33, 2}, {33, 32}, + {33, 33}, {33, 34}, {33, 49}, {33, 53}, {33, 56}, {34, 1}, {34, 2}, + {34, 32}, {34, 33}, {34, 34}, {34, 49}, {34, 53}, {34, 56}, {38, 34}, + {39, 57}, {40, 34}, {47, 15}, {47, 49}, {48, 2}, {48, 34}, {48, 56}, + {49, 1}, {49, 2}, {49, 28}, {49, 32}, {49, 33}, {49, 34}, {49, 39}, + {49, 49}, {49, 56}, {53, 1}, {53, 2}, {53, 28}, {53, 34}, {53, 53}, + {53, 57}, {55, 1}, {55, 28}, {55, 34}, {55, 53}, {55, 55}, {55, 56}, + {56, 1}, {56, 2}, {56, 7}, {56, 13}, {56, 32}, {56, 33}, {56, 34}, + {56, 49}, {56, 53}, {56, 56}, {56, 64}, {57, 34}, {57, 56}, {57, 57}, + {64, 1}, {64, 64}, {65, 1}, {65, 65}}}; // We have: 9 calculated features (the features here); 1 feature for each // instruction opcode; and 1 feature for each manually-identified sequence. @@ -140,14 +123,13 @@ std::vector> // Note that instruction opcodes start from 1. For convenience, we also have an // always 0 feature for the '0' opcode, hence the extra 1. 
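// (Illustrative arithmetic: with the 137 pairs above, this is
// 137 + (getMaxInstructionID() + 1) + NumNamedFeatures.)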
const size_t IRToNativeSizeLearning::FunctionFeatures::FeatureCount = - IRToNativeSizeLearning::FunctionFeatures::ImportantInstructionSuccessions - .size() + - getMaxInstructionID() + 1 + IRToNativeSizeLearning::NumNamedFeatures; + ImportantInstructionSuccessions.size() + getMaxInstructionID() + 1 + + IRToNativeSizeLearning::NumNamedFeatures; size_t getSize(Function &F, TargetTransformInfo &TTI) { size_t Ret = 0; - for (auto &BB : F) - for (auto &I : BB) + for (const auto &BB : F) + for (const auto &I : BB) Ret += TTI.getInstructionCost( &I, TargetTransformInfo::TargetCostKind::TCK_CodeSize); return Ret; @@ -161,8 +143,8 @@ size_t getSize(Function &F, FunctionAnalysisManager &FAM) { unsigned getMaxDominatorTreeDepth(const Function &F, const DominatorTree &Tree) { unsigned Ret = 0; - for (auto &BB : F) - if (auto *TN = Tree.getNode(&BB)) + for (const auto &BB : F) + if (const auto *TN = Tree.getNode(&BB)) Ret = std::max(Ret, TN->getLevel()); return Ret; } @@ -171,42 +153,37 @@ unsigned getMaxDominatorTreeDepth(const Function &F, IRToNativeSizeLearning::FunctionFeatures IRToNativeSizeLearning::getFunctionFeatures(Function &F, FunctionAnalysisManager &FAM) { - assert(ensureSortedTuples() && "expected lazy initialization"); + assert(llvm::is_sorted(ImportantInstructionSuccessions) && + "expected function features are sorted"); auto &DomTree = FAM.getResult(F); FunctionFeatures FF; size_t InstrCount = getMaxInstructionID() + 1; FF.InstructionHistogram.resize(InstrCount); - FF.InstructionPairHistogram.resize( - FunctionFeatures::ImportantInstructionSuccessions.size()); + FF.InstructionPairHistogram.resize(ImportantInstructionSuccessions.size()); - auto StartID = 0; - auto LastID = StartID; + int StartID = 0; + int LastID = StartID; auto getPairIndex = [](size_t a, size_t b) { - auto I = - std::find(FunctionFeatures::ImportantInstructionSuccessions.begin(), - FunctionFeatures::ImportantInstructionSuccessions.end(), - std::make_pair(a, b)); - if (I == FunctionFeatures::ImportantInstructionSuccessions.end()) + auto I = llvm::find(ImportantInstructionSuccessions, std::make_pair(a, b)); + if (I == ImportantInstructionSuccessions.end()) return -1; - return static_cast(std::distance( - FunctionFeatures::ImportantInstructionSuccessions.begin(), I)); + return static_cast( + std::distance(ImportantInstructionSuccessions.begin(), I)); }; // We don't want debug calls, because they'd just add noise. - for (auto &BB : F) { - for (auto I = BB.instructionsWithoutDebug().begin(), - E = BB.instructionsWithoutDebug().end(); - I != E; ++I) { - auto ID = I->getOpcode(); + for (const auto &BB : F) { + for (const auto &I : BB.instructionsWithoutDebug()) { + auto ID = I.getOpcode(); ++FF.InstructionHistogram[ID]; int PairIndex = getPairIndex(LastID, ID); if (PairIndex >= 0) ++FF.InstructionPairHistogram[PairIndex]; LastID = ID; - if (isa(*I)) + if (isa(I)) ++FF[NamedFeatureIndex::Calls]; } } diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 7c13b41bc7e64..7d939bb63a6b6 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -3769,10 +3769,10 @@ Value *llvm::SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, return ::SimplifyFCmpInst(Predicate, LHS, RHS, FMF, Q, RecursionLimit); } -/// See if V simplifies when its operand Op is replaced with RepOp. 
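+/// For example (illustrative), given
+///   %cmp = icmp eq i32 %x, 0
+///   %add = add i32 %x, %y
+///   %sel = select i1 %cmp, i32 %add, i32 %y
+/// replacing %x with 0 in %add yields %y, so both arms agree and %sel
+/// simplifies to %y.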
-static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, - const SimplifyQuery &Q, - unsigned MaxRecurse) { +static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, + const SimplifyQuery &Q, + bool AllowRefinement, + unsigned MaxRecurse) { // Trivial replacement. if (V == Op) return RepOp; @@ -3785,27 +3785,41 @@ static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, if (!I) return nullptr; + // Consider: + // %cmp = icmp eq i32 %x, 2147483647 + // %add = add nsw i32 %x, 1 + // %sel = select i1 %cmp, i32 -2147483648, i32 %add + // + // We can't replace %sel with %add unless we strip away the flags (which will + // be done in InstCombine). + // TODO: This is unsound, because it only catches some forms of refinement. + if (!AllowRefinement && canCreatePoison(cast(I))) + return nullptr; + + // The simplification queries below may return the original value. Consider: + // %div = udiv i32 %arg, %arg2 + // %mul = mul nsw i32 %div, %arg2 + // %cmp = icmp eq i32 %mul, %arg + // %sel = select i1 %cmp, i32 %div, i32 undef + // Replacing %arg by %mul, %div becomes "udiv i32 %mul, %arg2", which + // simplifies back to %arg. This can only happen because %mul does not + // dominate %div. To ensure a consistent return value contract, we make sure + // that this case returns nullptr as well. + auto PreventSelfSimplify = [V](Value *Simplified) { + return Simplified != V ? Simplified : nullptr; + }; + // If this is a binary operator, try to simplify it with the replaced op. if (auto *B = dyn_cast(I)) { - // Consider: - // %cmp = icmp eq i32 %x, 2147483647 - // %add = add nsw i32 %x, 1 - // %sel = select i1 %cmp, i32 -2147483648, i32 %add - // - // We can't replace %sel with %add unless we strip away the flags. - // TODO: This is an unusual limitation because better analysis results in - // worse simplification. InstCombine can do this fold more generally - // by dropping the flags. Remove this fold to save compile-time? - if (canCreatePoison(cast(I))) - return nullptr; - if (MaxRecurse) { if (B->getOperand(0) == Op) - return SimplifyBinOp(B->getOpcode(), RepOp, B->getOperand(1), Q, - MaxRecurse - 1); + return PreventSelfSimplify(SimplifyBinOp(B->getOpcode(), RepOp, + B->getOperand(1), Q, + MaxRecurse - 1)); if (B->getOperand(1) == Op) - return SimplifyBinOp(B->getOpcode(), B->getOperand(0), RepOp, Q, - MaxRecurse - 1); + return PreventSelfSimplify(SimplifyBinOp(B->getOpcode(), + B->getOperand(0), RepOp, Q, + MaxRecurse - 1)); } } @@ -3813,11 +3827,13 @@ static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, if (CmpInst *C = dyn_cast(I)) { if (MaxRecurse) { if (C->getOperand(0) == Op) - return SimplifyCmpInst(C->getPredicate(), RepOp, C->getOperand(1), Q, - MaxRecurse - 1); + return PreventSelfSimplify(SimplifyCmpInst(C->getPredicate(), RepOp, + C->getOperand(1), Q, + MaxRecurse - 1)); if (C->getOperand(1) == Op) - return SimplifyCmpInst(C->getPredicate(), C->getOperand(0), RepOp, Q, - MaxRecurse - 1); + return PreventSelfSimplify(SimplifyCmpInst(C->getPredicate(), + C->getOperand(0), RepOp, Q, + MaxRecurse - 1)); } } @@ -3827,8 +3843,8 @@ static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, SmallVector NewOps(GEP->getNumOperands()); transform(GEP->operands(), NewOps.begin(), [&](Value *V) { return V == Op ? 
RepOp : V; }); - return SimplifyGEPInst(GEP->getSourceElementType(), NewOps, Q, - MaxRecurse - 1); + return PreventSelfSimplify(SimplifyGEPInst(GEP->getSourceElementType(), + NewOps, Q, MaxRecurse - 1)); } } @@ -3865,6 +3881,13 @@ static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, return nullptr; } +Value *llvm::SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, + const SimplifyQuery &Q, + bool AllowRefinement) { + return ::SimplifyWithOpReplaced(V, Op, RepOp, Q, AllowRefinement, + RecursionLimit); +} + /// Try to simplify a select instruction when its condition operand is an /// integer comparison where one operand of the compare is a constant. static Value *simplifySelectBitTest(Value *TrueVal, Value *FalseVal, Value *X, @@ -3985,14 +4008,18 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, // arms of the select. See if substituting this value into the arm and // simplifying the result yields the same value as the other arm. if (Pred == ICmpInst::ICMP_EQ) { - if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q, MaxRecurse) == + if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q, + /* AllowRefinement */ false, MaxRecurse) == TrueVal || - SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q, MaxRecurse) == + SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q, + /* AllowRefinement */ false, MaxRecurse) == TrueVal) return FalseVal; - if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q, MaxRecurse) == + if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q, + /* AllowRefinement */ true, MaxRecurse) == FalseVal || - SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q, MaxRecurse) == + SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q, + /* AllowRefinement */ true, MaxRecurse) == FalseVal) return FalseVal; } @@ -5274,9 +5301,6 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1, // on the outer abs. if (match(Op0, m_Intrinsic(m_Value(), m_Value()))) return Op0; - // If the sign bit is clear already, then abs does not do anything. - if (isKnownNonNegative(Op0, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) - return Op0; break; case Intrinsic::smax: @@ -5440,19 +5464,44 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1, // If the arguments are the same, this is a no-op. if (Op0 == Op1) return Op0; - // If one argument is undef, return the other argument. - if (Q.isUndefValue(Op0)) - return Op1; + // Canonicalize constant operand as Op1. + if (isa(Op0)) + std::swap(Op0, Op1); + + // If an argument is undef, return the other argument. if (Q.isUndefValue(Op1)) return Op0; - // If one argument is NaN, return other or NaN appropriately. bool PropagateNaN = IID == Intrinsic::minimum || IID == Intrinsic::maximum; - if (match(Op0, m_NaN())) - return PropagateNaN ? Op0 : Op1; + bool IsMin = IID == Intrinsic::minimum || IID == Intrinsic::minnum; + + // minnum(X, nan) -> X + // maxnum(X, nan) -> X + // minimum(X, nan) -> nan + // maximum(X, nan) -> nan if (match(Op1, m_NaN())) return PropagateNaN ? Op1 : Op0; + // In the following folds, inf can be replaced with the largest finite + // float, if the ninf flag is set. 
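+  // For example (illustrative): with ninf set, maxnum(%x, FLT_MAX) folds to
+  // FLT_MAX, mirroring maxnum(%x, +inf) -> +inf.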
+ const APFloat *C; + if (match(Op1, m_APFloat(C)) && + (C->isInfinity() || (Q.CxtI->hasNoInfs() && C->isLargest()))) { + // minnum(X, -inf) -> -inf + // maxnum(X, +inf) -> +inf + // minimum(X, -inf) -> -inf if nnan + // maximum(X, +inf) -> +inf if nnan + if (C->isNegative() == IsMin && (!PropagateNaN || Q.CxtI->hasNoNaNs())) + return ConstantFP::get(ReturnType, *C); + + // minnum(X, +inf) -> X if nnan + // maxnum(X, -inf) -> X if nnan + // minimum(X, +inf) -> X + // maximum(X, -inf) -> X + if (C->isNegative() != IsMin && (PropagateNaN || Q.CxtI->hasNoNaNs())) + return Op0; + } + // Min/max of the same operation with common operand: // m(m(X, Y)), X --> m(X, Y) (4 commuted variants) if (auto *M0 = dyn_cast(Op0)) @@ -5464,20 +5513,6 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1, (M1->getOperand(0) == Op0 || M1->getOperand(1) == Op0)) return Op1; - // min(X, -Inf) --> -Inf (and commuted variant) - // max(X, +Inf) --> +Inf (and commuted variant) - bool UseNegInf = IID == Intrinsic::minnum || IID == Intrinsic::minimum; - const APFloat *C; - if ((match(Op0, m_APFloat(C)) && C->isInfinity() && - C->isNegative() == UseNegInf) || - (match(Op1, m_APFloat(C)) && C->isInfinity() && - C->isNegative() == UseNegInf)) - return ConstantFP::getInfinity(ReturnType, UseNegInf); - - // TODO: minnum(nnan x, inf) -> x - // TODO: minnum(nnan ninf x, flt_max) -> x - // TODO: maxnum(nnan x, -inf) -> x - // TODO: maxnum(nnan ninf x, -flt_max) -> x break; } default: diff --git a/llvm/lib/Analysis/LazyCallGraph.cpp b/llvm/lib/Analysis/LazyCallGraph.cpp index efded17cef4e3..b3658999e7fef 100644 --- a/llvm/lib/Analysis/LazyCallGraph.cpp +++ b/llvm/lib/Analysis/LazyCallGraph.cpp @@ -1595,8 +1595,6 @@ void LazyCallGraph::updateGraphPtrs() { } LazyCallGraph::Node &LazyCallGraph::createNode(Function &F) { - assert(!lookup(F) && "node already exists"); - Node &N = get(F); NodeMap[&F] = &N; N.DFSNumber = N.LowLink = -1; diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index 04e04a8053e87..75b8f31c8a312 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -365,6 +365,11 @@ void Lint::visitCallBase(CallBase &I) { visitMemoryReference(I, I.getArgOperand(0), MemoryLocation::UnknownSize, None, nullptr, MemRef::Read | MemRef::Write); break; + case Intrinsic::get_active_lane_mask: + if (auto *TripCount = dyn_cast(I.getArgOperand(1))) + Assert(!TripCount->isZero(), "get_active_lane_mask: operand #2 " + "must be greater than 0", &I); + break; } } diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp index 6ba247a87c226..47b08a61ccb2a 100644 --- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp +++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp @@ -29,7 +29,11 @@ #include "llvm/ADT/BreadthFirstIterator.h" #include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -145,7 +149,7 @@ IndexedReference::IndexedReference(Instruction &StoreOrLoadInst, Optional IndexedReference::hasSpacialReuse(const IndexedReference &Other, unsigned CLS, - AliasAnalysis &AA) const { + AAResults &AA) const { assert(IsValid && "Expecting a valid reference"); if (BasePointer != Other.getBasePointer() && !isAliased(Other, AA)) { @@ -202,7 +206,7 @@ Optional 
IndexedReference::hasTemporalReuse(const IndexedReference &Other, unsigned MaxDistance, const Loop &L, DependenceInfo &DI, - AliasAnalysis &AA) const { + AAResults &AA) const { assert(IsValid && "Expecting a valid reference"); if (BasePointer != Other.getBasePointer() && !isAliased(Other, AA)) { @@ -457,7 +461,7 @@ bool IndexedReference::isSimpleAddRecurrence(const SCEV &Subscript, } bool IndexedReference::isAliased(const IndexedReference &Other, - AliasAnalysis &AA) const { + AAResults &AA) const { const auto &Loc1 = MemoryLocation::get(&StoreOrLoadInst); const auto &Loc2 = MemoryLocation::get(&Other.StoreOrLoadInst); return AA.isMustAlias(Loc1, Loc2); @@ -476,7 +480,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const CacheCost &CC) { CacheCost::CacheCost(const LoopVectorTy &Loops, const LoopInfo &LI, ScalarEvolution &SE, TargetTransformInfo &TTI, - AliasAnalysis &AA, DependenceInfo &DI, + AAResults &AA, DependenceInfo &DI, Optional TRT) : Loops(Loops), TripCounts(), LoopCosts(), TRT((TRT == None) ? Optional(TemporalReuseThreshold) : TRT), diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index 2428d57d2809f..a19c1d78526b2 100644 --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -166,6 +166,12 @@ static ModRefInfo GetLocation(const Instruction *Inst, MemoryLocation &Loc, // These intrinsics don't really modify the memory, but returning Mod // will allow them to be handled conservatively. return ModRefInfo::Mod; + case Intrinsic::masked_load: + Loc = MemoryLocation::getForArgument(II, 0, TLI); + return ModRefInfo::Ref; + case Intrinsic::masked_store: + Loc = MemoryLocation::getForArgument(II, 1, TLI); + return ModRefInfo::Mod; default: break; } @@ -442,7 +448,9 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( if (IntrinsicInst *II = dyn_cast(Inst)) { // If we reach a lifetime begin or end marker, then the query ends here // because the value is undefined. - if (II->getIntrinsicID() == Intrinsic::lifetime_start) { + Intrinsic::ID ID = II->getIntrinsicID(); + switch (ID) { + case Intrinsic::lifetime_start: // FIXME: This only considers queries directly on the invariant-tagged // pointer, not on query pointers that are indexed off of them. 
It'd // be nice to handle that at some point (the right approach is to use @@ -450,6 +458,19 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( if (BatchAA.isMustAlias(MemoryLocation(II->getArgOperand(1)), MemLoc)) return MemDepResult::getDef(II); continue; + case Intrinsic::masked_load: + case Intrinsic::masked_store: { + MemoryLocation Loc; + /*ModRefInfo MR =*/ GetLocation(II, Loc, TLI); + AliasResult R = BatchAA.alias(Loc, MemLoc); + if (R == NoAlias) + continue; + if (R == MustAlias) + return MemDepResult::getDef(II); + if (ID == Intrinsic::masked_load) + continue; + return MemDepResult::getClobber(II); + } } } diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp index 9694036ce4767..fcea03a118bfc 100644 --- a/llvm/lib/Analysis/MemoryLocation.cpp +++ b/llvm/lib/Analysis/MemoryLocation.cpp @@ -176,6 +176,21 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, cast(II->getArgOperand(0))->getZExtValue()), AATags); + case Intrinsic::masked_load: + assert(ArgIdx == 0 && "Invalid argument index"); + return MemoryLocation( + Arg, + LocationSize::upperBound(DL.getTypeStoreSize(II->getType())), + AATags); + + case Intrinsic::masked_store: + assert(ArgIdx == 1 && "Invalid argument index"); + return MemoryLocation( + Arg, + LocationSize::upperBound( + DL.getTypeStoreSize(II->getArgOperand(0)->getType())), + AATags); + case Intrinsic::invariant_end: // The first argument to an invariant.end is a "descriptor" type (e.g. a // pointer to a empty struct) which is never actually dereferenced. diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp index f54f04460a4d7..14fa11988362d 100644 --- a/llvm/lib/Analysis/MemorySSA.cpp +++ b/llvm/lib/Analysis/MemorySSA.cpp @@ -603,13 +603,13 @@ template class ClobberWalker { void addSearches(MemoryPhi *Phi, SmallVectorImpl &PausedSearches, ListIndex PriorNode) { - auto UpwardDefsBegin = upward_defs_begin({Phi, Paths[PriorNode].Loc}, DT); + auto UpwardDefsBegin = upward_defs_begin({Phi, Paths[PriorNode].Loc}, DT, + &PerformedPhiTranslation); auto UpwardDefs = make_range(UpwardDefsBegin, upward_defs_end()); for (const MemoryAccessPair &P : UpwardDefs) { PausedSearches.push_back(Paths.size()); Paths.emplace_back(P.second, P.first, PriorNode); } - PerformedPhiTranslation |= UpwardDefsBegin.performedPhiTranslation(); } /// Represents a search that terminated after finding a clobber. This clobber diff --git a/llvm/lib/Analysis/MemorySSAUpdater.cpp b/llvm/lib/Analysis/MemorySSAUpdater.cpp index 19f434f82cc66..f633fbe4e12b2 100644 --- a/llvm/lib/Analysis/MemorySSAUpdater.cpp +++ b/llvm/lib/Analysis/MemorySSAUpdater.cpp @@ -342,6 +342,8 @@ void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) { SmallVector FixupList(InsertedPHIs.begin(), InsertedPHIs.end()); + SmallSet ExistingPhis; + // Remember the index where we may insert new phis. unsigned NewPhiIndex = InsertedPHIs.size(); if (!DefBeforeSameBlock) { @@ -382,6 +384,8 @@ void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) { if (!MPhi) { MPhi = MSSA->createMemoryPhi(BBIDF); NewInsertedPHIs.push_back(MPhi); + } else { + ExistingPhis.insert(MPhi); } // Add the phis created into the IDF blocks to NonOptPhis, so they are not // optimized out as trivial by the call to getPreviousDefFromEnd below. 
@@ -447,6 +451,13 @@ void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) { if (Phi) MSSA->renamePass(Phi->getBlock(), nullptr, Visited); } + // Existing Phi blocks may need renaming too, if an access was previously + // optimized and the inserted Defs "covers" the Optimized value. + for (auto &MP : ExistingPhis) { + MemoryPhi *Phi = dyn_cast_or_null(MP); + if (Phi) + MSSA->renamePass(Phi->getBlock(), nullptr, Visited); + } } } @@ -1322,6 +1333,7 @@ void MemorySSAUpdater::removeMemoryAccess(MemoryAccess *MA, bool OptimizePhis) { // Note: We assume MemorySSA is not used in metadata since it's not really // part of the IR. + assert(NewDefTarget != MA && "Going into an infinite loop"); while (!MA->use_empty()) { Use &U = *MA->use_begin(); if (auto *MUD = dyn_cast(U.getUser())) diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 40d89fff04587..e571bad59f3a6 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -5912,7 +5912,7 @@ bool ScalarEvolution::isAddRecNeverPoison(const Instruction *I, const Loop *L) { const Instruction *Poison = PoisonStack.pop_back_val(); for (auto *PoisonUser : Poison->users()) { - if (propagatesPoison(cast(PoisonUser))) { + if (propagatesPoison(cast(PoisonUser))) { if (Pushed.insert(cast(PoisonUser)).second) PoisonStack.push_back(cast(PoisonUser)); } else if (auto *BI = dyn_cast(PoisonUser)) { @@ -6392,8 +6392,9 @@ unsigned ScalarEvolution::getSmallConstantTripCount(const Loop *L) { return 0; } -unsigned ScalarEvolution::getSmallConstantTripCount(const Loop *L, - BasicBlock *ExitingBlock) { +unsigned +ScalarEvolution::getSmallConstantTripCount(const Loop *L, + const BasicBlock *ExitingBlock) { assert(ExitingBlock && "Must pass a non-null exiting block!"); assert(L->isLoopExiting(ExitingBlock) && "Exiting block must actually branch out of the loop!"); @@ -6430,7 +6431,7 @@ unsigned ScalarEvolution::getSmallConstantTripMultiple(const Loop *L) { /// that control exits the loop via ExitingBlock. unsigned ScalarEvolution::getSmallConstantTripMultiple(const Loop *L, - BasicBlock *ExitingBlock) { + const BasicBlock *ExitingBlock) { assert(ExitingBlock && "Must pass a non-null exiting block!"); assert(L->isLoopExiting(ExitingBlock) && "Exiting block must actually branch out of the loop!"); @@ -6461,7 +6462,7 @@ ScalarEvolution::getSmallConstantTripMultiple(const Loop *L, } const SCEV *ScalarEvolution::getExitCount(const Loop *L, - BasicBlock *ExitingBlock, + const BasicBlock *ExitingBlock, ExitCountKind Kind) { switch (Kind) { case Exact: @@ -6790,7 +6791,7 @@ ScalarEvolution::BackedgeTakenInfo::getExact(const Loop *L, ScalarEvolution *SE, /// Get the exact not taken count for this loop exit. 
const SCEV * -ScalarEvolution::BackedgeTakenInfo::getExact(BasicBlock *ExitingBlock, +ScalarEvolution::BackedgeTakenInfo::getExact(const BasicBlock *ExitingBlock, ScalarEvolution *SE) const { for (auto &ENT : ExitNotTaken) if (ENT.ExitingBlock == ExitingBlock && ENT.hasAlwaysTruePredicate()) @@ -6800,7 +6801,7 @@ ScalarEvolution::BackedgeTakenInfo::getExact(BasicBlock *ExitingBlock, } const SCEV * -ScalarEvolution::BackedgeTakenInfo::getMax(BasicBlock *ExitingBlock, +ScalarEvolution::BackedgeTakenInfo::getMax(const BasicBlock *ExitingBlock, ScalarEvolution *SE) const { for (auto &ENT : ExitNotTaken) if (ENT.ExitingBlock == ExitingBlock && ENT.hasAlwaysTruePredicate()) @@ -8036,22 +8037,22 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { if (const SCEVUnknown *SU = dyn_cast(V)) { if (Instruction *I = dyn_cast(SU->getValue())) { if (PHINode *PN = dyn_cast(I)) { - const Loop *LI = this->LI[I->getParent()]; + const Loop *CurrLoop = this->LI[I->getParent()]; // Looking for loop exit value. - if (LI && LI->getParentLoop() == L && - PN->getParent() == LI->getHeader()) { + if (CurrLoop && CurrLoop->getParentLoop() == L && + PN->getParent() == CurrLoop->getHeader()) { // Okay, there is no closed form solution for the PHI node. Check // to see if the loop that contains it has a known backedge-taken // count. If so, we may be able to force computation of the exit // value. - const SCEV *BackedgeTakenCount = getBackedgeTakenCount(LI); + const SCEV *BackedgeTakenCount = getBackedgeTakenCount(CurrLoop); // This trivial case can show up in some degenerate cases where // the incoming IR has not yet been fully simplified. if (BackedgeTakenCount->isZero()) { Value *InitValue = nullptr; bool MultipleInitValues = false; for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) { - if (!LI->contains(PN->getIncomingBlock(i))) { + if (!CurrLoop->contains(PN->getIncomingBlock(i))) { if (!InitValue) InitValue = PN->getIncomingValue(i); else if (InitValue != PN->getIncomingValue(i)) { @@ -8069,17 +8070,18 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { isKnownPositive(BackedgeTakenCount) && PN->getNumIncomingValues() == 2) { - unsigned InLoopPred = LI->contains(PN->getIncomingBlock(0)) ? 0 : 1; + unsigned InLoopPred = + CurrLoop->contains(PN->getIncomingBlock(0)) ? 0 : 1; Value *BackedgeVal = PN->getIncomingValue(InLoopPred); - if (LI->isLoopInvariant(BackedgeVal)) + if (CurrLoop->isLoopInvariant(BackedgeVal)) return getSCEV(BackedgeVal); } if (auto *BTCC = dyn_cast(BackedgeTakenCount)) { // Okay, we know how many times the containing loop executes. If // this is a constant evolving PHI node, get the final value at // the specified iteration number. 
- Constant *RV = - getConstantEvolutionLoopExitValue(PN, BTCC->getAPInt(), LI); + Constant *RV = getConstantEvolutionLoopExitValue( + PN, BTCC->getAPInt(), CurrLoop); if (RV) return getSCEV(RV); } } @@ -8135,9 +8137,10 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { if (const CmpInst *CI = dyn_cast(I)) C = ConstantFoldCompareInstOperands(CI->getPredicate(), Operands[0], Operands[1], DL, &TLI); - else if (const LoadInst *LI = dyn_cast(I)) { - if (!LI->isVolatile()) - C = ConstantFoldLoadFromConstPtr(Operands[0], LI->getType(), DL); + else if (const LoadInst *Load = dyn_cast(I)) { + if (!Load->isVolatile()) + C = ConstantFoldLoadFromConstPtr(Operands[0], Load->getType(), + DL); } else C = ConstantFoldInstOperands(I, Operands, DL, &TLI); if (!C) return V; @@ -8733,18 +8736,19 @@ ScalarEvolution::howFarToNonZero(const SCEV *V, const Loop *L) { return getCouldNotCompute(); } -std::pair -ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(BasicBlock *BB) { +std::pair +ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(const BasicBlock *BB) + const { // If the block has a unique predecessor, then there is no path from the // predecessor to the block that does not go through the direct edge // from the predecessor to the block. - if (BasicBlock *Pred = BB->getSinglePredecessor()) + if (const BasicBlock *Pred = BB->getSinglePredecessor()) return {Pred, BB}; // A loop's header is defined to be a block that dominates the loop. // If the header has a unique predecessor outside the loop, it must be // a block that has exactly one successor that can reach the loop. - if (Loop *L = LI.getLoopFor(BB)) + if (const Loop *L = LI.getLoopFor(BB)) return {L->getLoopPredecessor(), L->getHeader()}; return {nullptr, nullptr}; @@ -9317,14 +9321,14 @@ bool ScalarEvolution::isKnownPredicateViaSplitting(ICmpInst::Predicate Pred, isKnownPredicate(CmpInst::ICMP_SLT, LHS, RHS); } -bool ScalarEvolution::isImpliedViaGuard(BasicBlock *BB, +bool ScalarEvolution::isImpliedViaGuard(const BasicBlock *BB, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { // No need to even try if we know the module has no guards. if (!HasGuards) return false; - return any_of(*BB, [&](Instruction &I) { + return any_of(*BB, [&](const Instruction &I) { using namespace llvm::PatternMatch; Value *Condition; @@ -9488,7 +9492,7 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L, } // Try to prove (Pred, LHS, RHS) using isImpliedViaGuard. - auto ProveViaGuard = [&](BasicBlock *Block) { + auto ProveViaGuard = [&](const BasicBlock *Block) { if (isImpliedViaGuard(Block, Pred, LHS, RHS)) return true; if (ProvingStrictComparison) { @@ -9505,7 +9509,7 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L, }; // Try to prove (Pred, LHS, RHS) using isImpliedCond. - auto ProveViaCond = [&](Value *Condition, bool Inverse) { + auto ProveViaCond = [&](const Value *Condition, bool Inverse) { if (isImpliedCond(Pred, LHS, RHS, Condition, Inverse)) return true; if (ProvingStrictComparison) { @@ -9524,16 +9528,15 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L, // Starting at the loop predecessor, climb up the predecessor chain, as long // as there are predecessors that can be found that have unique successors // leading to the original header. 
- for (std::pair - Pair(L->getLoopPredecessor(), L->getHeader()); - Pair.first; - Pair = getPredecessorWithUniqueSuccessorForBB(Pair.first)) { + for (std::pair Pair( + L->getLoopPredecessor(), L->getHeader()); + Pair.first; Pair = getPredecessorWithUniqueSuccessorForBB(Pair.first)) { if (ProveViaGuard(Pair.first)) return true; - BranchInst *LoopEntryPredicate = - dyn_cast(Pair.first->getTerminator()); + const BranchInst *LoopEntryPredicate = + dyn_cast(Pair.first->getTerminator()); if (!LoopEntryPredicate || LoopEntryPredicate->isUnconditional()) continue; @@ -9558,10 +9561,9 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L, return false; } -bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, - const SCEV *LHS, const SCEV *RHS, - Value *FoundCondValue, - bool Inverse) { +bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, + const SCEV *RHS, + const Value *FoundCondValue, bool Inverse) { if (!PendingLoopPredicates.insert(FoundCondValue).second) return false; @@ -9569,7 +9571,7 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, make_scope_exit([&]() { PendingLoopPredicates.erase(FoundCondValue); }); // Recursively handle And and Or conditions. - if (BinaryOperator *BO = dyn_cast(FoundCondValue)) { + if (const BinaryOperator *BO = dyn_cast(FoundCondValue)) { if (BO->getOpcode() == Instruction::And) { if (!Inverse) return isImpliedCond(Pred, LHS, RHS, BO->getOperand(0), Inverse) || @@ -9581,7 +9583,7 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, } } - ICmpInst *ICI = dyn_cast(FoundCondValue); + const ICmpInst *ICI = dyn_cast(FoundCondValue); if (!ICI) return false; // Now that we found a conditional branch that dominates the loop or controls @@ -12506,3 +12508,28 @@ bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS, MatchURemWithDivisor(getNegativeSCEV(Mul->getOperand(0))); return false; } + +const SCEV* ScalarEvolution::computeMaxBackedgeTakenCount(const Loop *L) { + SmallVector ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + // Form an expression for the maximum exit count possible for this loop. We + // merge the max and exact information to approximate a version of + // getConstantMaxBackedgeTakenCount which isn't restricted to just constants. 
+ SmallVector ExitCounts; + for (BasicBlock *ExitingBB : ExitingBlocks) { + const SCEV *ExitCount = getExitCount(L, ExitingBB); + if (isa(ExitCount)) + ExitCount = getExitCount(L, ExitingBB, + ScalarEvolution::ConstantMaximum); + if (!isa(ExitCount)) { + assert(DT.dominates(ExitingBB, L->getLoopLatch()) && + "We should only have known counts for exiting blocks that " + "dominate latch!"); + ExitCounts.push_back(ExitCount); + } + } + if (ExitCounts.empty()) + return getCouldNotCompute(); + return getUMinFromMismatchedTypes(ExitCounts); +} diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 52c88180c9ec5..2ffe4ff5a8238 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1013,6 +1013,11 @@ bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode, Type *Ty, return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags); } +bool TargetTransformInfo::preferInLoopReduction(unsigned Opcode, Type *Ty, + ReductionFlags Flags) const { + return TTIImpl->preferInLoopReduction(Opcode, Ty, Flags); +} + bool TargetTransformInfo::preferPredicatedReductionSelect( unsigned Opcode, Type *Ty, ReductionFlags Flags) const { return TTIImpl->preferPredicatedReductionSelect(Opcode, Ty, Flags); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 6e5a7195bb194..1a894959c5bd9 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -1739,6 +1739,26 @@ static void computeKnownBitsFromOperator(const Operator *I, } break; } + case Intrinsic::umin: + computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); + computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); + Known = KnownBits::umin(Known, Known2); + break; + case Intrinsic::umax: + computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); + computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); + Known = KnownBits::umax(Known, Known2); + break; + case Intrinsic::smin: + computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); + computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); + Known = KnownBits::smin(Known, Known2); + break; + case Intrinsic::smax: + computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); + computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); + Known = KnownBits::smax(Known, Known2); + break; case Intrinsic::x86_sse42_crc32_64_64: Known.Zero.setBitsFrom(32); break; @@ -1852,6 +1872,10 @@ static void computeKnownBitsFromOperator(const Operator *I, } } break; + case Instruction::Freeze: + if (isGuaranteedNotToBePoison(I->getOperand(0), Q.CxtI, Q.DT, Depth + 1)) + computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); + break; } } @@ -2557,6 +2581,13 @@ bool isKnownNonZero(const Value *V, const APInt &DemandedElts, unsigned Depth, return isKnownNonZero(Vec, DemandedVecElts, Depth, Q); } } + // Freeze + else if (const FreezeInst *FI = dyn_cast(V)) { + auto *Op = FI->getOperand(0); + if (isKnownNonZero(Op, Depth, Q) && + isGuaranteedNotToBePoison(Op, Q.CxtI, Q.DT, Depth)) + return true; + } KnownBits Known(BitWidth); computeKnownBits(V, DemandedElts, Known, Depth, Q); @@ -4840,10 +4871,13 @@ bool llvm::canCreatePoison(const Operator *Op) { return ::canCreateUndefOrPoison(Op, /*PoisonOnly=*/true); } -bool llvm::isGuaranteedNotToBeUndefOrPoison(const Value *V, - const Instruction *CtxI, - const DominatorTree *DT, - unsigned Depth) { +static bool programUndefinedIfUndefOrPoison(const Instruction *Inst, + bool PoisonOnly); + +static bool 
isGuaranteedNotToBeUndefOrPoison(const Value *V, + const Instruction *CtxI, + const DominatorTree *DT, + unsigned Depth, bool PoisonOnly) { if (Depth >= MaxAnalysisRecursionDepth) return false; @@ -4854,14 +4888,15 @@ bool llvm::isGuaranteedNotToBeUndefOrPoison(const Value *V, if (auto *C = dyn_cast(V)) { if (isa(C)) - return false; + return PoisonOnly; if (isa(C) || isa(C) || isa(V) || isa(C) || isa(C)) return true; if (C->getType()->isVectorTy() && !isa(C)) - return !C->containsConstantExpression() && !C->containsUndefElement(); + return (PoisonOnly || !C->containsUndefElement()) && + !C->containsConstantExpression(); } // Strip cast operations from a pointer value. @@ -4878,7 +4913,7 @@ bool llvm::isGuaranteedNotToBeUndefOrPoison(const Value *V, return true; auto OpCheck = [&](const Value *V) { - return isGuaranteedNotToBeUndefOrPoison(V, CtxI, DT, Depth + 1); + return isGuaranteedNotToBeUndefOrPoison(V, CtxI, DT, Depth + 1, PoisonOnly); }; if (auto *Opr = dyn_cast(V)) { @@ -4897,9 +4932,7 @@ bool llvm::isGuaranteedNotToBeUndefOrPoison(const Value *V, } if (auto *I = dyn_cast(V)) { - if (programUndefinedIfPoison(I) && I->getType()->isIntegerTy(1)) - // Note: once we have an agreement that poison is a value-wise concept, - // we can remove the isIntegerTy(1) constraint. + if (programUndefinedIfUndefOrPoison(I, PoisonOnly)) return true; } @@ -4921,12 +4954,24 @@ bool llvm::isGuaranteedNotToBeUndefOrPoison(const Value *V, while (Dominator) { auto *TI = Dominator->getBlock()->getTerminator(); + Value *Cond = nullptr; if (auto BI = dyn_cast(TI)) { - if (BI->isConditional() && BI->getCondition() == V) - return true; + if (BI->isConditional()) + Cond = BI->getCondition(); } else if (auto SI = dyn_cast(TI)) { - if (SI->getCondition() == V) + Cond = SI->getCondition(); + } + + if (Cond) { + if (Cond == V) return true; + else if (PoisonOnly && isa(Cond)) { + // For poison, we can analyze further + auto *Opr = cast(Cond); + if (propagatesPoison(Opr) && + any_of(Opr->operand_values(), [&](Value *Op) { return Op == V; })) + return true; + } } Dominator = Dominator->getIDom(); @@ -4935,6 +4980,18 @@ bool llvm::isGuaranteedNotToBeUndefOrPoison(const Value *V, return false; } +bool llvm::isGuaranteedNotToBeUndefOrPoison(const Value *V, + const Instruction *CtxI, + const DominatorTree *DT, + unsigned Depth) { + return ::isGuaranteedNotToBeUndefOrPoison(V, CtxI, DT, Depth, false); +} + +bool llvm::isGuaranteedNotToBePoison(const Value *V, const Instruction *CtxI, + const DominatorTree *DT, unsigned Depth) { + return ::isGuaranteedNotToBeUndefOrPoison(V, CtxI, DT, Depth, true); +} + OverflowResult llvm::computeOverflowForSignedAdd(const AddOperator *Add, const DataLayout &DL, AssumptionCache *AC, @@ -5028,7 +5085,7 @@ bool llvm::isGuaranteedToExecuteForEveryIteration(const Instruction *I, llvm_unreachable("Instruction not contained in its own parent basic block."); } -bool llvm::propagatesPoison(const Instruction *I) { +bool llvm::propagatesPoison(const Operator *I) { switch (I->getOpcode()) { case Instruction::Freeze: case Instruction::Select: @@ -5104,30 +5161,51 @@ bool llvm::mustTriggerUB(const Instruction *I, return false; } - -bool llvm::programUndefinedIfPoison(const Instruction *PoisonI) { - // We currently only look for uses of poison values within the same basic +static bool programUndefinedIfUndefOrPoison(const Instruction *Inst, + bool PoisonOnly) { + // We currently only look for uses of values within the same basic // block, as that makes it easier to guarantee that the uses will be - // 
executed given that PoisonI is executed. + // executed given that Inst is executed. // // FIXME: Expand this to consider uses beyond the same basic block. To do // this, look out for the distinction between post-dominance and strong // post-dominance. - const BasicBlock *BB = PoisonI->getParent(); + const BasicBlock *BB = Inst->getParent(); + + BasicBlock::const_iterator Begin = Inst->getIterator(), End = BB->end(); + + if (!PoisonOnly) { + // Be conservative & just check whether a value is passed to a noundef + // argument. + // Instructions that raise UB with a poison operand are well-defined + // or have unclear semantics when the input is partially undef. + // For example, 'udiv x, (undef | 1)' isn't UB. - // Set of instructions that we have proved will yield poison if PoisonI + for (auto &I : make_range(Begin, End)) { + if (const auto *CB = dyn_cast(&I)) { + for (unsigned i = 0; i < CB->arg_size(); ++i) { + if (CB->paramHasAttr(i, Attribute::NoUndef) && + CB->getArgOperand(i) == Inst) + return true; + } + } + if (!isGuaranteedToTransferExecutionToSuccessor(&I)) + break; + } + return false; + } + + // Set of instructions that we have proved will yield poison if Inst // does. SmallSet YieldsPoison; SmallSet Visited; - YieldsPoison.insert(PoisonI); - Visited.insert(PoisonI->getParent()); - - BasicBlock::const_iterator Begin = PoisonI->getIterator(), End = BB->end(); + YieldsPoison.insert(Inst); + Visited.insert(Inst->getParent()); unsigned Iter = 0; while (Iter++ < MaxAnalysisRecursionDepth) { for (auto &I : make_range(Begin, End)) { - if (&I != PoisonI) { + if (&I != Inst) { if (mustTriggerUB(&I, YieldsPoison)) return true; if (!isGuaranteedToTransferExecutionToSuccessor(&I)) @@ -5138,7 +5216,7 @@ bool llvm::programUndefinedIfPoison(const Instruction *PoisonI) { if (YieldsPoison.count(&I)) { for (const User *User : I.users()) { const Instruction *UserI = cast(User); - if (propagatesPoison(UserI)) + if (propagatesPoison(cast(UserI))) YieldsPoison.insert(User); } } @@ -5158,6 +5236,14 @@ bool llvm::programUndefinedIfPoison(const Instruction *PoisonI) { return false; } +bool llvm::programUndefinedIfUndefOrPoison(const Instruction *Inst) { + return ::programUndefinedIfUndefOrPoison(Inst, false); +} + +bool llvm::programUndefinedIfPoison(const Instruction *Inst) { + return ::programUndefinedIfUndefOrPoison(Inst, true); +} + static bool isKnownNonNaN(const Value *V, FastMathFlags FMF) { if (FMF.noNaNs()) return true; diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index e241300dd2e7c..34fa0f283b03c 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -416,8 +416,7 @@ void llvm::narrowShuffleMaskElts(int Scale, ArrayRef Mask, ScaledMask.clear(); for (int MaskElt : Mask) { if (MaskElt >= 0) { - assert(((uint64_t)Scale * MaskElt + (Scale - 1)) <= - std::numeric_limits::max() && + assert(((uint64_t)Scale * MaskElt + (Scale - 1)) <= INT32_MAX && "Overflowed 32-bits"); } for (int SliceElt = 0; SliceElt != Scale; ++SliceElt) @@ -863,11 +862,19 @@ Value *llvm::concatenateVectors(IRBuilderBase &Builder, } bool llvm::maskIsAllZeroOrUndef(Value *Mask) { + assert(isa(Mask->getType()) && + isa(Mask->getType()->getScalarType()) && + cast(Mask->getType()->getScalarType())->getBitWidth() == + 1 && + "Mask must be a vector of i1"); + auto *ConstMask = dyn_cast(Mask); if (!ConstMask) return false; if (ConstMask->isNullValue() || isa(ConstMask)) return true; + if (isa(ConstMask->getType())) + return false; for (unsigned I = 0, E = 
cast(ConstMask->getType())->getNumElements(); @@ -882,11 +889,19 @@ bool llvm::maskIsAllZeroOrUndef(Value *Mask) { bool llvm::maskIsAllOneOrUndef(Value *Mask) { + assert(isa(Mask->getType()) && + isa(Mask->getType()->getScalarType()) && + cast(Mask->getType()->getScalarType())->getBitWidth() == + 1 && + "Mask must be a vector of i1"); + auto *ConstMask = dyn_cast(Mask); if (!ConstMask) return false; if (ConstMask->isAllOnesValue() || isa(ConstMask)) return true; + if (isa(ConstMask->getType())) + return false; for (unsigned I = 0, E = cast(ConstMask->getType())->getNumElements(); @@ -902,6 +917,11 @@ bool llvm::maskIsAllOneOrUndef(Value *Mask) { /// TODO: This is a lot like known bits, but for /// vectors. Is there something we can common this with? APInt llvm::possiblyDemandedEltsInMask(Value *Mask) { + assert(isa(Mask->getType()) && + isa(Mask->getType()->getScalarType()) && + cast(Mask->getType()->getScalarType())->getBitWidth() == + 1 && + "Mask must be a fixed width vector of i1"); const unsigned VWidth = cast(Mask->getType())->getNumElements(); diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 0fa502f4569f4..4d69dd7dcc5d6 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -651,7 +651,7 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer { /// Read a value/type pair out of the specified record from slot 'Slot'. /// Increment Slot past the number of slots used in the record. Return true on /// failure. - bool getValueTypePair(SmallVectorImpl &Record, unsigned &Slot, + bool getValueTypePair(const SmallVectorImpl &Record, unsigned &Slot, unsigned InstNum, Value *&ResVal, Type **FullTy = nullptr) { if (Slot == Record.size()) return true; @@ -688,7 +688,7 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer { } /// Like popValue, but does not increment the Slot number. - bool getValue(SmallVectorImpl &Record, unsigned Slot, + bool getValue(const SmallVectorImpl &Record, unsigned Slot, unsigned InstNum, Type *Ty, Value *&ResVal) { ResVal = getValue(Record, Slot, InstNum, Ty); return ResVal == nullptr; @@ -696,7 +696,7 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer { /// Version of getValue that returns ResVal directly, or 0 if there is an /// error. - Value *getValue(SmallVectorImpl &Record, unsigned Slot, + Value *getValue(const SmallVectorImpl &Record, unsigned Slot, unsigned InstNum, Type *Ty) { if (Slot == Record.size()) return nullptr; unsigned ValNo = (unsigned)Record[Slot]; @@ -707,7 +707,7 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer { } /// Like getValue, but decodes signed VBRs. - Value *getValueSigned(SmallVectorImpl &Record, unsigned Slot, + Value *getValueSigned(const SmallVectorImpl &Record, unsigned Slot, unsigned InstNum, Type *Ty) { if (Slot == Record.size()) return nullptr; unsigned ValNo = (unsigned)decodeSignRotatedValue(Record[Slot]); @@ -4989,54 +4989,55 @@ Error BitcodeReader::parseFunctionBody(Function *F) { InstructionList.push_back(I); break; } - case bitc::FUNC_CODE_INST_CMPXCHG_OLD: - case bitc::FUNC_CODE_INST_CMPXCHG: { - // CMPXCHG:[ptrty, ptr, cmp, new, vol, successordering, ssid, - // failureordering?, isweak?] + case bitc::FUNC_CODE_INST_CMPXCHG_OLD: { + // CMPXCHG_OLD: [ptrty, ptr, cmp, val, vol, ordering, synchscope, + // failure_ordering?, weak?] 
+ const size_t NumRecords = Record.size(); unsigned OpNum = 0; - Value *Ptr, *Cmp, *New; + Value *Ptr = nullptr; if (getValueTypePair(Record, OpNum, NextValueNo, Ptr, &FullTy)) return error("Invalid record"); if (!isa(Ptr->getType())) return error("Cmpxchg operand is not a pointer type"); - if (BitCode == bitc::FUNC_CODE_INST_CMPXCHG) { - if (getValueTypePair(Record, OpNum, NextValueNo, Cmp, &FullTy)) - return error("Invalid record"); - } else if (popValue(Record, OpNum, NextValueNo, - getPointerElementFlatType(FullTy), Cmp)) + Value *Cmp = nullptr; + if (popValue(Record, OpNum, NextValueNo, + getPointerElementFlatType(FullTy), Cmp)) return error("Invalid record"); - else - FullTy = cast(FullTy)->getElementType(); + FullTy = cast(FullTy)->getElementType(); + + Value *New = nullptr; if (popValue(Record, OpNum, NextValueNo, Cmp->getType(), New) || - Record.size() < OpNum + 3 || Record.size() > OpNum + 5) + NumRecords < OpNum + 3 || NumRecords > OpNum + 5) return error("Invalid record"); - AtomicOrdering SuccessOrdering = getDecodedOrdering(Record[OpNum + 1]); + const AtomicOrdering SuccessOrdering = + getDecodedOrdering(Record[OpNum + 1]); if (SuccessOrdering == AtomicOrdering::NotAtomic || SuccessOrdering == AtomicOrdering::Unordered) return error("Invalid record"); - SyncScope::ID SSID = getDecodedSyncScopeID(Record[OpNum + 2]); + + const SyncScope::ID SSID = getDecodedSyncScopeID(Record[OpNum + 2]); if (Error Err = typeCheckLoadStoreInst(Cmp->getType(), Ptr->getType())) return Err; - AtomicOrdering FailureOrdering; - if (Record.size() < 7) - FailureOrdering = - AtomicCmpXchgInst::getStrongestFailureOrdering(SuccessOrdering); - else - FailureOrdering = getDecodedOrdering(Record[OpNum + 3]); - Align Alignment( + const AtomicOrdering FailureOrdering = + NumRecords < 7 + ? AtomicCmpXchgInst::getStrongestFailureOrdering(SuccessOrdering) + : getDecodedOrdering(Record[OpNum + 3]); + + const Align Alignment( TheModule->getDataLayout().getTypeStoreSize(Cmp->getType())); + I = new AtomicCmpXchgInst(Ptr, Cmp, New, Alignment, SuccessOrdering, FailureOrdering, SSID); - FullTy = StructType::get(Context, {FullTy, Type::getInt1Ty(Context)}); cast(I)->setVolatile(Record[OpNum]); + FullTy = StructType::get(Context, {FullTy, Type::getInt1Ty(Context)}); - if (Record.size() < 8) { + if (NumRecords < 8) { // Before weak cmpxchgs existed, the instruction simply returned the // value loaded from memory, so bitcode files from that era will be // expecting the first component of a modern cmpxchg. 
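Splitting the legacy record out (above) from the modern one (next hunk) lets each path be read straight through: the OLD form infers the compare type from the pointer operand, and records written before weak cmpxchg existed are upgraded by extracting element 0 of the {value, success} pair. An illustrative sketch of that upgrade using the same constructor the reader calls; the orderings, sync scope, and insertion block are placeholder assumptions, not patch code:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

// Sketch only: build a modern cmpxchg and recover the legacy result shape.
// The alignment falls back to the compare operand's store size, exactly as
// the reader above does for records that carry no alignment.
static Value *emitLegacyCmpXchg(Value *Ptr, Value *Cmp, Value *New,
                                const DataLayout &DL, BasicBlock *BB) {
  auto *CX = new AtomicCmpXchgInst(
      Ptr, Cmp, New, Align(DL.getTypeStoreSize(Cmp->getType())),
      AtomicOrdering::SequentiallyConsistent,
      AtomicCmpXchgInst::getStrongestFailureOrdering(
          AtomicOrdering::SequentiallyConsistent),
      SyncScope::System, BB); // append at the end of BB
  CX->setWeak(false);         // pre-weak bitcode: always a strong cmpxchg
  // Old readers expected just the loaded value, i.e. element 0 of the
  // {value, success} struct the instruction now returns.
  return ExtractValueInst::Create(CX, 0, "loaded", BB);
}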
@@ -5044,12 +5045,59 @@ Error BitcodeReader::parseFunctionBody(Function *F) { I = ExtractValueInst::Create(I, 0); FullTy = cast(FullTy)->getElementType(0); } else { - cast(I)->setWeak(Record[OpNum+4]); + cast(I)->setWeak(Record[OpNum + 4]); } InstructionList.push_back(I); break; } + case bitc::FUNC_CODE_INST_CMPXCHG: { + // CMPXCHG: [ptrty, ptr, cmp, val, vol, success_ordering, synchscope, + // failure_ordering, weak] + const size_t NumRecords = Record.size(); + unsigned OpNum = 0; + Value *Ptr = nullptr; + if (getValueTypePair(Record, OpNum, NextValueNo, Ptr, &FullTy)) + return error("Invalid record"); + + if (!isa(Ptr->getType())) + return error("Cmpxchg operand is not a pointer type"); + + Value *Cmp = nullptr; + if (getValueTypePair(Record, OpNum, NextValueNo, Cmp, &FullTy)) + return error("Invalid record"); + + Value *Val = nullptr; + if (popValue(Record, OpNum, NextValueNo, Cmp->getType(), Val) || + NumRecords < OpNum + 3 || NumRecords > OpNum + 5) + return error("Invalid record"); + + const AtomicOrdering SuccessOrdering = + getDecodedOrdering(Record[OpNum + 1]); + if (SuccessOrdering == AtomicOrdering::NotAtomic || + SuccessOrdering == AtomicOrdering::Unordered) + return error("Invalid record"); + + const SyncScope::ID SSID = getDecodedSyncScopeID(Record[OpNum + 2]); + + if (Error Err = typeCheckLoadStoreInst(Cmp->getType(), Ptr->getType())) + return Err; + + const AtomicOrdering FailureOrdering = + getDecodedOrdering(Record[OpNum + 3]); + + const Align Alignment( + TheModule->getDataLayout().getTypeStoreSize(Cmp->getType())); + + I = new AtomicCmpXchgInst(Ptr, Cmp, Val, Alignment, SuccessOrdering, + FailureOrdering, SSID); + FullTy = StructType::get(Context, {FullTy, Type::getInt1Ty(Context)}); + cast(I)->setVolatile(Record[OpNum]); + cast(I)->setWeak(Record[OpNum + 4]); + + InstructionList.push_back(I); + break; + } case bitc::FUNC_CODE_INST_ATOMICRMW: { // ATOMICRMW:[ptrty, ptr, val, op, vol, ordering, ssid] unsigned OpNum = 0; diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index 821185e46c046..874bb84170df2 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -17,7 +17,6 @@ #include "llvm/ADT/None.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" @@ -63,7 +62,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" @@ -75,7 +73,6 @@ #include #include #include -#include #include #include #include diff --git a/llvm/lib/Bitcode/Reader/ValueList.cpp b/llvm/lib/Bitcode/Reader/ValueList.cpp index 63a206eeb022c..ddfa28c6b1e44 100644 --- a/llvm/lib/Bitcode/Reader/ValueList.cpp +++ b/llvm/lib/Bitcode/Reader/ValueList.cpp @@ -16,14 +16,11 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" -#include "llvm/IR/ValueHandle.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include -#include #include #include -#include using namespace llvm; diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index eaea026681b1d..26874c9ac364f 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -86,6 
+86,9 @@ static cl::opt IndexThreshold("bitcode-mdindex-threshold", cl::Hidden, cl::init(25), cl::desc("Number of metadatas above which we emit an index " "to enable lazy-loading")); +static cl::opt FlushThreshold( + "bitcode-flush-threshold", cl::Hidden, cl::init(512), + cl::desc("The threshold (unit M) for flushing LLVM bitcode.")); static cl::opt WriteRelBFToSummary( "write-relbf-to-summary", cl::Hidden, cl::init(false), @@ -4453,8 +4456,8 @@ static void writeBitcodeHeader(BitstreamWriter &Stream) { Stream.Emit(0xD, 4); } -BitcodeWriter::BitcodeWriter(SmallVectorImpl &Buffer) - : Buffer(Buffer), Stream(new BitstreamWriter(Buffer)) { +BitcodeWriter::BitcodeWriter(SmallVectorImpl &Buffer, raw_fd_stream *FS) + : Buffer(Buffer), Stream(new BitstreamWriter(Buffer, FS, FlushThreshold)) { writeBitcodeHeader(*Stream); } @@ -4565,7 +4568,7 @@ void llvm::WriteBitcodeToFile(const Module &M, raw_ostream &Out, if (TT.isOSDarwin() || TT.isOSBinFormatMachO()) Buffer.insert(Buffer.begin(), BWH_HeaderSize, 0); - BitcodeWriter Writer(Buffer); + BitcodeWriter Writer(Buffer, dyn_cast(&Out)); Writer.writeModule(M, ShouldPreserveUseListOrder, Index, GenerateHash, ModHash); Writer.writeSymtab(); @@ -4575,7 +4578,8 @@ void llvm::WriteBitcodeToFile(const Module &M, raw_ostream &Out, emitDarwinBCHeaderAndTrailer(Buffer, TT); // Write the generated bitstream to "Out". - Out.write((char*)&Buffer.front(), Buffer.size()); + if (!Buffer.empty()) + Out.write((char *)&Buffer.front(), Buffer.size()); } void IndexBitcodeWriter::write() { @@ -4829,11 +4833,10 @@ void llvm::EmbedBitcodeInModule(llvm::Module &M, llvm::MemoryBufferRef Buf, std::string Data; ArrayRef ModuleData; Triple T(M.getTargetTriple()); - // Create a constant that contains the bitcode. - // In case of embedding a marker, ignore the input Buf and use the empty - // ArrayRef. It is also legal to create a bitcode marker even Buf is empty. + if (EmbedBitcode) { - if (!isBitcode((const unsigned char *)Buf.getBufferStart(), + if (Buf.getBufferSize() == 0 || + !isBitcode((const unsigned char *)Buf.getBufferStart(), (const unsigned char *)Buf.getBufferEnd())) { // If the input is LLVM Assembly, bitcode is produced by serializing // the module. Use-lists order need to be preserved in this case. 
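The flushing added here only engages when WriteBitcodeToFile is handed a stream that really is a raw_fd_stream: the dyn_cast above otherwise yields null and the writer buffers the whole module in memory as before. Once a raw_fd_stream is supplied, the BitstreamWriter can spill the buffer to disk whenever it crosses -bitcode-flush-threshold (512 MiB by default), bounding peak memory for very large modules. A minimal usage sketch under that assumption (path and function name are illustrative):

#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

// Sketch only: writing through raw_fd_stream (a seekable read/write stream)
// enables the incremental flushing added above; a plain raw_fd_ostream would
// be written in one shot from a fully materialized buffer.
static bool writeModuleIncrementally(llvm::Module &M, llvm::StringRef Path) {
  std::error_code EC;
  llvm::raw_fd_stream OS(Path, EC); // note: raw_fd_stream, not raw_fd_ostream
  if (EC)
    return false;
  llvm::WriteBitcodeToFile(M, OS);
  return true;
}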
diff --git a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp index 8bdddc27e95ab..88279569bc028 100644 --- a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -11,11 +11,9 @@ //===----------------------------------------------------------------------===// #include "ValueEnumerator.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/Argument.h" -#include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -32,7 +30,6 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" -#include "llvm/IR/UseListOrder.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueSymbolTable.h" @@ -42,12 +39,9 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include -#include #include #include #include -#include -#include using namespace llvm; diff --git a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp index d7b0ffc48f09d..4e45a0ffc60fb 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp @@ -270,7 +270,7 @@ void AccelTableWriter::emitOffsets(const MCSymbol *Base) const { continue; PrevHash = HashValue; Asm->OutStreamer->AddComment("Offset in Bucket " + Twine(i)); - Asm->emitLabelDifference(Hash->Sym, Base, sizeof(uint32_t)); + Asm->emitLabelDifference(Hash->Sym, Base, Asm->getDwarfOffsetByteSize()); } } } @@ -366,9 +366,8 @@ void Dwarf5AccelTableWriter::Header::emit( assert(CompUnitCount > 0 && "Index must have at least one CU."); AsmPrinter *Asm = Ctx.Asm; - Asm->OutStreamer->AddComment("Header: unit length"); - Asm->emitLabelDifference(Ctx.ContributionEnd, Ctx.ContributionStart, - sizeof(uint32_t)); + Asm->emitDwarfUnitLength(Ctx.ContributionEnd, Ctx.ContributionStart, + "Header: unit length"); Asm->OutStreamer->emitLabel(Ctx.ContributionStart); Asm->OutStreamer->AddComment("Header: version"); Asm->emitInt16(Version); @@ -592,10 +591,14 @@ void llvm::emitDWARF5AccelTable( } void AppleAccelTableOffsetData::emit(AsmPrinter *Asm) const { + assert(Die.getDebugSectionOffset() <= UINT32_MAX && + "The section offset exceeds the limit."); Asm->emitInt32(Die.getDebugSectionOffset()); } void AppleAccelTableTypeData::emit(AsmPrinter *Asm) const { + assert(Die.getDebugSectionOffset() <= UINT32_MAX && + "The section offset exceeds the limit."); Asm->emitInt32(Die.getDebugSectionOffset()); Asm->emitInt16(Die.getTag()); Asm->emitInt8(0); diff --git a/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp b/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp index 883aaf5aefc49..3df8e35accc4a 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp @@ -29,9 +29,7 @@ MCSymbol *AddressPool::emitHeader(AsmPrinter &Asm, MCSection *Section) { MCSymbol *BeginLabel = Asm.createTempSymbol(Prefix + "start"); MCSymbol *EndLabel = Asm.createTempSymbol(Prefix + "end"); - Asm.OutStreamer->AddComment("Length of contribution"); - Asm.emitLabelDifference(EndLabel, BeginLabel, - 4); // TODO: Support DWARF64 format. 
+ Asm.emitDwarfUnitLength(EndLabel, BeginLabel, "Length of contribution"); Asm.OutStreamer->emitLabel(BeginLabel); Asm.OutStreamer->AddComment("DWARF version number"); Asm.emitInt16(Asm.getDwarfVersion()); diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index cdacedc723217..7d8355c049693 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1023,6 +1023,46 @@ void AsmPrinter::emitFrameAlloc(const MachineInstr &MI) { MCConstantExpr::create(FrameOffset, OutContext)); } +/// Returns the BB metadata to be emitted in the bb_addr_map section for a given +/// basic block. This can be used to capture more precise profile information. +/// We use the last 3 bits (LSBs) to encode the following information: +/// * (1): set if return block (ret or tail call). +/// * (2): set if ends with a tail call. +/// * (3): set if exception handling (EH) landing pad. +/// The remaining bits are zero. +static unsigned getBBAddrMapMetadata(const MachineBasicBlock &MBB) { + const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo(); + return ((unsigned)MBB.isReturnBlock()) | + ((!MBB.empty() && TII->isTailCall(MBB.back())) << 1) | + (MBB.isEHPad() << 2); +} + +void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) { + MCSection *BBAddrMapSection = + getObjFileLowering().getBBAddrMapSection(*MF.getSection()); + assert(BBAddrMapSection && ".bb_addr_map section is not initialized."); + + const MCSymbol *FunctionSymbol = getFunctionBegin(); + + OutStreamer->PushSection(); + OutStreamer->SwitchSection(BBAddrMapSection); + OutStreamer->emitSymbolValue(FunctionSymbol, getPointerSize()); + // Emit the total number of basic blocks in this function. + OutStreamer->emitULEB128IntValue(MF.size()); + // Emit BB Information for each basic block in the function. + for (const MachineBasicBlock &MBB : MF) { + const MCSymbol *MBBSymbol = + MBB.pred_empty() ? FunctionSymbol : MBB.getSymbol(); + // Emit the basic block offset. + emitLabelDifferenceAsULEB128(MBBSymbol, FunctionSymbol); + // Emit the basic block size. When BBs have alignments, their size cannot + // always be computed from their offsets. + emitLabelDifferenceAsULEB128(MBB.getEndSymbol(), MBBSymbol); + OutStreamer->emitULEB128IntValue(getBBAddrMapMetadata(MBB)); + } + OutStreamer->PopSection(); +} + void AsmPrinter::emitStackSizeSection(const MachineFunction &MF) { if (!MF.getTarget().Options.EmitStackSizeSection) return; @@ -1142,6 +1182,11 @@ void AsmPrinter::emitFunctionBody() { emitInstruction(&MI); } break; + case TargetOpcode::DBG_INSTR_REF: + // This instruction reference will have been resolved to a machine + // location, and a nearby DBG_VALUE created. We can safely ignore + // the instruction reference. + break; case TargetOpcode::DBG_LABEL: if (isVerbose()) { if (!emitDebugLabelComment(&MI, *this)) @@ -1174,34 +1219,26 @@ void AsmPrinter::emitFunctionBody() { } // We must emit temporary symbol for the end of this basic block, if either - // we have BBLabels enabled and we want to emit size directive for the BBs, - // or if this basic blocks marks the end of a section (except the section - // containing the entry basic block as the end symbol for that section is - // CurrentFnEnd).
- if ((MAI->hasDotTypeDotSizeDirective() && MF->hasBBLabels()) || - (MBB.isEndSection() && !MBB.sameSection(&MF->front()))) + // we have BBLabels enabled or if this basic blocks marks the end of a + // section (except the section containing the entry basic block as the end + // symbol for that section is CurrentFnEnd). + if (MF->hasBBLabels() || + (MAI->hasDotTypeDotSizeDirective() && MBB.isEndSection() && + !MBB.sameSection(&MF->front()))) OutStreamer->emitLabel(MBB.getEndSymbol()); - // Helper for emitting the size directive associated with a basic block - // symbol. - auto emitELFSizeDirective = [&](MCSymbol *SymForSize) { - const MCExpr *SizeExp = MCBinaryExpr::createSub( - MCSymbolRefExpr::create(MBB.getEndSymbol(), OutContext), - MCSymbolRefExpr::create(SymForSize, OutContext), OutContext); - OutStreamer->emitELFSize(SymForSize, SizeExp); - }; - - // Emit size directive for the size of each basic block, if BBLabels is - // enabled. - if (MAI->hasDotTypeDotSizeDirective() && MF->hasBBLabels()) - emitELFSizeDirective(MBB.getSymbol()); - - // Emit size directive for the size of each basic block section once we - // get to the end of that section. if (MBB.isEndSection()) { + // The size directive for the section containing the entry block is + // handled separately by the function section. if (!MBB.sameSection(&MF->front())) { - if (MAI->hasDotTypeDotSizeDirective()) - emitELFSizeDirective(CurrentSectionBeginSym); + if (MAI->hasDotTypeDotSizeDirective()) { + // Emit the size directive for the basic block section. + const MCExpr *SizeExp = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(MBB.getEndSymbol(), OutContext), + MCSymbolRefExpr::create(CurrentSectionBeginSym, OutContext), + OutContext); + OutStreamer->emitELFSize(CurrentSectionBeginSym, SizeExp); + } MBBSectionRanges[MBB.getSectionIDNum()] = MBBSectionRange{CurrentSectionBeginSym, MBB.getEndSymbol()}; } @@ -1293,6 +1330,11 @@ void AsmPrinter::emitFunctionBody() { HI.Handler->endFunction(MF); } + // Emit section containing BB address offsets and their metadata, when + // BB labels are requested for this function. + if (MF->hasBBLabels()) + emitBBAddrMapSection(*MF); + // Emit section containing stack size metadata. 
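// Illustrative aside, not patch code: each bb_addr_map entry emitted above
// ends with the ULEB128 metadata from getBBAddrMapMetadata(). A consumer can
// unpack the three flag bits like this (struct and function names assumed):
struct BBAddrMapFlags {
  bool IsReturnBlock;  // bit 0: block ends in a return (incl. return-like tail call)
  bool EndsInTailCall; // bit 1: block ends with a tail call
  bool IsEHPad;        // bit 2: block is an exception-handling landing pad
};
static BBAddrMapFlags decodeBBAddrMapMetadata(unsigned MD) {
  return {(MD & 1u) != 0, (MD & 2u) != 0, (MD & 4u) != 0};
}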
emitStackSizeSection(*MF); @@ -1802,7 +1844,7 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) { F.hasFnAttribute("function-instrument") || F.hasFnAttribute("xray-instruction-threshold") || needFuncLabelsForEHOrDebugInfo(MF) || NeedsLocalForSize || - MF.getTarget().Options.EmitStackSizeSection) { + MF.getTarget().Options.EmitStackSizeSection || MF.hasBBLabels()) { CurrentFnBegin = createTempSymbol("func_begin"); if (NeedsLocalForSize) CurrentFnSymForSize = CurrentFnBegin; @@ -3390,3 +3432,17 @@ uint16_t AsmPrinter::getDwarfVersion() const { void AsmPrinter::setDwarfVersion(uint16_t Version) { OutStreamer->getContext().setDwarfVersion(Version); } + +bool AsmPrinter::isDwarf64() const { + return OutStreamer->getContext().getDwarfFormat() == dwarf::DWARF64; +} + +unsigned int AsmPrinter::getDwarfOffsetByteSize() const { + return dwarf::getDwarfOffsetByteSize( + OutStreamer->getContext().getDwarfFormat()); +} + +unsigned int AsmPrinter::getUnitLengthFieldByteSize() const { + return dwarf::getUnitLengthFieldByteSize( + OutStreamer->getContext().getDwarfFormat()); +} diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp index b6a9a95683603..594b41bcea53f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp @@ -27,6 +27,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" +#include using namespace llvm; #define DEBUG_TYPE "asm-printer" @@ -154,19 +155,22 @@ void AsmPrinter::emitDwarfSymbolReference(const MCSymbol *Label, if (!ForceOffset) { // On COFF targets, we have to emit the special .secrel32 directive. if (MAI->needsDwarfSectionOffsetDirective()) { + assert(!isDwarf64() && + "emitting DWARF64 is not implemented for COFF targets"); OutStreamer->EmitCOFFSecRel32(Label, /*Offset=*/0); return; } // If the format uses relocations with dwarf, refer to the symbol directly. if (MAI->doesDwarfUseRelocationsAcrossSections()) { - OutStreamer->emitSymbolValue(Label, 4); + OutStreamer->emitSymbolValue(Label, getDwarfOffsetByteSize()); return; } } // Otherwise, emit it as a label difference from the start of the section. - emitLabelDifference(Label, Label->getSection().getBeginSymbol(), 4); + emitLabelDifference(Label, Label->getSection().getBeginSymbol(), + getDwarfOffsetByteSize()); } void AsmPrinter::emitDwarfStringOffset(DwarfStringPoolEntry S) const { @@ -177,12 +181,38 @@ void AsmPrinter::emitDwarfStringOffset(DwarfStringPoolEntry S) const { } // Just emit the offset directly; no need for symbol math. 
- emitInt32(S.Offset); + OutStreamer->emitIntValue(S.Offset, getDwarfOffsetByteSize()); } void AsmPrinter::emitDwarfOffset(const MCSymbol *Label, uint64_t Offset) const { - // TODO: Support DWARF64 - emitLabelPlusOffset(Label, Offset, 4); + emitLabelPlusOffset(Label, Offset, getDwarfOffsetByteSize()); +} + +void AsmPrinter::emitDwarfLengthOrOffset(uint64_t Value) const { + assert(isDwarf64() || Value <= UINT32_MAX); + OutStreamer->emitIntValue(Value, getDwarfOffsetByteSize()); +} + +void AsmPrinter::maybeEmitDwarf64Mark() const { + if (!isDwarf64()) + return; + OutStreamer->AddComment("DWARF64 Mark"); + OutStreamer->emitInt32(dwarf::DW_LENGTH_DWARF64); +} + +void AsmPrinter::emitDwarfUnitLength(uint64_t Length, + const Twine &Comment) const { + assert(isDwarf64() || Length <= dwarf::DW_LENGTH_lo_reserved); + maybeEmitDwarf64Mark(); + OutStreamer->AddComment(Comment); + OutStreamer->emitIntValue(Length, getDwarfOffsetByteSize()); +} + +void AsmPrinter::emitDwarfUnitLength(const MCSymbol *Hi, const MCSymbol *Lo, + const Twine &Comment) const { + maybeEmitDwarf64Mark(); + OutStreamer->AddComment(Comment); + OutStreamer->emitAbsoluteSymbolDiff(Hi, Lo, getDwarfOffsetByteSize()); } void AsmPrinter::emitCallSiteOffset(const MCSymbol *Hi, const MCSymbol *Lo, diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index b388e43447835..bcace6264cd04 100644 --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -1578,11 +1578,16 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) { assert(Element->getTag() == dwarf::DW_TAG_subrange_type); const DISubrange *Subrange = cast(Element); - assert(!Subrange->getRawLowerBound() && - "codeview doesn't support subranges with lower bounds"); int64_t Count = -1; - if (auto *CI = Subrange->getCount().dyn_cast()) - Count = CI->getSExtValue(); + // Calculate the count if either LowerBound is absent or is zero and + // either of Count or UpperBound are constant. + auto *LI = Subrange->getLowerBound().dyn_cast(); + if (!Subrange->getRawLowerBound() || (LI && (LI->getSExtValue() == 0))) { + if (auto *CI = Subrange->getCount().dyn_cast()) + Count = CI->getSExtValue(); + else if (auto *UI = Subrange->getUpperBound().dyn_cast()) + Count = UI->getSExtValue() + 1; // LowerBound is zero + } // Forward declarations of arrays without a size and VLAs use a count of -1. // Emit a count of zero in these cases to match what MSVC does for arrays diff --git a/llvm/lib/CodeGen/AsmPrinter/DIE.cpp b/llvm/lib/CodeGen/AsmPrinter/DIE.cpp index f1d2551281871..39b0b027c7657 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DIE.cpp @@ -194,7 +194,7 @@ DIEAbbrev DIE::generateAbbrev() const { return Abbrev; } -unsigned DIE::getDebugSectionOffset() const { +uint64_t DIE::getDebugSectionOffset() const { const DIEUnit *Unit = getUnit(); assert(Unit && "DIE must be owned by a DIEUnit to get its absolute offset"); return Unit->getDebugSectionOffset() + getOffset(); @@ -428,10 +428,10 @@ void DIEInteger::emitValue(const AsmPrinter *Asm, dwarf::Form Form) const { /// SizeOf - Determine size of integer value in bytes. 
/// unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { - dwarf::FormParams Params = {0, 0, dwarf::DWARF32}; - if (AP) - Params = {AP->getDwarfVersion(), uint8_t(AP->getPointerSize()), - AP->OutStreamer->getContext().getDwarfFormat()}; + assert(AP && "AsmPrinter is required to set FormParams"); + dwarf::FormParams Params = {AP->getDwarfVersion(), + uint8_t(AP->getPointerSize()), + AP->OutStreamer->getContext().getDwarfFormat()}; if (Optional FixedSize = dwarf::getFixedFormByteSize(Form, Params)) return *FixedSize; @@ -476,8 +476,7 @@ unsigned DIEExpr::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { case dwarf::DW_FORM_data8: return 8; case dwarf::DW_FORM_sec_offset: - // FIXME: add support for DWARF64 - return 4; + return AP->getDwarfOffsetByteSize(); default: llvm_unreachable("DIE Value form not supported yet"); } @@ -503,10 +502,11 @@ unsigned DIELabel::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { switch (Form) { case dwarf::DW_FORM_data4: return 4; + case dwarf::DW_FORM_data8: + return 8; case dwarf::DW_FORM_sec_offset: case dwarf::DW_FORM_strp: - // FIXME: add support for DWARF64 - return 4; + return AP->getDwarfOffsetByteSize(); case dwarf::DW_FORM_addr: return AP->MAI->getCodePointerSize(); default: @@ -550,9 +550,10 @@ unsigned DIEDelta::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { switch (Form) { case dwarf::DW_FORM_data4: return 4; + case dwarf::DW_FORM_data8: + return 8; case dwarf::DW_FORM_sec_offset: - // FIXME: add support for DWARF64 - return 4; + return AP->getDwarfOffsetByteSize(); default: llvm_unreachable("DIE Value form not supported yet"); } @@ -661,7 +662,7 @@ void DIEEntry::emitValue(const AsmPrinter *AP, dwarf::Form Form) const { case dwarf::DW_FORM_ref_addr: { // Get the absolute offset for this DIE within the debug info/types section. - unsigned Addr = Entry->getDebugSectionOffset(); + uint64_t Addr = Entry->getDebugSectionOffset(); if (const MCSymbol *SectionSym = Entry->getUnit()->getCrossSectionRelativeBaseAddress()) { AP->emitLabelPlusOffset(SectionSym, Addr, SizeOf(AP, Form), true); @@ -822,10 +823,17 @@ unsigned DIELocList::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { case dwarf::DW_FORM_loclistx: return getULEB128Size(Index); case dwarf::DW_FORM_data4: + assert(!AP->isDwarf64() && + "DW_FORM_data4 is not suitable to emit a pointer to a location list " + "in the 64-bit DWARF format"); return 4; + case dwarf::DW_FORM_data8: + assert(AP->isDwarf64() && + "DW_FORM_data8 is not suitable to emit a pointer to a location list " + "in the 32-bit DWARF format"); + return 8; case dwarf::DW_FORM_sec_offset: - // FIXME: add support for DWARF64 - return 4; + return AP->getDwarfOffsetByteSize(); default: llvm_unreachable("DIE Value form not supported yet"); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 602b1bceddc3c..68386a555fdab 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -422,7 +422,10 @@ DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP) { // FIXME: duplicated from Target/WebAssembly/WebAssembly.h // don't want to depend on target specific headers in this code? const unsigned TI_GLOBAL_RELOC = 3; - if (FrameBase.Location.WasmLoc.Kind == TI_GLOBAL_RELOC) { + // FIXME: when writing dwo, we need to avoid relocations. Probably + // the "right" solution is to treat globals the way func and data symbols + // are (with entries in .debug_addr). 
+ if (FrameBase.Location.WasmLoc.Kind == TI_GLOBAL_RELOC && !isDwoUnit()) { // These need to be relocatable. assert(FrameBase.Location.WasmLoc.Index == 0); // Only SP so far. auto SPSym = cast( @@ -1346,11 +1349,9 @@ void DwarfCompileUnit::addComplexAddress(const DbgVariable &DV, DIE &Die, /// Add a Dwarf loclistptr attribute data and value. void DwarfCompileUnit::addLocationList(DIE &Die, dwarf::Attribute Attribute, unsigned Index) { - dwarf::Form Form = dwarf::DW_FORM_data4; - if (DD->getDwarfVersion() == 4) - Form =dwarf::DW_FORM_sec_offset; - if (DD->getDwarfVersion() >= 5) - Form =dwarf::DW_FORM_loclistx; + dwarf::Form Form = (DD->getDwarfVersion() >= 5) + ? dwarf::DW_FORM_loclistx + : DD->getDwarfSectionOffsetForm(); Die.addValue(DIEValueAllocator, Attribute, Form, DIELocList(Index)); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 78015897408d5..6d8186a5ee2b3 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -289,8 +289,8 @@ class DwarfCompileUnit final : public DwarfUnit { return DwarfUnit::getHeaderSize() + DWOIdSize; } unsigned getLength() { - return sizeof(uint32_t) + // Length field - getHeaderSize() + getUnitDie().getSize(); + return Asm->getUnitLengthFieldByteSize() + // Length field + getHeaderSize() + getUnitDie().getSize(); } void emitHeader(bool UseOffsets) override; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 64d57aa9402c8..94bf94c296cb0 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -218,8 +218,8 @@ static DbgValueLoc getDebugLocValue(const MachineInstr *MI) { const DIExpression *Expr = MI->getDebugExpression(); assert(MI->getNumOperands() == 4); if (MI->getDebugOperand(0).isReg()) { - auto RegOp = MI->getDebugOperand(0); - auto Op1 = MI->getDebugOffset(); + const auto &RegOp = MI->getDebugOperand(0); + const auto &Op1 = MI->getDebugOffset(); // If the second operand is an immediate, this is a // register-indirect address. assert((!Op1.isImm() || (Op1.getImm() == 0)) && "unexpected offset"); @@ -227,7 +227,7 @@ static DbgValueLoc getDebugLocValue(const MachineInstr *MI) { return DbgValueLoc(Expr, MLoc); } if (MI->getDebugOperand(0).isTargetIndex()) { - auto Op = MI->getDebugOperand(0); + const auto &Op = MI->getDebugOperand(0); return DbgValueLoc(Expr, TargetIndexLocation(Op.getIndex(), Op.getOffset())); } @@ -373,6 +373,11 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) DwarfVersion = TT.isNVPTX() ? 2 : (DwarfVersion ? DwarfVersion : dwarf::DWARF_VERSION); + bool Dwarf64 = Asm->TM.Options.MCOptions.Dwarf64 && + DwarfVersion >= 3 && // DWARF64 was introduced in DWARFv3. + TT.isArch64Bit() && // DWARF64 requires 64-bit relocations. + TT.isOSBinFormatELF(); // Support only ELF for now. + UseRangesSection = !NoDwarfRangesSection && !TT.isNVPTX(); // Use sections as references. Force for NVPTX. @@ -414,6 +419,8 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) DwarfVersion >= 5 || (UseGNUDebugMacro && !useSplitDwarf()); Asm->OutStreamer->getContext().setDwarfVersion(DwarfVersion); + Asm->OutStreamer->getContext().setDwarfFormat(Dwarf64 ? dwarf::DWARF64 + : dwarf::DWARF32); } // Define out of line so we don't have to include DwarfUnit.h in DwarfDebug.h. @@ -2329,10 +2336,10 @@ void DwarfDebug::emitDebugPubSection(bool GnuStyle, StringRef Name, TheU = Skeleton; // Emit the header. 
- Asm->OutStreamer->AddComment("Length of Public " + Name + " Info"); MCSymbol *BeginLabel = Asm->createTempSymbol("pub" + Name + "_begin"); MCSymbol *EndLabel = Asm->createTempSymbol("pub" + Name + "_end"); - Asm->emitLabelDifference(EndLabel, BeginLabel, 4); + Asm->emitDwarfUnitLength(EndLabel, BeginLabel, + "Length of Public " + Name + " Info"); Asm->OutStreamer->emitLabel(BeginLabel); @@ -2343,7 +2350,7 @@ void DwarfDebug::emitDebugPubSection(bool GnuStyle, StringRef Name, emitSectionReference(*TheU); Asm->OutStreamer->AddComment("Compilation Unit Length"); - Asm->emitInt32(TheU->getLength()); + Asm->emitDwarfLengthOrOffset(TheU->getLength()); // Emit the pubnames for this compilation unit. for (const auto &GI : Globals) { @@ -2351,7 +2358,7 @@ void DwarfDebug::emitDebugPubSection(bool GnuStyle, StringRef Name, const DIE *Entity = GI.second; Asm->OutStreamer->AddComment("DIE offset"); - Asm->emitInt32(Entity->getOffset()); + Asm->emitDwarfLengthOrOffset(Entity->getOffset()); if (GnuStyle) { dwarf::PubIndexEntryDescriptor Desc = computeIndexValue(TheU, Entity); @@ -2366,7 +2373,7 @@ void DwarfDebug::emitDebugPubSection(bool GnuStyle, StringRef Name, } Asm->OutStreamer->AddComment("End Mark"); - Asm->emitInt32(0); + Asm->emitDwarfLengthOrOffset(0); Asm->OutStreamer->emitLabel(EndLabel); } @@ -2499,7 +2506,7 @@ void DebugLocEntry::finalize(const AsmPrinter &AP, }) && "all values are expected to be fragments"); assert(llvm::is_sorted(Values) && "fragments are expected to be sorted"); - for (auto Fragment : Values) + for (const auto &Fragment : Values) DwarfDebug::emitDebugLocValue(AP, BT, Fragment, DwarfExpr); } else { @@ -2542,7 +2549,8 @@ static MCSymbol *emitRnglistsTableHeader(AsmPrinter *Asm, Asm->OutStreamer->emitLabel(Holder.getRnglistsTableBaseSym()); for (const RangeSpanList &List : Holder.getRangeLists()) - Asm->emitLabelDifference(List.Label, Holder.getRnglistsTableBaseSym(), 4); + Asm->emitLabelDifference(List.Label, Holder.getRnglistsTableBaseSym(), + Asm->getDwarfOffsetByteSize()); return TableEnd; } @@ -2561,7 +2569,8 @@ static MCSymbol *emitLoclistsTableHeader(AsmPrinter *Asm, Asm->OutStreamer->emitLabel(DebugLocs.getSym()); for (const auto &List : DebugLocs.getLists()) - Asm->emitLabelDifference(List.Label, DebugLocs.getSym(), 4); + Asm->emitLabelDifference(List.Label, DebugLocs.getSym(), + Asm->getDwarfOffsetByteSize()); return TableEnd; } @@ -2843,23 +2852,23 @@ void DwarfDebug::emitDebugARanges() { // Emit size of content not including length itself. unsigned ContentSize = - sizeof(int16_t) + // DWARF ARange version number - sizeof(int32_t) + // Offset of CU in the .debug_info section - sizeof(int8_t) + // Pointer Size (in bytes) - sizeof(int8_t); // Segment Size (in bytes) + sizeof(int16_t) + // DWARF ARange version number + Asm->getDwarfOffsetByteSize() + // Offset of CU in the .debug_info + // section + sizeof(int8_t) + // Pointer Size (in bytes) + sizeof(int8_t); // Segment Size (in bytes) unsigned TupleSize = PtrSize * 2; // 7.20 in the Dwarf specs requires the table to be aligned to a tuple. - unsigned Padding = - offsetToAlignment(sizeof(int32_t) + ContentSize, Align(TupleSize)); + unsigned Padding = offsetToAlignment( + Asm->getUnitLengthFieldByteSize() + ContentSize, Align(TupleSize)); ContentSize += Padding; ContentSize += (List.size() + 1) * TupleSize; // For each compile unit, write the list of spans it covers. 
- Asm->OutStreamer->AddComment("Length of ARange Set"); - Asm->emitInt32(ContentSize); + Asm->emitDwarfUnitLength(ContentSize, "Length of ARange Set"); Asm->OutStreamer->AddComment("DWARF Arange version number"); Asm->emitInt16(dwarf::DW_ARANGES_VERSION); Asm->OutStreamer->AddComment("Offset Into Debug Info Section"); @@ -2953,21 +2962,22 @@ static void emitMacroHeader(AsmPrinter *Asm, const DwarfDebug &DD, #define HANDLE_MACRO_FLAG(ID, NAME) MACRO_FLAG_##NAME = ID, #include "llvm/BinaryFormat/Dwarf.def" }; - uint8_t Flags = 0; Asm->OutStreamer->AddComment("Macro information version"); Asm->emitInt16(DwarfVersion >= 5 ? DwarfVersion : 4); - // We are setting Offset and line offset flags unconditionally here, - // since we're only supporting DWARF32 and line offset should be mostly - // present. - // FIXME: Add support for DWARF64. - Flags |= MACRO_FLAG_DEBUG_LINE_OFFSET; - Asm->OutStreamer->AddComment("Flags: 32 bit, debug_line_offset present"); - Asm->emitInt8(Flags); + // We emit the line offset flag unconditionally here, since line offset should + // be mostly present. + if (Asm->isDwarf64()) { + Asm->OutStreamer->AddComment("Flags: 64 bit, debug_line_offset present"); + Asm->emitInt8(MACRO_FLAG_OFFSET_SIZE | MACRO_FLAG_DEBUG_LINE_OFFSET); + } else { + Asm->OutStreamer->AddComment("Flags: 32 bit, debug_line_offset present"); + Asm->emitInt8(MACRO_FLAG_DEBUG_LINE_OFFSET); + } Asm->OutStreamer->AddComment("debug_line_offset"); if (DD.useSplitDwarf()) - Asm->OutStreamer->emitIntValue(0, /*Size=*/4); + Asm->emitDwarfLengthOrOffset(0); else - Asm->OutStreamer->emitSymbolValue(CU.getLineTableStartSym(), /*Size=*/4); + Asm->emitDwarfSymbolReference(CU.getLineTableStartSym()); } void DwarfDebug::handleMacroNodes(DIMacroNodeArray Nodes, DwarfCompileUnit &U) { @@ -3010,10 +3020,8 @@ void DwarfDebug::emitMacro(DIMacro &M) { Asm->OutStreamer->AddComment("Line Number"); Asm->emitULEB128(M.getLine()); Asm->OutStreamer->AddComment("Macro String"); - // FIXME: Add support for DWARF64. - Asm->OutStreamer->emitSymbolValue( - InfoHolder.getStringPool().getEntry(*Asm, Str).getSymbol(), - /*Size=*/4); + Asm->emitDwarfSymbolReference( + InfoHolder.getStringPool().getEntry(*Asm, Str).getSymbol()); } } else { Asm->OutStreamer->AddComment(dwarf::MacinfoString(M.getMacinfoType())); @@ -3351,6 +3359,15 @@ uint16_t DwarfDebug::getDwarfVersion() const { return Asm->OutStreamer->getContext().getDwarfVersion(); } +dwarf::Form DwarfDebug::getDwarfSectionOffsetForm() const { + if (Asm->getDwarfVersion() >= 4) + return dwarf::Form::DW_FORM_sec_offset; + assert((!Asm->isDwarf64() || (Asm->getDwarfVersion() == 3)) && + "DWARF64 is not defined prior DWARFv3"); + return Asm->isDwarf64() ? dwarf::Form::DW_FORM_data8 + : dwarf::Form::DW_FORM_data4; +} + const MCSymbol *DwarfDebug::getSectionLabel(const MCSection *S) { return SectionLabels.find(S)->second; } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h index ba0bb84367035..34c88f1a9c605 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -729,6 +729,12 @@ class DwarfDebug : public DebugHandlerBase { /// Returns the Dwarf Version. uint16_t getDwarfVersion() const; + /// Returns a suitable DWARF form to represent a section offset, i.e. + /// * DW_FORM_sec_offset for DWARF version >= 4; + /// * DW_FORM_data8 for 64-bit DWARFv3; + /// * DW_FORM_data4 for 32-bit DWARFv3 and DWARFv2. 
+ dwarf::Form getDwarfSectionOffsetForm() const; + /// Returns the previous CU that was being updated const DwarfCompileUnit *getPrevCU() const { return PrevCU; } void setPrevCU(const DwarfCompileUnit *PrevCU) { this->PrevCU = PrevCU; } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index b0fa8645de248..a2bd35d232daf 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -18,11 +18,8 @@ #include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/Support/ErrorHandling.h" #include -#include -#include using namespace llvm; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp index 812e6383288fc..838e1c9a10be6 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp @@ -10,10 +10,9 @@ #include "DwarfCompileUnit.h" #include "DwarfDebug.h" #include "DwarfUnit.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/DIE.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Metadata.h" #include "llvm/MC/MCStreamer.h" #include #include @@ -59,7 +58,7 @@ void DwarfFile::emitUnit(DwarfUnit *TheU, bool UseOffsets) { // Compute the size and offset for each DIE. void DwarfFile::computeSizeAndOffsets() { // Offset from the first CU in the debug info section is 0 initially. - unsigned SecOffset = 0; + uint64_t SecOffset = 0; // Iterate over each compile unit and set the size and offsets for each // DIE within each compile unit. All offsets are CU relative. @@ -75,12 +74,15 @@ void DwarfFile::computeSizeAndOffsets() { TheU->setDebugSectionOffset(SecOffset); SecOffset += computeSizeAndOffsetsForUnit(TheU.get()); } + if (SecOffset > UINT32_MAX && !Asm->isDwarf64()) + report_fatal_error("The generated debug information is too large " + "for the 32-bit DWARF format."); } unsigned DwarfFile::computeSizeAndOffsetsForUnit(DwarfUnit *TheU) { // CU-relative offset is reset to 0 here. - unsigned Offset = sizeof(int32_t) + // Length of Unit Info - TheU->getHeaderSize(); // Unit-specific headers + unsigned Offset = Asm->getUnitLengthFieldByteSize() + // Length of Unit Info + TheU->getHeaderSize(); // Unit-specific headers // The return value here is CU-relative, after laying out // all of the CU DIE. diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h index cf293d7534d04..79a6ce7801b70 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h @@ -14,7 +14,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/DIE.h" -#include "llvm/IR/Metadata.h" #include "llvm/Support/Allocator.h" #include #include @@ -26,10 +25,12 @@ class AsmPrinter; class DbgEntity; class DbgVariable; class DbgLabel; +class DINode; class DwarfCompileUnit; class DwarfUnit; class LexicalScope; class MCSection; +class MDNode; // Data structure to hold a range for range lists. 
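The widening of SecOffset to uint64_t is what gives the new report_fatal_error check teeth: in 32-bit DWARF a section offset is a 4-byte field, so a .debug_info contribution larger than UINT32_MAX cannot be referenced at all, and a 32-bit accumulator would wrap before the condition could ever fire. A small illustration of the wrap-around hazard:

  #include <cstdint>
  #include <cstdio>

  int main() {
    uint32_t Narrow = 0x80000000u + 0x80000001u;         // wraps to 1
    uint64_t Wide = uint64_t(0x80000000u) + 0x80000001u; // 0x100000001
    std::printf("narrow=%u wide=%llu\n", Narrow, (unsigned long long)Wide);
    // Only the 64-bit sum can be meaningfully compared against UINT32_MAX.
    return Wide > UINT32_MAX ? 0 : 1;
  }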
struct RangeSpan { diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp index a43929d8e8f70..a876f8ccace94 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp @@ -8,7 +8,6 @@ #include "DwarfStringPool.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/MC/MCAsmInfo.h" @@ -33,7 +32,6 @@ DwarfStringPool::getEntryImpl(AsmPrinter &Asm, StringRef Str) { Entry.Symbol = ShouldCreateSymbols ? Asm.createTempSymbol(Prefix) : nullptr; NumBytes += Str.size() + 1; - assert(NumBytes > Entry.Offset && "Unexpected overflow"); } return *I.first; } @@ -58,13 +56,13 @@ void DwarfStringPool::emitStringOffsetsTableHeader(AsmPrinter &Asm, if (getNumIndexedStrings() == 0) return; Asm.OutStreamer->SwitchSection(Section); - unsigned EntrySize = 4; - // FIXME: DWARF64 + unsigned EntrySize = Asm.getDwarfOffsetByteSize(); // We are emitting the header for a contribution to the string offsets // table. The header consists of an entry with the contribution's // size (not including the size of the length field), the DWARF version and // 2 bytes of padding. - Asm.emitInt32(getNumIndexedStrings() * EntrySize + 4); + Asm.emitDwarfUnitLength(getNumIndexedStrings() * EntrySize + 4, + "Length of String Offsets Set"); Asm.emitInt16(Asm.getDwarfVersion()); Asm.emitInt16(0); // Define the symbol that marks the start of the contribution. It is @@ -120,7 +118,7 @@ void DwarfStringPool::emit(AsmPrinter &Asm, MCSection *StrSection, } Asm.OutStreamer->SwitchSection(OffsetSection); - unsigned size = 4; // FIXME: DWARF64 is 8. + unsigned size = Asm.getDwarfOffsetByteSize(); for (const auto &Entry : Entries) if (UseRelativeOffsets) Asm.emitDwarfStringOffset(Entry->getValue()); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.h b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.h index c5f5637fdae3f..79b5df89e3389 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.h @@ -28,7 +28,7 @@ class DwarfStringPool { StringMap Pool; StringRef Prefix; - unsigned NumBytes = 0; + uint64_t NumBytes = 0; unsigned NumIndexedStrings = 0; bool ShouldCreateSymbols; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 40c741077d1ad..8be6b889b8a99 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -13,7 +13,6 @@ #include "DwarfUnit.h" #include "AddressPool.h" #include "DwarfCompileUnit.h" -#include "DwarfDebug.h" #include "DwarfExpression.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" @@ -300,10 +299,7 @@ void DwarfUnit::addLabel(DIELoc &Die, dwarf::Form Form, const MCSymbol *Label) { void DwarfUnit::addSectionOffset(DIE &Die, dwarf::Attribute Attribute, uint64_t Integer) { - if (DD->getDwarfVersion() >= 4) - addUInt(Die, Attribute, dwarf::DW_FORM_sec_offset, Integer); - else - addUInt(Die, Attribute, dwarf::DW_FORM_data4, Integer); + addUInt(Die, Attribute, DD->getDwarfSectionOffsetForm(), Integer); } unsigned DwarfTypeUnit::getOrCreateSourceID(const DIFile *File) { @@ -1695,15 +1691,15 @@ DIE *DwarfUnit::getOrCreateStaticMemberDIE(const DIDerivedType *DT) { void DwarfUnit::emitCommonHeader(bool UseOffsets, dwarf::UnitType UT) { // Emit size of content not including length itself - Asm->OutStreamer->AddComment("Length of Unit"); if (!DD->useSectionsAsReferences()) 
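To make the string-offsets header math above concrete: the emitted contribution length is NumIndexedStrings * EntrySize plus 4 bytes covering the 2-byte version and 2 bytes of padding, and it never counts the unit-length field itself. Under DWARF64 only EntrySize grows to 8; the version/padding tail stays 4 bytes. With a hypothetical pool of 10 indexed strings:

  // DWARF32: EntrySize == 4 -> 10 * 4 + 4 == 44-byte contribution.
  // DWARF64: EntrySize == 8 -> 10 * 8 + 4 == 84-byte contribution.
  static_assert(10u * 4u + 4u == 44u, "DWARF32 string-offsets contribution");
  static_assert(10u * 8u + 4u == 84u, "DWARF64 string-offsets contribution");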
{ StringRef Prefix = isDwoUnit() ? "debug_info_dwo_" : "debug_info_"; MCSymbol *BeginLabel = Asm->createTempSymbol(Prefix + "start"); EndLabel = Asm->createTempSymbol(Prefix + "end"); - Asm->emitLabelDifference(EndLabel, BeginLabel, 4); + Asm->emitDwarfUnitLength(EndLabel, BeginLabel, "Length of Unit"); Asm->OutStreamer->emitLabel(BeginLabel); } else - Asm->emitInt32(getHeaderSize() + getUnitDie().getSize()); + Asm->emitDwarfUnitLength(getHeaderSize() + getUnitDie().getSize(), + "Length of Unit"); Asm->OutStreamer->AddComment("DWARF version number"); unsigned Version = DD->getDwarfVersion(); @@ -1723,7 +1719,7 @@ void DwarfUnit::emitCommonHeader(bool UseOffsets, dwarf::UnitType UT) { Asm->OutStreamer->AddComment("Offset Into Abbrev. Section"); const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); if (UseOffsets) - Asm->emitInt32(0); + Asm->emitDwarfLengthOrOffset(0); else Asm->emitDwarfSymbolReference( TLOF.getDwarfAbbrevSection()->getBeginSymbol(), false); @@ -1742,16 +1738,14 @@ void DwarfTypeUnit::emitHeader(bool UseOffsets) { Asm->OutStreamer->emitIntValue(TypeSignature, sizeof(TypeSignature)); Asm->OutStreamer->AddComment("Type DIE Offset"); // In a skeleton type unit there is no type DIE so emit a zero offset. - Asm->OutStreamer->emitIntValue(Ty ? Ty->getOffset() : 0, - sizeof(Ty->getOffset())); + Asm->emitDwarfLengthOrOffset(Ty ? Ty->getOffset() : 0); } DIE::value_iterator DwarfUnit::addSectionDelta(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Hi, const MCSymbol *Lo) { return Die.addValue(DIEValueAllocator, Attribute, - DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset - : dwarf::DW_FORM_data4, + DD->getDwarfSectionOffsetForm(), new (DIEValueAllocator) DIEDelta(Hi, Lo)); } @@ -1759,10 +1753,7 @@ DIE::value_iterator DwarfUnit::addSectionLabel(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Label, const MCSymbol *Sec) { if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) - return addLabel(Die, Attribute, - DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset - : dwarf::DW_FORM_data4, - Label); + return addLabel(Die, Attribute, DD->getDwarfSectionOffsetForm(), Label); return addSectionDelta(Die, Attribute, Label, Sec); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h index 7147da33e631e..63a1e5a4780f1 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -16,22 +16,19 @@ #include "DwarfDebug.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" -#include "llvm/ADT/StringMap.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/DIE.h" -#include "llvm/IR/DIBuilder.h" -#include "llvm/IR/DebugInfo.h" -#include "llvm/MC/MCDwarf.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCSection.h" +#include namespace llvm { -class MachineOperand; -class ConstantInt; class ConstantFP; +class ConstantInt; class DbgVariable; class DwarfCompileUnit; +class MachineOperand; +class MCDwarfDwoLineTable; +class MCSymbol; //===----------------------------------------------------------------------===// /// This dwarf writer support class manages information associated with a @@ -77,7 +74,6 @@ class DwarfUnit : public DIEUnit { bool applySubprogramDefinitionAttributes(const DISubprogram *SP, DIE &SPDie); - bool shareAcrossDWOCUs() const; bool isShareableAcrossCUs(const DINode *D) const; public: @@ -253,9 +249,9 @@ class DwarfUnit : public DIEUnit { /// Compute the size of a header for this unit, not including the initial /// length field. 
virtual unsigned getHeaderSize() const { - return sizeof(int16_t) + // DWARF version number - sizeof(int32_t) + // Offset Into Abbrev. Section - sizeof(int8_t) + // Pointer Size (in bytes) + return sizeof(int16_t) + // DWARF version number + Asm->getDwarfOffsetByteSize() + // Offset Into Abbrev. Section + sizeof(int8_t) + // Pointer Size (in bytes) (DD->getDwarfVersion() >= 5 ? sizeof(int8_t) : 0); // DWARF v5 unit type } @@ -356,7 +352,7 @@ class DwarfTypeUnit final : public DwarfUnit { void emitHeader(bool UseOffsets) override; unsigned getHeaderSize() const override { return DwarfUnit::getHeaderSize() + sizeof(uint64_t) + // Type Signature - sizeof(uint32_t); // Type DIE Offset + Asm->getDwarfOffsetByteSize(); // Type DIE Offset } void addGlobalName(StringRef Name, const DIE &Die, const DIScope *Context) override; diff --git a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp index cd8077e7d5486..c47ac7e17b6a1 100644 --- a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp @@ -258,15 +258,6 @@ void WinException::endFuncletImpl() { if (F.hasPersonalityFn()) Per = classifyEHPersonality(F.getPersonalityFn()->stripPointerCasts()); - // On funclet exit, we emit a fake "function" end marker, so that the call - // to EmitWinEHHandlerData below can calculate the size of the funclet or - // function. - if (isAArch64) { - MCSection *XData = Asm->OutStreamer->getAssociatedXDataSection( - Asm->OutStreamer->getCurrentSectionOnly()); - Asm->OutStreamer->SwitchSection(XData); - } - // Emit an UNWIND_INFO struct describing the prologue. Asm->OutStreamer->EmitWinEHHandlerData(); diff --git a/llvm/lib/CodeGen/BasicBlockSections.cpp b/llvm/lib/CodeGen/BasicBlockSections.cpp index a3c366004c7f3..421c1d896a0f1 100644 --- a/llvm/lib/CodeGen/BasicBlockSections.cpp +++ b/llvm/lib/CodeGen/BasicBlockSections.cpp @@ -48,19 +48,11 @@ // Basic Block Labels // ================== // -// With -fbasic-block-sections=labels, or when a basic block is placed in a -// unique section, it is labelled with a symbol. This allows easy mapping of -// virtual addresses from PMU profiles back to the corresponding basic blocks. -// Since the number of basic blocks is large, the labeling bloats the symbol -// table sizes and the string table sizes significantly. While the binary size -// does increase, it does not affect performance as the symbol table is not -// loaded in memory during run-time. The string table size bloat is kept very -// minimal using a unary naming scheme that uses string suffix compression. The -// basic blocks for function foo are named "a.BB.foo", "aa.BB.foo", ... This -// turns out to be very good for string table sizes and the bloat in the string -// table size for a very large binary is ~8 %. The naming also allows using -// the --symbol-ordering-file option in LLD to arbitrarily reorder the -// sections. +// With -fbasic-block-sections=labels, we emit the offsets of BB addresses of +// every function into a .bb_addr_map section. Along with the function symbols, +// this allows for mapping of virtual addresses in PMU profiles back to the +// corresponding basic blocks. This logic is implemented in AsmPrinter. This +// pass only assigns the BBSectionType of every function to ``labels``. 
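Plugging numbers into the getHeaderSize() expression earlier in this hunk (which by definition excludes the initial length field): a 32-bit DWARFv4 compile-unit header is 2 (version) + 4 (abbrev offset) + 1 (address size) = 7 bytes; DWARFv5 adds the unit-type byte for 8; and 64-bit DWARFv5 is 2 + 8 + 1 + 1 = 12. DwarfTypeUnit::getHeaderSize() then adds the 8-byte type signature plus one offset-sized type DIE offset, i.e. 12 more bytes under DWARF32 and 16 under DWARF64.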
// //===----------------------------------------------------------------------===// @@ -304,7 +296,6 @@ bool BasicBlockSections::runOnMachineFunction(MachineFunction &MF) { if (BBSectionsType == BasicBlockSection::Labels) { MF.setBBSectionsType(BBSectionsType); - MF.createBBLabels(); return true; } @@ -314,7 +305,6 @@ bool BasicBlockSections::runOnMachineFunction(MachineFunction &MF) { FuncBBClusterInfo)) return true; MF.setBBSectionsType(BBSectionsType); - MF.createBBLabels(); assignSections(MF, FuncBBClusterInfo); // We make sure that the cluster including the entry basic block precedes all diff --git a/llvm/lib/CodeGen/BranchRelaxation.cpp b/llvm/lib/CodeGen/BranchRelaxation.cpp index 5a3ec1a36f962..366c303614d63 100644 --- a/llvm/lib/CodeGen/BranchRelaxation.cpp +++ b/llvm/lib/CodeGen/BranchRelaxation.cpp @@ -507,25 +507,31 @@ bool BranchRelaxation::relaxBranchInstructions() { Next = std::next(J); MachineInstr &MI = *J; - if (MI.isConditionalBranch()) { - MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI); - if (!isBlockInRange(MI, *DestBB)) { - if (Next != MBB.end() && Next->isConditionalBranch()) { - // If there are multiple conditional branches, this isn't an - // analyzable block. Split later terminators into a new block so - // each one will be analyzable. - - splitBlockBeforeInstr(*Next, DestBB); - } else { - fixupConditionalBranch(MI); - ++NumConditionalRelaxed; - } + if (!MI.isConditionalBranch()) + continue; + + if (MI.getOpcode() == TargetOpcode::FAULTING_OP) + // FAULTING_OP's destination is not encoded in the instruction stream + // and thus never needs relaxed. + continue; + + MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI); + if (!isBlockInRange(MI, *DestBB)) { + if (Next != MBB.end() && Next->isConditionalBranch()) { + // If there are multiple conditional branches, this isn't an + // analyzable block. Split later terminators into a new block so + // each one will be analyzable. + + splitBlockBeforeInstr(*Next, DestBB); + } else { + fixupConditionalBranch(MI); + ++NumConditionalRelaxed; + } - Changed = true; + Changed = true; - // This may have modified all of the terminators, so start over. - Next = MBB.getFirstTerminator(); - } + // This may have modified all of the terminators, so start over. + Next = MBB.getFirstTerminator(); } } } diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp index 254503673fd2b..75cf6a63dc9a7 100644 --- a/llvm/lib/CodeGen/CalcSpillWeights.cpp +++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp @@ -86,7 +86,7 @@ static bool isRematerializable(const LiveInterval &LI, const LiveIntervals &LIS, VirtRegMap *VRM, const TargetInstrInfo &TII) { - unsigned Reg = LI.reg; + unsigned Reg = LI.reg(); unsigned Original = VRM ? VRM->getOriginal(Reg) : 0; for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end(); I != E; ++I) { @@ -140,7 +140,7 @@ void VirtRegAuxInfo::calculateSpillWeightAndHint(LiveInterval &li) { // Check if unspillable. 
if (weight < 0) return; - li.weight = weight; + li.setWeight(weight); } float VirtRegAuxInfo::futureWeight(LiveInterval &li, SlotIndex start, @@ -159,10 +159,10 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &li, SlotIndex *start, unsigned numInstr = 0; // Number of instructions using li SmallPtrSet visited; - std::pair TargetHint = mri.getRegAllocationHint(li.reg); + std::pair TargetHint = mri.getRegAllocationHint(li.reg()); if (li.isSpillable() && VRM) { - Register Reg = li.reg; + Register Reg = li.reg(); Register Original = VRM->getOriginal(Reg); const LiveInterval &OrigInt = LIS.getInterval(Original); // li comes from a split of OrigInt. If OrigInt was marked @@ -215,7 +215,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &li, SlotIndex *start, std::set CopyHints; for (MachineRegisterInfo::reg_instr_nodbg_iterator - I = mri.reg_instr_nodbg_begin(li.reg), + I = mri.reg_instr_nodbg_begin(li.reg()), E = mri.reg_instr_nodbg_end(); I != E;) { MachineInstr *mi = &*(I++); @@ -243,7 +243,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &li, SlotIndex *start, // Calculate instr weight. bool reads, writes; - std::tie(reads, writes) = mi->readsWritesVirtualRegister(li.reg); + std::tie(reads, writes) = mi->readsWritesVirtualRegister(li.reg()); weight = LiveIntervals::getSpillWeight(writes, reads, &MBFI, *mi); // Give extra weight to what looks like a loop induction variable update. @@ -256,7 +256,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &li, SlotIndex *start, // Get allocation hints from copies. if (!mi->isCopy()) continue; - Register hint = copyHint(mi, li.reg, tri, mri); + Register hint = copyHint(mi, li.reg(), tri, mri); if (!hint) continue; // Force hweight onto the stack so that x86 doesn't add hidden precision, @@ -275,7 +275,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &li, SlotIndex *start, if (updateLI && CopyHints.size()) { // Remove a generic hint if previously added by target. if (TargetHint.first == 0 && TargetHint.second) - mri.clearSimpleHint(li.reg); + mri.clearSimpleHint(li.reg()); std::set HintedRegs; for (auto &Hint : CopyHints) { @@ -283,7 +283,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &li, SlotIndex *start, (TargetHint.first != 0 && Hint.Reg == TargetHint.second)) // Don't add the same reg twice or the target-type hint again. continue; - mri.addRegAllocationHint(li.reg, Hint.Reg); + mri.addRegAllocationHint(li.reg(), Hint.Reg); } // Weakly boost the spill weight of hinted registers. diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 9a4ed2fab608b..45feeae39659b 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -2047,9 +2047,11 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { Value *Operand = II->getOperand(0); II->eraseFromParent(); // Prune the operand, it's most likely dead. - RecursivelyDeleteTriviallyDeadInstructions( - Operand, TLInfo, nullptr, - [&](Value *V) { removeAllAssertingVHReferences(V); }); + resetIteratorIfInvalidatedWhileCalling(BB, [&]() { + RecursivelyDeleteTriviallyDeadInstructions( + Operand, TLInfo, nullptr, + [&](Value *V) { removeAllAssertingVHReferences(V); }); + }); return true; } @@ -5274,22 +5276,11 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // If we have no uses, recursively delete the value and all dead instructions // using it. if (Repl->use_empty()) { - // This can cause recursive deletion, which can invalidate our iterator. 
- // Use a WeakTrackingVH to hold onto it in case this happens. - Value *CurValue = &*CurInstIterator; - WeakTrackingVH IterHandle(CurValue); - BasicBlock *BB = CurInstIterator->getParent(); - - RecursivelyDeleteTriviallyDeadInstructions( - Repl, TLInfo, nullptr, - [&](Value *V) { removeAllAssertingVHReferences(V); }); - - if (IterHandle != CurValue) { - // If the iterator instruction was recursively deleted, start over at the - // start of the block. - CurInstIterator = BB->begin(); - SunkAddrs.clear(); - } + resetIteratorIfInvalidatedWhileCalling(CurInstIterator->getParent(), [&]() { + RecursivelyDeleteTriviallyDeadInstructions( + Repl, TLInfo, nullptr, + [&](Value *V) { removeAllAssertingVHReferences(V); }); + }); } ++NumMemoryInsts; return true; @@ -5818,6 +5809,12 @@ bool CodeGenPrepare::optimizePhiType( Visited.insert(I); SmallPtrSet Defs; SmallPtrSet Uses; + // This works by adding extra bitcasts between load/stores and removing + // existing bitcasts. If we have a phi(bitcast(load)) or a store(bitcast(phi)) + // we can get in the situation where we remove a bitcast in one iteration + // just to add it again in the next. We need to ensure that at least one + // bitcast we remove is anchored to something that will not change back. + bool AnyAnchored = false; while (!Worklist.empty()) { Instruction *II = Worklist.pop_back_val(); @@ -5834,6 +5831,8 @@ bool CodeGenPrepare::optimizePhiType( Worklist.push_back(OpPhi); } } else if (auto *OpLoad = dyn_cast(V)) { + if (!OpLoad->isSimple()) + return false; if (!Defs.count(OpLoad)) { Defs.insert(OpLoad); Worklist.push_back(OpLoad); @@ -5851,9 +5850,12 @@ bool CodeGenPrepare::optimizePhiType( if (!Defs.count(OpBC)) { Defs.insert(OpBC); Worklist.push_back(OpBC); + AnyAnchored |= !isa(OpBC->getOperand(0)) && + !isa(OpBC->getOperand(0)); } - } else if (!isa(V)) + } else if (!isa(V)) { return false; + } } } @@ -5868,7 +5870,7 @@ bool CodeGenPrepare::optimizePhiType( Worklist.push_back(OpPhi); } } else if (auto *OpStore = dyn_cast(V)) { - if (OpStore->getOperand(0) != II) + if (!OpStore->isSimple() || OpStore->getOperand(0) != II) return false; Uses.insert(OpStore); } else if (auto *OpBC = dyn_cast(V)) { @@ -5877,12 +5879,15 @@ bool CodeGenPrepare::optimizePhiType( if (OpBC->getType() != ConvertTy) return false; Uses.insert(OpBC); + AnyAnchored |= + any_of(OpBC->users(), [](User *U) { return !isa(U); }); + } else { + return false; + } } } - if (!ConvertTy || !TLI->shouldConvertPhiType(PhiTy, ConvertTy)) + if (!ConvertTy || !AnyAnchored || !TLI->shouldConvertPhiType(PhiTy, ConvertTy)) return false; LLVM_DEBUG(dbgs() << "Converting " << *I << "\n and connected nodes to " @@ -5893,11 +5898,13 @@ bool CodeGenPrepare::optimizePhiType( ValueToValueMap ValMap; ValMap[UndefValue::get(PhiTy)] = UndefValue::get(ConvertTy); for (Instruction *D : Defs) { - if (isa(D)) + if (isa(D)) { ValMap[D] = D->getOperand(0); - else + DeletedInstrs.insert(D); + } else { ValMap[D] = new BitCastInst(D, ConvertTy, D->getName() + ".bc", D->getNextNode()); + } } for (PHINode *Phi : PhiNodes) ValMap[Phi] = PHINode::Create(ConvertTy, Phi->getNumIncomingValues(), @@ -5908,15 +5915,17 @@ bool CodeGenPrepare::optimizePhiType( for (int i = 0, e = Phi->getNumIncomingValues(); i < e; i++) NewPhi->addIncoming(ValMap[Phi->getIncomingValue(i)], Phi->getIncomingBlock(i)); + Visited.insert(NewPhi); } // And finally pipe up the stores and bitcasts for (Instruction *U : Uses) { if (isa(U)) { DeletedInstrs.insert(U); U->replaceAllUsesWith(ValMap[U->getOperand(0)]); - } else {
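resetIteratorIfInvalidatedWhileCalling is not defined in these hunks; judging purely from the code it replaces, its shape is presumably something like the sketch below (a hypothetical reconstruction, not the actual definition):

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/ValueHandle.h"

  // Hypothetical: run F, and if it recursively deletes the instruction that
  // CurInstIterator points at, restart iteration at the top of the block.
  template <typename F>
  void CodeGenPrepare::resetIteratorIfInvalidatedWhileCalling(BasicBlock *BB, F f) {
    // Hold the current instruction in a WeakTrackingVH so deletion is visible.
    Value *CurValue = &*CurInstIterator;
    WeakTrackingVH IterHandle(CurValue);

    f();

    if (IterHandle != CurValue) {
      // The instruction was deleted out from under us: re-seat the iterator
      // and drop caches that may hold dangling pointers.
      CurInstIterator = BB->begin();
      SunkAddrs.clear();
    }
  }

Factoring this out removes the duplicated WeakTrackingVH boilerplate from both call sites shown above.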
U->setOperand(0, new BitCastInst(ValMap[U->getOperand(0)], PhiTy, "bc", U)); + } } // Save the removed phis to be deleted later. diff --git a/llvm/lib/CodeGen/ExpandReductions.cpp b/llvm/lib/CodeGen/ExpandReductions.cpp index 45f21c1085dda..dfaaafaf811f1 100644 --- a/llvm/lib/CodeGen/ExpandReductions.cpp +++ b/llvm/lib/CodeGen/ExpandReductions.cpp @@ -143,12 +143,24 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) { case Intrinsic::experimental_vector_reduce_smax: case Intrinsic::experimental_vector_reduce_smin: case Intrinsic::experimental_vector_reduce_umax: - case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::experimental_vector_reduce_umin: { + Value *Vec = II->getArgOperand(0); + if (!isPowerOf2_32( + cast(Vec->getType())->getNumElements())) + continue; + + Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK); + break; + } case Intrinsic::experimental_vector_reduce_fmax: case Intrinsic::experimental_vector_reduce_fmin: { + // FIXME: We only expand 'fast' reductions here because the underlying + // code in createMinMaxOp() assumes that comparisons use 'fast' + // semantics. Value *Vec = II->getArgOperand(0); if (!isPowerOf2_32( - cast(Vec->getType())->getNumElements())) + cast(Vec->getType())->getNumElements()) || + !FMF.isFast()) continue; Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 10cd58f17e9aa..938f55959d452 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -881,14 +881,12 @@ void CombinerHelper::applyCombineIndexedLoadStore( LLVM_DEBUG(dbgs() << " Combinined to indexed operation"); } -bool CombinerHelper::matchElideBrByInvertingCond(MachineInstr &MI) { +bool CombinerHelper::matchOptBrCondByInvertingCond(MachineInstr &MI) { if (MI.getOpcode() != TargetOpcode::G_BR) return false; // Try to match the following: // bb1: - // %c(s32) = G_ICMP pred, %a, %b - // %c1(s1) = G_TRUNC %c(s32) // G_BRCOND %c1, %bb2 // G_BR %bb3 // bb2: @@ -898,7 +896,7 @@ bool CombinerHelper::matchElideBrByInvertingCond(MachineInstr &MI) { // The above pattern does not have a fall through to the successor bb2, always // resulting in a branch no matter which path is taken. Here we try to find // and replace that pattern with conditional branch to bb3 and otherwise - // fallthrough to bb2. + // fallthrough to bb2. This is generally better for branch predictors. MachineBasicBlock *MBB = MI.getParent(); MachineBasicBlock::iterator BrIt(MI); @@ -913,40 +911,34 @@ bool CombinerHelper::matchElideBrByInvertingCond(MachineInstr &MI) { // Check that the next block is the conditional branch target. 
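The isPowerOf2_32 guard added in the ExpandReductions hunk above reflects how getShuffleReduction works: it performs log2(N) rounds of "fold the high half into the low half", which only visits every lane when the element count is a power of two. A scalar model of the same idea, using add on eight lanes:

  // 8 -> 4 -> 2 -> 1: three shuffle+op rounds for N == 8.
  float shuffleReduceAdd(const float (&V)[8]) {
    float Tmp[8];
    for (int I = 0; I != 8; ++I)
      Tmp[I] = V[I];
    for (int Width = 4; Width >= 1; Width /= 2) // halve the active width
      for (int I = 0; I != Width; ++I)
        Tmp[I] += Tmp[I + Width]; // combine high half into low half
    return Tmp[0];
  }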
if (!MBB->isLayoutSuccessor(BrCond->getOperand(1).getMBB())) return false; - - MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg()); - if (!CmpMI || CmpMI->getOpcode() != TargetOpcode::G_ICMP || - !MRI.hasOneNonDBGUse(CmpMI->getOperand(0).getReg())) - return false; - return true; -} - -bool CombinerHelper::tryElideBrByInvertingCond(MachineInstr &MI) { - if (!matchElideBrByInvertingCond(MI)) - return false; - applyElideBrByInvertingCond(MI); return true; } -void CombinerHelper::applyElideBrByInvertingCond(MachineInstr &MI) { +void CombinerHelper::applyOptBrCondByInvertingCond(MachineInstr &MI) { MachineBasicBlock *BrTarget = MI.getOperand(0).getMBB(); MachineBasicBlock::iterator BrIt(MI); MachineInstr *BrCond = &*std::prev(BrIt); - MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg()); - CmpInst::Predicate InversePred = CmpInst::getInversePredicate( - (CmpInst::Predicate)CmpMI->getOperand(1).getPredicate()); + Builder.setInstrAndDebugLoc(*BrCond); + LLT Ty = MRI.getType(BrCond->getOperand(0).getReg()); + // FIXME: Does int/fp matter for this? If so, we might need to restrict + // this to i1 only since we might not know for sure what kind of + // compare generated the condition value. + auto True = Builder.buildConstant( + Ty, getICmpTrueVal(getTargetLowering(), false, false)); + auto Xor = Builder.buildXor(Ty, BrCond->getOperand(0), True); - // Invert the G_ICMP condition. - Observer.changingInstr(*CmpMI); - CmpMI->getOperand(1).setPredicate(InversePred); - Observer.changedInstr(*CmpMI); + auto *FallthroughBB = BrCond->getOperand(1).getMBB(); + Observer.changingInstr(MI); + MI.getOperand(0).setMBB(FallthroughBB); + Observer.changedInstr(MI); - // Change the conditional branch target. + // Change the conditional branch to use the inverted condition and + // new target block. Observer.changingInstr(*BrCond); + BrCond->getOperand(0).setReg(Xor.getReg(0)); BrCond->getOperand(1).setMBB(BrTarget); Observer.changedInstr(*BrCond); - MI.eraseFromParent(); } static bool shouldLowerMemFuncForSize(const MachineFunction &MF) { @@ -1438,6 +1430,69 @@ bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) { return false; } +static Optional constantFoldFpUnary(unsigned Opcode, LLT DstTy, + const Register Op, + const MachineRegisterInfo &MRI) { + const ConstantFP *MaybeCst = getConstantFPVRegVal(Op, MRI); + if (!MaybeCst) + return None; + + APFloat V = MaybeCst->getValueAPF(); + switch (Opcode) { + default: + llvm_unreachable("Unexpected opcode!"); + case TargetOpcode::G_FNEG: { + V.changeSign(); + return V; + } + case TargetOpcode::G_FABS: { + V.clearSign(); + return V; + } + case TargetOpcode::G_FPTRUNC: + break; + case TargetOpcode::G_FSQRT: { + bool Unused; + V.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &Unused); + V = APFloat(sqrt(V.convertToDouble())); + break; + } + case TargetOpcode::G_FLOG2: { + bool Unused; + V.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &Unused); + V = APFloat(log2(V.convertToDouble())); + break; + } + } + // Convert `APFloat` to appropriate IEEE type depending on `DstTy`. Otherwise, + // `buildFConstant` will assert on size mismatch. Only `G_FPTRUNC`, `G_FSQRT`, + // and `G_FLOG2` reach here. 
+ bool Unused; + V.convert(getFltSemanticForLLT(DstTy), APFloat::rmNearestTiesToEven, &Unused); + return V; +} + +bool CombinerHelper::matchCombineConstantFoldFpUnary(MachineInstr &MI, + Optional &Cst) { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(DstReg); + Cst = constantFoldFpUnary(MI.getOpcode(), DstTy, SrcReg, MRI); + return Cst.hasValue(); +} + +bool CombinerHelper::applyCombineConstantFoldFpUnary(MachineInstr &MI, + Optional &Cst) { + assert(Cst.hasValue() && "Optional is unexpectedly empty!"); + Builder.setInstrAndDebugLoc(MI); + MachineFunction &MF = Builder.getMF(); + auto *FPVal = ConstantFP::get(MF.getFunction().getContext(), *Cst); + Register DstReg = MI.getOperand(0).getReg(); + Builder.buildFConstant(DstReg, *FPVal); + MI.eraseFromParent(); + return true; +} + bool CombinerHelper::matchPtrAddImmedChain(MachineInstr &MI, PtrAddChain &MatchInfo) { // We're trying to match the following pattern: @@ -1561,6 +1616,201 @@ bool CombinerHelper::applyCombineShlOfExtend(MachineInstr &MI, return true; } +static Register peekThroughBitcast(Register Reg, + const MachineRegisterInfo &MRI) { + while (mi_match(Reg, MRI, m_GBitcast(m_Reg(Reg)))) + ; + + return Reg; +} + +bool CombinerHelper::matchCombineUnmergeMergeToPlainValues( + MachineInstr &MI, SmallVectorImpl &Operands) { + assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && + "Expected an unmerge"); + Register SrcReg = + peekThroughBitcast(MI.getOperand(MI.getNumOperands() - 1).getReg(), MRI); + + MachineInstr *SrcInstr = MRI.getVRegDef(SrcReg); + if (SrcInstr->getOpcode() != TargetOpcode::G_MERGE_VALUES && + SrcInstr->getOpcode() != TargetOpcode::G_BUILD_VECTOR && + SrcInstr->getOpcode() != TargetOpcode::G_CONCAT_VECTORS) + return false; + + // Check the source type of the merge. + LLT SrcMergeTy = MRI.getType(SrcInstr->getOperand(1).getReg()); + LLT Dst0Ty = MRI.getType(MI.getOperand(0).getReg()); + bool SameSize = Dst0Ty.getSizeInBits() == SrcMergeTy.getSizeInBits(); + if (SrcMergeTy != Dst0Ty && !SameSize) + return false; + // They are the same now (modulo a bitcast). + // We can collect all the src registers. 
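In plain C++ terms, the folds performed by constantFoldFpUnary above look like the checks below; the real code stays in APFloat (widening to IEEEdouble for sqrt/log2, then rounding back to the destination type's semantics) so it is correct for every floating-point type, not just double. Illustration only:

  #include <cassert>
  #include <cmath>

  int main() {
    double V = -4.0;
    assert(-V == 4.0);                               // G_FNEG: flip the sign bit
    assert(std::fabs(V) == 4.0);                     // G_FABS: clear the sign bit
    assert(std::sqrt(4.0) == 2.0);                   // G_FSQRT via double math
    assert(std::fabs(std::log2(8.0) - 3.0) < 1e-12); // G_FLOG2 via double math
    return 0;
  }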
+ for (unsigned Idx = 1, EndIdx = SrcInstr->getNumOperands(); Idx != EndIdx; + ++Idx) + Operands.push_back(SrcInstr->getOperand(Idx).getReg()); + return true; +} + +bool CombinerHelper::applyCombineUnmergeMergeToPlainValues( + MachineInstr &MI, SmallVectorImpl &Operands) { + assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && + "Expected an unmerge"); + assert((MI.getNumOperands() - 1 == Operands.size()) && + "Not enough operands to replace all defs"); + unsigned NumElems = MI.getNumOperands() - 1; + + LLT SrcTy = MRI.getType(Operands[0]); + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + bool CanReuseInputDirectly = DstTy == SrcTy; + Builder.setInstrAndDebugLoc(MI); + for (unsigned Idx = 0; Idx < NumElems; ++Idx) { + Register DstReg = MI.getOperand(Idx).getReg(); + Register SrcReg = Operands[Idx]; + if (CanReuseInputDirectly) + replaceRegWith(MRI, DstReg, SrcReg); + else + Builder.buildCast(DstReg, SrcReg); + } + MI.eraseFromParent(); + return true; +} + +bool CombinerHelper::matchCombineUnmergeConstant(MachineInstr &MI, + SmallVectorImpl &Csts) { + unsigned SrcIdx = MI.getNumOperands() - 1; + Register SrcReg = MI.getOperand(SrcIdx).getReg(); + MachineInstr *SrcInstr = MRI.getVRegDef(SrcReg); + if (SrcInstr->getOpcode() != TargetOpcode::G_CONSTANT && + SrcInstr->getOpcode() != TargetOpcode::G_FCONSTANT) + return false; + // Break down the big constant in smaller ones. + const MachineOperand &CstVal = SrcInstr->getOperand(1); + APInt Val = SrcInstr->getOpcode() == TargetOpcode::G_CONSTANT + ? CstVal.getCImm()->getValue() + : CstVal.getFPImm()->getValueAPF().bitcastToAPInt(); + + LLT Dst0Ty = MRI.getType(MI.getOperand(0).getReg()); + unsigned ShiftAmt = Dst0Ty.getSizeInBits(); + // Unmerge a constant. + for (unsigned Idx = 0; Idx != SrcIdx; ++Idx) { + Csts.emplace_back(Val.trunc(ShiftAmt)); + Val = Val.lshr(ShiftAmt); + } + + return true; +} + +bool CombinerHelper::applyCombineUnmergeConstant(MachineInstr &MI, + SmallVectorImpl &Csts) { + assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && + "Expected an unmerge"); + assert((MI.getNumOperands() - 1 == Csts.size()) && + "Not enough operands to replace all defs"); + unsigned NumElems = MI.getNumOperands() - 1; + Builder.setInstrAndDebugLoc(MI); + for (unsigned Idx = 0; Idx < NumElems; ++Idx) { + Register DstReg = MI.getOperand(Idx).getReg(); + Builder.buildConstant(DstReg, Csts[Idx]); + } + + MI.eraseFromParent(); + return true; +} + +bool CombinerHelper::matchCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && + "Expected an unmerge"); + // Check that all the lanes are dead except the first one. + for (unsigned Idx = 1, EndIdx = MI.getNumDefs(); Idx != EndIdx; ++Idx) { + if (!MRI.use_nodbg_empty(MI.getOperand(Idx).getReg())) + return false; + } + return true; +} + +bool CombinerHelper::applyCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI) { + Builder.setInstrAndDebugLoc(MI); + Register SrcReg = MI.getOperand(MI.getNumDefs()).getReg(); + // Truncating a vector is going to truncate every single lane, + // whereas we want the full lowbits. + // Do the operation on a scalar instead. 
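A scalar picture of the two unmerge combines above, with a 64-bit value split into two 32-bit pieces: unmerging a freshly merged value returns the original sources, and unmerging a constant is the same truncate-then-shift loop as the Val.trunc(ShiftAmt) / Val.lshr(ShiftAmt) code:

  #include <cassert>
  #include <cstdint>

  int main() {
    // G_UNMERGE_VALUES(G_MERGE_VALUES Lo, Hi) == Lo, Hi
    uint32_t Lo = 0x11111111u, Hi = 0x22222222u;
    uint64_t Merged = (uint64_t(Hi) << 32) | Lo;
    assert(uint32_t(Merged) == Lo && uint32_t(Merged >> 32) == Hi);

    // Unmerging the constant 0x2222222211111111: truncate, shift, repeat.
    uint64_t Val = 0x2222222211111111ull;
    assert(uint32_t(Val) == 0x11111111u);       // first def
    assert(uint32_t(Val >> 32) == 0x22222222u); // second def
    return 0;
  }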
+ LLT SrcTy = MRI.getType(SrcReg); + if (SrcTy.isVector()) + SrcReg = + Builder.buildCast(LLT::scalar(SrcTy.getSizeInBits()), SrcReg).getReg(0); + + Register Dst0Reg = MI.getOperand(0).getReg(); + LLT Dst0Ty = MRI.getType(Dst0Reg); + if (Dst0Ty.isVector()) { + auto MIB = Builder.buildTrunc(LLT::scalar(Dst0Ty.getSizeInBits()), SrcReg); + Builder.buildCast(Dst0Reg, MIB); + } else + Builder.buildTrunc(Dst0Reg, SrcReg); + MI.eraseFromParent(); + return true; +} + +bool CombinerHelper::matchCombineUnmergeZExtToZExt(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && + "Expected an unmerge"); + Register Dst0Reg = MI.getOperand(0).getReg(); + LLT Dst0Ty = MRI.getType(Dst0Reg); + // G_ZEXT on vector applies to each lane, so it will + // affect all destinations. Therefore we won't be able + // to simplify the unmerge to just the first definition. + if (Dst0Ty.isVector()) + return false; + Register SrcReg = MI.getOperand(MI.getNumDefs()).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + if (SrcTy.isVector()) + return false; + + Register ZExtSrcReg; + if (!mi_match(SrcReg, MRI, m_GZExt(m_Reg(ZExtSrcReg)))) + return false; + + // Finally we can replace the first definition with + // a zext of the source if the definition is big enough to hold + // all of ZExtSrc bits. + LLT ZExtSrcTy = MRI.getType(ZExtSrcReg); + return ZExtSrcTy.getSizeInBits() <= Dst0Ty.getSizeInBits(); +} + +bool CombinerHelper::applyCombineUnmergeZExtToZExt(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && + "Expected an unmerge"); + + Register Dst0Reg = MI.getOperand(0).getReg(); + + MachineInstr *ZExtInstr = + MRI.getVRegDef(MI.getOperand(MI.getNumDefs()).getReg()); + assert(ZExtInstr && ZExtInstr->getOpcode() == TargetOpcode::G_ZEXT && + "Expecting a G_ZEXT"); + + Register ZExtSrcReg = ZExtInstr->getOperand(1).getReg(); + LLT Dst0Ty = MRI.getType(Dst0Reg); + LLT ZExtSrcTy = MRI.getType(ZExtSrcReg); + + Builder.setInstrAndDebugLoc(MI); + + if (Dst0Ty.getSizeInBits() > ZExtSrcTy.getSizeInBits()) { + Builder.buildZExt(Dst0Reg, ZExtSrcReg); + } else { + assert(Dst0Ty.getSizeInBits() == ZExtSrcTy.getSizeInBits() && + "ZExt src doesn't fit in destination"); + replaceRegWith(MRI, Dst0Reg, ZExtSrcReg); + } + + Register ZeroReg; + for (unsigned Idx = 1, EndIdx = MI.getNumDefs(); Idx != EndIdx; ++Idx) { + if (!ZeroReg) + ZeroReg = Builder.buildConstant(Dst0Ty, 0).getReg(0); + replaceRegWith(MRI, MI.getOperand(Idx).getReg(), ZeroReg); + } + MI.eraseFromParent(); + return true; +} + bool CombinerHelper::matchCombineShiftToUnmerge(MachineInstr &MI, unsigned TargetShiftSize, unsigned &ShiftVal) { @@ -1821,6 +2071,118 @@ bool CombinerHelper::applyCombineExtOfExt( return false; } +bool CombinerHelper::applyCombineMulByNegativeOne(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL"); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(DstReg); + + Builder.setInstrAndDebugLoc(MI); + Builder.buildSub(DstReg, Builder.buildConstant(DstTy, 0), SrcReg, + MI.getFlags()); + MI.eraseFromParent(); + return true; +} + +bool CombinerHelper::matchCombineFNegOfFNeg(MachineInstr &MI, Register &Reg) { + assert(MI.getOpcode() == TargetOpcode::G_FNEG && "Expected a G_FNEG"); + Register SrcReg = MI.getOperand(1).getReg(); + return mi_match(SrcReg, MRI, m_GFNeg(m_Reg(Reg))); +} + +bool CombinerHelper::matchCombineFAbsOfFAbs(MachineInstr &MI, Register &Src) { + assert(MI.getOpcode() == TargetOpcode::G_FABS 
&& "Expected a G_FABS"); + Src = MI.getOperand(1).getReg(); + Register AbsSrc; + return mi_match(Src, MRI, m_GFabs(m_Reg(AbsSrc))); +} + +bool CombinerHelper::applyCombineFAbsOfFAbs(MachineInstr &MI, Register &Src) { + assert(MI.getOpcode() == TargetOpcode::G_FABS && "Expected a G_FABS"); + Register Dst = MI.getOperand(0).getReg(); + MI.eraseFromParent(); + replaceRegWith(MRI, Dst, Src); + return true; +} + +bool CombinerHelper::matchCombineTruncOfExt( + MachineInstr &MI, std::pair &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Expected a G_TRUNC"); + Register SrcReg = MI.getOperand(1).getReg(); + MachineInstr *SrcMI = MRI.getVRegDef(SrcReg); + unsigned SrcOpc = SrcMI->getOpcode(); + if (SrcOpc == TargetOpcode::G_ANYEXT || SrcOpc == TargetOpcode::G_SEXT || + SrcOpc == TargetOpcode::G_ZEXT) { + MatchInfo = std::make_pair(SrcMI->getOperand(1).getReg(), SrcOpc); + return true; + } + return false; +} + +bool CombinerHelper::applyCombineTruncOfExt( + MachineInstr &MI, std::pair &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Expected a G_TRUNC"); + Register SrcReg = MatchInfo.first; + unsigned SrcExtOp = MatchInfo.second; + Register DstReg = MI.getOperand(0).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + LLT DstTy = MRI.getType(DstReg); + if (SrcTy == DstTy) { + MI.eraseFromParent(); + replaceRegWith(MRI, DstReg, SrcReg); + return true; + } + Builder.setInstrAndDebugLoc(MI); + if (SrcTy.getSizeInBits() < DstTy.getSizeInBits()) + Builder.buildInstr(SrcExtOp, {DstReg}, {SrcReg}); + else + Builder.buildTrunc(DstReg, SrcReg); + MI.eraseFromParent(); + return true; +} + +bool CombinerHelper::matchCombineTruncOfShl( + MachineInstr &MI, std::pair &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Expected a G_TRUNC"); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(DstReg); + Register ShiftSrc; + Register ShiftAmt; + + if (MRI.hasOneNonDBGUse(SrcReg) && + mi_match(SrcReg, MRI, m_GShl(m_Reg(ShiftSrc), m_Reg(ShiftAmt))) && + isLegalOrBeforeLegalizer( + {TargetOpcode::G_SHL, + {DstTy, getTargetLowering().getPreferredShiftAmountTy(DstTy)}})) { + KnownBits Known = KB->getKnownBits(ShiftAmt); + unsigned Size = DstTy.getSizeInBits(); + if (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size)) { + MatchInfo = std::make_pair(ShiftSrc, ShiftAmt); + return true; + } + } + return false; +} + +bool CombinerHelper::applyCombineTruncOfShl( + MachineInstr &MI, std::pair &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Expected a G_TRUNC"); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(DstReg); + MachineInstr *SrcMI = MRI.getVRegDef(SrcReg); + + Register ShiftSrc = MatchInfo.first; + Register ShiftAmt = MatchInfo.second; + Builder.setInstrAndDebugLoc(MI); + auto TruncShiftSrc = Builder.buildTrunc(DstTy, ShiftSrc); + auto TruncShiftAmt = Builder.buildTrunc(DstTy, ShiftAmt); + Builder.buildShl(DstReg, TruncShiftSrc, TruncShiftAmt, SrcMI->getFlags()); + MI.eraseFromParent(); + return true; +} + bool CombinerHelper::matchAnyExplicitUseIsUndef(MachineInstr &MI) { return any_of(MI.explicit_uses(), [this](const MachineOperand &MO) { return MO.isReg() && @@ -1989,6 +2351,12 @@ bool CombinerHelper::matchOperandIsZero(MachineInstr &MI, unsigned OpIdx) { MRI); } +bool CombinerHelper::matchOperandIsUndef(MachineInstr &MI, unsigned OpIdx) { + MachineOperand &MO = MI.getOperand(OpIdx); + return 
MO.isReg() && + getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, MO.getReg(), MRI); +} + bool CombinerHelper::replaceInstWithFConstant(MachineInstr &MI, double C) { assert(MI.getNumDefs() == 1 && "Expected only one def?"); Builder.setInstr(MI); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index cce0ca938c9fe..22c5d3c40dd90 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackProtector.h" +#include "llvm/CodeGen/SwitchLoweringUtils.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" @@ -49,11 +50,13 @@ #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -71,6 +74,7 @@ #include "llvm/Target/TargetMachine.h" #include #include +#include #include #include #include @@ -111,7 +115,8 @@ static void reportTranslationError(MachineFunction &MF, ORE.emit(R); } -IRTranslator::IRTranslator() : MachineFunctionPass(ID) { } +IRTranslator::IRTranslator(CodeGenOpt::Level optlevel) + : MachineFunctionPass(ID), OptLevel(optlevel) {} #ifndef NDEBUG namespace { @@ -155,6 +160,8 @@ void IRTranslator::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); AU.addRequired(); + if (OptLevel != CodeGenOpt::None) + AU.addRequired(); getSelectionDAGFallbackAnalysisUsage(AU); MachineFunctionPass::getAnalysisUsage(AU); } @@ -360,28 +367,276 @@ bool IRTranslator::translateRet(const User &U, MachineIRBuilder &MIRBuilder) { return CLI->lowerReturn(MIRBuilder, Ret, VRegs, SwiftErrorVReg); } +void IRTranslator::emitBranchForMergedCondition( + const Value *Cond, MachineBasicBlock *TBB, MachineBasicBlock *FBB, + MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, + BranchProbability TProb, BranchProbability FProb, bool InvertCond) { + // If the leaf of the tree is a comparison, merge the condition into + // the caseblock. + if (const CmpInst *BOp = dyn_cast(Cond)) { + CmpInst::Predicate Condition; + if (const ICmpInst *IC = dyn_cast(Cond)) { + Condition = InvertCond ? IC->getInversePredicate() : IC->getPredicate(); + } else { + const FCmpInst *FC = cast(Cond); + Condition = InvertCond ? FC->getInversePredicate() : FC->getPredicate(); + } + + SwitchCG::CaseBlock CB(Condition, false, BOp->getOperand(0), + BOp->getOperand(1), nullptr, TBB, FBB, CurBB, + CurBuilder->getDebugLoc(), TProb, FProb); + SL->SwitchCases.push_back(CB); + return; + } + + // Create a CaseBlock record representing this branch. + CmpInst::Predicate Pred = InvertCond ? 
CmpInst::ICMP_NE : CmpInst::ICMP_EQ; + SwitchCG::CaseBlock CB( + Pred, false, Cond, ConstantInt::getTrue(MF->getFunction().getContext()), + nullptr, TBB, FBB, CurBB, CurBuilder->getDebugLoc(), TProb, FProb); + SL->SwitchCases.push_back(CB); +} + +static bool isValInBlock(const Value *V, const BasicBlock *BB) { + if (const Instruction *I = dyn_cast(V)) + return I->getParent() == BB; + return true; +} + +void IRTranslator::findMergedConditions( + const Value *Cond, MachineBasicBlock *TBB, MachineBasicBlock *FBB, + MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, + Instruction::BinaryOps Opc, BranchProbability TProb, + BranchProbability FProb, bool InvertCond) { + using namespace PatternMatch; + assert((Opc == Instruction::And || Opc == Instruction::Or) && + "Expected Opc to be AND/OR"); + // Skip over not part of the tree and remember to invert op and operands at + // next level. + Value *NotCond; + if (match(Cond, m_OneUse(m_Not(m_Value(NotCond)))) && + isValInBlock(NotCond, CurBB->getBasicBlock())) { + findMergedConditions(NotCond, TBB, FBB, CurBB, SwitchBB, Opc, TProb, FProb, + !InvertCond); + return; + } + + const Instruction *BOp = dyn_cast(Cond); + // Compute the effective opcode for Cond, taking into account whether it needs + // to be inverted, e.g. + // and (not (or A, B)), C + // gets lowered as + // and (and (not A, not B), C) + unsigned BOpc = 0; + if (BOp) { + BOpc = BOp->getOpcode(); + if (InvertCond) { + if (BOpc == Instruction::And) + BOpc = Instruction::Or; + else if (BOpc == Instruction::Or) + BOpc = Instruction::And; + } + } + + // If this node is not part of the or/and tree, emit it as a branch. + if (!BOp || !(isa(BOp) || isa(BOp)) || + BOpc != static_cast(Opc) || !BOp->hasOneUse() || + BOp->getParent() != CurBB->getBasicBlock() || + !isValInBlock(BOp->getOperand(0), CurBB->getBasicBlock()) || + !isValInBlock(BOp->getOperand(1), CurBB->getBasicBlock())) { + emitBranchForMergedCondition(Cond, TBB, FBB, CurBB, SwitchBB, TProb, FProb, + InvertCond); + return; + } + + // Create TmpBB after CurBB. + MachineFunction::iterator BBI(CurBB); + MachineBasicBlock *TmpBB = + MF->CreateMachineBasicBlock(CurBB->getBasicBlock()); + CurBB->getParent()->insert(++BBI, TmpBB); + + if (Opc == Instruction::Or) { + // Codegen X | Y as: + // BB1: + // jmp_if_X TBB + // jmp TmpBB + // TmpBB: + // jmp_if_Y TBB + // jmp FBB + // + + // We have flexibility in setting Prob for BB1 and Prob for TmpBB. + // The requirement is that + // TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB) + // = TrueProb for original BB. + // Assuming the original probabilities are A and B, one choice is to set + // BB1's probabilities to A/2 and A/2+B, and set TmpBB's probabilities to + // A/(1+B) and 2B/(1+B). This choice assumes that + // TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB. + // Another choice is to assume TrueProb for BB1 equals to TrueProb for + // TmpBB, but the math is more complicated. + + auto NewTrueProb = TProb / 2; + auto NewFalseProb = TProb / 2 + FProb; + // Emit the LHS condition. + findMergedConditions(BOp->getOperand(0), TBB, TmpBB, CurBB, SwitchBB, Opc, + NewTrueProb, NewFalseProb, InvertCond); + + // Normalize A/2 and B to get A/(1+B) and 2B/(1+B). + SmallVector Probs{TProb / 2, FProb}; + BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end()); + // Emit the RHS condition into TmpBB. 
+ findMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc, + Probs[0], Probs[1], InvertCond); + } else { + assert(Opc == Instruction::And && "Unknown merge op!"); + // Codegen X & Y as: + // BB1: + // jmp_if_X TmpBB + // jmp FBB + // TmpBB: + // jmp_if_Y TBB + // jmp FBB + // + // This requires creation of TmpBB after CurBB. + + // We have flexibility in setting Prob for BB1 and Prob for TmpBB. + // The requirement is that + // FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB) + // = FalseProb for original BB. + // Assuming the original probabilities are A and B, one choice is to set + // BB1's probabilities to A+B/2 and B/2, and set TmpBB's probabilities to + // 2A/(1+A) and B/(1+A). This choice assumes that FalseProb for BB1 == + // TrueProb for BB1 * FalseProb for TmpBB. + + auto NewTrueProb = TProb + FProb / 2; + auto NewFalseProb = FProb / 2; + // Emit the LHS condition. + findMergedConditions(BOp->getOperand(0), TmpBB, FBB, CurBB, SwitchBB, Opc, + NewTrueProb, NewFalseProb, InvertCond); + + // Normalize A and B/2 to get 2A/(1+A) and B/(1+A). + SmallVector Probs{TProb, FProb / 2}; + BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end()); + // Emit the RHS condition into TmpBB. + findMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc, + Probs[0], Probs[1], InvertCond); + } +} + +bool IRTranslator::shouldEmitAsBranches( + const std::vector &Cases) { + // For multiple cases, it's better to emit as branches. + if (Cases.size() != 2) + return true; + + // If this is two comparisons of the same values or'd or and'd together, they + // will get folded into a single comparison, so don't emit two blocks. + if ((Cases[0].CmpLHS == Cases[1].CmpLHS && + Cases[0].CmpRHS == Cases[1].CmpRHS) || + (Cases[0].CmpRHS == Cases[1].CmpLHS && + Cases[0].CmpLHS == Cases[1].CmpRHS)) { + return false; + } + + // Handle: (X != null) | (Y != null) --> (X|Y) != 0 + // Handle: (X == null) & (Y == null) --> (X|Y) == 0 + if (Cases[0].CmpRHS == Cases[1].CmpRHS && + Cases[0].PredInfo.Pred == Cases[1].PredInfo.Pred && + isa(Cases[0].CmpRHS) && + cast(Cases[0].CmpRHS)->isNullValue()) { + if (Cases[0].PredInfo.Pred == CmpInst::ICMP_EQ && + Cases[0].TrueBB == Cases[1].ThisBB) + return false; + if (Cases[0].PredInfo.Pred == CmpInst::ICMP_NE && + Cases[0].FalseBB == Cases[1].ThisBB) + return false; + } + + return true; +} + bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) { const BranchInst &BrInst = cast(U); - unsigned Succ = 0; - if (!BrInst.isUnconditional()) { - // We want a G_BRCOND to the true BB followed by an unconditional branch. - Register Tst = getOrCreateVReg(*BrInst.getCondition()); - const BasicBlock &TrueTgt = *cast(BrInst.getSuccessor(Succ++)); - MachineBasicBlock &TrueBB = getMBB(TrueTgt); - MIRBuilder.buildBrCond(Tst, TrueBB); + auto &CurMBB = MIRBuilder.getMBB(); + auto *Succ0MBB = &getMBB(*BrInst.getSuccessor(0)); + + if (BrInst.isUnconditional()) { + // If the unconditional target is the layout successor, fallthrough. + if (!CurMBB.isLayoutSuccessor(Succ0MBB)) + MIRBuilder.buildBr(*Succ0MBB); + + // Link successors. + for (const BasicBlock *Succ : successors(&BrInst)) + CurMBB.addSuccessor(&getMBB(*Succ)); + return true; } - const BasicBlock &BrTgt = *cast(BrInst.getSuccessor(Succ)); - MachineBasicBlock &TgtBB = getMBB(BrTgt); - MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + // If this condition is one of the special cases we handle, do special stuff + // now. 
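To put numbers on the Or-case probability bookkeeping above: suppose the original block reaches TBB with probability A = 0.6 and FBB with B = 0.4. BB1 is then given A/2 = 0.3 toward TBB and A/2 + B = 0.7 toward TmpBB, and TmpBB's {A/2, B} pair normalizes to 0.3/0.7 ≈ 0.43 toward TBB and 0.4/0.7 ≈ 0.57 toward FBB. The stated invariant checks out: 0.3 + 0.7 × 0.43 ≈ 0.6, the original TBB probability; equivalently, A/(1+B) = 0.6/1.4 ≈ 0.43.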
+ const Value *CondVal = BrInst.getCondition(); + MachineBasicBlock *Succ1MBB = &getMBB(*BrInst.getSuccessor(1)); - // If the unconditional target is the layout successor, fallthrough. - if (!CurBB.isLayoutSuccessor(&TgtBB)) - MIRBuilder.buildBr(TgtBB); + const auto &TLI = *MF->getSubtarget().getTargetLowering(); - // Link successors. - for (const BasicBlock *Succ : successors(&BrInst)) - CurBB.addSuccessor(&getMBB(*Succ)); + // If this is a series of conditions that are or'd or and'd together, emit + // this as a sequence of branches instead of setcc's with and/or operations. + // As long as jumps are not expensive (exceptions for multi-use logic ops, + // unpredictable branches, and vector extracts because those jumps are likely + // expensive for any target), this should improve performance. + // For example, instead of something like: + // cmp A, B + // C = seteq + // cmp D, E + // F = setle + // or C, F + // jnz foo + // Emit: + // cmp A, B + // je foo + // cmp D, E + // jle foo + using namespace PatternMatch; + if (const BinaryOperator *BOp = dyn_cast(CondVal)) { + Instruction::BinaryOps Opcode = BOp->getOpcode(); + Value *Vec, *BOp0 = BOp->getOperand(0), *BOp1 = BOp->getOperand(1); + if (!TLI.isJumpExpensive() && BOp->hasOneUse() && + !BrInst.hasMetadata(LLVMContext::MD_unpredictable) && + (Opcode == Instruction::And || Opcode == Instruction::Or) && + !(match(BOp0, m_ExtractElt(m_Value(Vec), m_Value())) && + match(BOp1, m_ExtractElt(m_Specific(Vec), m_Value())))) { + findMergedConditions(BOp, Succ0MBB, Succ1MBB, &CurMBB, &CurMBB, Opcode, + getEdgeProbability(&CurMBB, Succ0MBB), + getEdgeProbability(&CurMBB, Succ1MBB), + /*InvertCond=*/false); + assert(SL->SwitchCases[0].ThisBB == &CurMBB && "Unexpected lowering!"); + + // Allow some cases to be rejected. + if (shouldEmitAsBranches(SL->SwitchCases)) { + // Emit the branch for this block. + emitSwitchCase(SL->SwitchCases[0], &CurMBB, *CurBuilder); + SL->SwitchCases.erase(SL->SwitchCases.begin()); + return true; + } + + // Okay, we decided not to do this, remove any inserted MBB's and clear + // SwitchCases. + for (unsigned I = 1, E = SL->SwitchCases.size(); I != E; ++I) + MF->erase(SL->SwitchCases[I].ThisBB); + + SL->SwitchCases.clear(); + } + } + + // Create a CaseBlock record representing this branch. + SwitchCG::CaseBlock CB(CmpInst::ICMP_EQ, false, CondVal, + ConstantInt::getTrue(MF->getFunction().getContext()), + nullptr, Succ0MBB, Succ1MBB, &CurMBB, + CurBuilder->getDebugLoc()); + + // Use emitSwitchCase to actually insert the fast branch sequence for this + // cond branch. + emitSwitchCase(CB, &CurMBB, *CurBuilder); return true; } @@ -567,8 +822,23 @@ void IRTranslator::emitSwitchCase(SwitchCG::CaseBlock &CB, const LLT i1Ty = LLT::scalar(1); // Build the compare. if (!CB.CmpMHS) { - Register CondRHS = getOrCreateVReg(*CB.CmpRHS); - Cond = MIB.buildICmp(CB.PredInfo.Pred, i1Ty, CondLHS, CondRHS).getReg(0); + const auto *CI = dyn_cast(CB.CmpRHS); + // For conditional branch lowering, we might try to do something silly like + // emit an G_ICMP to compare an existing G_ICMP i1 result with true. If so, + // just re-use the existing condition vreg. 
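The re-use described in the comment above rests on a one-line identity: for an i1 value c, icmp eq c, true is just c, so materializing a fresh G_ICMP would only compare a boolean against 1. Trivially checkable:

  #include <cassert>

  int main() {
    for (bool C : {false, true})
      assert((C == true) == C); // icmp eq i1 %c, true ==> %c
    return 0;
  }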
+ if (CI && CI->getZExtValue() == 1 && + MRI->getType(CondLHS).getSizeInBits() == 1 && + CB.PredInfo.Pred == CmpInst::ICMP_EQ) { + Cond = CondLHS; + } else { + Register CondRHS = getOrCreateVReg(*CB.CmpRHS); + if (CmpInst::isFPPredicate(CB.PredInfo.Pred)) + Cond = + MIB.buildFCmp(CB.PredInfo.Pred, i1Ty, CondLHS, CondRHS).getReg(0); + else + Cond = + MIB.buildICmp(CB.PredInfo.Pred, i1Ty, CondLHS, CondRHS).getReg(0); + } } else { assert(CB.PredInfo.Pred == CmpInst::ICMP_SLE && "Can only handle SLE ranges"); @@ -601,17 +871,8 @@ void IRTranslator::emitSwitchCase(SwitchCG::CaseBlock &CB, addSuccessorWithProb(CB.ThisBB, CB.FalseBB, CB.FalseProb); CB.ThisBB->normalizeSuccProbs(); - // if (SwitchBB->getBasicBlock() != CB.FalseBB->getBasicBlock()) - addMachineCFGPred({SwitchBB->getBasicBlock(), CB.FalseBB->getBasicBlock()}, - CB.ThisBB); - - // If the lhs block is the next block, invert the condition so that we can - // fall through to the lhs instead of the rhs block. - if (CB.TrueBB == CB.ThisBB->getNextNode()) { - std::swap(CB.TrueBB, CB.FalseBB); - auto True = MIB.buildConstant(i1Ty, 1); - Cond = MIB.buildXor(i1Ty, Cond, True).getReg(0); - } + addMachineCFGPred({SwitchBB->getBasicBlock(), CB.FalseBB->getBasicBlock()}, + CB.ThisBB); MIB.buildBrCond(Cond, *CB.TrueBB); MIB.buildBr(*CB.FalseBB); @@ -2590,6 +2851,10 @@ void IRTranslator::finalizeBasicBlock() { emitJumpTable(JTCase.second, JTCase.second.MBB); } SL->JTCases.clear(); + + for (auto &SwCase : SL->SwitchCases) + emitSwitchCase(SwCase, &CurBuilder->getMBB(), *CurBuilder); + SL->SwitchCases.clear(); } void IRTranslator::finalizeFunction() { @@ -2651,14 +2916,21 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { MRI = &MF->getRegInfo(); DL = &F.getParent()->getDataLayout(); ORE = std::make_unique(&F); + const TargetMachine &TM = MF->getTarget(); + TM.resetTargetOptions(F); + EnableOpts = OptLevel != CodeGenOpt::None && !skipFunction(F); FuncInfo.MF = MF; - FuncInfo.BPI = nullptr; + if (EnableOpts) + FuncInfo.BPI = &getAnalysis().getBPI(); + else + FuncInfo.BPI = nullptr; + const auto &TLI = *MF->getSubtarget().getTargetLowering(); - const TargetMachine &TM = MF->getTarget(); + SL = std::make_unique(this, FuncInfo); SL->init(TLI, TM, *DL); - EnableOpts = TM.getOptLevel() != CodeGenOpt::None && !skipFunction(F); + assert(PendingPHIs.empty() && "stale PHIs"); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp index 17bce517814de..9ca6d9a9a5517 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp @@ -10,6 +10,17 @@ // //===----------------------------------------------------------------------===// +// Enable optimizations to work around MSVC debug mode bug in 32-bit: +// https://developercommunity.visualstudio.com/content/problem/1179643/msvc-copies-overaligned-non-trivially-copyable-par.html +// FIXME: Remove this when the issue is closed. +#if defined(_MSC_VER) && !defined(__clang__) && defined(_M_IX86) +// We have to disable runtime checks in order to enable optimizations. This is +// done for the entire file because the problem is actually observed in STL +// template functions. 
+#pragma runtime_checks("", off)
+#pragma optimize("gs", on)
+#endif
+
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
 using namespace llvm;
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 347fe7b0ee98d..e8ddfc8e083ed 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2033,7 +2033,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
 return UnableToLegalize;
 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- if (!isPowerOf2_32(Ty.getSizeInBits()))
+ if (!Ty.isScalar())
 return UnableToLegalize;
 Observer.changingInstr(MI);
@@ -3285,7 +3285,7 @@ LegalizerHelper::fewerElementsVectorCasts(MachineInstr &MI, unsigned TypeIdx,
 if (NumParts * NarrowTy.getNumElements() != DstTy.getNumElements())
 return UnableToLegalize;
- NarrowTy1 = LLT::vector(NumParts, SrcTy.getElementType().getSizeInBits());
+ NarrowTy1 = LLT::vector(NarrowTy.getNumElements(), SrcTy.getElementType());
 } else {
 NumParts = DstTy.getNumElements();
 NarrowTy1 = SrcTy.getElementType();
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index 6f8d233043e70..070a45951fed1 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/Target/TargetMachine.h"
 #define DEBUG_TYPE "globalisel-utils"
@@ -470,7 +471,8 @@ bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI,
 if (!DefMI)
 return false;
- if (DefMI->getFlag(MachineInstr::FmNoNans))
+ const TargetMachine& TM = DefMI->getMF()->getTarget();
+ if (DefMI->getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath)
 return true;
 if (SNaN) {
@@ -740,3 +742,15 @@ bool llvm::isConstTrueVal(const TargetLowering &TLI, int64_t Val, bool IsVector,
 }
 llvm_unreachable("Invalid boolean contents");
 }
+
+int64_t llvm::getICmpTrueVal(const TargetLowering &TLI, bool IsVector,
+ bool IsFP) {
+ switch (TLI.getBooleanContents(IsVector, IsFP)) {
+ case TargetLowering::UndefinedBooleanContent:
+ case TargetLowering::ZeroOrOneBooleanContent:
+ return 1;
+ case TargetLowering::ZeroOrNegativeOneBooleanContent:
+ return -1;
+ }
+ llvm_unreachable("Invalid boolean contents");
+}
diff --git a/llvm/lib/CodeGen/ImplicitNullChecks.cpp b/llvm/lib/CodeGen/ImplicitNullChecks.cpp
index dc1b0a867b0d6..c2b764e5580ce 100644
--- a/llvm/lib/CodeGen/ImplicitNullChecks.cpp
+++ b/llvm/lib/CodeGen/ImplicitNullChecks.cpp
@@ -204,13 +204,12 @@ class ImplicitNullChecks : public MachineFunctionPass {
 /// if it was hoisted to the NullCheck block. This is used by caller
 /// canHoistInst to decide if DependenceMI can be hoisted safely.
 bool canDependenceHoistingClobberLiveIns(MachineInstr *DependenceMI,
- MachineBasicBlock *NullSucc,
- unsigned PointerReg);
+ MachineBasicBlock *NullSucc);
 /// Return true if \p FaultingMI can be hoisted from after the
 /// instructions in \p InstsSeenSoFar to before them. Set \p Dependence to a
 /// non-null value if we also need to (and legally can) hoist a dependency.
- bool canHoistInst(MachineInstr *FaultingMI, unsigned PointerReg,
+ bool canHoistInst(MachineInstr *FaultingMI,
 ArrayRef<MachineInstr *> InstsSeenSoFar,
 MachineBasicBlock *NullSucc, MachineInstr *&Dependence);
@@ -374,10 +373,14 @@ ImplicitNullChecks::isSuitableMemoryOp(const MachineInstr &MI,
 bool OffsetIsScalable;
 const MachineOperand *BaseOp;
+ // Implementation restriction for faulting_op insertion
+ // TODO: This could be relaxed if we find a test case which warrants it.
+ if (MI.getDesc().getNumDefs() > 1)
+ return SR_Unsuitable;
 // FIXME: This handles only simple addressing mode.
 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
- return SR_Unsuitable;
+ return SR_Unsuitable;
 // We need the base of the memory instruction to be same as the register
 // where the null check is performed (i.e. PointerReg).
@@ -409,8 +412,7 @@ ImplicitNullChecks::isSuitableMemoryOp(const MachineInstr &MI,
 }
 bool ImplicitNullChecks::canDependenceHoistingClobberLiveIns(
- MachineInstr *DependenceMI, MachineBasicBlock *NullSucc,
- unsigned PointerReg) {
+ MachineInstr *DependenceMI, MachineBasicBlock *NullSucc) {
 for (auto &DependenceMO : DependenceMI->operands()) {
 if (!(DependenceMO.isReg() && DependenceMO.getReg()))
 continue;
@@ -435,12 +437,6 @@ bool ImplicitNullChecks::canDependenceHoistingClobberLiveIns(
 if (AnyAliasLiveIn(TRI, NullSucc, DependenceMO.getReg()))
 return true;
- // The Dependency can't be re-defining the base register -- then we won't
- // get the memory operation on the address we want. This is already
- // checked in \c IsSuitableMemoryOp.
- assert(!(DependenceMO.isDef() &&
- TRI->regsOverlap(DependenceMO.getReg(), PointerReg)) &&
- "Should have been checked before!");
 }
 // The dependence does not clobber live-ins in NullSucc block.
@@ -448,7 +444,6 @@ bool ImplicitNullChecks::canDependenceHoistingClobberLiveIns(
 }
 bool ImplicitNullChecks::canHoistInst(MachineInstr *FaultingMI,
- unsigned PointerReg,
 ArrayRef<MachineInstr *> InstsSeenSoFar,
 MachineBasicBlock *NullSucc,
 MachineInstr *&Dependence) {
@@ -473,7 +468,7 @@ bool ImplicitNullChecks::canHoistInst(MachineInstr *FaultingMI,
 if (DependenceMI->mayLoadOrStore())
 return false;
- if (canDependenceHoistingClobberLiveIns(DependenceMI, NullSucc, PointerReg))
+ if (canDependenceHoistingClobberLiveIns(DependenceMI, NullSucc))
 return false;
 auto DepDepResult =
@@ -511,9 +506,9 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
 MBP.Predicate == MachineBranchPredicate::PRED_EQ)))
 return false;
- // If we cannot erase the test instruction itself, then making the null check
- // implicit does not buy us much.
- if (!MBP.SingleUseCondition)
+ // If there is a separate condition generation instruction, we choose not to
+ // transform unless we can remove both the condition and the consuming branch.
+ if (MBP.ConditionDef && !MBP.SingleUseCondition)
 return false;
 MachineBasicBlock *NotNullSucc, *NullSucc;
@@ -531,32 +526,34 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
 if (NotNullSucc->pred_size() != 1)
 return false;
- // To prevent the invalid transformation of the following code:
- //
- // mov %rax, %rcx
- // test %rax, %rax
- // %rax = ...
- // je throw_npe
- // mov(%rcx), %r9
- // mov(%rax), %r10
- //
- // into:
- //
- // mov %rax, %rcx
- // %rax = ....
- // faulting_load_op("movl (%rax), %r10", throw_npe)
- // mov(%rcx), %r9
- //
- // we must ensure that there are no instructions between the 'test' and
- // conditional jump that modify %rax.
 const Register PointerReg = MBP.LHS.getReg();
- assert(MBP.ConditionDef->getParent() == &MBB && "Should be in basic block");
-
- for (auto I = MBB.rbegin(); MBP.ConditionDef != &*I; ++I)
- if (I->modifiesRegister(PointerReg, TRI))
- return false;
+ if (MBP.ConditionDef) {
+ // To prevent the invalid transformation of the following code:
+ //
+ // mov %rax, %rcx
+ // test %rax, %rax
+ // %rax = ...
+ // je throw_npe
+ // mov(%rcx), %r9
+ // mov(%rax), %r10
+ //
+ // into:
+ //
+ // mov %rax, %rcx
+ // %rax = ....
+ // faulting_load_op("movl (%rax), %r10", throw_npe)
+ // mov(%rcx), %r9
+ //
+ // we must ensure that there are no instructions between the 'test' and
+ // conditional jump that modify %rax.
+ assert(MBP.ConditionDef->getParent() == &MBB &&
+ "Should be in basic block");
+ for (auto I = MBB.rbegin(); MBP.ConditionDef != &*I; ++I)
+ if (I->modifiesRegister(PointerReg, TRI))
+ return false;
+ }
 // Starting with a code fragment like:
 //
 // test %rax, %rax
@@ -622,17 +619,15 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
 if (SR == SR_Impossible)
 return false;
 if (SR == SR_Suitable &&
- canHoistInst(&MI, PointerReg, InstsSeenSoFar, NullSucc, Dependence)) {
+ canHoistInst(&MI, InstsSeenSoFar, NullSucc, Dependence)) {
 NullCheckList.emplace_back(&MI, MBP.ConditionDef, &MBB, NotNullSucc,
 NullSucc, Dependence);
 return true;
 }
- // If MI re-defines the PointerReg then we cannot move further.
- if (llvm::any_of(MI.operands(), [&](MachineOperand &MO) {
- return MO.isReg() && MO.getReg() && MO.isDef() &&
- TRI->regsOverlap(MO.getReg(), PointerReg);
- }))
+ // If MI re-defines the PointerReg in a way that changes the value of
+ // PointerReg if it was null, then we cannot move further.
+ if (!TII->preservesZeroValueInReg(&MI, PointerReg, TRI))
 return false;
 InstsSeenSoFar.push_back(&MI);
 }
@@ -737,9 +732,11 @@ void ImplicitNullChecks::rewriteNullChecks(
 }
 NC.getMemOperation()->eraseFromParent();
- NC.getCheckOperation()->eraseFromParent();
+ if (auto *CheckOp = NC.getCheckOperation())
+ CheckOp->eraseFromParent();
- // Insert an *unconditional* branch to not-null successor.
+ // Insert an *unconditional* branch to not-null successor - we expect
+ // block placement to remove fallthroughs later.
 TII->insertBranch(*NC.getCheckBlock(), NC.getNotNullSucc(), nullptr,
 /*Cond=*/None, DL);
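A note on the relaxed check above: analyzeBlockForNullChecks now keeps scanning past instructions that redefine PointerReg, provided a null (all-zero) value would stay null. A hedged sketch of a conservative policy for the TII hook it calls (assumed shape and helper name, not the in-tree implementation):

  // An instruction that never writes NullValueReg trivially preserves its
  // zero value; targets extend this to e.g. shifts and masking ANDs, which
  // keep a zero register zero.
  static bool preservesZeroValueInRegDefault(const MachineInstr &MI,
                                             Register NullValueReg,
                                             const TargetRegisterInfo *TRI) {
    return !MI.modifiesRegister(NullValueReg, TRI);
  }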
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index 59e8a5cea1c3c..911ac88c802fc 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -289,8 +289,9 @@ bool InlineSpiller::isSnippet(const LiveInterval &SnipLI) {
 // Check that all uses satisfy our criteria.
 for (MachineRegisterInfo::reg_instr_nodbg_iterator
- RI = MRI.reg_instr_nodbg_begin(SnipLI.reg),
- E = MRI.reg_instr_nodbg_end(); RI != E; ) {
+ RI = MRI.reg_instr_nodbg_begin(SnipLI.reg()),
+ E = MRI.reg_instr_nodbg_end();
+ RI != E;) {
 MachineInstr &MI = *RI++;
 // Allow copies to/from Reg.
@@ -299,11 +300,11 @@ bool InlineSpiller::isSnippet(const LiveInterval &SnipLI) {
 // Allow stack slot loads.
 int FI;
- if (SnipLI.reg == TII.isLoadFromStackSlot(MI, FI) && FI == StackSlot)
+ if (SnipLI.reg() == TII.isLoadFromStackSlot(MI, FI) && FI == StackSlot)
 continue;
 // Allow stack slot stores.
- if (SnipLI.reg == TII.isStoreToStackSlot(MI, FI) && FI == StackSlot)
+ if (SnipLI.reg() == TII.isStoreToStackSlot(MI, FI) && FI == StackSlot)
 continue;
 // Allow a single additional instruction.
@@ -432,7 +433,7 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
 do {
 LiveInterval *LI;
 std::tie(LI, VNI) = WorkList.pop_back_val();
- Register Reg = LI->reg;
+ Register Reg = LI->reg();
 LLVM_DEBUG(dbgs() << "Checking redundant spills for "
 << VNI->id << '@' << VNI->def << " in " << *LI << '\n');
@@ -511,7 +512,7 @@ void InlineSpiller::markValueUsed(LiveInterval *LI, VNInfo *VNI) {
 if (!SnippetCopies.count(MI))
 continue;
 LiveInterval &SnipLI = LIS.getInterval(MI->getOperand(1).getReg());
- assert(isRegToSpill(SnipLI.reg) && "Unexpected register in copy");
+ assert(isRegToSpill(SnipLI.reg()) && "Unexpected register in copy");
 VNInfo *SnipVNI = SnipLI.getVNInfoAt(VNI->def.getRegSlot(true));
 assert(SnipVNI && "Snippet undefined before copy");
 WorkList.push_back(std::make_pair(&SnipLI, SnipVNI));
@@ -556,7 +557,7 @@ bool InlineSpiller::canGuaranteeAssignmentAfterRemat(Register VReg,
 bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
 // Analyze instruction
 SmallVector<std::pair<MachineInstr *, unsigned>, 8> Ops;
- VirtRegInfo RI = AnalyzeVirtRegInBundle(MI, VirtReg.reg, &Ops);
+ VirtRegInfo RI = AnalyzeVirtRegInBundle(MI, VirtReg.reg(), &Ops);
 if (!RI.Reads)
 return false;
@@ -568,7 +569,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
 LLVM_DEBUG(dbgs() << "\tadding flags: ");
 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
 MachineOperand &MO = MI.getOperand(i);
- if (MO.isReg() && MO.isUse() && MO.getReg() == VirtReg.reg)
+ if (MO.isReg() && MO.isUse() && MO.getReg() == VirtReg.reg())
 MO.setIsUndef();
 }
 LLVM_DEBUG(dbgs() << UseIdx << '\t' << MI);
@@ -608,7 +609,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
 // If we can't guarantee that we'll be able to actually assign the new vreg,
 // we can't remat.
- if (!canGuaranteeAssignmentAfterRemat(VirtReg.reg, MI)) {
+ if (!canGuaranteeAssignmentAfterRemat(VirtReg.reg(), MI)) {
 markValueUsed(&VirtReg, ParentVNI);
 LLVM_DEBUG(dbgs() << "\tcannot remat for " << UseIdx << '\t' << MI);
 return false;
@@ -633,7 +634,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
 // Replace operands
 for (const auto &OpPair : Ops) {
 MachineOperand &MO = OpPair.first->getOperand(OpPair.second);
- if (MO.isReg() && MO.isUse() && MO.getReg() == VirtReg.reg) {
+ if (MO.isReg() && MO.isUse() && MO.getReg() == VirtReg.reg()) {
 MO.setReg(NewVReg);
 MO.setIsKill();
 }
@@ -1171,7 +1172,7 @@ void HoistSpillHelper::addToMergeableSpills(MachineInstr &Spill, int StackSlot,
 // save a copy of LiveInterval in StackSlotToOrigLI because the original
 // LiveInterval may be cleared after all its references are spilled.
 if (StackSlotToOrigLI.find(StackSlot) == StackSlotToOrigLI.end()) {
- auto LI = std::make_unique<LiveInterval>(OrigLI.reg, OrigLI.weight);
+ auto LI = std::make_unique<LiveInterval>(OrigLI.reg(), OrigLI.weight());
 LI->assign(OrigLI, Allocator);
 StackSlotToOrigLI[StackSlot] = std::move(LI);
 }
@@ -1199,7 +1200,7 @@ bool HoistSpillHelper::rmFromMergeableSpills(MachineInstr &Spill,
 bool HoistSpillHelper::isSpillCandBB(LiveInterval &OrigLI, VNInfo &OrigVNI,
 MachineBasicBlock &BB, Register &LiveReg) {
 SlotIndex Idx;
- Register OrigReg = OrigLI.reg;
+ Register OrigReg = OrigLI.reg();
 MachineBasicBlock::iterator MI = IPA.getLastInsertPointIter(OrigLI, BB);
 if (MI != BB.end())
 Idx = LIS.getInstructionIndex(*MI);
diff --git a/llvm/lib/CodeGen/InterferenceCache.cpp b/llvm/lib/CodeGen/InterferenceCache.cpp
index 7b50dac4cd1a7..617db0450d02e 100644
--- a/llvm/lib/CodeGen/InterferenceCache.cpp
+++ b/llvm/lib/CodeGen/InterferenceCache.cpp
@@ -12,19 +12,15 @@
 #include "InterferenceCache.h"
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/CodeGen/LiveInterval.h"
-#include "llvm/CodeGen/LiveIntervalUnion.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <cassert>
 #include <cstdint>
-#include <tuple>
 #include <utility>
 using namespace llvm;
diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
index cfaec85d3f3dd..e39811e33e8c6 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
@@ -3114,6 +3114,8 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF,
 bool Changed = TTracker->Transfers.size() != 0;
 delete MTracker;
+ delete TTracker;
+
 MTracker = nullptr;
 VTracker = nullptr;
 TTracker = nullptr;
diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp
index 97cc7a0c30343..bd7024e8f483c 100644
--- a/llvm/lib/CodeGen/LiveDebugVariables.cpp
+++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp
@@ -54,7 +54,6 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
@@ -777,12 +776,12 @@ void UserValue::addDefsFromCopies(
 if (Kills.empty())
 return;
 // Don't track copies from physregs, there are too many uses.
- if (!Register::isVirtualRegister(LI->reg))
+ if (!Register::isVirtualRegister(LI->reg()))
 return;
 // Collect all the (vreg, valno) pairs that are copies of LI.
 SmallVector<std::pair<LiveInterval *, const VNInfo *>, 8> CopyValues;
- for (MachineOperand &MO : MRI.use_nodbg_operands(LI->reg)) {
+ for (MachineOperand &MO : MRI.use_nodbg_operands(LI->reg())) {
 MachineInstr *MI = MO.getParent();
 // Copies of the full value.
 if (MO.getSubReg() || !MI->isCopy())
@@ -1066,7 +1065,7 @@ UserValue::splitLocation(unsigned OldLocNo, ArrayRef<Register> NewRegs,
 LII->start < LocMapI.stop()) {
 // Overlapping correct location. Allocate NewLocNo now.
 if (NewLocNo == UndefLocNo) {
- MachineOperand MO = MachineOperand::CreateReg(LI->reg, false);
+ MachineOperand MO = MachineOperand::CreateReg(LI->reg(), false);
 MO.setSubReg(locations[OldLocNo].getSubReg());
 NewLocNo = getLocationNo(MO);
 DidChange = true;
diff --git a/llvm/lib/CodeGen/LiveInterval.cpp b/llvm/lib/CodeGen/LiveInterval.cpp
index 930dc116205a3..ce0e58772068a 100644
--- a/llvm/lib/CodeGen/LiveInterval.cpp
+++ b/llvm/lib/CodeGen/LiveInterval.cpp
@@ -951,9 +951,9 @@ void LiveInterval::refineSubRanges(
 MatchingRange = createSubRangeFrom(Allocator, Matching, SR);
 // Now that the subrange is split in half, make sure we
 // only keep in the subranges the VNIs that touch the related half.
- stripValuesNotDefiningMask(reg, *MatchingRange, Matching, Indexes, TRI,
+ stripValuesNotDefiningMask(reg(), *MatchingRange, Matching, Indexes, TRI,
 ComposeSubRegIdx);
- stripValuesNotDefiningMask(reg, SR, SR.LaneMask, Indexes, TRI,
+ stripValuesNotDefiningMask(reg(), SR, SR.LaneMask, Indexes, TRI,
 ComposeSubRegIdx);
 }
 Apply(*MatchingRange);
@@ -977,11 +977,11 @@ void LiveInterval::computeSubRangeUndefs(SmallVectorImpl<SlotIndex> &Undefs,
 LaneBitmask LaneMask,
 const MachineRegisterInfo &MRI,
 const SlotIndexes &Indexes) const {
- assert(Register::isVirtualRegister(reg));
- LaneBitmask VRegMask = MRI.getMaxLaneMaskForVReg(reg);
+ assert(Register::isVirtualRegister(reg()));
+ LaneBitmask VRegMask = MRI.getMaxLaneMaskForVReg(reg());
 assert((VRegMask & LaneMask).any());
 const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
- for (const MachineOperand &MO : MRI.def_operands(reg)) {
+ for (const MachineOperand &MO : MRI.def_operands(reg())) {
 if (!MO.isUndef())
 continue;
 unsigned SubReg = MO.getSubReg();
@@ -1043,12 +1043,12 @@ void LiveInterval::SubRange::print(raw_ostream &OS) const {
 }
 void LiveInterval::print(raw_ostream &OS) const {
- OS << printReg(reg) << ' ';
+ OS << printReg(reg()) << ' ';
 super::print(OS);
 // Print subranges
 for (const SubRange &SR : subranges())
 OS << SR;
- OS << " weight:" << weight;
+ OS << " weight:" << Weight;
 }
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1087,7 +1087,7 @@ void LiveInterval::verify(const MachineRegisterInfo *MRI) const {
 // Make sure SubRanges are fine and LaneMasks are disjunct.
 LaneBitmask Mask;
- LaneBitmask MaxMask = MRI != nullptr ? MRI->getMaxLaneMaskForVReg(reg)
+ LaneBitmask MaxMask = MRI != nullptr ? MRI->getMaxLaneMaskForVReg(reg())
 : LaneBitmask::getAll();
 for (const SubRange &SR : subranges()) {
 // Subrange lanemask should be disjunct to any previous subrange masks.
@@ -1361,8 +1361,9 @@ unsigned ConnectedVNInfoEqClasses::Classify(const LiveRange &LR) {
 void ConnectedVNInfoEqClasses::Distribute(LiveInterval &LI, LiveInterval *LIV[],
 MachineRegisterInfo &MRI) {
 // Rewrite instructions.
- for (MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(LI.reg),
- RE = MRI.reg_end(); RI != RE;) {
+ for (MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(LI.reg()),
+ RE = MRI.reg_end();
+ RI != RE;) {
 MachineOperand &MO = *RI;
 MachineInstr *MI = RI->getParent();
 ++RI;
@@ -1382,7 +1383,7 @@ void ConnectedVNInfoEqClasses::Distribute(LiveInterval &LI, LiveInterval *LIV[],
 if (!VNI)
 continue;
 if (unsigned EqClass = getEqClass(VNI))
- MO.setReg(LIV[EqClass-1]->reg);
+ MO.setReg(LIV[EqClass - 1]->reg());
 }
 // Distribute subregister liveranges.
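Most of the churn in these files is mechanical: LiveInterval's public 'reg' and 'weight' members become private state behind reg()/weight() accessors (note how print() above now reads the private 'Weight' field). A simplified stand-alone sketch of the migrated shape, with stand-in types for illustration only:

  using Register = unsigned;
  struct LiveRange {};

  class LiveInterval : public LiveRange {
    Register Reg = 0;    // was the public 'reg' field
    float Weight = 0.0f; // was the public 'weight' field

  public:
    LiveInterval(Register R, float W) : Reg(R), Weight(W) {}
    Register reg() const { return Reg; }
    float weight() const { return Weight; }
    void setWeight(float W) { Weight = W; } // mutation is now explicit
  };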
diff --git a/llvm/lib/CodeGen/LiveIntervalCalc.cpp b/llvm/lib/CodeGen/LiveIntervalCalc.cpp
index 30c2d74a71c53..e8fd069d17a0a 100644
--- a/llvm/lib/CodeGen/LiveIntervalCalc.cpp
+++ b/llvm/lib/CodeGen/LiveIntervalCalc.cpp
@@ -60,7 +60,7 @@ void LiveIntervalCalc::calculate(LiveInterval &LI, bool TrackSubRegs) {
 // Visit all def operands. If the same instruction has multiple defs of Reg,
 // createDeadDef() will deduplicate.
 const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
- unsigned Reg = LI.reg;
+ unsigned Reg = LI.reg();
 for (const MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
 if (!MO.isDef() && !MO.readsReg())
 continue;
@@ -127,7 +127,7 @@ void LiveIntervalCalc::constructMainRangeFromSubranges(LiveInterval &LI) {
 }
 }
 resetLiveOutMap();
- extendToUses(MainRange, LI.reg, LaneBitmask::getAll(), &LI);
+ extendToUses(MainRange, LI.reg(), LaneBitmask::getAll(), &LI);
 }
 void LiveIntervalCalc::createDeadDefs(LiveRange &LR, Register Reg) {
diff --git a/llvm/lib/CodeGen/LiveIntervalUnion.cpp b/llvm/lib/CodeGen/LiveIntervalUnion.cpp
index 43fa8f2d7157a..cccc14e4e8a44 100644
--- a/llvm/lib/CodeGen/LiveIntervalUnion.cpp
+++ b/llvm/lib/CodeGen/LiveIntervalUnion.cpp
@@ -85,8 +85,8 @@ LiveIntervalUnion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const {
 return;
 }
 for (LiveSegments::const_iterator SI = Segments.begin(); SI.valid(); ++SI) {
- OS << " [" << SI.start() << ' ' << SI.stop() << "):"
- << printReg(SI.value()->reg, TRI);
+ OS << " [" << SI.start() << ' ' << SI.stop()
+ << "):" << printReg(SI.value()->reg(), TRI);
 }
 OS << '\n';
 }
@@ -95,7 +95,7 @@ LiveIntervalUnion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const {
 // Verify the live intervals in this union and add them to the visited set.
 void LiveIntervalUnion::verify(LiveVirtRegBitSet& VisitedVRegs) {
 for (SegmentIter SI = Segments.begin(); SI.valid(); ++SI)
- VisitedVRegs.set(SI.value()->reg);
+ VisitedVRegs.set(SI.value()->reg());
 }
 #endif //!NDEBUG
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index b60fea6fb4e3d..d41b1f2b0adff 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -193,7 +193,7 @@ bool LiveIntervals::computeVirtRegInterval(LiveInterval &LI) {
 assert(LICalc && "LICalc not initialized.");
 assert(LI.empty() && "Should only compute empty intervals.");
 LICalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
- LICalc->calculate(LI, MRI->shouldTrackSubRegLiveness(LI.reg));
+ LICalc->calculate(LI, MRI->shouldTrackSubRegLiveness(LI.reg()));
 return computeDeadValues(LI, nullptr);
 }
@@ -453,13 +453,13 @@ void LiveIntervals::extendSegmentsToUses(LiveRange &Segments,
 bool LiveIntervals::shrinkToUses(LiveInterval *li,
 SmallVectorImpl<MachineInstr *> *dead) {
 LLVM_DEBUG(dbgs() << "Shrink: " << *li << '\n');
- assert(Register::isVirtualRegister(li->reg) &&
+ assert(Register::isVirtualRegister(li->reg()) &&
 "Can only shrink virtual registers");
 // Shrink subregister live ranges.
 bool NeedsCleanup = false;
 for (LiveInterval::SubRange &S : li->subranges()) {
- shrinkToUses(S, li->reg);
+ shrinkToUses(S, li->reg());
 if (S.empty())
 NeedsCleanup = true;
 }
@@ -469,8 +469,8 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
 // Find all the values used, including PHI kills.
 ShrinkToUsesWorkList WorkList;
- // Visit all instructions reading li->reg.
- unsigned Reg = li->reg;
+ // Visit all instructions reading li->reg().
+ unsigned Reg = li->reg();
 for (MachineInstr &UseMI : MRI->reg_instructions(Reg)) {
 if (UseMI.isDebugValue() || !UseMI.readsVirtualRegister(Reg))
 continue;
@@ -523,7 +523,7 @@ bool LiveIntervals::computeDeadValues(LiveInterval &LI,
 // Is the register live before? Otherwise we may have to add a read-undef
 // flag for subregister defs.
- unsigned VReg = LI.reg;
+ unsigned VReg = LI.reg();
 if (MRI->shouldTrackSubRegLiveness(VReg)) {
 if ((I == LI.begin() || std::prev(I)->end < Def) && !VNI->isPHIDef()) {
 MachineInstr *MI = getInstructionFromIndex(Def);
@@ -543,7 +543,7 @@ bool LiveIntervals::computeDeadValues(LiveInterval &LI,
 // This is a dead def. Make sure the instruction knows.
 MachineInstr *MI = getInstructionFromIndex(Def);
 assert(MI && "No instruction defining live value");
- MI->addRegisterDead(LI.reg, TRI);
+ MI->addRegisterDead(LI.reg(), TRI);
 if (HaveDeadDef)
 MayHaveSplitComponents = true;
 HaveDeadDef = true;
@@ -1716,7 +1716,7 @@ void LiveIntervals::splitSeparateComponents(LiveInterval &LI,
 if (NumComp <= 1)
 return;
 LLVM_DEBUG(dbgs() << " Split " << NumComp << " components: " << LI << '\n');
- unsigned Reg = LI.reg;
+ unsigned Reg = LI.reg();
 const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
 for (unsigned I = 1; I < NumComp; ++I) {
 Register NewVReg = MRI->createVirtualRegister(RegClass);
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 9de77c19a23a2..f269020af2219 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -188,7 +188,7 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
 MachineInstr *DefMI = nullptr, *UseMI = nullptr;
 // Check that there is a single def and a single use.
- for (MachineOperand &MO : MRI.reg_nodbg_operands(LI->reg)) {
+ for (MachineOperand &MO : MRI.reg_nodbg_operands(LI->reg())) {
 MachineInstr *MI = MO.getParent();
 if (MO.isDef()) {
 if (DefMI && DefMI != MI)
@@ -224,7 +224,7 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
 << " into single use: " << *UseMI);
 SmallVector<unsigned, 8> Ops;
- if (UseMI->readsWritesVirtualRegister(LI->reg, &Ops).second)
+ if (UseMI->readsWritesVirtualRegister(LI->reg(), &Ops).second)
 return false;
 MachineInstr *FoldMI = TII.foldMemoryOperand(*UseMI, Ops, *DefMI, &LIS);
@@ -236,7 +236,7 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
 if (UseMI->shouldUpdateCallSiteInfo())
 UseMI->getMF()->moveCallSiteInfo(UseMI, FoldMI);
 UseMI->eraseFromParent();
- DefMI->addRegisterDead(LI->reg, nullptr);
+ DefMI->addRegisterDead(LI->reg(), nullptr);
 Dead.push_back(DefMI);
 ++NumDCEFoldedLoads;
 return true;
@@ -332,7 +332,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
 // Remove defined value.
 if (MOI->isDef()) {
 if (TheDelegate && LI.getVNInfoAt(Idx) != nullptr)
- TheDelegate->LRE_WillShrinkVirtReg(LI.reg);
+ TheDelegate->LRE_WillShrinkVirtReg(LI.reg());
 LIS.removeVRegDefAt(LI, Idx);
 if (LI.empty())
 RegsToErase.push_back(Reg);
@@ -369,7 +369,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
 pop_back();
 DeadRemats->insert(MI);
 const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
- MI->substituteRegister(Dest, NewLI.reg, 0, TRI);
+ MI->substituteRegister(Dest, NewLI.reg(), 0, TRI);
 MI->getOperand(0).setIsDead(true);
 } else {
 if (TheDelegate)
@@ -409,7 +409,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
 ToShrink.pop_back();
 if (foldAsLoad(LI, Dead))
 continue;
- unsigned VReg = LI->reg;
+ unsigned VReg = LI->reg();
 if (TheDelegate)
 TheDelegate->LRE_WillShrinkVirtReg(VReg);
 if (!LIS.shrinkToUses(LI, &Dead))
@@ -442,9 +442,9 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
 // intervals their own originals instead of referring to LI. The original
 // interval must contain all the split products, and LI doesn't.
 if (Original != VReg && Original != 0)
- VRM->setIsSplitFromReg(SplitLI->reg, Original);
+ VRM->setIsSplitFromReg(SplitLI->reg(), Original);
 if (TheDelegate)
- TheDelegate->LRE_DidCloneVirtReg(SplitLI->reg, VReg);
+ TheDelegate->LRE_DidCloneVirtReg(SplitLI->reg(), VReg);
 }
 }
 }
@@ -466,11 +466,11 @@ LiveRangeEdit::calculateRegClassAndHint(MachineFunction &MF,
 VirtRegAuxInfo VRAI(MF, LIS, VRM, Loops, MBFI);
 for (unsigned I = 0, Size = size(); I < Size; ++I) {
 LiveInterval &LI = LIS.getInterval(get(I));
- if (MRI.recomputeRegClass(LI.reg))
+ if (MRI.recomputeRegClass(LI.reg()))
 LLVM_DEBUG({
 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
- dbgs() << "Inflated " << printReg(LI.reg) << " to "
- << TRI->getRegClassName(MRI.getRegClass(LI.reg)) << '\n';
+ dbgs() << "Inflated " << printReg(LI.reg()) << " to "
+ << TRI->getRegClassName(MRI.getRegClass(LI.reg())) << '\n';
 });
 VRAI.calculateSpillWeightAndHint(LI);
 }
diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp
index 08f046420fa1d..6b1775f28c045 100644
--- a/llvm/lib/CodeGen/LiveRegMatrix.cpp
+++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp
@@ -102,10 +102,10 @@ static bool foreachUnit(const TargetRegisterInfo *TRI,
 }
 void LiveRegMatrix::assign(LiveInterval &VirtReg, unsigned PhysReg) {
- LLVM_DEBUG(dbgs() << "assigning " << printReg(VirtReg.reg, TRI) << " to "
+ LLVM_DEBUG(dbgs() << "assigning " << printReg(VirtReg.reg(), TRI) << " to "
 << printReg(PhysReg, TRI) << ':');
- assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment");
- VRM->assignVirt2Phys(VirtReg.reg, PhysReg);
+ assert(!VRM->hasPhys(VirtReg.reg()) && "Duplicate VirtReg assignment");
+ VRM->assignVirt2Phys(VirtReg.reg(), PhysReg);
 foreachUnit(
 TRI, VirtReg, PhysReg,
 [&](unsigned Unit, const LiveRange &Range) {
@@ -119,10 +119,10 @@ void LiveRegMatrix::assign(LiveInterval &VirtReg, unsigned PhysReg) {
 }
 void LiveRegMatrix::unassign(LiveInterval &VirtReg) {
- Register PhysReg = VRM->getPhys(VirtReg.reg);
- LLVM_DEBUG(dbgs() << "unassigning " << printReg(VirtReg.reg, TRI) << " from "
- << printReg(PhysReg, TRI) << ':');
- VRM->clearVirt(VirtReg.reg);
+ Register PhysReg = VRM->getPhys(VirtReg.reg());
+ LLVM_DEBUG(dbgs() << "unassigning " << printReg(VirtReg.reg(), TRI)
+ << " from " << printReg(PhysReg, TRI) << ':');
+ VRM->clearVirt(VirtReg.reg());
 foreachUnit(TRI, VirtReg, PhysReg,
 [&](unsigned Unit, const LiveRange &Range) {
@@ -148,8
+148,8 @@ bool LiveRegMatrix::checkRegMaskInterference(LiveInterval &VirtReg, // Check if the cached information is valid. // The same BitVector can be reused for all PhysRegs. // We could cache multiple VirtRegs if it becomes necessary. - if (RegMaskVirtReg != VirtReg.reg || RegMaskTag != UserTag) { - RegMaskVirtReg = VirtReg.reg; + if (RegMaskVirtReg != VirtReg.reg() || RegMaskTag != UserTag) { + RegMaskVirtReg = VirtReg.reg(); RegMaskTag = UserTag; RegMaskUsable.clear(); LIS->checkRegMaskInterference(VirtReg, RegMaskUsable); @@ -165,7 +165,7 @@ bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg, unsigned PhysReg) { if (VirtReg.empty()) return false; - CoalescerPair CP(VirtReg.reg, PhysReg, *TRI); + CoalescerPair CP(VirtReg.reg(), PhysReg, *TRI); bool Result = foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit, const LiveRange &Range) { diff --git a/llvm/lib/CodeGen/LiveRegUnits.cpp b/llvm/lib/CodeGen/LiveRegUnits.cpp index b2731aa0e7dbc..ea2075bc139df 100644 --- a/llvm/lib/CodeGen/LiveRegUnits.cpp +++ b/llvm/lib/CodeGen/LiveRegUnits.cpp @@ -11,15 +11,11 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/LiveRegUnits.h" - #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/MC/MCRegisterInfo.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp index 204fb556d8105..ec3cce3fa1f15 100644 --- a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -117,7 +117,7 @@ bool LocalStackSlotPass::runOnMachineFunction(MachineFunction &MF) { // If the target doesn't want/need this pass, or if there are no locals // to consider, early exit. - if (!TRI->requiresVirtualBaseRegisters(MF) || LocalObjectCount == 0) + if (LocalObjectCount == 0 || !TRI->requiresVirtualBaseRegisters(MF)) return true; // Make sure we have enough space to store the local offsets. 
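The LowLevelType.cpp hunk just below adds getFltSemanticForLLT, mapping a scalar LLT's bit width to the matching IEEE semantics (s16 -> IEEEhalf, s32 -> IEEEsingle, s64 -> IEEEdouble, s128 -> IEEEquad; anything else asserts). A hypothetical caller, with assumed include paths, materializing +1.0 for a given scalar type:

  #include "llvm/ADT/APFloat.h"
  #include "llvm/CodeGen/LowLevelType.h"
  using namespace llvm;

  APFloat buildOne(LLT Ty) {
    const fltSemantics &Sem = getFltSemanticForLLT(Ty); // scalar types only
    return APFloat(Sem, "1.0");
  }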
diff --git a/llvm/lib/CodeGen/LowLevelType.cpp b/llvm/lib/CodeGen/LowLevelType.cpp index 33752a1f9230f..2bda586db8c78 100644 --- a/llvm/lib/CodeGen/LowLevelType.cpp +++ b/llvm/lib/CodeGen/LowLevelType.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/LowLevelType.h" +#include "llvm/ADT/APFloat.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/Support/raw_ostream.h" @@ -58,3 +59,18 @@ LLT llvm::getLLTForMVT(MVT Ty) { return LLT::vector(Ty.getVectorNumElements(), Ty.getVectorElementType().getSizeInBits()); } + +const llvm::fltSemantics &llvm::getFltSemanticForLLT(LLT Ty) { + assert(Ty.isScalar() && "Expected a scalar type."); + switch (Ty.getSizeInBits()) { + case 16: + return APFloat::IEEEhalf(); + case 32: + return APFloat::IEEEsingle(); + case 64: + return APFloat::IEEEdouble(); + case 128: + return APFloat::IEEEquad(); + } + llvm_unreachable("Invalid FP type size."); +} diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index 945a560de3ca9..030c3d3e23ab4 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -451,10 +451,8 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF, } // Check Basic Block Section Flags. if (MF.getTarget().getBBSectionsType() == BasicBlockSection::Labels) { - MF.createBBLabels(); MF.setBBSectionsType(BasicBlockSection::Labels); } else if (MF.hasBBSections()) { - MF.createBBLabels(); MF.assignBeginEndSections(); } PFS.SM = &SM; diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp index ebdd17fc728d3..42d519970c4d4 100644 --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -60,28 +60,11 @@ MCSymbol *MachineBasicBlock::getSymbol() const { if (!CachedMCSymbol) { const MachineFunction *MF = getParent(); MCContext &Ctx = MF->getContext(); - auto Prefix = Ctx.getAsmInfo()->getPrivateLabelPrefix(); - assert(getNumber() >= 0 && "cannot get label for unreachable MBB"); - - // We emit a non-temporary symbol for every basic block if we have BBLabels - // or -- with basic block sections -- when a basic block begins a section. - // With basic block symbols, we use a unary encoding which can - // compress the symbol names significantly. For basic block sections where - // this block is the first in a cluster, we use a non-temp descriptive name. - // Otherwise we fall back to use temp label. - if (MF->hasBBLabels()) { - auto Iter = MF->getBBSectionsSymbolPrefix().begin(); - if (getNumber() < 0 || - getNumber() >= (int)MF->getBBSectionsSymbolPrefix().size()) - report_fatal_error("Unreachable MBB: " + Twine(getNumber())); - // The basic blocks for function foo are named a.BB.foo, aa.BB.foo, and - // so on. - std::string Prefix(Iter + 1, Iter + getNumber() + 1); - std::reverse(Prefix.begin(), Prefix.end()); - CachedMCSymbol = - Ctx.getOrCreateSymbol(Twine(Prefix) + ".BB." + Twine(MF->getName())); - } else if (MF->hasBBSections() && isBeginSection()) { + // We emit a non-temporary symbol -- with a descriptive name -- if it begins + // a section (with basic block sections). Otherwise we fall back to use temp + // label. 
+ if (MF->hasBBSections() && isBeginSection()) { SmallString<5> Suffix; if (SectionID == MBBSectionID::ColdSectionID) { Suffix += ".cold"; @@ -92,6 +75,7 @@ MCSymbol *MachineBasicBlock::getSymbol() const { } CachedMCSymbol = Ctx.getOrCreateSymbol(MF->getName() + Suffix); } else { + const StringRef Prefix = Ctx.getAsmInfo()->getPrivateLabelPrefix(); CachedMCSymbol = Ctx.getOrCreateSymbol(Twine(Prefix) + "BB" + Twine(MF->getFunctionNumber()) + "_" + Twine(getNumber())); @@ -844,7 +828,7 @@ void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old, void MachineBasicBlock::copySuccessor(MachineBasicBlock *Orig, succ_iterator I) { - if (Orig->Probs.empty()) + if (!Orig->Probs.empty()) addSuccessor(*I, Orig->getSuccProbability(I)); else addSuccessorWithoutProb(*I); diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index 0950d6497e433..e4473fd124dfc 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -341,33 +341,6 @@ void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) { MBBNumbering.resize(BlockNo); } -/// This is used with -fbasic-block-sections or -fbasicblock-labels option. -/// A unary encoding of basic block labels is done to keep ".strtab" sizes -/// small. -void MachineFunction::createBBLabels() { - const TargetInstrInfo *TII = getSubtarget().getInstrInfo(); - this->BBSectionsSymbolPrefix.resize(getNumBlockIDs(), 'a'); - for (auto MBBI = begin(), E = end(); MBBI != E; ++MBBI) { - assert( - (MBBI->getNumber() >= 0 && MBBI->getNumber() < (int)getNumBlockIDs()) && - "BasicBlock number was out of range!"); - // 'a' - Normal block. - // 'r' - Return block. - // 'l' - Landing Pad. - // 'L' - Return and landing pad. - bool isEHPad = MBBI->isEHPad(); - bool isRetBlock = MBBI->isReturnBlock() && !TII->isTailCall(MBBI->back()); - char type = 'a'; - if (isEHPad && isRetBlock) - type = 'L'; - else if (isEHPad) - type = 'l'; - else if (isRetBlock) - type = 'r'; - BBSectionsSymbolPrefix[MBBI->getNumber()] = type; - } -} - /// This method iterates over the basic blocks and assigns their IsBeginSection /// and IsEndSection fields. This must be called after MBB layout is finalized /// and the SectionID's are assigned to MBBs. diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 457db8d50ca9e..ebae5eb380de8 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -116,7 +116,7 @@ void MachineInstr::addImplicitDefUseOperands(MachineFunction &MF) { /// the MCInstrDesc. MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid, DebugLoc dl, bool NoImp) - : MCID(&tid), debugLoc(std::move(dl)) { + : MCID(&tid), debugLoc(std::move(dl)), DebugInstrNum(0) { assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor"); // Reserve space for the expected number of operands. @@ -130,10 +130,12 @@ MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid, addImplicitDefUseOperands(MF); } -/// MachineInstr ctor - Copies MachineInstr arg exactly -/// +/// MachineInstr ctor - Copies MachineInstr arg exactly. +/// Does not copy the number from debug instruction numbering, to preserve +/// uniqueness. 
 MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI)
- : MCID(&MI.getDesc()), Info(MI.Info), debugLoc(MI.getDebugLoc()) {
+ : MCID(&MI.getDesc()), Info(MI.Info), debugLoc(MI.getDebugLoc()),
+ DebugInstrNum(0) {
 assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");
 CapOperands = OperandCapacity::get(MI.getNumOperands());
@@ -839,27 +841,27 @@ const DILabel *MachineInstr::getDebugLabel() const {
 }
 const MachineOperand &MachineInstr::getDebugVariableOp() const {
- assert(isDebugValue() && "not a DBG_VALUE");
+ assert((isDebugValue() || isDebugRef()) && "not a DBG_VALUE");
 return getOperand(2);
 }
 MachineOperand &MachineInstr::getDebugVariableOp() {
- assert(isDebugValue() && "not a DBG_VALUE");
+ assert((isDebugValue() || isDebugRef()) && "not a DBG_VALUE");
 return getOperand(2);
 }
 const DILocalVariable *MachineInstr::getDebugVariable() const {
- assert(isDebugValue() && "not a DBG_VALUE");
+ assert((isDebugValue() || isDebugRef()) && "not a DBG_VALUE");
 return cast<DILocalVariable>(getOperand(2).getMetadata());
 }
 MachineOperand &MachineInstr::getDebugExpressionOp() {
- assert(isDebugValue() && "not a DBG_VALUE");
+ assert((isDebugValue() || isDebugRef()) && "not a DBG_VALUE");
 return getOperand(3);
 }
 const DIExpression *MachineInstr::getDebugExpression() const {
- assert(isDebugValue() && "not a DBG_VALUE");
+ assert((isDebugValue() || isDebugRef()) && "not a DBG_VALUE");
 return cast<DIExpression>(getOperand(3).getMetadata());
 }
@@ -1757,6 +1759,12 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
 HeapAllocMarker->printAsOperand(OS, MST);
 }
+ if (DebugInstrNum) {
+ if (!FirstOp)
+ OS << ",";
+ OS << " debug-instr-number " << DebugInstrNum;
+ }
+
 if (!SkipDebugLoc) {
 if (const DebugLoc &DL = getDebugLoc()) {
 if (!FirstOp)
@@ -2231,3 +2239,9 @@ MachineInstr::getFoldedRestoreSize(const TargetInstrInfo *TII) const {
 return getSpillSlotSize(Accesses, getMF()->getFrameInfo());
 return None;
 }
+
+unsigned MachineInstr::getDebugInstrNum() {
+ if (DebugInstrNum == 0)
+ DebugInstrNum = getParent()->getParent()->getNewDebugInstrNum();
+ return DebugInstrNum;
+}
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 5e8a916b3b3b1..fc2e5ce0440a3 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -90,7 +90,7 @@ static cl::opt<UseBFI>
 DisableHoistingToHotterBlocks("disable-hoisting-to-hotter-blocks",
 cl::desc("Disable hoisting instructions to"
 " hotter blocks"),
- cl::init(UseBFI::None), cl::Hidden,
+ cl::init(UseBFI::PGO), cl::Hidden,
 cl::values(clEnumValN(UseBFI::None, "none",
 "disable the feature"),
 clEnumValN(UseBFI::PGO, "pgo",
diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index f9d099e029956..715a2ba4667d2 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -59,10 +59,8 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/Twine.h"
-#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
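The DebugInstrNum plumbing above is a lazy unique-ID pattern: getDebugInstrNum() allocates a number only on first request (from a counter owned by the MachineFunction), and the copy constructor deliberately leaves copies unnumbered so a clone never aliases the original's identity. A stand-alone sketch of the idea with illustrative types (a global atomic counter here, unlike the per-function counter in the patch):

  #include <atomic>

  struct Numbered {
    unsigned Num = 0;                      // 0 means "not numbered yet"
    Numbered() = default;
    Numbered(const Numbered &) : Num(0) {} // copies stay unnumbered
    unsigned getNum() {
      static std::atomic<unsigned> Counter{0};
      if (Num == 0)
        Num = ++Counter; // assigned lazily, exactly once
      return Num;
    }
  };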
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 45a5ef71d0fda..7b6f59f0d91ad 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -268,6 +268,7 @@ bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
 void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
 // Reset the pragma for the next loop in iteration.
 disabledByPragma = false;
+ II_setByPragma = 0;
 MachineBasicBlock *LBLK = L.getTopBlock();
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index 2aa14c8131edd..312429955021f 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -2529,7 +2529,7 @@ void MachineVerifier::verifyLiveIntervals() {
 }
 const LiveInterval &LI = LiveInts->getInterval(Reg);
- assert(Reg == LI.reg && "Invalid reg to interval mapping");
+ assert(Reg == LI.reg() && "Invalid reg to interval mapping");
 verifyLiveInterval(LI);
 }
@@ -2855,7 +2855,7 @@ void MachineVerifier::verifyLiveRange(const LiveRange &LR, unsigned Reg,
 }
 void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) {
- unsigned Reg = LI.reg;
+ unsigned Reg = LI.reg();
 assert(Register::isVirtualRegister(Reg));
 verifyLiveRange(LI, Reg);
@@ -2872,10 +2872,10 @@ void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) {
 }
 if (SR.empty()) {
 report("Subrange must not be empty", MF);
- report_context(SR, LI.reg, SR.LaneMask);
+ report_context(SR, LI.reg(), SR.LaneMask);
 }
 Mask |= SR.LaneMask;
- verifyLiveRange(SR, LI.reg, SR.LaneMask);
+ verifyLiveRange(SR, LI.reg(), SR.LaneMask);
 if (!LI.covers(SR)) {
 report("A Subrange is not covered by the main range", MF);
 report_context(LI);
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index d85b1b7988cec..095da09ea82b8 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -11,9 +11,7 @@
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLoopUtils.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/Support/Debug.h"
diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index 05c843078fb1a..ed2a50e90ffe7 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -178,6 +178,11 @@ namespace {
 }
 }
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties()
+ .set(MachineFunctionProperties::Property::IsSSA);
+ }
+
 /// Track Def -> Use info used for rewriting copies.
 using RewriteMapTy = SmallDenseMap<RegSubRegPair, ValueTrackerResult>;
diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
index 5a4837079bed9..86c2f63fd3aac 100644
--- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
+++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
@@ -397,7 +397,6 @@ ReachingDefAnalysis::getGlobalReachingDefs(MachineInstr *MI, int PhysReg,
 return;
 }
- SmallPtrSet<MachineBasicBlock *, 2> Visited;
 for (auto *MBB : MI->getParent()->predecessors())
 getLiveOuts(MBB, PhysReg, Defs);
 }
@@ -437,18 +436,15 @@ MachineInstr *ReachingDefAnalysis::getUniqueReachingMIDef(MachineInstr *MI,
 SmallPtrSet<MachineBasicBlock *, 4> VisitedBBs;
 SmallPtrSet<MachineInstr *, 2> Incoming;
 MachineBasicBlock *Parent = MI->getParent();
- VisitedBBs.insert(Parent);
 for (auto *Pred : Parent->predecessors())
- getLiveOuts(Pred, PhysReg, Incoming, VisitedBBs);
+ getLiveOuts(Pred, PhysReg, Incoming);
- // If we have a local def and an incoming instruction, then there's not a
- // unique instruction def.
- if (!Incoming.empty() && LocalDef)
- return nullptr;
- else if (Incoming.size() == 1)
+ // Check that we have a single incoming value and that it does not
+ // come from the same block as MI - since it would mean that the def
+ // is executed after MI.
+ if (Incoming.size() == 1 && (*Incoming.begin())->getParent() != Parent)
 return *Incoming.begin();
- else
- return LocalDef;
+ return nullptr;
 }
 MachineInstr *ReachingDefAnalysis::getMIOperand(MachineInstr *MI,
diff --git a/llvm/lib/CodeGen/RegAllocBase.cpp b/llvm/lib/CodeGen/RegAllocBase.cpp
index d228268536724..f7fe1063afeae 100644
--- a/llvm/lib/CodeGen/RegAllocBase.cpp
+++ b/llvm/lib/CodeGen/RegAllocBase.cpp
@@ -87,13 +87,13 @@ void RegAllocBase::allocatePhysRegs() {
 // Continue assigning vregs one at a time to available physical registers.
 while (LiveInterval *VirtReg = dequeue()) {
- assert(!VRM->hasPhys(VirtReg->reg) && "Register already assigned");
+ assert(!VRM->hasPhys(VirtReg->reg()) && "Register already assigned");
 // Unused registers can appear when the spiller coalesces snippets.
- if (MRI->reg_nodbg_empty(VirtReg->reg)) {
+ if (MRI->reg_nodbg_empty(VirtReg->reg())) {
 LLVM_DEBUG(dbgs() << "Dropping unused " << *VirtReg << '\n');
 aboutToRemoveInterval(*VirtReg);
- LIS->removeInterval(VirtReg->reg);
+ LIS->removeInterval(VirtReg->reg());
 continue;
 }
@@ -104,8 +104,8 @@ void RegAllocBase::allocatePhysRegs() {
 // register if possible and populate a list of new live intervals that
 // result from splitting.
 LLVM_DEBUG(dbgs() << "\nselectOrSplit "
- << TRI->getRegClassName(MRI->getRegClass(VirtReg->reg))
- << ':' << *VirtReg << " w=" << VirtReg->weight << '\n');
+ << TRI->getRegClassName(MRI->getRegClass(VirtReg->reg()))
+ << ':' << *VirtReg << " w=" << VirtReg->weight() << '\n');
 using VirtRegVec = SmallVector<Register, 4>;
@@ -117,8 +117,9 @@ void RegAllocBase::allocatePhysRegs() {
 // Probably caused by an inline asm.
 MachineInstr *MI = nullptr;
 for (MachineRegisterInfo::reg_instr_iterator
- I = MRI->reg_instr_begin(VirtReg->reg), E = MRI->reg_instr_end();
- I != E; ) {
+ I = MRI->reg_instr_begin(VirtReg->reg()),
+ E = MRI->reg_instr_end();
+ I != E;) {
 MI = &*(I++);
 if (MI->isInlineAsm())
 break;
@@ -133,8 +134,9 @@ void RegAllocBase::allocatePhysRegs() {
 report_fatal_error("ran out of registers during register allocation");
 }
 // Keep going after reporting the error.
- VRM->assignVirt2Phys(VirtReg->reg, - RegClassInfo.getOrder(MRI->getRegClass(VirtReg->reg)).front()); + VRM->assignVirt2Phys( + VirtReg->reg(), + RegClassInfo.getOrder(MRI->getRegClass(VirtReg->reg())).front()); continue; } @@ -145,16 +147,16 @@ void RegAllocBase::allocatePhysRegs() { assert(LIS->hasInterval(Reg)); LiveInterval *SplitVirtReg = &LIS->getInterval(Reg); - assert(!VRM->hasPhys(SplitVirtReg->reg) && "Register already assigned"); - if (MRI->reg_nodbg_empty(SplitVirtReg->reg)) { + assert(!VRM->hasPhys(SplitVirtReg->reg()) && "Register already assigned"); + if (MRI->reg_nodbg_empty(SplitVirtReg->reg())) { assert(SplitVirtReg->empty() && "Non-empty but used interval"); LLVM_DEBUG(dbgs() << "not queueing unused " << *SplitVirtReg << '\n'); aboutToRemoveInterval(*SplitVirtReg); - LIS->removeInterval(SplitVirtReg->reg); + LIS->removeInterval(SplitVirtReg->reg()); continue; } LLVM_DEBUG(dbgs() << "queuing new interval: " << *SplitVirtReg << "\n"); - assert(Register::isVirtualRegister(SplitVirtReg->reg) && + assert(Register::isVirtualRegister(SplitVirtReg->reg()) && "expect split value in virtual register"); enqueue(SplitVirtReg); ++NumNewQueued; diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp index 5009bcc0a3973..a4ce9d70a270a 100644 --- a/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -46,7 +46,7 @@ static RegisterRegAlloc basicRegAlloc("basic", "basic register allocator", namespace { struct CompSpillWeight { bool operator()(LiveInterval *A, LiveInterval *B) const { - return A->weight < B->weight; + return A->weight() < B->weight(); } }; } @@ -213,7 +213,7 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, Register PhysReg, Q.collectInterferingVRegs(); for (unsigned i = Q.interferingVRegs().size(); i; --i) { LiveInterval *Intf = Q.interferingVRegs()[i - 1]; - if (!Intf->isSpillable() || Intf->weight > VirtReg.weight) + if (!Intf->isSpillable() || Intf->weight() > VirtReg.weight()) return false; Intfs.push_back(Intf); } @@ -227,7 +227,7 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, Register PhysReg, LiveInterval &Spill = *Intfs[i]; // Skip duplicates. - if (!VRM->hasPhys(Spill.reg)) + if (!VRM->hasPhys(Spill.reg())) continue; // Deallocate the interfering vreg by removing it from the union. @@ -259,7 +259,7 @@ Register RABasic::selectOrSplit(LiveInterval &VirtReg, SmallVector PhysRegSpillCands; // Check for an available register in this class. - AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo, Matrix); + AllocationOrder Order(VirtReg.reg(), *VRM, RegClassInfo, Matrix); while (Register PhysReg = Order.next()) { // Check for interference in PhysReg switch (Matrix->checkInterference(VirtReg, PhysReg)) { diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 5396f9f3a1432..db1b904fb2e6f 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -106,8 +106,13 @@ namespace { /// that it is alive across blocks. BitVector MayLiveAcrossBlocks; - /// State of a register unit. - enum RegUnitState { + /// State of a physical register. + enum RegState { + /// A disabled register is not available for allocation, but an alias may + /// be in use. A register can only be moved out of the disabled state if + /// all aliases are disabled. + regDisabled, + /// A free register is not currently in use and can be allocated /// immediately without checking aliases. regFree, @@ -121,8 +126,8 @@ namespace { /// register. 
 In that case, LiveVirtRegs contains the inverse mapping.
 };
- /// Maps each physical register to a RegUnitState enum or virtual register.
- std::vector<unsigned> RegUnitStates;
+ /// Maps each physical register to a RegState enum or a virtual register.
+ std::vector<unsigned> PhysRegState;
 SmallVector<Register, 16> VirtDead;
 SmallVector<MachineInstr *, 32> Coalesced;
@@ -184,10 +189,6 @@ namespace {
 bool isLastUseOfLocalReg(const MachineOperand &MO) const;
 void addKillFlag(const LiveReg &LRI);
-#ifndef NDEBUG
- bool verifyRegStateMapping(const LiveReg &LR) const;
-#endif
-
 void killVirtReg(LiveReg &LR);
 void killVirtReg(Register VirtReg);
 void spillVirtReg(MachineBasicBlock::iterator MI, LiveReg &LR);
@@ -195,7 +196,7 @@
 void usePhysReg(MachineOperand &MO);
 void definePhysReg(MachineBasicBlock::iterator MI, MCPhysReg PhysReg,
- unsigned NewState);
+ RegState NewState);
 unsigned calcSpillCost(MCPhysReg PhysReg) const;
 void assignVirtToPhysReg(LiveReg &, MCPhysReg PhysReg);
@@ -228,7 +229,7 @@
 bool mayLiveOut(Register VirtReg);
 bool mayLiveIn(Register VirtReg);
- void dumpState() const;
+ void dumpState();
 };
 } // end anonymous namespace
@@ -239,8 +240,7 @@ INITIALIZE_PASS(RegAllocFast, "regallocfast", "Fast Register Allocator", false,
 false)
 void RegAllocFast::setPhysRegState(MCPhysReg PhysReg, unsigned NewState) {
- for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI)
- RegUnitStates[*UI] = NewState;
+ PhysRegState[PhysReg] = NewState;
 }
 /// This allocates space for the specified virtual register to be held on the
@@ -263,6 +263,20 @@ int RegAllocFast::getStackSpaceFor(Register VirtReg) {
 return FrameIdx;
 }
+static bool dominates(MachineBasicBlock &MBB,
+ MachineBasicBlock::const_iterator A,
+ MachineBasicBlock::const_iterator B) {
+ auto MBBEnd = MBB.end();
+ if (B == MBBEnd)
+ return true;
+
+ MachineBasicBlock::const_iterator I = MBB.begin();
+ for (; &*I != A && &*I != B; ++I)
+ ;
+
+ return &*I == A;
+}
+
 /// Returns false if \p VirtReg is known to not live out of the current block.
 bool RegAllocFast::mayLiveOut(Register VirtReg) {
 if (MayLiveAcrossBlocks.test(Register::virtReg2Index(VirtReg))) {
@@ -270,11 +284,16 @@
 return !MBB->succ_empty();
 }
- // If this block loops back to itself, it would be necessary to check whether
- // the use comes after the def.
+ const MachineInstr *SelfLoopDef = nullptr;
+
+ // If this block loops back to itself, it is necessary to check whether the
+ // use comes after the def.
 if (MBB->isSuccessor(MBB)) {
- MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
- return true;
+ SelfLoopDef = MRI->getUniqueVRegDef(VirtReg);
+ if (!SelfLoopDef) {
+ MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
+ return true;
+ }
 }
 // See if the first \p Limit uses of the register are all in the current
@@ -287,6 +306,16 @@
 // Cannot be live-out if there are no successors.
 return !MBB->succ_empty();
 }
+
+ if (SelfLoopDef) {
+ // Try to handle some simple cases to avoid spilling and reloading every
+ // value inside a self looping block.
+ if (SelfLoopDef == &UseInst || + !dominates(*MBB, SelfLoopDef->getIterator(), UseInst.getIterator())) { + MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg)); + return true; + } + } } return false; @@ -384,23 +413,12 @@ void RegAllocFast::addKillFlag(const LiveReg &LR) { } } -#ifndef NDEBUG -bool RegAllocFast::verifyRegStateMapping(const LiveReg &LR) const { - for (MCRegUnitIterator UI(LR.PhysReg, TRI); UI.isValid(); ++UI) { - if (RegUnitStates[*UI] != LR.VirtReg) - return false; - } - - return true; -} -#endif - /// Mark virtreg as no longer available. void RegAllocFast::killVirtReg(LiveReg &LR) { - assert(verifyRegStateMapping(LR) && "Broken RegState mapping"); addKillFlag(LR); - MCPhysReg PhysReg = LR.PhysReg; - setPhysRegState(PhysReg, regFree); + assert(PhysRegState[LR.PhysReg] == LR.VirtReg && + "Broken RegState mapping"); + setPhysRegState(LR.PhysReg, regFree); LR.PhysReg = 0; } @@ -427,9 +445,7 @@ void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI, /// Do the actual work of spilling. void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI, LiveReg &LR) { - assert(verifyRegStateMapping(LR) && "Broken RegState mapping"); - - MCPhysReg PhysReg = LR.PhysReg; + assert(PhysRegState[LR.PhysReg] == LR.VirtReg && "Broken RegState mapping"); if (LR.Dirty) { // If this physreg is used by the instruction, we want to kill it on the @@ -437,7 +453,7 @@ void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI, LiveReg &LR) { bool SpillKill = MachineBasicBlock::iterator(LR.LastUse) != MI; LR.Dirty = false; - spill(MI, LR.VirtReg, PhysReg, SpillKill); + spill(MI, LR.VirtReg, LR.PhysReg, SpillKill); if (SpillKill) LR.LastUse = nullptr; // Don't kill register again @@ -473,16 +489,53 @@ void RegAllocFast::usePhysReg(MachineOperand &MO) { assert(PhysReg.isPhysical() && "Bad usePhysReg operand"); markRegUsedInInstr(PhysReg); + switch (PhysRegState[PhysReg]) { + case regDisabled: + break; + case regReserved: + PhysRegState[PhysReg] = regFree; + LLVM_FALLTHROUGH; + case regFree: + MO.setIsKill(); + return; + default: + // The physreg was allocated to a virtual register. That means the value we + // wanted has been clobbered. + llvm_unreachable("Instruction uses an allocated register"); + } - for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) { - switch (RegUnitStates[*UI]) { + // Maybe a superregister is reserved? + for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) { + MCPhysReg Alias = *AI; + switch (PhysRegState[Alias]) { + case regDisabled: + break; case regReserved: - RegUnitStates[*UI] = regFree; + // Either PhysReg is a subregister of Alias and we mark the + // whole register as free, or PhysReg is the superregister of + // Alias and we mark all the aliases as disabled before freeing + // PhysReg. + // In the latter case, since PhysReg was disabled, this means that + // its value is defined only by physical sub-registers. This check + // is performed by the assert of the default case in this loop. + // Note: The value of the superregister may only be partial + // defined, that is why regDisabled is a valid state for aliases. + assert((TRI->isSuperRegister(PhysReg, Alias) || + TRI->isSuperRegister(Alias, PhysReg)) && + "Instruction is not using a subregister of a reserved register"); LLVM_FALLTHROUGH; case regFree: + if (TRI->isSuperRegister(PhysReg, Alias)) { + // Leave the superregister in the working set. 
+ setPhysRegState(Alias, regFree); + MO.getParent()->addRegisterKilled(Alias, TRI, true); + return; + } + // Some other alias was in the working set - clear it. + setPhysRegState(Alias, regDisabled); break; default: - llvm_unreachable("Unexpected reg unit state"); + llvm_unreachable("Instruction uses an alias of an allocated register"); } } @@ -495,20 +548,38 @@ void RegAllocFast::usePhysReg(MachineOperand &MO) { /// similar to defineVirtReg except the physreg is reserved instead of /// allocated. void RegAllocFast::definePhysReg(MachineBasicBlock::iterator MI, - MCPhysReg PhysReg, unsigned NewState) { - for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) { - switch (unsigned VirtReg = RegUnitStates[*UI]) { + MCPhysReg PhysReg, RegState NewState) { + markRegUsedInInstr(PhysReg); + switch (Register VirtReg = PhysRegState[PhysReg]) { + case regDisabled: + break; + default: + spillVirtReg(MI, VirtReg); + LLVM_FALLTHROUGH; + case regFree: + case regReserved: + setPhysRegState(PhysReg, NewState); + return; + } + + // This is a disabled register, disable all aliases. + setPhysRegState(PhysReg, NewState); + for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) { + MCPhysReg Alias = *AI; + switch (Register VirtReg = PhysRegState[Alias]) { + case regDisabled: + break; default: spillVirtReg(MI, VirtReg); - break; + LLVM_FALLTHROUGH; case regFree: case regReserved: + setPhysRegState(Alias, regDisabled); + if (TRI->isSuperRegister(PhysReg, Alias)) + return; break; } } - - markRegUsedInInstr(PhysReg); - setPhysRegState(PhysReg, NewState); } /// Return the cost of spilling clearing out PhysReg and aliases so it is free @@ -521,24 +592,46 @@ unsigned RegAllocFast::calcSpillCost(MCPhysReg PhysReg) const { << " is already used in instr.\n"); return spillImpossible; } + switch (Register VirtReg = PhysRegState[PhysReg]) { + case regDisabled: + break; + case regFree: + return 0; + case regReserved: + LLVM_DEBUG(dbgs() << printReg(VirtReg, TRI) << " corresponding " + << printReg(PhysReg, TRI) << " is reserved already.\n"); + return spillImpossible; + default: { + LiveRegMap::const_iterator LRI = findLiveVirtReg(VirtReg); + assert(LRI != LiveVirtRegs.end() && LRI->PhysReg && + "Missing VirtReg entry"); + return LRI->Dirty ? spillDirty : spillClean; + } + } - for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) { - switch (unsigned VirtReg = RegUnitStates[*UI]) { + // This is a disabled register, add up cost of aliases. + LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << " is disabled.\n"); + unsigned Cost = 0; + for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) { + MCPhysReg Alias = *AI; + switch (Register VirtReg = PhysRegState[Alias]) { + case regDisabled: + break; case regFree: + ++Cost; break; case regReserved: - LLVM_DEBUG(dbgs() << printReg(VirtReg, TRI) << " corresponding " - << printReg(PhysReg, TRI) << " is reserved already.\n"); return spillImpossible; default: { LiveRegMap::const_iterator LRI = findLiveVirtReg(VirtReg); assert(LRI != LiveVirtRegs.end() && LRI->PhysReg && "Missing VirtReg entry"); - return LRI->Dirty ? spillDirty : spillClean; + Cost += LRI->Dirty ? 
spillDirty : spillClean; + break; } } } - return 0; + return Cost; } /// This method updates local state so that we know that PhysReg is the @@ -845,17 +938,9 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI, if (!Reg || !Reg.isPhysical()) continue; markRegUsedInInstr(Reg); - - for (MCRegUnitIterator UI(Reg, TRI); UI.isValid(); ++UI) { - if (!ThroughRegs.count(RegUnitStates[*UI])) - continue; - - // Need to spill any aliasing registers. - for (MCRegUnitRootIterator RI(*UI, TRI); RI.isValid(); ++RI) { - for (MCSuperRegIterator SI(*RI, TRI, true); SI.isValid(); ++SI) { - definePhysReg(MI, *SI, regFree); - } - } + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { + if (ThroughRegs.count(PhysRegState[*AI])) + definePhysReg(MI, *AI, regFree); } } @@ -919,40 +1004,37 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI, } #ifndef NDEBUG - -void RegAllocFast::dumpState() const { - for (unsigned Unit = 1, UnitE = TRI->getNumRegUnits(); Unit != UnitE; - ++Unit) { - switch (unsigned VirtReg = RegUnitStates[Unit]) { +void RegAllocFast::dumpState() { + for (unsigned Reg = 1, E = TRI->getNumRegs(); Reg != E; ++Reg) { + if (PhysRegState[Reg] == regDisabled) continue; + dbgs() << " " << printReg(Reg, TRI); + switch(PhysRegState[Reg]) { case regFree: break; case regReserved: - dbgs() << " " << printRegUnit(Unit, TRI) << "[P]"; + dbgs() << "*"; break; default: { - dbgs() << ' ' << printRegUnit(Unit, TRI) << '=' << printReg(VirtReg); - LiveRegMap::const_iterator I = findLiveVirtReg(VirtReg); - assert(I != LiveVirtRegs.end() && "have LiveVirtRegs entry"); - if (I->Dirty) - dbgs() << "[D]"; - assert(TRI->hasRegUnit(I->PhysReg, Unit) && "inverse mapping present"); + dbgs() << '=' << printReg(PhysRegState[Reg]); + LiveRegMap::iterator LRI = findLiveVirtReg(PhysRegState[Reg]); + assert(LRI != LiveVirtRegs.end() && LRI->PhysReg && + "Missing VirtReg entry"); + if (LRI->Dirty) + dbgs() << "*"; + assert(LRI->PhysReg == Reg && "Bad inverse map"); break; } } } dbgs() << '\n'; // Check that LiveVirtRegs is the inverse. - for (const LiveReg &LR : LiveVirtRegs) { - Register VirtReg = LR.VirtReg; - assert(VirtReg.isVirtual() && "Bad map key"); - MCPhysReg PhysReg = LR.PhysReg; - if (PhysReg != 0) { - assert(Register::isPhysicalRegister(PhysReg) && - "mapped to physreg"); - for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) { - assert(RegUnitStates[*UI] == VirtReg && "inverse map valid"); - } - } + for (LiveRegMap::iterator i = LiveVirtRegs.begin(), + e = LiveVirtRegs.end(); i != e; ++i) { + if (!i->PhysReg) + continue; + assert(i->VirtReg.isVirtual() && "Bad map key"); + assert(Register::isPhysicalRegister(i->PhysReg) && "Bad map value"); + assert(PhysRegState[i->PhysReg] == i->VirtReg && "Bad inverse map"); } } #endif @@ -1142,8 +1224,8 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { // Kill dead defs after the scan to ensure that multiple defs of the same // register are allocated identically. We didn't need to do this for uses - // because we are crerating our own kill flags, and they are always at the - // last use. + // because we are creating our own kill flags, and they are always at the last + // use. 
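
[Editor's note — illustrative sketch, not part of the patch] The calcSpillCost() rewrite above ranks candidate physregs: a free register costs nothing, a reserved one can never be taken, and a register holding a virtual register costs more when its value is dirty (it must be stored back) than when it is clean (a later reload suffices); for a disabled register the alias costs are summed. A toy model of that ranking, with made-up weights:

  #include <limits>

  enum class Occupant { Free, Reserved, CleanVReg, DirtyVReg };

  // Weights are illustrative only; the allocator defines its own constants.
  unsigned spillCost(Occupant O) {
    constexpr unsigned Impossible = std::numeric_limits<unsigned>::max();
    switch (O) {
    case Occupant::Free:      return 0;   // nothing to do
    case Occupant::Reserved:  return Impossible;
    case Occupant::CleanVReg: return 50;  // reload later, no store now
    case Occupant::DirtyVReg: return 100; // must store the value first
    }
    return Impossible;
  }
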
for (Register VirtReg : VirtDead) killVirtReg(VirtReg); VirtDead.clear(); @@ -1194,7 +1276,7 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) { this->MBB = &MBB; LLVM_DEBUG(dbgs() << "\nAllocating " << MBB); - RegUnitStates.assign(TRI->getNumRegUnits(), regFree); + PhysRegState.assign(TRI->getNumRegs(), regDisabled); assert(LiveVirtRegs.empty() && "Mapping not cleared from last block?"); MachineBasicBlock::iterator MII = MBB.begin(); diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 41cf002612654..415eb6a8fe7ff 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -247,12 +247,12 @@ class RAGreedy : public MachineFunctionPass, IndexedMap ExtraRegInfo; LiveRangeStage getStage(const LiveInterval &VirtReg) const { - return ExtraRegInfo[VirtReg.reg].Stage; + return ExtraRegInfo[VirtReg.reg()].Stage; } void setStage(const LiveInterval &VirtReg, LiveRangeStage Stage) { ExtraRegInfo.resize(MRI->getNumVirtRegs()); - ExtraRegInfo[VirtReg.reg].Stage = Stage; + ExtraRegInfo[VirtReg.reg()].Stage = Stage; } template @@ -677,7 +677,7 @@ void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) { // Prioritize live ranges by size, assigning larger ranges first. // The queue holds (size, reg) pairs. const unsigned Size = LI->getSize(); - const unsigned Reg = LI->reg; + const unsigned Reg = LI->reg(); assert(Register::isVirtualRegister(Reg) && "Can only enqueue virtual registers"); unsigned Prio; @@ -768,7 +768,7 @@ Register RAGreedy::tryAssign(LiveInterval &VirtReg, // If we missed a simple hint, try to cheaply evict interference from the // preferred register. - if (Register Hint = MRI->getSimpleHint(VirtReg.reg)) + if (Register Hint = MRI->getSimpleHint(VirtReg.reg())) if (Order.isHint(Hint)) { LLVM_DEBUG(dbgs() << "missed hint " << printReg(Hint, TRI) << '\n'); EvictionCost MaxCost; @@ -800,7 +800,7 @@ Register RAGreedy::tryAssign(LiveInterval &VirtReg, //===----------------------------------------------------------------------===// Register RAGreedy::canReassign(LiveInterval &VirtReg, Register PrevReg) { - AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo, Matrix); + AllocationOrder Order(VirtReg.reg(), *VRM, RegClassInfo, Matrix); Register PhysReg; while ((PhysReg = Order.next())) { if (PhysReg == PrevReg) @@ -846,8 +846,8 @@ bool RAGreedy::shouldEvict(LiveInterval &A, bool IsHint, if (CanSplit && IsHint && !BreaksHint) return true; - if (A.weight > B.weight) { - LLVM_DEBUG(dbgs() << "should evict: " << B << " w= " << B.weight << '\n'); + if (A.weight() > B.weight()) { + LLVM_DEBUG(dbgs() << "should evict: " << B << " w= " << B.weight() << '\n'); return true; } return false; @@ -878,7 +878,7 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, Register PhysReg, // // This works out so a register without a cascade number is allowed to evict // anything, and it can be evicted by anything. - unsigned Cascade = ExtraRegInfo[VirtReg.reg].Cascade; + unsigned Cascade = ExtraRegInfo[VirtReg.reg()].Cascade; if (!Cascade) Cascade = NextCascade; @@ -892,13 +892,13 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, Register PhysReg, // Check if any interfering live range is heavier than MaxWeight. 
for (unsigned i = Q.interferingVRegs().size(); i; --i) { LiveInterval *Intf = Q.interferingVRegs()[i - 1]; - assert(Register::isVirtualRegister(Intf->reg) && + assert(Register::isVirtualRegister(Intf->reg()) && "Only expecting virtual register interference from query"); // Do not allow eviction of a virtual register if we are in the middle // of last-chance recoloring and this virtual register is one that we // have scavenged a physical register for. - if (FixedRegisters.count(Intf->reg)) + if (FixedRegisters.count(Intf->reg())) return false; // Never evict spill products. They cannot split or spill. @@ -910,12 +910,14 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, Register PhysReg, // // Also allow urgent evictions of unspillable ranges from a strictly // larger allocation order. - bool Urgent = !VirtReg.isSpillable() && - (Intf->isSpillable() || - RegClassInfo.getNumAllocatableRegs(MRI->getRegClass(VirtReg.reg)) < - RegClassInfo.getNumAllocatableRegs(MRI->getRegClass(Intf->reg))); + bool Urgent = + !VirtReg.isSpillable() && + (Intf->isSpillable() || + RegClassInfo.getNumAllocatableRegs(MRI->getRegClass(VirtReg.reg())) < + RegClassInfo.getNumAllocatableRegs( + MRI->getRegClass(Intf->reg()))); // Only evict older cascades or live ranges without a cascade. - unsigned IntfCascade = ExtraRegInfo[Intf->reg].Cascade; + unsigned IntfCascade = ExtraRegInfo[Intf->reg()].Cascade; if (Cascade <= IntfCascade) { if (!Urgent) return false; @@ -924,10 +926,10 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, Register PhysReg, Cost.BrokenHints += 10; } // Would this break a satisfied hint? - bool BreaksHint = VRM->hasPreferredPhys(Intf->reg); + bool BreaksHint = VRM->hasPreferredPhys(Intf->reg()); // Update eviction cost. Cost.BrokenHints += BreaksHint; - Cost.MaxWeight = std::max(Cost.MaxWeight, Intf->weight); + Cost.MaxWeight = std::max(Cost.MaxWeight, Intf->weight()); // Abort if this would be too expensive. if (!(Cost < MaxCost)) return false; @@ -977,17 +979,17 @@ bool RAGreedy::canEvictInterferenceInRange(LiveInterval &VirtReg, continue; // Cannot evict non virtual reg interference. - if (!Register::isVirtualRegister(Intf->reg)) + if (!Register::isVirtualRegister(Intf->reg())) return false; // Never evict spill products. They cannot split or spill. if (getStage(*Intf) == RS_Done) return false; // Would this break a satisfied hint? - bool BreaksHint = VRM->hasPreferredPhys(Intf->reg); + bool BreaksHint = VRM->hasPreferredPhys(Intf->reg()); // Update eviction cost. Cost.BrokenHints += BreaksHint; - Cost.MaxWeight = std::max(Cost.MaxWeight, Intf->weight); + Cost.MaxWeight = std::max(Cost.MaxWeight, Intf->weight()); // Abort if this would be too expensive. if (!(Cost < MaxCost)) return false; @@ -1018,7 +1020,7 @@ unsigned RAGreedy::getCheapestEvicteeWeight(const AllocationOrder &Order, float *BestEvictweight) { EvictionCost BestEvictCost; BestEvictCost.setMax(); - BestEvictCost.MaxWeight = VirtReg.weight; + BestEvictCost.MaxWeight = VirtReg.weight(); unsigned BestEvicteePhys = 0; // Go over all physical registers and find the best candidate for eviction @@ -1043,9 +1045,9 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, Register PhysReg, // Make sure that VirtReg has a cascade number, and assign that cascade // number to every evicted register. These live ranges than then only be // evicted by a newer cascade, preventing infinite loops. 
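
[Editor's note — illustrative sketch, not part of the patch] The cascade rule described in the comment above is what makes eviction terminate: an evicting range stamps its cascade number onto everything it displaces, and a range may only evict interference carrying a strictly older (smaller) cascade. A minimal model of that invariant:

  #include <cassert>
  #include <map>

  struct Cascades {
    unsigned Next = 1;
    std::map<unsigned, unsigned> Of; // vreg -> cascade, 0 = none yet

    bool canEvict(unsigned Evictor, unsigned Evictee) {
      unsigned C = Of[Evictor] ? Of[Evictor] : Next;
      return Of[Evictee] < C; // only strictly older cascades may be displaced
    }
    void recordEviction(unsigned Evictor, unsigned Evictee) {
      if (!Of[Evictor])
        Of[Evictor] = Next++;
      Of[Evictee] = Of[Evictor]; // evictee now needs a newer evictor
    }
  };

  int main() {
    Cascades C;
    assert(C.canEvict(1, 2)); // no cascades assigned yet: allowed
    C.recordEviction(1, 2);
    assert(!C.canEvict(2, 1)); // 2 may not immediately evict 1 back
  }
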
- unsigned Cascade = ExtraRegInfo[VirtReg.reg].Cascade; + unsigned Cascade = ExtraRegInfo[VirtReg.reg()].Cascade; if (!Cascade) - Cascade = ExtraRegInfo[VirtReg.reg].Cascade = NextCascade++; + Cascade = ExtraRegInfo[VirtReg.reg()].Cascade = NextCascade++; LLVM_DEBUG(dbgs() << "evicting " << printReg(PhysReg, TRI) << " interference: Cascade " << Cascade << '\n'); @@ -1067,18 +1069,18 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, Register PhysReg, for (unsigned i = 0, e = Intfs.size(); i != e; ++i) { LiveInterval *Intf = Intfs[i]; // The same VirtReg may be present in multiple RegUnits. Skip duplicates. - if (!VRM->hasPhys(Intf->reg)) + if (!VRM->hasPhys(Intf->reg())) continue; - LastEvicted.addEviction(PhysReg, VirtReg.reg, Intf->reg); + LastEvicted.addEviction(PhysReg, VirtReg.reg(), Intf->reg()); Matrix->unassign(*Intf); - assert((ExtraRegInfo[Intf->reg].Cascade < Cascade || + assert((ExtraRegInfo[Intf->reg()].Cascade < Cascade || VirtReg.isSpillable() < Intf->isSpillable()) && "Cannot decrease cascade number, illegal eviction"); - ExtraRegInfo[Intf->reg].Cascade = Cascade; + ExtraRegInfo[Intf->reg()].Cascade = Cascade; ++NumEvicted; - NewVRegs.push_back(Intf->reg); + NewVRegs.push_back(Intf->reg()); } } @@ -1114,10 +1116,10 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg, // hints, and only evict smaller spill weights. if (CostPerUseLimit < ~0u) { BestCost.BrokenHints = 0; - BestCost.MaxWeight = VirtReg.weight; + BestCost.MaxWeight = VirtReg.weight(); // Check of any registers in RC are below CostPerUseLimit. - const TargetRegisterClass *RC = MRI->getRegClass(VirtReg.reg); + const TargetRegisterClass *RC = MRI->getRegClass(VirtReg.reg()); unsigned MinCost = RegClassInfo.getMinCost(RC); if (MinCost >= CostPerUseLimit) { LLVM_DEBUG(dbgs() << TRI->getRegClassName(RC) << " minimum cost = " @@ -1578,7 +1580,7 @@ BlockFrequency RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand, bool *CanCauseEvictionChain) { BlockFrequency GlobalCost = 0; const BitVector &LiveBundles = Cand.LiveBundles; - unsigned VirtRegToSplit = SA->getParent().reg; + unsigned VirtRegToSplit = SA->getParent().reg(); ArrayRef UseBlocks = SA->getUseBlocks(); for (unsigned i = 0; i != UseBlocks.size(); ++i) { const SplitAnalysis::BlockInfo &BI = UseBlocks[i]; @@ -1679,7 +1681,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit, // Isolate even single instructions when dealing with a proper sub-class. // That guarantees register class inflation for the stack interval because it // is all copies. - unsigned Reg = SA->getParent().reg; + unsigned Reg = SA->getParent().reg(); bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg)); // First handle all the blocks with uses. @@ -1942,7 +1944,7 @@ unsigned RAGreedy::calculateRegionSplitCost(LiveInterval &VirtReg, // See splitCanCauseEvictionChain for detailed description of bad // eviction chain scenarios. 
LLVM_DEBUG(dbgs() << "Best split candidate of vreg " - << printReg(VirtReg.reg, TRI) << " may "); + << printReg(VirtReg.reg(), TRI) << " may "); if (!(*CanCauseEvictionChain)) LLVM_DEBUG(dbgs() << "not "); LLVM_DEBUG(dbgs() << "cause bad eviction chain\n"); @@ -2001,7 +2003,7 @@ unsigned RAGreedy::doRegionSplit(LiveInterval &VirtReg, unsigned BestCand, unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order, SmallVectorImpl &NewVRegs) { assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed"); - Register Reg = VirtReg.reg; + Register Reg = VirtReg.reg(); bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg)); LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats); SE->reset(LREdit, SplitSpillMode); @@ -2067,7 +2069,7 @@ static unsigned getNumAllocatableRegsForConstraints( unsigned RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order, SmallVectorImpl &NewVRegs) { - const TargetRegisterClass *CurRC = MRI->getRegClass(VirtReg.reg); + const TargetRegisterClass *CurRC = MRI->getRegClass(VirtReg.reg()); // There is no point to this if there are no larger sub-classes. if (!RegClassInfo.isProperSubClass(CurRC)) return 0; @@ -2095,8 +2097,8 @@ RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order, if (const MachineInstr *MI = Indexes->getInstructionFromIndex(Uses[i])) if (MI->isFullCopy() || SuperRCNumAllocatableRegs == - getNumAllocatableRegsForConstraints(MI, VirtReg.reg, SuperRC, TII, - TRI, RCI)) { + getNumAllocatableRegsForConstraints(MI, VirtReg.reg(), SuperRC, + TII, TRI, RCI)) { LLVM_DEBUG(dbgs() << " skip:\t" << Uses[i] << '\t' << *MI); continue; } @@ -2113,7 +2115,7 @@ RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order, SmallVector IntvMap; SE->finish(&IntvMap); - DebugVars->splitRegister(VirtReg.reg, LREdit.regs(), *LIS); + DebugVars->splitRegister(VirtReg.reg(), LREdit.regs(), *LIS); ExtraRegInfo.resize(MRI->getNumVirtRegs()); // Assign all new registers to RS_Spill. This was the last chance. @@ -2169,7 +2171,7 @@ void RAGreedy::calcGapWeights(unsigned PhysReg, break; // Update the gaps covered by IntI. - const float weight = IntI.value()->weight; + const float weight = IntI.value()->weight(); for (; Gap != NumGaps; ++Gap) { GapWeight[Gap] = std::max(GapWeight[Gap], weight); if (Uses[Gap+1].getBaseIndex() >= IntI.stop()) @@ -2409,7 +2411,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, SE->useIntv(SegStart, SegStop); SmallVector IntvMap; SE->finish(&IntvMap); - DebugVars->splitRegister(VirtReg.reg, LREdit.regs(), *LIS); + DebugVars->splitRegister(VirtReg.reg(), LREdit.regs(), *LIS); // If the new range has the same number of instructions as before, mark it as // RS_Split2 so the next split will be forced to make progress. Otherwise, @@ -2511,7 +2513,7 @@ bool RAGreedy::mayRecolorAllInterferences(unsigned PhysReg, LiveInterval &VirtReg, SmallLISet &RecoloringCandidates, const SmallVirtRegSet &FixedRegisters) { - const TargetRegisterClass *CurRC = MRI->getRegClass(VirtReg.reg); + const TargetRegisterClass *CurRC = MRI->getRegClass(VirtReg.reg()); for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); @@ -2530,9 +2532,10 @@ RAGreedy::mayRecolorAllInterferences(unsigned PhysReg, LiveInterval &VirtReg, // However, if VirtReg has tied defs and Intf doesn't, then // there is still a point in examining if it can be recolorable. 
if (((getStage(*Intf) == RS_Done && - MRI->getRegClass(Intf->reg) == CurRC) && - !(hasTiedDef(MRI, VirtReg.reg) && !hasTiedDef(MRI, Intf->reg))) || - FixedRegisters.count(Intf->reg)) { + MRI->getRegClass(Intf->reg()) == CurRC) && + !(hasTiedDef(MRI, VirtReg.reg()) && + !hasTiedDef(MRI, Intf->reg()))) || + FixedRegisters.count(Intf->reg())) { LLVM_DEBUG( dbgs() << "Early abort: the interference is not recolorable.\n"); return false; @@ -2587,6 +2590,9 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg, SmallVectorImpl &NewVRegs, SmallVirtRegSet &FixedRegisters, unsigned Depth) { + if (!TRI->shouldUseLastChanceRecoloringForVirtReg(*MF, VirtReg)) + return ~0u; + LLVM_DEBUG(dbgs() << "Try last chance recoloring for " << VirtReg << '\n'); // Ranges must be Done. assert((getStage(VirtReg) >= RS_Done || !VirtReg.isSpillable()) && @@ -2608,8 +2614,8 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg, DenseMap VirtRegToPhysReg; // Mark VirtReg as fixed, i.e., it will not be recolored pass this point in // this recoloring "session". - assert(!FixedRegisters.count(VirtReg.reg)); - FixedRegisters.insert(VirtReg.reg); + assert(!FixedRegisters.count(VirtReg.reg())); + FixedRegisters.insert(VirtReg.reg()); SmallVector CurrentNewVRegs; Order.rewind(); @@ -2644,7 +2650,7 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg, for (SmallLISet::iterator It = RecoloringCandidates.begin(), EndIt = RecoloringCandidates.end(); It != EndIt; ++It) { - Register ItVirtReg = (*It)->reg; + Register ItVirtReg = (*It)->reg(); enqueue(RecoloringQueue, *It); assert(VRM->hasPhys(ItVirtReg) && "Interferences are supposed to be with allocated variables"); @@ -2697,7 +2703,7 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg, for (SmallLISet::iterator It = RecoloringCandidates.begin(), EndIt = RecoloringCandidates.end(); It != EndIt; ++It) { - Register ItVirtReg = (*It)->reg; + Register ItVirtReg = (*It)->reg(); if (VRM->hasPhys(ItVirtReg)) Matrix->unassign(**It); Register ItPhysReg = VirtRegToPhysReg[ItVirtReg]; @@ -2743,7 +2749,7 @@ bool RAGreedy::tryRecoloringCandidates(PQueue &RecoloringQueue, << " succeeded with: " << printReg(PhysReg, TRI) << '\n'); Matrix->assign(*LI, PhysReg); - FixedRegisters.insert(LI->reg); + FixedRegisters.insert(LI->reg()); } return true; } @@ -2900,7 +2906,7 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) { SmallSet Visited; SmallVector RecoloringCandidates; HintsInfo Info; - unsigned Reg = VirtReg.reg; + unsigned Reg = VirtReg.reg(); Register PhysReg = VRM->getPhys(Reg); // Start the recoloring algorithm from the input live-interval, then // it will propagate to the ones that are copy-related with it. @@ -3003,11 +3009,11 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) { /// getting rid of 2 copies. void RAGreedy::tryHintsRecoloring() { for (LiveInterval *LI : SetOfBrokenHints) { - assert(Register::isVirtualRegister(LI->reg) && + assert(Register::isVirtualRegister(LI->reg()) && "Recoloring is possible only for virtual registers"); // Some dead defs may be around (e.g., because of debug uses). // Ignore those. - if (!VRM->hasPhys(LI->reg)) + if (!VRM->hasPhys(LI->reg())) continue; tryHintRecoloring(*LI); } @@ -3019,10 +3025,10 @@ Register RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, unsigned Depth) { unsigned CostPerUseLimit = ~0u; // First try assigning a free register. 
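
[Editor's note — illustrative sketch, not part of the patch] selectOrSplitImpl(), which begins at this point, escalates through a fixed sequence of strategies and stops at the first one that succeeds. Boiled down to its control-flow shape (this omits the staging queue and last-chance recoloring):

  enum class Step { Assign, Evict, Split, Spill };

  // Cheapest option first; each step runs only if the previous ones failed.
  Step choose(bool HasFreeReg, bool CanEvict, bool CanSplit) {
    if (HasFreeReg) return Step::Assign; // take a free physical register
    if (CanEvict)   return Step::Evict;  // displace a lighter live range
    if (CanSplit)   return Step::Split;  // shorten the range and retry
    return Step::Spill;                  // last resort: memory
  }
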
- AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo, Matrix); + AllocationOrder Order(VirtReg.reg(), *VRM, RegClassInfo, Matrix); if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs, FixedRegisters)) { // If VirtReg got an assignment, the eviction info is no longre relevant. - LastEvicted.clearEvicteeInfo(VirtReg.reg); + LastEvicted.clearEvicteeInfo(VirtReg.reg()); // When NewVRegs is not empty, we may have made decisions such as evicting // a virtual register, go with the earlier decisions and use the physical // register. @@ -3040,7 +3046,7 @@ Register RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, LiveRangeStage Stage = getStage(VirtReg); LLVM_DEBUG(dbgs() << StageName[Stage] << " Cascade " - << ExtraRegInfo[VirtReg.reg].Cascade << '\n'); + << ExtraRegInfo[VirtReg.reg()].Cascade << '\n'); // Try to evict a less worthy live range, but only for ranges from the primary // queue. The RS_Split ranges already failed to do this, and they should not @@ -3049,7 +3055,7 @@ Register RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, if (Register PhysReg = tryEvict(VirtReg, Order, NewVRegs, CostPerUseLimit, FixedRegisters)) { - Register Hint = MRI->getSimpleHint(VirtReg.reg); + Register Hint = MRI->getSimpleHint(VirtReg.reg()); // If VirtReg has a hint and that hint is broken record this // virtual register as a recoloring candidate for broken hint. // Indeed, since we evicted a variable in its neighborhood it is @@ -3059,7 +3065,7 @@ Register RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, SetOfBrokenHints.insert(&VirtReg); // If VirtReg eviction someone, the eviction info for it as an evictee is // no longre relevant. - LastEvicted.clearEvicteeInfo(VirtReg.reg); + LastEvicted.clearEvicteeInfo(VirtReg.reg()); return PhysReg; } @@ -3071,7 +3077,7 @@ Register RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, if (Stage < RS_Split) { setStage(VirtReg, RS_Split); LLVM_DEBUG(dbgs() << "wait for second round\n"); - NewVRegs.push_back(VirtReg.reg); + NewVRegs.push_back(VirtReg.reg()); return 0; } @@ -3081,7 +3087,7 @@ Register RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, Register PhysReg = trySplit(VirtReg, Order, NewVRegs, FixedRegisters); if (PhysReg || (NewVRegs.size() - NewVRegSizeBefore)) { // If VirtReg got split, the eviction info is no longer relevant. - LastEvicted.clearEvicteeInfo(VirtReg.reg); + LastEvicted.clearEvicteeInfo(VirtReg.reg()); return PhysReg; } } @@ -3093,14 +3099,16 @@ Register RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, Depth); // Finally spill VirtReg itself. - if (EnableDeferredSpilling && getStage(VirtReg) < RS_Memory) { + if ((EnableDeferredSpilling || + TRI->shouldUseDeferredSpillingForVirtReg(*MF, VirtReg)) && + getStage(VirtReg) < RS_Memory) { // TODO: This is experimental and in particular, we do not model // the live range splitting done by spilling correctly. // We would need a deep integration with the spiller to do the // right thing here. Anyway, that is still good for early testing. setStage(VirtReg, RS_Memory); LLVM_DEBUG(dbgs() << "Do as if this register is in memory\n"); - NewVRegs.push_back(VirtReg.reg); + NewVRegs.push_back(VirtReg.reg()); } else { NamedRegionTimer T("spill", "Spiller", TimerGroupName, TimerGroupDescription, TimePassesIsEnabled); @@ -3111,7 +3119,7 @@ Register RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, // Tell LiveDebugVariables about the new ranges. 
Ranges not being covered by // the new regs are kept in LDV (still mapping to the old register), until // we rewrite spilled locations in LDV at a later stage. - DebugVars->splitRegister(VirtReg.reg, LRE.regs(), *LIS); + DebugVars->splitRegister(VirtReg.reg(), LRE.regs(), *LIS); if (VerifyEnabled) MF->verify(this, "After spilling"); diff --git a/llvm/lib/CodeGen/RegAllocPBQP.cpp b/llvm/lib/CodeGen/RegAllocPBQP.cpp index 34701b71f2816..0f848f62f7d1e 100644 --- a/llvm/lib/CodeGen/RegAllocPBQP.cpp +++ b/llvm/lib/CodeGen/RegAllocPBQP.cpp @@ -199,7 +199,7 @@ class SpillCosts : public PBQPRAConstraint { for (auto NId : G.nodeIds()) { PBQP::PBQPNum SpillCost = - LIS.getInterval(G.getNodeMetadata(NId).getVReg()).weight; + LIS.getInterval(G.getNodeMetadata(NId).getVReg()).weight(); if (SpillCost == 0.0) SpillCost = std::numeric_limits::min(); else @@ -290,7 +290,7 @@ class Interference : public PBQPRAConstraint { // If two intervals end at the same point, we need a way to break the tie or // the set will assume they're actually equal and refuse to insert a // "duplicate". Just compare the vregs - fast and guaranteed unique. - return std::get<0>(I1)->reg < std::get<0>(I2)->reg; + return std::get<0>(I1)->reg() < std::get<0>(I2)->reg(); } static bool isAtLastSegment(const IntervalInfo &I) { @@ -595,8 +595,8 @@ void RegAllocPBQP::initializeGraph(PBQPRAGraph &G, VirtRegMap &VRM, // If this is an empty interval move it to the EmptyIntervalVRegs set then // continue. if (VRegLI.empty()) { - EmptyIntervalVRegs.insert(VRegLI.reg); - VRegsToAlloc.erase(VRegLI.reg); + EmptyIntervalVRegs.insert(VRegLI.reg()); + VRegsToAlloc.erase(VRegLI.reg()); continue; } @@ -684,7 +684,7 @@ void RegAllocPBQP::spillVReg(Register VReg, const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); (void)TRI; LLVM_DEBUG(dbgs() << "VREG " << printReg(VReg, &TRI) << " -> SPILLED (Cost: " - << LRE.getParent().weight << ", New vregs: "); + << LRE.getParent().weight() << ", New vregs: "); // Copy any newly inserted live intervals into the list of regs to // allocate. @@ -692,8 +692,8 @@ void RegAllocPBQP::spillVReg(Register VReg, I != E; ++I) { const LiveInterval &LI = LIS.getInterval(*I); assert(!LI.empty() && "Empty spill range."); - LLVM_DEBUG(dbgs() << printReg(LI.reg, &TRI) << " "); - VRegsToAlloc.insert(LI.reg); + LLVM_DEBUG(dbgs() << printReg(LI.reg(), &TRI) << " "); + VRegsToAlloc.insert(LI.reg()); } LLVM_DEBUG(dbgs() << ")\n"); @@ -749,10 +749,10 @@ void RegAllocPBQP::finalizeAlloc(MachineFunction &MF, I != E; ++I) { LiveInterval &LI = LIS.getInterval(*I); - unsigned PReg = MRI.getSimpleHint(LI.reg); + unsigned PReg = MRI.getSimpleHint(LI.reg()); if (PReg == 0) { - const TargetRegisterClass &RC = *MRI.getRegClass(LI.reg); + const TargetRegisterClass &RC = *MRI.getRegClass(LI.reg()); const ArrayRef RawPRegOrder = RC.getRawAllocationOrder(MF); for (unsigned CandidateReg : RawPRegOrder) { if (!VRM.getRegInfo().isReserved(CandidateReg)) { @@ -764,7 +764,7 @@ void RegAllocPBQP::finalizeAlloc(MachineFunction &MF, "No un-reserved physical registers in this register class"); } - VRM.assignVirt2Phys(LI.reg, PReg); + VRM.assignVirt2Phys(LI.reg(), PReg); } } diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 17160a9f42cd5..9bff32bb39166 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -649,7 +649,7 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP, // in IntB, we can merge them. 
if (ValS+1 != BS) return false; - LLVM_DEBUG(dbgs() << "Extending: " << printReg(IntB.reg, TRI)); + LLVM_DEBUG(dbgs() << "Extending: " << printReg(IntB.reg(), TRI)); SlotIndex FillerStart = ValS->end, FillerEnd = BS->start; // We are about to delete CopyMI, so need to remove it as the 'instruction @@ -692,13 +692,13 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP, // If the source instruction was killing the source register before the // merge, unset the isKill marker given the live range has been extended. - int UIdx = ValSEndInst->findRegisterUseOperandIdx(IntB.reg, true); + int UIdx = ValSEndInst->findRegisterUseOperandIdx(IntB.reg(), true); if (UIdx != -1) { ValSEndInst->getOperand(UIdx).setIsKill(false); } // Rewrite the copy. - CopyMI->substituteRegister(IntA.reg, IntB.reg, 0, *TRI); + CopyMI->substituteRegister(IntA.reg(), IntB.reg(), 0, *TRI); // If the copy instruction was killing the destination register or any // subrange before the merge trim the live range. bool RecomputeLiveRange = AS->end == CopyIdx; @@ -817,7 +817,7 @@ RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, return { false, false }; // If DefMI is a two-address instruction then commuting it will change the // destination register. - int DefIdx = DefMI->findRegisterDefOperandIdx(IntA.reg); + int DefIdx = DefMI->findRegisterDefOperandIdx(IntA.reg()); assert(DefIdx != -1); unsigned UseOpIdx; if (!DefMI->isRegTiedToUseOperand(DefIdx, &UseOpIdx)) @@ -838,7 +838,7 @@ RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx); Register NewReg = NewDstMO.getReg(); - if (NewReg != IntB.reg || !IntB.Query(AValNo->def).isKill()) + if (NewReg != IntB.reg() || !IntB.Query(AValNo->def).isKill()) return { false, false }; // Make sure there are no other definitions of IntB that would reach the @@ -848,7 +848,7 @@ RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, // If some of the uses of IntA.reg is already coalesced away, return false. // It's not possible to determine whether it's safe to perform the coalescing. - for (MachineOperand &MO : MRI->use_nodbg_operands(IntA.reg)) { + for (MachineOperand &MO : MRI->use_nodbg_operands(IntA.reg())) { MachineInstr *UseMI = MO.getParent(); unsigned OpNo = &MO - &UseMI->getOperand(0); SlotIndex UseIdx = LIS->getInstructionIndex(*UseMI); @@ -870,9 +870,9 @@ RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, TII->commuteInstruction(*DefMI, false, UseOpIdx, NewDstIdx); if (!NewMI) return { false, false }; - if (Register::isVirtualRegister(IntA.reg) && - Register::isVirtualRegister(IntB.reg) && - !MRI->constrainRegClass(IntB.reg, MRI->getRegClass(IntA.reg))) + if (Register::isVirtualRegister(IntA.reg()) && + Register::isVirtualRegister(IntB.reg()) && + !MRI->constrainRegClass(IntB.reg(), MRI->getRegClass(IntA.reg()))) return { false, false }; if (NewMI != DefMI) { LIS->ReplaceMachineInstrInMaps(*DefMI, *NewMI); @@ -891,9 +891,10 @@ RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, // = B // Update uses of IntA of the specific Val# with IntB. 
- for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(IntA.reg), + for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(IntA.reg()), UE = MRI->use_end(); - UI != UE; /* ++UI is below because of possible MI removal */) { + UI != UE; + /* ++UI is below because of possible MI removal */) { MachineOperand &UseMO = *UI; ++UI; if (UseMO.isUndef()) @@ -920,7 +921,7 @@ RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, continue; if (!UseMI->isCopy()) continue; - if (UseMI->getOperand(0).getReg() != IntB.reg || + if (UseMI->getOperand(0).getReg() != IntB.reg() || UseMI->getOperand(0).getSubReg()) continue; @@ -951,10 +952,10 @@ RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); if (IntA.hasSubRanges() || IntB.hasSubRanges()) { if (!IntA.hasSubRanges()) { - LaneBitmask Mask = MRI->getMaxLaneMaskForVReg(IntA.reg); + LaneBitmask Mask = MRI->getMaxLaneMaskForVReg(IntA.reg()); IntA.createSubRangeFrom(Allocator, Mask, IntA); } else if (!IntB.hasSubRanges()) { - LaneBitmask Mask = MRI->getMaxLaneMaskForVReg(IntB.reg); + LaneBitmask Mask = MRI->getMaxLaneMaskForVReg(IntB.reg()); IntB.createSubRangeFrom(Allocator, Mask, IntB); } SlotIndex AIdx = CopyIdx.getRegSlot(true); @@ -1100,8 +1101,8 @@ bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP, continue; } // Check DefMI is a reverse copy and it is in BB Pred. - if (DefMI->getOperand(0).getReg() != IntA.reg || - DefMI->getOperand(1).getReg() != IntB.reg || + if (DefMI->getOperand(0).getReg() != IntA.reg() || + DefMI->getOperand(1).getReg() != IntB.reg() || DefMI->getParent() != Pred) { CopyLeftBB = Pred; continue; @@ -1158,8 +1159,8 @@ bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP, // Insert new copy to CopyLeftBB. MachineInstr *NewCopyMI = BuildMI(*CopyLeftBB, InsPos, CopyMI.getDebugLoc(), - TII->get(TargetOpcode::COPY), IntB.reg) - .addReg(IntA.reg); + TII->get(TargetOpcode::COPY), IntB.reg()) + .addReg(IntA.reg()); SlotIndex NewCopyIdx = LIS->InsertMachineInstrInMaps(*NewCopyMI).getRegSlot(); IntB.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator()); @@ -1752,7 +1753,7 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg, unsigned DstReg, if (SubIdx != 0 && MO.isUse() && MRI->shouldTrackSubRegLiveness(DstReg)) { if (!DstInt->hasSubRanges()) { BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); - LaneBitmask FullMask = MRI->getMaxLaneMaskForVReg(DstInt->reg); + LaneBitmask FullMask = MRI->getMaxLaneMaskForVReg(DstInt->reg()); LaneBitmask UsedLanes = TRI->getSubRegIndexLaneMask(SubIdx); LaneBitmask UnusedLanes = FullMask & ~UsedLanes; DstInt->createSubRangeFrom(Allocator, UsedLanes, *DstInt); @@ -1991,7 +1992,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { continue; LLVM_DEBUG(dbgs() << "Shrink LaneUses (Lane " << PrintLaneMask(S.LaneMask) << ")\n"); - LIS->shrinkToUses(S, LI.reg); + LIS->shrinkToUses(S, LI.reg()); } LI.removeEmptySubRanges(); } @@ -3353,7 +3354,7 @@ void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI, bool RegisterCoalescer::isHighCostLiveInterval(LiveInterval &LI) { if (LI.valnos.size() < LargeIntervalSizeThreshold) return false; - auto &Counter = LargeLIVisitCounter[LI.reg]; + auto &Counter = LargeLIVisitCounter[LI.reg()]; if (Counter < LargeIntervalFreqThreshold) { Counter++; return false; @@ -3456,8 +3457,8 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) { // Kill flags are going to be wrong if the live ranges were overlapping. 
// Eventually, we should simply clear all kill flags when computing live // ranges. They are reinserted after register allocation. - MRI->clearKillFlags(LHS.reg); - MRI->clearKillFlags(RHS.reg); + MRI->clearKillFlags(LHS.reg()); + MRI->clearKillFlags(RHS.reg()); if (!EndPoints.empty()) { // Recompute the parts of the live range we had to remove because of diff --git a/llvm/lib/CodeGen/RenameIndependentSubregs.cpp b/llvm/lib/CodeGen/RenameIndependentSubregs.cpp index 4ee28d6bbb465..0872ec303460d 100644 --- a/llvm/lib/CodeGen/RenameIndependentSubregs.cpp +++ b/llvm/lib/CodeGen/RenameIndependentSubregs.cpp @@ -130,7 +130,7 @@ bool RenameIndependentSubregs::renameComponents(LiveInterval &LI) const { return false; // Create a new VReg for each class. - unsigned Reg = LI.reg; + unsigned Reg = LI.reg(); const TargetRegisterClass *RegClass = MRI->getRegClass(Reg); SmallVector Intervals; Intervals.push_back(&LI); @@ -175,7 +175,7 @@ bool RenameIndependentSubregs::findComponents(IntEqClasses &Classes, // across subranges when they are affected by the same MachineOperand. const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo(); Classes.grow(NumComponents); - unsigned Reg = LI.reg; + unsigned Reg = LI.reg(); for (const MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) { if (!MO.isDef() && !MO.readsReg()) continue; @@ -212,7 +212,7 @@ void RenameIndependentSubregs::rewriteOperands(const IntEqClasses &Classes, const SmallVectorImpl &SubRangeInfos, const SmallVectorImpl &Intervals) const { const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo(); - unsigned Reg = Intervals[0]->reg; + unsigned Reg = Intervals[0]->reg(); for (MachineRegisterInfo::reg_nodbg_iterator I = MRI->reg_nodbg_begin(Reg), E = MRI->reg_nodbg_end(); I != E; ) { MachineOperand &MO = *I++; @@ -242,7 +242,7 @@ void RenameIndependentSubregs::rewriteOperands(const IntEqClasses &Classes, break; } - unsigned VReg = Intervals[ID]->reg; + unsigned VReg = Intervals[ID]->reg(); MO.setReg(VReg); if (MO.isTied() && Reg != VReg) { @@ -304,7 +304,7 @@ void RenameIndependentSubregs::computeMainRangesFixFlags( const SlotIndexes &Indexes = *LIS->getSlotIndexes(); for (size_t I = 0, E = Intervals.size(); I < E; ++I) { LiveInterval &LI = *Intervals[I]; - unsigned Reg = LI.reg; + unsigned Reg = LI.reg(); LI.removeEmptySubRanges(); diff --git a/llvm/lib/CodeGen/SafeStackLayout.cpp b/llvm/lib/CodeGen/SafeStackLayout.cpp index c823454f825cd..f333e5046ec62 100644 --- a/llvm/lib/CodeGen/SafeStackLayout.cpp +++ b/llvm/lib/CodeGen/SafeStackLayout.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "SafeStackLayout.h" -#include "llvm/Analysis/StackLifetime.h" #include "llvm/IR/Value.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" diff --git a/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp index 15b67e3b69cc1..3443743a28c5f 100644 --- a/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp +++ b/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp @@ -865,6 +865,12 @@ bool ScalarizeMaskedMemIntrin::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { IntrinsicInst *II = dyn_cast(CI); if (II) { + // The scalarization code below does not work for scalable vectors. 
+    if (isa<ScalableVectorType>(II->getType()) ||
+        any_of(II->arg_operands(),
+               [](Value *V) { return isa<ScalableVectorType>(V->getType()); }))
+      return false;
+
     switch (II->getIntrinsicID()) {
     default:
       break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 37d8cdd695445..285bd2455b9f2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1558,9 +1558,15 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
       DAG.ReplaceAllUsesWith(N, &RV);
     }

-    // Push the new node and any users onto the worklist
-    AddToWorklist(RV.getNode());
-    AddUsersToWorklist(RV.getNode());
+    // Push the new node and any users onto the worklist. Omit this if the
+    // new node is the EntryToken (e.g. if a store managed to get optimized
+    // out), because re-visiting the EntryToken and its users will not uncover
+    // any additional opportunities, but there may be a large number of such
+    // users, potentially causing compile time explosion.
+    if (RV.getOpcode() != ISD::EntryToken) {
+      AddToWorklist(RV.getNode());
+      AddUsersToWorklist(RV.getNode());
+    }

     // Finally, if the node is now dead, remove it from the graph.  The node
     // may not be dead if the replacement process recursively simplified to
@@ -7042,7 +7048,7 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) {

   SDValue NewStore =
       DAG.getStore(Chain, DL, SourceValue, FirstStore->getBasePtr(),
-                   FirstStore->getPointerInfo(), FirstStore->getAlignment());
+                   FirstStore->getPointerInfo(), FirstStore->getAlign());

   // Rely on other DAG combine rules to remove the other individual stores.
   DAG.ReplaceAllUsesWith(N, NewStore.getNode());
@@ -7225,10 +7231,10 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
   if (!Allowed || !Fast)
     return SDValue();

-  SDValue NewLoad = DAG.getExtLoad(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD,
-                                   SDLoc(N), VT, Chain, FirstLoad->getBasePtr(),
-                                   FirstLoad->getPointerInfo(), MemVT,
-                                   FirstLoad->getAlignment());
+  SDValue NewLoad =
+      DAG.getExtLoad(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, SDLoc(N), VT,
+                     Chain, FirstLoad->getBasePtr(),
+                     FirstLoad->getPointerInfo(), MemVT, FirstLoad->getAlign());

   // Transfer chain users from old loads to the new load.
   for (LoadSDNode *L : Loads)
@@ -7398,9 +7404,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
     if (N0.hasOneUse()) {
       // FIXME Can we handle multiple uses? Could we token factor the chain
       // results from the new/old setcc?
-      SDValue SetCC = DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC,
-                                   N0.getOperand(0),
-                                   N0Opcode == ISD::STRICT_FSETCCS);
+      SDValue SetCC =
+          DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC, SDNodeFlags(),
+                       N0.getOperand(0), N0Opcode == ISD::STRICT_FSETCCS);
       CombineTo(N, SetCC);
       DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), SetCC.getValue(1));
       recursivelyDeleteUnusedNodes(N0.getNode());
@@ -9238,6 +9244,14 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
   if (ISD::isBuildVectorAllZeros(Mask.getNode()))
     return Chain;

+  // If this is a masked store with an all ones mask, we can use an unmasked
+  // store.
+  // FIXME: Can we do this for indexed, compressing, or truncating stores?
+  if (ISD::isBuildVectorAllOnes(Mask.getNode()) &&
+      MST->isUnindexed() && !MST->isCompressingStore() &&
+      !MST->isTruncatingStore())
+    return DAG.getStore(MST->getChain(), SDLoc(N), MST->getValue(),
+                        MST->getBasePtr(), MST->getMemOperand());
+
   // Try transforming N to an indexed store.
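
[Editor's note — illustrative sketch, not part of the patch] The fold added to visitMSTORE above and its visitMLOAD counterpart just below rest on a simple identity: with an all-ones mask, a masked load or store touches every lane, so it is exactly its unmasked counterpart. A scalar model of the load side:

  #include <array>
  #include <cassert>
  #include <cstddef>

  template <std::size_t N>
  std::array<int, N> maskedLoad(const std::array<int, N> &Mem,
                                const std::array<bool, N> &Mask,
                                const std::array<int, N> &PassThru) {
    std::array<int, N> R{};
    for (std::size_t I = 0; I < N; ++I)
      R[I] = Mask[I] ? Mem[I] : PassThru[I]; // disabled lanes keep PassThru
    return R;
  }

  int main() {
    std::array<int, 4> Mem{1, 2, 3, 4}, Pass{9, 9, 9, 9};
    std::array<bool, 4> AllOnes{true, true, true, true};
    assert(maskedLoad(Mem, AllOnes, Pass) == Mem); // the mask is a no-op
  }
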
   if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
     return SDValue(N, 0);
@@ -9266,6 +9280,16 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
   if (ISD::isBuildVectorAllZeros(Mask.getNode()))
     return CombineTo(N, MLD->getPassThru(), MLD->getChain());

+  // If this is a masked load with an all ones mask, we can use an unmasked
+  // load.
+  // FIXME: Can we do this for indexed, expanding, or extending loads?
+  if (ISD::isBuildVectorAllOnes(Mask.getNode()) &&
+      MLD->isUnindexed() && !MLD->isExpandingLoad() &&
+      MLD->getExtensionType() == ISD::NON_EXTLOAD) {
+    SDValue NewLd = DAG.getLoad(N->getValueType(0), SDLoc(N), MLD->getChain(),
+                                MLD->getBasePtr(), MLD->getMemOperand());
+    return CombineTo(N, NewLd, NewLd.getValue(1));
+  }
+
   // Try transforming N to an indexed load.
   if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
     return SDValue(N, 0);
@@ -9783,7 +9807,7 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
   SDValue BasePtr = LN0->getBasePtr();
   for (unsigned Idx = 0; Idx < NumSplits; Idx++) {
     const unsigned Offset = Idx * Stride;
-    const unsigned Align = MinAlign(LN0->getAlignment(), Offset);
+    const Align Align = commonAlignment(LN0->getAlign(), Offset);

     SDValue SplitLoad = DAG.getExtLoad(
         ExtType, SDLoc(LN0), SplitDstVT, LN0->getChain(), BasePtr,
@@ -10207,7 +10231,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
     SDValue N00 = N0.getOperand(0);
     SDValue N01 = N0.getOperand(1);
     ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
-    EVT N00VT = N0.getOperand(0).getValueType();
+    EVT N00VT = N00.getValueType();

     // sext(setcc) -> sext_in_reg(vsetcc) for vectors.
     // Only do this before legalize for now.
@@ -11009,7 +11033,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
     ShAmt = AdjustBigEndianShift(ShAmt);

   uint64_t PtrOff = ShAmt / 8;
-  unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff);
+  Align NewAlign = commonAlignment(LN0->getAlign(), PtrOff);
   SDLoc DL(LN0);
   // The original load itself didn't wrap, so an offset within it doesn't.
   SDNodeFlags Flags;
@@ -11729,7 +11753,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
         *LN0->getMemOperand())) {
       SDValue Load =
           DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
-                      LN0->getPointerInfo(), LN0->getAlignment(),
+                      LN0->getPointerInfo(), LN0->getAlign(),
                       LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
       DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
       return Load;
@@ -13160,11 +13184,11 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
     if (N1CFP && N1CFP->isZero())
       return N2;
   }

-  // TODO: The FMA node should have flags that propagate to these nodes.
+
   if (N0CFP && N0CFP->isExactlyValue(1.0))
-    return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N2);
+    return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N2, Flags);
   if (N1CFP && N1CFP->isExactlyValue(1.0))
-    return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N2);
+    return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N2, Flags);

   // Canonicalize (fma c, x, y) -> (fma x, c, y)
   if (isConstantFPBuildVectorOrConstantFP(N0) &&
@@ -13193,19 +13217,16 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
     }
   }

-  // (fma x, 1, y) -> (fadd x, y)
   // (fma x, -1, y) -> (fadd (fneg x), y)
   if (N1CFP) {
     if (N1CFP->isExactlyValue(1.0))
-      // TODO: The FMA node should have flags that propagate to this node.
- return DAG.getNode(ISD::FADD, DL, VT, N0, N2); + return DAG.getNode(ISD::FADD, DL, VT, N0, N2, Flags); if (N1CFP->isExactlyValue(-1.0) && (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) { SDValue RHSNeg = DAG.getNode(ISD::FNEG, DL, VT, N0); AddToWorklist(RHSNeg.getNode()); - // TODO: The FMA node should have flags that propagate to this node. - return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg); + return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg, Flags); } // fma (fneg x), K, y -> fma x -K, y @@ -14040,6 +14061,10 @@ static SDValue visitFMinMax(SelectionDAG &DAG, SDNode *N, EVT VT = N->getValueType(0); const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); + const SDNodeFlags Flags = N->getFlags(); + unsigned Opc = N->getOpcode(); + bool PropagatesNaN = Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM; + bool IsMin = Opc == ISD::FMINNUM || Opc == ISD::FMINIMUM; if (N0CFP && N1CFP) { const APFloat &C0 = N0CFP->getValueAPF(); @@ -14050,7 +14075,36 @@ static SDValue visitFMinMax(SelectionDAG &DAG, SDNode *N, // Canonicalize to constant on RHS. if (isConstantFPBuildVectorOrConstantFP(N0) && !isConstantFPBuildVectorOrConstantFP(N1)) - return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0); + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0, Flags); + + if (N1CFP) { + const APFloat &AF = N1CFP->getValueAPF(); + + // minnum(X, nan) -> X + // maxnum(X, nan) -> X + // minimum(X, nan) -> nan + // maximum(X, nan) -> nan + if (AF.isNaN()) + return PropagatesNaN ? N->getOperand(1) : N->getOperand(0); + + // In the following folds, inf can be replaced with the largest finite + // float, if the ninf flag is set. + if (AF.isInfinity() || (Flags.hasNoInfs() && AF.isLargest())) { + // minnum(X, -inf) -> -inf + // maxnum(X, +inf) -> +inf + // minimum(X, -inf) -> -inf if nnan + // maximum(X, +inf) -> +inf if nnan + if (IsMin == AF.isNegative() && (!PropagatesNaN || Flags.hasNoNaNs())) + return N->getOperand(1); + + // minnum(X, +inf) -> X if nnan + // maxnum(X, -inf) -> X if nnan + // minimum(X, +inf) -> X + // maximum(X, -inf) -> X + if (IsMin != AF.isNegative() && (PropagatesNaN || Flags.hasNoNaNs())) + return N->getOperand(0); + } + } return SDValue(); } @@ -15676,8 +15730,6 @@ ShrinkLoadReplaceStoreWithStore(const std::pair &MaskInfo, // Figure out the offset for the store and the alignment of the access. unsigned StOffset; - unsigned NewAlign = St->getAlignment(); - if (DAG.getDataLayout().isLittleEndian()) StOffset = ByteShift; else @@ -15687,7 +15739,6 @@ ShrinkLoadReplaceStoreWithStore(const std::pair &MaskInfo, if (StOffset) { SDLoc DL(IVal); Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(StOffset), DL); - NewAlign = MinAlign(NewAlign, StOffset); } // Truncate down to the new size. @@ -15696,7 +15747,8 @@ ShrinkLoadReplaceStoreWithStore(const std::pair &MaskInfo, ++OpsNarrowed; return DAG .getStore(St->getChain(), SDLoc(St), IVal, Ptr, - St->getPointerInfo().getWithOffset(StOffset), NewAlign); + St->getPointerInfo().getWithOffset(StOffset), + St->getOriginalAlign()); } /// Look for sequence of load / op / store where op is one of 'or', 'xor', and @@ -16109,9 +16161,9 @@ bool DAGCombiner::mergeStoresOfConstantsOrVecElts( // make sure we use trunc store if it's necessary to be legal. 
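
[Editor's note — illustrative sketch, not part of the patch] The visitFMinMax() folds added above encode IEEE-style rules: minnum/maxnum ignore a NaN operand while minimum/maximum propagate it, and an infinity operand either decides the result or drops out depending on which side of the operation it lies on. The minnum row of that table, as a standalone model:

  #include <cassert>
  #include <cmath>
  #include <limits>

  double foldMinNum(double X, double C) {
    if (std::isnan(C))
      return X; // minnum(X, nan) -> X
    if (std::isinf(C))
      return C < 0 ? C : X; // minnum(X, -inf) -> -inf; minnum(X, +inf) -> X
    return std::fmin(X, C); // (the +inf case needs the nnan flag in the
                            //  real fold; this model glosses over that)
  }

  int main() {
    const double NaN = std::numeric_limits<double>::quiet_NaN();
    const double NegInf = -std::numeric_limits<double>::infinity();
    assert(foldMinNum(2.0, NaN) == 2.0);
    assert(foldMinNum(2.0, NegInf) == NegInf);
  }
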
SDValue NewStore; if (!UseTrunc) { - NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(), - FirstInChain->getPointerInfo(), - FirstInChain->getAlignment()); + NewStore = + DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(), + FirstInChain->getPointerInfo(), FirstInChain->getAlign()); } else { // Must be realized as a trunc store EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType()); @@ -16123,8 +16175,7 @@ bool DAGCombiner::mergeStoresOfConstantsOrVecElts( NewStore = DAG.getTruncStore( NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/, - FirstInChain->getAlignment(), - FirstInChain->getMemOperand()->getFlags()); + FirstInChain->getAlign(), FirstInChain->getMemOperand()->getFlags()); } // Replace all merged stores with the new store. @@ -16655,7 +16706,7 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl &StoreNodes, } LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; unsigned FirstStoreAS = FirstInChain->getAddressSpace(); - unsigned FirstStoreAlign = FirstInChain->getAlignment(); + Align FirstStoreAlign = FirstInChain->getAlign(); LoadSDNode *FirstLoad = cast(LoadNodes[0].MemNode); // Scan the memory operations on the chain and find the first @@ -16750,7 +16801,7 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl &StoreNodes, // the NumElem refers to array/index size. unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1); NumElem = std::min(LastLegalType, NumElem); - unsigned FirstLoadAlign = FirstLoad->getAlignment(); + Align FirstLoadAlign = FirstLoad->getAlign(); if (NumElem < 2) { // We know that candidate stores are in order and of correct @@ -16762,8 +16813,8 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl &StoreNodes, // can here. unsigned NumSkip = 1; while ((NumSkip < LoadNodes.size()) && - (LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) && - (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) + (LoadNodes[NumSkip].MemNode->getAlign() <= FirstLoadAlign) && + (StoreNodes[NumSkip].MemNode->getAlign() <= FirstStoreAlign)) NumSkip++; StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumSkip); @@ -16836,11 +16887,10 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl &StoreNodes, FirstLoad->getChain(), FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), JointMemOpVT, FirstLoadAlign, LdMMOFlags); - NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad, - FirstInChain->getBasePtr(), - FirstInChain->getPointerInfo(), JointMemOpVT, - FirstInChain->getAlignment(), - FirstInChain->getMemOperand()->getFlags()); + NewStore = DAG.getTruncStore( + NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), + FirstInChain->getPointerInfo(), JointMemOpVT, + FirstInChain->getAlign(), FirstInChain->getMemOperand()->getFlags()); } // Transfer chain users from old loads to the new load. 
@@ -17042,17 +17092,15 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) { if (DAG.getDataLayout().isBigEndian()) std::swap(Lo, Hi); - unsigned Alignment = ST->getAlignment(); MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags(); AAMDNodes AAInfo = ST->getAAInfo(); SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), - ST->getAlignment(), MMOFlags, AAInfo); + ST->getOriginalAlign(), MMOFlags, AAInfo); Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(4), DL); - Alignment = MinAlign(Alignment, 4U); SDValue St1 = DAG.getStore(Chain, DL, Hi, Ptr, ST->getPointerInfo().getWithOffset(4), - Alignment, MMOFlags, AAInfo); + ST->getOriginalAlign(), MMOFlags, AAInfo); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, St0, St1); } @@ -17385,7 +17433,6 @@ SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) { return SDValue(); // Start to split store. - unsigned Alignment = ST->getAlignment(); MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags(); AAMDNodes AAInfo = ST->getAAInfo(); @@ -17398,13 +17445,12 @@ SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) { SDValue Ptr = ST->getBasePtr(); // Lower value store. SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), - ST->getAlignment(), MMOFlags, AAInfo); + ST->getOriginalAlign(), MMOFlags, AAInfo); Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(HalfValBitSize / 8), DL); // Higher value store. - SDValue St1 = - DAG.getStore(St0, DL, Hi, Ptr, - ST->getPointerInfo().getWithOffset(HalfValBitSize / 8), - Alignment / 2, MMOFlags, AAInfo); + SDValue St1 = DAG.getStore( + St0, DL, Hi, Ptr, ST->getPointerInfo().getWithOffset(HalfValBitSize / 8), + ST->getOriginalAlign(), MMOFlags, AAInfo); return St1; } @@ -21193,7 +21239,7 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS, // It is safe to replace the two loads if they have different alignments, // but the new load must be the minimum (most restrictive) alignment of the // inputs. - unsigned Alignment = std::min(LLD->getAlignment(), RLD->getAlignment()); + Align Alignment = std::min(LLD->getAlign(), RLD->getAlign()); MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags(); if (!RLD->isInvariant()) MMOFlags &= ~MachineMemOperand::MOInvariant; diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 1b924037c3be0..178614cdadf4a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -690,6 +690,12 @@ bool FastISel::selectGetElementPtr(const User *I) { Register N = getRegForValue(I->getOperand(0)); if (!N) // Unhandled operand. Halt "fast" selection and bail. return false; + + // FIXME: The code below does not handle vector GEPs. Halt "fast" selection + // and bail. 
+ if (isa<VectorType>(I->getType()))
+ return false;
+
 bool NIsKill = hasTrivialKill(I->getOperand(0));
 // Keep a running tab of the total offset to coalesce multiple N = N + Offset
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index ff84fdd62075c..e2da367cfe3f6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -89,18 +89,9 @@ static unsigned getStatepointGCArgStartIdx(MachineInstr *MI) {
 "STATEPOINT node expected");
 unsigned OperIdx = StatepointOpers(MI).getNumDeoptArgsIdx();
 unsigned NumDeopts = MI->getOperand(OperIdx).getImm();
- // At this point stack references has not been lowered yet, so they
- // take single operand.
 ++OperIdx;
- while (NumDeopts--) {
- MachineOperand &MO = MI->getOperand(OperIdx);
- if (MO.isImm() && MO.getImm() == StackMaps::ConstantOp) {
- ++OperIdx;
- assert(MI->getOperand(OperIdx).isImm() &&
- "Unexpected statepoint operand");
- }
- ++OperIdx;
- }
+ while (NumDeopts--)
+ OperIdx = StackMaps::getNextMetaArgIdx(MI, OperIdx);
 return OperIdx;
 }
@@ -1002,11 +993,14 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
 assert(!HasPhysRegOuts && "STATEPOINT mishandled");
 MachineInstr *MI = MIB;
 unsigned Def = 0;
- unsigned Use = getStatepointGCArgStartIdx(MI) + 1;
+ unsigned Use = getStatepointGCArgStartIdx(MI);
+ Use = StackMaps::getNextMetaArgIdx(MI, Use); // first derived
+ assert(Use < MI->getNumOperands());
 while (Def < NumDefs) {
 if (MI->getOperand(Use).isReg())
 MI->tieOperands(Def++, Use);
- Use += 2;
+ Use = StackMaps::getNextMetaArgIdx(MI, Use); // next base
+ Use = StackMaps::getNextMetaArgIdx(MI, Use); // next derived
 }
 }
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index f6e4b9363d1a1..9a718480aee8f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -181,6 +181,7 @@ class SelectionDAGLegalize {
 SDValue ExpandBITREVERSE(SDValue Op, const SDLoc &dl);
 SDValue ExpandBSWAP(SDValue Op, const SDLoc &dl);
+ SDValue ExpandPARITY(SDValue Op, const SDLoc &dl);
 SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
 SDValue ExpandInsertToVectorThroughStack(SDValue Op);
@@ -1735,12 +1736,16 @@ bool SelectionDAGLegalize::LegalizeSetCCCondCode(
 if (CCCode != ISD::SETO && CCCode != ISD::SETUO) {
 // If we aren't the ordered or unorder operation,
 // then the pattern is (LHS CC1 RHS) Opc (LHS CC2 RHS).
- SetCC1 = DAG.getSetCC(dl, VT, LHS, RHS, CC1, Chain, IsSignaling);
- SetCC2 = DAG.getSetCC(dl, VT, LHS, RHS, CC2, Chain, IsSignaling);
+ SetCC1 = DAG.getSetCC(dl, VT, LHS, RHS, CC1, SDNodeFlags(), Chain,
+ IsSignaling);
+ SetCC2 = DAG.getSetCC(dl, VT, LHS, RHS, CC2, SDNodeFlags(), Chain,
+ IsSignaling);
 } else {
 // Otherwise, the pattern is (LHS CC1 LHS) Opc (RHS CC2 RHS)
- SetCC1 = DAG.getSetCC(dl, VT, LHS, LHS, CC1, Chain, IsSignaling);
- SetCC2 = DAG.getSetCC(dl, VT, RHS, RHS, CC2, Chain, IsSignaling);
+ SetCC1 = DAG.getSetCC(dl, VT, LHS, LHS, CC1, SDNodeFlags(), Chain,
+ IsSignaling);
+ SetCC2 = DAG.getSetCC(dl, VT, RHS, RHS, CC2, SDNodeFlags(), Chain,
+ IsSignaling);
 }
 if (Chain)
 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, SetCC1.getValue(1),
@@ -1767,9 +1772,9 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT,
 EVT DestVT, const SDLoc &dl,
 SDValue Chain) {
 // Create the stack frame object.
- unsigned SrcAlign = DAG.getDataLayout().getPrefTypeAlignment(
+ Align SrcAlign = DAG.getDataLayout().getPrefTypeAlign(
 SrcOp.getValueType().getTypeForEVT(*DAG.getContext()));
- SDValue FIPtr = DAG.CreateStackTemporary(SlotVT, SrcAlign);
+ SDValue FIPtr = DAG.CreateStackTemporary(SlotVT.getStoreSize(), SrcAlign);
 FrameIndexSDNode *StackPtrFI = cast<FrameIndexSDNode>(FIPtr);
 int SPFI = StackPtrFI->getIndex();
@@ -1780,7 +1785,7 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT,
 unsigned SlotSize = SlotVT.getSizeInBits();
 unsigned DestSize = DestVT.getSizeInBits();
 Type *DestType = DestVT.getTypeForEVT(*DAG.getContext());
- unsigned DestAlign = DAG.getDataLayout().getPrefTypeAlignment(DestType);
+ Align DestAlign = DAG.getDataLayout().getPrefTypeAlign(DestType);
 // Emit a store to the stack slot. Use a truncstore if the input value is
 // later than DestVT.
@@ -1798,7 +1803,7 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT,
 // Result is a load from the stack slot.
 if (SlotSize == DestSize)
 return DAG.getLoad(DestVT, dl, Store, FIPtr, PtrInfo, DestAlign);
-
+
 assert(SlotSize < DestSize && "Unknown extension!");
 return DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, Store, FIPtr, PtrInfo,
 SlotVT, DestAlign);
@@ -2781,6 +2786,28 @@ SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, const SDLoc &dl) {
 }
 }
+/// Open code the operations for PARITY of the specified operation.
+SDValue SelectionDAGLegalize::ExpandPARITY(SDValue Op, const SDLoc &dl) {
+ EVT VT = Op.getValueType();
+ EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+ unsigned Sz = VT.getScalarSizeInBits();
+
+ // If CTPOP is legal, use it. Otherwise use shifts and xor.
+ SDValue Result;
+ if (TLI.isOperationLegal(ISD::CTPOP, VT)) {
+ Result = DAG.getNode(ISD::CTPOP, dl, VT, Op);
+ } else {
+ Result = Op;
+ for (unsigned i = Log2_32_Ceil(Sz); i != 0;) {
+ SDValue Shift = DAG.getNode(ISD::SRL, dl, VT, Result,
+ DAG.getConstant(1ULL << (--i), dl, ShVT));
+ Result = DAG.getNode(ISD::XOR, dl, VT, Result, Shift);
+ }
+ }
+
+ return DAG.getNode(ISD::AND, dl, VT, Result, DAG.getConstant(1, dl, VT));
+}
+
 bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
 LLVM_DEBUG(dbgs() << "Trying to expand node\n");
 SmallVector<SDValue, 8> Results;
@@ -2812,6 +2839,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
 case ISD::BSWAP:
 Results.push_back(ExpandBSWAP(Node->getOperand(0), dl));
 break;
+ case ISD::PARITY:
+ Results.push_back(ExpandPARITY(Node->getOperand(0), dl));
+ break;
 case ISD::FRAMEADDR:
 case ISD::RETURNADDR:
 case ISD::FRAME_TO_ARGS_OFFSET:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 09b5f14bdb7b4..27105060c785c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -134,6 +134,12 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
 case ISD::UINT_TO_FP: R = SoftenFloatRes_XINT_TO_FP(N); break;
 case ISD::UNDEF: R = SoftenFloatRes_UNDEF(N); break;
 case ISD::VAARG: R = SoftenFloatRes_VAARG(N); break;
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAX:
+ R = SoftenFloatRes_VECREDUCE(N);
+ break;
 }
 // If R is null, the sub-method took care of registering the result.
@@ -772,6 +778,12 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) {
 return Tmp.first;
 }
+SDValue DAGTypeLegalizer::SoftenFloatRes_VECREDUCE(SDNode *N) {
+ // Expand and soften recursively.
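+ // expandVecReduce lowers the reduction to a tree of scalar FP operations;
+ // ReplaceValueWith re-queues those nodes, so each of them is softened too.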
+ ReplaceValueWith(SDValue(N, 0), TLI.expandVecReduce(N, DAG)); + return SDValue(); +} + //===----------------------------------------------------------------------===// // Convert Float Operand to Integer @@ -1777,17 +1789,18 @@ void DAGTypeLegalizer::FloatExpandSetCCOperands(SDValue &NewLHS, // The following can be improved, but not that much. SDValue Tmp1, Tmp2, Tmp3, OutputChain; Tmp1 = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()), LHSHi, - RHSHi, ISD::SETOEQ, Chain, IsSignaling); + RHSHi, ISD::SETOEQ, SDNodeFlags(), Chain, IsSignaling); OutputChain = Tmp1->getNumValues() > 1 ? Tmp1.getValue(1) : SDValue(); Tmp2 = DAG.getSetCC(dl, getSetCCResultType(LHSLo.getValueType()), LHSLo, - RHSLo, CCCode, OutputChain, IsSignaling); + RHSLo, CCCode, SDNodeFlags(), OutputChain, IsSignaling); OutputChain = Tmp2->getNumValues() > 1 ? Tmp2.getValue(1) : SDValue(); Tmp3 = DAG.getNode(ISD::AND, dl, Tmp1.getValueType(), Tmp1, Tmp2); - Tmp1 = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()), LHSHi, - RHSHi, ISD::SETUNE, OutputChain, IsSignaling); + Tmp1 = + DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()), LHSHi, RHSHi, + ISD::SETUNE, SDNodeFlags(), OutputChain, IsSignaling); OutputChain = Tmp1->getNumValues() > 1 ? Tmp1.getValue(1) : SDValue(); Tmp2 = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()), LHSHi, - RHSHi, CCCode, OutputChain, IsSignaling); + RHSHi, CCCode, SDNodeFlags(), OutputChain, IsSignaling); OutputChain = Tmp2->getNumValues() > 1 ? Tmp2.getValue(1) : SDValue(); Tmp1 = DAG.getNode(ISD::AND, dl, Tmp1.getValueType(), Tmp1, Tmp2); NewLHS = DAG.getNode(ISD::OR, dl, Tmp1.getValueType(), Tmp1, Tmp3); @@ -2231,6 +2244,12 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { case ISD::UINT_TO_FP: R = PromoteFloatRes_XINT_TO_FP(N); break; case ISD::UNDEF: R = PromoteFloatRes_UNDEF(N); break; case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break; + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_FMIN: + case ISD::VECREDUCE_FMAX: + R = PromoteFloatRes_VECREDUCE(N); + break; } if (R.getNode()) @@ -2462,6 +2481,15 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_UNDEF(SDNode *N) { N->getValueType(0))); } +SDValue DAGTypeLegalizer::PromoteFloatRes_VECREDUCE(SDNode *N) { + // Expand and promote recursively. + // TODO: This is non-optimal, but dealing with the concurrently happening + // vector-legalization is non-trivial. We could do something similar to + // PromoteFloatRes_EXTRACT_VECTOR_ELT here. + ReplaceValueWith(SDValue(N, 0), TLI.expandVecReduce(N, DAG)); + return SDValue(); +} + SDValue DAGTypeLegalizer::BitcastToInt_ATOMIC_SWAP(SDNode *N) { EVT VT = N->getValueType(0); @@ -2570,6 +2598,12 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { case ISD::UINT_TO_FP: R = SoftPromoteHalfRes_XINT_TO_FP(N); break; case ISD::UNDEF: R = SoftPromoteHalfRes_UNDEF(N); break; case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break; + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_FMIN: + case ISD::VECREDUCE_FMAX: + R = SoftPromoteHalfRes_VECREDUCE(N); + break; } if (R.getNode()) @@ -2762,6 +2796,12 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_BinOp(SDNode *N) { return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res); } +SDValue DAGTypeLegalizer::SoftPromoteHalfRes_VECREDUCE(SDNode *N) { + // Expand and soften recursively. 
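+ // As with softening: the expanded scalar FP nodes are re-analyzed and then
+ // soft-promoted (kept in i16) individually.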
+ ReplaceValueWith(SDValue(N, 0), TLI.expandVecReduce(N, DAG)); + return SDValue(); +} + //===----------------------------------------------------------------------===// // Half Operand Soft Promotion //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 77a79a0479ef7..0000fcb1dde1b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -62,7 +62,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::Constant: Res = PromoteIntRes_Constant(N); break; case ISD::CTLZ_ZERO_UNDEF: case ISD::CTLZ: Res = PromoteIntRes_CTLZ(N); break; - case ISD::CTPOP: Res = PromoteIntRes_CTPOP(N); break; + case ISD::PARITY: + case ISD::CTPOP: Res = PromoteIntRes_CTPOP_PARITY(N); break; case ISD::CTTZ_ZERO_UNDEF: case ISD::CTTZ: Res = PromoteIntRes_CTTZ(N); break; case ISD::EXTRACT_VECTOR_ELT: @@ -503,10 +504,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) { NVT)); } -SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP(SDNode *N) { - // Zero extend to the promoted type and do the count there. +SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP_PARITY(SDNode *N) { + // Zero extend to the promoted type and do the count or parity there. SDValue Op = ZExtPromotedInteger(N->getOperand(0)); - return DAG.getNode(ISD::CTPOP, SDLoc(N), Op.getValueType(), Op); + return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op); } SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) { @@ -1980,6 +1981,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::AssertZext: ExpandIntRes_AssertZext(N, Lo, Hi); break; case ISD::BITREVERSE: ExpandIntRes_BITREVERSE(N, Lo, Hi); break; case ISD::BSWAP: ExpandIntRes_BSWAP(N, Lo, Hi); break; + case ISD::PARITY: ExpandIntRes_PARITY(N, Lo, Hi); break; case ISD::Constant: ExpandIntRes_Constant(N, Lo, Hi); break; case ISD::ABS: ExpandIntRes_ABS(N, Lo, Hi); break; case ISD::CTLZ_ZERO_UNDEF: @@ -2772,6 +2774,17 @@ void DAGTypeLegalizer::ExpandIntRes_BSWAP(SDNode *N, Hi = DAG.getNode(ISD::BSWAP, dl, Hi.getValueType(), Hi); } +void DAGTypeLegalizer::ExpandIntRes_PARITY(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDLoc dl(N); + // parity(HiLo) -> parity(Lo^Hi) + GetExpandedInteger(N->getOperand(0), Lo, Hi); + EVT NVT = Lo.getValueType(); + Lo = + DAG.getNode(ISD::PARITY, dl, NVT, DAG.getNode(ISD::XOR, dl, NVT, Lo, Hi)); + Hi = DAG.getConstant(0, dl, NVT); +} + void DAGTypeLegalizer::ExpandIntRes_Constant(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); @@ -2789,16 +2802,38 @@ void DAGTypeLegalizer::ExpandIntRes_Constant(SDNode *N, void DAGTypeLegalizer::ExpandIntRes_ABS(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); + SDValue N0 = N->getOperand(0); + GetExpandedInteger(N0, Lo, Hi); + EVT NVT = Lo.getValueType(); + + // If we have ADDCARRY, use the expanded form of the sra+add+xor sequence we + // use in LegalizeDAG. The ADD part of the expansion is based on + // ExpandIntRes_ADDSUB which also uses ADDCARRY/UADDO after checking that + // ADDCARRY is LegalOrCustom. Each of the pieces here can be further expanded + // if needed. Shift expansion has a special case for filling with sign bits + // so that we will only end up with one SRA. 
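+ // In effect: Sign = Hi >>s (bits - 1), then {Hi,Lo} is computed as
+ // ({Hi,Lo} + {Sign,Sign}) ^ {Sign,Sign}, with the low half's carry feeding
+ // the high half's add.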
+ bool HasAddCarry = TLI.isOperationLegalOrCustom(
+ ISD::ADDCARRY, TLI.getTypeToExpandTo(*DAG.getContext(), NVT));
+ if (HasAddCarry) {
+ EVT ShiftAmtTy = getShiftAmountTyForConstant(NVT, TLI, DAG);
+ SDValue Sign =
+ DAG.getNode(ISD::SRA, dl, NVT, Hi,
+ DAG.getConstant(NVT.getSizeInBits() - 1, dl, ShiftAmtTy));
+ SDVTList VTList = DAG.getVTList(NVT, getSetCCResultType(NVT));
+ Lo = DAG.getNode(ISD::UADDO, dl, VTList, Lo, Sign);
+ Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Hi, Sign, Lo.getValue(1));
+ Lo = DAG.getNode(ISD::XOR, dl, NVT, Lo, Sign);
+ Hi = DAG.getNode(ISD::XOR, dl, NVT, Hi, Sign);
+ return;
+ }
+
 // abs(HiLo) -> (Hi < 0 ? -HiLo : HiLo)
 EVT VT = N->getValueType(0);
- SDValue N0 = N->getOperand(0);
 SDValue Neg = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), N0);
 SDValue NegLo, NegHi;
 SplitInteger(Neg, NegLo, NegHi);
- GetExpandedInteger(N0, Lo, Hi);
- EVT NVT = Lo.getValueType();
 SDValue HiIsNeg = DAG.getSetCC(dl, getSetCCResultType(NVT),
 DAG.getConstant(0, dl, NVT), Hi, ISD::SETGT);
 Lo = DAG.getSelect(dl, NVT, HiIsNeg, NegLo, Lo);
@@ -4680,8 +4715,23 @@ SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N) {
 SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) {
 SDLoc dl(N);
+
+ EVT ResVT = N->getValueType(0);
 unsigned NumElems = N->getNumOperands();
+ if (ResVT.isScalableVector()) {
+ SDValue ResVec = DAG.getUNDEF(ResVT);
+
+ for (unsigned OpIdx = 0; OpIdx < NumElems; ++OpIdx) {
+ SDValue Op = N->getOperand(OpIdx);
+ unsigned OpNumElts = Op.getValueType().getVectorMinNumElements();
+ ResVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ResVec, Op,
+ DAG.getIntPtrConstant(OpIdx * OpNumElts, dl));
+ }
+
+ return ResVec;
+ }
+
 EVT RetSclrTy = N->getValueType(0).getVectorElementType();
 SmallVector<SDValue, 8> NewOps;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index ae087d3bbd8cb..855d9f3c12a84 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -955,11 +955,12 @@ bool DAGTypeLegalizer::CustomWidenLowerNode(SDNode *N, EVT VT) {
 assert(Results.size() == N->getNumValues() &&
 "Custom lowering returned the wrong number of results!");
 for (unsigned i = 0, e = Results.size(); i != e; ++i) {
- // If this is a chain output just replace it.
- if (Results[i].getValueType() == MVT::Other)
- ReplaceValueWith(SDValue(N, i), Results[i]);
- else
+ // If this is a chain output or already widened just replace it.
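+ // Chain results keep MVT::Other on both sides, so a type mismatch here
+ // reliably identifies the results that were actually widened.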
+ bool WasWidened = SDValue(N, i).getValueType() != Results[i].getValueType(); + if (WasWidened) SetWidenedVector(SDValue(N, i), Results[i]); + else + ReplaceValueWith(SDValue(N, i), Results[i]); } return true; } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 34c563672753d..fbbb35cb905f2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -311,7 +311,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntRes_BUILD_PAIR(SDNode *N); SDValue PromoteIntRes_Constant(SDNode *N); SDValue PromoteIntRes_CTLZ(SDNode *N); - SDValue PromoteIntRes_CTPOP(SDNode *N); + SDValue PromoteIntRes_CTPOP_PARITY(SDNode *N); SDValue PromoteIntRes_CTTZ(SDNode *N); SDValue PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N); SDValue PromoteIntRes_FP_TO_XINT(SDNode *N); @@ -431,6 +431,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { void ExpandIntRes_ADDSUBCARRY (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_BITREVERSE (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_BSWAP (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_PARITY (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_MUL (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_SDIV (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_SREM (SDNode *N, SDValue &Lo, SDValue &Hi); @@ -547,6 +548,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftenFloatRes_UNDEF(SDNode *N); SDValue SoftenFloatRes_VAARG(SDNode *N); SDValue SoftenFloatRes_XINT_TO_FP(SDNode *N); + SDValue SoftenFloatRes_VECREDUCE(SDNode *N); // Convert Float Operand to Integer. bool SoftenFloatOperand(SDNode *N, unsigned OpNo); @@ -665,6 +667,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteFloatRes_UNDEF(SDNode *N); SDValue BitcastToInt_ATOMIC_SWAP(SDNode *N); SDValue PromoteFloatRes_XINT_TO_FP(SDNode *N); + SDValue PromoteFloatRes_VECREDUCE(SDNode *N); bool PromoteFloatOperand(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_BITCAST(SDNode *N, unsigned OpNo); @@ -702,6 +705,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftPromoteHalfRes_UnaryOp(SDNode *N); SDValue SoftPromoteHalfRes_XINT_TO_FP(SDNode *N); SDValue SoftPromoteHalfRes_UNDEF(SDNode *N); + SDValue SoftPromoteHalfRes_VECREDUCE(SDNode *N); bool SoftPromoteHalfOperand(SDNode *N, unsigned OpNo); SDValue SoftPromoteHalfOp_BITCAST(SDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 093f7b1680edd..b09303e5219eb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1006,7 +1006,8 @@ void DAGTypeLegalizer::IncrementPointer(MemSDNode *N, EVT MemVT, Flags.setNoUnsignedWrap(true); if (ScaledOffset) *ScaledOffset += IncrementSize; - Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, BytesIncrement); + Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, BytesIncrement, + Flags); } else { MPI = N->getPointerInfo().getWithOffset(IncrementSize); // Increment the pointer to the other half. 
@@ -2044,16 +2045,12 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::FP_TO_UINT: case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: - case ISD::CTTZ: - case ISD::CTLZ: - case ISD::CTPOP: case ISD::STRICT_FP_EXTEND: case ISD::FP_EXTEND: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: case ISD::FTRUNC: - case ISD::FCANONICALIZE: Res = SplitVecOp_UnaryOp(N); break; @@ -2146,7 +2143,6 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) { EVT LoOpVT, HiOpVT; std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT); - bool NoNaN = N->getFlags().hasNoNaNs(); unsigned CombineOpc = 0; switch (N->getOpcode()) { case ISD::VECREDUCE_FADD: CombineOpc = ISD::FADD; break; @@ -2160,12 +2156,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) { case ISD::VECREDUCE_SMIN: CombineOpc = ISD::SMIN; break; case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break; case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break; - case ISD::VECREDUCE_FMAX: - CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXIMUM; - break; - case ISD::VECREDUCE_FMIN: - CombineOpc = NoNaN ? ISD::FMINNUM : ISD::FMINIMUM; - break; + case ISD::VECREDUCE_FMAX: CombineOpc = ISD::FMAXNUM; break; + case ISD::VECREDUCE_FMIN: CombineOpc = ISD::FMINNUM; break; default: llvm_unreachable("Unexpected reduce ISD node"); } @@ -3307,19 +3299,34 @@ SDValue DAGTypeLegalizer::WidenVecRes_OverflowOp(SDNode *N, unsigned ResNo) { } SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { + LLVMContext &Ctx = *DAG.getContext(); SDValue InOp = N->getOperand(0); SDLoc DL(N); - EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + EVT WidenVT = TLI.getTypeToTransformTo(Ctx, N->getValueType(0)); unsigned WidenNumElts = WidenVT.getVectorNumElements(); EVT InVT = InOp.getValueType(); - EVT InEltVT = InVT.getVectorElementType(); - EVT InWidenVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, WidenNumElts); unsigned Opcode = N->getOpcode(); - unsigned InVTNumElts = InVT.getVectorNumElements(); const SDNodeFlags Flags = N->getFlags(); + + // Handle the case of ZERO_EXTEND where the promoted InVT element size does + // not equal that of WidenVT. + if (N->getOpcode() == ISD::ZERO_EXTEND && + getTypeAction(InVT) == TargetLowering::TypePromoteInteger && + TLI.getTypeToTransformTo(Ctx, InVT).getScalarSizeInBits() != + WidenVT.getScalarSizeInBits()) { + InOp = ZExtPromotedInteger(InOp); + InVT = InOp.getValueType(); + if (WidenVT.getScalarSizeInBits() < InVT.getScalarSizeInBits()) + Opcode = ISD::TRUNCATE; + } + + EVT InEltVT = InVT.getVectorElementType(); + EVT InWidenVT = EVT::getVectorVT(Ctx, InEltVT, WidenNumElts); + unsigned InVTNumElts = InVT.getVectorNumElements(); + if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) { InOp = GetWidenedVector(N->getOperand(0)); InVT = InOp.getValueType(); @@ -4756,6 +4763,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) { EVT OrigVT = N->getOperand(0).getValueType(); EVT WideVT = Op.getValueType(); EVT ElemVT = OrigVT.getVectorElementType(); + SDNodeFlags Flags = N->getFlags(); SDValue NeutralElem; switch (N->getOpcode()) { @@ -4787,12 +4795,18 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) { NeutralElem = DAG.getConstantFP(1.0, dl, ElemVT); break; case ISD::VECREDUCE_FMAX: + // This has maxnum semantics, so NaN represents missing data. We must clear + // 'nnan' if it was set because the NaN would be a poison value. 
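+ // (maxnum returns its other operand when one input is a quiet NaN, so
+ // padding the widened vector with NaN leaves the reduced result unchanged.)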
 NeutralElem = DAG.getConstantFP(
- -std::numeric_limits<double>::infinity(), dl, ElemVT);
+ std::numeric_limits<double>::quiet_NaN(), dl, ElemVT);
+ Flags.setNoNaNs(false);
 break;
 case ISD::VECREDUCE_FMIN:
+ // This has minnum semantics, so NaN represents missing data. We must clear
+ // 'nnan' if it was set because the NaN would be a poison value.
 NeutralElem = DAG.getConstantFP(
- std::numeric_limits<double>::infinity(), dl, ElemVT);
+ std::numeric_limits<double>::quiet_NaN(), dl, ElemVT);
+ Flags.setNoNaNs(false);
 break;
 }
@@ -4803,7 +4817,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) {
 Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, WideVT, Op, NeutralElem,
 DAG.getVectorIdxConstant(Idx, dl));
- return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), Op, N->getFlags());
+ return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), Op, Flags);
 }
 SDValue DAGTypeLegalizer::WidenVecOp_VSELECT(SDNode *N) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 2350248626c71..f94e0a034807c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3053,6 +3053,11 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
 Known.Zero.setBitsFrom(Log2_32(PossibleOnes) + 1);
 break;
 }
+ case ISD::PARITY: {
+ // Parity returns 0 everywhere but the LSB.
+ Known.Zero.setBitsFrom(1);
+ break;
+ }
 case ISD::LOAD: {
 LoadSDNode *LD = cast<LoadSDNode>(Op);
 const Constant *Cst = TLI->getTargetConstantFromLoad(LD);
@@ -3370,21 +3375,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
 }
 case ISD::ABS: {
 Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
-
- // If the source's MSB is zero then we know the rest of the bits already.
- if (Known2.isNonNegative()) {
- Known.Zero = Known2.Zero;
- Known.One = Known2.One;
- break;
- }
-
- // We only know that the absolute values's MSB will be zero iff there is
- // a set bit that isn't the sign bit (otherwise it could be INT_MIN).
- Known2.One.clearSignBit(); - if (Known2.One.getBoolValue()) { - Known.Zero = APInt::getSignMask(BitWidth); - break; - } + Known = Known2.abs(); break; } case ISD::UMIN: { @@ -6112,7 +6103,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, Store = DAG.getStore( Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, TypeSize::Fixed(DstOff), dl), - DstPtrInfo.getWithOffset(DstOff), Alignment.value(), MMOFlags); + DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags); OutChains.push_back(Store); } } @@ -6136,13 +6127,13 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, ISD::EXTLOAD, dl, NVT, Chain, DAG.getMemBasePlusOffset(Src, TypeSize::Fixed(SrcOff), dl), SrcPtrInfo.getWithOffset(SrcOff), VT, - commonAlignment(*SrcAlign, SrcOff).value(), SrcMMOFlags); + commonAlignment(*SrcAlign, SrcOff), SrcMMOFlags); OutLoadChains.push_back(Value.getValue(1)); Store = DAG.getTruncStore( Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, TypeSize::Fixed(DstOff), dl), - DstPtrInfo.getWithOffset(DstOff), VT, Alignment.value(), MMOFlags); + DstPtrInfo.getWithOffset(DstOff), VT, Alignment, MMOFlags); OutStoreChains.push_back(Store); } SrcOff += VTSize; @@ -6262,10 +6253,10 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, if (isDereferenceable) SrcMMOFlags |= MachineMemOperand::MODereferenceable; - Value = DAG.getLoad( - VT, dl, Chain, - DAG.getMemBasePlusOffset(Src, TypeSize::Fixed(SrcOff), dl), - SrcPtrInfo.getWithOffset(SrcOff), SrcAlign->value(), SrcMMOFlags); + Value = + DAG.getLoad(VT, dl, Chain, + DAG.getMemBasePlusOffset(Src, TypeSize::Fixed(SrcOff), dl), + SrcPtrInfo.getWithOffset(SrcOff), *SrcAlign, SrcMMOFlags); LoadValues.push_back(Value); LoadChains.push_back(Value.getValue(1)); SrcOff += VTSize; @@ -6277,10 +6268,10 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, unsigned VTSize = VT.getSizeInBits() / 8; SDValue Store; - Store = DAG.getStore( - Chain, dl, LoadValues[i], - DAG.getMemBasePlusOffset(Dst, TypeSize::Fixed(DstOff), dl), - DstPtrInfo.getWithOffset(DstOff), Alignment.value(), MMOFlags); + Store = + DAG.getStore(Chain, dl, LoadValues[i], + DAG.getMemBasePlusOffset(Dst, TypeSize::Fixed(DstOff), dl), + DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags); OutChains.push_back(Store); DstOff += VTSize; } @@ -6380,7 +6371,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl, SDValue Store = DAG.getStore( Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, TypeSize::Fixed(DstOff), dl), - DstPtrInfo.getWithOffset(DstOff), Alignment.value(), + DstPtrInfo.getWithOffset(DstOff), Alignment, isVol ? 
MachineMemOperand::MOVolatile : MachineMemOperand::MONone);
 OutChains.push_back(Store);
 DstOff += VT.getSizeInBits() / 8;
@@ -7045,8 +7036,7 @@ SDValue SelectionDAG::getIndexedLoad(SDValue OrigLoad, const SDLoc &dl,
 ~(MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
 return getLoad(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl,
 LD->getChain(), Base, Offset, LD->getPointerInfo(),
- LD->getMemoryVT(), LD->getAlignment(), MMOFlags,
- LD->getAAInfo());
+ LD->getMemoryVT(), LD->getAlign(), MMOFlags, LD->getAAInfo());
 }
 SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 5e6cb03f3839c..530ede44548ae 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/ConstantFolding.h"
@@ -82,6 +83,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Statepoint.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
@@ -1120,27 +1122,6 @@ void SelectionDAGBuilder::visit(const Instruction &I) {
 visit(I.getOpcode(), I);
- if (auto *FPMO = dyn_cast<FPMathOperator>(&I)) {
- // ConstrainedFPIntrinsics handle their own FMF.
- if (!isa<ConstrainedFPIntrinsic>(&I)) {
- // Propagate the fast-math-flags of this IR instruction to the DAG node that
- // maps to this instruction.
- // TODO: We could handle all flags (nsw, etc) here.
- // TODO: If an IR instruction maps to >1 node, only the final node will have
- // flags set.
- // TODO: The handling of flags should be improved, see
- // https://reviews.llvm.org/D86871
- if (SDNode *Node = getNodeForIRValue(&I)) {
- SDNodeFlags IncomingFlags;
- IncomingFlags.copyFMF(*FPMO);
- if (!Node->getFlags().isDefined())
- Node->setFlags(IncomingFlags);
- else
- Node->intersectFlagsWith(IncomingFlags);
- }
- }
- }
-
 if (!I.isTerminator() && !HasTailCall &&
 !isa<GCStatepointInst>(I)) // statepoints handle their exports internally
 CopyToExportRegsIfNeeded(&I);
@@ -2560,7 +2541,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
 SDLoc dl = getCurSDLoc();
 SDValue StackSlotPtr = DAG.getFrameIndex(FI, PtrTy);
 const Module &M = *ParentBB->getParent()->getFunction().getParent();
- unsigned Align = DL->getPrefTypeAlignment(Type::getInt8PtrTy(M.getContext()));
+ Align Align = DL->getPrefTypeAlign(Type::getInt8PtrTy(M.getContext()));
 // Generate code to load the content of the guard slot.
 SDValue GuardVal = DAG.getLoad(
@@ -3023,9 +3004,10 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned Opcode) {
 Flags.setNoSignedWrap(OFBinOp->hasNoSignedWrap());
 Flags.setNoUnsignedWrap(OFBinOp->hasNoUnsignedWrap());
 }
- if (auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I)) {
+ if (auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
 Flags.setExact(ExactOp->isExact());
- }
+ if (auto *FPOp = dyn_cast<FPMathOperator>(&I))
+ Flags.copyFMF(*FPOp);
 SDValue Op1 = getValue(I.getOperand(0));
 SDValue Op2 = getValue(I.getOperand(1));
@@ -3135,13 +3117,16 @@ void SelectionDAGBuilder::visitFCmp(const User &I) {
 SDValue Op2 = getValue(I.getOperand(1));
 ISD::CondCode Condition = getFCmpCondCode(predicate);
- auto *FPMO = dyn_cast<FPMathOperator>(&I);
- if ((FPMO && FPMO->hasNoNaNs()) || TM.Options.NoNaNsFPMath)
+ auto *FPMO = cast<FPMathOperator>(&I);
+ if (FPMO->hasNoNaNs() || TM.Options.NoNaNsFPMath)
 Condition = getFCmpCodeWithoutNaN(Condition);
+ SDNodeFlags Flags;
+ Flags.copyFMF(*FPMO);
+
 EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
 I.getType());
- setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Condition));
+ setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Condition, Flags));
 }
 // Check if the condition of the select has one use or two users that are both
@@ -3169,6 +3154,10 @@ void SelectionDAGBuilder::visitSelect(const User &I) {
 bool IsUnaryAbs = false;
+ SDNodeFlags Flags;
+ if (auto *FPOp = dyn_cast<FPMathOperator>(&I))
+ Flags.copyFMF(*FPOp);
+
 // Min/max matching is only viable if all output VTs are the same.
 if (is_splat(ValueVTs)) {
 EVT VT = ValueVTs[0];
@@ -3272,7 +3261,7 @@ void SelectionDAGBuilder::visitSelect(const User &I) {
 Ops.push_back(SDValue(RHSVal.getNode(), RHSVal.getResNo() + i));
 Values[i] = DAG.getNode(
 OpCode, getCurSDLoc(),
- LHSVal.getNode()->getValueType(LHSVal.getResNo() + i), Ops);
+ LHSVal.getNode()->getValueType(LHSVal.getResNo() + i), Ops, Flags);
 }
 }
@@ -4876,7 +4865,7 @@ static SDValue getLimitedPrecisionExp2(SDValue t0, const SDLoc &dl,
 /// expandExp - Lower an exp intrinsic. Handles the special sequences for
 /// limited-precision mode.
 static SDValue expandExp(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
- const TargetLowering &TLI) {
+ const TargetLowering &TLI, SDNodeFlags Flags) {
 if (Op.getValueType() == MVT::f32 &&
 LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
@@ -4892,13 +4881,13 @@ static SDValue expandExp(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
 }
 // No special expansion.
- return DAG.getNode(ISD::FEXP, dl, Op.getValueType(), Op);
+ return DAG.getNode(ISD::FEXP, dl, Op.getValueType(), Op, Flags);
 }
 /// expandLog - Lower a log intrinsic. Handles the special sequences for
 /// limited-precision mode.
 static SDValue expandLog(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
- const TargetLowering &TLI) {
+ const TargetLowering &TLI, SDNodeFlags Flags) {
 // TODO: What fast-math-flags should be set on the floating-point nodes?
 if (Op.getValueType() == MVT::f32 &&
@@ -4991,13 +4980,13 @@ static SDValue expandLog(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
 }
 // No special expansion.
- return DAG.getNode(ISD::FLOG, dl, Op.getValueType(), Op);
+ return DAG.getNode(ISD::FLOG, dl, Op.getValueType(), Op, Flags);
 }
 /// expandLog2 - Lower a log2 intrinsic. Handles the special sequences for
 /// limited-precision mode.
 static SDValue expandLog2(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
- const TargetLowering &TLI) {
+ const TargetLowering &TLI, SDNodeFlags Flags) {
 // TODO: What fast-math-flags should be set on the floating-point nodes?
 if (Op.getValueType() == MVT::f32 &&
@@ -5088,13 +5077,13 @@ static SDValue expandLog2(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
 }
 // No special expansion.
- return DAG.getNode(ISD::FLOG2, dl, Op.getValueType(), Op);
+ return DAG.getNode(ISD::FLOG2, dl, Op.getValueType(), Op, Flags);
 }
 /// expandLog10 - Lower a log10 intrinsic. Handles the special sequences for
 /// limited-precision mode.
 static SDValue expandLog10(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
- const TargetLowering &TLI) {
+ const TargetLowering &TLI, SDNodeFlags Flags) {
 // TODO: What fast-math-flags should be set on the floating-point nodes?
 if (Op.getValueType() == MVT::f32 &&
@@ -5178,25 +5167,26 @@ static SDValue expandLog10(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
 }
 // No special expansion.
- return DAG.getNode(ISD::FLOG10, dl, Op.getValueType(), Op);
+ return DAG.getNode(ISD::FLOG10, dl, Op.getValueType(), Op, Flags);
 }
 /// expandExp2 - Lower an exp2 intrinsic. Handles the special sequences for
 /// limited-precision mode.
 static SDValue expandExp2(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
- const TargetLowering &TLI) {
+ const TargetLowering &TLI, SDNodeFlags Flags) {
 if (Op.getValueType() == MVT::f32 &&
 LimitFloatPrecision > 0 && LimitFloatPrecision <= 18)
 return getLimitedPrecisionExp2(Op, dl, DAG);
 // No special expansion.
- return DAG.getNode(ISD::FEXP2, dl, Op.getValueType(), Op);
+ return DAG.getNode(ISD::FEXP2, dl, Op.getValueType(), Op, Flags);
 }
 /// visitPow - Lower a pow intrinsic. Handles the special sequences for
 /// limited-precision mode with x == 10.0f.
 static SDValue expandPow(const SDLoc &dl, SDValue LHS, SDValue RHS,
- SelectionDAG &DAG, const TargetLowering &TLI) {
+ SelectionDAG &DAG, const TargetLowering &TLI,
+ SDNodeFlags Flags) {
 bool IsExp10 = false;
 if (LHS.getValueType() == MVT::f32 && RHS.getValueType() == MVT::f32 &&
 LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
@@ -5219,7 +5209,7 @@ static SDValue expandPow(const SDLoc &dl, SDValue LHS, SDValue RHS,
 }
 // No special expansion.
- return DAG.getNode(ISD::FPOW, dl, LHS.getValueType(), LHS, RHS);
+ return DAG.getNode(ISD::FPOW, dl, LHS.getValueType(), LHS, RHS, Flags);
 }
 /// ExpandPowI - Expand a llvm.powi intrinsic.
@@ -5640,6 +5630,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
 DebugLoc dl = getCurDebugLoc();
 SDValue Res;
+ SDNodeFlags Flags;
+ if (auto *FPOp = dyn_cast<FPMathOperator>(&I))
+ Flags.copyFMF(*FPOp);
+
 switch (Intrinsic) {
 default:
 // By default, turn this into a target intrinsic node.
@@ -6054,23 +6048,26 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, getValue(I.getArgOperand(1)), DAG)); return; case Intrinsic::log: - setValue(&I, expandLog(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); + setValue(&I, expandLog(sdl, getValue(I.getArgOperand(0)), DAG, TLI, Flags)); return; case Intrinsic::log2: - setValue(&I, expandLog2(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); + setValue(&I, + expandLog2(sdl, getValue(I.getArgOperand(0)), DAG, TLI, Flags)); return; case Intrinsic::log10: - setValue(&I, expandLog10(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); + setValue(&I, + expandLog10(sdl, getValue(I.getArgOperand(0)), DAG, TLI, Flags)); return; case Intrinsic::exp: - setValue(&I, expandExp(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); + setValue(&I, expandExp(sdl, getValue(I.getArgOperand(0)), DAG, TLI, Flags)); return; case Intrinsic::exp2: - setValue(&I, expandExp2(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); + setValue(&I, + expandExp2(sdl, getValue(I.getArgOperand(0)), DAG, TLI, Flags)); return; case Intrinsic::pow: setValue(&I, expandPow(sdl, getValue(I.getArgOperand(0)), - getValue(I.getArgOperand(1)), DAG, TLI)); + getValue(I.getArgOperand(1)), DAG, TLI, Flags)); return; case Intrinsic::sqrt: case Intrinsic::fabs: @@ -6103,7 +6100,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, setValue(&I, DAG.getNode(Opcode, sdl, getValue(I.getArgOperand(0)).getValueType(), - getValue(I.getArgOperand(0)))); + getValue(I.getArgOperand(0)), Flags)); return; } case Intrinsic::lround: @@ -6128,38 +6125,37 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, setValue(&I, DAG.getNode(ISD::FMINNUM, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), - getValue(I.getArgOperand(1)))); + getValue(I.getArgOperand(1)), Flags)); return; case Intrinsic::maxnum: setValue(&I, DAG.getNode(ISD::FMAXNUM, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), - getValue(I.getArgOperand(1)))); + getValue(I.getArgOperand(1)), Flags)); return; case Intrinsic::minimum: setValue(&I, DAG.getNode(ISD::FMINIMUM, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), - getValue(I.getArgOperand(1)))); + getValue(I.getArgOperand(1)), Flags)); return; case Intrinsic::maximum: setValue(&I, DAG.getNode(ISD::FMAXIMUM, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), - getValue(I.getArgOperand(1)))); + getValue(I.getArgOperand(1)), Flags)); return; case Intrinsic::copysign: setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), - getValue(I.getArgOperand(1)))); + getValue(I.getArgOperand(1)), Flags)); return; case Intrinsic::fma: - setValue(&I, DAG.getNode(ISD::FMA, sdl, - getValue(I.getArgOperand(0)).getValueType(), - getValue(I.getArgOperand(0)), - getValue(I.getArgOperand(1)), - getValue(I.getArgOperand(2)))); + setValue(&I, DAG.getNode( + ISD::FMA, sdl, getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), + getValue(I.getArgOperand(2)), Flags)); return; #define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \ case Intrinsic::INTRINSIC: @@ -6174,17 +6170,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), - getValue(I.getArgOperand(2)))); + getValue(I.getArgOperand(2)), Flags)); } else { // TODO: Intrinsic calls 
should have fast-math-flags.
- SDValue Mul = DAG.getNode(ISD::FMUL, sdl,
- getValue(I.getArgOperand(0)).getValueType(),
- getValue(I.getArgOperand(0)),
- getValue(I.getArgOperand(1)));
+ SDValue Mul = DAG.getNode(
+ ISD::FMUL, sdl, getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), Flags);
 SDValue Add = DAG.getNode(ISD::FADD, sdl,
 getValue(I.getArgOperand(0)).getValueType(),
- Mul,
- getValue(I.getArgOperand(2)));
+ Mul, getValue(I.getArgOperand(2)), Flags);
 setValue(&I, Add);
 }
 return;
@@ -6388,7 +6382,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
 } else {
 EVT PtrTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
 const Value *Global = TLI.getSDagStackGuard(M);
- unsigned Align = DL->getPrefTypeAlignment(Global->getType());
+ Align Align = DL->getPrefTypeAlign(Global->getType());
 Res = DAG.getLoad(PtrTy, sdl, Chain, getValue(Global),
 MachinePointerInfo(Global, 0), Align,
 MachineMemOperand::MOVolatile);
@@ -6419,9 +6413,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
 SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
 // Store the stack protector onto the stack.
- Res = DAG.getStore(Chain, sdl, Src, FIN, MachinePointerInfo::getFixedStack(
- DAG.getMachineFunction(), FI),
- /* Alignment = */ 0, MachineMemOperand::MOVolatile);
+ Res = DAG.getStore(
+ Chain, sdl, Src, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+ MaybeAlign(), MachineMemOperand::MOVolatile);
 setValue(&I, Res);
 DAG.setRoot(Res);
 return;
@@ -7253,9 +7248,9 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
 }
 SDValue Ptr = Builder.getValue(PtrVal);
- SDValue LoadVal = Builder.DAG.getLoad(LoadVT, Builder.getCurSDLoc(), Root,
- Ptr, MachinePointerInfo(PtrVal),
- /* Alignment = */ 1);
+ SDValue LoadVal =
+ Builder.DAG.getLoad(LoadVT, Builder.getCurSDLoc(), Root, Ptr,
+ MachinePointerInfo(PtrVal), Align(1));
 if (!ConstantMemory)
 Builder.PendingLoads.push_back(LoadVal.getValue(1));
@@ -7532,8 +7527,12 @@ bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I,
 if (!I.onlyReadsMemory())
 return false;
+ SDNodeFlags Flags;
+ Flags.copyFMF(cast<FPMathOperator>(I));
+
 SDValue Tmp = getValue(I.getArgOperand(0));
- setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), Tmp.getValueType(), Tmp));
+ setValue(&I,
+ DAG.getNode(Opcode, getCurSDLoc(), Tmp.getValueType(), Tmp, Flags));
 return true;
 }
@@ -7548,10 +7547,13 @@ bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I,
 if (!I.onlyReadsMemory())
 return false;
+ SDNodeFlags Flags;
+ Flags.copyFMF(cast<FPMathOperator>(I));
+
 SDValue Tmp0 = getValue(I.getArgOperand(0));
 SDValue Tmp1 = getValue(I.getArgOperand(1));
 EVT VT = Tmp0.getValueType();
- setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), VT, Tmp0, Tmp1));
+ setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), VT, Tmp0, Tmp1, Flags));
 return true;
 }
@@ -8951,24 +8953,26 @@ void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
 SDLoc dl = getCurSDLoc();
 EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
 SDValue Res;
- FastMathFlags FMF;
- if (isa<FPMathOperator>(I))
- FMF = I.getFastMathFlags();
+ SDNodeFlags SDFlags;
+ if (auto *FPMO = dyn_cast<FPMathOperator>(&I))
+ SDFlags.copyFMF(*FPMO);
 switch (Intrinsic) {
 case Intrinsic::experimental_vector_reduce_v2_fadd:
- if (FMF.allowReassoc())
+ if (SDFlags.hasAllowReassociation())
 Res = DAG.getNode(ISD::FADD, dl, VT, Op1,
- DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2));
+ DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2, SDFlags),
+ SDFlags);
 else
- Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2);
+ Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2, SDFlags);
 break;
 case Intrinsic::experimental_vector_reduce_v2_fmul:
- if (FMF.allowReassoc())
+ if (SDFlags.hasAllowReassociation())
 Res = DAG.getNode(ISD::FMUL, dl, VT, Op1,
- DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2));
+ DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2, SDFlags),
+ SDFlags);
 else
- Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2);
+ Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2, SDFlags);
 break;
 case Intrinsic::experimental_vector_reduce_add:
 Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1);
@@ -8998,10 +9002,10 @@ void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
 Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1);
 break;
 case Intrinsic::experimental_vector_reduce_fmax:
- Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1);
+ Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags);
 break;
 case Intrinsic::experimental_vector_reduce_fmin:
- Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1);
+ Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags);
 break;
 default:
 llvm_unreachable("Unhandled vector reduce intrinsic");
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 7bad055198140..4904134a7d400 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -18,7 +18,6 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/SwitchLoweringUtils.h"
@@ -26,7 +25,6 @@
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Instruction.h"
-#include "llvm/IR/Statepoint.h"
 #include "llvm/Support/BranchProbability.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -39,6 +37,7 @@
 namespace llvm {
+class AAResults;
 class AllocaInst;
 class AtomicCmpXchgInst;
 class AtomicRMWInst;
@@ -63,6 +62,7 @@ class FunctionLoweringInfo;
 class GCFunctionInfo;
 class GCRelocateInst;
 class GCResultInst;
+class GCStatepointInst;
 class IndirectBrInst;
 class InvokeInst;
 class LandingPadInst;
@@ -388,7 +388,7 @@ class SelectionDAGBuilder {
 SelectionDAG &DAG;
 const DataLayout *DL = nullptr;
- AliasAnalysis *AA = nullptr;
+ AAResults *AA = nullptr;
 const TargetLibraryInfo *LibInfo;
 class SDAGSwitchLowering : public SwitchCG::SwitchLowering {
@@ -442,7 +442,7 @@ class SelectionDAGBuilder {
 SL(std::make_unique<SDAGSwitchLowering>(this, funcinfo)), FuncInfo(funcinfo),
 SwiftError(swifterror) {}
- void init(GCFunctionInfo *gfi, AliasAnalysis *AA,
+ void init(GCFunctionInfo *gfi, AAResults *AA,
 const TargetLibraryInfo *li);
 /// Clear out the current SelectionDAG and the associated state and prepare
@@ -518,13 +518,6 @@ class SelectionDAGBuilder {
 SDValue getValue(const Value *V);
- /// Return the SDNode for the specified IR value if it exists.
- SDNode *getNodeForIRValue(const Value *V) { - if (NodeMap.find(V) == NodeMap.end()) - return nullptr; - return NodeMap[V].getNode(); - } - SDValue getNonRegisterValue(const Value *V); SDValue getValueImpl(const Value *V); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index fcd09b6141677..f854a4f4d35f8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -412,6 +412,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::CTTZ_ZERO_UNDEF: return "cttz_zero_undef"; case ISD::CTLZ: return "ctlz"; case ISD::CTLZ_ZERO_UNDEF: return "ctlz_zero_undef"; + case ISD::PARITY: return "parity"; // Trampolines case ISD::INIT_TRAMPOLINE: return "init_trampoline"; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 8650cfceb86c5..ffabe7a5b0411 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -75,6 +75,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsWebAssembly.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/Statepoint.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index 7cbeb1016c67b..7d3fe690cf101 100644 --- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -14,12 +14,10 @@ #include "StatepointLowering.h" #include "SelectionDAGBuilder.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GCMetadata.h" @@ -30,7 +28,6 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetOpcodes.h" @@ -841,7 +838,7 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT( Register Reg = FuncInfo.CreateRegs(RetTy); RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(), DAG.getDataLayout(), Reg, RetTy, None); - SDValue Chain = DAG.getEntryNode(); + SDValue Chain = DAG.getRoot(); RFV.getCopyToRegs(Relocated, DAG, getCurSDLoc(), Chain, nullptr); PendingExports.push_back(Chain); @@ -919,8 +916,9 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT( // Remove original call node DAG.DeleteNode(CallNode); - // DON'T set the root - under the assumption that it's already set past the - // inserted node we created. + // Since we always emit CopyToRegs (even for local relocates), we must + // update root, so that they are emitted before any local uses. 
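+ // getControlRoot() folds PendingExports into the DAG root, so the copies
+ // queued above are chained before any later local use of the relocates.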
+ (void)getControlRoot();
 // TODO: A better future implementation would be to emit a single variable
 // argument, variable return value STATEPOINT node here and then hookup the
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index ae98edb74466d..5c9273150014f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1325,15 +1325,15 @@ bool TargetLowering::SimplifyDemandedBits(
 return true;
 // If all of the unknown bits are known to be zero on one side or the other
- // (but not both) turn this into an *inclusive* or.
+ // turn this into an *inclusive* or.
 // e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
 if (DemandedBits.isSubsetOf(Known.Zero | Known2.Zero))
 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::OR, dl, VT, Op0, Op1));
 ConstantSDNode* C = isConstOrConstSplat(Op1, DemandedElts);
 if (C) {
- // If one side is a constant, and all of the known set bits on the other
- // side are also set in the constant, turn this into an AND, as we know
+ // If one side is a constant, and all of the set bits in the constant are
+ // also known set on the other side, turn this into an AND, as we know
 // the bits will be cleared.
 // e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
 // NB: it is okay if more bits are known than are requested
@@ -1748,6 +1748,17 @@ bool TargetLowering::SimplifyDemandedBits(
 Known.Zero = Known2.Zero.byteSwap();
 break;
 }
+ case ISD::CTPOP: {
+ // If only 1 bit is demanded, replace with PARITY as long as we're before
+ // op legalization.
+ // FIXME: Limit to scalars for now.
+ if (DemandedBits.isOneValue() && !TLO.LegalOps && !VT.isVector())
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::PARITY, dl, VT,
+ Op.getOperand(0)));
+
+ Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
+ break;
+ }
 case ISD::SIGN_EXTEND_INREG: {
 SDValue Op0 = Op.getOperand(0);
 EVT ExVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
@@ -3590,10 +3601,10 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
 if (bestOffset != 0)
 Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(bestOffset), dl);
- unsigned NewAlign = MinAlign(Lod->getAlignment(), bestOffset);
- SDValue NewLoad = DAG.getLoad(
- newVT, dl, Lod->getChain(), Ptr,
- Lod->getPointerInfo().getWithOffset(bestOffset), NewAlign);
+ SDValue NewLoad =
+ DAG.getLoad(newVT, dl, Lod->getChain(), Ptr,
+ Lod->getPointerInfo().getWithOffset(bestOffset),
+ Lod->getOriginalAlign());
 return DAG.getSetCC(dl, VT,
 DAG.getNode(ISD::AND, dl, newVT, NewLoad,
 DAG.getConstant(bestMask.trunc(bestWidth),
@@ -5762,8 +5773,10 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
 // If we already have the use of the negated floating constant, it is free
 // to negate it even it has multiple uses.
- if (!Op.hasOneUse() && CFP.use_empty())
+ if (!Op.hasOneUse() && CFP.use_empty()) {
+ RemoveDeadNode(CFP);
 break;
+ }
 Cost = NegatibleCost::Neutral;
 return CFP;
 }
@@ -5821,7 +5834,8 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
 if (NegX && (CostX <= CostY)) {
 Cost = CostX;
 SDValue N = DAG.getNode(ISD::FSUB, DL, VT, NegX, Y, Flags);
- RemoveDeadNode(NegY);
+ if (NegY != N)
+ RemoveDeadNode(NegY);
 return N;
 }
@@ -5829,7 +5843,8 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
 if (NegY) {
 Cost = CostY;
 SDValue N = DAG.getNode(ISD::FSUB, DL, VT, NegY, X, Flags);
- RemoveDeadNode(NegX);
+ if (NegX != N)
+ RemoveDeadNode(NegX);
 return N;
 }
 break;
@@ -5868,7 +5883,8 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
 if (NegX && (CostX <= CostY)) {
 Cost = CostX;
 SDValue N = DAG.getNode(Opcode, DL, VT, NegX, Y, Flags);
- RemoveDeadNode(NegY);
+ if (NegY != N)
+ RemoveDeadNode(NegY);
 return N;
 }
@@ -5881,7 +5897,8 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
 if (NegY) {
 Cost = CostY;
 SDValue N = DAG.getNode(Opcode, DL, VT, X, NegY, Flags);
- RemoveDeadNode(NegX);
+ if (NegX != N)
+ RemoveDeadNode(NegX);
 return N;
 }
 break;
@@ -5912,7 +5929,8 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
 if (NegX && (CostX <= CostY)) {
 Cost = std::min(CostX, CostZ);
 SDValue N = DAG.getNode(Opcode, DL, VT, NegX, Y, NegZ, Flags);
- RemoveDeadNode(NegY);
+ if (NegY != N)
+ RemoveDeadNode(NegY);
 return N;
 }
@@ -5920,7 +5938,8 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
 if (NegY) {
 Cost = std::min(CostY, CostZ);
 SDValue N = DAG.getNode(Opcode, DL, VT, X, NegY, NegZ, Flags);
- RemoveDeadNode(NegX);
+ if (NegX != N)
+ RemoveDeadNode(NegX);
 return N;
 }
 break;
@@ -6409,7 +6428,7 @@ bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result,
 SDValue Sel;
 if (Node->isStrictFPOpcode()) {
- Sel = DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT,
+ Sel = DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT, SDNodeFlags(),
 Node->getOperand(0), /*IsSignaling*/ true);
 Chain = Sel.getValue(1);
 } else {
@@ -6806,7 +6825,7 @@ TargetLowering::scalarizeVectorLoad(LoadSDNode *LD,
 // the codegen worse.
 SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, SL, LoadVT, Chain, BasePTR,
- LD->getPointerInfo(), SrcIntVT, LD->getAlignment(),
+ LD->getPointerInfo(), SrcIntVT, LD->getOriginalAlign(),
 LD->getMemOperand()->getFlags(), LD->getAAInfo());
 SmallVector<SDValue, 8> Vals;
@@ -6843,7 +6862,7 @@ TargetLowering::scalarizeVectorLoad(LoadSDNode *LD,
 SDValue ScalarLoad =
 DAG.getExtLoad(ExtType, SL, DstEltVT, Chain, BasePTR,
 LD->getPointerInfo().getWithOffset(Idx * Stride),
- SrcEltVT, MinAlign(LD->getAlignment(), Idx * Stride),
+ SrcEltVT, LD->getOriginalAlign(),
 LD->getMemOperand()->getFlags(), LD->getAAInfo());
 BasePTR = DAG.getObjectPtrOffset(SL, BasePTR, TypeSize::Fixed(Stride));
@@ -6906,7 +6925,7 @@ SDValue TargetLowering::scalarizeVectorStore(StoreSDNode *ST,
 }
 return DAG.getStore(Chain, SL, CurrVal, BasePtr, ST->getPointerInfo(),
- ST->getAlignment(), ST->getMemOperand()->getFlags(),
+ ST->getOriginalAlign(), ST->getMemOperand()->getFlags(),
 ST->getAAInfo());
 }
@@ -6926,8 +6945,8 @@ SDValue TargetLowering::scalarizeVectorStore(StoreSDNode *ST,
 // This scalar TruncStore may be illegal, but we legalize it later.
SDValue Store = DAG.getTruncStore( Chain, SL, Elt, Ptr, ST->getPointerInfo().getWithOffset(Idx * Stride), - MemSclVT, MinAlign(ST->getAlignment(), Idx * Stride), - ST->getMemOperand()->getFlags(), ST->getAAInfo()); + MemSclVT, ST->getOriginalAlign(), ST->getMemOperand()->getFlags(), + ST->getAAInfo()); Stores.push_back(Store); } @@ -6992,7 +7011,7 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const { // Load one integer register's worth from the original location. SDValue Load = DAG.getLoad( RegVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(Offset), - MinAlign(LD->getAlignment(), Offset), LD->getMemOperand()->getFlags(), + LD->getOriginalAlign(), LD->getMemOperand()->getFlags(), LD->getAAInfo()); // Follow the load with a store to the stack slot. Remember the store. Stores.push_back(DAG.getStore( @@ -7011,8 +7030,8 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const { SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Chain, Ptr, LD->getPointerInfo().getWithOffset(Offset), MemVT, - MinAlign(LD->getAlignment(), Offset), - LD->getMemOperand()->getFlags(), LD->getAAInfo()); + LD->getOriginalAlign(), LD->getMemOperand()->getFlags(), + LD->getAAInfo()); // Follow the load with a store to the stack slot. Remember the store. // On big-endian machines this requires a truncating store to ensure // that the bits end up in the right place. @@ -7042,7 +7061,7 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const { NewLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBits/2); NumBits >>= 1; - unsigned Alignment = LD->getAlignment(); + Align Alignment = LD->getOriginalAlign(); unsigned IncrementSize = NumBits / 8; ISD::LoadExtType HiExtType = LD->getExtensionType(); @@ -7060,8 +7079,8 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const { Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::Fixed(IncrementSize)); Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo().getWithOffset(IncrementSize), - NewLoadedVT, MinAlign(Alignment, IncrementSize), - LD->getMemOperand()->getFlags(), LD->getAAInfo()); + NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(), + LD->getAAInfo()); } else { Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo(), NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(), @@ -7070,8 +7089,8 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const { Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::Fixed(IncrementSize)); Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo().getWithOffset(IncrementSize), - NewLoadedVT, MinAlign(Alignment, IncrementSize), - LD->getMemOperand()->getFlags(), LD->getAAInfo()); + NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(), + LD->getAAInfo()); } // aggregate the two parts @@ -7095,7 +7114,7 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST, SDValue Ptr = ST->getBasePtr(); SDValue Val = ST->getValue(); EVT VT = Val.getValueType(); - int Alignment = ST->getAlignment(); + Align Alignment = ST->getOriginalAlign(); auto &MF = DAG.getMachineFunction(); EVT StoreMemVT = ST->getMemoryVT(); @@ -7152,7 +7171,7 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST, // Store it to the final location. Remember the store. 
Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, Ptr, ST->getPointerInfo().getWithOffset(Offset), - MinAlign(ST->getAlignment(), Offset), + ST->getOriginalAlign(), ST->getMemOperand()->getFlags())); // Increment the pointers. Offset += RegBytes; @@ -7174,7 +7193,7 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST, Stores.push_back( DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr, ST->getPointerInfo().getWithOffset(Offset), LoadMemVT, - MinAlign(ST->getAlignment(), Offset), + ST->getOriginalAlign(), ST->getMemOperand()->getFlags(), ST->getAAInfo())); // The order of the stores doesn't matter - say it with a TokenFactor. SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); @@ -7202,7 +7221,6 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST, ST->getMemOperand()->getFlags()); Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::Fixed(IncrementSize)); - Alignment = MinAlign(Alignment, IncrementSize); Store2 = DAG.getTruncStore( Chain, dl, DAG.getDataLayout().isLittleEndian() ? Hi : Lo, Ptr, ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT, Alignment, @@ -7934,7 +7952,6 @@ bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result, SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const { SDLoc dl(Node); - bool NoNaN = Node->getFlags().hasNoNaNs(); unsigned BaseOpcode = 0; switch (Node->getOpcode()) { default: llvm_unreachable("Expected VECREDUCE opcode"); @@ -7949,12 +7966,8 @@ SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const { case ISD::VECREDUCE_SMIN: BaseOpcode = ISD::SMIN; break; case ISD::VECREDUCE_UMAX: BaseOpcode = ISD::UMAX; break; case ISD::VECREDUCE_UMIN: BaseOpcode = ISD::UMIN; break; - case ISD::VECREDUCE_FMAX: - BaseOpcode = NoNaN ? ISD::FMAXNUM : ISD::FMAXIMUM; - break; - case ISD::VECREDUCE_FMIN: - BaseOpcode = NoNaN ? ISD::FMINNUM : ISD::FMINIMUM; - break; + case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break; + case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break; } SDValue Op = Node->getOperand(0); diff --git a/llvm/lib/CodeGen/SpillPlacement.cpp b/llvm/lib/CodeGen/SpillPlacement.cpp index 36a0ddf67b193..4bb50a285497f 100644 --- a/llvm/lib/CodeGen/SpillPlacement.cpp +++ b/llvm/lib/CodeGen/SpillPlacement.cpp @@ -27,10 +27,7 @@ //===----------------------------------------------------------------------===// #include "SpillPlacement.h" -#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/SparseSet.h" #include "llvm/CodeGen/EdgeBundles.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" @@ -39,7 +36,6 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" -#include "llvm/Support/BlockFrequency.h" #include #include #include diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp index 8ff1cffcd1e6a..4029c855c910e 100644 --- a/llvm/lib/CodeGen/SplitKit.cpp +++ b/llvm/lib/CodeGen/SplitKit.cpp @@ -168,7 +168,7 @@ void SplitAnalysis::analyzeUses() { // Get use slots form the use-def chain. 
const MachineRegisterInfo &MRI = MF.getRegInfo(); - for (MachineOperand &MO : MRI.use_nodbg_operands(CurLI->reg)) + for (MachineOperand &MO : MRI.use_nodbg_operands(CurLI->reg())) if (!MO.isUndef()) UseSlots.push_back(LIS.getInstructionIndex(*MO.getParent()).getRegSlot()); @@ -333,7 +333,7 @@ unsigned SplitAnalysis::countLiveBlocks(const LiveInterval *cli) const { } bool SplitAnalysis::isOriginalEndpoint(SlotIndex Idx) const { - unsigned OrigReg = VRM.getOriginal(CurLI->reg); + unsigned OrigReg = VRM.getOriginal(CurLI->reg()); const LiveInterval &Orig = LIS.getInterval(OrigReg); assert(!Orig.empty() && "Splitting empty interval?"); LiveInterval::const_iterator I = Orig.find(Idx); @@ -433,7 +433,7 @@ void SplitEditor::addDeadDef(LiveInterval &LI, VNInfo *VNI, bool Original) { LaneBitmask LM; for (const MachineOperand &DefOp : DefMI->defs()) { Register R = DefOp.getReg(); - if (R != LI.reg) + if (R != LI.reg()) continue; if (unsigned SR = DefOp.getSubReg()) LM |= TRI.getSubRegIndexLaneMask(SR); @@ -636,7 +636,7 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx, LiveInterval &OrigLI = LIS.getInterval(Original); VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx); - unsigned Reg = LI->reg; + unsigned Reg = LI->reg(); bool DidRemat = false; if (OrigVNI) { LiveRangeEdit::Remat RM(ParentVNI); @@ -649,10 +649,13 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx, } if (!DidRemat) { LaneBitmask LaneMask; - if (LI->hasSubRanges()) { + if (OrigLI.hasSubRanges()) { LaneMask = LaneBitmask::getNone(); - for (LiveInterval::SubRange &S : LI->subranges()) - LaneMask |= S.LaneMask; + for (LiveInterval::SubRange &S : OrigLI.subranges()) { + if (S.liveAt(UseIdx)) + LaneMask |= S.LaneMask; + } + assert(LaneMask.any() && "Interval has no live subranges"); } else { LaneMask = LaneBitmask::getAll(); } @@ -1329,7 +1332,7 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) { // Rewrite to the mapped register at Idx. unsigned RegIdx = RegAssign.lookup(Idx); LiveInterval &LI = LIS.getInterval(Edit->get(RegIdx)); - MO.setReg(LI.reg); + MO.setReg(LI.reg()); LLVM_DEBUG(dbgs() << " rewr " << printMBBReference(*MI->getParent()) << '\t' << Idx << ':' << RegIdx << '\t' << *MI); @@ -1411,7 +1414,7 @@ void SplitEditor::deleteRematVictims() { continue; MachineInstr *MI = LIS.getInstructionFromIndex(S.valno->def); assert(MI && "Missing instruction for dead def"); - MI->addRegisterDead(LI->reg, &TRI); + MI->addRegisterDead(LI->reg(), &TRI); if (!MI->allDefsAreDead()) continue; @@ -1531,7 +1534,7 @@ void SplitEditor::finish(SmallVectorImpl *LRMap) { LIS.splitSeparateComponents(LI, SplitLIs); unsigned Original = VRM.getOriginal(VReg); for (LiveInterval *SplitLI : SplitLIs) - VRM.setIsSplitFromReg(SplitLI->reg, Original); + VRM.setIsSplitFromReg(SplitLI->reg(), Original); // The new intervals all map back to i. 
if (LRMap) diff --git a/llvm/lib/CodeGen/StackMaps.cpp b/llvm/lib/CodeGen/StackMaps.cpp index 113d477ec80a7..806ba1aa98226 100644 --- a/llvm/lib/CodeGen/StackMaps.cpp +++ b/llvm/lib/CodeGen/StackMaps.cpp @@ -88,6 +88,29 @@ StackMaps::StackMaps(AsmPrinter &AP) : AP(AP) { llvm_unreachable("Unsupported stackmap version!"); } +unsigned StackMaps::getNextMetaArgIdx(MachineInstr *MI, unsigned CurIdx) { + assert(CurIdx < MI->getNumOperands() && "Bad meta arg index"); + const auto &MO = MI->getOperand(CurIdx); + if (MO.isImm()) { + switch (MO.getImm()) { + default: + llvm_unreachable("Unrecognized operand type."); + case StackMaps::DirectMemRefOp: + CurIdx += 2; + break; + case StackMaps::IndirectMemRefOp: + CurIdx += 3; + break; + case StackMaps::ConstantOp: + ++CurIdx; + break; + } + } + ++CurIdx; + assert(CurIdx < MI->getNumOperands() && "points past operand list"); + return CurIdx; +} + /// Go up the super-register chain until we hit a valid dwarf register number. static unsigned getDwarfRegNum(unsigned Reg, const TargetRegisterInfo *TRI) { int RegNum = TRI->getDwarfRegNum(Reg, false); diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp index 3cc5d30ebad7d..a6f8974f33436 100644 --- a/llvm/lib/CodeGen/StackSlotColoring.cpp +++ b/llvm/lib/CodeGen/StackSlotColoring.cpp @@ -145,7 +145,7 @@ namespace { // their weight. struct IntervalSorter { bool operator()(LiveInterval* LHS, LiveInterval* RHS) const { - return LHS->weight > RHS->weight; + return LHS->weight() > RHS->weight(); } }; @@ -174,7 +174,8 @@ void StackSlotColoring::ScanForSpillSlotRefs(MachineFunction &MF) { continue; LiveInterval &li = LS->getInterval(FI); if (!MI.isDebugValue()) - li.weight += LiveIntervals::getSpillWeight(false, true, MBFI, MI); + li.incrementWeight( + LiveIntervals::getSpillWeight(false, true, MBFI, MI)); } for (MachineInstr::mmo_iterator MMOI = MI.memoperands_begin(), EE = MI.memoperands_end(); @@ -222,7 +223,7 @@ void StackSlotColoring::InitializeSlots() { for (auto *I : Intervals) { LiveInterval &li = I->second; LLVM_DEBUG(li.dump()); - int FI = Register::stackSlot2Index(li.reg); + int FI = Register::stackSlot2Index(li.reg()); if (MFI->isDeadObjectIndex(FI)) continue; @@ -269,7 +270,7 @@ StackSlotColoring::OverlapWithAssignments(LiveInterval *li, int Color) const { int StackSlotColoring::ColorSlot(LiveInterval *li) { int Color = -1; bool Share = false; - int FI = Register::stackSlot2Index(li->reg); + int FI = Register::stackSlot2Index(li->reg()); uint8_t StackID = MFI->getStackID(FI); if (!DisableSharing) { @@ -331,12 +332,12 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) { bool Changed = false; for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) { LiveInterval *li = SSIntervals[i]; - int SS = Register::stackSlot2Index(li->reg); + int SS = Register::stackSlot2Index(li->reg()); int NewSS = ColorSlot(li); assert(NewSS >= 0 && "Stack coloring failed?"); SlotMapping[SS] = NewSS; RevMap[NewSS].push_back(SS); - SlotWeights[NewSS] += li->weight; + SlotWeights[NewSS] += li->weight(); UsedColors.set(NewSS); Changed |= (SS != NewSS); } @@ -344,8 +345,8 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "\nSpill slots after coloring:\n"); for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) { LiveInterval *li = SSIntervals[i]; - int SS = Register::stackSlot2Index(li->reg); - li->weight = SlotWeights[SS]; + int SS = Register::stackSlot2Index(li->reg()); + li->setWeight(SlotWeights[SS]); } // Sort them by new weight. 
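// [Editor's aside on the StackMaps::getNextMetaArgIdx helper added above;
// not part of the patch.] STACKMAP/PATCHPOINT meta operands are
// variable-width, which is why the switch advances by different amounts:
// DirectMemRefOp is followed by <Reg, Offset> (2 extra operands),
// IndirectMemRefOp by <Size, Reg, Offset> (3), and ConstantOp by one
// constant value (1); a bare register is a single operand, covered by the
// trailing ++CurIdx alone. A hypothetical walk over the meta arguments,
// where FirstMetaArgIdx and LastMetaArgIdx are assumed bounds supplied by
// the caller (the helper asserts rather than walking off the end):
//
//   unsigned Idx = FirstMetaArgIdx;
//   while (Idx < LastMetaArgIdx)
//     Idx = StackMaps::getNextMetaArgIdx(MI, Idx);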
llvm::stable_sort(SSIntervals, IntervalSorter()); diff --git a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp index 12745747f5f80..dfcec32d95376 100644 --- a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp +++ b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp @@ -14,6 +14,7 @@ #include "llvm/CodeGen/SwitchLoweringUtils.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 958bb7939046b..7ef37db68a28b 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -692,6 +692,7 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::BITREVERSE, VT, Expand); + setOperationAction(ISD::PARITY, VT, Expand); // These library functions default to expand. setOperationAction(ISD::FROUND, VT, Expand); diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index e2ef12d8ac77f..e89353c9ad276 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -68,7 +68,7 @@ bool TargetRegisterInfo::shouldRegionSplitForVirtReg( const MachineFunction &MF, const LiveInterval &VirtReg) const { const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); const MachineRegisterInfo &MRI = MF.getRegInfo(); - MachineInstr *MI = MRI.getUniqueVRegDef(VirtReg.reg); + MachineInstr *MI = MRI.getUniqueVRegDef(VirtReg.reg()); if (MI && TII->isTriviallyReMaterializable(*MI) && VirtReg.size() > HugeSizeForSplit) return false; diff --git a/llvm/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp b/llvm/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp index e84e1c9cea78e..682747a2b81fe 100644 --- a/llvm/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp +++ b/llvm/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp @@ -5,8 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h" +#include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h" +#include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/Support/Endian.h" diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp index d31c358798211..47eba48c279dd 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -1036,7 +1036,9 @@ DWARFContext::DIEsForAddress DWARFContext::getDIEsForAddress(uint64_t Address) { static bool getFunctionNameAndStartLineForAddress(DWARFCompileUnit *CU, uint64_t Address, FunctionNameKind Kind, + DILineInfoSpecifier::FileLineInfoKind FileNameKind, std::string &FunctionName, + std::string &StartFile, uint32_t &StartLine) { // The address may correspond to instruction in some inlined function, // so we have to build the chain of inlined functions and take the @@ -1053,6 +1055,11 @@ static bool getFunctionNameAndStartLineForAddress(DWARFCompileUnit *CU, FunctionName = Name; FoundResult = true; } + std::string DeclFile = DIE.getDeclFile(FileNameKind); + if (!DeclFile.empty()) { + StartFile = DeclFile; + FoundResult = true; + } if (auto DeclLineResult = DIE.getDeclLine()) { StartLine = DeclLineResult; FoundResult = true; @@ -1224,8 +1231,9 @@ DILineInfo 
DWARFContext::getLineInfoForAddress(object::SectionedAddress Address, if (!CU) return Result; - getFunctionNameAndStartLineForAddress(CU, Address.Address, Spec.FNKind, - Result.FunctionName, Result.StartLine); + getFunctionNameAndStartLineForAddress(CU, Address.Address, Spec.FNKind, Spec.FLIKind, + Result.FunctionName, + Result.StartFileName, Result.StartLine); if (Spec.FLIKind != FileLineInfoKind::None) { if (const DWARFLineTable *LineTable = getLineTableForUnit(CU)) { LineTable->getFileLineInfoForAddress( @@ -1244,15 +1252,17 @@ DILineInfoTable DWARFContext::getLineInfoForAddressRange( return Lines; uint32_t StartLine = 0; + std::string StartFileName; std::string FunctionName(DILineInfo::BadString); - getFunctionNameAndStartLineForAddress(CU, Address.Address, Spec.FNKind, - FunctionName, StartLine); + getFunctionNameAndStartLineForAddress(CU, Address.Address, Spec.FNKind, Spec.FLIKind, + FunctionName, StartFileName, StartLine); // If the Specifier says we don't need FileLineInfo, just // return the top-most function at the starting address. if (Spec.FLIKind == FileLineInfoKind::None) { DILineInfo Result; Result.FunctionName = FunctionName; + Result.StartFileName = StartFileName; Result.StartLine = StartLine; Lines.push_back(std::make_pair(Address.Address, Result)); return Lines; @@ -1276,6 +1286,7 @@ DILineInfoTable DWARFContext::getLineInfoForAddressRange( Result.FunctionName = FunctionName; Result.Line = Row.Line; Result.Column = Row.Column; + Result.StartFileName = StartFileName; Result.StartLine = StartLine; Lines.push_back(std::make_pair(Row.Address.Address, Result)); } @@ -1318,6 +1329,7 @@ DWARFContext::getInliningInfoForAddress(object::SectionedAddress Address, Frame.FunctionName = Name; if (auto DeclLineResult = FunctionDIE.getDeclLine()) Frame.StartLine = DeclLineResult; + Frame.StartFileName = FunctionDIE.getDeclFile(Spec.FLIKind); if (Spec.FLIKind != FileLineInfoKind::None) { if (i == 0) { // For the topmost frame, initialize the line table of this diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index 116f72a1d58ba..31340077a126d 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -557,6 +557,17 @@ uint64_t DWARFDie::getDeclLine() const { return toUnsigned(findRecursively(DW_AT_decl_line), 0); } +std::string +DWARFDie::getDeclFile(DILineInfoSpecifier::FileLineInfoKind Kind) const { + std::string FileName; + if (auto DeclFile = toUnsigned(findRecursively(DW_AT_decl_file))) { + if (const auto *LT = U->getContext().getLineTableForUnit(U)) { + LT->getFileNameByIndex(*DeclFile, U->getCompilationDir(), Kind, FileName); + } + } + return FileName; +} + void DWARFDie::getCallerFrame(uint32_t &CallFile, uint32_t &CallLine, uint32_t &CallColumn, uint32_t &CallDiscriminator) const { diff --git a/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp b/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp index 10352237763c9..01dc31d849657 100644 --- a/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp +++ b/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp @@ -84,8 +84,10 @@ void DIPrinter::print(const DILineInfo &Info, bool Inlined) { return; } OS << " Filename: " << Filename << "\n"; - if (Info.StartLine) - OS << "Function start line: " << Info.StartLine << "\n"; + if (Info.StartLine) { + OS << " Function start filename: " << Info.StartFileName << "\n"; + OS << " Function start line: " << Info.StartLine << "\n"; + } OS << " Line: " << Info.Line << "\n"; OS << " Column: " << Info.Column << "\n"; if (Info.Discriminator) diff --git 
a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp index 84524195fa8af..93d05e4e27bf8 100644 --- a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp +++ b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp @@ -12,24 +12,15 @@ #include "SymbolizableObjectFile.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" -#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" #include "llvm/Object/COFF.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Object/SymbolSize.h" #include "llvm/Support/Casting.h" #include "llvm/Support/DataExtractor.h" -#include "llvm/Support/Error.h" #include -#include -#include -#include -#include -#include -#include using namespace llvm; using namespace object; diff --git a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h index 0ba304ee4c61c..be3c66df056f0 100644 --- a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h +++ b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h @@ -15,12 +15,12 @@ #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/DIContext.h" #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" -#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/Error.h" #include -#include #include #include -#include +#include +#include namespace llvm { diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp index 8b078690dea24..20295434d2e5a 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp @@ -244,7 +244,7 @@ class ELFLinkGraphBuilder_x86_64 { object::ELFFile::Elf_Shdr_Range sections; SymbolTable SymTab; - bool isRelocatable() { return Obj.getHeader()->e_type == llvm::ELF::ET_REL; } + bool isRelocatable() { return Obj.getHeader().e_type == llvm::ELF::ET_REL; } support::endianness getEndianness(const object::ELFFile &Obj) { @@ -253,7 +253,7 @@ class ELFLinkGraphBuilder_x86_64 { // This could also just become part of a template unsigned getPointerSize(const object::ELFFile &Obj) { - return Obj.getHeader()->getFileClass() == ELF::ELFCLASS64 ? 8 : 4; + return Obj.getHeader().getFileClass() == ELF::ELFCLASS64 ? 
8 : 4; } // We don't technically need this right now @@ -277,7 +277,7 @@ class ELFLinkGraphBuilder_x86_64 { auto StrTabSec = Obj.getSection(SecRef.sh_link); if (!StrTabSec) return StrTabSec.takeError(); - auto StringTable = Obj.getStringTable(*StrTabSec); + auto StringTable = Obj.getStringTable(**StrTabSec); if (!StringTable) return StringTable.takeError(); @@ -310,7 +310,7 @@ class ELFLinkGraphBuilder_x86_64 { Error createNormalizedSections() { LLVM_DEBUG(dbgs() << "Creating normalized sections...\n"); for (auto &SecRef : sections) { - auto Name = Obj.getSectionName(&SecRef); + auto Name = Obj.getSectionName(SecRef); if (!Name) return Name.takeError(); sys::Memory::ProtectionFlags Prot; @@ -343,7 +343,7 @@ class ELFLinkGraphBuilder_x86_64 { if (SecRef.sh_type != ELF::SHT_NOBITS) { // .sections() already checks that the data is not beyond the end of // file - auto contents = Obj.getSectionContentsAsArray(&SecRef); + auto contents = Obj.getSectionContentsAsArray(SecRef); if (!contents) return contents.takeError(); @@ -375,7 +375,7 @@ class ELFLinkGraphBuilder_x86_64 { return make_error("Shouldn't have REL in x64", llvm::inconvertibleErrorCode()); - auto RelSectName = Obj.getSectionName(&SecRef); + auto RelSectName = Obj.getSectionName(SecRef); if (!RelSectName) return RelSectName.takeError(); // Deal with .eh_frame later @@ -386,7 +386,7 @@ class ELFLinkGraphBuilder_x86_64 { if (!UpdateSection) return UpdateSection.takeError(); - auto UpdateSectionName = Obj.getSectionName(*UpdateSection); + auto UpdateSectionName = Obj.getSectionName(**UpdateSection); if (!UpdateSectionName) return UpdateSectionName.takeError(); @@ -397,7 +397,7 @@ class ELFLinkGraphBuilder_x86_64 { *UpdateSectionName, llvm::inconvertibleErrorCode()); - auto Relocations = Obj.relas(&SecRef); + auto Relocations = Obj.relas(SecRef); if (!Relocations) return Relocations.takeError(); @@ -409,7 +409,7 @@ class ELFLinkGraphBuilder_x86_64 { << "Name: " << Obj.getRelocationTypeName(Type) << "\n"; }); auto SymbolIndex = Rela.getSymbol(false); - auto Symbol = Obj.getRelocationSymbol(&Rela, &SymTab); + auto Symbol = Obj.getRelocationSymbol(Rela, &SymTab); if (!Symbol) return Symbol.takeError(); @@ -472,10 +472,10 @@ class ELFLinkGraphBuilder_x86_64 { auto StrTabSec = Obj.getSection(SecRef.sh_link); if (!StrTabSec) return StrTabSec.takeError(); - auto StringTable = Obj.getStringTable(*StrTabSec); + auto StringTable = Obj.getStringTable(**StrTabSec); if (!StringTable) return StringTable.takeError(); - auto Name = Obj.getSectionName(&SecRef); + auto Name = Obj.getSectionName(SecRef); if (!Name) return Name.takeError(); auto Section = G->findSectionByName(*Name); @@ -520,7 +520,7 @@ class ELFLinkGraphBuilder_x86_64 { auto DefinedSection = Obj.getSection(SymRef.st_shndx); if (!DefinedSection) return DefinedSection.takeError(); - auto sectName = Obj.getSectionName(*DefinedSection); + auto sectName = Obj.getSectionName(**DefinedSection); if (!sectName) return Name.takeError(); diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp index 5105ec4951484..71ec88639a5b7 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp @@ -93,6 +93,7 @@ const char *getScopeName(Scope S) { raw_ostream &operator<<(raw_ostream &OS, const Block &B) { return OS << formatv("{0:x16}", B.getAddress()) << " -- " << formatv("{0:x16}", B.getAddress() + B.getSize()) << ": " + << "size = " << formatv("{0:x}", B.getSize()) << ", " << (B.isZeroFill() ? 
"zero-fill" : "content") << ", align = " << B.getAlignment() << ", align-ofs = " << B.getAlignmentOffset() @@ -126,10 +127,10 @@ raw_ostream &operator<<(raw_ostream &OS, const Symbol &Sym) { break; } OS << (Sym.isLive() ? '+' : '-') - << ", size = " << formatv("{0:x8}", Sym.getSize()) + << ", size = " << formatv("{0:x}", Sym.getSize()) << ", addr = " << formatv("{0:x16}", Sym.getAddress()) << " (" << formatv("{0:x16}", Sym.getAddressable().getAddress()) << " + " - << formatv("{0:x8}", Sym.getOffset()); + << formatv("{0:x}", Sym.getOffset()); if (Sym.isDefined()) OS << " " << Sym.getBlock().getSection().getName(); OS << ")>"; @@ -139,8 +140,33 @@ raw_ostream &operator<<(raw_ostream &OS, const Symbol &Sym) { void printEdge(raw_ostream &OS, const Block &B, const Edge &E, StringRef EdgeKindName) { OS << "edge@" << formatv("{0:x16}", B.getAddress() + E.getOffset()) << ": " - << formatv("{0:x16}", B.getAddress()) << " + " << E.getOffset() << " -- " - << EdgeKindName << " -> " << E.getTarget() << " + " << E.getAddend(); + << formatv("{0:x16}", B.getAddress()) << " + " + << formatv("{0:x}", E.getOffset()) << " -- " << EdgeKindName << " -> "; + + auto &TargetSym = E.getTarget(); + if (TargetSym.hasName()) + OS << TargetSym.getName(); + else { + auto &TargetBlock = TargetSym.getBlock(); + auto &TargetSec = TargetBlock.getSection(); + JITTargetAddress SecAddress = ~JITTargetAddress(0); + for (auto *B : TargetSec.blocks()) + if (B->getAddress() < SecAddress) + SecAddress = B->getAddress(); + + JITTargetAddress SecDelta = TargetSym.getAddress() - SecAddress; + OS << formatv("{0:x16}", TargetSym.getAddress()) << " (section " + << TargetSec.getName(); + if (SecDelta) + OS << " + " << formatv("{0:x}", SecDelta); + OS << " / block " << formatv("{0:x16}", TargetBlock.getAddress()); + if (TargetSym.getOffset()) + OS << " + " << formatv("{0:x}", TargetSym.getOffset()); + OS << ")"; + } + + if (E.getAddend() != 0) + OS << " + " << E.getAddend(); } Section::~Section() { diff --git a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp index 9e38dc36faae7..dfb0d06bdba3d 100644 --- a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp @@ -88,7 +88,7 @@ class PartitioningIRMaterializationUnit : public IRMaterializationUnit { Parent(Parent) {} private: - void materialize(MaterializationResponsibility R) override { + void materialize(std::unique_ptr R) override { Parent.emitPartition(std::move(R), std::move(TSM), std::move(SymbolToDefinition)); } @@ -128,15 +128,15 @@ void CompileOnDemandLayer::setPartitionFunction(PartitionFunction Partition) { void CompileOnDemandLayer::setImplMap(ImplSymbolMap *Imp) { this->AliaseeImpls = Imp; } -void CompileOnDemandLayer::emit(MaterializationResponsibility R, - ThreadSafeModule TSM) { +void CompileOnDemandLayer::emit( + std::unique_ptr R, ThreadSafeModule TSM) { assert(TSM && "Null module"); auto &ES = getExecutionSession(); // Sort the callables and non-callables, build re-exports and lodge the // actual module with the implementation dylib. 
- auto &PDR = getPerDylibResources(R.getTargetJITDylib()); + auto &PDR = getPerDylibResources(R->getTargetJITDylib()); SymbolAliasMap NonCallables; SymbolAliasMap Callables; @@ -145,7 +145,7 @@ void CompileOnDemandLayer::emit(MaterializationResponsibility R, cleanUpModule(M); }); - for (auto &KV : R.getSymbols()) { + for (auto &KV : R->getSymbols()) { auto &Name = KV.first; auto &Flags = KV.second; if (Flags.isCallable()) @@ -158,19 +158,19 @@ void CompileOnDemandLayer::emit(MaterializationResponsibility R, // implementation dylib. if (auto Err = PDR.getImplDylib().define( std::make_unique( - ES, *getManglingOptions(), std::move(TSM), R.getVModuleKey(), + ES, *getManglingOptions(), std::move(TSM), R->getVModuleKey(), *this))) { ES.reportError(std::move(Err)); - R.failMaterialization(); + R->failMaterialization(); return; } if (!NonCallables.empty()) - R.replace(reexports(PDR.getImplDylib(), std::move(NonCallables), - JITDylibLookupFlags::MatchAllSymbols)); + R->replace(reexports(PDR.getImplDylib(), std::move(NonCallables), + JITDylibLookupFlags::MatchAllSymbols)); if (!Callables.empty()) - R.replace(lazyReexports(LCTMgr, PDR.getISManager(), PDR.getImplDylib(), - std::move(Callables), AliaseeImpls)); + R->replace(lazyReexports(LCTMgr, PDR.getISManager(), PDR.getImplDylib(), + std::move(Callables), AliaseeImpls)); } CompileOnDemandLayer::PerDylibResources & @@ -247,7 +247,7 @@ void CompileOnDemandLayer::expandPartition(GlobalValueSet &Partition) { } void CompileOnDemandLayer::emitPartition( - MaterializationResponsibility R, ThreadSafeModule TSM, + std::unique_ptr R, ThreadSafeModule TSM, IRMaterializationUnit::SymbolNameToDefinitionMap Defs) { // FIXME: Need a 'notify lazy-extracting/emitting' callback to tie the @@ -257,8 +257,8 @@ void CompileOnDemandLayer::emitPartition( auto &ES = getExecutionSession(); GlobalValueSet RequestedGVs; - for (auto &Name : R.getRequestedSymbols()) { - if (Name == R.getInitializerSymbol()) + for (auto &Name : R->getRequestedSymbols()) { + if (Name == R->getInitializerSymbol()) TSM.withModuleDo([&](Module &M) { for (auto &GV : getStaticInitGVs(M)) RequestedGVs.insert(&GV); @@ -285,9 +285,9 @@ void CompileOnDemandLayer::emitPartition( // If the partition is empty, return the whole module to the symbol table. 
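// [Editor's aside; not part of the patch.] replace() is the "hand the work
// back" primitive: responsibility for every still-unmaterialized symbol is
// transferred to a fresh MaterializationUnit that the JITDylib holds until
// a later lookup touches those symbols, which is how the empty-partition
// case below defers the whole module. After the unique_ptr migration the
// call changes spelling, not meaning:
//
//   R.replace(std::move(MU));    // before: member call on a value
//   R->replace(std::move(MU));   // after: through the owning unique_ptr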
if (GVsToExtract->empty()) { - R.replace(std::make_unique( - std::move(TSM), R.getVModuleKey(), R.getSymbols(), - R.getInitializerSymbol(), std::move(Defs), *this)); + R->replace(std::make_unique( + std::move(TSM), R->getVModuleKey(), R->getSymbols(), + R->getInitializerSymbol(), std::move(Defs), *this)); return; } @@ -308,7 +308,7 @@ void CompileOnDemandLayer::emitPartition( IRSymbolMapper::add(ES, *getManglingOptions(), PromotedGlobals, SymbolFlags); - if (auto Err = R.defineMaterializing(SymbolFlags)) + if (auto Err = R->defineMaterializing(SymbolFlags)) return std::move(Err); } @@ -348,12 +348,12 @@ void CompileOnDemandLayer::emitPartition( if (!ExtractedTSM) { ES.reportError(ExtractedTSM.takeError()); - R.failMaterialization(); + R->failMaterialization(); return; } - R.replace(std::make_unique( - ES, *getManglingOptions(), std::move(TSM), R.getVModuleKey(), *this)); + R->replace(std::make_unique( + ES, *getManglingOptions(), std::move(TSM), R->getVModuleKey(), *this)); BaseLayer.emit(std::move(R), std::move(*ExtractedTSM)); } diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp index 18eced68f07bc..243bac79c012f 100644 --- a/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -279,7 +279,7 @@ void MaterializationResponsibility::replace( JD->replace(std::move(MU)); } -MaterializationResponsibility +std::unique_ptr MaterializationResponsibility::delegate(const SymbolNameSet &Symbols, VModuleKey NewKey) { @@ -302,9 +302,10 @@ MaterializationResponsibility::delegate(const SymbolNameSet &Symbols, SymbolFlags.erase(I); } - return MaterializationResponsibility(JD, std::move(DelegatedFlags), - std::move(DelegatedInitSymbol), - std::move(NewKey)); + return std::unique_ptr( + new MaterializationResponsibility(JD, std::move(DelegatedFlags), + std::move(DelegatedInitSymbol), + std::move(NewKey))); } void MaterializationResponsibility::addDependencies( @@ -338,10 +339,10 @@ StringRef AbsoluteSymbolsMaterializationUnit::getName() const { } void AbsoluteSymbolsMaterializationUnit::materialize( - MaterializationResponsibility R) { + std::unique_ptr R) { // No dependencies, so these calls can't fail. - cantFail(R.notifyResolved(Symbols)); - cantFail(R.notifyEmitted()); + cantFail(R->notifyResolved(Symbols)); + cantFail(R->notifyEmitted()); } void AbsoluteSymbolsMaterializationUnit::discard(const JITDylib &JD, @@ -370,16 +371,16 @@ StringRef ReExportsMaterializationUnit::getName() const { } void ReExportsMaterializationUnit::materialize( - MaterializationResponsibility R) { + std::unique_ptr R) { - auto &ES = R.getTargetJITDylib().getExecutionSession(); - JITDylib &TgtJD = R.getTargetJITDylib(); + auto &ES = R->getTargetJITDylib().getExecutionSession(); + JITDylib &TgtJD = R->getTargetJITDylib(); JITDylib &SrcJD = SourceJD ? *SourceJD : TgtJD; // Find the set of requested aliases and aliasees. Return any unrequested // aliases back to the JITDylib so as to not prematurely materialize any // aliasees. 
- auto RequestedSymbols = R.getRequestedSymbols(); + auto RequestedSymbols = R->getRequestedSymbols(); SymbolAliasMap RequestedAliases; for (auto &Name : RequestedSymbols) { @@ -399,18 +400,19 @@ void ReExportsMaterializationUnit::materialize( if (!Aliases.empty()) { if (SourceJD) - R.replace(reexports(*SourceJD, std::move(Aliases), SourceJDLookupFlags)); + R->replace(reexports(*SourceJD, std::move(Aliases), SourceJDLookupFlags)); else - R.replace(symbolAliases(std::move(Aliases))); + R->replace(symbolAliases(std::move(Aliases))); } // The OnResolveInfo struct will hold the aliases and responsibilty for each // query in the list. struct OnResolveInfo { - OnResolveInfo(MaterializationResponsibility R, SymbolAliasMap Aliases) + OnResolveInfo(std::unique_ptr R, + SymbolAliasMap Aliases) : R(std::move(R)), Aliases(std::move(Aliases)) {} - MaterializationResponsibility R; + std::unique_ptr R; SymbolAliasMap Aliases; }; @@ -451,7 +453,7 @@ void ReExportsMaterializationUnit::materialize( assert(!QuerySymbols.empty() && "Alias cycle detected!"); auto QueryInfo = std::make_shared( - R.delegate(ResponsibilitySymbols), std::move(QueryAliases)); + R->delegate(ResponsibilitySymbols), std::move(QueryAliases)); QueryInfos.push_back( make_pair(std::move(QuerySymbols), std::move(QueryInfo))); } @@ -480,12 +482,12 @@ void ReExportsMaterializationUnit::materialize( for (auto &KV : QueryInfo->Aliases) if (SrcJDDeps.count(KV.second.Aliasee)) { PerAliasDeps = {KV.second.Aliasee}; - QueryInfo->R.addDependencies(KV.first, PerAliasDepsMap); + QueryInfo->R->addDependencies(KV.first, PerAliasDepsMap); } }; auto OnComplete = [QueryInfo](Expected Result) { - auto &ES = QueryInfo->R.getTargetJITDylib().getExecutionSession(); + auto &ES = QueryInfo->R->getTargetJITDylib().getExecutionSession(); if (Result) { SymbolMap ResolutionMap; for (auto &KV : QueryInfo->Aliases) { @@ -499,19 +501,19 @@ void ReExportsMaterializationUnit::materialize( ResolutionMap[KV.first] = JITEvaluatedSymbol( (*Result)[KV.second.Aliasee].getAddress(), KV.second.AliasFlags); } - if (auto Err = QueryInfo->R.notifyResolved(ResolutionMap)) { + if (auto Err = QueryInfo->R->notifyResolved(ResolutionMap)) { ES.reportError(std::move(Err)); - QueryInfo->R.failMaterialization(); + QueryInfo->R->failMaterialization(); return; } - if (auto Err = QueryInfo->R.notifyEmitted()) { + if (auto Err = QueryInfo->R->notifyEmitted()) { ES.reportError(std::move(Err)); - QueryInfo->R.failMaterialization(); + QueryInfo->R->failMaterialization(); return; } } else { ES.reportError(Result.takeError()); - QueryInfo->R.failMaterialization(); + QueryInfo->R->failMaterialization(); } }; @@ -2131,7 +2133,7 @@ void ExecutionSession::dump(raw_ostream &OS) { void ExecutionSession::runOutstandingMUs() { while (1) { Optional, - MaterializationResponsibility>> + std::unique_ptr>> JMU; { diff --git a/llvm/lib/ExecutionEngine/Orc/IRCompileLayer.cpp b/llvm/lib/ExecutionEngine/Orc/IRCompileLayer.cpp index 023940dc82982..c6f6870279728 100644 --- a/llvm/lib/ExecutionEngine/Orc/IRCompileLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/IRCompileLayer.cpp @@ -25,7 +25,7 @@ void IRCompileLayer::setNotifyCompiled(NotifyCompiledFunction NotifyCompiled) { this->NotifyCompiled = std::move(NotifyCompiled); } -void IRCompileLayer::emit(MaterializationResponsibility R, +void IRCompileLayer::emit(std::unique_ptr R, ThreadSafeModule TSM) { assert(TSM && "Module must not be null"); @@ -33,13 +33,13 @@ void IRCompileLayer::emit(MaterializationResponsibility R, { std::lock_guard Lock(IRLayerMutex); if 
(NotifyCompiled) - NotifyCompiled(R.getVModuleKey(), std::move(TSM)); + NotifyCompiled(R->getVModuleKey(), std::move(TSM)); else TSM = ThreadSafeModule(); } BaseLayer.emit(std::move(R), std::move(*Obj)); } else { - R.failMaterialization(); + R->failMaterialization(); getExecutionSession().reportError(Obj.takeError()); } } diff --git a/llvm/lib/ExecutionEngine/Orc/IRTransformLayer.cpp b/llvm/lib/ExecutionEngine/Orc/IRTransformLayer.cpp index 511248f83b259..d5b11349277c1 100644 --- a/llvm/lib/ExecutionEngine/Orc/IRTransformLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/IRTransformLayer.cpp @@ -17,14 +17,14 @@ IRTransformLayer::IRTransformLayer(ExecutionSession &ES, IRLayer &BaseLayer, : IRLayer(ES, BaseLayer.getManglingOptions()), BaseLayer(BaseLayer), Transform(std::move(Transform)) {} -void IRTransformLayer::emit(MaterializationResponsibility R, +void IRTransformLayer::emit(std::unique_ptr R, ThreadSafeModule TSM) { assert(TSM && "Module must not be null"); - if (auto TransformedTSM = Transform(std::move(TSM), R)) + if (auto TransformedTSM = Transform(std::move(TSM), *R)) BaseLayer.emit(std::move(R), std::move(*TransformedTSM)); else { - R.failMaterialization(); + R->failMaterialization(); getExecutionSession().reportError(TransformedTSM.takeError()); } } diff --git a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp index 4f7f6089e68db..7d57ed5a3a04c 100644 --- a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp @@ -33,12 +33,12 @@ class CompileCallbackMaterializationUnit : public orc::MaterializationUnit { StringRef getName() const override { return ""; } private: - void materialize(MaterializationResponsibility R) override { + void materialize(std::unique_ptr R) override { SymbolMap Result; Result[Name] = JITEvaluatedSymbol(Compile(), JITSymbolFlags::Exported); // No dependencies, so these calls cannot fail. - cantFail(R.notifyResolved(Result)); - cantFail(R.notifyEmitted()); + cantFail(R->notifyResolved(Result)); + cantFail(R->notifyEmitted()); } void discard(const JITDylib &JD, const SymbolStringPtr &Name) override { diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index 373d86d92f8d7..81f500d66bc29 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -1085,15 +1085,17 @@ LLJIT::LLJIT(LLJITBuilderState &S, Error &Err) std::make_unique(hardware_concurrency(S.NumCompileThreads)); ES->setDispatchMaterialization( [this](std::unique_ptr MU, - MaterializationResponsibility MR) { - // FIXME: Switch to move capture once ThreadPool uses unique_function. - auto SharedMU = std::shared_ptr(std::move(MU)); - auto SharedMR = - std::make_shared(std::move(MR)); - auto Work = [SharedMU, SharedMR]() mutable { - SharedMU->materialize(std::move(*SharedMR)); - }; - CompileThreads->async(std::move(Work)); + std::unique_ptr MR) { + // FIXME: We should be able to use move-capture here, but ThreadPool's + // AsyncTaskTys are std::functions rather than unique_functions + // (because MSVC's std::packaged_tasks don't support move-only types). + // Fix this when all the above gets sorted out. 
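// [Editor's aside; not part of the patch.] The capture list below smuggles
// move-only unique_ptrs through ThreadPool's copyable std::function by
// releasing to raw pointers and re-owning them inside the lambda. The
// idiom, sketched with a hypothetical Task type:
//
//   std::unique_ptr<Task> T = makeTask();
//   Pool.async([Raw = T.release()]() {
//     std::unique_ptr<Task> Owned(Raw);  // re-acquire ownership
//     Owned->run();                      // freed on every path from here
//   });
//
// Caveat: if the wrapping std::function is destroyed without ever being
// run, the raw pointer leaks, which is part of why the FIXME above asks for
// real move capture once ThreadPool takes unique_function.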
+ CompileThreads->async( + [UnownedMU = MU.release(), UnownedMR = MR.release()]() mutable { + std::unique_ptr MU(UnownedMU); + std::unique_ptr MR(UnownedMR); + MU->materialize(std::move(MR)); + }); }); } diff --git a/llvm/lib/ExecutionEngine/Orc/Layer.cpp b/llvm/lib/ExecutionEngine/Orc/Layer.cpp index 0a5d5577e99e8..8052e7b08a5a6 100644 --- a/llvm/lib/ExecutionEngine/Orc/Layer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Layer.cpp @@ -133,7 +133,7 @@ BasicIRLayerMaterializationUnit::BasicIRLayerMaterializationUnit( L(L), K(std::move(K)) {} void BasicIRLayerMaterializationUnit::materialize( - MaterializationResponsibility R) { + std::unique_ptr R) { // Throw away the SymbolToDefinition map: it's not usable after we hand // off the module. @@ -144,8 +144,8 @@ void BasicIRLayerMaterializationUnit::materialize( TSM = cloneToNewContext(TSM); #ifndef NDEBUG - auto &ES = R.getTargetJITDylib().getExecutionSession(); - auto &N = R.getTargetJITDylib().getName(); + auto &ES = R->getTargetJITDylib().getExecutionSession(); + auto &N = R->getTargetJITDylib().getName(); #endif // NDEBUG LLVM_DEBUG(ES.runSessionLocked( @@ -200,7 +200,7 @@ StringRef BasicObjectLayerMaterializationUnit::getName() const { } void BasicObjectLayerMaterializationUnit::materialize( - MaterializationResponsibility R) { + std::unique_ptr R) { L.emit(std::move(R), std::move(O)); } diff --git a/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp b/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp index 5e604130d6eab..695f6cc9c1cb4 100644 --- a/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp @@ -154,8 +154,8 @@ StringRef LazyReexportsMaterializationUnit::getName() const { } void LazyReexportsMaterializationUnit::materialize( - MaterializationResponsibility R) { - auto RequestedSymbols = R.getRequestedSymbols(); + std::unique_ptr R) { + auto RequestedSymbols = R->getRequestedSymbols(); SymbolAliasMap RequestedAliases; for (auto &RequestedSymbol : RequestedSymbols) { @@ -166,8 +166,8 @@ void LazyReexportsMaterializationUnit::materialize( } if (!CallableAliases.empty()) - R.replace(lazyReexports(LCTManager, ISManager, SourceJD, - std::move(CallableAliases), AliaseeTable)); + R->replace(lazyReexports(LCTManager, ISManager, SourceJD, + std::move(CallableAliases), AliaseeTable)); IndirectStubsManager::StubInitsMap StubInits; for (auto &Alias : RequestedAliases) { @@ -182,7 +182,7 @@ void LazyReexportsMaterializationUnit::materialize( if (!CallThroughTrampoline) { SourceJD.getExecutionSession().reportError( CallThroughTrampoline.takeError()); - R.failMaterialization(); + R->failMaterialization(); return; } @@ -195,7 +195,7 @@ void LazyReexportsMaterializationUnit::materialize( if (auto Err = ISManager.createStubs(StubInits)) { SourceJD.getExecutionSession().reportError(std::move(Err)); - R.failMaterialization(); + R->failMaterialization(); return; } @@ -204,8 +204,8 @@ void LazyReexportsMaterializationUnit::materialize( Stubs[Alias.first] = ISManager.findStub(*Alias.first, false); // No registered dependencies, so these calls cannot fail. 
- cantFail(R.notifyResolved(Stubs)); - cantFail(R.notifyEmitted()); + cantFail(R->notifyResolved(Stubs)); + cantFail(R->notifyEmitted()); } void LazyReexportsMaterializationUnit::discard(const JITDylib &JD, diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp index d8283fa7e3461..9e3245d9cc991 100644 --- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp @@ -24,9 +24,10 @@ namespace orc { class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { public: - ObjectLinkingLayerJITLinkContext(ObjectLinkingLayer &Layer, - MaterializationResponsibility MR, - std::unique_ptr ObjBuffer) + ObjectLinkingLayerJITLinkContext( + ObjectLinkingLayer &Layer, + std::unique_ptr MR, + std::unique_ptr ObjBuffer) : Layer(Layer), MR(std::move(MR)), ObjBuffer(std::move(ObjBuffer)) {} ~ObjectLinkingLayerJITLinkContext() { @@ -44,14 +45,14 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { void notifyFailed(Error Err) override { Layer.getExecutionSession().reportError(std::move(Err)); - MR.failMaterialization(); + MR->failMaterialization(); } void lookup(const LookupMap &Symbols, std::unique_ptr LC) override { JITDylibSearchOrder LinkOrder; - MR.getTargetJITDylib().withLinkOrderDo( + MR->getTargetJITDylib().withLinkOrderDo( [&](const JITDylibSearchOrder &LO) { LinkOrder = LO; }); auto &ES = Layer.getExecutionSession(); @@ -85,8 +86,8 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { for (auto &KV : InternalNamedSymbolDeps) { SymbolDependenceMap InternalDeps; - InternalDeps[&MR.getTargetJITDylib()] = std::move(KV.second); - MR.addDependencies(KV.first, InternalDeps); + InternalDeps[&MR->getTargetJITDylib()] = std::move(KV.second); + MR->addDependencies(KV.first, InternalDeps); } ES.lookup(LookupKind::Static, LinkOrder, std::move(LookupSet), @@ -115,7 +116,7 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { InternedResult[InternedName] = JITEvaluatedSymbol(Sym->getAddress(), Flags); - if (AutoClaim && !MR.getSymbols().count(InternedName)) { + if (AutoClaim && !MR->getSymbols().count(InternedName)) { assert(!ExtraSymbolsToClaim.count(InternedName) && "Duplicate symbol to claim?"); ExtraSymbolsToClaim[InternedName] = Flags; @@ -133,7 +134,7 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { Flags |= JITSymbolFlags::Weak; InternedResult[InternedName] = JITEvaluatedSymbol(Sym->getAddress(), Flags); - if (AutoClaim && !MR.getSymbols().count(InternedName)) { + if (AutoClaim && !MR->getSymbols().count(InternedName)) { assert(!ExtraSymbolsToClaim.count(InternedName) && "Duplicate symbol to claim?"); ExtraSymbolsToClaim[InternedName] = Flags; @@ -141,19 +142,19 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { } if (!ExtraSymbolsToClaim.empty()) - if (auto Err = MR.defineMaterializing(ExtraSymbolsToClaim)) + if (auto Err = MR->defineMaterializing(ExtraSymbolsToClaim)) return Err; { - // Check that InternedResult matches up with MR.getSymbols(). + // Check that InternedResult matches up with MR->getSymbols(). // This guards against faulty transformations / compilers / object caches. // First check that there aren't any missing symbols. 
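// [Editor's aside; not part of the patch.] "Materialization side effects
// only" symbols are declared so that materialization runs for its side
// effects (e.g. registering initializers); they must not appear in the
// final InternedResult. That is why the accounting below counts them
// separately before comparing InternedResult's size against the
// responsibility's symbol set.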
size_t NumMaterializationSideEffectsOnlySymbols = 0; SymbolNameVector ExtraSymbols; SymbolNameVector MissingSymbols; - for (auto &KV : MR.getSymbols()) { + for (auto &KV : MR->getSymbols()) { // If this is a materialization-side-effects only symbol then bump // the counter and make sure it's *not* defined, otherwise make @@ -175,9 +176,9 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { // If there are more definitions than expected, add them to the // ExtraSymbols vector. if (InternedResult.size() > - MR.getSymbols().size() - NumMaterializationSideEffectsOnlySymbols) { + MR->getSymbols().size() - NumMaterializationSideEffectsOnlySymbols) { for (auto &KV : InternedResult) - if (!MR.getSymbols().count(KV.first)) + if (!MR->getSymbols().count(KV.first)) ExtraSymbols.push_back(KV.first); } @@ -187,23 +188,23 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { std::move(ExtraSymbols)); } - if (auto Err = MR.notifyResolved(InternedResult)) + if (auto Err = MR->notifyResolved(InternedResult)) return Err; - Layer.notifyLoaded(MR); + Layer.notifyLoaded(*MR); return Error::success(); } void notifyFinalized( std::unique_ptr A) override { - if (auto Err = Layer.notifyEmitted(MR, std::move(A))) { + if (auto Err = Layer.notifyEmitted(*MR, std::move(A))) { Layer.getExecutionSession().reportError(std::move(Err)); - MR.failMaterialization(); + MR->failMaterialization(); return; } - if (auto Err = MR.notifyEmitted()) { + if (auto Err = MR->notifyEmitted()) { Layer.getExecutionSession().reportError(std::move(Err)); - MR.failMaterialization(); + MR->failMaterialization(); } } @@ -217,7 +218,7 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { Config.PrePrunePasses.push_back( [this](LinkGraph &G) { return externalizeWeakAndCommonSymbols(G); }); - Layer.modifyPassConfig(MR, TT, Config); + Layer.modifyPassConfig(*MR, TT, Config); Config.PostPrunePasses.push_back( [this](LinkGraph &G) { return computeNamedSymbolDependencies(G); }); @@ -237,13 +238,13 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { auto &ES = Layer.getExecutionSession(); for (auto *Sym : G.defined_symbols()) if (Sym->hasName() && Sym->getLinkage() == Linkage::Weak) { - if (!MR.getSymbols().count(ES.intern(Sym->getName()))) + if (!MR->getSymbols().count(ES.intern(Sym->getName()))) G.makeExternal(*Sym); } for (auto *Sym : G.absolute_symbols()) if (Sym->hasName() && Sym->getLinkage() == Linkage::Weak) { - if (!MR.getSymbols().count(ES.intern(Sym->getName()))) + if (!MR->getSymbols().count(ES.intern(Sym->getName()))) G.makeExternal(*Sym); } @@ -253,13 +254,13 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { Error markResponsibilitySymbolsLive(LinkGraph &G) const { auto &ES = Layer.getExecutionSession(); for (auto *Sym : G.defined_symbols()) - if (Sym->hasName() && MR.getSymbols().count(ES.intern(Sym->getName()))) + if (Sym->hasName() && MR->getSymbols().count(ES.intern(Sym->getName()))) Sym->setLive(true); return Error::success(); } Error computeNamedSymbolDependencies(LinkGraph &G) { - auto &ES = MR.getTargetJITDylib().getExecutionSession(); + auto &ES = MR->getTargetJITDylib().getExecutionSession(); auto LocalDeps = computeLocalDeps(G); // Compute dependencies for symbols defined in the JITLink graph. 
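[Editor's note; not part of the patch.] The common thread through the Orc hunks in this section is that MaterializationResponsibility is no longer passed by value: MaterializationUnit::materialize(), every layer's emit() override, and delegate() now traffic in std::unique_ptr<MaterializationResponsibility>, so the responsibility can outlive the emitting stack frame (as the asynchronous JITLink and RTDyld paths require), and call sites switch from R. to R->. A minimal sketch of what a downstream unit looks like after the change; the class name is hypothetical, and it declares no symbols so the empty notifyResolved() call is legal:

    class EmptyMU : public llvm::orc::MaterializationUnit {
    public:
      EmptyMU()
          : MaterializationUnit(llvm::orc::SymbolFlagsMap(),
                                /*InitSymbol=*/llvm::orc::SymbolStringPtr(),
                                /*K=*/0) {}
      llvm::StringRef getName() const override { return "<EmptyMU>"; }

    private:
      void materialize(
          std::unique_ptr<llvm::orc::MaterializationResponsibility> R)
          override {
        // Ownership arrives here; R may be stashed in a context object or
        // handed to another thread before the notifications fire.
        llvm::cantFail(R->notifyResolved({}));
        llvm::cantFail(R->notifyEmitted());
      }
      void discard(const llvm::orc::JITDylib &,
                   const llvm::orc::SymbolStringPtr &) override {}
    };

Real units resolve the symbols they declared in their flags map; the shape of the overrides is the point here, not the body.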
@@ -306,7 +307,7 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { } for (auto &P : Layer.Plugins) { - auto SyntheticLocalDeps = P->getSyntheticSymbolLocalDependencies(MR); + auto SyntheticLocalDeps = P->getSyntheticSymbolLocalDependencies(*MR); if (SyntheticLocalDeps.empty()) continue; @@ -426,12 +427,12 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { SymbolDeps.erase(&SourceJD); } - MR.addDependencies(Name, SymbolDeps); + MR->addDependencies(Name, SymbolDeps); } } ObjectLinkingLayer &Layer; - MaterializationResponsibility MR; + std::unique_ptr MR; std::unique_ptr ObjBuffer; DenseMap ExternalNamedSymbolDeps; DenseMap InternalNamedSymbolDeps; @@ -452,7 +453,7 @@ ObjectLinkingLayer::~ObjectLinkingLayer() { getExecutionSession().reportError(std::move(Err)); } -void ObjectLinkingLayer::emit(MaterializationResponsibility R, +void ObjectLinkingLayer::emit(std::unique_ptr R, std::unique_ptr O) { assert(O && "Object must not be null"); jitLink(std::make_unique( diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp index d18eb38a41423..a57662e10a794 100644 --- a/llvm/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp @@ -17,8 +17,9 @@ ObjectTransformLayer::ObjectTransformLayer(ExecutionSession &ES, TransformFunction Transform) : ObjectLayer(ES), BaseLayer(BaseLayer), Transform(std::move(Transform)) {} -void ObjectTransformLayer::emit(MaterializationResponsibility R, - std::unique_ptr O) { +void ObjectTransformLayer::emit( + std::unique_ptr R, + std::unique_ptr O) { assert(O && "Module must not be null"); // If there is a transform set then apply it. @@ -26,7 +27,7 @@ void ObjectTransformLayer::emit(MaterializationResponsibility R, if (auto TransformedObj = Transform(std::move(O))) O = std::move(*TransformedObj); else { - R.failMaterialization(); + R->failMaterialization(); getExecutionSession().reportError(TransformedObj.takeError()); return; } diff --git a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp index 5933c2e666d1c..f6dd235b6edea 100644 --- a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp +++ b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp @@ -68,6 +68,29 @@ void LLVMOrcReleaseSymbolStringPoolEntry(LLVMOrcSymbolStringPoolEntryRef S) { OrcV2CAPIHelper::releasePoolEntry(unwrap(S)); } +LLVMOrcJITDylibRef +LLVMOrcExecutionSessionCreateBareJITDylib(LLVMOrcExecutionSessionRef ES, + const char *Name) { + return wrap(&unwrap(ES)->createBareJITDylib(Name)); +} + +LLVMErrorRef +LLVMOrcExecutionSessionCreateJITDylib(LLVMOrcExecutionSessionRef ES, + LLVMOrcJITDylibRef *Result, + const char *Name) { + auto JD = unwrap(ES)->createJITDylib(Name); + if (!JD) + return wrap(JD.takeError()); + *Result = wrap(&*JD); + return LLVMErrorSuccess; +} + +LLVMOrcJITDylibRef +LLVMOrcExecutionSessionGetJITDylibByName(LLVMOrcExecutionSessionRef ES, + const char *Name) { + return wrap(unwrap(ES)->getJITDylibByName(Name)); +} + void LLVMOrcDisposeJITDylibDefinitionGenerator( LLVMOrcJITDylibDefinitionGeneratorRef DG) { delete unwrap(DG); diff --git a/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp index 7888c2fcbdbd9..1981039eb9f12 100644 --- a/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp @@ -89,23 +89,18 @@ 
RTDyldObjectLinkingLayer::~RTDyldObjectLinkingLayer() { } } -void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R, - std::unique_ptr O) { +void RTDyldObjectLinkingLayer::emit( + std::unique_ptr R, + std::unique_ptr O) { assert(O && "Object must not be null"); - // This method launches an asynchronous link step that will fulfill our - // materialization responsibility. We need to switch R to be heap - // allocated before that happens so it can live as long as the asynchronous - // link needs it to (i.e. it must be able to outlive this method). - auto SharedR = std::make_shared(std::move(R)); - auto &ES = getExecutionSession(); auto Obj = object::ObjectFile::createObjectFile(*O); if (!Obj) { getExecutionSession().reportError(Obj.takeError()); - SharedR->failMaterialization(); + R->failMaterialization(); return; } @@ -121,7 +116,7 @@ void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R, continue; } else { ES.reportError(SymType.takeError()); - R.failMaterialization(); + R->failMaterialization(); return; } @@ -129,7 +124,7 @@ void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R, if (!SymFlagsOrErr) { // TODO: Test this error. ES.reportError(SymFlagsOrErr.takeError()); - R.failMaterialization(); + R->failMaterialization(); return; } @@ -139,14 +134,14 @@ void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R, InternalSymbols->insert(*SymName); else { ES.reportError(SymName.takeError()); - R.failMaterialization(); + R->failMaterialization(); return; } } } } - auto K = R.getVModuleKey(); + auto K = R->getVModuleKey(); RuntimeDyld::MemoryManager *MemMgr = nullptr; // Create a record a memory manager for this object. @@ -157,6 +152,10 @@ void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R, MemMgr = MemMgrs.back().get(); } + // Switch to shared ownership of MR so that it can be captured by both + // lambdas below. + std::shared_ptr SharedR(std::move(R)); + JITDylibSearchOrderResolver Resolver(*SharedR); jitLinkForORC( diff --git a/llvm/lib/ExecutionEngine/Orc/Speculation.cpp b/llvm/lib/ExecutionEngine/Orc/Speculation.cpp index 3dd536d8253e3..0b4755fe23cfc 100644 --- a/llvm/lib/ExecutionEngine/Orc/Speculation.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Speculation.cpp @@ -55,7 +55,7 @@ Error Speculator::addSpeculationRuntime(JITDylib &JD, // If two modules, share the same LLVMContext, different threads must // not access them concurrently without locking the associated LLVMContext // this implementation follows this contract. 
-void IRSpeculationLayer::emit(MaterializationResponsibility R, +void IRSpeculationLayer::emit(std::unique_ptr R, ThreadSafeModule TSM) { assert(TSM && "Speculation Layer received Null Module ?"); @@ -127,7 +127,7 @@ void IRSpeculationLayer::emit(MaterializationResponsibility R, assert(Mutator.GetInsertBlock()->getParent() == &Fn && "IR builder association mismatch?"); S.registerSymbols(internToJITSymbols(IRNames.getValue()), - &R.getTargetJITDylib()); + &R->getTargetJITDylib()); } } } diff --git a/llvm/lib/Extensions/Extensions.cpp b/llvm/lib/Extensions/Extensions.cpp index e69de29bb2d1d..0d25cbda38e00 100644 --- a/llvm/lib/Extensions/Extensions.cpp +++ b/llvm/lib/Extensions/Extensions.cpp @@ -0,0 +1,15 @@ +#include "llvm/Passes/PassPlugin.h" +#define HANDLE_EXTENSION(Ext) \ + llvm::PassPluginLibraryInfo get##Ext##PluginInfo(); +#include "llvm/Support/Extension.def" + + +namespace llvm { + namespace details { + void extensions_anchor() { +#define HANDLE_EXTENSION(Ext) \ + get##Ext##PluginInfo(); +#include "llvm/Support/Extension.def" + } + } +} diff --git a/llvm/lib/Extensions/LLVMBuild.txt b/llvm/lib/Extensions/LLVMBuild.txt index 2005830a4dd7a..7a98c8f680513 100644 --- a/llvm/lib/Extensions/LLVMBuild.txt +++ b/llvm/lib/Extensions/LLVMBuild.txt @@ -18,4 +18,4 @@ type = Library name = Extensions parent = Libraries -required_libraries = +required_libraries = Support diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 12286264c81df..d27c1b4591496 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1380,19 +1380,6 @@ static Value *upgradeAbs(IRBuilder<> &Builder, CallInst &CI) { return Res; } -static Value *upgradeIntMinMax(IRBuilder<> &Builder, CallInst &CI, - ICmpInst::Predicate Pred) { - Value *Op0 = CI.getArgOperand(0); - Value *Op1 = CI.getArgOperand(1); - Value *Cmp = Builder.CreateICmp(Pred, Op0, Op1); - Value *Res = Builder.CreateSelect(Cmp, Op0, Op1); - - if (CI.getNumArgOperands() == 4) - Res = EmitX86Select(Builder, CI.getArgOperand(3), Res, CI.getArgOperand(2)); - - return Res; -} - static Value *upgradePMULDQ(IRBuilder<> &Builder, CallInst &CI, bool IsSigned) { Type *Ty = CI.getType(); @@ -2136,25 +2123,25 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Name == "sse41.pmaxsd" || Name.startswith("avx2.pmaxs") || Name.startswith("avx512.mask.pmaxs"))) { - Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_SGT); + Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::smax); } else if (IsX86 && (Name == "sse2.pmaxu.b" || Name == "sse41.pmaxuw" || Name == "sse41.pmaxud" || Name.startswith("avx2.pmaxu") || Name.startswith("avx512.mask.pmaxu"))) { - Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_UGT); + Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::umax); } else if (IsX86 && (Name == "sse41.pminsb" || Name == "sse2.pmins.w" || Name == "sse41.pminsd" || Name.startswith("avx2.pmins") || Name.startswith("avx512.mask.pmins"))) { - Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_SLT); + Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::smin); } else if (IsX86 && (Name == "sse2.pminu.b" || Name == "sse41.pminuw" || Name == "sse41.pminud" || Name.startswith("avx2.pminu") || Name.startswith("avx512.mask.pminu"))) { - Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_ULT); + Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::umin); } else if (IsX86 && (Name == "sse2.pmulu.dq" || Name == "avx2.pmulu.dq" || Name == "avx512.pmulu.dq.512" || diff --git a/llvm/lib/IR/ConstantFold.cpp 
b/llvm/lib/IR/ConstantFold.cpp index 468dce95a29ad..3f00dd0575369 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -1408,12 +1408,7 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1, return ConstantFP::get(C1->getContext(), C3V); } } - } else if (IsScalableVector) { - // Do not iterate on scalable vector. The number of elements is unknown at - // compile-time. - // FIXME: this branch can potentially be removed - return nullptr; - } else if (auto *VTy = dyn_cast(C1->getType())) { + } else if (auto *VTy = dyn_cast(C1->getType())) { // Fast path for splatted constants. if (Constant *C2Splat = C2->getSplatValue()) { if (Instruction::isIntDivRem(Opcode) && C2Splat->isNullValue()) @@ -1425,22 +1420,24 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1, } } - // Fold each element and create a vector constant from those constants. - SmallVector Result; - Type *Ty = IntegerType::get(VTy->getContext(), 32); - for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) { - Constant *ExtractIdx = ConstantInt::get(Ty, i); - Constant *LHS = ConstantExpr::getExtractElement(C1, ExtractIdx); - Constant *RHS = ConstantExpr::getExtractElement(C2, ExtractIdx); + if (auto *FVTy = dyn_cast(VTy)) { + // Fold each element and create a vector constant from those constants. + SmallVector Result; + Type *Ty = IntegerType::get(FVTy->getContext(), 32); + for (unsigned i = 0, e = FVTy->getNumElements(); i != e; ++i) { + Constant *ExtractIdx = ConstantInt::get(Ty, i); + Constant *LHS = ConstantExpr::getExtractElement(C1, ExtractIdx); + Constant *RHS = ConstantExpr::getExtractElement(C2, ExtractIdx); - // If any element of a divisor vector is zero, the whole op is undef. - if (Instruction::isIntDivRem(Opcode) && RHS->isNullValue()) - return UndefValue::get(VTy); + // If any element of a divisor vector is zero, the whole op is undef. 
+ if (Instruction::isIntDivRem(Opcode) && RHS->isNullValue()) + return UndefValue::get(VTy); - Result.push_back(ConstantExpr::get(Opcode, LHS, RHS)); - } + Result.push_back(ConstantExpr::get(Opcode, LHS, RHS)); + } - return ConstantVector::get(Result); + return ConstantVector::get(Result); + } } if (ConstantExpr *CE1 = dyn_cast(C1)) { @@ -1619,7 +1616,7 @@ static FCmpInst::Predicate evaluateFCmpRelation(Constant *V1, Constant *V2) { static ICmpInst::Predicate areGlobalsPotentiallyEqual(const GlobalValue *GV1, const GlobalValue *GV2) { auto isGlobalUnsafeForEquality = [](const GlobalValue *GV) { - if (GV->hasExternalWeakLinkage() || GV->hasWeakAnyLinkage()) + if (GV->isInterposable() || GV->hasGlobalUnnamedAddr()) return true; if (const auto *GVar = dyn_cast(GV)) { Type *Ty = GVar->getValueType(); diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index e701feae22562..d03ffbb8d008f 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -1400,8 +1400,7 @@ static bool matchIntrinsicType( auto *ReferenceType = dyn_cast(ArgTys[RefArgNumber]); auto *ThisArgVecTy = dyn_cast(Ty); if (!ThisArgVecTy || !ReferenceType || - (cast(ReferenceType)->getNumElements() != - cast(ThisArgVecTy)->getNumElements())) + (ReferenceType->getElementCount() != ThisArgVecTy->getElementCount())) return true; PointerType *ThisArgEltTy = dyn_cast(ThisArgVecTy->getElementType()); diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index d6eeffd44b368..febfe189df6ea 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -72,8 +72,9 @@ Value *IRBuilderBase::getCastedInt8PtrValue(Value *Ptr) { static CallInst *createCallHelper(Function *Callee, ArrayRef Ops, IRBuilderBase *Builder, const Twine &Name = "", - Instruction *FMFSource = nullptr) { - CallInst *CI = Builder->CreateCall(Callee, Ops, Name); + Instruction *FMFSource = nullptr, + ArrayRef OpBundles = {}) { + CallInst *CI = Builder->CreateCall(Callee, Ops, OpBundles, Name); if (FMFSource) CI->copyFastMathFlags(FMFSource); return CI; @@ -450,14 +451,16 @@ CallInst *IRBuilderBase::CreateInvariantStart(Value *Ptr, ConstantInt *Size) { return createCallHelper(TheFn, Ops, this); } -CallInst *IRBuilderBase::CreateAssumption(Value *Cond) { +CallInst * +IRBuilderBase::CreateAssumption(Value *Cond, + ArrayRef OpBundles) { assert(Cond->getType() == getInt1Ty() && "an assumption condition must be of type i1"); Value *Ops[] = { Cond }; Module *M = BB->getParent()->getParent(); Function *FnAssume = Intrinsic::getDeclaration(M, Intrinsic::assume); - return createCallHelper(FnAssume, Ops, this); + return createCallHelper(FnAssume, Ops, this, "", nullptr, OpBundles); } /// Create a call to a Masked Load intrinsic. 
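[Editor's note; not part of the patch.] The CreateAssumption change just above and the CreateAlignmentAssumption rewrite below are two halves of one redesign: alignment assumptions stop being encoded as an explicit ptrtoint/and/icmp chain feeding llvm.assume and instead ride on the call as an "align" operand bundle carrying the pointer, the alignment, and an optional offset, exactly the two-or-three-argument shape the Verifier hunk later in this section starts checking. Sketched against the post-patch API:

    // The caller-facing overloads lose the TheCheck out-parameter:
    llvm::CallInst *CI =
        Builder.CreateAlignmentAssumption(DL, Ptr, /*Alignment=*/32);
    // Illustrative IR produced by the new helper:
    //   call void @llvm.assume(i1 true) [ "align"(i8* %ptr, i64 32) ]

Callers that previously retrieved the materialized icmp through TheCheck have nothing to fish out anymore, so the parameter is removed from both overloads rather than kept as a dummy.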
@@ -1113,63 +1116,37 @@ Value *IRBuilderBase::CreatePreserveStructAccessIndex( return Fn; } -CallInst *IRBuilderBase::CreateAlignmentAssumptionHelper( - const DataLayout &DL, Value *PtrValue, Value *Mask, Type *IntPtrTy, - Value *OffsetValue, Value **TheCheck) { - Value *PtrIntValue = CreatePtrToInt(PtrValue, IntPtrTy, "ptrint"); - - if (OffsetValue) { - bool IsOffsetZero = false; - if (const auto *CI = dyn_cast(OffsetValue)) - IsOffsetZero = CI->isZero(); - - if (!IsOffsetZero) { - if (OffsetValue->getType() != IntPtrTy) - OffsetValue = CreateIntCast(OffsetValue, IntPtrTy, /*isSigned*/ true, - "offsetcast"); - PtrIntValue = CreateSub(PtrIntValue, OffsetValue, "offsetptr"); - } - } - - Value *Zero = ConstantInt::get(IntPtrTy, 0); - Value *MaskedPtr = CreateAnd(PtrIntValue, Mask, "maskedptr"); - Value *InvCond = CreateICmpEQ(MaskedPtr, Zero, "maskcond"); - if (TheCheck) - *TheCheck = InvCond; - - return CreateAssumption(InvCond); +CallInst *IRBuilderBase::CreateAlignmentAssumptionHelper(const DataLayout &DL, + Value *PtrValue, + Value *AlignValue, + Value *OffsetValue) { + SmallVector Vals({PtrValue, AlignValue}); + if (OffsetValue) + Vals.push_back(OffsetValue); + OperandBundleDefT AlignOpB("align", Vals); + return CreateAssumption(ConstantInt::getTrue(getContext()), {AlignOpB}); } -CallInst *IRBuilderBase::CreateAlignmentAssumption( - const DataLayout &DL, Value *PtrValue, unsigned Alignment, - Value *OffsetValue, Value **TheCheck) { +CallInst *IRBuilderBase::CreateAlignmentAssumption(const DataLayout &DL, + Value *PtrValue, + unsigned Alignment, + Value *OffsetValue) { assert(isa(PtrValue->getType()) && "trying to create an alignment assumption on a non-pointer?"); assert(Alignment != 0 && "Invalid Alignment"); auto *PtrTy = cast(PtrValue->getType()); Type *IntPtrTy = getIntPtrTy(DL, PtrTy->getAddressSpace()); - - Value *Mask = ConstantInt::get(IntPtrTy, Alignment - 1); - return CreateAlignmentAssumptionHelper(DL, PtrValue, Mask, IntPtrTy, - OffsetValue, TheCheck); + Value *AlignValue = ConstantInt::get(IntPtrTy, Alignment); + return CreateAlignmentAssumptionHelper(DL, PtrValue, AlignValue, OffsetValue); } -CallInst *IRBuilderBase::CreateAlignmentAssumption( - const DataLayout &DL, Value *PtrValue, Value *Alignment, - Value *OffsetValue, Value **TheCheck) { +CallInst *IRBuilderBase::CreateAlignmentAssumption(const DataLayout &DL, + Value *PtrValue, + Value *Alignment, + Value *OffsetValue) { assert(isa(PtrValue->getType()) && "trying to create an alignment assumption on a non-pointer?"); - auto *PtrTy = cast(PtrValue->getType()); - Type *IntPtrTy = getIntPtrTy(DL, PtrTy->getAddressSpace()); - - if (Alignment->getType() != IntPtrTy) - Alignment = CreateIntCast(Alignment, IntPtrTy, /*isSigned*/ false, - "alignmentcast"); - - Value *Mask = CreateSub(Alignment, ConstantInt::get(IntPtrTy, 1), "mask"); - - return CreateAlignmentAssumptionHelper(DL, PtrValue, Mask, IntPtrTy, - OffsetValue, TheCheck); + return CreateAlignmentAssumptionHelper(DL, PtrValue, Alignment, OffsetValue); } IRBuilderDefaultInserter::~IRBuilderDefaultInserter() {} diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 6cae21e3cfe1a..3fed0bf64b6e7 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -282,6 +282,9 @@ class Verifier : public InstVisitor, VerifierSupport { /// Whether the current function has a DISubprogram attached to it. bool HasDebugInfo = false; + /// The current source language. 
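The rewritten helpers change what callers get without changing how they call. A before/after sketch under the same entry point (IR shown in comments; the 16-byte alignment is an arbitrary example):

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    void assumeAligned16(IRBuilderBase &B, const DataLayout &DL, Value *Ptr) {
      // Previously expanded to an explicit check feeding the assume:
      //   %ptrint    = ptrtoint i8* %ptr to i64
      //   %maskedptr = and i64 %ptrint, 15
      //   %maskcond  = icmp eq i64 %maskedptr, 0
      //   call void @llvm.assume(i1 %maskcond)
      // Now encoded declaratively as an operand bundle:
      //   call void @llvm.assume(i1 true) [ "align"(i8* %ptr, i64 16) ]
      B.CreateAlignmentAssumption(DL, Ptr, /*Alignment=*/16);
    }

One consequence worth noting: the old TheCheck out-parameter disappears because there is no longer a materialized i1 condition to hand back.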
+ dwarf::SourceLanguage CurrentSourceLang = dwarf::DW_LANG_lo_user; + /// Whether source was present on the first DIFile encountered in each CU. DenseMap HasSourceDebugInfo; @@ -895,7 +898,9 @@ void Verifier::visitDIScope(const DIScope &N) { void Verifier::visitDISubrange(const DISubrange &N) { AssertDI(N.getTag() == dwarf::DW_TAG_subrange_type, "invalid tag", &N); - AssertDI(N.getRawCountNode() || N.getRawUpperBound(), + bool HasAssumedSizedArraySupport = dwarf::isFortran(CurrentSourceLang); + AssertDI(HasAssumedSizedArraySupport || N.getRawCountNode() + N.getRawUpperBound(), "Subrange must contain count or upperBound", &N); AssertDI(!N.getRawCountNode() || !N.getRawUpperBound(), "Subrange can have any one of count or upperBound", &N); @@ -1100,6 +1105,8 @@ void Verifier::visitDICompileUnit(const DICompileUnit &N) { AssertDI(!N.getFile()->getFilename().empty(), "invalid filename", &N, N.getFile()); + CurrentSourceLang = (dwarf::SourceLanguage)N.getSourceLanguage(); + verifySourceDebugInfo(N, *N.getFile()); AssertDI((N.getEmissionKind() <= DICompileUnit::LastEmissionKind), @@ -4483,21 +4490,32 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { Assert(Elem.Tag->getKey() == "ignore" || Attribute::isExistingAttribute(Elem.Tag->getKey()), "tags must be valid attribute names"); - Assert(Elem.End - Elem.Begin <= 2, "to many arguments"); Attribute::AttrKind Kind = Attribute::getAttrKindFromName(Elem.Tag->getKey()); + unsigned ArgCount = Elem.End - Elem.Begin; + if (Kind == Attribute::Alignment) { + Assert(ArgCount <= 3 && ArgCount >= 2, + "alignment assumptions should have 2 or 3 arguments"); + Assert(Call.getOperand(Elem.Begin)->getType()->isPointerTy(), + "first argument should be a pointer"); + Assert(Call.getOperand(Elem.Begin + 1)->getType()->isIntegerTy(), + "second argument should be an integer"); + if (ArgCount == 3) + Assert(Call.getOperand(Elem.Begin + 2)->getType()->isIntegerTy(), + "third argument should be an integer if present"); + return; + } + Assert(ArgCount <= 2, "too many arguments"); if (Kind == Attribute::None) break; if (Attribute::doesAttrKindHaveArgument(Kind)) { - Assert(Elem.End - Elem.Begin == 2, - "this attribute should have 2 arguments"); + Assert(ArgCount == 2, "this attribute should have 2 arguments"); Assert(isa(Call.getOperand(Elem.Begin + 1)), "the second argument should be a constant integral value"); } else if (isFuncOnlyAttr(Kind)) { - Assert((Elem.End - Elem.Begin) == 0, "this attribute has no argument"); + Assert((ArgCount) == 0, "this attribute has no argument"); } else if (!isFuncOrArgAttr(Kind)) { - Assert((Elem.End - Elem.Begin) == 1, - "this attribute should have one argument"); + Assert((ArgCount) == 1, "this attribute should have one argument"); } } break; @@ -4999,6 +5017,14 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { Assert(Size % 16 == 0, "bswap must be an even number of bytes", &Call); break; } + case Intrinsic::invariant_start: { + ConstantInt *InvariantSize = dyn_cast(Call.getArgOperand(0)); + Assert(InvariantSize && + (!InvariantSize->isNegative() || InvariantSize->isMinusOne()), + "invariant_start parameter must be -1, 0 or a positive number", + &Call); + break; + } case Intrinsic::matrix_multiply: case Intrinsic::matrix_transpose: case Intrinsic::matrix_column_major_load: diff --git a/llvm/lib/InterfaceStub/ELFObjHandler.cpp b/llvm/lib/InterfaceStub/ELFObjHandler.cpp index 82e7a3c8b1baa..cc9a8743cd084 100644 --- a/llvm/lib/InterfaceStub/ELFObjHandler.cpp +++
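Restating the verifier's new contract for alignment assumptions as a standalone predicate (a sketch; isValidAlignBundle and the explicit Begin/End pair are mine, mirroring the bundle-slice checks in the Verifier hunk above):

    #include "llvm/IR/InstrTypes.h"
    using namespace llvm;

    // An "align" assume bundle must carry (pointer, integer align[, integer offset]).
    bool isValidAlignBundle(const CallBase &Call, unsigned Begin, unsigned End) {
      unsigned ArgCount = End - Begin;
      if (ArgCount < 2 || ArgCount > 3)
        return false;
      if (!Call.getOperand(Begin)->getType()->isPointerTy())
        return false;
      if (!Call.getOperand(Begin + 1)->getType()->isIntegerTy())
        return false;
      return ArgCount == 2 ||
             Call.getOperand(Begin + 2)->getType()->isIntegerTy();
    }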
b/llvm/lib/InterfaceStub/ELFObjHandler.cpp @@ -320,7 +320,7 @@ buildStub(const ELFObjectFile &ElfObj) { DynEnt.StrSize); // Populate Arch from ELF header. - DestStub->Arch = ElfFile->getHeader()->e_machine; + DestStub->Arch = ElfFile->getHeader().e_machine; // Populate SoName from .dynamic entries and dynamic string table. if (DynEnt.SONameOffset.hasValue()) { diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index ca29548a4d7ca..4c5778e81184e 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -50,6 +50,25 @@ using namespace llvm; using namespace lto; +#define DEBUG_TYPE "lto-backend" + +enum class LTOBitcodeEmbedding { + DoNotEmbed = 0, + EmbedOptimized = 1, + EmbedPostMergePreOptimized = 2 +}; + +static cl::opt EmbedBitcode( + "lto-embed-bitcode", cl::init(LTOBitcodeEmbedding::DoNotEmbed), + cl::values(clEnumValN(LTOBitcodeEmbedding::DoNotEmbed, "none", + "Do not embed"), + clEnumValN(LTOBitcodeEmbedding::EmbedOptimized, "optimized", + "Embed after all optimization passes"), + clEnumValN(LTOBitcodeEmbedding::EmbedPostMergePreOptimized, + "post-merge-pre-opt", + "Embed post merge, but before optimizations")), + cl::desc("Embed LLVM bitcode in object files produced by LTO")); + LLVM_ATTRIBUTE_NORETURN static void reportOpenError(StringRef Path, Twine Msg) { errs() << "failed to open " << Path << ": " << Msg << '\n'; errs().flush(); @@ -333,7 +352,25 @@ static void runOldPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM, bool opt(const Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod, bool IsThinLTO, ModuleSummaryIndex *ExportSummary, - const ModuleSummaryIndex *ImportSummary) { + const ModuleSummaryIndex *ImportSummary, + const std::vector *CmdArgs = nullptr) { + if (EmbedBitcode == LTOBitcodeEmbedding::EmbedPostMergePreOptimized) { + // FIXME: the motivation for capturing post-merge bitcode and command line + // is replicating the compilation environment from bitcode, without needing + // to understand the dependencies (the functions to be imported). This + // assumes a clang-based invocation, in which case we have the command + // line. + // It's not very clear how the above motivation would map in the + // linker-based case, so we currently don't plumb the command line args in + // that case. + if (CmdArgs == nullptr) + LLVM_DEBUG( + dbgs() << "Post-(Thin)LTO merge bitcode embedding was requested, but " + "command line arguments are not available"); + llvm::EmbedBitcodeInModule(Mod, llvm::MemoryBufferRef(), + /*EmbedBitcode*/ true, + /*EmbedMarker*/ false, CmdArgs); + } // FIXME: Plumb the combined index into the new pass manager.
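The option change above is a straightforward instance of an enum-valued cl::opt; the same clEnumValN pattern works for any multi-state flag. A self-contained sketch with illustrative names (not LLVM's):

    #include "llvm/Support/CommandLine.h"

    enum class EmbedMode { None, Optimized, PreOpt };

    static llvm::cl::opt<EmbedMode> Mode(
        "embed-mode", llvm::cl::init(EmbedMode::None),
        llvm::cl::values(
            clEnumValN(EmbedMode::None, "none", "Do not embed"),
            clEnumValN(EmbedMode::Optimized, "optimized", "Embed after opts"),
            clEnumValN(EmbedMode::PreOpt, "pre-opt", "Embed before opts")),
        llvm::cl::desc("Illustrative enum-valued option"));

Passing -embed-mode=pre-opt selects EmbedMode::PreOpt, and an unknown value produces the usual cl::opt diagnostic; likewise, old boolean spellings of -lto-embed-bitcode would need updating to one of the named values after this change.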
if (!Conf.OptPipeline.empty()) runNewPMCustomPasses(Conf, Mod, TM, Conf.OptPipeline, Conf.AAPipeline, @@ -346,30 +383,16 @@ bool opt(const Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod, return !Conf.PostOptModuleHook || Conf.PostOptModuleHook(Task, Mod); } -static cl::opt EmbedBitcode( - "lto-embed-bitcode", cl::init(false), - cl::desc("Embed LLVM bitcode in object files produced by LTO")); - -static void EmitBitcodeSection(Module &M, const Config &Conf) { - if (!EmbedBitcode) - return; - SmallVector Buffer; - raw_svector_ostream OS(Buffer); - WriteBitcodeToFile(M, OS); - - std::unique_ptr Buf( - new SmallVectorMemoryBuffer(std::move(Buffer))); - llvm::EmbedBitcodeInModule(M, Buf->getMemBufferRef(), /*EmbedBitcode*/ true, - /*EmbedMarker*/ false, /*CmdArgs*/ nullptr); -} - void codegen(const Config &Conf, TargetMachine *TM, AddStreamFn AddStream, unsigned Task, Module &Mod, const ModuleSummaryIndex &CombinedIndex) { if (Conf.PreCodeGenModuleHook && !Conf.PreCodeGenModuleHook(Task, Mod)) return; - EmitBitcodeSection(Mod, Conf); + if (EmbedBitcode == LTOBitcodeEmbedding::EmbedOptimized) + llvm::EmbedBitcodeInModule(Mod, llvm::MemoryBufferRef(), + /*EmbedBitcode*/ true, + /*EmbedMarker*/ false, /*CmdArgs*/ nullptr); std::unique_ptr DwoOut; SmallString<1024> DwoFile(Conf.SplitDwarfOutput); @@ -532,7 +555,8 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, Module &Mod, const ModuleSummaryIndex &CombinedIndex, const FunctionImporter::ImportMapTy &ImportList, const GVSummaryMapTy &DefinedGlobals, - MapVector &ModuleMap) { + MapVector &ModuleMap, + const std::vector *CmdArgs) { Expected TOrErr = initAndLookupTarget(Conf, Mod); if (!TOrErr) return TOrErr.takeError(); @@ -600,7 +624,8 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); if (!opt(Conf, TM.get(), Task, Mod, /*IsThinLTO=*/true, - /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex)) + /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex, + CmdArgs)) return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); codegen(Conf, TM.get(), AddStream, Task, Mod, CombinedIndex); diff --git a/llvm/lib/MC/MCAsmBackend.cpp b/llvm/lib/MC/MCAsmBackend.cpp index cf110345df3de..0d32e71c2d8f3 100644 --- a/llvm/lib/MC/MCAsmBackend.cpp +++ b/llvm/lib/MC/MCAsmBackend.cpp @@ -54,10 +54,17 @@ std::unique_ptr MCAsmBackend::createDwoObjectWriter(raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS) const { auto TW = createObjectTargetWriter(); - if (TW->getFormat() != Triple::ELF) - report_fatal_error("dwo only supported with ELF"); - return createELFDwoObjectWriter(cast(std::move(TW)), - OS, DwoOS, Endian == support::little); + switch (TW->getFormat()) { + case Triple::ELF: + return createELFDwoObjectWriter( + cast(std::move(TW)), OS, DwoOS, + Endian == support::little); + case Triple::Wasm: + return createWasmDwoObjectWriter( + cast(std::move(TW)), OS, DwoOS); + default: + report_fatal_error("dwo only supported with ELF and Wasm"); + } } Optional MCAsmBackend::getFixupKind(StringRef Name) const { diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index 9515b7e2642bc..1b2eb2412a161 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -754,6 +754,8 @@ void MCAssembler::writeSectionData(raw_ostream &OS, const MCSection *Sec, assert((cast(F).getValue() == 0) && "Invalid fill in virtual section!"); break; + case MCFragment::FT_Org: + break; } } diff --git 
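For context on who exercises the new Wasm path in MCAsmBackend::createDwoObjectWriter: the dwo writer is requested wherever split-DWARF output is configured. A hedged sketch of that caller side (makeObjectWriter is an illustrative wrapper of mine; the two MCAsmBackend entry points are the real API):

    #include "llvm/MC/MCAsmBackend.h"
    #include "llvm/MC/MCObjectWriter.h"
    #include <memory>
    using namespace llvm;

    std::unique_ptr<MCObjectWriter> makeObjectWriter(MCAsmBackend &MAB,
                                                     raw_pwrite_stream &OS,
                                                     raw_pwrite_stream *DwoOS) {
      // After this patch the dwo branch succeeds for Wasm as well as ELF
      // instead of hitting report_fatal_error.
      return DwoOS ? MAB.createDwoObjectWriter(OS, *DwoOS)
                   : MAB.createObjectWriter(OS);
    }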
a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index 07680e95e8e1e..7f282a1ba4977 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -588,12 +588,7 @@ static void AttemptToFoldSymbolOffsetDifference( if (!Asm->getWriter().isSymbolRefDifferenceFullyResolved(*Asm, A, B, InSet)) return; - MCFragment *FA = SA.getFragment(); - MCFragment *FB = SB.getFragment(); - if (FA == FB && !SA.isVariable() && !SA.isUnset() && !SB.isVariable() && - !SB.isUnset()) { - Addend += (SA.getOffset() - SB.getOffset()); - + auto FinalizeFolding = [&]() { // Pointers to Thumb symbols need to have their low-bit set to allow // for interworking. if (Asm->isThumbFunc(&SA)) @@ -607,11 +602,17 @@ static void AttemptToFoldSymbolOffsetDifference( // Clear the symbol expr pointers to indicate we have folded these // operands. A = B = nullptr; - return; - } + }; - if (!Layout) - return; + const MCFragment *FA = SA.getFragment(); + const MCFragment *FB = SB.getFragment(); + // If both symbols are in the same fragment, return the difference of their + // offsets. + if (FA == FB && !SA.isVariable() && !SA.isUnset() && !SB.isVariable() && + !SB.isUnset()) { + Addend += SA.getOffset() - SB.getOffset(); + return FinalizeFolding(); + } const MCSection &SecA = *FA->getParent(); const MCSection &SecB = *FB->getParent(); @@ -619,30 +620,46 @@ if ((&SecA != &SecB) && !Addrs) return; - // One of the symbol involved is part of a fragment being laid out. Quit now - // to avoid a self loop. - if (!Layout->canGetFragmentOffset(FA) || !Layout->canGetFragmentOffset(FB)) - return; + if (Layout) { + // One of the symbols involved is part of a fragment being laid out. Quit now + // to avoid a self loop. + if (!Layout->canGetFragmentOffset(FA) || !Layout->canGetFragmentOffset(FB)) + return; + + // Eagerly evaluate when layout is finalized. + Addend += Layout->getSymbolOffset(A->getSymbol()) - + Layout->getSymbolOffset(B->getSymbol()); + if (Addrs && (&SecA != &SecB)) + Addend += (Addrs->lookup(&SecA) - Addrs->lookup(&SecB)); + + FinalizeFolding(); + } else { + // When layout is not finalized, our ability to resolve differences between + // symbols is limited to specific cases where the fragments between two + // symbols (including the fragments the symbols are defined in) are + // fixed-size fragments so the difference can be calculated. For example, + // this is important when the Subtarget is changed and a new MCDataFragment + // is created in the case of foo: instr; .arch_extension ext; instr .if . - + // foo. + if (SA.isVariable() || SA.isUnset() || SB.isVariable() || SB.isUnset() || + FA->getKind() != MCFragment::FT_Data || + FB->getKind() != MCFragment::FT_Data || + FA->getSubsectionNumber() != FB->getSubsectionNumber()) + return; + // Try to find a constant displacement from FA to FB, add the displacement + // between the offset in FA of SA and the offset in FB of SB. + int64_t Displacement = SA.getOffset() - SB.getOffset(); + for (auto FI = FB->getIterator(), FE = SecA.end(); FI != FE; ++FI) { + if (&*FI == FA) { + Addend += Displacement; + return FinalizeFolding(); + }
- if (Asm->isThumbFunc(&SA)) - Addend |= 1; - - // If symbol is labeled as micromips, we set low-bit to ensure - // correct offset in .gcc_except_table - if (Asm->getBackend().isMicroMips(&SA)) - Addend |= 1; - - // Clear the symbol expr pointers to indicate we have folded these - // operands. - A = B = nullptr; + if (FI->getKind() != MCFragment::FT_Data) + return; + Displacement += cast(FI)->getContents().size(); + } + } } static bool canFold(const MCAssembler *Asm, const MCSymbolRefExpr *A, diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp index 927294fcd7e15..ae7345c4e05b9 100644 --- a/llvm/lib/MC/MCObjectFileInfo.cpp +++ b/llvm/lib/MC/MCObjectFileInfo.cpp @@ -796,6 +796,10 @@ void MCObjectFileInfo::initWasmMCObjectFileInfo(const Triple &T) { DwarfFrameSection = Ctx->getWasmSection(".debug_frame", SectionKind::getMetadata()); DwarfPubNamesSection = Ctx->getWasmSection(".debug_pubnames", SectionKind::getMetadata()); DwarfPubTypesSection = Ctx->getWasmSection(".debug_pubtypes", SectionKind::getMetadata()); + DwarfGnuPubNamesSection = + Ctx->getWasmSection(".debug_gnu_pubnames", SectionKind::getMetadata()); + DwarfGnuPubTypesSection = + Ctx->getWasmSection(".debug_gnu_pubtypes", SectionKind::getMetadata()); DwarfDebugNamesSection = Ctx->getWasmSection(".debug_names", SectionKind::getMetadata()); @@ -808,6 +812,37 @@ void MCObjectFileInfo::initWasmMCObjectFileInfo(const Triple &T) { DwarfLoclistsSection = Ctx->getWasmSection(".debug_loclists", SectionKind::getMetadata()); + // Fission Sections + DwarfInfoDWOSection = + Ctx->getWasmSection(".debug_info.dwo", SectionKind::getMetadata()); + DwarfTypesDWOSection = + Ctx->getWasmSection(".debug_types.dwo", SectionKind::getMetadata()); + DwarfAbbrevDWOSection = + Ctx->getWasmSection(".debug_abbrev.dwo", SectionKind::getMetadata()); + DwarfStrDWOSection = + Ctx->getWasmSection(".debug_str.dwo", SectionKind::getMetadata()); + DwarfLineDWOSection = + Ctx->getWasmSection(".debug_line.dwo", SectionKind::getMetadata()); + DwarfLocDWOSection = + Ctx->getWasmSection(".debug_loc.dwo", SectionKind::getMetadata()); + DwarfStrOffDWOSection = + Ctx->getWasmSection(".debug_str_offsets.dwo", SectionKind::getMetadata()); + DwarfRnglistsDWOSection = + Ctx->getWasmSection(".debug_rnglists.dwo", SectionKind::getMetadata()); + DwarfMacinfoDWOSection = + Ctx->getWasmSection(".debug_macinfo.dwo", SectionKind::getMetadata()); + DwarfMacroDWOSection = + Ctx->getWasmSection(".debug_macro.dwo", SectionKind::getMetadata()); + + DwarfLoclistsDWOSection = + Ctx->getWasmSection(".debug_loclists.dwo", SectionKind::getMetadata()); + + // DWP Sections + DwarfCUIndexSection = + Ctx->getWasmSection(".debug_cu_index", SectionKind::getMetadata(), 0); + DwarfTUIndexSection = + Ctx->getWasmSection(".debug_tu_index", SectionKind::getMetadata(), 0); + // Wasm use data section for LSDA. // TODO Consider putting each function's exception table in a separate // section, as in -function-sections, to facilitate lld's --gc-section. 
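The layout-free branch added to AttemptToFoldSymbolOffsetDifference above boils down to one walk: starting at B's fragment, accumulate the sizes of intervening fixed-size data fragments until A's fragment is reached. Restated as a standalone helper (the function name and the Optional return are mine):

    #include "llvm/ADT/Optional.h"
    #include "llvm/MC/MCFragment.h"
    #include "llvm/MC/MCSection.h"
    using namespace llvm;

    // Byte distance from the start of FB to the start of FA within Sec, or
    // None if any intervening fragment needs layout to size (align, fill, ...).
    Optional<int64_t> distanceAcrossDataFragments(const MCFragment *FB,
                                                  const MCFragment *FA,
                                                  const MCSection &Sec) {
      int64_t Displacement = 0;
      for (auto FI = FB->getIterator(), FE = Sec.end(); FI != FE; ++FI) {
        if (&*FI == FA)
          return Displacement;
        if (FI->getKind() != MCFragment::FT_Data)
          return None;
        Displacement += cast<MCDataFragment>(&*FI)->getContents().size();
      }
      return None; // FA does not follow FB in this section
    }

This is what makes an expression like `.if . - foo` evaluable at parse time even after a subtarget change has split the code into multiple MCDataFragments.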
@@ -953,3 +988,21 @@ MCObjectFileInfo::getStackSizesSection(const MCSection &TextSec) const { GroupName, MCSection::NonUniqueID, cast(TextSec.getBeginSymbol())); } + +MCSection * +MCObjectFileInfo::getBBAddrMapSection(const MCSection &TextSec) const { + if (Env != IsELF) + return nullptr; + + const MCSectionELF &ElfSec = static_cast(TextSec); + unsigned Flags = ELF::SHF_LINK_ORDER; + StringRef GroupName; + if (const MCSymbol *Group = ElfSec.getGroup()) { + GroupName = Group->getName(); + Flags |= ELF::SHF_GROUP; + } + + return Ctx->getELFSection(".bb_addr_map", ELF::SHT_PROGBITS, Flags, 0, + GroupName, MCSection::NonUniqueID, + cast(TextSec.getBeginSymbol())); +} diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 497f73e411057..f5a06f0a91fe0 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -244,7 +244,8 @@ class AsmParser : public MCAsmParser { bool parseExpression(const MCExpr *&Res); bool parseExpression(const MCExpr *&Res, SMLoc &EndLoc) override; - bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) override; + bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, + AsmTypeInfo *TypeInfo) override; bool parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) override; bool parseParenExprOfDepth(unsigned ParenDepth, const MCExpr *&Res, SMLoc &EndLoc) override; @@ -1068,7 +1069,8 @@ bool AsmParser::parseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) { /// primaryexpr ::= number /// primaryexpr ::= '.' /// primaryexpr ::= ~,+,- primaryexpr -bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { +bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, + AsmTypeInfo *TypeInfo) { SMLoc FirstTokenLoc = getLexer().getLoc(); AsmToken::TokenKind FirstTokenKind = Lexer.getKind(); switch (FirstTokenKind) { @@ -1079,7 +1081,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { return true; case AsmToken::Exclaim: Lex(); // Eat the operator. - if (parsePrimaryExpr(Res, EndLoc)) + if (parsePrimaryExpr(Res, EndLoc, TypeInfo)) return true; Res = MCUnaryExpr::createLNot(Res, getContext(), FirstTokenLoc); return false; @@ -1238,19 +1240,19 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { return parseBracketExpr(Res, EndLoc); case AsmToken::Minus: Lex(); // Eat the operator. - if (parsePrimaryExpr(Res, EndLoc)) + if (parsePrimaryExpr(Res, EndLoc, TypeInfo)) return true; Res = MCUnaryExpr::createMinus(Res, getContext(), FirstTokenLoc); return false; case AsmToken::Plus: Lex(); // Eat the operator. - if (parsePrimaryExpr(Res, EndLoc)) + if (parsePrimaryExpr(Res, EndLoc, TypeInfo)) return true; Res = MCUnaryExpr::createPlus(Res, getContext(), FirstTokenLoc); return false; case AsmToken::Tilde: Lex(); // Eat the operator. 
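A sketch of how a producer would use the new accessor (switchToBBAddrMap is mine; the SHF_LINK_ORDER flag plus the text section's begin symbol are what let --gc-sections drop the map together with its function):

    #include "llvm/MC/MCObjectFileInfo.h"
    #include "llvm/MC/MCStreamer.h"
    using namespace llvm;

    void switchToBBAddrMap(MCStreamer &S, const MCObjectFileInfo &MOFI) {
      // Returns nullptr for non-ELF targets, mirroring the Env guard above.
      if (MCSection *Sec = MOFI.getBBAddrMapSection(*S.getCurrentSectionOnly()))
        S.SwitchSection(Sec);
    }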
- if (parsePrimaryExpr(Res, EndLoc)) + if (parsePrimaryExpr(Res, EndLoc, TypeInfo)) return true; Res = MCUnaryExpr::createNot(Res, getContext(), FirstTokenLoc); return false; diff --git a/llvm/lib/MC/MCParser/COFFMasmParser.cpp b/llvm/lib/MC/MCParser/COFFMasmParser.cpp index b7c48e92961b3..532ded038043f 100644 --- a/llvm/lib/MC/MCParser/COFFMasmParser.cpp +++ b/llvm/lib/MC/MCParser/COFFMasmParser.cpp @@ -53,6 +53,9 @@ class COFFMasmParser : public MCAsmParserExtension { bool ParseDirectiveSegmentEnd(StringRef, SMLoc); bool ParseDirectiveIncludelib(StringRef, SMLoc); + bool ParseSEHDirectiveAllocStack(StringRef, SMLoc); + bool ParseSEHDirectiveEndProlog(StringRef, SMLoc); + bool IgnoreDirective(StringRef, SMLoc) { while (!getLexer().is(AsmToken::EndOfStatement)) { Lex(); @@ -65,13 +68,10 @@ class COFFMasmParser : public MCAsmParserExtension { MCAsmParserExtension::Initialize(Parser); // x64 directives - // .allocstack - // .endprolog - // .pushframe - // .pushreg - // .savereg - // .savexmm128 - // .setframe + addDirectiveHandler<&COFFMasmParser::ParseSEHDirectiveAllocStack>( + ".allocstack"); + addDirectiveHandler<&COFFMasmParser::ParseSEHDirectiveEndProlog>( + ".endprolog"); // Code label directives // label @@ -92,16 +92,12 @@ class COFFMasmParser : public MCAsmParserExtension { // Data allocation directives // align - // byte/sbyte - // dword/sdword // even - // fword - // qword - // real4 - // real8 + // mmword // real10 // tbyte - // word/sword + // xmmword + // ymmword // Listing control directives addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".cref"); @@ -133,14 +129,11 @@ class COFFMasmParser : public MCAsmParserExtension { // .fpo addDirectiveHandler<&COFFMasmParser::ParseDirectiveIncludelib>( "includelib"); - // mmword // option // popcontext // pushcontext // .radix // .safeseh - // xmmword - // ymmword // Procedure directives addDirectiveHandler<&COFFMasmParser::ParseDirectiveEndProc>("endp"); @@ -148,7 +141,7 @@ class COFFMasmParser : public MCAsmParserExtension { addDirectiveHandler<&COFFMasmParser::ParseDirectiveProc>("proc"); // proto - // Processor directives + // Processor directives; all ignored addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".386"); addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".386P"); addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".387"); @@ -202,11 +195,8 @@ class COFFMasmParser : public MCAsmParserExtension { // substr (equivalent to TEXTEQU @SubStr()) // Structure and record directives - // ends // record - // struct // typedef - // union } bool ParseSectionDirectiveCode(StringRef, SMLoc) { @@ -234,6 +224,7 @@ class COFFMasmParser : public MCAsmParserExtension { } StringRef CurrentProcedure; + bool CurrentProcedureFramed; public: COFFMasmParser() = default; @@ -361,8 +352,17 @@ bool COFFMasmParser::ParseDirectiveProc(StringRef Directive, SMLoc Loc) { getStreamer().EmitCOFFSymbolType(0x20); getStreamer().EndCOFFSymbolDef(); + bool Framed = false; + if (getLexer().is(AsmToken::Identifier) && + getTok().getString().equals_lower("frame")) { + Lex(); + Framed = true; + getStreamer().EmitWinCFIStartProc(Sym, Loc); + } getStreamer().emitLabel(Sym, Loc); + CurrentProcedure = Label; + CurrentProcedureFramed = Framed; return false; } bool COFFMasmParser::ParseDirectiveEndProc(StringRef Directive, SMLoc Loc) { @@ -376,6 +376,30 @@ bool COFFMasmParser::ParseDirectiveEndProc(StringRef Directive, SMLoc Loc) { else if (CurrentProcedure != Label) return Error(LabelLoc, "endp does not match current procedure '" + CurrentProcedure + 
"'"); + + if (CurrentProcedureFramed) { + getStreamer().EmitWinCFIEndProc(Loc); + } + CurrentProcedure = ""; + CurrentProcedureFramed = false; + return false; +} + +bool COFFMasmParser::ParseSEHDirectiveAllocStack(StringRef Directive, + SMLoc Loc) { + int64_t Size; + SMLoc SizeLoc = getTok().getLoc(); + if (getParser().parseAbsoluteExpression(Size)) + return Error(SizeLoc, "expected integer size"); + if (Size % 8 != 0) + return Error(SizeLoc, "stack size must be a multiple of 8"); + getStreamer().EmitWinCFIAllocStack(static_cast(Size), Loc); + return false; +} + +bool COFFMasmParser::ParseSEHDirectiveEndProlog(StringRef Directive, + SMLoc Loc) { + getStreamer().EmitWinCFIEndProlog(Loc); return false; } diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index 4d62174f7e5e4..ca9b2df7cf231 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" @@ -122,12 +123,14 @@ struct FieldInfo; struct StructInfo { StringRef Name; bool IsUnion = false; - size_t Alignment = 0; - size_t Size = 0; + unsigned Alignment = 0; + unsigned Size = 0; + unsigned AlignmentSize = 0; std::vector Fields; StringMap FieldsByName; - FieldInfo &addField(StringRef FieldName, FieldType FT, size_t FieldSize); + FieldInfo &addField(StringRef FieldName, FieldType FT, + unsigned FieldAlignmentSize); StructInfo() = default; @@ -317,13 +320,13 @@ struct FieldInfo { size_t Offset = 0; // Total size of the field (= LengthOf * Type). - size_t SizeOf = 0; + unsigned SizeOf = 0; // Number of elements in the field (1 if scalar, >1 if an array). - size_t LengthOf = 0; + unsigned LengthOf = 0; // Size of a single entry in this field, in bytes ("type" in MASM standards). - size_t Type = 0; + unsigned Type = 0; FieldInitializer Contents; @@ -331,17 +334,18 @@ struct FieldInfo { }; FieldInfo &StructInfo::addField(StringRef FieldName, FieldType FT, - size_t FieldSize) { + unsigned FieldAlignmentSize) { if (!FieldName.empty()) - FieldsByName[FieldName] = Fields.size(); + FieldsByName[FieldName.lower()] = Fields.size(); Fields.emplace_back(FT); FieldInfo &Field = Fields.back(); if (IsUnion) { Field.Offset = 0; } else { - Size = llvm::alignTo(Size, std::min(Alignment, FieldSize)); + Size = llvm::alignTo(Size, std::min(Alignment, FieldAlignmentSize)); Field.Offset = Size; } + AlignmentSize = std::max(AlignmentSize, FieldAlignmentSize); return Field; } @@ -387,8 +391,8 @@ class MasmParser : public MCAsmParser { /// Maps struct tags to struct definitions. StringMap Structs; - /// Maps data location names to user-defined types. - StringMap KnownType; + /// Maps data location names to types. + StringMap KnownType; /// Stack of active macro instantiations. 
std::vector ActiveMacros; @@ -491,10 +495,11 @@ class MasmParser : public MCAsmParser { bool isParsingMasm() const override { return true; } - bool lookUpField(StringRef Name, StringRef &Type, - unsigned &Offset) const override; - bool lookUpField(StringRef Base, StringRef Member, StringRef &Type, - unsigned &Offset) const override; + bool lookUpField(StringRef Name, AsmFieldInfo &Info) const override; + bool lookUpField(StringRef Base, StringRef Member, + AsmFieldInfo &Info) const override; + + bool lookUpType(StringRef Name, AsmTypeInfo &Info) const override; bool parseMSInlineAsm(void *AsmLoc, std::string &AsmString, unsigned &NumOutputs, unsigned &NumInputs, @@ -506,7 +511,8 @@ class MasmParser : public MCAsmParser { bool parseExpression(const MCExpr *&Res); bool parseExpression(const MCExpr *&Res, SMLoc &EndLoc) override; - bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) override; + bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, + AsmTypeInfo *TypeInfo) override; bool parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) override; bool parseParenExprOfDepth(unsigned ParenDepth, const MCExpr *&Res, SMLoc &EndLoc) override; @@ -565,7 +571,7 @@ class MasmParser : public MCAsmParser { static void DiagHandler(const SMDiagnostic &Diag, void *Context); bool lookUpField(const StructInfo &Structure, StringRef Member, - StringRef &Type, unsigned &Offset) const; + AsmFieldInfo &Info) const; /// Should we emit DWARF describing this assembler source? (Returns false if /// the source has .file directives, which means we don't want to generate @@ -623,6 +629,7 @@ class MasmParser : public MCAsmParser { DK_SQWORD, DK_DB, DK_DD, + DK_DF, DK_DQ, DK_DW, DK_REAL4, @@ -719,7 +726,12 @@ class MasmParser : public MCAsmParser { DK_STRUCT, DK_UNION, DK_ENDS, - DK_END + DK_END, + DK_PUSHFRAME, + DK_PUSHREG, + DK_SAVEREG, + DK_SAVEXMM128, + DK_SETFRAME, }; /// Maps directive name --> DirectiveKind enum, for directives parsed by this @@ -752,23 +764,24 @@ class MasmParser : public MCAsmParser { bool parseScalarInstList( unsigned Size, SmallVectorImpl &Values, const AsmToken::TokenKind EndToken = AsmToken::EndOfStatement); - bool emitIntegralValues(unsigned Size); + bool emitIntegralValues(unsigned Size, unsigned *Count = nullptr); bool addIntegralField(StringRef Name, unsigned Size); bool parseDirectiveValue(StringRef IDVal, unsigned Size); - bool parseDirectiveNamedValue(StringRef IDVal, unsigned Size, StringRef Name, - SMLoc NameLoc); + bool parseDirectiveNamedValue(StringRef TypeName, unsigned Size, + StringRef Name, SMLoc NameLoc); // "real4", "real8" - bool emitRealValues(const fltSemantics &Semantics); + bool emitRealValues(const fltSemantics &Semantics, unsigned *Count = nullptr); bool addRealField(StringRef Name, const fltSemantics &Semantics, size_t Size); bool parseDirectiveRealValue(StringRef IDVal, const fltSemantics &Semantics, size_t Size); bool parseRealInstList( const fltSemantics &Semantics, SmallVectorImpl &Values, const AsmToken::TokenKind EndToken = AsmToken::EndOfStatement); - bool parseDirectiveNamedRealValue(StringRef IDVal, - const fltSemantics &Semantics, size_t Size, - StringRef Name, SMLoc NameLoc); + bool parseDirectiveNamedRealValue(StringRef TypeName, + const fltSemantics &Semantics, + unsigned Size, StringRef Name, + SMLoc NameLoc); bool parseOptionalAngleBracketOpen(); bool parseAngleBracketClose(const Twine &Msg = "expected '>'"); @@ -812,7 +825,7 @@ class MasmParser : public MCAsmParser { const StructInitializer &Initializer); // User-defined types (structs, 
unions): - bool emitStructValues(const StructInfo &Structure); + bool emitStructValues(const StructInfo &Structure, unsigned *Count = nullptr); bool addStructField(StringRef Name, const StructInfo &Structure); bool parseDirectiveStructValue(const StructInfo &Structure, StringRef Directive, SMLoc DirLoc); @@ -1317,7 +1330,8 @@ bool MasmParser::parseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) { /// primaryexpr ::= number /// primaryexpr ::= '.' /// primaryexpr ::= ~,+,-,'not' primaryexpr -bool MasmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { +bool MasmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, + AsmTypeInfo *TypeInfo) { SMLoc FirstTokenLoc = getLexer().getLoc(); AsmToken::TokenKind FirstTokenKind = Lexer.getKind(); switch (FirstTokenKind) { @@ -1328,7 +1342,7 @@ bool MasmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { return true; case AsmToken::Exclaim: Lex(); // Eat the operator. - if (parsePrimaryExpr(Res, EndLoc)) + if (parsePrimaryExpr(Res, EndLoc, nullptr)) return true; Res = MCUnaryExpr::createLNot(Res, getContext(), FirstTokenLoc); return false; @@ -1356,7 +1370,7 @@ bool MasmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { } // Parse named bitwise negation. if (Identifier.equals_lower("not")) { - if (parsePrimaryExpr(Res, EndLoc)) + if (parsePrimaryExpr(Res, EndLoc, nullptr)) return true; Res = MCUnaryExpr::createNot(Res, getContext(), FirstTokenLoc); return false; @@ -1411,24 +1425,19 @@ bool MasmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { } // Find the field offset if used. - StringRef Type; - unsigned Offset = 0; + AsmFieldInfo Info; Split = SymbolName.split('.'); - if (!Split.second.empty()) { + if (Split.second.empty()) { + } else { SymbolName = Split.first; - if (Structs.count(SymbolName.lower()) && - !lookUpField(SymbolName, Split.second, Type, Offset)) { - // This is actually a reference to a field offset. - Res = MCConstantExpr::create(Offset, getContext()); - return false; - } - - auto TypeIt = KnownType.find(SymbolName); - if (TypeIt == KnownType.end() || - lookUpField(*TypeIt->second, Split.second, Type, Offset)) { + if (lookUpField(SymbolName, Split.second, Info)) { std::pair BaseMember = Split.second.split('.'); StringRef Base = BaseMember.first, Member = BaseMember.second; - lookUpField(Base, Member, Type, Offset); + lookUpField(Base, Member, Info); + } else if (Structs.count(SymbolName.lower())) { + // This is actually a reference to a field offset. + Res = MCConstantExpr::create(Info.Offset, getContext()); + return false; } } @@ -1454,13 +1463,23 @@ bool MasmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { // Otherwise create a symbol ref. const MCExpr *SymRef = MCSymbolRefExpr::create(Sym, Variant, getContext(), FirstTokenLoc); - if (Offset) { - Res = MCBinaryExpr::create(MCBinaryExpr::Add, SymRef, - MCConstantExpr::create(Offset, getContext()), - getContext()); + if (Info.Offset) { + Res = MCBinaryExpr::create( + MCBinaryExpr::Add, SymRef, + MCConstantExpr::create(Info.Offset, getContext()), getContext()); } else { Res = SymRef; } + if (TypeInfo) { + if (Info.Type.Name.empty()) { + auto TypeIt = KnownType.find(Identifier.lower()); + if (TypeIt != KnownType.end()) { + Info.Type = TypeIt->second; + } + } + + *TypeInfo = Info.Type; + } return false; } case AsmToken::BigNum: @@ -1524,19 +1543,19 @@ bool MasmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { return parseBracketExpr(Res, EndLoc); case AsmToken::Minus: Lex(); // Eat the operator. 
- if (parsePrimaryExpr(Res, EndLoc)) + if (parsePrimaryExpr(Res, EndLoc, nullptr)) return true; Res = MCUnaryExpr::createMinus(Res, getContext(), FirstTokenLoc); return false; case AsmToken::Plus: Lex(); // Eat the operator. - if (parsePrimaryExpr(Res, EndLoc)) + if (parsePrimaryExpr(Res, EndLoc, nullptr)) return true; Res = MCUnaryExpr::createPlus(Res, getContext(), FirstTokenLoc); return false; case AsmToken::Tilde: Lex(); // Eat the operator. - if (parsePrimaryExpr(Res, EndLoc)) + if (parsePrimaryExpr(Res, EndLoc, nullptr)) return true; Res = MCUnaryExpr::createNot(Res, getContext(), FirstTokenLoc); return false; @@ -2114,6 +2133,7 @@ bool MasmParser::parseStatement(ParseStatementInfo &Info, case DK_DD: return parseDirectiveValue(IDVal, 4); case DK_FWORD: + case DK_DF: return parseDirectiveValue(IDVal, 6); case DK_QWORD: case DK_SQWORD: @@ -2325,21 +2345,26 @@ bool MasmParser::parseStatement(ParseStatementInfo &Info, Lex(); return parseDirectiveEquate(nextVal, IDVal, DirKind); case DK_BYTE: + case DK_SBYTE: case DK_DB: Lex(); return parseDirectiveNamedValue(nextVal, 1, IDVal, IDLoc); case DK_WORD: + case DK_SWORD: case DK_DW: Lex(); return parseDirectiveNamedValue(nextVal, 2, IDVal, IDLoc); case DK_DWORD: + case DK_SDWORD: case DK_DD: Lex(); return parseDirectiveNamedValue(nextVal, 4, IDVal, IDLoc); case DK_FWORD: + case DK_DF: Lex(); return parseDirectiveNamedValue(nextVal, 6, IDVal, IDLoc); case DK_QWORD: + case DK_SQWORD: case DK_DQ: Lex(); return parseDirectiveNamedValue(nextVal, 8, IDVal, IDLoc); @@ -3299,7 +3324,7 @@ bool MasmParser::parseScalarInstList(unsigned Size, return false; } -bool MasmParser::emitIntegralValues(unsigned Size) { +bool MasmParser::emitIntegralValues(unsigned Size, unsigned *Count) { SmallVector Values; if (checkForValidSection() || parseScalarInstList(Size, Values)) return true; @@ -3307,6 +3332,8 @@ bool MasmParser::emitIntegralValues(unsigned Size) { for (auto Value : Values) { emitIntValue(Value, Size); } + if (Count) + *Count = Values.size(); return false; } @@ -3346,16 +3373,24 @@ bool MasmParser::parseDirectiveValue(StringRef IDVal, unsigned Size) { /// parseDirectiveNamedValue /// ::= name (byte | word | ... ) [ expression (, expression)* ] -bool MasmParser::parseDirectiveNamedValue(StringRef IDVal, unsigned Size, +bool MasmParser::parseDirectiveNamedValue(StringRef TypeName, unsigned Size, StringRef Name, SMLoc NameLoc) { if (StructInProgress.empty()) { // Initialize named data value. MCSymbol *Sym = getContext().getOrCreateSymbol(Name); getStreamer().emitLabel(Sym); - if (emitIntegralValues(Size)) - return addErrorSuffix(" in '" + Twine(IDVal) + "' directive"); + unsigned Count; + if (emitIntegralValues(Size, &Count)) + return addErrorSuffix(" in '" + Twine(TypeName) + "' directive"); + + AsmTypeInfo Type; + Type.Name = TypeName; + Type.Size = Size * Count; + Type.ElementSize = Size; + Type.Length = Count; + KnownType[Name.lower()] = Type; } else if (addIntegralField(Name, Size)) { - return addErrorSuffix(" in '" + Twine(IDVal) + "' directive"); + return addErrorSuffix(" in '" + Twine(TypeName) + "' directive"); } return false; @@ -3472,7 +3507,8 @@ bool MasmParser::parseRealInstList(const fltSemantics &Semantics, } // Initialize real data values. 
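Concretely, KnownType now stores a full AsmTypeInfo value per named location instead of a const StructInfo pointer, so scalar data directives get type records too. For a line like `x DWORD 1, 2, 3`, the entry recorded by the hunk above works out to the following (a sketch assuming the AsmTypeInfo fields the patch assigns: Name, Size, ElementSize, Length):

    #include "llvm/MC/MCParser/MCAsmParser.h"
    using namespace llvm;

    AsmTypeInfo describeDwordTriple() {
      AsmTypeInfo Type;
      Type.Name = "dword";  // the directive's TypeName
      Type.ElementSize = 4; // one DWORD
      Type.Length = 3;      // three initializers parsed
      Type.Size = 12;       // ElementSize * Length
      return Type;          // stored as KnownType["x"]
    }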
-bool MasmParser::emitRealValues(const fltSemantics &Semantics) { +bool MasmParser::emitRealValues(const fltSemantics &Semantics, + unsigned *Count) { if (checkForValidSection()) return true; @@ -3484,6 +3520,8 @@ bool MasmParser::emitRealValues(const fltSemantics &Semantics) { getStreamer().emitIntValue(AsInt.getLimitedValue(), AsInt.getBitWidth() / 8); } + if (Count) + *Count = ValuesAsInt.size(); return false; } @@ -3526,18 +3564,26 @@ bool MasmParser::parseDirectiveRealValue(StringRef IDVal, /// parseDirectiveNamedRealValue /// ::= name (real4 | real8) [ expression (, expression)* ] -bool MasmParser::parseDirectiveNamedRealValue(StringRef IDVal, +bool MasmParser::parseDirectiveNamedRealValue(StringRef TypeName, const fltSemantics &Semantics, - size_t Size, StringRef Name, + unsigned Size, StringRef Name, SMLoc NameLoc) { if (StructInProgress.empty()) { // Initialize named data value. MCSymbol *Sym = getContext().getOrCreateSymbol(Name); getStreamer().emitLabel(Sym); - if (emitRealValues(Semantics)) - return addErrorSuffix(" in '" + Twine(IDVal) + "' directive"); + unsigned Count; + if (emitRealValues(Semantics, &Count)) + return addErrorSuffix(" in '" + TypeName + "' directive"); + + AsmTypeInfo Type; + Type.Name = TypeName; + Type.Size = Size * Count; + Type.ElementSize = Size; + Type.Length = Count; + KnownType[Name.lower()] = Type; } else if (addRealField(Name, Semantics, Size)) { - return addErrorSuffix(" in '" + Twine(IDVal) + "' directive"); + return addErrorSuffix(" in '" + TypeName + "' directive"); } return false; } @@ -3950,7 +3996,8 @@ bool MasmParser::emitStructInitializer(const StructInfo &Structure, } // Set data values from initializers. -bool MasmParser::emitStructValues(const StructInfo &Structure) { +bool MasmParser::emitStructValues(const StructInfo &Structure, + unsigned *Count) { std::vector Initializers; if (parseStructInstList(Structure, Initializers)) return true; @@ -3960,13 +4007,16 @@ bool MasmParser::emitStructValues(const StructInfo &Structure) { return true; } + if (Count) + *Count = Initializers.size(); return false; } // Declare a field in the current struct. bool MasmParser::addStructField(StringRef Name, const StructInfo &Structure) { StructInfo &OwningStruct = StructInProgress.back(); - FieldInfo &Field = OwningStruct.addField(Name, FT_STRUCT, Structure.Size); + FieldInfo &Field = + OwningStruct.addField(Name, FT_STRUCT, Structure.AlignmentSize); StructFieldInfo &StructInfo = Field.Contents.StructInfo; StructInfo.Structure = Structure; @@ -4009,9 +4059,15 @@ bool MasmParser::parseDirectiveNamedStructValue(const StructInfo &Structure, // Initialize named data value. MCSymbol *Sym = getContext().getOrCreateSymbol(Name); getStreamer().emitLabel(Sym); - KnownType[Name] = &Structure; - if (emitStructValues(Structure)) + unsigned Count; + if (emitStructValues(Structure, &Count)) return true; + AsmTypeInfo Type; + Type.Name = Structure.Name; + Type.Size = Structure.Size * Count; + Type.ElementSize = Structure.Size; + Type.Length = Count; + KnownType[Name.lower()] = Type; } else if (addStructField(Name, Structure)) { return addErrorSuffix(" in '" + Twine(Directive) + "' directive"); } @@ -4094,8 +4150,10 @@ bool MasmParser::parseDirectiveEnds(StringRef Name, SMLoc NameLoc) { return Error(NameLoc, "mismatched name in ENDS directive; expected '" + StructInProgress.back().Name + "'"); StructInfo Structure = StructInProgress.pop_back_val(); - // Pad to make the structure's size divisible by its alignment. 
- Structure.Size = llvm::alignTo(Structure.Size, Structure.Alignment); + // Pad to make the structure's size divisible by the smaller of its alignment + // and the size of its largest field. + Structure.Size = llvm::alignTo( + Structure.Size, std::min(Structure.Alignment, Structure.AlignmentSize)); Structs[Name.lower()] = Structure; if (parseToken(AsmToken::EndOfStatement)) @@ -4140,8 +4198,8 @@ bool MasmParser::parseDirectiveNestedEnds() { else ParentStruct.Size += Structure.Size; } else { - FieldInfo &Field = - ParentStruct.addField(Structure.Name, FT_STRUCT, Structure.Size); + FieldInfo &Field = ParentStruct.addField(Structure.Name, FT_STRUCT, + Structure.AlignmentSize); StructFieldInfo &StructInfo = Field.Contents.StructInfo; Field.Type = Structure.Size; Field.LengthOf = 1; @@ -6280,10 +6338,16 @@ void MasmParser::initializeDirectiveKindMap() { DirectiveKindMap[".erridni"] = DK_ERRIDNI; DirectiveKindMap[".erre"] = DK_ERRE; DirectiveKindMap[".errnz"] = DK_ERRNZ; + DirectiveKindMap[".pushframe"] = DK_PUSHFRAME; + DirectiveKindMap[".pushreg"] = DK_PUSHREG; + DirectiveKindMap[".savereg"] = DK_SAVEREG; + DirectiveKindMap[".savexmm128"] = DK_SAVEXMM128; + DirectiveKindMap[".setframe"] = DK_SETFRAME; // DirectiveKindMap[".altmacro"] = DK_ALTMACRO; // DirectiveKindMap[".noaltmacro"] = DK_NOALTMACRO; DirectiveKindMap["db"] = DK_DB; DirectiveKindMap["dd"] = DK_DD; + DirectiveKindMap["df"] = DK_DF; DirectiveKindMap["dq"] = DK_DQ; DirectiveKindMap["dw"] = DK_DW; DirectiveKindMap["echo"] = DK_ECHO; @@ -6550,37 +6614,39 @@ static int rewritesSort(const AsmRewrite *AsmRewriteA, llvm_unreachable("Unstable rewrite sort."); } -bool MasmParser::lookUpField(StringRef Name, StringRef &Type, - unsigned &Offset) const { +bool MasmParser::lookUpField(StringRef Name, AsmFieldInfo &Info) const { const std::pair BaseMember = Name.split('.'); const StringRef Base = BaseMember.first, Member = BaseMember.second; - return lookUpField(Base, Member, Type, Offset); + return lookUpField(Base, Member, Info); } -bool MasmParser::lookUpField(StringRef Base, StringRef Member, StringRef &Type, - unsigned &Offset) const { +bool MasmParser::lookUpField(StringRef Base, StringRef Member, + AsmFieldInfo &Info) const { if (Base.empty()) return true; - unsigned BaseOffset = 0; - if (Base.contains('.') && !lookUpField(Base, Type, BaseOffset)) - Base = Type; - - auto TypeIt = KnownType.find(Base); - if (TypeIt != KnownType.end()) - return lookUpField(*TypeIt->second, Member, Type, Offset); + AsmFieldInfo BaseInfo; + if (Base.contains('.') && !lookUpField(Base, BaseInfo)) + Base = BaseInfo.Type.Name; auto StructIt = Structs.find(Base.lower()); + auto TypeIt = KnownType.find(Base.lower()); + if (TypeIt != KnownType.end()) { + StructIt = Structs.find(TypeIt->second.Name.lower()); + } if (StructIt != Structs.end()) - return lookUpField(StructIt->second, Member, Type, Offset); + return lookUpField(StructIt->second, Member, Info); return true; } bool MasmParser::lookUpField(const StructInfo &Structure, StringRef Member, - StringRef &Type, unsigned &Offset) const { + AsmFieldInfo &Info) const { if (Member.empty()) { - Type = Structure.Name; + Info.Type.Name = Structure.Name; + Info.Type.Size = Structure.Size; + Info.Type.ElementSize = Structure.Size; + Info.Type.Length = 1; return false; } @@ -6589,7 +6655,7 @@ bool MasmParser::lookUpField(const StructInfo &Structure, StringRef Member, auto StructIt = Structs.find(FieldName.lower()); if (StructIt != Structs.end()) - return lookUpField(StructIt->second, FieldMember, Type, Offset); + return 
lookUpField(StructIt->second, FieldMember, Info); auto FieldIt = Structure.FieldsByName.find(FieldName.lower()); if (FieldIt == Structure.FieldsByName.end()) @@ -6597,9 +6663,12 @@ bool MasmParser::lookUpField(const StructInfo &Structure, StringRef Member, const FieldInfo &Field = Structure.Fields[FieldIt->second]; if (FieldMember.empty()) { - Offset += Field.Offset; + Info.Offset += Field.Offset; + Info.Type.Size = Field.SizeOf; + Info.Type.ElementSize = Field.Type; + Info.Type.Length = Field.LengthOf; if (Field.Contents.FT == FT_STRUCT) - Type = Field.Contents.StructInfo.Structure.Name; + Info.Type.Name = Field.Contents.StructInfo.Structure.Name; return false; } @@ -6607,14 +6676,44 @@ bool MasmParser::lookUpField(const StructInfo &Structure, StringRef Member, return true; const StructFieldInfo &StructInfo = Field.Contents.StructInfo; - bool Result = lookUpField(StructInfo.Structure, FieldMember, Type, Offset); - if (Result) + if (lookUpField(StructInfo.Structure, FieldMember, Info)) return true; - Offset += Field.Offset; + Info.Offset += Field.Offset; return false; } +bool MasmParser::lookUpType(StringRef Name, AsmTypeInfo &Info) const { + unsigned Size = StringSwitch(Name) + .CasesLower("byte", "db", "sbyte", 1) + .CasesLower("word", "dw", "sword", 2) + .CasesLower("dword", "dd", "sdword", 4) + .CasesLower("fword", "df", 6) + .CasesLower("qword", "dq", "sqword", 8) + .CaseLower("real4", 4) + .CaseLower("real8", 8) + .Default(0); + if (Size) { + Info.Name = Name; + Info.ElementSize = Size; + Info.Length = 1; + Info.Size = Size; + return false; + } + + auto StructIt = Structs.find(Name.lower()); + if (StructIt != Structs.end()) { + const StructInfo &Structure = StructIt->second; + Info.Name = Name; + Info.ElementSize = Structure.Size; + Info.Length = 1; + Info.Size = Structure.Size; + return false; + } + + return true; +} + bool MasmParser::parseMSInlineAsm( void *AsmLoc, std::string &AsmString, unsigned &NumOutputs, unsigned &NumInputs, SmallVectorImpl> &OpDecls, diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp index ba256102080a7..7c5834895e523 100644 --- a/llvm/lib/MC/MCSection.cpp +++ b/llvm/lib/MC/MCSection.cpp @@ -82,6 +82,7 @@ MCSection::getSubsectionInsertionPoint(unsigned Subsection) { SubsectionFragmentMap.insert(MI, std::make_pair(Subsection, F)); getFragmentList().insert(IP, F); F->setParent(this); + F->setSubsectionNumber(Subsection); } return IP; diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp index fb0de40fc6d5f..8e8dba760853e 100644 --- a/llvm/lib/MC/MCWin64EH.cpp +++ b/llvm/lib/MC/MCWin64EH.cpp @@ -264,8 +264,7 @@ static int64_t GetAbsDifference(MCStreamer &Streamer, const MCSymbol *LHS, return value; } -static uint32_t -ARM64CountOfUnwindCodes(const std::vector &Insns) { +static uint32_t ARM64CountOfUnwindCodes(ArrayRef Insns) { uint32_t Count = 0; for (const auto &I : Insns) { switch (static_cast(I.Operation)) { @@ -544,6 +543,111 @@ FindMatchingEpilog(const std::vector& EpilogInstrs, return nullptr; } +static void simplifyOpcodes(std::vector &Instructions, + bool Reverse) { + unsigned PrevOffset = -1; + unsigned PrevRegister = -1; + + auto VisitInstruction = [&](WinEH::Instruction &Inst) { + // Convert 2-byte opcodes into equivalent 1-byte ones. 
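lookUpType's size table reads more clearly outside the diff; note the switch's value type is unsigned (StringSwitch<unsigned>, with the angle brackets lost in the rendering above), and each Cases entry maps a MASM type name and its d* directive aliases to a byte size:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/ADT/StringSwitch.h"

    unsigned builtinTypeSize(llvm::StringRef Name) {
      return llvm::StringSwitch<unsigned>(Name)
          .CasesLower("byte", "db", "sbyte", 1)
          .CasesLower("word", "dw", "sword", 2)
          .CasesLower("dword", "dd", "sdword", 4)
          .CasesLower("fword", "df", 6)
          .CasesLower("qword", "dq", "sqword", 8)
          .CaseLower("real4", 4)
          .CaseLower("real8", 8)
          .Default(0); // unknown: the caller falls back to the struct table
    }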
+ if (Inst.Operation == Win64EH::UOP_SaveRegP && Inst.Register == 29) { + Inst.Operation = Win64EH::UOP_SaveFPLR; + Inst.Register = -1; + } else if (Inst.Operation == Win64EH::UOP_SaveRegPX && + Inst.Register == 29) { + Inst.Operation = Win64EH::UOP_SaveFPLRX; + Inst.Register = -1; + } else if (Inst.Operation == Win64EH::UOP_SaveRegPX && + Inst.Register == 19 && Inst.Offset <= 248) { + Inst.Operation = Win64EH::UOP_SaveR19R20X; + Inst.Register = -1; + } else if (Inst.Operation == Win64EH::UOP_AddFP && Inst.Offset == 0) { + Inst.Operation = Win64EH::UOP_SetFP; + } else if (Inst.Operation == Win64EH::UOP_SaveRegP && + Inst.Register == PrevRegister + 2 && + Inst.Offset == PrevOffset + 16) { + Inst.Operation = Win64EH::UOP_SaveNext; + Inst.Register = -1; + Inst.Offset = 0; + // Intentionally not creating UOP_SaveNext for float register pairs, + // as current versions of Windows (up to at least 20.04) are buggy + // regarding SaveNext for float pairs. + } + // Update info about the previous instruction, for detecting if + // the next one can be made a UOP_SaveNext + if (Inst.Operation == Win64EH::UOP_SaveR19R20X) { + PrevOffset = 0; + PrevRegister = 19; + } else if (Inst.Operation == Win64EH::UOP_SaveRegPX) { + PrevOffset = 0; + PrevRegister = Inst.Register; + } else if (Inst.Operation == Win64EH::UOP_SaveRegP) { + PrevOffset = Inst.Offset; + PrevRegister = Inst.Register; + } else if (Inst.Operation == Win64EH::UOP_SaveNext) { + PrevRegister += 2; + PrevOffset += 16; + } else { + PrevRegister = -1; + PrevOffset = -1; + } + }; + + // Iterate over instructions in a forward order (for prologues), + // backwards for epilogues (i.e. always reverse compared to how the + // opcodes are stored). + if (Reverse) { + for (auto It = Instructions.rbegin(); It != Instructions.rend(); It++) + VisitInstruction(*It); + } else { + for (WinEH::Instruction &Inst : Instructions) + VisitInstruction(Inst); + } +} + +static int checkPackedEpilog(MCStreamer &streamer, WinEH::FrameInfo *info, + int PrologCodeBytes) { + // Can only pack if there's one single epilog + if (info->EpilogMap.size() != 1) + return -1; + + const std::vector &Epilog = + info->EpilogMap.begin()->second; + + // Can pack if the epilog is a subset of the prolog but not vice versa + if (Epilog.size() > info->Instructions.size()) + return -1; + + // Check that the epilog actually is a perfect match for the end (backwards) + // of the prolog. + for (int I = Epilog.size() - 1; I >= 0; I--) { + if (info->Instructions[I] != Epilog[Epilog.size() - 1 - I]) + return -1; + } + + // Check that the epilog actually is at the very end of the function, + // otherwise it can't be packed. + uint32_t DistanceFromEnd = (uint32_t)GetAbsDifference( + streamer, info->FuncletOrFuncEnd, info->EpilogMap.begin()->first); + if (DistanceFromEnd / 4 != Epilog.size()) + return -1; + + int Offset = Epilog.size() == info->Instructions.size() + ? 0 + : ARM64CountOfUnwindCodes(ArrayRef( + &info->Instructions[Epilog.size()], + info->Instructions.size() - Epilog.size())); + + // Check that the offset and prolog size fit in the first word; it's + // unclear whether the epilog count in the extension word can be taken + // as packed epilog offset. + if (Offset > 31 || PrologCodeBytes > 124) + return -1; + + info->EpilogMap.clear(); + return Offset; +} + // Populate the .xdata section.
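A worked example of the rewrites VisitInstruction performs on a prologue (the instruction list is hypothetical, and simplifyOpcodes is file-local, so read this as the transformation it would apply):

    #include "llvm/MC/MCWinEH.h"
    #include "llvm/Support/Win64EH.h"
    #include <vector>
    using namespace llvm;

    std::vector<WinEH::Instruction> examplePrologOpcodes() {
      return {
          // save_regp_x x19/x20, offset 32: becomes UOP_SaveR19R20X (1 byte).
          WinEH::Instruction(Win64EH::UOP_SaveRegPX, nullptr, /*Reg=*/19, /*Off=*/32),
          // save_regp x21/x22 at +16: Register == prev + 2 and
          // Offset == prev + 16, so it becomes UOP_SaveNext (1 byte).
          WinEH::Instruction(Win64EH::UOP_SaveRegP, nullptr, /*Reg=*/21, /*Off=*/16),
          // The chain continues: another UOP_SaveNext.
          WinEH::Instruction(Win64EH::UOP_SaveRegP, nullptr, /*Reg=*/23, /*Off=*/32),
      };
    }

checkPackedEpilog then decides whether the single epilog is the mirror image of this tail; if so, the epilog's unwind codes are not emitted separately at all, and the E bit plus the packed offset in the header describe them.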
The format of .xdata on ARM64 is documented at // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) { @@ -572,6 +676,10 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) { return; } + simplifyOpcodes(info->Instructions, false); + for (auto &I : info->EpilogMap) + simplifyOpcodes(I.second, true); + MCContext &context = streamer.getContext(); MCSymbol *Label = context.createTempSymbol(); @@ -618,6 +726,8 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) { uint32_t PrologCodeBytes = ARM64CountOfUnwindCodes(info->Instructions); uint32_t TotalCodeBytes = PrologCodeBytes; + int PackedEpilogOffset = checkPackedEpilog(streamer, info, PrologCodeBytes); + // Process epilogs. MapVector EpilogInfo; // Epilogs processed so far. @@ -650,15 +760,17 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) { uint32_t CodeWordsMod = TotalCodeBytes % 4; if (CodeWordsMod) CodeWords++; - uint32_t EpilogCount = info->EpilogMap.size(); + uint32_t EpilogCount = + PackedEpilogOffset >= 0 ? PackedEpilogOffset : info->EpilogMap.size(); bool ExtensionWord = EpilogCount > 31 || TotalCodeBytes > 124; if (!ExtensionWord) { row1 |= (EpilogCount & 0x1F) << 22; row1 |= (CodeWords & 0x1F) << 27; } - // E is always 0 right now, TODO: packed epilog setup if (info->HandlesExceptions) // X row1 |= 1 << 20; + if (PackedEpilogOffset >= 0) // E + row1 |= 1 << 21; row1 |= FuncLength & 0x3FFFF; streamer.emitInt32(row1); diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp index af4620361c34d..32541e5e4ff8e 100644 --- a/llvm/lib/MC/WasmObjectWriter.cpp +++ b/llvm/lib/MC/WasmObjectWriter.cpp @@ -216,8 +216,12 @@ static void patchI64(raw_pwrite_stream &Stream, uint64_t X, uint64_t Offset) { Stream.pwrite((char *)Buffer, sizeof(Buffer), Offset); } +bool isDwoSection(const MCSection &Sec) { + return Sec.getName().endswith(".dwo"); +} + class WasmObjectWriter : public MCObjectWriter { - support::endian::Writer W; + support::endian::Writer *W; /// The target specific Wasm writer instance. std::unique_ptr TargetObjectWriter; @@ -260,7 +264,16 @@ class WasmObjectWriter : public MCObjectWriter { unsigned NumEventImports = 0; uint32_t SectionCount = 0; - // TargetObjectWriter wrappers. + enum class DwoMode { + AllSections, + NonDwoOnly, + DwoOnly, + }; + bool IsSplitDwarf = false; + raw_pwrite_stream *OS = nullptr; + raw_pwrite_stream *DwoOS = nullptr; + + // TargetObjectWriter wrappers.
bool is64Bit() const { return TargetObjectWriter->is64Bit(); } bool isEmscripten() const { return TargetObjectWriter->isEmscripten(); } @@ -270,8 +283,13 @@ class WasmObjectWriter : public MCObjectWriter { public: WasmObjectWriter(std::unique_ptr MOTW, - raw_pwrite_stream &OS) - : W(OS, support::little), TargetObjectWriter(std::move(MOTW)) {} + raw_pwrite_stream &OS_) + : TargetObjectWriter(std::move(MOTW)), OS(&OS_) {} + + WasmObjectWriter(std::unique_ptr MOTW, + raw_pwrite_stream &OS_, raw_pwrite_stream &DwoOS_) + : TargetObjectWriter(std::move(MOTW)), IsSplitDwarf(true), OS(&OS_), + DwoOS(&DwoOS_) {} private: void reset() override { @@ -303,27 +321,31 @@ class WasmObjectWriter : public MCObjectWriter { void executePostLayoutBinding(MCAssembler &Asm, const MCAsmLayout &Layout) override; - + void prepareImports(SmallVectorImpl &Imports, + MCAssembler &Asm, const MCAsmLayout &Layout); uint64_t writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; + uint64_t writeOneObject(MCAssembler &Asm, const MCAsmLayout &Layout, + DwoMode Mode); + void writeString(const StringRef Str) { - encodeULEB128(Str.size(), W.OS); - W.OS << Str; + encodeULEB128(Str.size(), W->OS); + W->OS << Str; } void writeI32(int32_t val) { char Buffer[4]; support::endian::write32le(Buffer, val); - W.OS.write(Buffer, sizeof(Buffer)); + W->OS.write(Buffer, sizeof(Buffer)); } void writeI64(int64_t val) { char Buffer[8]; support::endian::write64le(Buffer, val); - W.OS.write(Buffer, sizeof(Buffer)); + W->OS.write(Buffer, sizeof(Buffer)); } - void writeValueType(wasm::ValType Ty) { W.OS << static_cast(Ty); } + void writeValueType(wasm::ValType Ty) { W->OS << static_cast(Ty); } void writeTypeSection(ArrayRef Signatures); void writeImportSection(ArrayRef Imports, uint64_t DataSize, @@ -368,17 +390,17 @@ class WasmObjectWriter : public MCObjectWriter { void WasmObjectWriter::startSection(SectionBookkeeping &Section, unsigned SectionId) { LLVM_DEBUG(dbgs() << "startSection " << SectionId << "\n"); - W.OS << char(SectionId); + W->OS << char(SectionId); - Section.SizeOffset = W.OS.tell(); + Section.SizeOffset = W->OS.tell(); // The section size. We don't know the size yet, so reserve enough space // for any 32-bit value; we'll patch it later. - encodeULEB128(0, W.OS, 5); + encodeULEB128(0, W->OS, 5); // The position where the section starts, for measuring its size. - Section.ContentsOffset = W.OS.tell(); - Section.PayloadOffset = W.OS.tell(); + Section.ContentsOffset = W->OS.tell(); + Section.PayloadOffset = W->OS.tell(); Section.Index = SectionCount++; } @@ -388,19 +410,19 @@ void WasmObjectWriter::startCustomSection(SectionBookkeeping &Section, startSection(Section, wasm::WASM_SEC_CUSTOM); // The position where the section header ends, for measuring its size. - Section.PayloadOffset = W.OS.tell(); + Section.PayloadOffset = W->OS.tell(); // Custom sections in wasm also have a string identifier. writeString(Name); // The position where the custom section starts. - Section.ContentsOffset = W.OS.tell(); + Section.ContentsOffset = W->OS.tell(); } // Now that the section is complete and we know how big it is, patch up the // section size field at the start of the section. void WasmObjectWriter::endSection(SectionBookkeeping &Section) { - uint64_t Size = W.OS.tell(); + uint64_t Size = W->OS.tell(); // /dev/null doesn't support seek/tell and can report offset of 0. // Simply skip this patching in that case. 
if (!Size) @@ -414,14 +436,14 @@ void WasmObjectWriter::endSection(SectionBookkeeping &Section) { // Write the final section size to the payload_len field, which follows // the section id byte. - writePatchableLEB<5>(static_cast(W.OS), Size, + writePatchableLEB<5>(static_cast(W->OS), Size, Section.SizeOffset); } // Emit the Wasm header. void WasmObjectWriter::writeHeader(const MCAssembler &Asm) { - W.OS.write(wasm::WasmMagic, sizeof(wasm::WasmMagic)); - W.write(wasm::WasmVersion); + W->OS.write(wasm::WasmMagic, sizeof(wasm::WasmMagic)); + W->write(wasm::WasmVersion); } void WasmObjectWriter::executePostLayoutBinding(MCAssembler &Asm, @@ -663,7 +685,7 @@ WasmObjectWriter::getRelocationIndexValue(const WasmRelocationEntry &RelEntry) { void WasmObjectWriter::applyRelocations( ArrayRef Relocations, uint64_t ContentsOffset, const MCAsmLayout &Layout) { - auto &Stream = static_cast(W.OS); + auto &Stream = static_cast(W->OS); for (const WasmRelocationEntry &RelEntry : Relocations) { uint64_t Offset = ContentsOffset + RelEntry.FixupSection->getSectionOffset() + @@ -718,14 +740,14 @@ void WasmObjectWriter::writeTypeSection(ArrayRef Signatures) { SectionBookkeeping Section; startSection(Section, wasm::WASM_SEC_TYPE); - encodeULEB128(Signatures.size(), W.OS); + encodeULEB128(Signatures.size(), W->OS); for (const WasmSignature &Sig : Signatures) { - W.OS << char(wasm::WASM_TYPE_FUNC); - encodeULEB128(Sig.Params.size(), W.OS); + W->OS << char(wasm::WASM_TYPE_FUNC); + encodeULEB128(Sig.Params.size(), W->OS); for (wasm::ValType Ty : Sig.Params) writeValueType(Ty); - encodeULEB128(Sig.Returns.size(), W.OS); + encodeULEB128(Sig.Returns.size(), W->OS); for (wasm::ValType Ty : Sig.Returns) writeValueType(Ty); } @@ -744,32 +766,32 @@ void WasmObjectWriter::writeImportSection(ArrayRef Imports, SectionBookkeeping Section; startSection(Section, wasm::WASM_SEC_IMPORT); - encodeULEB128(Imports.size(), W.OS); + encodeULEB128(Imports.size(), W->OS); for (const wasm::WasmImport &Import : Imports) { writeString(Import.Module); writeString(Import.Field); - W.OS << char(Import.Kind); + W->OS << char(Import.Kind); switch (Import.Kind) { case wasm::WASM_EXTERNAL_FUNCTION: - encodeULEB128(Import.SigIndex, W.OS); + encodeULEB128(Import.SigIndex, W->OS); break; case wasm::WASM_EXTERNAL_GLOBAL: - W.OS << char(Import.Global.Type); - W.OS << char(Import.Global.Mutable ? 1 : 0); + W->OS << char(Import.Global.Type); + W->OS << char(Import.Global.Mutable ? 
1 : 0); break; case wasm::WASM_EXTERNAL_MEMORY: - encodeULEB128(Import.Memory.Flags, W.OS); - encodeULEB128(NumPages, W.OS); // initial + encodeULEB128(Import.Memory.Flags, W->OS); + encodeULEB128(NumPages, W->OS); // initial break; case wasm::WASM_EXTERNAL_TABLE: - W.OS << char(Import.Table.ElemType); - encodeULEB128(0, W.OS); // flags - encodeULEB128(NumElements, W.OS); // initial + W->OS << char(Import.Table.ElemType); + encodeULEB128(0, W->OS); // flags + encodeULEB128(NumElements, W->OS); // initial break; case wasm::WASM_EXTERNAL_EVENT: - encodeULEB128(Import.Event.Attribute, W.OS); - encodeULEB128(Import.Event.SigIndex, W.OS); + encodeULEB128(Import.Event.Attribute, W->OS); + encodeULEB128(Import.Event.SigIndex, W->OS); break; default: llvm_unreachable("unsupported import kind"); @@ -786,9 +808,9 @@ void WasmObjectWriter::writeFunctionSection(ArrayRef Functions) { SectionBookkeeping Section; startSection(Section, wasm::WASM_SEC_FUNCTION); - encodeULEB128(Functions.size(), W.OS); + encodeULEB128(Functions.size(), W->OS); for (const WasmFunction &Func : Functions) - encodeULEB128(Func.SigIndex, W.OS); + encodeULEB128(Func.SigIndex, W->OS); endSection(Section); } @@ -800,10 +822,10 @@ void WasmObjectWriter::writeEventSection(ArrayRef Events) { SectionBookkeeping Section; startSection(Section, wasm::WASM_SEC_EVENT); - encodeULEB128(Events.size(), W.OS); + encodeULEB128(Events.size(), W->OS); for (const wasm::WasmEventType &Event : Events) { - encodeULEB128(Event.Attribute, W.OS); - encodeULEB128(Event.SigIndex, W.OS); + encodeULEB128(Event.Attribute, W->OS); + encodeULEB128(Event.SigIndex, W->OS); } endSection(Section); @@ -816,17 +838,17 @@ void WasmObjectWriter::writeGlobalSection(ArrayRef Globals) { SectionBookkeeping Section; startSection(Section, wasm::WASM_SEC_GLOBAL); - encodeULEB128(Globals.size(), W.OS); + encodeULEB128(Globals.size(), W->OS); for (const wasm::WasmGlobal &Global : Globals) { - encodeULEB128(Global.Type.Type, W.OS); - W.OS << char(Global.Type.Mutable); - W.OS << char(Global.InitExpr.Opcode); + encodeULEB128(Global.Type.Type, W->OS); + W->OS << char(Global.Type.Mutable); + W->OS << char(Global.InitExpr.Opcode); switch (Global.Type.Type) { case wasm::WASM_TYPE_I32: - encodeSLEB128(0, W.OS); + encodeSLEB128(0, W->OS); break; case wasm::WASM_TYPE_I64: - encodeSLEB128(0, W.OS); + encodeSLEB128(0, W->OS); break; case wasm::WASM_TYPE_F32: writeI32(0); @@ -840,7 +862,7 @@ void WasmObjectWriter::writeGlobalSection(ArrayRef Globals) { default: llvm_unreachable("unexpected type"); } - W.OS << char(wasm::WASM_OPCODE_END); + W->OS << char(wasm::WASM_OPCODE_END); } endSection(Section); @@ -853,11 +875,11 @@ void WasmObjectWriter::writeExportSection(ArrayRef Exports) { SectionBookkeeping Section; startSection(Section, wasm::WASM_SEC_EXPORT); - encodeULEB128(Exports.size(), W.OS); + encodeULEB128(Exports.size(), W->OS); for (const wasm::WasmExport &Export : Exports) { writeString(Export.Name); - W.OS << char(Export.Kind); - encodeULEB128(Export.Index, W.OS); + W->OS << char(Export.Kind); + encodeULEB128(Export.Index, W->OS); } endSection(Section); @@ -870,17 +892,17 @@ void WasmObjectWriter::writeElemSection(ArrayRef TableElems) { SectionBookkeeping Section; startSection(Section, wasm::WASM_SEC_ELEM); - encodeULEB128(1, W.OS); // number of "segments" - encodeULEB128(0, W.OS); // the table index + encodeULEB128(1, W->OS); // number of "segments" + encodeULEB128(0, W->OS); // the table index // init expr for starting offset - W.OS << char(wasm::WASM_OPCODE_I32_CONST); - 
encodeSLEB128(InitialTableOffset, W.OS); - W.OS << char(wasm::WASM_OPCODE_END); + W->OS << char(wasm::WASM_OPCODE_I32_CONST); + encodeSLEB128(InitialTableOffset, W->OS); + W->OS << char(wasm::WASM_OPCODE_END); - encodeULEB128(TableElems.size(), W.OS); + encodeULEB128(TableElems.size(), W->OS); for (uint32_t Elem : TableElems) - encodeULEB128(Elem, W.OS); + encodeULEB128(Elem, W->OS); endSection(Section); } @@ -891,7 +913,7 @@ void WasmObjectWriter::writeDataCountSection() { SectionBookkeeping Section; startSection(Section, wasm::WASM_SEC_DATACOUNT); - encodeULEB128(DataSegments.size(), W.OS); + encodeULEB128(DataSegments.size(), W->OS); endSection(Section); } @@ -904,7 +926,7 @@ uint32_t WasmObjectWriter::writeCodeSection(const MCAssembler &Asm, SectionBookkeeping Section; startSection(Section, wasm::WASM_SEC_CODE); - encodeULEB128(Functions.size(), W.OS); + encodeULEB128(Functions.size(), W->OS); for (const WasmFunction &Func : Functions) { auto &FuncSection = static_cast(Func.Sym->getSection()); @@ -913,9 +935,9 @@ uint32_t WasmObjectWriter::writeCodeSection(const MCAssembler &Asm, if (!Func.Sym->getSize()->evaluateAsAbsolute(Size, Layout)) report_fatal_error(".size expression must be evaluatable"); - encodeULEB128(Size, W.OS); - FuncSection.setSectionOffset(W.OS.tell() - Section.ContentsOffset); - Asm.writeSectionData(W.OS, &FuncSection, Layout); + encodeULEB128(Size, W->OS); + FuncSection.setSectionOffset(W->OS.tell() - Section.ContentsOffset); + Asm.writeSectionData(W->OS, &FuncSection, Layout); } // Apply fixups. @@ -932,22 +954,21 @@ uint32_t WasmObjectWriter::writeDataSection(const MCAsmLayout &Layout) { SectionBookkeeping Section; startSection(Section, wasm::WASM_SEC_DATA); - encodeULEB128(DataSegments.size(), W.OS); // count + encodeULEB128(DataSegments.size(), W->OS); // count for (const WasmDataSegment &Segment : DataSegments) { - encodeULEB128(Segment.InitFlags, W.OS); // flags + encodeULEB128(Segment.InitFlags, W->OS); // flags if (Segment.InitFlags & wasm::WASM_SEGMENT_HAS_MEMINDEX) - encodeULEB128(0, W.OS); // memory index + encodeULEB128(0, W->OS); // memory index if ((Segment.InitFlags & wasm::WASM_SEGMENT_IS_PASSIVE) == 0) { - W.OS << char(Segment.Offset > std::numeric_limits().max() - ? wasm::WASM_OPCODE_I64_CONST - : wasm::WASM_OPCODE_I32_CONST); - encodeSLEB128(Segment.Offset, W.OS); // offset - W.OS << char(wasm::WASM_OPCODE_END); + W->OS << char(Segment.Offset > INT32_MAX ? wasm::WASM_OPCODE_I64_CONST + : wasm::WASM_OPCODE_I32_CONST); + encodeSLEB128(Segment.Offset, W->OS); // offset + W->OS << char(wasm::WASM_OPCODE_END); } - encodeULEB128(Segment.Data.size(), W.OS); // size - Segment.Section->setSectionOffset(W.OS.tell() - Section.ContentsOffset); - W.OS << Segment.Data; // data + encodeULEB128(Segment.Data.size(), W->OS); // size + Segment.Section->setSectionOffset(W->OS.tell() - Section.ContentsOffset); + W->OS << Segment.Data; // data } // Apply fixups. 
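The rewritten offset emission in writeDataSection above picks `i32.const` unless the segment offset exceeds INT32_MAX (the memory64 case), then SLEB128-encodes the value. A hedged, self-contained sketch of that encoding; the opcode values come from the WebAssembly spec, while the helper names and main() are invented for illustration:

```cpp
#include <cstdint>
#include <cstdio>
#include <string>

enum : uint8_t { OPC_I32_CONST = 0x41, OPC_I64_CONST = 0x42, OPC_END = 0x0b };

static void encodeSLEB128(int64_t v, std::string &out) {
  bool more;
  do {
    uint8_t byte = v & 0x7f;
    v >>= 7; // arithmetic shift preserves the sign
    more = !((v == 0 && !(byte & 0x40)) || (v == -1 && (byte & 0x40)));
    if (more)
      byte |= 0x80;
    out.push_back(char(byte));
  } while (more);
}

// Emit the init expression for a data segment's starting offset.
static void emitSegmentOffset(int64_t offset, std::string &out) {
  out.push_back(offset > INT32_MAX ? char(OPC_I64_CONST)
                                   : char(OPC_I32_CONST));
  encodeSLEB128(offset, out);
  out.push_back(char(OPC_END));
}

int main() {
  std::string smallOff, bigOff;
  emitSegmentOffset(1024, smallOff);           // i32.const 1024
  emitSegmentOffset(int64_t(1) << 33, bigOff); // i64.const 2^33
  printf("%zu vs %zu bytes\n", smallOff.size(), bigOff.size());
}
```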
@@ -980,18 +1001,18 @@ void WasmObjectWriter::writeRelocSection( SectionBookkeeping Section; startCustomSection(Section, std::string("reloc.") + Name.str()); - encodeULEB128(SectionIndex, W.OS); - encodeULEB128(Relocs.size(), W.OS); + encodeULEB128(SectionIndex, W->OS); + encodeULEB128(Relocs.size(), W->OS); for (const WasmRelocationEntry &RelEntry : Relocs) { uint64_t Offset = RelEntry.Offset + RelEntry.FixupSection->getSectionOffset(); uint32_t Index = getRelocationIndexValue(RelEntry); - W.OS << char(RelEntry.Type); - encodeULEB128(Offset, W.OS); - encodeULEB128(Index, W.OS); + W->OS << char(RelEntry.Type); + encodeULEB128(Offset, W->OS); + encodeULEB128(Index, W->OS); if (RelEntry.hasAddend()) - encodeSLEB128(RelEntry.Addend, W.OS); + encodeSLEB128(RelEntry.Addend, W->OS); } endSection(Section); @@ -1010,20 +1031,20 @@ void WasmObjectWriter::writeLinkingMetaDataSection( const std::map> &Comdats) { SectionBookkeeping Section; startCustomSection(Section, "linking"); - encodeULEB128(wasm::WasmMetadataVersion, W.OS); + encodeULEB128(wasm::WasmMetadataVersion, W->OS); SectionBookkeeping SubSection; if (SymbolInfos.size() != 0) { startSection(SubSection, wasm::WASM_SYMBOL_TABLE); - encodeULEB128(SymbolInfos.size(), W.OS); + encodeULEB128(SymbolInfos.size(), W->OS); for (const wasm::WasmSymbolInfo &Sym : SymbolInfos) { - encodeULEB128(Sym.Kind, W.OS); - encodeULEB128(Sym.Flags, W.OS); + encodeULEB128(Sym.Kind, W->OS); + encodeULEB128(Sym.Flags, W->OS); switch (Sym.Kind) { case wasm::WASM_SYMBOL_TYPE_FUNCTION: case wasm::WASM_SYMBOL_TYPE_GLOBAL: case wasm::WASM_SYMBOL_TYPE_EVENT: - encodeULEB128(Sym.ElementIndex, W.OS); + encodeULEB128(Sym.ElementIndex, W->OS); if ((Sym.Flags & wasm::WASM_SYMBOL_UNDEFINED) == 0 || (Sym.Flags & wasm::WASM_SYMBOL_EXPLICIT_NAME) != 0) writeString(Sym.Name); @@ -1031,15 +1052,15 @@ void WasmObjectWriter::writeLinkingMetaDataSection( case wasm::WASM_SYMBOL_TYPE_DATA: writeString(Sym.Name); if ((Sym.Flags & wasm::WASM_SYMBOL_UNDEFINED) == 0) { - encodeULEB128(Sym.DataRef.Segment, W.OS); - encodeULEB128(Sym.DataRef.Offset, W.OS); - encodeULEB128(Sym.DataRef.Size, W.OS); + encodeULEB128(Sym.DataRef.Segment, W->OS); + encodeULEB128(Sym.DataRef.Offset, W->OS); + encodeULEB128(Sym.DataRef.Size, W->OS); } break; case wasm::WASM_SYMBOL_TYPE_SECTION: { const uint32_t SectionIndex = CustomSections[Sym.ElementIndex].OutputIndex; - encodeULEB128(SectionIndex, W.OS); + encodeULEB128(SectionIndex, W->OS); break; } default: @@ -1051,35 +1072,35 @@ void WasmObjectWriter::writeLinkingMetaDataSection( if (DataSegments.size()) { startSection(SubSection, wasm::WASM_SEGMENT_INFO); - encodeULEB128(DataSegments.size(), W.OS); + encodeULEB128(DataSegments.size(), W->OS); for (const WasmDataSegment &Segment : DataSegments) { writeString(Segment.Name); - encodeULEB128(Segment.Alignment, W.OS); - encodeULEB128(Segment.LinkerFlags, W.OS); + encodeULEB128(Segment.Alignment, W->OS); + encodeULEB128(Segment.LinkerFlags, W->OS); } endSection(SubSection); } if (!InitFuncs.empty()) { startSection(SubSection, wasm::WASM_INIT_FUNCS); - encodeULEB128(InitFuncs.size(), W.OS); + encodeULEB128(InitFuncs.size(), W->OS); for (auto &StartFunc : InitFuncs) { - encodeULEB128(StartFunc.first, W.OS); // priority - encodeULEB128(StartFunc.second, W.OS); // function index + encodeULEB128(StartFunc.first, W->OS); // priority + encodeULEB128(StartFunc.second, W->OS); // function index } endSection(SubSection); } if (Comdats.size()) { startSection(SubSection, wasm::WASM_COMDAT_INFO); - encodeULEB128(Comdats.size(), 
W.OS); + encodeULEB128(Comdats.size(), W->OS); for (const auto &C : Comdats) { writeString(C.first); - encodeULEB128(0, W.OS); // flags for future use - encodeULEB128(C.second.size(), W.OS); + encodeULEB128(0, W->OS); // flags for future use + encodeULEB128(C.second.size(), W->OS); for (const WasmComdatEntry &Entry : C.second) { - encodeULEB128(Entry.Kind, W.OS); - encodeULEB128(Entry.Index, W.OS); + encodeULEB128(Entry.Kind, W->OS); + encodeULEB128(Entry.Index, W->OS); } } endSection(SubSection); @@ -1095,8 +1116,8 @@ void WasmObjectWriter::writeCustomSection(WasmCustomSection &CustomSection, auto *Sec = CustomSection.Section; startCustomSection(Section, CustomSection.Name); - Sec->setSectionOffset(W.OS.tell() - Section.ContentsOffset); - Asm.writeSectionData(W.OS, Sec, Layout); + Sec->setSectionOffset(W->OS.tell() - Section.ContentsOffset); + Asm.writeSectionData(W->OS, Sec, Layout); CustomSection.OutputContentsOffset = Section.ContentsOffset; CustomSection.OutputIndex = Section.Index; @@ -1176,25 +1197,9 @@ static bool isInSymtab(const MCSymbolWasm &Sym) { return true; } - -uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm, - const MCAsmLayout &Layout) { - uint64_t StartOffset = W.OS.tell(); - - LLVM_DEBUG(dbgs() << "WasmObjectWriter::writeObject\n"); - - // Collect information from the available symbols. - SmallVector Functions; - SmallVector TableElems; - SmallVector Imports; - SmallVector Exports; - SmallVector Events; - SmallVector Globals; - SmallVector SymbolInfos; - SmallVector, 2> InitFuncs; - std::map> Comdats; - uint64_t DataSize = 0; - +void WasmObjectWriter::prepareImports( + SmallVectorImpl &Imports, MCAssembler &Asm, + const MCAsmLayout &Layout) { // For now, always emit the memory import, since loads and stores are not // valid without it. In the future, we could perhaps be more clever and omit // it if there are no loads or stores. @@ -1292,13 +1297,57 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm, GOTIndices[&WS] = NumGlobalImports++; } } +} + +uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm, + const MCAsmLayout &Layout) { + support::endian::Writer MainWriter(*OS, support::little); + W = &MainWriter; + if (IsSplitDwarf) { + uint64_t TotalSize = writeOneObject(Asm, Layout, DwoMode::NonDwoOnly); + assert(DwoOS); + support::endian::Writer DwoWriter(*DwoOS, support::little); + W = &DwoWriter; + return TotalSize + writeOneObject(Asm, Layout, DwoMode::DwoOnly); + } else { + return writeOneObject(Asm, Layout, DwoMode::AllSections); + } +} + +uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm, + const MCAsmLayout &Layout, + DwoMode Mode) { + uint64_t StartOffset = W->OS.tell(); + SectionCount = 0; + CustomSections.clear(); + LLVM_DEBUG(dbgs() << "WasmObjectWriter::writeObject\n"); + + // Collect information from the available symbols. + SmallVector Functions; + SmallVector TableElems; + SmallVector Imports; + SmallVector Exports; + SmallVector Events; + SmallVector Globals; + SmallVector SymbolInfos; + SmallVector, 2> InitFuncs; + std::map> Comdats; + uint64_t DataSize = 0; + if (Mode != DwoMode::DwoOnly) { + prepareImports(Imports, Asm, Layout); + } // Populate DataSegments and CustomSections, which must be done before // populating DataLocations. for (MCSection &Sec : Asm) { auto &Section = static_cast(Sec); StringRef SectionName = Section.getName(); + if (Mode == DwoMode::NonDwoOnly && isDwoSection(Sec)) + continue; + if (Mode == DwoMode::DwoOnly && !isDwoSection(Sec)) + continue; + // .init_array sections are handled specially elsewhere. 
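The isDwoSection filter just added is the heart of the split-DWARF support: writeObject now runs the real writer (writeOneObject) once per output stream, keeping .dwo sections out of the main object and everything else out of the sidecar .dwo file. A schematic, non-LLVM model of that two-pass flow, with all types and names simplified stand-ins:

```cpp
#include <cstdio>
#include <string>
#include <vector>

enum class DwoMode { AllSections, NonDwoOnly, DwoOnly };

static bool isDwoSection(const std::string &name) {
  return name.size() >= 4 && name.compare(name.size() - 4, 4, ".dwo") == 0;
}

// One pass of the writer: append the surviving sections to `out`.
static size_t writeOneObject(const std::vector<std::string> &sections,
                             DwoMode mode, std::string &out) {
  size_t start = out.size();
  for (const std::string &sec : sections) {
    if (mode == DwoMode::NonDwoOnly && isDwoSection(sec))
      continue; // .dwo sections go only to the sidecar file
    if (mode == DwoMode::DwoOnly && !isDwoSection(sec))
      continue; // everything else stays in the main object
    out += sec;
    out += '\n';
  }
  return out.size() - start;
}

int main() {
  std::vector<std::string> sections = {".text", ".debug_info.dwo",
                                       ".debug_info", ".debug_str.dwo"};
  std::string mainObj, dwoObj;
  size_t total = writeOneObject(sections, DwoMode::NonDwoOnly, mainObj) +
                 writeOneObject(sections, DwoMode::DwoOnly, dwoObj);
  printf("wrote %zu bytes across two files\n", total);
}
```

This also explains why W became a pointer in the patch: the writer is retargeted from the main stream to the .dwo stream between the two passes, and per-object state such as SectionCount is reset at the top of writeOneObject.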
if (SectionName.startswith(".init_array")) continue; @@ -1695,23 +1744,33 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm, // Write out the Wasm header. writeHeader(Asm); - writeTypeSection(Signatures); - writeImportSection(Imports, DataSize, TableElems.size()); - writeFunctionSection(Functions); - // Skip the "table" section; we import the table instead. - // Skip the "memory" section; we import the memory instead. - writeEventSection(Events); - writeGlobalSection(Globals); - writeExportSection(Exports); - writeElemSection(TableElems); - writeDataCountSection(); - uint32_t CodeSectionIndex = writeCodeSection(Asm, Layout, Functions); - uint32_t DataSectionIndex = writeDataSection(Layout); - for (auto &CustomSection : CustomSections) + uint32_t CodeSectionIndex, DataSectionIndex; + if (Mode != DwoMode::DwoOnly) { + writeTypeSection(Signatures); + writeImportSection(Imports, DataSize, TableElems.size()); + writeFunctionSection(Functions); + // Skip the "table" section; we import the table instead. + // Skip the "memory" section; we import the memory instead. + writeEventSection(Events); + writeGlobalSection(Globals); + writeExportSection(Exports); + writeElemSection(TableElems); + writeDataCountSection(); + + CodeSectionIndex = writeCodeSection(Asm, Layout, Functions); + DataSectionIndex = writeDataSection(Layout); + } + + for (auto &CustomSection : CustomSections) { writeCustomSection(CustomSection, Asm, Layout); - writeLinkingMetaDataSection(SymbolInfos, InitFuncs, Comdats); - writeRelocSection(CodeSectionIndex, "CODE", CodeRelocations); - writeRelocSection(DataSectionIndex, "DATA", DataRelocations); + } + + if (Mode != DwoMode::DwoOnly) { + writeLinkingMetaDataSection(SymbolInfos, InitFuncs, Comdats); + + writeRelocSection(CodeSectionIndex, "CODE", CodeRelocations); + writeRelocSection(DataSectionIndex, "DATA", DataRelocations); + } writeCustomRelocSections(); if (ProducersSection) writeCustomSection(*ProducersSection, Asm, Layout); @@ -1719,7 +1778,7 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm, writeCustomSection(*TargetFeaturesSection, Asm, Layout); // TODO: Translate the .comment section to the output. - return W.OS.tell() - StartOffset; + return W->OS.tell() - StartOffset; } std::unique_ptr @@ -1727,3 +1786,10 @@ llvm::createWasmObjectWriter(std::unique_ptr MOTW, raw_pwrite_stream &OS) { return std::make_unique(std::move(MOTW), OS); } + +std::unique_ptr +llvm::createWasmDwoObjectWriter(std::unique_ptr MOTW, + raw_pwrite_stream &OS, + raw_pwrite_stream &DwoOS) { + return std::make_unique(std::move(MOTW), OS, DwoOS); +} diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp index 5047b5041aa75..d6cee3bb59bb8 100644 --- a/llvm/lib/MC/XCOFFObjectWriter.cpp +++ b/llvm/lib/MC/XCOFFObjectWriter.cpp @@ -49,7 +49,6 @@ namespace { constexpr unsigned DefaultSectionAlign = 4; constexpr int16_t MaxSectionIndex = INT16_MAX; -constexpr uint16_t MaxTOCSizeInARegion = UINT16_MAX; // Packs the csect's alignment and type into a byte. uint8_t getEncodedType(const MCSectionXCOFF *); @@ -431,12 +430,15 @@ void XCOFFObjectWriter::recordRelocation(MCAssembler &Asm, FixedValue = getVirtualAddress(SymA, SymASec) + Target.getConstant(); else if (Type == XCOFF::RelocationType::R_TOC || Type == XCOFF::RelocationType::R_TOCL) { - // The FixedValue should be the TC entry offset from TOC-base. 
- FixedValue = SectionMap[SymASec]->Address - TOCCsects.front().Address; - if (FixedValue >= MaxTOCSizeInARegion) - report_fatal_error( - "handling of TOC entries could not fit in the initial TOC " - "entry region is not yet supported"); + // The FixedValue should be the TOC entry offset from the TOC-base plus any + // constant offset value. + const int64_t TOCEntryOffset = SectionMap[SymASec]->Address - + TOCCsects.front().Address + + Target.getConstant(); + if (Type == XCOFF::RelocationType::R_TOC && !isInt<16>(TOCEntryOffset)) + report_fatal_error("TOCEntryOffset overflows in small code model mode"); + + FixedValue = TOCEntryOffset; } assert( diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp index c6e9ee175adc8..5290f8ce05607 100644 --- a/llvm/lib/Object/ELF.cpp +++ b/llvm/lib/Object/ELF.cpp @@ -366,7 +366,7 @@ ELFFile::decode_relrs(Elf_Relr_Range relrs) const { template Expected> -ELFFile::android_relas(const Elf_Shdr *Sec) const { +ELFFile::android_relas(const Elf_Shdr &Sec) const { // This function reads relocations in Android's packed relocation format, // which is based on SLEB128 and delta encoding. Expected> ContentsOrErr = getSectionContents(Sec); @@ -511,7 +511,7 @@ std::string ELFFile::getDynamicTagAsString(unsigned Arch, template std::string ELFFile::getDynamicTagAsString(uint64_t Type) const { - return getDynamicTagAsString(getHeader()->e_machine, Type); + return getDynamicTagAsString(getHeader().e_machine, Type); } template @@ -541,7 +541,7 @@ Expected ELFFile::dynamicEntries() const { for (const Elf_Shdr &Sec : *SectionsOrError) { if (Sec.sh_type == ELF::SHT_DYNAMIC) { Expected> DynOrError = - getSectionContentsAsArray(&Sec); + getSectionContentsAsArray(Sec); if (!DynOrError) return DynOrError.takeError(); Dyn = *DynOrError; diff --git a/llvm/lib/ObjectYAML/DWARFEmitter.cpp b/llvm/lib/ObjectYAML/DWARFEmitter.cpp index bf29f40579ceb..b634f7c123e8d 100644 --- a/llvm/lib/ObjectYAML/DWARFEmitter.cpp +++ b/llvm/lib/ObjectYAML/DWARFEmitter.cpp @@ -190,7 +190,7 @@ Error DWARFYAML::emitDebugAranges(raw_ostream &OS, const DWARFYAML::Data &DI) { Error DWARFYAML::emitDebugRanges(raw_ostream &OS, const DWARFYAML::Data &DI) { const size_t RangesOffset = OS.tell(); uint64_t EntryIndex = 0; - for (auto DebugRanges : DI.DebugRanges) { + for (auto DebugRanges : *DI.DebugRanges) { const size_t CurrOffset = OS.tell() - RangesOffset; if (DebugRanges.Offset && (uint64_t)*DebugRanges.Offset < CurrOffset) return createStringError(errc::invalid_argument, diff --git a/llvm/lib/ObjectYAML/DWARFYAML.cpp b/llvm/lib/ObjectYAML/DWARFYAML.cpp index 353e5058a0e5d..975b9b40b6b18 100644 --- a/llvm/lib/ObjectYAML/DWARFYAML.cpp +++ b/llvm/lib/ObjectYAML/DWARFYAML.cpp @@ -28,7 +28,7 @@ SetVector DWARFYAML::Data::getNonEmptySectionNames() const { SecNames.insert("debug_str"); if (DebugAranges) SecNames.insert("debug_aranges"); - if (!DebugRanges.empty()) + if (DebugRanges) SecNames.insert("debug_ranges"); if (!DebugLines.empty()) SecNames.insert("debug_line"); @@ -95,8 +95,7 @@ void MappingTraits::mapping(IO &IO, DWARFYAML::Data &DWARF) { IO.mapOptional("debug_str", DWARF.DebugStrings); IO.mapOptional("debug_abbrev", DWARF.DebugAbbrev); IO.mapOptional("debug_aranges", DWARF.DebugAranges); - if (!DWARF.DebugRanges.empty() || !IO.outputting()) - IO.mapOptional("debug_ranges", DWARF.DebugRanges); + IO.mapOptional("debug_ranges", DWARF.DebugRanges); IO.mapOptional("debug_pubnames", DWARF.PubNames); IO.mapOptional("debug_pubtypes", DWARF.PubTypes); DWARFCtx.IsGNUPubSec = true; diff --git 
a/llvm/lib/Option/OptTable.cpp b/llvm/lib/Option/OptTable.cpp index 740e02a9d2f0e..304c09fff9d28 100644 --- a/llvm/lib/Option/OptTable.cpp +++ b/llvm/lib/Option/OptTable.cpp @@ -228,7 +228,7 @@ OptTable::suggestValueCompletions(StringRef Option, StringRef Arg) const { } std::vector -OptTable::findByPrefix(StringRef Cur, unsigned short DisableFlags) const { +OptTable::findByPrefix(StringRef Cur, unsigned int DisableFlags) const { std::vector Ret; for (size_t I = FirstSearchableIndex, E = OptionInfos.size(); I < E; I++) { const Info &In = OptionInfos[I]; diff --git a/llvm/lib/Passes/LLVMBuild.txt b/llvm/lib/Passes/LLVMBuild.txt index 3e7a391154137..f49f7828d2b93 100644 --- a/llvm/lib/Passes/LLVMBuild.txt +++ b/llvm/lib/Passes/LLVMBuild.txt @@ -18,4 +18,4 @@ type = Library name = Passes parent = Libraries -required_libraries = AggressiveInstCombine Analysis Core Coroutines IPO InstCombine ObjCARC Scalar Support Target TransformUtils Vectorize Instrumentation +required_libraries = AggressiveInstCombine Analysis Core Coroutines HelloNew IPO InstCombine ObjCARC Scalar Support Target TransformUtils Vectorize Instrumentation diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 9df6a985789ea..83b2674e3cda4 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasAnalysisEvaluator.h" +#include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/BlockFrequencyInfo.h" @@ -75,6 +76,7 @@ #include "llvm/Transforms/Coroutines/CoroEarly.h" #include "llvm/Transforms/Coroutines/CoroElide.h" #include "llvm/Transforms/Coroutines/CoroSplit.h" +#include "llvm/Transforms/HelloNew/HelloWorld.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/Transforms/IPO/ArgumentPromotion.h" #include "llvm/Transforms/IPO/Attributor.h" @@ -100,6 +102,7 @@ #include "llvm/Transforms/IPO/SCCP.h" #include "llvm/Transforms/IPO/SampleProfile.h" #include "llvm/Transforms/IPO/StripDeadPrototypes.h" +#include "llvm/Transforms/IPO/StripSymbols.h" #include "llvm/Transforms/IPO/SyntheticCountsPropagation.h" #include "llvm/Transforms/IPO/WholeProgramDevirt.h" #include "llvm/Transforms/InstCombine/InstCombine.h" @@ -111,9 +114,9 @@ #include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h" #include "llvm/Transforms/Instrumentation/GCOVProfiler.h" #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" -#include "llvm/Transforms/Instrumentation/HeapProfiler.h" #include "llvm/Transforms/Instrumentation/InstrOrderFile.h" #include "llvm/Transforms/Instrumentation/InstrProfiling.h" +#include "llvm/Transforms/Instrumentation/MemProfiler.h" #include "llvm/Transforms/Instrumentation/MemorySanitizer.h" #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" #include "llvm/Transforms/Instrumentation/PoisonChecking.h" @@ -192,6 +195,7 @@ #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/Transforms/Utils/LowerInvoke.h" +#include "llvm/Transforms/Utils/LowerSwitch.h" #include "llvm/Transforms/Utils/Mem2Reg.h" #include "llvm/Transforms/Utils/NameAnonGlobals.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" @@ -261,9 +265,9 @@ static cl::opt cl::Hidden, cl::desc("Enable inline deferral during PGO")); -static cl::opt EnableHeapProfiler("enable-heap-prof", cl::init(false), - cl::Hidden, 
cl::ZeroOrMore, - cl::desc("Enable heap profiler")); +static cl::opt EnableMemProfiler("enable-mem-prof", cl::init(false), + cl::Hidden, cl::ZeroOrMore, + cl::desc("Enable memory profiler")); PipelineTuningOptions::PipelineTuningOptions() { LoopInterleaving = true; @@ -519,13 +523,15 @@ FunctionPassManager PassBuilder::buildO1FunctionSimplificationPipeline( FPM.addPass( RequireAnalysisPass()); FPM.addPass(createFunctionToLoopPassAdaptor( - std::move(LPM1), EnableMSSALoopDependency, DebugLogging)); + std::move(LPM1), EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true, + DebugLogging)); FPM.addPass(SimplifyCFGPass()); FPM.addPass(InstCombinePass()); // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA. // *All* loop passes must preserve it, in order to be able to use it. FPM.addPass(createFunctionToLoopPassAdaptor( - std::move(LPM2), /*UseMemorySSA=*/false, DebugLogging)); + std::move(LPM2), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false, + DebugLogging)); // Delete small array after loop unroll. FPM.addPass(SROA()); @@ -676,14 +682,16 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass( RequireAnalysisPass()); FPM.addPass(createFunctionToLoopPassAdaptor( - std::move(LPM1), EnableMSSALoopDependency, DebugLogging)); + std::move(LPM1), EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true, + DebugLogging)); FPM.addPass(SimplifyCFGPass()); FPM.addPass(InstCombinePass()); // The loop passes in LPM2 (IndVarSimplifyPass, LoopIdiomRecognizePass, // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA. // *All* loop passes must preserve it, in order to be able to use it. FPM.addPass(createFunctionToLoopPassAdaptor( - std::move(LPM2), /*UseMemorySSA=*/false, DebugLogging)); + std::move(LPM2), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false, + DebugLogging)); // Delete small array after loop unroll. FPM.addPass(SROA()); @@ -720,7 +728,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(DSEPass()); FPM.addPass(createFunctionToLoopPassAdaptor( LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), - EnableMSSALoopDependency, DebugLogging)); + EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true, DebugLogging)); if (PTO.Coroutines) FPM.addPass(CoroElidePass()); @@ -798,7 +806,8 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging, FunctionPassManager FPM; FPM.addPass(createFunctionToLoopPassAdaptor( - LoopRotatePass(), EnableMSSALoopDependency, DebugLogging)); + LoopRotatePass(), EnableMSSALoopDependency, + /*UseBlockFrequencyInfo=*/false, DebugLogging)); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); // Add the profile lowering pass. @@ -1042,9 +1051,9 @@ ModulePassManager PassBuilder::buildModuleSimplificationPipeline( MPM.addPass(buildInlinerPipeline(Level, Phase, DebugLogging)); - if (EnableHeapProfiler && Phase != ThinLTOPhase::PreLink) { - MPM.addPass(createModuleToFunctionPassAdaptor(HeapProfilerPass())); - MPM.addPass(ModuleHeapProfilerPass()); + if (EnableMemProfiler && Phase != ThinLTOPhase::PreLink) { + MPM.addPass(createModuleToFunctionPassAdaptor(MemProfilerPass())); + MPM.addPass(ModuleMemProfilerPass()); } return MPM; @@ -1128,7 +1137,8 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline( // First rotate loops that may have been un-rotated by prior passes. 
OptimizePM.addPass(createFunctionToLoopPassAdaptor( - LoopRotatePass(), EnableMSSALoopDependency, DebugLogging)); + LoopRotatePass(), EnableMSSALoopDependency, + /*UseBlockFrequencyInfo=*/false, DebugLogging)); // Distribute loops to allow partial vectorization. I.e. isolate dependences // into separate loop that would otherwise inhibit vectorization. This is @@ -1160,11 +1170,14 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline( // convert to more optimized IR using more aggressive simplify CFG options. // The extra sinking transform can create larger basic blocks, so do this // before SLP vectorization. - OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions(). - forwardSwitchCondToPhi(true). - convertSwitchToLookupTable(true). - needCanonicalLoops(false). - sinkCommonInsts(true))); + // FIXME: study whether hoisting and/or sinking of common instructions should + // be delayed until after SLP vectorizer. + OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions() + .forwardSwitchCondToPhi(true) + .convertSwitchToLookupTable(true) + .needCanonicalLoops(false) + .hoistCommonInsts(true) + .sinkCommonInsts(true))); // Optimize parallel scalar instruction chains into SIMD instructions. if (PTO.SLPVectorization) @@ -1192,7 +1205,7 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline( OptimizePM.addPass(RequireAnalysisPass()); OptimizePM.addPass(createFunctionToLoopPassAdaptor( LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), - EnableMSSALoopDependency, DebugLogging)); + EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true, DebugLogging)); // Now that we've vectorized and unrolled loops, we may have more refined // alignment information, try to re-derive it here. @@ -1505,7 +1518,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging, FPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(FPM, Level); - FPM.addPass(JumpThreadingPass()); + FPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true)); // Do a post inline PGO instrumentation and use pass. This is a context // sensitive PGO pass. 
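The SimplifyCFGPass configuration earlier in this hunk uses LLVM's chained named-parameter ("builder") idiom, which the reclang-formatted call makes easier to scan. A toy imitation of the idiom with the same option set; the struct below is a stand-in for illustration, not LLVM's SimplifyCFGOptions:

```cpp
#include <cstdio>

struct CFGOptions {
  bool ForwardSwitchCond = false;
  bool SwitchToLookup = false;
  bool NeedCanonicalLoop = true;
  bool HoistCommonInsts = false;
  bool SinkCommonInsts = false;

  // Each setter returns *this so calls can be chained fluently.
  CFGOptions &forwardSwitchCondToPhi(bool B) { ForwardSwitchCond = B; return *this; }
  CFGOptions &convertSwitchToLookupTable(bool B) { SwitchToLookup = B; return *this; }
  CFGOptions &needCanonicalLoops(bool B) { NeedCanonicalLoop = B; return *this; }
  CFGOptions &hoistCommonInsts(bool B) { HoistCommonInsts = B; return *this; }
  CFGOptions &sinkCommonInsts(bool B) { SinkCommonInsts = B; return *this; }
};

int main() {
  // Mirrors the option set added to the late optimization pipeline above,
  // including the newly enabled hoisting of common instructions.
  CFGOptions opts = CFGOptions()
                        .forwardSwitchCondToPhi(true)
                        .convertSwitchToLookupTable(true)
                        .needCanonicalLoops(false)
                        .hoistCommonInsts(true)
                        .sinkCommonInsts(true);
  printf("hoist=%d sink=%d\n", opts.HoistCommonInsts, opts.SinkCommonInsts);
}
```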
@@ -1572,7 +1585,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging, MainFPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(MainFPM, Level); - MainFPM.addPass(JumpThreadingPass()); + MainFPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true)); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM))); // Create a function that performs CFI checks for cross-DSO calls with @@ -2257,8 +2270,9 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM, } #define LOOP_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ - MPM.addPass(createModuleToFunctionPassAdaptor( \ - createFunctionToLoopPassAdaptor(CREATE_PASS, false, DebugLogging))); \ + MPM.addPass( \ + createModuleToFunctionPassAdaptor(createFunctionToLoopPassAdaptor( \ + CREATE_PASS, false, false, DebugLogging))); \ return Error::success(); \ } #define LOOP_PASS_WITH_PARAMS(NAME, CREATE_PASS, PARSER) \ @@ -2268,7 +2282,7 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM, return Params.takeError(); \ MPM.addPass( \ createModuleToFunctionPassAdaptor(createFunctionToLoopPassAdaptor( \ - CREATE_PASS(Params.get()), false, DebugLogging))); \ + CREATE_PASS(Params.get()), false, false, DebugLogging))); \ return Error::success(); \ } #include "PassRegistry.def" @@ -2369,8 +2383,9 @@ Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM, } #define LOOP_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ - CGPM.addPass(createCGSCCToFunctionPassAdaptor( \ - createFunctionToLoopPassAdaptor(CREATE_PASS, false, DebugLogging))); \ + CGPM.addPass( \ + createCGSCCToFunctionPassAdaptor(createFunctionToLoopPassAdaptor( \ + CREATE_PASS, false, false, DebugLogging))); \ return Error::success(); \ } #define LOOP_PASS_WITH_PARAMS(NAME, CREATE_PASS, PARSER) \ @@ -2380,7 +2395,7 @@ Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM, return Params.takeError(); \ CGPM.addPass( \ createCGSCCToFunctionPassAdaptor(createFunctionToLoopPassAdaptor( \ - CREATE_PASS(Params.get()), false, DebugLogging))); \ + CREATE_PASS(Params.get()), false, false, DebugLogging))); \ return Error::success(); \ } #include "PassRegistry.def" @@ -2417,8 +2432,11 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM, return Err; // Add the nested pass manager with the appropriate adaptor. bool UseMemorySSA = (Name == "loop-mssa"); + bool UseBFI = + std::any_of(InnerPipeline.begin(), InnerPipeline.end(), + [](auto Pipeline) { return Pipeline.Name == "licm"; }); FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), UseMemorySSA, - DebugLogging)); + UseBFI, DebugLogging)); return Error::success(); } if (auto Count = parseRepeatPassName(Name)) { @@ -2472,8 +2490,8 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM, // The risk is that it may become obsolete if we're not careful. 
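The UseBFI computation added above inspects the textual inner pipeline: any "licm" element inside a loop adaptor such as loop-mssa(licm) turns BlockFrequencyInfo on for the whole adaptor, just as the adaptor's own name selects MemorySSA. A simplified stand-alone rendering of that check; PipelineElement here is a stripped-down stand-in for PassBuilder's parse structure:

```cpp
#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

struct PipelineElement { std::string Name; };

static void addLoopAdaptor(const std::string &adaptorName,
                           const std::vector<PipelineElement> &inner) {
  bool useMemorySSA = adaptorName == "loop-mssa";
  bool useBFI = std::any_of(inner.begin(), inner.end(),
                            [](const PipelineElement &e) {
                              return e.Name == "licm";
                            });
  printf("%s: MemorySSA=%d BFI=%d\n", adaptorName.c_str(), useMemorySSA,
         useBFI);
}

int main() {
  addLoopAdaptor("loop-mssa", {{"licm"}});   // MemorySSA=1 BFI=1
  addLoopAdaptor("loop", {{"loop-rotate"}}); // MemorySSA=0 BFI=0
}
```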
#define LOOP_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ - FPM.addPass( \ - createFunctionToLoopPassAdaptor(CREATE_PASS, false, DebugLogging)); \ + FPM.addPass(createFunctionToLoopPassAdaptor(CREATE_PASS, false, false, \ + DebugLogging)); \ return Error::success(); \ } #define LOOP_PASS_WITH_PARAMS(NAME, CREATE_PASS, PARSER) \ @@ -2482,7 +2500,7 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM, if (!Params) \ return Params.takeError(); \ FPM.addPass(createFunctionToLoopPassAdaptor(CREATE_PASS(Params.get()), \ - false, DebugLogging)); \ + false, false, DebugLogging)); \ return Error::success(); \ } #include "PassRegistry.def" @@ -2783,6 +2801,9 @@ Error PassBuilder::parseAAPipeline(AAManager &AA, StringRef PipelineText) { } bool PassBuilder::isAAPassName(StringRef PassName) { +#define MODULE_ALIAS_ANALYSIS(NAME, CREATE_PASS) \ + if (PassName == NAME) \ + return true; #define FUNCTION_ALIAS_ANALYSIS(NAME, CREATE_PASS) \ if (PassName == NAME) \ return true; @@ -2803,6 +2824,12 @@ bool PassBuilder::isAnalysisPassName(StringRef PassName) { #define CGSSC_ANALYSIS(NAME, CREATE_PASS) \ if (PassName == NAME) \ return true; +#define MODULE_ALIAS_ANALYSIS(NAME, CREATE_PASS) \ + if (PassName == NAME) \ + return true; +#define FUNCTION_ALIAS_ANALYSIS(NAME, CREATE_PASS) \ + if (PassName == NAME) \ + return true; #include "PassRegistry.def" return false; } diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index b0d1d2a63a830..2dfe9fc60f1af 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -88,7 +88,11 @@ MODULE_PASS("scc-oz-module-inliner", buildInlinerPipeline(OptimizationLevel::Oz, ThinLTOPhase::None, DebugLogging)) MODULE_PASS("oz-module-optimizer", buildModuleOptimizationPipeline(OptimizationLevel::Oz, DebugLogging, /*LTOPreLink*/false)) +MODULE_PASS("strip", StripSymbolsPass()) +MODULE_PASS("strip-dead-debug-info", StripDeadDebugInfoPass()) MODULE_PASS("strip-dead-prototypes", StripDeadPrototypesPass()) +MODULE_PASS("strip-debug-declare", StripDebugDeclarePass()) +MODULE_PASS("strip-nondebug", StripNonDebugSymbolsPass()) MODULE_PASS("synthetic-counts-propagation", SyntheticCountsPropagation()) MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass(nullptr, nullptr)) MODULE_PASS("verify", VerifierPass()) @@ -98,7 +102,7 @@ MODULE_PASS("msan-module", MemorySanitizerPass({})) MODULE_PASS("tsan-module", ThreadSanitizerPass()) MODULE_PASS("kasan-module", ModuleAddressSanitizerPass(/*CompileKernel=*/true, false, true, false)) MODULE_PASS("sancov-module", ModuleSanitizerCoveragePass()) -MODULE_PASS("heapprof-module", ModuleHeapProfilerPass()) +MODULE_PASS("memprof-module", ModuleMemProfilerPass()) MODULE_PASS("poison-checking", PoisonCheckingPass()) #undef MODULE_PASS @@ -197,6 +201,7 @@ FUNCTION_PASS("ee-instrument", EntryExitInstrumenterPass(/*PostInlining=*/false) FUNCTION_PASS("make-guards-explicit", MakeGuardsExplicitPass()) FUNCTION_PASS("post-inline-ee-instrument", EntryExitInstrumenterPass(/*PostInlining=*/true)) FUNCTION_PASS("gvn-hoist", GVNHoistPass()) +FUNCTION_PASS("helloworld", HelloWorldPass()) FUNCTION_PASS("instcombine", InstCombinePass()) FUNCTION_PASS("instcount", InstCountPass()) FUNCTION_PASS("instsimplify", InstSimplifyPass()) @@ -219,6 +224,7 @@ FUNCTION_PASS("loop-simplify", LoopSimplifyPass()) FUNCTION_PASS("loop-sink", LoopSinkPass()) FUNCTION_PASS("loop-unroll-and-jam", LoopUnrollAndJamPass()) FUNCTION_PASS("lowerinvoke", LowerInvokePass()) +FUNCTION_PASS("lowerswitch", LowerSwitchPass()) 
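For reference, the "helloworld" entry registered above corresponds to a function pass of the conventional PassInfoMixin shape from the HelloNew library this patch adds. The sketch below is a reconstruction of that shape (print the function name, preserve all analyses) rather than a verbatim copy of the new source files:

```cpp
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/raw_ostream.h"

namespace llvm {
// Minimal new-pass-manager function pass: no state, CRTP mixin provides
// name() and the PassConcept plumbing.
class HelloWorldPass : public PassInfoMixin<HelloWorldPass> {
public:
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
    errs() << F.getName() << "\n";   // observe, don't transform
    return PreservedAnalyses::all(); // nothing invalidated
  }
};
} // namespace llvm
```

Once registered in PassRegistry.def as above, it can be exercised from the command line with `opt -passes=helloworld input.ll`.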
FUNCTION_PASS("mem2reg", PromotePass()) FUNCTION_PASS("memcpyopt", MemCpyOptPass()) FUNCTION_PASS("mergeicmps", MergeICmpsPass()) @@ -252,6 +258,8 @@ FUNCTION_PASS("print", PhiValuesPrinterPass(dbgs())) FUNCTION_PASS("print", RegionInfoPrinterPass(dbgs())) FUNCTION_PASS("print", ScalarEvolutionPrinterPass(dbgs())) FUNCTION_PASS("print", StackSafetyPrinterPass(dbgs())) +// TODO: rename to print after NPM switch +FUNCTION_PASS("print-alias-sets", AliasSetsPrinterPass(dbgs())) FUNCTION_PASS("print-predicateinfo", PredicateInfoPrinterPass(dbgs())) FUNCTION_PASS("reassociate", ReassociatePass()) FUNCTION_PASS("scalarizer", ScalarizerPass()) @@ -279,7 +287,7 @@ FUNCTION_PASS("kasan", AddressSanitizerPass(true, false, false)) FUNCTION_PASS("msan", MemorySanitizerPass({})) FUNCTION_PASS("kmsan", MemorySanitizerPass({0, false, /*Kernel=*/true})) FUNCTION_PASS("tsan", ThreadSanitizerPass()) -FUNCTION_PASS("heapprof", HeapProfilerPass()) +FUNCTION_PASS("memprof", MemProfilerPass()) #undef FUNCTION_PASS #ifndef FUNCTION_PASS_WITH_PARAMS diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index da58fa57bdae7..2ee373b912be0 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -36,6 +36,14 @@ static cl::opt cl::desc("Enable skipping optional passes optnone functions " "under new pass manager")); +cl::opt PreservedCFGCheckerInstrumentation::VerifyPreservedCFG( + "verify-cfg-preserved", cl::Hidden, +#ifdef NDEBUG + cl::init(false)); +#else + cl::init(true)); +#endif + // FIXME: Change `-debug-pass-manager` from boolean to enum type. Similar to // `-debug-pass` in legacy PM. static cl::opt @@ -338,10 +346,166 @@ void PrintPassInstrumentation::registerCallbacks( }); } +PreservedCFGCheckerInstrumentation::CFG::CFG(const Function *F, + bool TrackBBLifetime) { + if (TrackBBLifetime) + BBGuards = DenseMap(F->size()); + for (const auto &BB : *F) { + if (BBGuards) + BBGuards->try_emplace(intptr_t(&BB), &BB); + for (auto *Succ : successors(&BB)) { + Graph[&BB][Succ]++; + if (BBGuards) + BBGuards->try_emplace(intptr_t(Succ), Succ); + } + } +} + +static void printBBName(raw_ostream &out, const BasicBlock *BB) { + if (BB->hasName()) { + out << BB->getName() << "<" << BB << ">"; + return; + } + + if (!BB->getParent()) { + out << "unnamed_removed<" << BB << ">"; + return; + } + + if (BB == &BB->getParent()->getEntryBlock()) { + out << "entry" + << "<" << BB << ">"; + return; + } + + unsigned FuncOrderBlockNum = 0; + for (auto &FuncBB : *BB->getParent()) { + if (&FuncBB == BB) + break; + FuncOrderBlockNum++; + } + out << "unnamed_" << FuncOrderBlockNum << "<" << BB << ">"; +} + +void PreservedCFGCheckerInstrumentation::CFG::printDiff(raw_ostream &out, + const CFG &Before, + const CFG &After) { + assert(!After.isPoisoned()); + + // Print function name. + const CFG *FuncGraph = nullptr; + if (!After.Graph.empty()) + FuncGraph = &After; + else if (!Before.isPoisoned() && !Before.Graph.empty()) + FuncGraph = &Before; + + if (FuncGraph) + out << "In function @" + << FuncGraph->Graph.begin()->first->getParent()->getName() << "\n"; + + if (Before.isPoisoned()) { + out << "Some blocks were deleted\n"; + return; + } + + // Find and print graph differences. 
+ if (Before.Graph.size() != After.Graph.size()) + out << "Different number of non-leaf basic blocks: before=" + << Before.Graph.size() << ", after=" << After.Graph.size() << "\n"; + + for (auto &BB : Before.Graph) { + auto BA = After.Graph.find(BB.first); + if (BA == After.Graph.end()) { + out << "Non-leaf block "; + printBBName(out, BB.first); + out << " is removed (" << BB.second.size() << " successors)\n"; + } + } + + for (auto &BA : After.Graph) { + auto BB = Before.Graph.find(BA.first); + if (BB == Before.Graph.end()) { + out << "Non-leaf block "; + printBBName(out, BA.first); + out << " is added (" << BA.second.size() << " successors)\n"; + continue; + } + + if (BB->second == BA.second) + continue; + + out << "Different successors of block "; + printBBName(out, BA.first); + out << " (unordered):\n"; + out << "- before (" << BB->second.size() << "): "; + for (auto &SuccB : BB->second) { + printBBName(out, SuccB.first); + if (SuccB.second != 1) + out << "(" << SuccB.second << "), "; + else + out << ", "; + } + out << "\n"; + out << "- after (" << BA.second.size() << "): "; + for (auto &SuccA : BA.second) { + printBBName(out, SuccA.first); + if (SuccA.second != 1) + out << "(" << SuccA.second << "), "; + else + out << ", "; + } + out << "\n"; + } +} + +void PreservedCFGCheckerInstrumentation::registerCallbacks( + PassInstrumentationCallbacks &PIC) { + if (!VerifyPreservedCFG) + return; + + PIC.registerBeforeNonSkippedPassCallback([this](StringRef P, Any IR) { + if (any_isa(IR)) + GraphStackBefore.emplace_back(P, CFG(any_cast(IR))); + else + GraphStackBefore.emplace_back(P, None); + }); + + PIC.registerAfterPassInvalidatedCallback( + [this](StringRef P, const PreservedAnalyses &PassPA) { + auto Before = GraphStackBefore.pop_back_val(); + assert(Before.first == P && + "Before and After callbacks must correspond"); + (void)Before; + }); + + PIC.registerAfterPassCallback([this](StringRef P, Any IR, + const PreservedAnalyses &PassPA) { + auto Before = GraphStackBefore.pop_back_val(); + assert(Before.first == P && "Before and After callbacks must correspond"); + auto &GraphBefore = Before.second; + + if (!PassPA.allAnalysesInSetPreserved()) + return; + + if (any_isa(IR)) { + assert(GraphBefore && "Must be built in BeforePassCallback"); + CFG GraphAfter(any_cast(IR), false /* NeedsGuard */); + if (GraphAfter == *GraphBefore) + return; + + dbgs() << "Error: " << P + << " reported it preserved CFG, but changes detected:\n"; + CFG::printDiff(dbgs(), *GraphBefore, GraphAfter); + report_fatal_error(Twine("Preserved CFG changed by ", P)); + } + }); +} + void StandardInstrumentations::registerCallbacks( PassInstrumentationCallbacks &PIC) { PrintIR.registerCallbacks(PIC); PrintPass.registerCallbacks(PIC); TimePasses.registerCallbacks(PIC); OptNone.registerCallbacks(PIC); + PreservedCFGChecker.registerCallbacks(PIC); } diff --git a/llvm/lib/ProfileData/GCOV.cpp b/llvm/lib/ProfileData/GCOV.cpp index 7b97723da60cc..1d8aec08c0eed 100644 --- a/llvm/lib/ProfileData/GCOV.cpp +++ b/llvm/lib/ProfileData/GCOV.cpp @@ -14,14 +14,16 @@ #include "llvm/ProfileData/GCOV.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Config/llvm-config.h" +#include "llvm/Demangle/Demangle.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Format.h" -#include "llvm/Support/Path.h" #include "llvm/Support/MD5.h" +#include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #include #include +#include using namespace llvm; @@ -39,6 +41,59 @@ enum : uint32_t { GCOV_TAG_PROGRAM_SUMMARY 
= 0xa3000000, }; +namespace { +struct Summary { + Summary(StringRef Name) : Name(Name) {} + + StringRef Name; + uint64_t lines = 0; + uint64_t linesExec = 0; + uint64_t branches = 0; + uint64_t branchesExec = 0; + uint64_t branchesTaken = 0; +}; + +struct LineInfo { + SmallVector blocks; + uint64_t count = 0; + bool exists = false; +}; + +struct SourceInfo { + StringRef filename; + SmallString<0> displayName; + std::vector> startLineToFunctions; + std::vector lines; + bool ignored = false; + SourceInfo(StringRef filename) : filename(filename) {} +}; + +class Context { +public: + Context(const GCOV::Options &Options) : options(Options) {} + void print(StringRef filename, StringRef gcno, StringRef gcda, + GCOVFile &file); + +private: + std::string getCoveragePath(StringRef filename, StringRef mainFilename) const; + void printFunctionDetails(const GCOVFunction &f, raw_ostream &os) const; + void printBranchInfo(const GCOVBlock &Block, uint32_t &edgeIdx, + raw_ostream &OS) const; + void printSummary(const Summary &summary, raw_ostream &os) const; + + void collectFunction(GCOVFunction &f, Summary &summary); + void collectSourceLine(SourceInfo &si, Summary *summary, LineInfo &line, + size_t lineNum) const; + void collectSource(SourceInfo &si, Summary &summary) const; + void annotateSource(SourceInfo &si, const GCOVFile &file, StringRef gcno, + StringRef gcda, raw_ostream &os) const; + void printSourceToIntermediate(const SourceInfo &si, raw_ostream &os) const; + + const GCOV::Options &options; + std::vector sources; +}; +} // namespace + //===----------------------------------------------------------------------===// // GCOVFile implementation. @@ -61,8 +116,8 @@ bool GCOVFile::readGCNO(GCOVBuffer &buf) { if (!buf.readInt(length)) return false; if (tag == GCOV_TAG_FUNCTION) { - Functions.push_back(std::make_unique(*this)); - fn = Functions.back().get(); + functions.push_back(std::make_unique(*this)); + fn = functions.back().get(); fn->ident = buf.getWord(); fn->linenoChecksum = buf.getWord(); if (Version >= GCOV::V407) @@ -90,41 +145,40 @@ bool GCOVFile::readGCNO(GCOVBuffer &buf) { if (Version < GCOV::V800) { for (uint32_t i = 0; i != length; ++i) { buf.getWord(); // Ignored block flags - fn->Blocks.push_back(std::make_unique(*fn, i)); + fn->blocks.push_back(std::make_unique(i)); } } else { uint32_t num = buf.getWord(); for (uint32_t i = 0; i != num; ++i) - fn->Blocks.push_back(std::make_unique(*fn, i)); + fn->blocks.push_back(std::make_unique(i)); } } else if (tag == GCOV_TAG_ARCS && fn) { uint32_t srcNo = buf.getWord(); - if (srcNo >= fn->Blocks.size()) { + if (srcNo >= fn->blocks.size()) { errs() << "unexpected block number: " << srcNo << " (in " - << fn->Blocks.size() << ")\n"; + << fn->blocks.size() << ")\n"; return false; } - GCOVBlock *src = fn->Blocks[srcNo].get(); + GCOVBlock *src = fn->blocks[srcNo].get(); for (uint32_t i = 0, e = (length - 1) / 2; i != e; ++i) { uint32_t dstNo = buf.getWord(), flags = buf.getWord(); - GCOVBlock *dst = fn->Blocks[dstNo].get(); - auto arc = - std::make_unique(*src, *dst, flags & GCOV_ARC_FALLTHROUGH); + GCOVBlock *dst = fn->blocks[dstNo].get(); + auto arc = std::make_unique(*src, *dst, flags); src->addDstEdge(arc.get()); dst->addSrcEdge(arc.get()); - if (flags & GCOV_ARC_ON_TREE) + if (arc->onTree()) fn->treeArcs.push_back(std::move(arc)); else fn->arcs.push_back(std::move(arc)); } } else if (tag == GCOV_TAG_LINES && fn) { uint32_t srcNo = buf.getWord(); - if (srcNo >= fn->Blocks.size()) { + if (srcNo >= fn->blocks.size()) { errs() << "unexpected block 
number: " << srcNo << " (in " - << fn->Blocks.size() << ")\n"; + << fn->blocks.size() << ")\n"; return false; } - GCOVBlock &Block = *fn->Blocks[srcNo]; + GCOVBlock &Block = *fn->blocks[srcNo]; for (;;) { uint32_t line = buf.getWord(); if (line) @@ -219,12 +273,24 @@ bool GCOVFile::readGCDA(GCOVBuffer &buf) { return false; } for (std::unique_ptr &arc : fn->arcs) { - if (!buf.readInt64(arc->Count)) + if (!buf.readInt64(arc->count)) return false; - // FIXME Fix counters - arc->src.Counter += arc->Count; - if (arc->dst.succ.empty()) - arc->dst.Counter += arc->Count; + arc->src.count += arc->count; + } + + if (fn->blocks.size() >= 2) { + GCOVBlock &src = *fn->blocks[0]; + GCOVBlock &sink = + Version < GCOV::V408 ? *fn->blocks.back() : *fn->blocks[1]; + auto arc = std::make_unique(sink, src, GCOV_ARC_ON_TREE); + sink.addDstEdge(arc.get()); + src.addSrcEdge(arc.get()); + fn->treeArcs.push_back(std::move(arc)); + + for (GCOVBlock &block : fn->blocksRange()) + fn->propagateCounts(block, nullptr); + for (size_t i = fn->treeArcs.size() - 1; i; --i) + fn->treeArcs[i - 1]->src.count += fn->treeArcs[i - 1]->count; } } pos += 4 * length; @@ -246,41 +312,71 @@ void GCOVFile::print(raw_ostream &OS) const { LLVM_DUMP_METHOD void GCOVFile::dump() const { print(dbgs()); } #endif -/// collectLineCounts - Collect line counts. This must be used after -/// reading .gcno and .gcda files. -void GCOVFile::collectLineCounts(FileInfo &fi) { - assert(fi.sources.empty()); - for (StringRef filename : filenames) - fi.sources.emplace_back(filename); - for (GCOVFunction &f : *this) { - f.collectLineCounts(fi); - fi.sources[f.srcIdx].functions.push_back(&f); - } - fi.setRunCount(RunCount); - fi.setProgramCount(ProgramCount); -} +bool GCOVArc::onTree() const { return flags & GCOV_ARC_ON_TREE; } //===----------------------------------------------------------------------===// // GCOVFunction implementation. +StringRef GCOVFunction::getName(bool demangle) const { + if (!demangle) + return Name; + if (demangled.empty()) { + do { + if (Name.startswith("_Z")) { + int status = 0; + // Name is guaranteed to be NUL-terminated. + char *res = itaniumDemangle(Name.data(), nullptr, nullptr, &status); + if (status == 0) { + demangled = res; + free(res); + break; + } + } + demangled = Name; + } while (0); + } + return demangled; +} StringRef GCOVFunction::getFilename() const { return file.filenames[srcIdx]; } /// getEntryCount - Get the number of times the function was called by /// retrieving the entry block's count. uint64_t GCOVFunction::getEntryCount() const { - return Blocks.front()->getCount(); + return blocks.front()->getCount(); } -/// getExitCount - Get the number of times the function returned by retrieving -/// the exit block's count. -uint64_t GCOVFunction::getExitCount() const { - return Blocks.back()->getCount(); +GCOVBlock &GCOVFunction::getExitBlock() const { + return file.getVersion() < GCOV::V408 ? *blocks.back() : *blocks[1]; +} + +// For each basic block, the sum of incoming edge counts equals the sum of +// outgoing edge counts by Kirchoff's circuit law. If the unmeasured arcs form a +// spanning tree, the count for each unmeasured arc (GCOV_ARC_ON_TREE) can be +// uniquely identified. +uint64_t GCOVFunction::propagateCounts(const GCOVBlock &v, GCOVArc *pred) { + // If GCOV_ARC_ON_TREE edges do form a tree, visited is not needed; otherwise + // this prevents infinite recursion. + if (!visited.insert(&v).second) + return 0; + + uint64_t excess = 0; + for (GCOVArc *e : v.srcs()) + if (e != pred) + excess += e->onTree() ? 
propagateCounts(e->src, e) : e->count; + for (GCOVArc *e : v.dsts()) + if (e != pred) + excess -= e->onTree() ? propagateCounts(e->dst, e) : e->count; + if (int64_t(excess) < 0) + excess = -excess; + if (pred) + pred->count = excess; + return excess; } void GCOVFunction::print(raw_ostream &OS) const { OS << "===== " << Name << " (" << ident << ") @ " << getFilename() << ":" << startLine << "\n"; - for (const auto &Block : Blocks) + for (const auto &Block : blocks) Block->print(OS); } @@ -291,44 +387,30 @@ LLVM_DUMP_METHOD void GCOVFunction::dump() const { print(dbgs()); } /// collectLineCounts - Collect line counts. This must be used after /// reading .gcno and .gcda files. -void GCOVFunction::collectLineCounts(FileInfo &FI) { - // If the line number is zero, this is a function that doesn't actually appear - // in the source file, so there isn't anything we can do with it. - if (startLine == 0) - return; - - for (const auto &Block : Blocks) - Block->collectLineCounts(FI); - FI.addFunctionLine(getFilename(), startLine, this); -} //===----------------------------------------------------------------------===// // GCOVBlock implementation. -/// collectLineCounts - Collect line counts. This must be used after -/// reading .gcno and .gcda files. -void GCOVBlock::collectLineCounts(FileInfo &FI) { - for (uint32_t N : Lines) - FI.addBlockLine(Parent.getFilename(), N, this); -} - void GCOVBlock::print(raw_ostream &OS) const { - OS << "Block : " << Number << " Counter : " << Counter << "\n"; + OS << "Block : " << number << " Counter : " << count << "\n"; if (!pred.empty()) { OS << "\tSource Edges : "; for (const GCOVArc *Edge : pred) - OS << Edge->src.Number << " (" << Edge->Count << "), "; + OS << Edge->src.number << " (" << Edge->count << "), "; OS << "\n"; } if (!succ.empty()) { OS << "\tDestination Edges : "; - for (const GCOVArc *Edge : succ) - OS << Edge->dst.Number << " (" << Edge->Count << "), "; + for (const GCOVArc *Edge : succ) { + if (Edge->flags & GCOV_ARC_ON_TREE) + OS << '*'; + OS << Edge->dst.number << " (" << Edge->count << "), "; + } OS << "\n"; } - if (!Lines.empty()) { + if (!lines.empty()) { OS << "\tLines : "; - for (uint32_t N : Lines) + for (uint32_t N : lines) OS << (N) << ","; OS << "\n"; } @@ -350,10 +432,10 @@ LLVM_DUMP_METHOD void GCOVBlock::dump() const { print(dbgs()); } uint64_t GCOVBlock::getCycleCount(const Edges &Path) { uint64_t CycleCount = std::numeric_limits::max(); for (auto E : Path) { - CycleCount = std::min(E->CyclesCount, CycleCount); + CycleCount = std::min(E->cycleCount, CycleCount); } for (auto E : Path) { - E->CyclesCount -= CycleCount; + E->cycleCount -= CycleCount; } return CycleCount; } @@ -436,42 +518,16 @@ void GCOVBlock::getCyclesCount(const BlockVector &Blocks, uint64_t &Count) { } } -/// Get the count for the list of blocks which lie on the same line. -uint64_t GCOVBlock::getLineCount(const BlockVector &Blocks) { - uint64_t Count = 0; - - for (auto Block : Blocks) { - if (Block->getNumSrcEdges() == 0) { - // The block has no predecessors and a non-null counter - // (can be the case with entry block in functions). - Count += Block->getCount(); - } else { - // Add counts from predecessors that are not on the same line. 
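As a toy illustration of the spanning-tree recovery that propagateCounts performs (and whose results the new collectSourceLine consumes): gcda files record counts only for arcs not on the tree, and flow conservation at each block fixes the rest. The diamond graph and helper below are invented for illustration; the real code uses the recursive traversal shown above.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

struct Arc { int src, dst; bool onTree; int64_t count; };

// Entry 0 -> {1,2} measured; 1->3 and 2->3 are unmeasured tree arcs.
int main() {
  std::vector<Arc> arcs = {{0, 1, false, 7},
                           {0, 2, false, 3},
                           {1, 3, true, 0},
                           {2, 3, true, 0}};
  // In this degenerate case each tree arc is the only outgoing edge of its
  // source block, so its count equals the block's measured inflow.
  for (Arc &a : arcs)
    if (a.onTree)
      for (const Arc &in : arcs)
        if (!in.onTree && in.dst == a.src)
          a.count += in.count;
  for (const Arc &a : arcs)
    printf("%d->%d%s count=%lld\n", a.src, a.dst, a.onTree ? " (tree)" : "",
           (long long)a.count);
}
```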
- for (auto E : Block->srcs()) { - const GCOVBlock *W = &E->src; - if (find(Blocks, W) == Blocks.end()) { - Count += E->Count; - } - } - } - for (auto E : Block->dsts()) { - E->CyclesCount = E->Count; - } - } - - GCOVBlock::getCyclesCount(Blocks, Count); - - return Count; -} - //===----------------------------------------------------------------------===// // FileInfo implementation. -// Safe integer division, returns 0 if numerator is 0. -static uint32_t safeDiv(uint64_t Numerator, uint64_t Divisor) { - if (!Numerator) +// Format dividend/divisor as a percentage. Return 1 if the result is greater +// than 0% and less than 1%. +static uint32_t formatPercentage(uint64_t dividend, uint64_t divisor) { + if (!dividend || !divisor) return 0; - return Numerator / Divisor; + dividend *= 100; + return dividend < divisor ? 1 : dividend / divisor; } // This custom division function mimics gcov's branch ouputs: @@ -582,23 +638,23 @@ static std::string mangleCoveragePath(StringRef Filename, bool PreservePaths) { return std::string(Result.str()); } -std::string FileInfo::getCoveragePath(StringRef Filename, - StringRef MainFilename) { - if (Options.NoOutput) +std::string Context::getCoveragePath(StringRef filename, + StringRef mainFilename) const { + if (options.NoOutput) // This is probably a bug in gcov, but when -n is specified, paths aren't // mangled at all, and the -l and -p options are ignored. Here, we do the // same. - return std::string(Filename); + return std::string(filename); std::string CoveragePath; - if (Options.LongFileNames && !Filename.equals(MainFilename)) + if (options.LongFileNames && !filename.equals(mainFilename)) CoveragePath = - mangleCoveragePath(MainFilename, Options.PreservePaths) + "##"; - CoveragePath += mangleCoveragePath(Filename, Options.PreservePaths); - if (Options.HashFilenames) { + mangleCoveragePath(mainFilename, options.PreservePaths) + "##"; + CoveragePath += mangleCoveragePath(filename, options.PreservePaths); + if (options.HashFilenames) { MD5 Hasher; MD5::MD5Result Result; - Hasher.update(Filename.str()); + Hasher.update(filename.str()); Hasher.final(Result); CoveragePath += "##" + std::string(Result.digest()); } @@ -606,292 +662,302 @@ std::string FileInfo::getCoveragePath(StringRef Filename, return CoveragePath; } -std::unique_ptr -FileInfo::openCoveragePath(StringRef CoveragePath) { - std::error_code EC; - auto OS = - std::make_unique(CoveragePath, EC, sys::fs::OF_Text); - if (EC) { - errs() << EC.message() << "\n"; - return std::make_unique(); +void Context::collectFunction(GCOVFunction &f, Summary &summary) { + SourceInfo &si = sources[f.srcIdx]; + if (f.startLine >= si.startLineToFunctions.size()) + si.startLineToFunctions.resize(f.startLine + 1); + si.startLineToFunctions[f.startLine].push_back(&f); + for (const GCOVBlock &b : f.blocksRange()) { + if (b.lines.empty()) + continue; + uint32_t maxLineNum = *std::max_element(b.lines.begin(), b.lines.end()); + if (maxLineNum >= si.lines.size()) + si.lines.resize(maxLineNum + 1); + for (uint32_t lineNum : b.lines) { + LineInfo &line = si.lines[lineNum]; + if (!line.exists) + ++summary.lines; + if (line.count == 0 && b.count) + ++summary.linesExec; + line.exists = true; + line.count += b.count; + line.blocks.push_back(&b); + } } - return std::move(OS); } -/// print - Print source files with collected line count information. 
-void FileInfo::print(raw_ostream &InfoOS, StringRef MainFilename, - StringRef GCNOFile, StringRef GCDAFile, GCOVFile &file) { - SmallVector Filenames; - for (const auto &LI : LineInfo) - Filenames.push_back(LI.first()); - llvm::sort(Filenames); - - for (StringRef Filename : Filenames) { - auto AllLines = - Options.Intermediate ? LineConsumer() : LineConsumer(Filename); - std::string CoveragePath = getCoveragePath(Filename, MainFilename); - std::unique_ptr CovStream; - if (Options.NoOutput || Options.Intermediate) - CovStream = std::make_unique(); - else if (!Options.UseStdout) - CovStream = openCoveragePath(CoveragePath); - raw_ostream &CovOS = - !Options.NoOutput && Options.UseStdout ? llvm::outs() : *CovStream; - - CovOS << " -: 0:Source:" << Filename << "\n"; - CovOS << " -: 0:Graph:" << GCNOFile << "\n"; - CovOS << " -: 0:Data:" << GCDAFile << "\n"; - CovOS << " -: 0:Runs:" << RunCount << "\n"; - if (file.getVersion() < GCOV::V900) - CovOS << " -: 0:Programs:" << ProgramCount << "\n"; - - const LineData &Line = LineInfo[Filename]; - GCOVCoverage FileCoverage(Filename); - for (uint32_t LineIndex = 0; LineIndex < Line.LastLine || !AllLines.empty(); - ++LineIndex) { - if (Options.BranchInfo) { - FunctionLines::const_iterator FuncsIt = Line.Functions.find(LineIndex); - if (FuncsIt != Line.Functions.end()) - printFunctionSummary(CovOS, FuncsIt->second); - } +void Context::collectSourceLine(SourceInfo &si, Summary *summary, + LineInfo &line, size_t lineNum) const { + uint64_t count = 0; + for (const GCOVBlock *b : line.blocks) { + if (b->number == 0) { + // For nonstandard control flows, arcs into the exit block may be + // duplicately counted (fork) or not be counted (abnormal exit), and thus + // the (exit,entry) counter may be inaccurate. Count the entry block with + // the outgoing arcs. + for (const GCOVArc *arc : b->succ) + count += arc->count; + } else { + // Add counts from predecessors that are not on the same line. + for (const GCOVArc *arc : b->pred) + if (!llvm::is_contained(line.blocks, &arc->src)) + count += arc->count; + } + for (GCOVArc *arc : b->succ) + arc->cycleCount = arc->count; + } - BlockLines::const_iterator BlocksIt = Line.Blocks.find(LineIndex); - if (BlocksIt == Line.Blocks.end()) { - // No basic blocks are on this line. Not an executable line of code. - CovOS << " -:"; - AllLines.printNext(CovOS, LineIndex + 1); - } else { - const BlockVector &Blocks = BlocksIt->second; - - // Add up the block counts to form line counts. - DenseMap LineExecs; - for (const GCOVBlock *Block : Blocks) { - if (Options.FuncCoverage) { - // This is a slightly convoluted way to most accurately gather line - // statistics for functions. Basically what is happening is that we - // don't want to count a single line with multiple blocks more than - // once. However, we also don't simply want to give the total line - // count to every function that starts on the line. Thus, what is - // happening here are two things: - // 1) Ensure that the number of logical lines is only incremented - // once per function. - // 2) If there are multiple blocks on the same line, ensure that the - // number of lines executed is incremented as long as at least - // one of the blocks are executed. 
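// Aside: the counting rule collectSourceLine implements, restated over toy
// types (invented here for illustration, not the LLVM classes). A line's
// count is the flow entering the line from outside it, except that the entry
// block is trusted only through its outgoing arcs.
#include <algorithm>
#include <cstdint>
#include <vector>

struct ToyBlock;
struct ToyArc {
  ToyBlock *src, *dst;
  uint64_t count;
};
struct ToyBlock {
  unsigned number; // 0 is the function's entry block
  std::vector<ToyArc *> pred, succ;
};

static uint64_t lineCount(const std::vector<ToyBlock *> &lineBlocks) {
  uint64_t count = 0;
  for (const ToyBlock *b : lineBlocks) {
    if (b->number == 0) {
      // The (exit,entry) counter may be wrong after fork or abnormal exit,
      // so the entry block is counted via its outgoing arcs instead.
      for (const ToyArc *a : b->succ)
        count += a->count;
    } else {
      // Only arcs whose source lies on a different line enter the line.
      for (const ToyArc *a : b->pred)
        if (std::find(lineBlocks.begin(), lineBlocks.end(), a->src) ==
            lineBlocks.end())
          count += a->count;
    }
  }
  return count; // loops contained in the line are added by getCyclesCount
}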
- const GCOVFunction *Function = &Block->getParent(); - if (FuncCoverages.find(Function) == FuncCoverages.end()) { - std::pair KeyValue( - Function, GCOVCoverage(Function->getName())); - FuncCoverages.insert(KeyValue); - } - GCOVCoverage &FuncCoverage = FuncCoverages.find(Function)->second; - - if (LineExecs.find(Function) == LineExecs.end()) { - if (Block->getCount()) { - ++FuncCoverage.LinesExec; - LineExecs[Function] = true; - } else { - LineExecs[Function] = false; - } - ++FuncCoverage.LogicalLines; - } else if (!LineExecs[Function] && Block->getCount()) { - ++FuncCoverage.LinesExec; - LineExecs[Function] = true; - } - } - } + GCOVBlock::getCyclesCount(line.blocks, count); + line.count = count; + if (line.exists) { + ++summary->lines; + if (line.count != 0) + ++summary->linesExec; + } - const uint64_t LineCount = GCOVBlock::getLineCount(Blocks); - if (LineCount == 0) - CovOS << " #####:"; - else { - CovOS << format("%9" PRIu64 ":", LineCount); - ++FileCoverage.LinesExec; - } - ++FileCoverage.LogicalLines; - - AllLines.printNext(CovOS, LineIndex + 1); - - uint32_t BlockNo = 0; - uint32_t EdgeNo = 0; - for (const GCOVBlock *Block : Blocks) { - // Only print block and branch information at the end of the block. - if (Block->getLastLine() != LineIndex + 1) - continue; - if (Options.AllBlocks) - printBlockInfo(CovOS, *Block, LineIndex, BlockNo); - if (Options.BranchInfo) { - size_t NumEdges = Block->getNumDstEdges(); - if (NumEdges > 1) - printBranchInfo(CovOS, *Block, FileCoverage, EdgeNo); - else if (Options.UncondBranch && NumEdges == 1) - printUncondBranchInfo(CovOS, EdgeNo, Block->succ[0]->Count); - } - } + if (options.BranchInfo) + for (const GCOVBlock *b : line.blocks) { + if (b->getLastLine() != lineNum) + continue; + int branches = 0, execBranches = 0, takenBranches = 0; + for (const GCOVArc *arc : b->succ) { + ++branches; + if (count != 0) + ++execBranches; + if (arc->count != 0) + ++takenBranches; + } + if (branches > 1) { + summary->branches += branches; + summary->branchesExec += execBranches; + summary->branchesTaken += takenBranches; } } - SourceInfo &source = sources[file.filenameToIdx.find(Filename)->second]; - source.name = CoveragePath; - source.coverage = FileCoverage; +} + +void Context::collectSource(SourceInfo &si, Summary &summary) const { + size_t lineNum = 0; + for (LineInfo &line : si.lines) { + collectSourceLine(si, &summary, line, lineNum); + ++lineNum; } +} - if (Options.Intermediate && !Options.NoOutput) { - // gcov 7.* unexpectedly create multiple .gcov files, which was fixed in 8.0 - // (PR GCC/82702). We create just one file. - std::string outputPath(sys::path::filename(MainFilename)); - std::error_code ec; - raw_fd_ostream os(outputPath + ".gcov", ec, sys::fs::OF_Text); - if (ec) { - errs() << ec.message() << "\n"; - return; +void Context::annotateSource(SourceInfo &si, const GCOVFile &file, + StringRef gcno, StringRef gcda, + raw_ostream &os) const { + auto source = + options.Intermediate ? 
LineConsumer() : LineConsumer(si.filename); + + os << " -: 0:Source:" << si.displayName << '\n'; + os << " -: 0:Graph:" << gcno << '\n'; + os << " -: 0:Data:" << gcda << '\n'; + os << " -: 0:Runs:" << file.RunCount << '\n'; + if (file.Version < GCOV::V900) + os << " -: 0:Programs:" << file.ProgramCount << '\n'; + + for (size_t lineNum = 1; !source.empty(); ++lineNum) { + if (lineNum >= si.lines.size()) { + os << " -:"; + source.printNext(os, lineNum); + continue; + } + const LineInfo &line = si.lines[lineNum]; + if (options.BranchInfo && lineNum < si.startLineToFunctions.size()) + for (const auto *f : si.startLineToFunctions[lineNum]) + printFunctionDetails(*f, os); + if (!line.exists) + os << " -:"; + else if (line.count == 0) + os << " #####:"; + else + os << format("%9" PRIu64 ":", line.count); + source.printNext(os, lineNum); + + uint32_t blockIdx = 0, edgeIdx = 0; + for (const GCOVBlock *b : line.blocks) { + if (b->getLastLine() != lineNum) + continue; + if (options.AllBlocks) { + if (b->getCount() == 0) + os << " $$$$$:"; + else + os << format("%9" PRIu64 ":", b->count); + os << format("%5u-block %2u\n", lineNum, blockIdx++); + } + if (options.BranchInfo) { + size_t NumEdges = b->succ.size(); + if (NumEdges > 1) + printBranchInfo(*b, edgeIdx, os); + else if (options.UncondBranch && NumEdges == 1) { + uint64_t count = b->succ[0]->count; + os << format("unconditional %2u ", edgeIdx++) + << formatBranchInfo(options, count, count) << '\n'; + } + } + } + } +} - if (!Options.UseStdout) { - // FIXME: There is no way to detect calls given current instrumentation. - if (Options.FuncCoverage) - printFuncCoverage(InfoOS); - printFileCoverage(InfoOS); +void Context::printSourceToIntermediate(const SourceInfo &si, + raw_ostream &os) const { + os << "file:" << si.filename << '\n'; + for (const auto &fs : si.startLineToFunctions) + for (const GCOVFunction *f : fs) + os << "function:" << f->startLine << ',' << f->getEntryCount() << ',' + << f->getName(options.Demangle) << '\n'; + for (size_t lineNum = 1, size = si.lines.size(); lineNum < size; ++lineNum) { + const LineInfo &line = si.lines[lineNum]; + if (line.blocks.empty()) + continue; + // GCC 8 (r254259) added a third field for Ada: + // lcount:,, + // We don't need the third field.
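// For reference, the single-file intermediate format emitted below comes out
// roughly like this (record kinds grounded in the code; values invented):
//
//   file:a.c
//   function:3,2,foo
//   lcount:3,2
//   branch:4,taken
//   branch:4,nottaken
//
// "function:" carries <start line>,<entry count>,<name>, "lcount:" carries
// <line>,<count>, and with -b one "branch:" record is written per outgoing
// arc of each multi-successor block whose last line is the current line.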
+ os << "lcount:" << lineNum << ',' << line.count << '\n'; + + if (!options.BranchInfo) + continue; + for (const GCOVBlock *b : line.blocks) { + if (b->succ.size() < 2 || b->getLastLine() != lineNum) + continue; + for (const GCOVArc *arc : b->succ) { + const char *type = + b->getCount() ? arc->count ? "taken" : "nottaken" : "notexec"; + os << "branch:" << lineNum << ',' << type << '\n'; + } + } } } -/// printFunctionSummary - Print function and block summary. -void FileInfo::printFunctionSummary(raw_ostream &OS, - const FunctionVector &Funcs) const { - for (const GCOVFunction *Func : Funcs) { - uint64_t EntryCount = Func->getEntryCount(); - uint32_t BlocksExec = 0; - for (const GCOVBlock &Block : Func->blocks()) - if (Block.getNumDstEdges() && Block.getCount()) - ++BlocksExec; - - OS << "function " << Func->getName() << " called " << EntryCount - << " returned " << safeDiv(Func->getExitCount() * 100, EntryCount) - << "% blocks executed " - << safeDiv(BlocksExec * 100, Func->getNumBlocks() - 1) << "%\n"; +void Context::print(StringRef filename, StringRef gcno, StringRef gcda, + GCOVFile &file) { + for (StringRef filename : file.filenames) { + sources.emplace_back(filename); + SourceInfo &si = sources.back(); + si.displayName = si.filename; + if (!options.SourcePrefix.empty() && + sys::path::replace_path_prefix(si.displayName, options.SourcePrefix, + "") && + !si.displayName.empty()) { + // TODO replace_path_prefix may strip the prefix even if the remaining + // part does not start with a separator. + if (sys::path::is_separator(si.displayName[0])) + si.displayName.erase(si.displayName.begin()); + else + si.displayName = si.filename; + } + if (options.RelativeOnly && sys::path::is_absolute(si.displayName)) + si.ignored = true; + } + + raw_ostream &os = llvm::outs(); + for (GCOVFunction &f : make_pointee_range(file.functions)) { + Summary summary(f.getName(options.Demangle)); + collectFunction(f, summary); + if (options.FuncCoverage && !options.UseStdout) { + os << "Function '" << summary.Name << "'\n"; + printSummary(summary, os); + os << '\n'; + } } -} -/// printBlockInfo - Output counts for each block. -void FileInfo::printBlockInfo(raw_ostream &OS, const GCOVBlock &Block, - uint32_t LineIndex, uint32_t &BlockNo) const { - if (Block.getCount() == 0) - OS << " $$$$$:"; - else - OS << format("%9" PRIu64 ":", Block.getCount()); - OS << format("%5u-block %2u\n", LineIndex + 1, BlockNo++); -} + for (SourceInfo &si : sources) { + if (si.ignored) + continue; + Summary summary(si.displayName); + collectSource(si, summary); + + // Print file summary unless -t is specified. + std::string gcovName = getCoveragePath(si.filename, filename); + if (!options.UseStdout) { + os << "File '" << summary.Name << "'\n"; + printSummary(summary, os); + if (!options.NoOutput && !options.Intermediate) + os << "Creating '" << gcovName << "'\n"; + os << '\n'; + } -/// printBranchInfo - Print conditional branch probabilities. 
-void FileInfo::printBranchInfo(raw_ostream &OS, const GCOVBlock &Block, - GCOVCoverage &Coverage, uint32_t &EdgeNo) { - SmallVector BranchCounts; - uint64_t TotalCounts = 0; - for (const GCOVArc *Edge : Block.dsts()) { - BranchCounts.push_back(Edge->Count); - TotalCounts += Edge->Count; - if (Block.getCount()) - ++Coverage.BranchesExec; - if (Edge->Count) - ++Coverage.BranchesTaken; - ++Coverage.Branches; - - if (Options.FuncCoverage) { - const GCOVFunction *Function = &Block.getParent(); - GCOVCoverage &FuncCoverage = FuncCoverages.find(Function)->second; - if (Block.getCount()) - ++FuncCoverage.BranchesExec; - if (Edge->Count) - ++FuncCoverage.BranchesTaken; - ++FuncCoverage.Branches; + if (options.NoOutput || options.Intermediate) + continue; + Optional os; + if (!options.UseStdout) { + std::error_code ec; + os.emplace(gcovName, ec, sys::fs::OF_Text); + if (ec) { + errs() << ec.message() << '\n'; + continue; + } } + annotateSource(si, file, gcno, gcda, + options.UseStdout ? llvm::outs() : *os); } - for (uint64_t N : BranchCounts) - OS << format("branch %2u ", EdgeNo++) - << formatBranchInfo(Options, N, TotalCounts) << "\n"; + if (options.Intermediate && !options.NoOutput) { + // gcov 7.* unexpectedly creates multiple .gcov files, which was fixed in 8.0 + // (PR GCC/82702). We create just one file. + std::string outputPath(sys::path::filename(filename)); + std::error_code ec; + raw_fd_ostream os(outputPath + ".gcov", ec, sys::fs::OF_Text); + if (ec) { + errs() << ec.message() << '\n'; + return; + } + + for (const SourceInfo &si : sources) + printSourceToIntermediate(si, os); + } } -/// printUncondBranchInfo - Print unconditional branch probabilities. -void FileInfo::printUncondBranchInfo(raw_ostream &OS, uint32_t &EdgeNo, - uint64_t Count) const { - OS << format("unconditional %2u ", EdgeNo++) - << formatBranchInfo(Options, Count, Count) << "\n"; +void Context::printFunctionDetails(const GCOVFunction &f, + raw_ostream &os) const { + const uint64_t entryCount = f.getEntryCount(); + uint32_t blocksExec = 0; + const GCOVBlock &exitBlock = f.getExitBlock(); + uint64_t exitCount = 0; + for (const GCOVArc *arc : exitBlock.pred) + exitCount += arc->count; + for (const GCOVBlock &b : f.blocksRange()) + if (b.number != 0 && &b != &exitBlock && b.getCount()) + ++blocksExec; + + os << "function " << f.getName(options.Demangle) << " called " << entryCount + << " returned " << formatPercentage(exitCount, entryCount) + << "% blocks executed " + << formatPercentage(blocksExec, f.blocks.size() - 2) << "%\n"; } -// printCoverage - Print generic coverage info used by both printFuncCoverage -// and printFileCoverage. -void FileInfo::printCoverage(raw_ostream &OS, - const GCOVCoverage &Coverage) const { - OS << format("Lines executed:%.2f%% of %u\n", - double(Coverage.LinesExec) * 100 / Coverage.LogicalLines, - Coverage.LogicalLines); - if (Options.BranchInfo) { - if (Coverage.Branches) { - OS << format("Branches executed:%.2f%% of %u\n", - double(Coverage.BranchesExec) * 100 / Coverage.Branches, - Coverage.Branches); - OS << format("Taken at least once:%.2f%% of %u\n", - double(Coverage.BranchesTaken) * 100 / Coverage.Branches, - Coverage.Branches); - } else { - OS << "No branches\n"; - } - OS << "No calls\n"; // to be consistent with gcov - } +/// printBranchInfo - Print conditional branch probabilities.
+void Context::printBranchInfo(const GCOVBlock &Block, uint32_t &edgeIdx, + raw_ostream &os) const { + uint64_t total = 0; + for (const GCOVArc *arc : Block.dsts()) + total += arc->count; + for (const GCOVArc *arc : Block.dsts()) + os << format("branch %2u ", edgeIdx++) + << formatBranchInfo(options, arc->count, total) << '\n'; } -// printFuncCoverage - Print per-function coverage info. -void FileInfo::printFuncCoverage(raw_ostream &OS) const { - for (const auto &FC : FuncCoverages) { - const GCOVCoverage &Coverage = FC.second; - OS << "Function '" << Coverage.Name << "'\n"; - printCoverage(OS, Coverage); - OS << "\n"; +void Context::printSummary(const Summary &summary, raw_ostream &os) const { + os << format("Lines executed:%.2f%% of %u\n", + double(summary.linesExec) * 100 / summary.lines, summary.lines); + if (options.BranchInfo) { + if (summary.branches == 0) { + os << "No branches\n"; + } else { + os << format("Branches executed:%.2f%% of %u\n", + double(summary.branchesExec) * 100 / summary.branches, + summary.branches); + os << format("Taken at least once:%.2f%% of %u\n", + double(summary.branchesTaken) * 100 / summary.branches, + summary.branches); + } + os << "No calls\n"; } } -// printFileCoverage - Print per-file coverage info. -void FileInfo::printFileCoverage(raw_ostream &OS) const { - for (const SourceInfo &source : sources) { - const GCOVCoverage &Coverage = source.coverage; - OS << "File '" << Coverage.Name << "'\n"; - printCoverage(OS, Coverage); - if (!Options.NoOutput && !Options.Intermediate) - OS << "Creating '" << source.name << "'\n"; - OS << "\n"; - } +void llvm::gcovOneInput(const GCOV::Options &options, StringRef filename, + StringRef gcno, StringRef gcda, GCOVFile &file) { + Context fi(options); + fi.print(filename, gcno, gcda, file); } diff --git a/llvm/lib/ProfileData/LLVMBuild.txt b/llvm/lib/ProfileData/LLVMBuild.txt index 335c2260a0029..2fffab24579b1 100644 --- a/llvm/lib/ProfileData/LLVMBuild.txt +++ b/llvm/lib/ProfileData/LLVMBuild.txt @@ -21,4 +21,4 @@ subdirectories = Coverage type = Library name = ProfileData parent = Libraries -required_libraries = Core Support +required_libraries = Core Support Demangle diff --git a/llvm/lib/Support/AArch64TargetParser.cpp b/llvm/lib/Support/AArch64TargetParser.cpp index a6de44605675a..82f770766d9be 100644 --- a/llvm/lib/Support/AArch64TargetParser.cpp +++ b/llvm/lib/Support/AArch64TargetParser.cpp @@ -35,11 +35,11 @@ unsigned AArch64::getDefaultFPU(StringRef CPU, AArch64::ArchKind AK) { .Default(ARM::FK_INVALID); } -unsigned AArch64::getDefaultExtensions(StringRef CPU, AArch64::ArchKind AK) { +uint64_t AArch64::getDefaultExtensions(StringRef CPU, AArch64::ArchKind AK) { if (CPU == "generic") return AArch64ARCHNames[static_cast(AK)].ArchBaseExtensions; - return StringSwitch(CPU) + return StringSwitch(CPU) #define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ .Case(NAME, AArch64ARCHNames[static_cast(ArchKind::ID)] \ .ArchBaseExtensions | \ @@ -59,7 +59,7 @@ AArch64::ArchKind AArch64::getCPUArchKind(StringRef CPU) { .Default(ArchKind::INVALID); } -bool AArch64::getExtensionFeatures(unsigned Extensions, +bool AArch64::getExtensionFeatures(uint64_t Extensions, std::vector &Features) { if (Extensions == AArch64::AEK_INVALID) return false; diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index 569cac790af99..7a4c8bd3639d5 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -755,6 +755,7 @@ void IEEEFloat::copySignificand(const IEEEFloat &rhs) { void 
IEEEFloat::makeNaN(bool SNaN, bool Negative, const APInt *fill) { category = fcNaN; sign = Negative; + exponent = exponentNaN(); integerPart *significand = significandParts(); unsigned numParts = partCount(); @@ -925,8 +926,7 @@ IEEEFloat::IEEEFloat(const fltSemantics &ourSemantics, integerPart value) { IEEEFloat::IEEEFloat(const fltSemantics &ourSemantics) { initialize(&ourSemantics); - category = fcZero; - sign = false; + makeZero(false); } // Delegate to the previous constructor, because later copy constructor may @@ -3379,15 +3379,13 @@ void IEEEFloat::initFromF80LongDoubleAPInt(const APInt &api) { sign = static_cast(i2>>15); if (myexponent == 0 && mysignificand == 0) { - // exponent, significand meaningless - category = fcZero; + makeZero(sign); } else if (myexponent==0x7fff && mysignificand==0x8000000000000000ULL) { - // exponent, significand meaningless - category = fcInfinity; + makeInf(sign); } else if ((myexponent == 0x7fff && mysignificand != 0x8000000000000000ULL) || (myexponent != 0x7fff && myexponent != 0 && myintegerbit == 0)) { - // exponent meaningless category = fcNaN; + exponent = exponentNaN(); significandParts()[0] = mysignificand; significandParts()[1] = 0; } else { @@ -3438,16 +3436,14 @@ void IEEEFloat::initFromQuadrupleAPInt(const APInt &api) { sign = static_cast(i2>>63); if (myexponent==0 && (mysignificand==0 && mysignificand2==0)) { - // exponent, significand meaningless - category = fcZero; + makeZero(sign); } else if (myexponent==0x7fff && (mysignificand==0 && mysignificand2==0)) { - // exponent, significand meaningless - category = fcInfinity; + makeInf(sign); } else if (myexponent==0x7fff && (mysignificand!=0 || mysignificand2 !=0)) { - // exponent meaningless category = fcNaN; + exponent = exponentNaN(); significandParts()[0] = mysignificand; significandParts()[1] = mysignificand2; } else { @@ -3473,14 +3469,12 @@ void IEEEFloat::initFromDoubleAPInt(const APInt &api) { sign = static_cast(i>>63); if (myexponent==0 && mysignificand==0) { - // exponent, significand meaningless - category = fcZero; + makeZero(sign); } else if (myexponent==0x7ff && mysignificand==0) { - // exponent, significand meaningless - category = fcInfinity; + makeInf(sign); } else if (myexponent==0x7ff && mysignificand!=0) { - // exponent meaningless category = fcNaN; + exponent = exponentNaN(); *significandParts() = mysignificand; } else { category = fcNormal; @@ -3504,14 +3498,12 @@ void IEEEFloat::initFromFloatAPInt(const APInt &api) { sign = i >> 31; if (myexponent==0 && mysignificand==0) { - // exponent, significand meaningless - category = fcZero; + makeZero(sign); } else if (myexponent==0xff && mysignificand==0) { - // exponent, significand meaningless - category = fcInfinity; + makeInf(sign); } else if (myexponent==0xff && mysignificand!=0) { - // sign, exponent, significand meaningless category = fcNaN; + exponent = exponentNaN(); *significandParts() = mysignificand; } else { category = fcNormal; @@ -3535,14 +3527,12 @@ void IEEEFloat::initFromBFloatAPInt(const APInt &api) { sign = i >> 15; if (myexponent == 0 && mysignificand == 0) { - // exponent, significand meaningless - category = fcZero; + makeZero(sign); } else if (myexponent == 0xff && mysignificand == 0) { - // exponent, significand meaningless - category = fcInfinity; + makeInf(sign); } else if (myexponent == 0xff && mysignificand != 0) { - // sign, exponent, significand meaningless category = fcNaN; + exponent = exponentNaN(); *significandParts() = mysignificand; } else { category = fcNormal; @@ -3566,14 +3556,12 @@ 
void IEEEFloat::initFromHalfAPInt(const APInt &api) { sign = i >> 15; if (myexponent==0 && mysignificand==0) { - // exponent, significand meaningless - category = fcZero; + makeZero(sign); } else if (myexponent==0x1f && mysignificand==0) { - // exponent, significand meaningless - category = fcInfinity; + makeInf(sign); } else if (myexponent==0x1f && mysignificand!=0) { - // sign, exponent, significand meaningless category = fcNaN; + exponent = exponentNaN(); *significandParts() = mysignificand; } else { category = fcNormal; @@ -4131,17 +4119,29 @@ IEEEFloat::opStatus IEEEFloat::next(bool nextDown) { return result; } +APFloatBase::ExponentType IEEEFloat::exponentNaN() const { + return semantics->maxExponent + 1; +} + +APFloatBase::ExponentType IEEEFloat::exponentInf() const { + return semantics->maxExponent + 1; +} + +APFloatBase::ExponentType IEEEFloat::exponentZero() const { + return semantics->minExponent - 1; +} + void IEEEFloat::makeInf(bool Negative) { category = fcInfinity; sign = Negative; - exponent = semantics->maxExponent + 1; + exponent = exponentInf(); APInt::tcSet(significandParts(), 0, partCount()); } void IEEEFloat::makeZero(bool Negative) { category = fcZero; sign = Negative; - exponent = semantics->minExponent-1; + exponent = exponentZero(); APInt::tcSet(significandParts(), 0, partCount()); } diff --git a/llvm/lib/Support/ARMTargetParser.cpp b/llvm/lib/Support/ARMTargetParser.cpp index 751f84475f42c..73baac832ee30 100644 --- a/llvm/lib/Support/ARMTargetParser.cpp +++ b/llvm/lib/Support/ARMTargetParser.cpp @@ -255,7 +255,7 @@ ARM::ISAKind ARM::parseArchISA(StringRef Arch) { unsigned ARM::parseFPU(StringRef FPU) { StringRef Syn = getFPUSynonym(FPU); - for (const auto F : FPUNames) { + for (const auto &F : FPUNames) { if (Syn == F.getName()) return F.ID; } @@ -409,7 +409,7 @@ bool ARM::getExtensionFeatures(uint64_t Extensions, if (Extensions == AEK_INVALID) return false; - for (const auto AE : ARCHExtNames) { + for (const auto &AE : ARCHExtNames) { if ((Extensions & AE.ID) == AE.ID && AE.Feature) Features.push_back(AE.Feature); else if (AE.NegFeature) @@ -436,7 +436,7 @@ unsigned ARM::getArchAttr(ARM::ArchKind AK) { } StringRef ARM::getArchExtName(uint64_t ArchExtKind) { - for (const auto AE : ARCHExtNames) { + for (const auto &AE : ARCHExtNames) { if (ArchExtKind == AE.ID) return AE.getName(); } @@ -453,7 +453,7 @@ static bool stripNegationPrefix(StringRef &Name) { StringRef ARM::getArchExtFeature(StringRef ArchExt) { bool Negated = stripNegationPrefix(ArchExt); - for (const auto AE : ARCHExtNames) { + for (const auto &AE : ARCHExtNames) { if (AE.Feature && ArchExt == AE.getName()) return StringRef(Negated ? AE.NegFeature : AE.Feature); } @@ -502,7 +502,7 @@ bool ARM::appendArchExtFeatures(StringRef CPU, ARM::ArchKind AK, if (ID == AEK_INVALID) return false; - for (const auto AE : ARCHExtNames) { + for (const auto &AE : ARCHExtNames) { if (Negated) { if ((AE.ID & ID) == ID && AE.NegFeature) Features.push_back(AE.NegFeature); @@ -535,7 +535,7 @@ bool ARM::appendArchExtFeatures(StringRef CPU, ARM::ArchKind AK, } StringRef ARM::getHWDivName(uint64_t HWDivKind) { - for (const auto D : HWDivNames) { + for (const auto &D : HWDivNames) { if (HWDivKind == D.ID) return D.getName(); } @@ -548,7 +548,7 @@ StringRef ARM::getDefaultCPU(StringRef Arch) { return StringRef(); // Look for multiple AKs to find the default for pair AK+Name. 
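// Aside: the pattern behind the `const auto` -> `const auto &` changes in
// these loops, as a minimal sketch (toy table, invented names). Iterating a
// table of non-trivial structs by value copies every element; binding by
// reference reads the stored entry in place.
#include <array>
#include <cstdint>
#include <string_view>

struct Entry {
  std::string_view Name;
  uint64_t ID;
  char Padding[48]; // stands in for the many fields of the real tables
};

static uint64_t lookup(const std::array<Entry, 3> &Table,
                       std::string_view Name) {
  for (const auto &E : Table) // by reference: no per-iteration struct copy
    if (E.Name == Name)
      return E.ID;
  return 0;
}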
- for (const auto CPU : CPUNames) { + for (const auto &CPU : CPUNames) { if (CPU.ArchID == AK && CPU.Default) return CPU.getName(); } @@ -559,7 +559,7 @@ StringRef ARM::getDefaultCPU(StringRef Arch) { uint64_t ARM::parseHWDiv(StringRef HWDiv) { StringRef Syn = getHWDivSynonym(HWDiv); - for (const auto D : HWDivNames) { + for (const auto &D : HWDivNames) { if (Syn == D.getName()) return D.ID; } @@ -567,7 +567,7 @@ uint64_t ARM::parseHWDiv(StringRef HWDiv) { } uint64_t ARM::parseArchExt(StringRef ArchExt) { - for (const auto A : ARCHExtNames) { + for (const auto &A : ARCHExtNames) { if (ArchExt == A.getName()) return A.ID; } @@ -575,7 +575,7 @@ uint64_t ARM::parseArchExt(StringRef ArchExt) { } ARM::ArchKind ARM::parseCPUArch(StringRef CPU) { - for (const auto C : CPUNames) { + for (const auto &C : CPUNames) { if (CPU == C.getName()) return C.ArchID; } diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index bcf972d4c49d6..a87906c16697b 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -1,24 +1,9 @@ +include(GetLibraryName) + if(LLVM_ENABLE_ZLIB) set(imported_libs ZLIB::ZLIB) endif() -function(get_system_libname libpath libname) - get_filename_component(libpath ${libpath} NAME) - set(prefixes ${CMAKE_FIND_LIBRARY_PREFIXES}) - set(suffixes ${CMAKE_FIND_LIBRARY_SUFFIXES}) - list(FILTER prefixes EXCLUDE REGEX "^\\s*$") - list(FILTER suffixes EXCLUDE REGEX "^\\s*$") - if( prefixes ) - string(REPLACE ";" "|" prefixes "${prefixes}") - string(REGEX REPLACE "^(${prefixes})" "" libpath ${libpath}) - endif() - if( suffixes ) - string(REPLACE ";" "|" suffixes "${suffixes}") - string(REGEX REPLACE "(${suffixes})$" "" libpath ${libpath}) - endif() - set(${libname} "${libpath}" PARENT_SCOPE) -endfunction() - if( MSVC OR MINGW ) # libuuid required for FOLDERID_Profile usage in lib/Support/Windows/Path.inc. # advapi32 required for CryptAcquireContextW in lib/Support/Windows/Path.inc. @@ -80,7 +65,6 @@ if(LLVM_INTEGRATED_CRT_ALLOC) add_definitions(-DENABLE_OVERRIDE -DENABLE_PRELOAD) set(ALLOCATOR_FILES "${LLVM_INTEGRATED_CRT_ALLOC}/rpmalloc/rpmalloc.c") elseif(LLVM_INTEGRATED_CRT_ALLOC MATCHES "snmalloc$") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /std:c++17" PARENT_SCOPE) set(ALLOCATOR_FILES "${LLVM_INTEGRATED_CRT_ALLOC}/src/override/malloc.cc" "${LLVM_INTEGRATED_CRT_ALLOC}/src/override/new.cc") set(system_libs ${system_libs} "mincore.lib" "-INCLUDE:malloc") elseif(LLVM_INTEGRATED_CRT_ALLOC MATCHES "mimalloc$") @@ -245,6 +229,8 @@ add_llvm_component_library(LLVMSupport set(llvm_system_libs ${system_libs}) +# This block is only needed for llvm-config. When we deprecate llvm-config and +# move to using CMake export, this block can be removed. if(LLVM_ENABLE_ZLIB) # CMAKE_BUILD_TYPE is only meaningful to single-configuration generators. 
if(CMAKE_BUILD_TYPE) @@ -254,17 +240,29 @@ if(LLVM_ENABLE_ZLIB) if(NOT zlib_library) get_property(zlib_library TARGET ZLIB::ZLIB PROPERTY LOCATION) endif() - get_system_libname(${zlib_library} zlib_library) + get_library_name(${zlib_library} zlib_library) set(llvm_system_libs ${llvm_system_libs} "${zlib_library}") endif() if(LLVM_ENABLE_TERMINFO) - get_system_libname(${TERMINFO_LIB} terminfo_library) + get_library_name(${TERMINFO_LIB} terminfo_library) set(llvm_system_libs ${llvm_system_libs} "${terminfo_library}") endif() set_property(TARGET LLVMSupport PROPERTY LLVM_SYSTEM_LIBS "${llvm_system_libs}") + +if(LLVM_INTEGRATED_CRT_ALLOC) + if(LLVM_INTEGRATED_CRT_ALLOC MATCHES "snmalloc$") + set_property(TARGET LLVMSupport PROPERTY CXX_STANDARD 17) + add_definitions(-D_SILENCE_CXX17_ITERATOR_BASE_CLASS_DEPRECATION_WARNING) + if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" AND + "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "x86_64") + set_property(TARGET LLVMSupport PROPERTY COMPILE_FLAGS "-mcx16") + endif() + endif() +endif() + if(LLVM_WITH_Z3) target_include_directories(LLVMSupport SYSTEM PRIVATE diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp index aad50e1240341..ed32a80a061db 100644 --- a/llvm/lib/Support/KnownBits.cpp +++ b/llvm/lib/Support/KnownBits.cpp @@ -115,13 +115,13 @@ KnownBits KnownBits::umax(const KnownBits &LHS, const KnownBits &RHS) { KnownBits KnownBits::umin(const KnownBits &LHS, const KnownBits &RHS) { // Flip the range of values: [0, 0xFFFFFFFF] <-> [0xFFFFFFFF, 0] - auto Flip = [](KnownBits Val) { return KnownBits(Val.One, Val.Zero); }; + auto Flip = [](const KnownBits &Val) { return KnownBits(Val.One, Val.Zero); }; return Flip(umax(Flip(LHS), Flip(RHS))); } KnownBits KnownBits::smax(const KnownBits &LHS, const KnownBits &RHS) { // Flip the range of values: [-0x80000000, 0x7FFFFFFF] <-> [0, 0xFFFFFFFF] - auto Flip = [](KnownBits Val) { + auto Flip = [](const KnownBits &Val) { unsigned SignBitPosition = Val.getBitWidth() - 1; APInt Zero = Val.Zero; APInt One = Val.One; @@ -134,7 +134,7 @@ KnownBits KnownBits::smax(const KnownBits &LHS, const KnownBits &RHS) { KnownBits KnownBits::smin(const KnownBits &LHS, const KnownBits &RHS) { // Flip the range of values: [-0x80000000, 0x7FFFFFFF] <-> [0xFFFFFFFF, 0] - auto Flip = [](KnownBits Val) { + auto Flip = [](const KnownBits &Val) { unsigned SignBitPosition = Val.getBitWidth() - 1; APInt Zero = Val.One; APInt One = Val.Zero; @@ -145,6 +145,24 @@ KnownBits KnownBits::smin(const KnownBits &LHS, const KnownBits &RHS) { return Flip(umax(Flip(LHS), Flip(RHS))); } +KnownBits KnownBits::abs() const { + // If the source's MSB is zero then we know the rest of the bits already. + if (isNonNegative()) + return *this; + + // Assume we know nothing. + KnownBits KnownAbs(getBitWidth()); + + // We only know that the absolute value's MSB will be zero iff there is + // a set bit that isn't the sign bit (otherwise it could be INT_MIN). + APInt Val = One; + Val.clearSignBit(); + if (!Val.isNullValue()) + KnownAbs.Zero.setSignBit(); + + return KnownAbs; +} + KnownBits &KnownBits::operator&=(const KnownBits &RHS) { // Result bit is 0 if either operand bit is 0.
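// Aside: why KnownBits::abs above can at best learn the sign bit -- a toy
// 8-bit model (not the LLVM KnownBits type). Any bit known set besides the
// sign bit rules out INT8_MIN, the one value whose absolute value keeps the
// sign bit set.
#include <cassert>
#include <cstdint>

struct ToyKnown {
  uint8_t Zero = 0, One = 0; // a set bit means "known 0" / "known 1"
};

static ToyKnown toyAbs(ToyKnown V) {
  if (V.Zero & 0x80)
    return V;       // MSB known zero: non-negative, abs changes nothing
  ToyKnown R;       // otherwise assume nothing...
  if (V.One & 0x7f) // ...unless a non-sign bit is known one:
    R.Zero = 0x80;  // the value cannot be INT8_MIN, so abs clears the MSB
  return R;
}

int main() {
  ToyKnown V;
  V.One = 0x02;                   // bit 1 known set, sign still unknown
  assert(toyAbs(V).Zero == 0x80); // the result's sign bit is known zero
}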
Zero |= RHS.Zero; diff --git a/llvm/lib/Support/TrigramIndex.cpp b/llvm/lib/Support/TrigramIndex.cpp index 88375e6e78639..1f1f3022b0b30 100644 --- a/llvm/lib/Support/TrigramIndex.cpp +++ b/llvm/lib/Support/TrigramIndex.cpp @@ -15,12 +15,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/TrigramIndex.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" - #include -#include -#include using namespace llvm; diff --git a/llvm/lib/Support/X86TargetParser.cpp b/llvm/lib/Support/X86TargetParser.cpp index a5af98582452b..b7d9bd4f865c9 100644 --- a/llvm/lib/Support/X86TargetParser.cpp +++ b/llvm/lib/Support/X86TargetParser.cpp @@ -529,7 +529,7 @@ static constexpr FeatureBitset ImpliedFeaturesAVX5124FMAPS = {}; static constexpr FeatureBitset ImpliedFeaturesAVX5124VNNIW = {}; // SSE4_A->FMA4->XOP chain. -static constexpr FeatureBitset ImpliedFeaturesSSE4_A = FeatureSSSE3; +static constexpr FeatureBitset ImpliedFeaturesSSE4_A = FeatureSSE3; static constexpr FeatureBitset ImpliedFeaturesFMA4 = FeatureAVX | FeatureSSE4_A; static constexpr FeatureBitset ImpliedFeaturesXOP = FeatureFMA4; diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp index 83050c8574d9d..48b42fec0acdf 100644 --- a/llvm/lib/Support/raw_ostream.cpp +++ b/llvm/lib/Support/raw_ostream.cpp @@ -12,7 +12,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Config/config.h" #include "llvm/Support/Compiler.h" @@ -30,7 +29,6 @@ #include #include #include -#include // may provide O_BINARY. #if defined(HAVE_FCNTL_H) @@ -620,8 +618,9 @@ raw_fd_ostream::raw_fd_ostream(StringRef Filename, std::error_code &EC, /// FD is the file descriptor that this writes to. If ShouldClose is true, this /// closes the file when the stream is destroyed. -raw_fd_ostream::raw_fd_ostream(int fd, bool shouldClose, bool unbuffered) - : raw_pwrite_stream(unbuffered), FD(fd), ShouldClose(shouldClose) { +raw_fd_ostream::raw_fd_ostream(int fd, bool shouldClose, bool unbuffered, + OStreamKind K) + : raw_pwrite_stream(unbuffered, K), FD(fd), ShouldClose(shouldClose) { if (FD < 0 ) { ShouldClose = false; return; @@ -904,6 +903,37 @@ raw_ostream &llvm::nulls() { return S; } +//===----------------------------------------------------------------------===// +// File Streams +//===----------------------------------------------------------------------===// + +raw_fd_stream::raw_fd_stream(StringRef Filename, std::error_code &EC) + : raw_fd_ostream(getFD(Filename, EC, sys::fs::CD_CreateAlways, + sys::fs::FA_Write | sys::fs::FA_Read, + sys::fs::OF_None), + true, false, OStreamKind::OK_FDStream) { + if (EC) + return; + + // Do not support non-seekable files. 
+ if (!supportsSeeking()) + EC = std::make_error_code(std::errc::invalid_argument); +} + +ssize_t raw_fd_stream::read(char *Ptr, size_t Size) { + assert(get_fd() >= 0 && "File already closed."); + ssize_t Ret = ::read(get_fd(), (void *)Ptr, Size); + if (Ret >= 0) + inc_pos(Ret); + else + error_detected(std::error_code(errno, std::generic_category())); + return Ret; +} + +bool raw_fd_stream::classof(const raw_ostream *OS) { + return OS->get_kind() == OStreamKind::OK_FDStream; +} + //===----------------------------------------------------------------------===// // raw_string_ostream //===----------------------------------------------------------------------===// diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index d3db004196b8b..3c40d45c1e051 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -128,12 +128,12 @@ bool StringRecTy::typeIsConvertibleTo(const RecTy *RHS) const { } std::string ListRecTy::getAsString() const { - return "list<" + Ty->getAsString() + ">"; + return "list<" + ElementTy->getAsString() + ">"; } bool ListRecTy::typeIsConvertibleTo(const RecTy *RHS) const { if (const auto *ListTy = dyn_cast(RHS)) - return Ty->typeIsConvertibleTo(ListTy->getElementType()); + return ElementTy->typeIsConvertibleTo(ListTy->getElementType()); return false; } diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 3a94820dac8d3..da8447f91f366 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -32,6 +32,7 @@ #include "llvm/BinaryFormat/COFF.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/FaultMaps.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" @@ -69,12 +70,13 @@ namespace { class AArch64AsmPrinter : public AsmPrinter { AArch64MCInstLower MCInstLowering; StackMaps SM; + FaultMaps FM; const AArch64Subtarget *STI; public: AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this), - SM(*this) {} + SM(*this), FM(*this) {} StringRef getPassName() const override { return "AArch64 Assembly Printer"; } @@ -95,6 +97,9 @@ class AArch64AsmPrinter : public AsmPrinter { const MachineInstr &MI); void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, const MachineInstr &MI); + void LowerSTATEPOINT(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI); + void LowerFAULTING_OP(const MachineInstr &MI); void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI); void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI); @@ -221,26 +226,9 @@ void AArch64AsmPrinter::emitStartOfAsmFile(Module &M) { return; // Emit a .note.gnu.property section with the flags. - MCSection *Cur = OutStreamer->getCurrentSectionOnly(); - MCSection *Nt = MMI->getContext().getELFSection( - ".note.gnu.property", ELF::SHT_NOTE, ELF::SHF_ALLOC); - OutStreamer->SwitchSection(Nt); - - // Emit the note header. - emitAlignment(Align(8)); - OutStreamer->emitInt32(4); // data size for "GNU\0" - OutStreamer->emitInt32(4 * 4); // Elf_Prop size - OutStreamer->emitInt32(ELF::NT_GNU_PROPERTY_TYPE_0); - OutStreamer->emitBytes(StringRef("GNU", 4)); // note name - - // Emit the PAC/BTI properties. 
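// Aside: the note the removed lines here emitted by hand, and which the
// target streamer's emitNoteSection now produces, has this layout (sketch
// reconstructed from the emitInt32 calls around this point):
//
//   .section .note.gnu.property, "a", @note
//   .p2align 3
//   .word 4                          // namesz: "GNU" plus NUL
//   .word 16                         // descsz: one 16-byte Elf_Prop
//   .word NT_GNU_PROPERTY_TYPE_0     // note type
//   .asciz "GNU"                     // note name
//   .word GNU_PROPERTY_AARCH64_FEATURE_1_AND
//   .word 4                          // property data size
//   .word Flags                      // BTI/PAC feature bits
//   .word 0                          // pad to an 8-byte boundary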
- OutStreamer->emitInt32(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND); - OutStreamer->emitInt32(4); // data size - OutStreamer->emitInt32(Flags); // data - OutStreamer->emitInt32(0); // pad - - OutStreamer->endSection(Nt); - OutStreamer->SwitchSection(Cur); + if (auto *TS = static_cast( + OutStreamer->getTargetStreamer())) + TS->emitNoteSection(Flags); } void AArch64AsmPrinter::emitFunctionHeaderComment() { @@ -539,7 +527,11 @@ void AArch64AsmPrinter::emitEndOfAsmFile(Module &M) { // generates code that does this, it is always safe to set. OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols); } + + // Emit stack and fault map information. emitStackMaps(SM); + FM.serializeToFaultMapSection(); + } void AArch64AsmPrinter::EmitLOHs() { @@ -944,6 +936,83 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); } +void AArch64AsmPrinter::LowerSTATEPOINT(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI) { + StatepointOpers SOpers(&MI); + if (unsigned PatchBytes = SOpers.getNumPatchBytes()) { + assert(PatchBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); + for (unsigned i = 0; i < PatchBytes; i += 4) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); + } else { + // Lower call target and choose correct opcode + const MachineOperand &CallTarget = SOpers.getCallTarget(); + MCOperand CallTargetMCOp; + unsigned CallOpcode; + switch (CallTarget.getType()) { + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + MCInstLowering.lowerOperand(CallTarget, CallTargetMCOp); + CallOpcode = AArch64::BL; + break; + case MachineOperand::MO_Immediate: + CallTargetMCOp = MCOperand::createImm(CallTarget.getImm()); + CallOpcode = AArch64::BL; + break; + case MachineOperand::MO_Register: + CallTargetMCOp = MCOperand::createReg(CallTarget.getReg()); + CallOpcode = AArch64::BLR; + break; + default: + llvm_unreachable("Unsupported operand type in statepoint call target"); + break; + } + + EmitToStreamer(OutStreamer, + MCInstBuilder(CallOpcode).addOperand(CallTargetMCOp)); + } + + auto &Ctx = OutStreamer.getContext(); + MCSymbol *MILabel = Ctx.createTempSymbol(); + OutStreamer.emitLabel(MILabel); + SM.recordStatepoint(*MILabel, MI); +} + +void AArch64AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI) { + // FAULTING_LOAD_OP , , , + // , + + Register DefRegister = FaultingMI.getOperand(0).getReg(); + FaultMaps::FaultKind FK = + static_cast(FaultingMI.getOperand(1).getImm()); + MCSymbol *HandlerLabel = FaultingMI.getOperand(2).getMBB()->getSymbol(); + unsigned Opcode = FaultingMI.getOperand(3).getImm(); + unsigned OperandsBeginIdx = 4; + + auto &Ctx = OutStreamer->getContext(); + MCSymbol *FaultingLabel = Ctx.createTempSymbol(); + OutStreamer->emitLabel(FaultingLabel); + + assert(FK < FaultMaps::FaultKindMax && "Invalid Faulting Kind!"); + FM.recordFaultingOp(FK, FaultingLabel, HandlerLabel); + + MCInst MI; + MI.setOpcode(Opcode); + + if (DefRegister != (Register)0) + MI.addOperand(MCOperand::createReg(DefRegister)); + + for (auto I = FaultingMI.operands_begin() + OperandsBeginIdx, + E = FaultingMI.operands_end(); + I != E; ++I) { + MCOperand Dest; + lowerOperand(*I, Dest); + MI.addOperand(Dest); + } + + OutStreamer->AddComment("on-fault: " + HandlerLabel->getName()); + OutStreamer->emitInstruction(MI, getSubtargetInfo()); +} + void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) { Register DestReg = MI.getOperand(0).getReg(); if 
(STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) { @@ -1225,6 +1294,12 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { case TargetOpcode::PATCHPOINT: return LowerPATCHPOINT(*OutStreamer, SM, *MI); + case TargetOpcode::STATEPOINT: + return LowerSTATEPOINT(*OutStreamer, SM, *MI); + + case TargetOpcode::FAULTING_OP: + return LowerFAULTING_OP(*MI); + case TargetOpcode::PATCHABLE_FUNCTION_ENTER: LowerPATCHABLE_FUNCTION_ENTER(*MI); return; diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 5fa44606488be..2187b6121421a 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -19,7 +19,6 @@ def fconstant_to_constant : GICombineRule< def AArch64PreLegalizerCombinerHelper: GICombinerHelper< "AArch64GenPreLegalizerCombinerHelper", [all_combines, - elide_br_by_inverting_cond, fconstant_to_constant]> { let DisableRuleOption = "aarch64prelegalizercombiner-disable-rule"; let StateClass = "AArch64PreLegalizerCombinerHelperState"; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 063644716a654..56533d5eadf78 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -145,6 +145,9 @@ static bool isMergePassthruOpcode(unsigned Opc) { case AArch64ISD::FROUND_MERGE_PASSTHRU: case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU: case AArch64ISD::FTRUNC_MERGE_PASSTHRU: + case AArch64ISD::FCVTZU_MERGE_PASSTHRU: + case AArch64ISD::FCVTZS_MERGE_PASSTHRU: + case AArch64ISD::FSQRT_MERGE_PASSTHRU: return true; } } @@ -742,6 +745,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::GlobalAddress); @@ -944,6 +948,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, for (MVT VT : MVT::integer_scalable_vector_valuetypes()) { if (isTypeLegal(VT)) { setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::FP_TO_UINT, VT, Custom); + setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); @@ -964,8 +970,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } - for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) + for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + } setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); @@ -988,6 +996,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FROUND, VT, Custom); setOperationAction(ISD::FROUNDEVEN, VT, Custom); setOperationAction(ISD::FTRUNC, VT, Custom); + setOperationAction(ISD::FSQRT, VT, Custom); } } @@ -1165,6 +1174,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { setOperationAction(ISD::UDIV, VT, Custom); setOperationAction(ISD::UMAX, VT, Custom); setOperationAction(ISD::UMIN, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::XOR, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); } @@ 
-1500,6 +1510,9 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO) MAKE_CASE(AArch64ISD::ADC) MAKE_CASE(AArch64ISD::SBC) @@ -1802,6 +1815,7 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: + case TargetOpcode::STATEPOINT: return emitPatchPoint(MI, BB); case AArch64::CATCHRET: @@ -2864,6 +2878,14 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, // in the cost tables. EVT InVT = Op.getOperand(0).getValueType(); EVT VT = Op.getValueType(); + + if (VT.isScalableVector()) { + unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT + ? AArch64ISD::FCVTZU_MERGE_PASSTHRU + : AArch64ISD::FCVTZS_MERGE_PASSTHRU; + return LowerToPredicatedOp(Op, DAG, Opcode); + } + unsigned NumElts = InVT.getVectorNumElements(); // f16 conversions are promoted to f32 when full fp16 is not supported. @@ -3382,6 +3404,17 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::aarch64_sve_frintz: return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_fcvtzu: + return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl, + Op.getValueType(), Op.getOperand(2), Op.getOperand(3), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_fcvtzs: + return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl, + Op.getValueType(), Op.getOperand(2), Op.getOperand(3), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_fsqrt: + return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_convert_to_svbool: { EVT OutVT = Op.getValueType(); EVT InVT = Op.getOperand(1).getValueType(); @@ -3693,6 +3726,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU); case ISD::FTRUNC: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU); + case ISD::FSQRT: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU); case ISD::FP_ROUND: case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG); @@ -3821,6 +3856,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED); case ISD::FMINNUM: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED); + case ISD::VSELECT: + return LowerFixedLengthVectorSelectToSVE(Op, DAG); } } @@ -5271,7 +5308,7 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, SDValue FuncTLVGet = DAG.getLoad( PtrMemVT, DL, Chain, DescAddr, MachinePointerInfo::getGOT(DAG.getMachineFunction()), - /* Alignment = */ PtrMemVT.getSizeInBits() / 8, + Align(PtrMemVT.getSizeInBits() / 8), MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); Chain = FuncTLVGet.getValue(1); @@ -6300,8 +6337,8 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, // void *__stack at offset 0 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); - MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, - MachinePointerInfo(SV), /* Alignment = */ 8)); + 
MemOps.push_back( + DAG.getStore(Chain, DL, Stack, VAList, MachinePointerInfo(SV), Align(8))); // void *__gr_top at offset 8 int GPRSize = FuncInfo->getVarArgsGPRSize(); @@ -6316,8 +6353,7 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, DAG.getConstant(GPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, - MachinePointerInfo(SV, 8), - /* Alignment = */ 8)); + MachinePointerInfo(SV, 8), Align(8))); } // void *__vr_top at offset 16 @@ -6332,23 +6368,22 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, DAG.getConstant(FPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, - MachinePointerInfo(SV, 16), - /* Alignment = */ 8)); + MachinePointerInfo(SV, 16), Align(8))); } // int __gr_offs at offset 24 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT)); - MemOps.push_back(DAG.getStore( - Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr, - MachinePointerInfo(SV, 24), /* Alignment = */ 4)); + MemOps.push_back( + DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), + GROffsAddr, MachinePointerInfo(SV, 24), Align(4))); // int __vr_offs at offset 28 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT)); - MemOps.push_back(DAG.getStore( - Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr, - MachinePointerInfo(SV, 28), /* Alignment = */ 4)); + MemOps.push_back( + DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), + VROffsAddr, MachinePointerInfo(SV, 28), Align(4))); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } @@ -7299,7 +7334,11 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, continue; } - assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits()); + if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) { + LLVM_DEBUG( + dbgs() << "Reshuffle failed: result vector too small to extract\n"); + return SDValue(); + } if (Src.MaxElt - Src.MinElt >= NumSrcElts) { LLVM_DEBUG( @@ -7328,6 +7367,13 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, DAG.getConstant(NumSrcElts, dl, MVT::i64)); unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); + if (!SrcVT.is64BitVector()) { + LLVM_DEBUG( + dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT " + "for SVE vectors."); + return SDValue(); + } + Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, VEXTSrc2, DAG.getConstant(Imm, dl, MVT::i32)); @@ -9085,7 +9131,8 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, // If this is extracting the upper 64-bits of a 128-bit vector, we match // that directly. - if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64) + if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 && + InVT.getSizeInBits() == 128) return Op; return SDValue(); @@ -9099,9 +9146,34 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, EVT InVT = Op.getOperand(1).getValueType(); unsigned Idx = cast(Op.getOperand(2))->getZExtValue(); - // We don't have any patterns for scalable vector yet. - if (InVT.isScalableVector()) + if (InVT.isScalableVector()) { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + if (!isTypeLegal(VT) || !VT.isInteger()) + return SDValue(); + + SDValue Vec0 = Op.getOperand(0); + SDValue Vec1 = Op.getOperand(1); + + // Ensure the subvector is half the size of the main vector. 
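// Aside: a toy scalar model (std::vector standing in for scalable registers)
// of the INSERT_SUBVECTOR lowering built below. On the widened element type,
// uzp1 behaves as "truncate both inputs and concatenate", so inserting the
// low half is uzp1(extend(Vec1), uunpkhi(Vec0)), and the high half is the
// mirror image with uunpklo.
#include <cstdint>
#include <vector>

using Narrow = std::vector<uint8_t>; // models e.g. nxv16i8
using Wide = std::vector<uint16_t>;  // models e.g. nxv8i16

static Wide uunpkhi(const Narrow &v) { // widen the high half of v
  return Wide(v.begin() + v.size() / 2, v.end());
}
static Narrow uzp1(const Wide &a, const Wide &b) { // keep the low byte of
  Narrow r;                                        // every element, i.e.
  for (uint16_t e : a)                             // truncate and concatenate
    r.push_back(uint8_t(e));
  for (uint16_t e : b)
    r.push_back(uint8_t(e));
  return r;
}
// insert(Vec0, Vec1, /*Idx=*/0) == uzp1(extend(Vec1), uunpkhi(Vec0)):
// Vec1's elements followed by the untouched high half of Vec0.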
+ if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2)) + return SDValue(); + + // Extend elements of smaller vector... + EVT WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext())); + SDValue ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1); + + if (Idx == 0) { + SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0); + return DAG.getNode(AArch64ISD::UZP1, DL, VT, ExtVec, HiVec0); + } else if (Idx == InVT.getVectorMinNumElements()) { + SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0); + return DAG.getNode(AArch64ISD::UZP1, DL, VT, LoVec0, ExtVec); + } + return SDValue(); + } // This will be matched by custom code during ISelDAGToDAG. if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef()) @@ -9502,14 +9574,12 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, case ISD::VECREDUCE_UMIN: return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG); case ISD::VECREDUCE_FMAX: { - assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag"); return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32), Op.getOperand(0)); } case ISD::VECREDUCE_FMIN: { - assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag"); return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32), @@ -11548,6 +11618,60 @@ performVectorTruncateCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return ResultHADD; } +static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) { + switch (Opcode) { + case ISD::FADD: + return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64; + case ISD::ADD: + return VT == MVT::i64; + default: + return false; + } +} + +static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); + ConstantSDNode *ConstantN1 = dyn_cast(N1); + + EVT VT = N->getValueType(0); + const bool FullFP16 = + static_cast(DAG.getSubtarget()).hasFullFP16(); + + // Rewrite for pairwise fadd pattern + // (f32 (extract_vector_elt + // (fadd (vXf32 Other) + // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0)) + // -> + // (f32 (fadd (extract_vector_elt (vXf32 Other) 0) + // (extract_vector_elt (vXf32 Other) 1)) + if (ConstantN1 && ConstantN1->getZExtValue() == 0 && + hasPairwiseAdd(N0->getOpcode(), VT, FullFP16)) { + SDLoc DL(N0); + SDValue N00 = N0->getOperand(0); + SDValue N01 = N0->getOperand(1); + + ShuffleVectorSDNode *Shuffle = dyn_cast(N01); + SDValue Other = N00; + + // And handle the commutative case. 
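// Aside: the rewrite this combine performs, written as pseudo-SelectionDAG
// (sketch; only the lane indices matter):
//
//   t1: v2f32 = vector_shuffle<1,u> t0, undef
//   t2: v2f32 = fadd t0, t1
//   t3: f32   = extract_vector_elt t2, 0    ; computes t0[0] + t0[1]
// becomes
//   t4: f32 = extract_vector_elt t0, 0
//   t5: f32 = extract_vector_elt t0, 1
//   t3: f32 = fadd t4, t5                   ; later matched to faddp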
+ if (!Shuffle) { + Shuffle = dyn_cast(N00); + Other = N01; + } + + if (Shuffle && Shuffle->getMaskElt(0) == 1 && + Other == Shuffle->getOperand(0)) { + return DAG.getNode(N0->getOpcode(), DL, VT, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, + DAG.getConstant(0, DL, MVT::i64)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, + DAG.getConstant(1, DL, MVT::i64))); + } + } + + return SDValue(); +} + static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -13001,6 +13125,31 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, S->getMemOperand()->getFlags()); } +static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + EVT ResVT = N->getValueType(0); + + // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z) + if (Op0.getOpcode() == AArch64ISD::UUNPKLO) { + if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) { + SDValue X = Op0.getOperand(0).getOperand(0); + return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1); + } + } + + // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z) + if (Op1.getOpcode() == AArch64ISD::UUNPKHI) { + if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) { + SDValue Z = Op1.getOperand(0).getOperand(1); + return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z); + } + } + + return SDValue(); +} + /// Target-specific DAG combine function for post-increment LD1 (lane) and /// post-increment LD1R. static SDValue performPostLD1Combine(SDNode *N, @@ -14342,8 +14491,12 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performPostLD1Combine(N, DCI, false); case AArch64ISD::NVCAST: return performNVCASTCombine(N); + case AArch64ISD::UZP1: + return performUzpCombine(N, DAG); case ISD::INSERT_VECTOR_ELT: return performPostLD1Combine(N, DCI, true); + case ISD::EXTRACT_VECTOR_ELT: + return performExtractVectorEltCombine(N, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { @@ -15738,6 +15891,31 @@ SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op, return convertFromScalableVector(DAG, VT, ScalableRes); } +SDValue +AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDLoc DL(Op); + + EVT InVT = Op.getOperand(1).getValueType(); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); + SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1)); + SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2)); + + // Convert the mask to a predicate (NOTE: We don't need to worry about + // inactive lanes since VSELECT is safe when given undefined elements).
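// Aside: shape of the rewrite performed here, as pseudo-IR (the fixed-width
// types are examples only). A fixed-length select such as
//   (v8i16 vselect v8i16 mask, v8i16 a, v8i16 b)
// is re-expressed inside an SVE container as
//   (nxv8i16 vselect (nxv8i1 truncate mask'), a', b')
// where mask', a', b' are the operands placed into scalable vectors, and the
// result is converted back to v8i16 afterwards.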
+ EVT MaskVT = Op.getOperand(0).getValueType(); + EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT); + auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0)); + Mask = DAG.getNode(ISD::TRUNCATE, DL, + MaskContainerVT.changeVectorElementType(MVT::i1), Mask); + + auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT, + Mask, Op1, Op2); + + return convertFromScalableVector(DAG, VT, ScalableRes); +} + SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE( SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index d6e511891752a..3d6f47ebcdccf 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -102,7 +102,10 @@ enum NodeType : unsigned { FRINT_MERGE_PASSTHRU, FROUND_MERGE_PASSTHRU, FROUNDEVEN_MERGE_PASSTHRU, + FSQRT_MERGE_PASSTHRU, FTRUNC_MERGE_PASSTHRU, + FCVTZU_MERGE_PASSTHRU, + FCVTZS_MERGE_PASSTHRU, SIGN_EXTEND_INREG_MERGE_PASSTHRU, ZERO_EXTEND_INREG_MERGE_PASSTHRU, @@ -415,12 +418,14 @@ namespace { // Any instruction that defines a 32-bit result zeros out the high half of the // register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may // be copying from a truncate. But any other 32-bit operation will zero-extend -// up to 64 bits. +// up to 64 bits. AssertSext/AssertZext aren't saying anything about the upper +// 32 bits, they're probably just qualifying a CopyFromReg. // FIXME: X86 also checks for CMOV here. Do we need something similar? static inline bool isDef32(const SDNode &N) { unsigned Opc = N.getOpcode(); return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG && - Opc != ISD::CopyFromReg; + Opc != ISD::CopyFromReg && Opc != ISD::AssertSext && + Opc != ISD::AssertZext; } } // end anonymous namespace @@ -917,6 +922,7 @@ class AArch64TargetLowering : public TargetLowering { SDValue LowerFixedLengthVectorIntExtendToSVE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFixedLengthVectorLoadToSVE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFixedLengthVectorSelectToSVE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFixedLengthVectorSetccToSVE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFixedLengthVectorStoreToSVE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFixedLengthVectorTruncateToSVE(SDValue Op, diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 25d478ebfc055..61155087cbe28 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -3939,7 +3939,7 @@ class LoadPreIdx sz, bit V, bits<2> opc, RegisterOperand regtype, (outs GPR64sp:$wback, regtype:$Rt), (ins GPR64sp:$Rn, simm9:$offset), asm, "$Rn = $wback,@earlyclobber $wback", []>, - Sched<[WriteLD, WriteAdr]>; + Sched<[WriteAdr, WriteLD]>; let mayStore = 1, mayLoad = 0 in class StorePreIdx sz, bit V, bits<2> opc, RegisterOperand regtype, @@ -3985,7 +3985,7 @@ class LoadPostIdx sz, bit V, bits<2> opc, RegisterOperand regtype, (outs GPR64sp:$wback, regtype:$Rt), (ins GPR64sp:$Rn, simm9:$offset), asm, "$Rn = $wback,@earlyclobber $wback", []>, - Sched<[WriteLD, WriteAdr]>; + Sched<[WriteAdr, WriteLD]>; let mayStore = 1, mayLoad = 0 in class StorePostIdx sz, bit V, bits<2> opc, RegisterOperand regtype, @@ -4082,7 +4082,7 @@ class LoadPairPreIdx opc, bit V, RegisterOperand regtype, : BaseLoadStorePairPreIdx, - Sched<[WriteLD, WriteLDHi, WriteAdr]>; + 
Sched<[WriteAdr, WriteLD, WriteLDHi]>; let mayStore = 1, mayLoad = 0 in class StorePairPreIdx opc, bit V, RegisterOperand regtype, @@ -4123,7 +4123,7 @@ class LoadPairPostIdx opc, bit V, RegisterOperand regtype, : BaseLoadStorePairPostIdx, - Sched<[WriteLD, WriteLDHi, WriteAdr]>; + Sched<[WriteAdr, WriteLD, WriteLDHi]>; let mayStore = 1, mayLoad = 0 in class StorePairPostIdx opc, bit V, RegisterOperand regtype, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 9e37d0292e7a7..3d1cf767cfca6 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -107,6 +107,13 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { NumBytes = PatchPointOpers(&MI).getNumPatchBytes(); assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); break; + case TargetOpcode::STATEPOINT: + NumBytes = StatepointOpers(&MI).getNumPatchBytes(); + assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); + // No patch bytes means a normal call inst is emitted + if (NumBytes == 0) + NumBytes = 4; + break; case AArch64::TLSDESC_CALLSEQ: // This gets lowered to an instruction sequence which takes 16 bytes NumBytes = 16; @@ -321,6 +328,56 @@ bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB, return true; } +bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB, + MachineBranchPredicate &MBP, + bool AllowModify) const { + // For the moment, handle only a block which ends with a cb(n)zx followed by + // a fallthrough. Why this? Because it is a common form. + // TODO: Should we handle b.cc? + + MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); + if (I == MBB.end()) + return true; + + // Skip over SpeculationBarrierEndBB terminators + if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB || + I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) { + --I; + } + + if (!isUnpredicatedTerminator(*I)) + return true; + + // Get the last instruction in the block. + MachineInstr *LastInst = &*I; + unsigned LastOpc = LastInst->getOpcode(); + if (!isCondBranchOpcode(LastOpc)) + return true; + + switch (LastOpc) { + default: + return true; + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: + break; + }; + + MBP.TrueDest = LastInst->getOperand(1).getMBB(); + assert(MBP.TrueDest && "expected!"); + MBP.FalseDest = MBB.getNextNode(); + + MBP.ConditionDef = nullptr; + MBP.SingleUseCondition = false; + + MBP.LHS = LastInst->getOperand(0); + MBP.RHS = MachineOperand::CreateImm(0); + MBP.Predicate = LastOpc == AArch64::CBNZX ? 
MachineBranchPredicate::PRED_NE + : MachineBranchPredicate::PRED_EQ; + return false; +} + bool AArch64InstrInfo::reverseBranchCondition( SmallVectorImpl<MachineOperand> &Cond) const { if (Cond[0].getImm() != -1) { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 298c04d81708d..1a21d8474e071 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -188,6 +188,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, bool AllowModify = false) const override; + bool analyzeBranchPredicate(MachineBasicBlock &MBB, + MachineBranchPredicate &MBP, + bool AllowModify) const override; unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved = nullptr) const override; unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 85cb230517433..06e88b7b2045f 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -3802,7 +3802,7 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, // Floating point immediate move. //===----------------------------------------------------------------------===// -let isReMaterializable = 1 in { +let isReMaterializable = 1, isAsCheapAsAMove = 1 in { defm FMOV : FPMoveImmediate<"fmov">; } @@ -7482,6 +7482,9 @@ def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)), def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)), (vector_extract (v4f32 FPR128:$Rn), (i64 1))), (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; +def : Pat<(fadd (vector_extract (v8f16 FPR128:$Rn), (i64 0)), + (vector_extract (v8f16 FPR128:$Rn), (i64 1))), + (f16 (FADDPv2i16p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; // Scalar 64-bit shifts in FPR64 registers. def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 9562269336d8d..12e938c0f66ce 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -20,7 +20,6 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCLinkerOptimizationHint.h" #include <cassert> diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 2f1317d8f1ea8..e0685d766655a 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -611,9 +611,10 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(FIOperandNum).getTargetFlags() & AArch64II::MO_TAGGED; Register FrameReg; - // Special handling of dbg_value, stackmap and patchpoint instructions. + // Special handling of dbg_value, stackmap, patchpoint, and statepoint instructions.
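// For instance (hypothetical MIR, for illustration only): a statepoint
// that keeps a deopt value alive in a stack slot,
//   STATEPOINT 0, 0, 0, @callee, 2, 0, %stack.0, 0, ...
// now resolves %stack.0 through the same direct FP-relative offset path
// that STACKMAP and PATCHPOINT already take here.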
if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP || - MI.getOpcode() == TargetOpcode::PATCHPOINT) { + MI.getOpcode() == TargetOpcode::PATCHPOINT || + MI.getOpcode() == TargetOpcode::STATEPOINT) { StackOffset Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, /*PreferFP=*/true, @@ -734,3 +735,19 @@ unsigned AArch64RegisterInfo::getLocalAddressRegister( return getBaseRegister(); return getFrameRegister(MF); } + +/// SrcRC and DstRC will be morphed into NewRC if this returns true +bool AArch64RegisterInfo::shouldCoalesce( + MachineInstr *MI, const TargetRegisterClass *SrcRC, unsigned SubReg, + const TargetRegisterClass *DstRC, unsigned DstSubReg, + const TargetRegisterClass *NewRC, LiveIntervals &LIS) const { + if (MI->isCopy() && + ((DstRC->getID() == AArch64::GPR64RegClassID) || + (DstRC->getID() == AArch64::GPR64commonRegClassID)) && + MI->getOperand(0).getSubReg() && MI->getOperand(1).getSubReg()) + // Do not coalesce in the case of a 32-bit subregister copy + // which implements a 32 to 64 bit zero extension + // which relies on the upper 32 bits being zeroed. + return false; + return true; +} diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index e3c8a77f433f8..d7580d7b68330 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -129,6 +129,12 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo { unsigned getLocalAddressRegister(const MachineFunction &MF) const; bool regNeedsCFI(unsigned Reg, unsigned &RegToUseForCFI) const; + + /// SrcRC and DstRC will be morphed into NewRC if this returns true + bool shouldCoalesce(MachineInstr *MI, const TargetRegisterClass *SrcRC, + unsigned SubReg, const TargetRegisterClass *DstRC, + unsigned DstSubReg, const TargetRegisterClass *NewRC, + LiveIntervals &LIS) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index e01a34242a8d7..fbe4b01a259af 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -209,6 +209,15 @@ def AArch64frintx_mt : SDNode<"AArch64ISD::FRINT_MERGE_PASSTHRU", SDT_AArch64Ari def AArch64frinta_mt : SDNode<"AArch64ISD::FROUND_MERGE_PASSTHRU", SDT_AArch64Arith>; def AArch64frintn_mt : SDNode<"AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU", SDT_AArch64Arith>; def AArch64frintz_mt : SDNode<"AArch64ISD::FTRUNC_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64fsqrt_mt : SDNode<"AArch64ISD::FSQRT_MERGE_PASSTHRU", SDT_AArch64Arith>; + +def SDT_AArch64FCVT : SDTypeProfile<1, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, + SDTCVecEltisVT<1,i1> +]>; + +def AArch64fcvtzu_mt : SDNode<"AArch64ISD::FCVTZU_MERGE_PASSTHRU", SDT_AArch64FCVT>; +def AArch64fcvtzs_mt : SDNode<"AArch64ISD::FCVTZS_MERGE_PASSTHRU", SDT_AArch64FCVT>; def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>; def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>; @@ -1387,40 +1396,40 @@ multiclass sve_prefetch; defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl", int_aarch64_sve_lsl_wide>; - defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, nxv8f16, nxv4i1, nxv4f32, ElementSizeS>; - defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, nxv4f32, nxv4i1, nxv8f16, ElementSizeS>; - defm 
SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, int_aarch64_sve_scvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; - defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, int_aarch64_sve_scvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; - defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, int_aarch64_sve_ucvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; - defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, int_aarch64_sve_ucvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; - defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, int_aarch64_sve_fcvtzs, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; - defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, int_aarch64_sve_fcvtzs, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; - defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, int_aarch64_sve_fcvtzu, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; - defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, int_aarch64_sve_fcvtzu, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; - defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, nxv8f16, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, nxv2f64, nxv2i1, nxv8f16, ElementSizeD>; - defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, nxv4f32, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, nxv2f64, nxv2i1, nxv4f32, ElementSizeD>; - defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; - defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; - defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>; - defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>; - defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>; - defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>; - defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>; - defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>; - defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, int_aarch64_sve_scvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; - defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, int_aarch64_sve_ucvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; - defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, nxv2i64, 
nxv2i1, nxv4f32, ElementSizeD>; - defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, nxv4i32, nxv4i1, nxv8f16, ElementSizeS>; - defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, nxv2i64, nxv2i1, nxv8f16, ElementSizeD>; - defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, nxv4i32, nxv4i1, nxv8f16, ElementSizeS>; - defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, nxv2i64, nxv2i1, nxv8f16, ElementSizeD>; - defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, nxv2i64, nxv2i1, nxv4f32, ElementSizeD>; - defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, int_aarch64_sve_fcvtzs, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, int_aarch64_sve_fcvtzu, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, null_frag, nxv8f16, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, null_frag, nxv4f32, nxv4i1, nxv8f16, ElementSizeS>; + defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, int_aarch64_sve_scvtf, null_frag, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; + defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, int_aarch64_sve_scvtf, null_frag, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; + defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, int_aarch64_sve_ucvtf, null_frag, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; + defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, int_aarch64_sve_ucvtf, null_frag, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; + defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, null_frag, AArch64fcvtzs_mt, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; + defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, null_frag, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, null_frag, AArch64fcvtzu_mt, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; + defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, null_frag, AArch64fcvtzu_mt, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, null_frag, nxv8f16, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, null_frag, nxv2f64, nxv2i1, nxv8f16, ElementSizeD>; + defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, null_frag, nxv4f32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, null_frag, nxv2f64, nxv2i1, nxv4f32, ElementSizeD>; + defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, null_frag, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; + defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, null_frag, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; + defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, 
ZPR16, int_aarch64_sve_ucvtf_f16i32, null_frag, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>; + defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, null_frag, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>; + defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, null_frag, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>; + defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, null_frag, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, null_frag, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, null_frag, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>; + defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, int_aarch64_sve_scvtf, null_frag, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, int_aarch64_sve_ucvtf, null_frag, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; + defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>; + defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>; + defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>; + defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, AArch64fcvtzu_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>; + defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>; + defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>; + defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, null_frag, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, null_frag, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", null_frag, AArch64frintn_mt>; defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", null_frag, AArch64frintp_mt>; @@ -1430,7 +1439,7 @@ multiclass sve_prefetch; defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti", null_frag, AArch64frinti_mt>; defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", int_aarch64_sve_frecpx>; - defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", int_aarch64_sve_fsqrt>; + defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", null_frag, AArch64fsqrt_mt>; let Predicates = [HasBF16, HasSVE] in { defm BFDOT_ZZZ : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>; diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp 
b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 8b15898c1c140..6df717f030a72 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -455,6 +455,7 @@ void AArch64PassConfig::addIRPasses() { .forwardSwitchCondToPhi(true) .convertSwitchToLookupTable(true) .needCanonicalLoops(false) + .hoistCommonInsts(true) .sinkCommonInsts(true))); // Run LoopDataPrefetch @@ -543,7 +544,7 @@ bool AArch64PassConfig::addInstSelector() { } bool AArch64PassConfig::addIRTranslator() { - addPass(new IRTranslator()); + addPass(new IRTranslator(getOptLevel())); return false; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 5f5da63b21b64..fb23bc641573e 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -192,6 +192,10 @@ int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) return TTI::TCC_Free; break; + case Intrinsic::experimental_gc_statepoint: + if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) + return TTI::TCC_Free; + break; } return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 05b7f70f2335c..3c3a246b90a12 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -223,11 +223,6 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> { // We don't have legalization support for ordered FP reductions. return !II->getFastMathFlags().allowReassoc(); - case Intrinsic::experimental_vector_reduce_fmax: - case Intrinsic::experimental_vector_reduce_fmin: - // Lowering asserts that there are no NaNs. - return !II->getFastMathFlags().noNaNs(); - default: // Don't expand anything else, let legalization deal with it.
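// (With the fmax/fmin cases above removed, a vecreduce_fmax of v4f32
// without the nnan flag is no longer expanded here, for example; the
// AArch64 lowering now selects FMAXNMV directly, whose IEEE maxNum
// semantics match the intrinsic.)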
return false; diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 08a29bbb3e87a..502966c633676 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -5725,7 +5725,7 @@ bool AArch64AsmParser::parseDirectiveSEHSaveRegX(SMLoc L) { bool AArch64AsmParser::parseDirectiveSEHSaveRegP(SMLoc L) { unsigned Reg; int64_t Offset; - if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::LR) || + if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::FP) || parseComma() || parseImmExpr(Offset)) return true; getTargetStreamer().EmitARM64WinCFISaveRegP(Reg, Offset); @@ -5737,7 +5737,7 @@ bool AArch64AsmParser::parseDirectiveSEHSaveRegP(SMLoc L) { bool AArch64AsmParser::parseDirectiveSEHSaveRegPX(SMLoc L) { unsigned Reg; int64_t Offset; - if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::X28) || + if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::FP) || parseComma() || parseImmExpr(Offset)) return true; getTargetStreamer().EmitARM64WinCFISaveRegPX(Reg, Offset); @@ -5789,7 +5789,7 @@ bool AArch64AsmParser::parseDirectiveSEHSaveFRegX(SMLoc L) { bool AArch64AsmParser::parseDirectiveSEHSaveFRegP(SMLoc L) { unsigned Reg; int64_t Offset; - if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D15) || + if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D14) || parseComma() || parseImmExpr(Offset)) return true; getTargetStreamer().EmitARM64WinCFISaveFRegP(Reg, Offset); @@ -5801,7 +5801,7 @@ bool AArch64AsmParser::parseDirectiveSEHSaveFRegP(SMLoc L) { bool AArch64AsmParser::parseDirectiveSEHSaveFRegPX(SMLoc L) { unsigned Reg; int64_t Offset; - if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D15) || + if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D14) || parseComma() || parseImmExpr(Offset)) return true; getTargetStreamer().EmitARM64WinCFISaveFRegPX(Reg, Offset); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index a8d68180bb76a..7307d5b7e1d0c 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Type.h" #include "llvm/IR/IntrinsicsAArch64.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -170,8 +171,57 @@ class AArch64InstructionSelector : public InstructionSelector { emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, MachineIRBuilder &MIRBuilder) const; - MachineInstr *emitADD(Register DefReg, MachineOperand &LHS, MachineOperand &RHS, + MachineInstr *emitInstr(unsigned Opcode, + std::initializer_list<llvm::DstOp> DstOps, + std::initializer_list<llvm::SrcOp> SrcOps, + MachineIRBuilder &MIRBuilder, + const ComplexRendererFns &RenderFns = None) const; + /// Helper function to emit a binary operation such as an ADD, ADDS, etc. + /// + /// This is intended for instructions with the following opcode variants: + /// + /// - Xri, Wri (arithmetic immediate form) + /// - Xrs, Wrs (shifted register form) + /// - Xrr, Wrr (register form) + /// + /// For example, for ADD, we have ADDXri, ADDWri, ADDXrs, etc. + /// + /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants above + /// in a specific order.
+ /// + /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode. + /// + /// \code + /// const std::array<std::array<unsigned, 2>, 3> Table { + /// {{AArch64::ADDXri, AArch64::ADDWri}, + /// {AArch64::ADDXrs, AArch64::ADDWrs}, + /// {AArch64::ADDXrr, AArch64::ADDWrr}}}; + /// \endcode + /// + /// Each row in the table corresponds to a different addressing mode. Each + /// column corresponds to a different register size. + /// + /// \attention Rows must be structured as follows: + /// - Row 0: The ri opcode variants + /// - Row 1: The rs opcode variants + /// - Row 2: The rr opcode variants + /// + /// \attention Columns must be structured as follows: + /// - Column 0: The 64-bit opcode variants + /// - Column 1: The 32-bit opcode variants + /// + /// \p Dst is the destination register of the binop to emit. + /// \p LHS is the left-hand operand of the binop to emit. + /// \p RHS is the right-hand operand of the binop to emit. + MachineInstr *emitBinOp( + const std::array<std::array<unsigned, 2>, 3> &AddrModeAndSizeToOpcode, + Register Dst, MachineOperand &LHS, MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitADD(Register DefReg, MachineOperand &LHS, + MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const; MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; MachineInstr *emitTST(const Register &LHS, const Register &RHS, @@ -1755,6 +1805,17 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const { MachineRegisterInfo &MRI = MF.getRegInfo(); switch (I.getOpcode()) { + case TargetOpcode::G_BR: { + // If the branch jumps to the fallthrough block, don't bother emitting it. + // Only do this for -O0 for a good code size improvement, because when + // optimizations are enabled we want to leave this choice to + // MachineBlockPlacement. + bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOpt::None; + if (EnableOpt || !MBB.isLayoutSuccessor(I.getOperand(0).getMBB())) + return false; + I.eraseFromParent(); + return true; + } case TargetOpcode::G_SHL: return earlySelectSHL(I, MRI); case TargetOpcode::G_CONSTANT: { @@ -2260,21 +2321,22 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { } auto &MemOp = **I.memoperands_begin(); + uint64_t MemSizeInBytes = MemOp.getSize(); if (MemOp.isAtomic()) { // For now we just support s8 acquire loads to be able to compile stack // protector code. if (MemOp.getOrdering() == AtomicOrdering::Acquire && - MemOp.getSize() == 1) { + MemSizeInBytes == 1) { I.setDesc(TII.get(AArch64::LDARB)); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } LLVM_DEBUG(dbgs() << "Atomic load/store not fully supported yet\n"); return false; } - unsigned MemSizeInBits = MemOp.getSize() * 8; + unsigned MemSizeInBits = MemSizeInBytes * 8; - const Register PtrReg = I.getOperand(1).getReg(); #ifndef NDEBUG + const Register PtrReg = I.getOperand(1).getReg(); + const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI); // Sanity-check the pointer register.
assert(PtrRB.getID() == AArch64::GPRRegBankID && @@ -2286,78 +2348,78 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { const Register ValReg = I.getOperand(0).getReg(); const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI); - const unsigned NewOpc = - selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits); - if (NewOpc == I.getOpcode()) - return false; - - I.setDesc(TII.get(NewOpc)); - - uint64_t Offset = 0; - auto *PtrMI = MRI.getVRegDef(PtrReg); - - // Try to fold a GEP into our unsigned immediate addressing mode. - if (PtrMI->getOpcode() == TargetOpcode::G_PTR_ADD) { - if (auto COff = getConstantVRegVal(PtrMI->getOperand(2).getReg(), MRI)) { - int64_t Imm = *COff; - const unsigned Size = MemSizeInBits / 8; - const unsigned Scale = Log2_32(Size); - if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) { - Register Ptr2Reg = PtrMI->getOperand(1).getReg(); - I.getOperand(1).setReg(Ptr2Reg); - PtrMI = MRI.getVRegDef(Ptr2Reg); - Offset = Imm / Size; - } + // Helper lambda for partially selecting I. Either returns the original + // instruction with an updated opcode, or a new instruction. + auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * { + bool IsStore = I.getOpcode() == TargetOpcode::G_STORE; + const unsigned NewOpc = + selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits); + if (NewOpc == I.getOpcode()) + return nullptr; + // Check if we can fold anything into the addressing mode. + auto AddrModeFns = + selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes); + if (!AddrModeFns) { + // Can't fold anything. Use the original instruction. + I.setDesc(TII.get(NewOpc)); + I.addOperand(MachineOperand::CreateImm(0)); + return &I; } - } - // If we haven't folded anything into our addressing mode yet, try to fold - // a frame index into the base+offset. - if (!Offset && PtrMI->getOpcode() == TargetOpcode::G_FRAME_INDEX) - I.getOperand(1).ChangeToFrameIndex(PtrMI->getOperand(1).getIndex()); + // Folded something. Create a new instruction and return it. + auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags()); + IsStore ? NewInst.addUse(ValReg) : NewInst.addDef(ValReg); + NewInst.cloneMemRefs(I); + for (auto &Fn : *AddrModeFns) + Fn(NewInst); + I.eraseFromParent(); + return &*NewInst; + }; - I.addOperand(MachineOperand::CreateImm(Offset)); + MachineInstr *LoadStore = SelectLoadStoreAddressingMode(); + if (!LoadStore) + return false; // If we're storing a 0, use WZR/XZR. if (Opcode == TargetOpcode::G_STORE) { auto CVal = getConstantVRegValWithLookThrough( - ValReg, MRI, /*LookThroughInstrs = */ true, + LoadStore->getOperand(0).getReg(), MRI, /*LookThroughInstrs = */ true, /*HandleFConstants = */ false); if (CVal && CVal->Value == 0) { - unsigned Opc = I.getOpcode(); - switch (Opc) { + switch (LoadStore->getOpcode()) { case AArch64::STRWui: case AArch64::STRHHui: case AArch64::STRBBui: - I.getOperand(0).setReg(AArch64::WZR); + LoadStore->getOperand(0).setReg(AArch64::WZR); break; case AArch64::STRXui: - I.getOperand(0).setReg(AArch64::XZR); + LoadStore->getOperand(0).setReg(AArch64::XZR); break; } } } if (IsZExtLoad) { - // The zextload from a smaller type to i32 should be handled by the importer. - if (MRI.getType(ValReg).getSizeInBits() != 64) + // The zextload from a smaller type to i32 should be handled by the + // importer. + if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64) return false; // If we have a ZEXTLOAD then change the load's type to be a narrower reg - //and zero_extend with SUBREG_TO_REG. 
+ // and zero_extend with SUBREG_TO_REG. Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); - Register DstReg = I.getOperand(0).getReg(); - I.getOperand(0).setReg(LdReg); + Register DstReg = LoadStore->getOperand(0).getReg(); + LoadStore->getOperand(0).setReg(LdReg); - MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); + MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator())); MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {}) .addImm(0) .addUse(LdReg) .addImm(AArch64::sub_32); - constrainSelectedInstRegOperands(I, TII, TRI, RBI); + constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass, MRI); } - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); } case TargetOpcode::G_SMULH: @@ -2449,11 +2511,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { } // Add and set the set condition flag. - unsigned AddsOpc = OpSize == 32 ? AArch64::ADDSWrr : AArch64::ADDSXrr; MachineIRBuilder MIRBuilder(I); - auto AddsMI = MIRBuilder.buildInstr(AddsOpc, {I.getOperand(0)}, - {I.getOperand(2), I.getOperand(3)}); - constrainSelectedInstRegOperands(*AddsMI, TII, TRI, RBI); + emitADDS(I.getOperand(0).getReg(), I.getOperand(2), I.getOperand(3), + MIRBuilder); // Now, put the overflow result in the register given by the first operand // to the G_UADDO. CSINC increments the result when the predicate is false, @@ -3736,55 +3796,70 @@ getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) { return std::make_pair(Opc, SubregIdx); } +MachineInstr *AArch64InstructionSelector::emitInstr( + unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps, + std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder, + const ComplexRendererFns &RenderFns) const { + assert(Opcode && "Expected an opcode?"); + assert(!isPreISelGenericOpcode(Opcode) && + "Function should only be used to produce selected instructions!"); + auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps); + if (RenderFns) + for (auto &Fn : *RenderFns) + Fn(MI); + constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); + return &*MI; +} + +MachineInstr *AArch64InstructionSelector::emitBinOp( + const std::array<std::array<unsigned, 2>, 3> &AddrModeAndSizeToOpcode, + Register Dst, MachineOperand &LHS, MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const { + MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); + assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); + auto Ty = MRI.getType(LHS.getReg()); + assert(Ty.isScalar() && "Expected a scalar?"); + unsigned Size = Ty.getSizeInBits(); + assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only"); + bool Is32Bit = Size == 32; + if (auto Fns = selectArithImmed(RHS)) + return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS}, + MIRBuilder, Fns); + if (auto Fns = selectShiftedRegister(RHS)) + return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS}, + MIRBuilder, Fns); + return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS}, + MIRBuilder); +} + MachineInstr * AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const { - assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); - MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); - static const unsigned OpcTable[2][2]{{AArch64::ADDXrr, AArch64::ADDXri}, - {AArch64::ADDWrr, AArch64::ADDWri}}; - bool Is32Bit = 
MRI.getType(LHS.getReg()).getSizeInBits() == 32; - auto ImmFns = selectArithImmed(RHS); - unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()]; - auto AddMI = MIRBuilder.buildInstr(Opc, {DefReg}, {LHS}); - - // If we matched a valid constant immediate, add those operands. - if (ImmFns) { - for (auto &RenderFn : *ImmFns) - RenderFn(AddMI); - } else { - AddMI.addUse(RHS.getReg()); - } + const std::array<std::array<unsigned, 2>, 3> OpcTable{ + {{AArch64::ADDXri, AArch64::ADDWri}, + {AArch64::ADDXrs, AArch64::ADDWrs}, + {AArch64::ADDXrr, AArch64::ADDWrr}}}; + return emitBinOp(OpcTable, DefReg, LHS, RHS, MIRBuilder); +} - constrainSelectedInstRegOperands(*AddMI, TII, TRI, RBI); - return &*AddMI; +MachineInstr * +AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS, + MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const { + const std::array<std::array<unsigned, 2>, 3> OpcTable{ + {{AArch64::ADDSXri, AArch64::ADDSWri}, + {AArch64::ADDSXrs, AArch64::ADDSWrs}, + {AArch64::ADDSXrr, AArch64::ADDSWrr}}}; + return emitBinOp(OpcTable, Dst, LHS, RHS, MIRBuilder); } MachineInstr * AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const { - assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); - static const unsigned OpcTable[2][2]{{AArch64::ADDSXrr, AArch64::ADDSXri}, - {AArch64::ADDSWrr, AArch64::ADDSWri}}; bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32); - auto ImmFns = selectArithImmed(RHS); - unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()]; - Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR; - - auto CmpMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS}); - - // If we matched a valid constant immediate, add those operands. - if (ImmFns) { - for (auto &RenderFn : *ImmFns) - RenderFn(CmpMI); - } else { - CmpMI.addUse(RHS.getReg()); - } - - constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); - return &*CmpMI; + return emitADDS(Is32Bit ? 
AArch64::WZR : AArch64::XZR, LHS, RHS, MIRBuilder); } MachineInstr * diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 77e5f374c1af0..1dfae8d0ba7cf 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -14,6 +14,7 @@ #include "AArch64LegalizerInfo.h" #include "AArch64Subtarget.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineInstr.h" @@ -62,21 +63,21 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) } getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) - .legalFor({p0, s1, s8, s16, s32, s64, v2s32, v4s32, v2s64, v16s8, v8s16}) - .clampScalar(0, s1, s64) - .widenScalarToNextPow2(0, 8) - .fewerElementsIf( - [=](const LegalityQuery &Query) { - return Query.Types[0].isVector() && - (Query.Types[0].getElementType() != s64 || - Query.Types[0].getNumElements() != 2); - }, - [=](const LegalityQuery &Query) { - LLT EltTy = Query.Types[0].getElementType(); - if (EltTy == s64) - return std::make_pair(0, LLT::vector(2, 64)); - return std::make_pair(0, EltTy); - }); + .legalFor({p0, s1, s8, s16, s32, s64, v2s32, v4s32, v2s64, v16s8, v8s16}) + .clampScalar(0, s1, s64) + .widenScalarToNextPow2(0, 8) + .fewerElementsIf( + [=](const LegalityQuery &Query) { + return Query.Types[0].isVector() && + (Query.Types[0].getElementType() != s64 || + Query.Types[0].getNumElements() != 2); + }, + [=](const LegalityQuery &Query) { + LLT EltTy = Query.Types[0].getElementType(); + if (EltTy == s64) + return std::make_pair(0, LLT::vector(2, 64)); + return std::make_pair(0, EltTy); + }); getActionDefinitionsBuilder(G_PHI) .legalFor({p0, s16, s32, s64, v2s32, v4s32, v2s64}) @@ -97,15 +98,20 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .moreElementsToNextPow2(0); getActionDefinitionsBuilder(G_SHL) - .legalFor({{s32, s32}, {s64, s64}, - {v2s32, v2s32}, {v4s32, v4s32}, {v2s64, v2s64}}) - .clampScalar(1, s32, s64) - .clampScalar(0, s32, s64) - .widenScalarToNextPow2(0) - .clampNumElements(0, v2s32, v4s32) - .clampNumElements(0, v2s64, v2s64) - .moreElementsToNextPow2(0) - .minScalarSameAs(1, 0); + .legalFor({{s32, s32}, + {s64, s64}, + {v2s32, v2s32}, + {v4s32, v4s32}, + {v2s64, v2s64}, + {v16s8, v16s8}, + {v8s16, v8s16}}) + .clampScalar(1, s32, s64) + .clampScalar(0, s32, s64) + .widenScalarToNextPow2(0) + .clampNumElements(0, v2s32, v4s32) + .clampNumElements(0, v2s64, v2s64) + .moreElementsToNextPow2(0) + .minScalarSameAs(1, 0); getActionDefinitionsBuilder(G_PTR_ADD) .legalFor({{p0, s64}, {v2p0, v2s64}}) @@ -132,7 +138,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {s64, s64}, {v2s32, v2s32}, {v4s32, v4s32}, - {v2s64, v2s64}}) + {v2s64, v2s64}, + {v16s8, v16s8}, + {v8s16, v8s16}}) .clampScalar(1, s32, s64) .clampScalar(0, s32, s64) .minScalarSameAs(1, 0); @@ -140,8 +148,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder({G_SREM, G_UREM}) .lowerFor({s1, s8, s16, s32, s64}); - getActionDefinitionsBuilder({G_SMULO, G_UMULO}) - .lowerFor({{s64, s1}}); + getActionDefinitionsBuilder({G_SMULO, G_UMULO}).lowerFor({{s64, s1}}); getActionDefinitionsBuilder({G_SMULH, G_UMULH}).legalFor({s32, s64}); @@ -150,7 +157,7 @@ 
AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .minScalar(0, s32); getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG}) - .legalFor({s32, s64, v2s64, v4s32, v2s32}); + .legalFor({s32, s64, v2s64, v4s32, v2s32}); getActionDefinitionsBuilder(G_FREM).libcallFor({s32, s64}); @@ -262,8 +269,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {v4s32, p0, 128, 8}, {v2s64, p0, 128, 8}}) // These extends are also legal - .legalForTypesWithMemDesc({{s32, p0, 8, 8}, - {s32, p0, 16, 8}}) + .legalForTypesWithMemDesc({{s32, p0, 8, 8}, {s32, p0, 16, 8}}) .clampScalar(0, s8, s64) .lowerIfMemSizeNotPow2() // Lower any any-extending loads left into G_ANYEXT and G_LOAD @@ -285,6 +291,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {p0, p0, 64, 8}, {s128, p0, 128, 8}, {v16s8, p0, 128, 8}, + {v8s8, p0, 64, 8}, {v4s16, p0, 64, 8}, {v8s16, p0, 128, 8}, {v2s32, p0, 64, 8}, @@ -302,7 +309,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) // Constants getActionDefinitionsBuilder(G_CONSTANT) - .legalFor({p0, s8, s16, s32, s64}) + .legalFor({p0, s8, s16, s32, s64}) .clampScalar(0, s8, s64) .widenScalarToNextPow2(0); getActionDefinitionsBuilder(G_FCONSTANT) @@ -378,13 +385,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder(G_TRUNC).alwaysLegal(); - getActionDefinitionsBuilder(G_SEXT_INREG) - .legalFor({s32, s64}) - .lower(); + getActionDefinitionsBuilder(G_SEXT_INREG).legalFor({s32, s64}).lower(); // FP conversions - getActionDefinitionsBuilder(G_FPTRUNC).legalFor( - {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}}); + getActionDefinitionsBuilder(G_FPTRUNC) + .legalFor( + {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}}) + .clampMaxNumElements(0, s32, 2); getActionDefinitionsBuilder(G_FPEXT).legalFor( {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}}); @@ -544,8 +551,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) return BigTy.getSizeInBits() % LitTy.getSizeInBits() == 0; }) // Any vectors left are the wrong size. Scalarize them. - .scalarize(0) - .scalarize(1); + .scalarize(0) + .scalarize(1); } getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) .unsupportedIf([=](const LegalityQuery &Query) { @@ -557,8 +564,32 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .legalIf([=](const LegalityQuery &Query) { const LLT &VecTy = Query.Types[1]; return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 || - VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32; - }); + VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 || + VecTy == v16s8 || VecTy == v2s32; + }) + .minScalarOrEltIf( + [=](const LegalityQuery &Query) { + // We want to promote <M x s1> to <M x s64> if that wouldn't + // cause the total vec size to be > 128b. + return Query.Types[1].getNumElements() <= 2; + }, + 0, s64) + .minScalarOrEltIf( + [=](const LegalityQuery &Query) { + return Query.Types[1].getNumElements() <= 4; + }, + 0, s32) + .minScalarOrEltIf( + [=](const LegalityQuery &Query) { + return Query.Types[1].getNumElements() <= 8; + }, + 0, s16) + .minScalarOrEltIf( + [=](const LegalityQuery &Query) { + return Query.Types[1].getNumElements() <= 16; + }, + 0, s8) + .minScalarOrElt(0, s8); // Worst case, we need at least s8.
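// Illustrative resolutions of the ladder above (assumed examples, not from
// the patch), keeping element size times element count within 128 bits:
//   G_EXTRACT_VECTOR_ELT on <2 x s8>  -> result element widened to s64
//   G_EXTRACT_VECTOR_ELT on <8 x s8>  -> result element widened to s16
//   G_EXTRACT_VECTOR_ELT on <16 x s8> -> result element stays s8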
getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT) .legalIf([=](const LegalityQuery &Query) { @@ -568,7 +599,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) }); getActionDefinitionsBuilder(G_BUILD_VECTOR) - .legalFor({{v4s16, s16}, + .legalFor({{v16s8, s8}, + {v4s16, s16}, {v8s16, s16}, {v2s32, s32}, {v4s32, s32}, @@ -584,8 +616,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) }) .minScalarSameAs(1, 0); - getActionDefinitionsBuilder(G_CTLZ).legalForCartesianProduct( - {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) + getActionDefinitionsBuilder(G_CTLZ) + .legalForCartesianProduct( + {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) .scalarize(1); getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) @@ -613,8 +646,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder(G_CONCAT_VECTORS) .legalFor({{v4s32, v2s32}, {v8s16, v4s16}}); - getActionDefinitionsBuilder(G_JUMP_TABLE) - .legalFor({{p0}, {s64}}); + getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({{p0}, {s64}}); getActionDefinitionsBuilder(G_BRJT).legalIf([=](const LegalityQuery &Query) { return Query.Types[0] == p0 && Query.Types[1] == s64; @@ -653,10 +685,9 @@ bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, llvm_unreachable("expected switch to return"); } -bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder, - GISelChangeObserver &Observer) const { +bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const { assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE); // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP + // G_ADD_LOW instructions. @@ -706,8 +737,8 @@ bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(MachineInstr &MI, return true; } -bool AArch64LegalizerInfo::legalizeIntrinsic( - LegalizerHelper &Helper, MachineInstr &MI) const { +bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, + MachineInstr &MI) const { return true; } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index 48ed68f492635..f32a8f15b8a54 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -11,12 +11,23 @@ //===----------------------------------------------------------------------===// #include "AArch64TargetStreamer.h" +#include "AArch64MCAsmInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/ConstantPools.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/CommandLine.h" using namespace llvm; +static cl::opt<bool> MarkBTIProperty( + "aarch64-mark-bti-property", cl::Hidden, + cl::desc("Add .note.gnu.property with BTI to assembly files"), + cl::init(false)); + // // AArch64TargetStreamer Implementation // @@ -37,8 +48,50 @@ void AArch64TargetStreamer::emitCurrentConstantPool() { ConstantPools->emitForCurrentSection(Streamer); } -// finish() - write out any non-empty assembler constant pools. -void AArch64TargetStreamer::finish() { ConstantPools->emitAll(Streamer); } +// finish() - write out any non-empty assembler constant pools and +// write out the .note.gnu.property section if needed.
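// A sketch of the output of emitNoteSection() below for the BTI flag
// (directive forms assumed; the constants come from the code itself):
//   .section .note.gnu.property,"a",@note
//   .p2align 3
//   .word 4            // namesz: "GNU" plus NUL
//   .word 16           // descsz: one 4*4-byte property
//   .word 5            // type: NT_GNU_PROPERTY_TYPE_0
//   .asciz "GNU"
//   .word 0xc0000000   // GNU_PROPERTY_AARCH64_FEATURE_1_AND
//   .word 4            // pr_datasz
//   .word 1            // GNU_PROPERTY_AARCH64_FEATURE_1_BTI
//   .word 0            // padding to an 8-byte boundary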
+void AArch64TargetStreamer::finish() { + ConstantPools->emitAll(Streamer); + + if (MarkBTIProperty) + emitNoteSection(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI); +} + +void AArch64TargetStreamer::emitNoteSection(unsigned Flags) { + if (Flags == 0) + return; + + MCStreamer &OutStreamer = getStreamer(); + MCContext &Context = OutStreamer.getContext(); + // Emit a .note.gnu.property section with the flags. + MCSectionELF *Nt = Context.getELFSection(".note.gnu.property", ELF::SHT_NOTE, + ELF::SHF_ALLOC); + if (Nt->isRegistered()) { + SMLoc Loc; + Context.reportWarning( + Loc, + "The .note.gnu.property is not emitted because it is already present."); + return; + } + MCSection *Cur = OutStreamer.getCurrentSectionOnly(); + OutStreamer.SwitchSection(Nt); + + // Emit the note header. + OutStreamer.emitValueToAlignment(Align(8).value()); + OutStreamer.emitIntValue(4, 4); // data size for "GNU\0" + OutStreamer.emitIntValue(4 * 4, 4); // Elf_Prop size + OutStreamer.emitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4); + OutStreamer.emitBytes(StringRef("GNU", 4)); // note name + + // Emit the PAC/BTI properties. + OutStreamer.emitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4); + OutStreamer.emitIntValue(4, 4); // data size + OutStreamer.emitIntValue(Flags, 4); // data + OutStreamer.emitIntValue(0, 4); // pad + + OutStreamer.endSection(Nt); + OutStreamer.SwitchSection(Cur); +} void AArch64TargetStreamer::emitInst(uint32_t Inst) { char Buffer[4]; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h index c0dee085caced..09953315bbd0d 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h @@ -33,6 +33,9 @@ class AArch64TargetStreamer : public MCTargetStreamer { /// Emit contents of constant pool for the current section. void emitCurrentConstantPool(); + /// Callback used to implement the .note.gnu.property section. + void emitNoteSection(unsigned Flags); + /// Callback used to implement the .inst directive. 
virtual void emitInst(uint32_t Inst); diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 0f135c3e80593..66d8759e4d081 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -2279,11 +2279,20 @@ class sve_fp_2op_p_zd opc, string asm, RegisterOperand i_zprtype, multiclass sve_fp_2op_p_zd opc, string asm, RegisterOperand i_zprtype, RegisterOperand o_zprtype, - SDPatternOperator op, ValueType vt1, + SDPatternOperator int_op, + SDPatternOperator ir_op, ValueType vt1, ValueType vt2, ValueType vt3, ElementSizeEnum Sz> { def NAME : sve_fp_2op_p_zd; - def : SVE_3_Op_Pat(NAME)>; + // convert vt3 to a packed type for the intrinsic patterns + defvar packedvt3 = !cond(!eq(!cast(vt3), "nxv2f16"): nxv8f16, + !eq(!cast(vt3), "nxv4f16"): nxv8f16, + !eq(!cast(vt3), "nxv2f32"): nxv4f32, + 1 : vt3); + + def : SVE_3_Op_Pat(NAME)>; + + def : SVE_1_Op_Passthru_Pat(NAME)>; } multiclass sve_fp_2op_p_zd_HSD opc, string asm, SDPatternOperator op_merge, diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 37e4b56e9ccf7..3e8cd60b7d77a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -163,7 +163,7 @@ def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug", "LDSMisalignedBug", "true", - "Some GFX10 bug with misaligned multi-dword LDS access in WGP mode" + "Some GFX10 bug with multi-dword LDS and flat access that is not naturally aligned in WGP mode" >; def FeatureMFMAInlineLiteralBug : SubtargetFeature<"mfma-inline-literal-bug", @@ -929,6 +929,7 @@ def FeatureISAVersion10_1_1 : FeatureSet< FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, + FeatureLdsMisalignedBug, FeatureDoesNotSupportXNACK, FeatureCodeObjectV3])>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index d243074aa2fd1..d34345e79fa63 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -42,8 +42,7 @@ def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>; def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper< - "AMDGPUGenPreLegalizerCombinerHelper", [all_combines, - elide_br_by_inverting_cond]> { + "AMDGPUGenPreLegalizerCombinerHelper", [all_combines]> { let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule"; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index be8742c8dd47e..5fb072ff18aeb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4164,9 +4164,9 @@ SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG, auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset); SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32); - return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4, + return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4), MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); + MachineMemOperand::MOInvariant); } SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG, @@ -4178,7 +4178,7 @@ SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG, MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset); SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32); - SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4, + SDValue Store = 
DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4), MachineMemOperand::MODereferenceable); return Store; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index c9be4e11cfc11..209f932536541 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUTargetTransformInfo.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" using namespace llvm; @@ -929,11 +930,6 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, if (!NewNumElts) return UndefValue::get(II.getType()); - // FIXME: Allow v3i16/v3f16 in buffer and image intrinsics when the types are - // fully supported. - if (II.getType()->getScalarSizeInBits() == 16 && NewNumElts == 3) - return nullptr; - if (NewNumElts >= VWidth && DemandedElts.isMask()) { if (DMaskIdx >= 0) II.setArgOperand(DMaskIdx, Args[DMaskIdx]); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 3f39f6f21c1cc..d84d6309bb266 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -30,6 +30,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Type.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -72,6 +73,7 @@ const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB, CodeGenCoverage &CoverageInfo) { MRI = &MF.getRegInfo(); + Subtarget = &MF.getSubtarget(); InstructionSelector::setupMF(MF, KB, CoverageInfo); } @@ -1742,6 +1744,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( return selectDSAppendConsume(I, false); case Intrinsic::amdgcn_s_barrier: return selectSBarrier(I); + case Intrinsic::amdgcn_global_atomic_fadd: + return selectGlobalAtomicFaddIntrinsic(I); default: { return selectImpl(I, *CoverageInfo); } @@ -2898,6 +2902,123 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( return true; } +bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD( + MachineInstr &MI) const { + + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + + if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) { + Function &F = MBB->getParent()->getFunction(); + DiagnosticInfoUnsupported + NoFpRet(F, "return versions of fp atomics not supported", + MI.getDebugLoc(), DS_Error); + F.getContext().diagnose(NoFpRet); + return false; + } + + // FIXME: This is only needed because tablegen requires number of dst operands + // in match and replace pattern to be the same. Otherwise patterns can be + // exported from SDag path. + MachineOperand &VDataIn = MI.getOperand(1); + MachineOperand &VIndex = MI.getOperand(3); + MachineOperand &VOffset = MI.getOperand(4); + MachineOperand &SOffset = MI.getOperand(5); + int16_t Offset = MI.getOperand(6).getImm(); + + bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI); + bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI); + + unsigned Opcode; + if (HasVOffset) { + Opcode = HasVIndex ? 
AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN + : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN; + } else { + Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN + : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET; + } + + if (MRI->getType(VDataIn.getReg()).isVector()) { + switch (Opcode) { + case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN: + Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN; + break; + case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN: + Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN; + break; + case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN: + Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN; + break; + case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET: + Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET; + break; + } + } + + auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode)); + I.add(VDataIn); + + if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN || + Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) { + Register IdxReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass); + BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg) + .addReg(VIndex.getReg()) + .addImm(AMDGPU::sub0) + .addReg(VOffset.getReg()) + .addImm(AMDGPU::sub1); + + I.addReg(IdxReg); + } else if (HasVIndex) { + I.add(VIndex); + } else if (HasVOffset) { + I.add(VOffset); + } + + I.add(MI.getOperand(2)); // rsrc + I.add(SOffset); + I.addImm(Offset); + renderExtractSLC(I, MI, 7); + I.cloneMemRefs(MI); + + MI.eraseFromParent(); + + return true; +} + +bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic( + MachineInstr &MI) const{ + + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + + if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) { + Function &F = MBB->getParent()->getFunction(); + DiagnosticInfoUnsupported + NoFpRet(F, "return versions of fp atomics not supported", + MI.getDebugLoc(), DS_Error); + F.getContext().diagnose(NoFpRet); + return false; + } + + // FIXME: This is only needed because tablegen requires number of dst operands + // in match and replace pattern to be the same. Otherwise patterns can be + // exported from SDag path. + auto Addr = selectFlatOffsetImpl(MI.getOperand(2)); + + Register Data = MI.getOperand(3).getReg(); + const unsigned Opc = MRI->getType(Data).isVector() ? 
+ AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32; + auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)) + .addReg(Addr.first) + .addReg(Data) + .addImm(Addr.second) + .addImm(0) // SLC + .cloneMemRefs(MI); + + MI.eraseFromParent(); + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); +} + bool AMDGPUInstructionSelector::select(MachineInstr &I) { if (I.isPHI()) return selectPHI(I); @@ -3017,6 +3138,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { assert(Intr && "not an image intrinsic with image pseudo"); return selectImageIntrinsic(I, Intr); } + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: + return selectAMDGPU_BUFFER_ATOMIC_FADD(I); default: return selectImpl(I, *CoverageInfo); } @@ -3166,7 +3289,7 @@ AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { Register Src; unsigned Mods; std::tie(Src, Mods) = selectVOP3ModsImpl(Root); - if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI)) + if (!isKnownNeverNaN(Src, *MRI)) return None; return {{ @@ -3259,14 +3382,11 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { } template -InstructionSelector::ComplexRendererFns +std::pair AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { MachineInstr *MI = Root.getParent(); - InstructionSelector::ComplexRendererFns Default = {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset - }}; + auto Default = std::make_pair(Root.getReg(), 0); if (!STI.hasFlatInstOffsets()) return Default; @@ -3286,20 +3406,27 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { Register BasePtr = OpDef->getOperand(1).getReg(); - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); }, - }}; + return std::make_pair(BasePtr, Offset.getValue()); } InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const { - return selectFlatOffsetImpl(Root); + auto PtrWithOffset = selectFlatOffsetImpl(Root); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); }, + }}; } InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const { - return selectFlatOffsetImpl(Root); + auto PtrWithOffset = selectFlatOffsetImpl(Root); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); }, + }}; } /// Match a zero extend from a 32-bit value to 64-bits. 
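Aside: the selectFlatOffsetImpl refactor above follows a compute-once, wrap-later pattern. The helper now returns a plain (base register, immediate offset) pair, and the ComplexRendererFns entry points package that pair into renderer closures; returning plain data is also what lets selectGlobalAtomicFaddIntrinsic consume the pair directly. A minimal standalone sketch of that pattern, using toy Register/InstrBuilder stand-ins rather than the LLVM types:

#include <cstdint>
#include <functional>
#include <iostream>
#include <utility>
#include <vector>

using Register = unsigned;

struct InstrBuilder {
  void addReg(Register R) { std::cout << "reg v" << R << ' '; }
  void addImm(int64_t I) { std::cout << "imm " << I << ' '; }
};

using RendererFns = std::vector<std::function<void(InstrBuilder &)>>;

// The impl returns plain data; it no longer needs to know about renderers.
std::pair<Register, int64_t> selectFlatOffsetImpl(Register Root) {
  // Pretend we peeled a constant offset of 16 off the address computation.
  return std::make_pair(Root, 16);
}

// The public entry point wraps the pair into the closures the selector wants.
RendererFns selectFlatOffset(Register Root) {
  auto PtrWithOffset = selectFlatOffsetImpl(Root);
  return {
      [=](InstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
      [=](InstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
  };
}

int main() {
  InstrBuilder MIB;
  for (auto &Fn : selectFlatOffset(/*Root=*/1))
    Fn(MIB);
  std::cout << '\n'; // prints: reg v1 imm 16
}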
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 2176e2b549511..578958f120aa0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -50,6 +50,7 @@ class SIRegisterInfo; class AMDGPUInstructionSelector final : public InstructionSelector { private: MachineRegisterInfo *MRI; + const GCNSubtarget *Subtarget; public: AMDGPUInstructionSelector(const GCNSubtarget &STI, @@ -140,6 +141,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector { bool selectG_EXTRACT_VECTOR_ELT(MachineInstr &I) const; bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const; bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const; + bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const; + bool selectGlobalAtomicFaddIntrinsic(MachineInstr &I) const; std::pair selectVOP3ModsImpl(MachineOperand &Root) const; @@ -179,11 +182,11 @@ class AMDGPUInstructionSelector final : public InstructionSelector { selectSmrdSgpr(MachineOperand &Root) const; template - InstructionSelector::ComplexRendererFns + std::pair selectFlatOffsetImpl(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectFlatOffset(MachineOperand &Root) const; - InstructionSelector::ComplexRendererFns selectFlatOffsetSigned(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index fad606c792a92..01c7934e9eb05 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -483,6 +483,8 @@ defm atomic_load_umax : ret_noret_binary_atomic_op; defm atomic_load_umin : ret_noret_binary_atomic_op; defm atomic_load_xor : ret_noret_binary_atomic_op; defm atomic_load_fadd : ret_noret_binary_atomic_op; +let MemoryVT = v2f16 in +defm atomic_load_fadd_v2f16 : ret_noret_binary_atomic_op; defm AMDGPUatomic_cmp_swap : ret_noret_binary_atomic_op; def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp index 524a34be876ff..31c6c0bb0c2f3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -379,9 +379,8 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu( ConstantInt::get(Ctx, APInt(32, StringRef("0"), 10)); ZeroIdxList.push_back(zeroInt); - GetElementPtrInst *BufferIdx = - dyn_cast(GetElementPtrInst::Create( - nullptr, pcall, ZeroIdxList, "PrintBuffID", Brnch)); + GetElementPtrInst *BufferIdx = GetElementPtrInst::Create( + nullptr, pcall, ZeroIdxList, "PrintBuffID", Brnch); Type *idPointer = PointerType::get(I32Ty, AMDGPUAS::GLOBAL_ADDRESS); Value *id_gep_cast = @@ -395,8 +394,8 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu( FourthIdxList.push_back(fourInt); // 1st 4 bytes hold the printf_id // the following GEP is the buffer pointer - BufferIdx = cast(GetElementPtrInst::Create( - nullptr, pcall, FourthIdxList, "PrintBuffGep", Brnch)); + BufferIdx = GetElementPtrInst::Create(nullptr, pcall, FourthIdxList, + "PrintBuffGep", Brnch); Type *Int32Ty = Type::getInt32Ty(Ctx); Type *Int64Ty = Type::getInt64Ty(Ctx); @@ -409,17 +408,15 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu( if (ArgType->isFPOrFPVectorTy() && !isa(ArgType)) { Type *IType = (ArgType->isFloatTy()) ? 
Int32Ty : Int64Ty; if (OpConvSpecifiers[ArgCount - 1] == 'f') { - ConstantFP *fpCons = dyn_cast(Arg); - if (fpCons) { - APFloat Val(fpCons->getValueAPF()); + if (auto *FpCons = dyn_cast(Arg)) { + APFloat Val(FpCons->getValueAPF()); bool Lost = false; Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &Lost); Arg = ConstantFP::get(Ctx, Val); IType = Int32Ty; - } else { - FPExtInst *FpExt = dyn_cast(Arg); - if (FpExt && FpExt->getType()->isDoubleTy() && + } else if (auto *FpExt = dyn_cast(Arg)) { + if (FpExt->getType()->isDoubleTy() && FpExt->getOperand(0)->getType()->isFloatTy()) { Arg = FpExt->getOperand(0); IType = Int32Ty; @@ -431,9 +428,8 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu( } else if (ArgType->getTypeID() == Type::PointerTyID) { if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1], ArgType)) { const char *S = NonLiteralStr; - if (ConstantExpr *ConstExpr = dyn_cast(Arg)) { - GlobalVariable *GV = - dyn_cast(ConstExpr->getOperand(0)); + if (auto *ConstExpr = dyn_cast(Arg)) { + auto *GV = dyn_cast(ConstExpr->getOperand(0)); if (GV && GV->hasInitializer()) { Constant *Init = GV->getInitializer(); ConstantDataArray *CA = dyn_cast(Init); @@ -491,27 +487,27 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu( switch (EleSize) { default: EleCount = TotalSize / 64; - IType = dyn_cast(Type::getInt64Ty(ArgType->getContext())); + IType = Type::getInt64Ty(ArgType->getContext()); break; case 8: if (EleCount >= 8) { EleCount = TotalSize / 64; - IType = dyn_cast(Type::getInt64Ty(ArgType->getContext())); + IType = Type::getInt64Ty(ArgType->getContext()); } else if (EleCount >= 3) { EleCount = 1; - IType = dyn_cast(Type::getInt32Ty(ArgType->getContext())); + IType = Type::getInt32Ty(ArgType->getContext()); } else { EleCount = 1; - IType = dyn_cast(Type::getInt16Ty(ArgType->getContext())); + IType = Type::getInt16Ty(ArgType->getContext()); } break; case 16: if (EleCount >= 3) { EleCount = TotalSize / 64; - IType = dyn_cast(Type::getInt64Ty(ArgType->getContext())); + IType = Type::getInt64Ty(ArgType->getContext()); } else { EleCount = 1; - IType = dyn_cast(Type::getInt32Ty(ArgType->getContext())); + IType = Type::getInt32Ty(ArgType->getContext()); } break; } @@ -539,8 +535,8 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu( (void)StBuff; if (I + 1 == E && ArgCount + 1 == CI->getNumArgOperands()) break; - BufferIdx = dyn_cast(GetElementPtrInst::Create( - nullptr, BufferIdx, BuffOffset, "PrintBuffNextPtr", Brnch)); + BufferIdx = GetElementPtrInst::Create(nullptr, BufferIdx, BuffOffset, + "PrintBuffNextPtr", Brnch); LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:\n" << *BufferIdx << '\n'); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c0bef6a5ada16..fc9315c016bb1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -750,6 +750,9 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( for (MachineInstr &MI : Range) { for (MachineOperand &Def : MI.defs()) { + if (MRI.use_nodbg_empty(Def.getReg())) + continue; + LLT ResTy = MRI.getType(Def.getReg()); const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI); ResultRegs.push_back(Def.getReg()); @@ -2971,7 +2974,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( } case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: { applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(MI, MRI, {1, 4}); + executeInWaterfallLoop(MI, MRI, {2, 5}); return; } case 
AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { @@ -3929,7 +3932,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: - case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: { // vdata_out OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); @@ -3952,23 +3956,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // initialized. break; } - case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: { - // vdata_in - OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); - - // rsrc - OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); - - // vindex - OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); - - // voffset - OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); - - // soffset - OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); - break; - } case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { // vdata_out OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 5946249e84b09..ccc493640b292 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -283,7 +283,6 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(C, std::make_unique(C)); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); return DAG; @@ -294,7 +293,6 @@ createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { auto DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); return DAG; } @@ -308,7 +306,6 @@ createIterativeILPMachineScheduler(MachineSchedContext *C) { auto DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); return DAG; } @@ -604,7 +601,6 @@ class AMDGPUPassConfig : public TargetPassConfig { createMachineScheduler(MachineSchedContext *C) const override { ScheduleDAGMILive *DAG = createGenericSchedLive(C); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); return DAG; } @@ -946,7 +942,7 @@ bool GCNPassConfig::addInstSelector() { } bool GCNPassConfig::addIRTranslator() { - addPass(new IRTranslator()); + addPass(new IRTranslator(getOptLevel())); return false; } diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index db74f8a54c0af..e1369e8f5c95f 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1070,7 +1070,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { std::string &CollectString); bool 
AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, - RegisterKind RegKind, unsigned Reg1); + RegisterKind RegKind, unsigned Reg1, SMLoc Loc); bool ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, unsigned &RegNum, unsigned &RegWidth, bool RestoreOnFailure = false); @@ -1088,7 +1088,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool ParseRegRange(unsigned& Num, unsigned& Width); unsigned getRegularReg(RegisterKind RegKind, unsigned RegNum, - unsigned RegWidth); + unsigned RegWidth, + SMLoc Loc); bool isRegister(); bool isRegister(const AsmToken &Token, const AsmToken &NextToken) const; @@ -1443,6 +1444,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { void cvtMIMG(MCInst &Inst, const OperandVector &Operands, bool IsAtomic = false); void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands); + void cvtIntersectRay(MCInst &Inst, const OperandVector &Operands); OperandMatchResultTy parseDim(OperandVector &Operands); OperandMatchResultTy parseDPP8(OperandVector &Operands); @@ -2065,7 +2067,8 @@ OperandMatchResultTy AMDGPUAsmParser::tryParseRegister(unsigned &RegNo, } bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, - RegisterKind RegKind, unsigned Reg1) { + RegisterKind RegKind, unsigned Reg1, + SMLoc Loc) { switch (RegKind) { case IS_SPECIAL: if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) { @@ -2098,12 +2101,14 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, RegWidth = 2; return true; } + Error(Loc, "register does not fit in the list"); return false; case IS_VGPR: case IS_SGPR: case IS_AGPR: case IS_TTMP: if (Reg1 != Reg + RegWidth) { + Error(Loc, "registers in a list must have consecutive indices"); return false; } RegWidth++; @@ -2186,7 +2191,8 @@ AMDGPUAsmParser::isRegister() unsigned AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, unsigned RegNum, - unsigned RegWidth) { + unsigned RegWidth, + SMLoc Loc) { assert(isRegularReg(RegKind)); @@ -2197,18 +2203,24 @@ AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, AlignSize = std::min(RegWidth, 4u); } - if (RegNum % AlignSize != 0) + if (RegNum % AlignSize != 0) { + Error(Loc, "invalid register alignment"); return AMDGPU::NoRegister; + } unsigned RegIdx = RegNum / AlignSize; int RCID = getRegClass(RegKind, RegWidth); - if (RCID == -1) + if (RCID == -1) { + Error(Loc, "invalid or unsupported register size"); return AMDGPU::NoRegister; + } const MCRegisterInfo *TRI = getContext().getRegisterInfo(); const MCRegisterClass RC = TRI->getRegClass(RCID); - if (RegIdx >= RC.getNumRegs()) + if (RegIdx >= RC.getNumRegs()) { + Error(Loc, "register index is out of range"); return AMDGPU::NoRegister; + } return RC.getRegister(RegIdx); } @@ -2216,24 +2228,40 @@ AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, bool AMDGPUAsmParser::ParseRegRange(unsigned& Num, unsigned& Width) { int64_t RegLo, RegHi; - if (!trySkipToken(AsmToken::LBrac)) + if (!skipToken(AsmToken::LBrac, "missing register index")) return false; + SMLoc FirstIdxLoc = getLoc(); + SMLoc SecondIdxLoc; + if (!parseExpr(RegLo)) return false; if (trySkipToken(AsmToken::Colon)) { + SecondIdxLoc = getLoc(); if (!parseExpr(RegHi)) return false; } else { RegHi = RegLo; } - if (!trySkipToken(AsmToken::RBrac)) + if (!skipToken(AsmToken::RBrac, "expected a closing square bracket")) + return false; + + if (!isUInt<32>(RegLo)) { + Error(FirstIdxLoc, "invalid register index"); + return false; + } + + if (!isUInt<32>(RegHi)) { + Error(SecondIdxLoc, "invalid register index"); return false; + } - 
if (!isUInt<32>(RegLo) || !isUInt<32>(RegHi) || RegLo > RegHi) + if (RegLo > RegHi) { + Error(FirstIdxLoc, "first register index should not exceed second index"); return false; + } Num = static_cast(RegLo); Width = (RegHi - RegLo) + 1; @@ -2260,10 +2288,14 @@ unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind, SmallVectorImpl &Tokens) { assert(isToken(AsmToken::Identifier)); StringRef RegName = getTokenStr(); + auto Loc = getLoc(); const RegInfo *RI = getRegularRegInfo(RegName); - if (!RI) + if (!RI) { + Error(Loc, "invalid register name"); return AMDGPU::NoRegister; + } + Tokens.push_back(getToken()); lex(); // skip register name @@ -2271,8 +2303,10 @@ unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind, StringRef RegSuffix = RegName.substr(RI->Name.size()); if (!RegSuffix.empty()) { // Single 32-bit register: vXX. - if (!getRegNum(RegSuffix, RegNum)) + if (!getRegNum(RegSuffix, RegNum)) { + Error(Loc, "invalid register index"); return AMDGPU::NoRegister; + } RegWidth = 1; } else { // Range of registers: v[XX:YY]. ":YY" is optional. @@ -2280,44 +2314,59 @@ unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind, return AMDGPU::NoRegister; } - return getRegularReg(RegKind, RegNum, RegWidth); + return getRegularReg(RegKind, RegNum, RegWidth, Loc); } unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum, unsigned &RegWidth, SmallVectorImpl &Tokens) { unsigned Reg = AMDGPU::NoRegister; + auto ListLoc = getLoc(); - if (!trySkipToken(AsmToken::LBrac)) + if (!skipToken(AsmToken::LBrac, + "expected a register or a list of registers")) { return AMDGPU::NoRegister; + } // List of consecutive registers, e.g.: [s0,s1,s2,s3] + auto Loc = getLoc(); if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) return AMDGPU::NoRegister; - if (RegWidth != 1) + if (RegWidth != 1) { + Error(Loc, "expected a single 32-bit register"); return AMDGPU::NoRegister; + } for (; trySkipToken(AsmToken::Comma); ) { RegisterKind NextRegKind; unsigned NextReg, NextRegNum, NextRegWidth; + Loc = getLoc(); - if (!ParseAMDGPURegister(NextRegKind, NextReg, NextRegNum, NextRegWidth, - Tokens)) + if (!ParseAMDGPURegister(NextRegKind, NextReg, + NextRegNum, NextRegWidth, + Tokens)) { return AMDGPU::NoRegister; - if (NextRegWidth != 1) + } + if (NextRegWidth != 1) { + Error(Loc, "expected a single 32-bit register"); return AMDGPU::NoRegister; - if (NextRegKind != RegKind) + } + if (NextRegKind != RegKind) { + Error(Loc, "registers in a list must be of the same kind"); return AMDGPU::NoRegister; - if (!AddNextRegisterToList(Reg, RegWidth, RegKind, NextReg)) + } + if (!AddNextRegisterToList(Reg, RegWidth, RegKind, NextReg, Loc)) return AMDGPU::NoRegister; } - if (!trySkipToken(AsmToken::RBrac)) + if (!skipToken(AsmToken::RBrac, + "expected a comma or a closing square bracket")) { return AMDGPU::NoRegister; + } if (isRegularReg(RegKind)) - Reg = getRegularReg(RegKind, RegNum, RegWidth); + Reg = getRegularReg(RegKind, RegNum, RegWidth, ListLoc); return Reg; } @@ -2325,6 +2374,7 @@ unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum, bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, unsigned &RegNum, unsigned &RegWidth, SmallVectorImpl &Tokens) { + auto Loc = getLoc(); Reg = AMDGPU::NoRegister; if (isToken(AsmToken::Identifier)) { @@ -2336,12 +2386,26 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, } const MCRegisterInfo *TRI = getContext().getRegisterInfo(); - return Reg != 
AMDGPU::NoRegister && subtargetHasRegister(*TRI, Reg); + if (Reg == AMDGPU::NoRegister) { + assert(Parser.hasPendingError()); + return false; + } + + if (!subtargetHasRegister(*TRI, Reg)) { + if (Reg == AMDGPU::SGPR_NULL) { + Error(Loc, "'null' operand is not supported on this GPU"); + } else { + Error(Loc, "register not available on this GPU"); + } + return false; + } + + return true; } bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, unsigned &RegNum, unsigned &RegWidth, - bool RestoreOnFailure) { + bool RestoreOnFailure /*=false*/) { Reg = AMDGPU::NoRegister; SmallVector Tokens; @@ -2413,8 +2477,6 @@ AMDGPUAsmParser::parseRegister(bool RestoreOnFailure) { unsigned Reg, RegNum, RegWidth; if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) { - //FIXME: improve error messages (bug 41303). - Error(StartLoc, "not a valid operand."); return nullptr; } if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) { @@ -2480,7 +2542,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) { // This syntax is not compatible with syntax of standard // MC expressions (due to the trailing '|'). SMLoc EndLoc; - if (getParser().parsePrimaryExpr(Expr, EndLoc)) + if (getParser().parsePrimaryExpr(Expr, EndLoc, nullptr)) return MatchOperand_ParseFail; } else { if (Parser.parseExpression(Expr)) @@ -3048,8 +3110,9 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) { int TFEIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::tfe); assert(VDataIdx != -1); - assert(DMaskIdx != -1); - assert(TFEIdx != -1); + + if (DMaskIdx == -1 || TFEIdx == -1) // intersect_ray + return true; unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx); unsigned TFESize = Inst.getOperand(TFEIdx).getImm()? 1 : 0; @@ -3076,6 +3139,7 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) { return true; const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); + const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); @@ -3084,9 +3148,11 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) { assert(VAddr0Idx != -1); assert(SrsrcIdx != -1); - assert(DimIdx != -1); assert(SrsrcIdx > VAddr0Idx); + if (DimIdx == -1) + return true; // intersect_ray + unsigned Dim = Inst.getOperand(DimIdx).getImm(); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim); bool IsNSA = SrsrcIdx - VAddr0Idx > 1; @@ -6405,6 +6471,17 @@ void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) cvtMIMG(Inst, Operands, true); } +void AMDGPUAsmParser::cvtIntersectRay(MCInst &Inst, + const OperandVector &Operands) { + for (unsigned I = 1; I < Operands.size(); ++I) { + auto &Operand = (AMDGPUOperand &)*Operands[I]; + if (Operand.isReg()) + Operand.addRegOperands(Inst, 1); + } + + Inst.addOperand(MCOperand::createImm(1)); // a16 +} + //===----------------------------------------------------------------------===// // smrd //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 45eca4b3216a5..e1c9f1609a02a 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -529,21 +529,23 @@ multiclass MUBUF_Pseudo_Loads { - def _OFFSET : MUBUF_Load_Pseudo , + defvar legal_load_vt = !if(!eq(!cast(load_vt), !cast(v3f16)), v4f16, load_vt); + + def 
_OFFSET : MUBUF_Load_Pseudo , MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>; - def _ADDR64 : MUBUF_Load_Pseudo , + def _ADDR64 : MUBUF_Load_Pseudo , MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>; - def _OFFEN : MUBUF_Load_Pseudo ; - def _IDXEN : MUBUF_Load_Pseudo ; - def _BOTHEN : MUBUF_Load_Pseudo ; + def _OFFEN : MUBUF_Load_Pseudo ; + def _IDXEN : MUBUF_Load_Pseudo ; + def _BOTHEN : MUBUF_Load_Pseudo ; let DisableWQM = 1 in { - def _OFFSET_exact : MUBUF_Load_Pseudo ; - def _OFFEN_exact : MUBUF_Load_Pseudo ; - def _IDXEN_exact : MUBUF_Load_Pseudo ; - def _BOTHEN_exact : MUBUF_Load_Pseudo ; + def _OFFSET_exact : MUBUF_Load_Pseudo ; + def _OFFEN_exact : MUBUF_Load_Pseudo ; + def _IDXEN_exact : MUBUF_Load_Pseudo ; + def _BOTHEN_exact : MUBUF_Load_Pseudo ; } } @@ -577,25 +579,27 @@ multiclass MUBUF_Pseudo_Stores { - def _OFFSET : MUBUF_Store_Pseudo (store_vt), !cast(v3f16)), v4f16, store_vt); + + def _OFFSET : MUBUF_Store_Pseudo , MUBUFAddr64Table<0, NAME>; - def _ADDR64 : MUBUF_Store_Pseudo , MUBUFAddr64Table<1, NAME>; - def _OFFEN : MUBUF_Store_Pseudo ; - def _IDXEN : MUBUF_Store_Pseudo ; - def _BOTHEN : MUBUF_Store_Pseudo ; + def _OFFEN : MUBUF_Store_Pseudo ; + def _IDXEN : MUBUF_Store_Pseudo ; + def _BOTHEN : MUBUF_Store_Pseudo ; let DisableWQM = 1 in { - def _OFFSET_exact : MUBUF_Store_Pseudo ; - def _OFFEN_exact : MUBUF_Store_Pseudo ; - def _IDXEN_exact : MUBUF_Store_Pseudo ; - def _BOTHEN_exact : MUBUF_Store_Pseudo ; + def _OFFSET_exact : MUBUF_Store_Pseudo ; + def _OFFEN_exact : MUBUF_Store_Pseudo ; + def _IDXEN_exact : MUBUF_Store_Pseudo ; + def _BOTHEN_exact : MUBUF_Store_Pseudo ; } } @@ -1094,14 +1098,12 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", int_amdgcn_buffer_wbinvl1>; let SubtargetPredicate = HasAtomicFaddInsts in { - defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN < - "buffer_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret + "buffer_atomic_add_f32", VGPR_32, f32, atomic_load_fadd_global_noret_32 >; defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < - "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_fadd_global_noret + "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_noret_32 >; - } // End SubtargetPredicate = HasAtomicFaddInsts //===----------------------------------------------------------------------===// @@ -1164,9 +1166,11 @@ let SubtargetPredicate = isGFX10Plus in { //===----------------------------------------------------------------------===// multiclass MUBUF_LoadIntrinsicPat { + string opcode, ValueType memoryVt = vt> { + defvar st = !if(!eq(!cast(memoryVt), !cast(vt)), name, mubuf_intrinsic_load); + def : GCNPat< - (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, + (vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$auxiliary, 0)), (!cast(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), @@ -1174,7 +1178,7 @@ multiclass MUBUF_LoadIntrinsicPat; def : GCNPat< - (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, + (vt (st v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$auxiliary, 0)), (!cast(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), @@ -1182,7 +1186,7 @@ multiclass MUBUF_LoadIntrinsicPat; def : GCNPat< - (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, + (vt (st v4i32:$rsrc, 
i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$auxiliary, timm)), (!cast(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), @@ -1190,7 +1194,7 @@ multiclass MUBUF_LoadIntrinsicPat; def : GCNPat< - (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, + (vt (st v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, timm:$auxiliary, timm)), (!cast(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), @@ -1214,6 +1218,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in { defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; } // End HasUnpackedD16VMem. @@ -1223,6 +1228,8 @@ let SubtargetPredicate = HasPackedD16VMem in { defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; } // End HasPackedD16VMem. @@ -1245,9 +1252,11 @@ defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; multiclass MUBUF_StoreIntrinsicPat { + string opcode, ValueType memoryVt = vt> { + defvar st = !if(!eq(!cast(memoryVt), !cast(vt)), name, mubuf_intrinsic_store); + def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, + (st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$auxiliary, 0), (!cast(opcode # _OFFSET_exact) getVregSrcForVT.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), @@ -1255,7 +1264,7 @@ multiclass MUBUF_StoreIntrinsicPat; def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, + (st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$auxiliary, 0), (!cast(opcode # _OFFEN_exact) getVregSrcForVT.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary), @@ -1264,7 +1273,7 @@ multiclass MUBUF_StoreIntrinsicPat; def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, + (st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$auxiliary, timm), (!cast(opcode # _IDXEN_exact) getVregSrcForVT.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary), @@ -1273,7 +1282,7 @@ multiclass MUBUF_StoreIntrinsicPat; def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, + (st vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, timm:$auxiliary, timm), (!cast(opcode # _BOTHEN_exact) getVregSrcForVT.ret:$vdata, @@ -1298,6 +1307,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in { defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; } // End HasUnpackedD16VMem. @@ -1307,6 +1317,8 @@ let SubtargetPredicate = HasPackedD16VMem in { defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; } // End HasPackedD16VMem. 
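The legal_load_vt / legal_store_vt selections and the memoryVt parameter above encode one rule: three-element 16-bit vectors have no MUBUF encoding of their own, so v3f16 operations are carried by the v4f16 pseudo while the intrinsic patterns remember the original memory type. A rough sketch of that widening rule, assuming an illustrative SimpleVT type rather than LLVM's MVT:

#include <cassert>
#include <iostream>

struct SimpleVT {
  unsigned NumElts;
  unsigned EltBits;
  bool operator==(const SimpleVT &O) const {
    return NumElts == O.NumElts && EltBits == O.EltBits;
  }
};

// Mirrors the TableGen defvar: !if(!eq(load_vt, v3f16), v4f16, load_vt)
SimpleVT legalLoadVT(SimpleVT VT) {
  if (VT == SimpleVT{3, 16})
    return SimpleVT{4, 16}; // widen v3f16 to v4f16 for the register class
  return VT;
}

int main() {
  SimpleVT V3F16{3, 16};
  SimpleVT Legal = legalLoadVT(V3F16);
  assert((Legal == SimpleVT{4, 16}));
  std::cout << "v3f16 selects the v" << Legal.NumElts << "f" << Legal.EltBits
            << " MUBUF pseudo\n";
}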
@@ -1394,36 +1406,46 @@ defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; +class NoUseBufferAtomic : PatFrag < + (ops node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5, node:$src6, node:$src7), + (vt (Op $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7)), + [{ return SDValue(N, 0).use_empty(); }]> { + + let GISelPredicateCode = [{ + return MRI.use_nodbg_empty(MI.getOperand(0).getReg()); + }]; +} + multiclass BufferAtomicPatterns_NO_RTN { def : GCNPat< - (name vt:$vdata_in, v4i32:$rsrc, 0, - 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0), + (NoUseBufferAtomic vt:$vdata_in, v4i32:$rsrc, 0, + 0, i32:$soffset, timm:$offset, + timm:$cachepolicy, 0), (!cast(opcode # _OFFSET) getVregSrcForVT.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (extract_slc $cachepolicy)) + (as_i16timm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< - (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, - 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm), + (NoUseBufferAtomic vt:$vdata_in, v4i32:$rsrc, i32:$vindex, + 0, i32:$soffset, timm:$offset, + timm:$cachepolicy, timm), (!cast(opcode # _IDXEN) getVregSrcForVT.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (extract_slc $cachepolicy)) + (as_i16timm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< - (name vt:$vdata_in, v4i32:$rsrc, 0, - i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0), + (NoUseBufferAtomic vt:$vdata_in, v4i32:$rsrc, 0, + i32:$voffset, i32:$soffset, timm:$offset, + timm:$cachepolicy, 0), (!cast(opcode # _OFFEN) getVregSrcForVT.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (extract_slc $cachepolicy)) + (as_i16timm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< - (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, - i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm), + (NoUseBufferAtomic vt:$vdata_in, v4i32:$rsrc, i32:$vindex, + i32:$voffset, i32:$soffset, timm:$offset, + timm:$cachepolicy, timm), (!cast(opcode # _BOTHEN) getVregSrcForVT.ret:$vdata_in, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), @@ -1686,9 +1708,11 @@ defm : MUBUFScratchStorePat { + string opcode, ValueType memoryVt = vt> { + defvar st = !if(!eq(!cast(memoryVt), !cast(vt)), name, mtbuf_intrinsic_load); + def : GCNPat< - (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, + (vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, 0)), (!cast(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), @@ -1697,7 +1721,7 @@ multiclass MTBUF_LoadIntrinsicPat; def : GCNPat< - (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, + (vt (st v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, timm)), (!cast(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), @@ -1706,7 +1730,7 @@ multiclass MTBUF_LoadIntrinsicPat; def : GCNPat< - (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, + (vt (st v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, 0)), (!cast(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), @@ -1715,7 +1739,7 @@ multiclass MTBUF_LoadIntrinsicPat; def : GCNPat< - (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, 
timm:$offset, + (vt (st v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, timm)), (!cast(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), @@ -1739,6 +1763,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in { defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; + defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; } // End HasUnpackedD16VMem. @@ -1746,13 +1771,16 @@ let SubtargetPredicate = HasPackedD16VMem in { defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; + defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; } // End HasPackedD16VMem. multiclass MTBUF_StoreIntrinsicPat { + string opcode, ValueType memoryVt = vt> { + defvar st = !if(!eq(!cast(memoryVt), !cast(vt)), name, mtbuf_intrinsic_store); + def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, + (st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, 0), (!cast(opcode # _OFFSET_exact) getVregSrcForVT.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), @@ -1761,7 +1789,7 @@ multiclass MTBUF_StoreIntrinsicPat; def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, + (st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, timm), (!cast(opcode # _IDXEN_exact) getVregSrcForVT.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), @@ -1770,7 +1798,7 @@ multiclass MTBUF_StoreIntrinsicPat; def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, + (st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, 0), (!cast(opcode # _OFFEN_exact) getVregSrcForVT.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), @@ -1779,7 +1807,7 @@ multiclass MTBUF_StoreIntrinsicPat; def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, + (st vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, timm), (!cast(opcode # _BOTHEN_exact) getVregSrcForVT.ret:$vdata, @@ -1803,6 +1831,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in { defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; + defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; } // End HasUnpackedD16VMem. @@ -1810,6 +1839,7 @@ let SubtargetPredicate = HasPackedD16VMem in { defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; + defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; } // End HasPackedD16VMem. 
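The NoUseBufferAtomic PatFrag introduced above matches only when the atomic's result is dead, and it checks this in both selection paths: SDValue(N, 0).use_empty() for SelectionDAG and MRI.use_nodbg_empty(...) for GlobalISel; elsewhere in this patch, uses of the result are rejected with a "return versions of fp atomics not supported" diagnostic. A toy sketch of the dead-result opcode choice (stand-in types; the _RTN opcode name here is purely illustrative):

#include <iostream>

struct ToyInstr {
  int NumResultUses = 0; // stand-in for MRI.use_nodbg_empty(...)
};

enum class Opcode { BUFFER_ATOMIC_ADD_F32_RTN, BUFFER_ATOMIC_ADD_F32 };

// Mirrors the GISelPredicateCode: fold to the no-return opcode only if the
// value produced by the atomic is never read.
Opcode pickAtomicOpcode(const ToyInstr &MI) {
  if (MI.NumResultUses == 0)
    return Opcode::BUFFER_ATOMIC_ADD_F32;   // no-return form
  return Opcode::BUFFER_ATOMIC_ADD_F32_RTN; // must keep the result
}

int main() {
  ToyInstr Dead{0}, Live{2};
  std::cout << (pickAtomicOpcode(Dead) == Opcode::BUFFER_ATOMIC_ADD_F32) << ' '
            << (pickAtomicOpcode(Live) == Opcode::BUFFER_ATOMIC_ADD_F32_RTN)
            << '\n'; // prints: 1 1
}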
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 9c2f2e7eecd14..b7dde61f608bf 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -139,6 +139,8 @@ DECODE_OPERAND_REG(VS_128) DECODE_OPERAND_REG(VReg_64) DECODE_OPERAND_REG(VReg_96) DECODE_OPERAND_REG(VReg_128) +DECODE_OPERAND_REG(VReg_256) +DECODE_OPERAND_REG(VReg_512) DECODE_OPERAND_REG(SReg_32) DECODE_OPERAND_REG(SReg_32_XM0_XEXEC) @@ -499,8 +501,16 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { AMDGPU::OpName::d16); assert(VDataIdx != -1); - assert(DMaskIdx != -1); - assert(TFEIdx != -1); + if (DMaskIdx == -1 || TFEIdx == -1) {// intersect_ray + if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::a16) > -1) { + assert(MI.getOpcode() == AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa || + MI.getOpcode() == AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa || + MI.getOpcode() == AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa || + MI.getOpcode() == AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa); + addOperand(MI, MCOperand::createImm(1)); + } + return MCDisassembler::Success; + } const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); bool IsAtomic = (VDstIdx != -1); diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index f5b6829e89f79..abe29f73a9141 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -78,6 +78,7 @@ class FLAT_Real op, FLAT_Pseudo ps> : // copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; let AsmMatchConverter = ps.AsmMatchConverter; + let OtherPredicates = ps.OtherPredicates; let TSFlags = ps.TSFlags; let UseNamedOperandTable = ps.UseNamedOperandTable; @@ -714,16 +715,16 @@ let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in { FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>; } // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1 -let SubtargetPredicate = HasAtomicFaddInsts, is_flat_global = 1 in { - -defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN < - "global_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret ->; -defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN < - "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_fadd_global_noret ->; - -} // End SubtargetPredicate = HasAtomicFaddInsts +let is_flat_global = 1 in { +let OtherPredicates = [HasAtomicFaddInsts] in { + defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN < + "global_atomic_add_f32", VGPR_32, f32 + >; + defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN < + "global_atomic_pk_add_f16", VGPR_32, v2f16 + >; +} // End OtherPredicates = [HasAtomicFaddInsts] +} // End is_flat_global = 1 //===----------------------------------------------------------------------===// // Flat Patterns @@ -1081,8 +1082,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", atomic_swap_global_64, i64 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", AMDGPUatomic_cmp_swap_global_64, i64, v2i64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", atomic_load_xor_global_64, i64>; -defm : GlobalFLATNoRtnAtomicPats ; -defm : GlobalFLATNoRtnAtomicPats ; +let OtherPredicates = [HasAtomicFaddInsts] in { +defm : GlobalFLATNoRtnAtomicPats ; +defm : GlobalFLATNoRtnAtomicPats ; +} } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 diff --git 
a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index d897127812b9b..432d951018d09 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -67,7 +67,14 @@ static bool isSGetReg(unsigned Opcode) { } static bool isSSetReg(unsigned Opcode) { - return Opcode == AMDGPU::S_SETREG_B32 || Opcode == AMDGPU::S_SETREG_IMM32_B32; + switch (Opcode) { + case AMDGPU::S_SETREG_B32: + case AMDGPU::S_SETREG_B32_mode: + case AMDGPU::S_SETREG_IMM32_B32: + case AMDGPU::S_SETREG_IMM32_B32_mode: + return true; + } + return false; } static bool isRWLane(unsigned Opcode) { @@ -368,7 +375,7 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, if (IsHazard(&*I)) return WaitStates; - if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr()) + if (I->isInlineAsm() || I->isMetaInstruction()) continue; WaitStates += SIInstrInfo::getNumWaitStates(*I); diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index ff9228e2dea4a..1df86e7ca6b20 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -114,7 +114,7 @@ GCNNSAReassign::tryAssignRegisters(SmallVectorImpl &Intervals, unsigned NumRegs = Intervals.size(); for (unsigned N = 0; N < NumRegs; ++N) - if (VRM->hasPhys(Intervals[N]->reg)) + if (VRM->hasPhys(Intervals[N]->reg())) LRM->unassign(*Intervals[N]); for (unsigned N = 0; N < NumRegs; ++N) @@ -302,14 +302,15 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI << "\tOriginal allocation:\t"; - for(auto *LI : Intervals) - dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI); + for (auto *LI + : Intervals) dbgs() + << " " << llvm::printReg((VRM->getPhys(LI->reg())), TRI); dbgs() << '\n'); bool Success = scavengeRegs(Intervals); if (!Success) { LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n"); - if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation. + if (VRM->hasPhys(Intervals.back()->reg())) // Did not change allocation. continue; } else { // Check we did not make it worse for other instructions. 
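For context on the LiveInterval::reg() accessor changes above: GCNNSAReassign::tryAssignRegisters follows an unassign/try/roll-back discipline, releasing the current physical assignments, attempting a consecutive run, and restoring an allocation on failure. A standalone sketch of that discipline under toy types (the snapshot-based rollback below is a simplification, assuming a map-based allocator rather than LLVM's LiveRegMatrix/VirtRegMap):

#include <iostream>
#include <map>
#include <vector>

using VirtReg = int;
using PhysReg = int;

struct ToyAllocator {
  std::map<VirtReg, PhysReg> Map;
  bool hasPhys(VirtReg V) const { return Map.count(V) != 0; }
  void unassign(VirtReg V) { Map.erase(V); }
  bool tryAssign(VirtReg V, PhysReg P) {
    for (auto &KV : Map)
      if (KV.second == P)
        return false; // physical register already taken
    Map[V] = P;
    return true;
  }
};

// Try to move Intervals onto a consecutive run starting at Base; restore the
// saved assignment if any register in the sequence cannot be placed.
bool tryAssignConsecutive(ToyAllocator &LRM, std::vector<VirtReg> &Intervals,
                          PhysReg Base) {
  std::map<VirtReg, PhysReg> Saved = LRM.Map;
  for (VirtReg V : Intervals)
    if (LRM.hasPhys(V))
      LRM.unassign(V);
  for (size_t I = 0; I < Intervals.size(); ++I) {
    if (!LRM.tryAssign(Intervals[I], Base + (PhysReg)I)) {
      LRM.Map = Saved; // roll back, as the pass does on failure
      return false;
    }
  }
  return true;
}

int main() {
  ToyAllocator LRM;
  LRM.Map = {{10, 3}, {11, 7}}; // scattered, not a valid sequential form
  std::vector<VirtReg> Ivals = {10, 11};
  std::cout << tryAssignConsecutive(LRM, Ivals, /*Base=*/4) << '\n'; // 1
}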
@@ -328,7 +329,7 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { if (!Success) { for (unsigned I = 0; I < Info->VAddrDwords; ++I) - if (VRM->hasPhys(Intervals[I]->reg)) + if (VRM->hasPhys(Intervals[I]->reg())) LRM->unassign(*Intervals[I]); for (unsigned I = 0; I < Info->VAddrDwords; ++I) @@ -339,11 +340,12 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { C.second = true; ++NumNSAConverted; - LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t [" - << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI) - << " : " - << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI) - << "]\n"); + LLVM_DEBUG( + dbgs() << "\tNew allocation:\t\t [" + << llvm::printReg((VRM->getPhys(Intervals.front()->reg())), TRI) + << " : " + << llvm::printReg((VRM->getPhys(Intervals.back()->reg())), TRI) + << "]\n"); Changed = true; } diff --git a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp index 1c940428273cb..92d4a64624793 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp @@ -650,7 +650,7 @@ unsigned GCNRegBankReassign::computeStallCycles(Register SrcReg, Register Reg, unsigned GCNRegBankReassign::scavengeReg(LiveInterval &LI, unsigned Bank, unsigned SubReg) const { - const TargetRegisterClass *RC = MRI->getRegClass(LI.reg); + const TargetRegisterClass *RC = MRI->getRegClass(LI.reg()); unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs : MaxNumSGPRs; unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0 diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 687cfef4559f3..1836237c8df56 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -40,7 +40,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT, HasAggressiveSymbolFolding = true; COMMDirectiveAlignmentIsInBytes = false; HasNoDeadStrip = true; - WeakRefDirective = ".weakref\t"; //===--- Dwarf Emission Directives -----------------------------------===// SupportsDebugInformation = true; DwarfRegNumForCFI = true; diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index ba7d9ad2eda1a..c223e1a8bc265 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -708,6 +708,55 @@ multiclass MIMG_Gather op, AMDGPUSampleVariant sample, bit wqm = 0, multiclass MIMG_Gather_WQM op, AMDGPUSampleVariant sample> : MIMG_Gather; +class MIMG_IntersectRay_gfx10 + : MIMG_gfx10 { + + let InOperandList = !con((ins AddrRC:$vaddr0, SReg_128:$srsrc), + !if(!eq(A16,1), (ins GFX10A16:$a16), (ins))); + let AsmString = opcode#" $vdata, $vaddr0, $srsrc"#!if(!eq(A16,1), "$a16", ""); + + let nsa = 0; +} + +class MIMG_IntersectRay_nsa_gfx10 + : MIMG_nsa_gfx10 { + let InOperandList = !con(nsah.AddrIns, + (ins SReg_128:$srsrc), + !if(!eq(A16,1), (ins GFX10A16:$a16), (ins))); + let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc"#!if(!eq(A16,1), "$a16", ""); +} + +multiclass MIMG_IntersectRay { + def "" : MIMGBaseOpcode; + let SubtargetPredicate = HasGFX10_BEncoding, + AssemblerPredicate = HasGFX10_BEncoding, + AsmMatchConverter = !if(!eq(A16,1), "cvtIntersectRay", ""), + dmask = 0xf, + unorm = 1, + d16 = 0, + glc = 0, + slc = 0, + dlc = 0, + tfe = 0, + lwe = 0, + r128 = 1, + ssamp = 0, + dim = {0, 0, 0}, + a16 = A16, + d16 = 0, + BaseOpcode = !cast(NAME), + 
VDataDwords = 4 in { + // TODO: MIMGAddrSize will choose VReg_512 which is a 16 register tuple, + // when we only need 9, 11 or 12 depending on A16 field and ptr size. + def "_sa" : MIMG_IntersectRay_gfx10.RegClass, A16> { + let VAddrDwords = !srl(MIMGAddrSize.RegClass.Size, 5); + } + def _nsa : MIMG_IntersectRay_nsa_gfx10 { + let VAddrDwords = num_addrs; + } + } +} + //===----------------------------------------------------------------------===// // MIMG Instructions //===----------------------------------------------------------------------===// @@ -832,6 +881,11 @@ defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <0x000000ef, AMDGPUSample_c_cd_cl let SubtargetPredicate = HasGFX10_BEncoding in defm IMAGE_MSAA_LOAD : MIMG_NoSampler <0x00000080, "image_msaa_load", 1>; +defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<0xe6, "image_bvh_intersect_ray", 11, 0>; +defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<0xe6, "image_bvh_intersect_ray", 8, 1>; +defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<0xe7, "image_bvh64_intersect_ray", 12, 0>; +defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<0xe7, "image_bvh64_intersect_ray", 9, 1>; + /********** ========================================= **********/ /********** Table of dimension-aware image intrinsics **********/ /********** ========================================= **********/ diff --git a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp b/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp index 90e48c63b5dca..0a0532c629595 100644 --- a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp +++ b/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp @@ -80,9 +80,8 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) { MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe); MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16); - // Check for instructions that don't have tfe or lwe fields - // There shouldn't be any at this point. - assert( (TFE && LWE) && "Expected tfe and lwe operands in instruction"); + if (!TFE && !LWE) // intersect_ray + continue; unsigned TFEVal = TFE->getImm(); unsigned LWEVal = LWE->getImm(); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 9a30d4fd6bd4a..4df7fd85a5dde 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -192,8 +192,8 @@ static bool updateOperand(FoldCandidate &Fold, if (Fold.isImm()) { if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked && !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) && - AMDGPU::isInlinableLiteralV216(static_cast(Fold.ImmToFold), - ST.hasInv2PiInlineImm())) { + AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, + ST.hasInv2PiInlineImm())) { // Set op_sel/op_sel_hi on this operand or bail out if op_sel is // already set. unsigned Opcode = MI->getOpcode(); @@ -209,30 +209,30 @@ static bool updateOperand(FoldCandidate &Fold, ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx); MachineOperand &Mod = MI->getOperand(ModIdx); unsigned Val = Mod.getImm(); - if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1)) - return false; - // Only apply the following transformation if that operand requries - // a packed immediate. - switch (TII.get(Opcode).OpInfo[OpNo].OperandType) { - case AMDGPU::OPERAND_REG_IMM_V2FP16: - case AMDGPU::OPERAND_REG_IMM_V2INT16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: - // If upper part is all zero we do not need op_sel_hi. 
- if (!isUInt<16>(Fold.ImmToFold)) { - if (!(Fold.ImmToFold & 0xffff)) { - Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0); + if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) { + // Only apply the following transformation if that operand requires + // a packed immediate. + switch (TII.get(Opcode).OpInfo[OpNo].OperandType) { + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + // If upper part is all zero we do not need op_sel_hi. + if (!isUInt<16>(Fold.ImmToFold)) { + if (!(Fold.ImmToFold & 0xffff)) { + Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0); + Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); + Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); + return true; + } Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); + Old.ChangeToImmediate(Fold.ImmToFold & 0xffff); return true; } - Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate(Fold.ImmToFold & 0xffff); - return true; + break; + default: + break; } - break; - default: - break; } } } @@ -355,10 +355,17 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, } // Special case for s_setreg_b32 - if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) { - MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32)); - appendFoldCandidate(FoldList, MI, OpNo, OpToFold); - return true; + if (OpToFold->isImm()) { + unsigned ImmOpc = 0; + if (Opc == AMDGPU::S_SETREG_B32) + ImmOpc = AMDGPU::S_SETREG_IMM32_B32; + else if (Opc == AMDGPU::S_SETREG_B32_mode) + ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode; + if (ImmOpc) { + MI->setDesc(TII->get(ImmOpc)); + appendFoldCandidate(FoldList, MI, OpNo, OpToFold); + return true; + } } // If we are already folding into another operand of MI, then @@ -1237,6 +1244,11 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); } else { + // Skip updating the literal use if it's used in the same REG_SEQUENCE, + // since if that literal could be inlined, it's just a single use.
+ if (NonInlineUse && NonInlineUse->getParent() == UseMI && + UseMI->isRegSequence()) + continue; if (++NumLiteralUses == 1) { NonInlineUse = &*Use; NonInlineUseOpNo = OpNo; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ad9c4d0673476..aa90f537396e7 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -102,6 +102,10 @@ static cl::opt UseDivergentRegisterIndexing( cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false)); +static cl::opt EnableLowerSGPRToVGPRCopy( + "lower-sgpr-to-vgpr-copy", cl::Hidden, + cl::desc("Enable lowering copy from SGPR to VGPR"), cl::init(true)); + static bool hasFP32Denormals(const MachineFunction &MF) { const SIMachineFunctionInfo *Info = MF.getInfo(); return Info->getMode().allFP32Denormals(); @@ -546,8 +550,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote); AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32); - setOperationAction(ISD::ROTR, MVT::i16, Promote); - setOperationAction(ISD::ROTL, MVT::i16, Promote); + setOperationAction(ISD::ROTR, MVT::i16, Expand); + setOperationAction(ISD::ROTL, MVT::i16, Expand); setOperationAction(ISD::SDIV, MVT::i16, Promote); setOperationAction(ISD::UDIV, MVT::i16, Promote); @@ -806,6 +810,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3i16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom); @@ -817,6 +823,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v3i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v3f16, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom); @@ -917,15 +925,18 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, if (VT.isVector()) { EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); - if (Size == 32) - return ScalarVT.getSimpleVT(); + if (Size == 16) { + if (Subtarget->has16BitInsts()) + return VT.isInteger() ? MVT::v2i16 : MVT::v2f16; + return VT.isInteger() ? MVT::i32 : MVT::f32; + } - if (Size > 32) - return MVT::i32; + if (Size < 16) + return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32; + return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32; + } - if (Size == 16 && Subtarget->has16BitInsts()) - return VT.isInteger() ? 
MVT::v2i16 : MVT::v2f16; - } else if (VT.getSizeInBits() > 32) + if (VT.getSizeInBits() > 32) return MVT::i32; return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); @@ -942,14 +953,15 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); - if (Size == 32) + // FIXME: Should probably promote 8-bit vectors to i16. + if (Size == 16 && Subtarget->has16BitInsts()) + return (NumElts + 1) / 2; + + if (Size <= 32) return NumElts; if (Size > 32) return NumElts * ((Size + 31) / 32); - - if (Size == 16 && Subtarget->has16BitInsts()) - return (NumElts + 1) / 2; } else if (VT.getSizeInBits() > 32) return (VT.getSizeInBits() + 31) / 32; @@ -964,6 +976,16 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( unsigned NumElts = VT.getVectorNumElements(); EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); + // FIXME: We should fix the ABI to be the same on targets without 16-bit + // support, but unless we can properly handle 3-vectors, it will still be + // inconsistent. + if (Size == 16 && Subtarget->has16BitInsts()) { + RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16; + IntermediateVT = RegisterVT; + NumIntermediates = (NumElts + 1) / 2; + return NumIntermediates; + } + if (Size == 32) { RegisterVT = ScalarVT.getSimpleVT(); IntermediateVT = RegisterVT; @@ -971,20 +993,26 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( return NumIntermediates; } - if (Size > 32) { + if (Size < 16 && Subtarget->has16BitInsts()) { + // FIXME: Should probably form v2i16 pieces + RegisterVT = MVT::i16; + IntermediateVT = ScalarVT; + NumIntermediates = NumElts; + return NumIntermediates; + } + + + if (Size != 16 && Size <= 32) { RegisterVT = MVT::i32; - IntermediateVT = RegisterVT; - NumIntermediates = NumElts * ((Size + 31) / 32); + IntermediateVT = ScalarVT; + NumIntermediates = NumElts; return NumIntermediates; } - // FIXME: We should fix the ABI to be the same on targets without 16-bit - // support, but unless we can properly handle 3-vectors, it will be still be - // inconsistent. - if (Size == 16 && Subtarget->has16BitInsts()) { - RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16; + if (Size > 32) { + RegisterVT = MVT::i32; IntermediateVT = RegisterVT; - NumIntermediates = (NumElts + 1) / 2; + NumIntermediates = NumElts * ((Size + 31) / 32); return NumIntermediates; } } @@ -1121,7 +1149,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_buffer_atomic_fadd: { SIMachineFunctionInfo *MFI = MF.getInfo(); - Info.opc = ISD::INTRINSIC_VOID; + Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getOperand(0)->getType()); Info.ptrVal = MFI->getBufferPSV( *MF.getSubtarget().getInstrInfo(), @@ -1135,18 +1163,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } - case Intrinsic::amdgcn_global_atomic_fadd: { - Info.opc = ISD::INTRINSIC_VOID; - Info.memVT = MVT::getVT(CI.getOperand(0)->getType() - ->getPointerElementType()); - Info.ptrVal = CI.getOperand(0); - Info.align.reset(); - - // FIXME: Should report an atomic ordering here.
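The calling-convention hunks above all encode the same packing rule: 16-bit vector elements travel two per 32-bit register when the subtarget has 16-bit instructions, so odd element counts round up. A standalone sketch, with the hypothetical helper numRegsForVector standing in for getNumRegistersForCallingConv:

```cpp
#include <cstdio>

// Register count for a vector argument, mirroring the rule in the hunks
// above: pack 16-bit pairs, one register per 32-bit element, and split
// wider scalars into 32-bit pieces.
unsigned numRegsForVector(unsigned NumElts, unsigned ScalarBits,
                          bool Has16BitInsts) {
  if (ScalarBits == 16 && Has16BitInsts)
    return (NumElts + 1) / 2;                  // round odd sizes up to pairs
  if (ScalarBits <= 32)
    return NumElts;                            // one register per element
  return NumElts * ((ScalarBits + 31) / 32);   // dword pieces of wide scalars
}

int main() {
  printf("v3f16 -> %u regs\n", numRegsForVector(3, 16, true)); // 2
  printf("v4f16 -> %u regs\n", numRegsForVector(4, 16, true)); // 2
  printf("v3f32 -> %u regs\n", numRegsForVector(3, 32, true)); // 3
  printf("v2f64 -> %u regs\n", numRegsForVector(2, 64, true)); // 4
}
```

This is why v3f16 and v4f16 arguments end up occupying the same two registers, which is the inconsistency the FIXME above is pointing at.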
- Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - - return true; - } case Intrinsic::amdgcn_ds_append: case Intrinsic::amdgcn_ds_consume: { Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -1171,6 +1187,28 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MOVolatile; return true; } + case Intrinsic::amdgcn_global_atomic_fadd: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = CI.getOperand(0); + Info.align.reset(); + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOVolatile; + return true; + } + case Intrinsic::amdgcn_image_bvh_intersect_ray: { + SIMachineFunctionInfo *MFI = MF.getInfo(); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT? + Info.ptrVal = MFI->getImagePSV( + *MF.getSubtarget().getInstrInfo(), CI.getArgOperand(5)); + Info.align.reset(); + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MODereferenceable; + return true; + } case Intrinsic::amdgcn_ds_gws_init: case Intrinsic::amdgcn_ds_gws_barrier: case Intrinsic::amdgcn_ds_gws_sema_v: @@ -1417,8 +1455,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( } if (Size == 96) { // ds_read/write_b96 require 16-byte alignment on gfx8 and older. - bool Aligned = - Alignment >= Align(Subtarget->hasUnalignedDSAccess() ? 4 : 16); + bool Aligned = Alignment >= Align((Subtarget->hasUnalignedDSAccess() && + !Subtarget->hasLDSMisalignedBug()) + ? 4 + : 16); if (IsFast) *IsFast = Aligned; @@ -1428,8 +1468,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( // ds_read/write_b128 require 16-byte alignment on gfx8 and older, but we // can do a 8 byte aligned, 16 byte access in a single operation using // ds_read2/write2_b64. - bool Aligned = - Alignment >= Align(Subtarget->hasUnalignedDSAccess() ? 4 : 8); + bool Aligned = Alignment >= Align((Subtarget->hasUnalignedDSAccess() && + !Subtarget->hasLDSMisalignedBug()) + ? 4 + : 8); if (IsFast) *IsFast = Aligned; @@ -1661,9 +1703,9 @@ SDValue SITargetLowering::lowerKernargMemParameter( // TODO: If we passed in the base kernel offset we could have a better // alignment than 4, but we don't really need it. SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset); - SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4, + SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4), MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); + MachineMemOperand::MOInvariant); SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32); SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt); @@ -3070,8 +3112,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, MemOpChains.push_back(Cpy); } else { - SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, - Alignment ? Alignment->value() : 0); + SDValue Store = + DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment); MemOpChains.push_back(Store); } } @@ -4232,9 +4274,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( return emitGWSMemViolTestLoop(MI, BB); case AMDGPU::S_SETREG_B32: { - if (!getSubtarget()->hasDenormModeInst()) - return BB; - // Try to optimize cases that only set the denormal mode or rounding mode. 
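The ds_read/write_b96 and b128 hunks above tighten when 4-byte alignment is acceptable: unaligned DS access must be supported and the LDS-misaligned-access hardware bug must be absent. A standalone sketch of that decision, with the hypothetical predicate dsAccessIsFast and the subtarget features reduced to booleans:

```cpp
#include <cstdint>
#include <cstdio>

// Alignment rule from the allowsMisalignedMemoryAccessesImpl hunks above.
bool dsAccessIsFast(unsigned SizeInBits, uint64_t AlignBytes,
                    bool HasUnalignedDSAccess, bool HasLDSMisalignedBug) {
  bool RelaxedOK = HasUnalignedDSAccess && !HasLDSMisalignedBug;
  uint64_t Required;
  switch (SizeInBits) {
  case 96:  Required = RelaxedOK ? 4 : 16; break; // ds_read/write_b96
  case 128: Required = RelaxedOK ? 4 : 8;  break; // ds_read2/write2_b64 pair
  default:  return false; // other sizes are handled elsewhere
  }
  return AlignBytes >= Required;
}

int main() {
  printf("%d\n", dsAccessIsFast(96, 8, true, true));    // 0: bug forces 16
  printf("%d\n", dsAccessIsFast(96, 8, true, false));   // 1: 4 is enough
  printf("%d\n", dsAccessIsFast(128, 8, false, false)); // 1: 8 suffices
}
```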
// // If the s_setreg_b32 fully sets all of the bits in the rounding mode or @@ -4244,9 +4283,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( // FIXME: This could be predicates on the immediate, but tablegen doesn't // allow you to have a no side effect instruction in the output of a // sideeffecting pattern. - - // TODO: Should also emit a no side effects pseudo if only FP bits are - // touched, even if not all of them or to a variable. unsigned ID, Offset, Width; AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width); if (ID != AMDGPU::Hwreg::ID_MODE) @@ -4254,50 +4290,54 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( const unsigned WidthMask = maskTrailingOnes(Width); const unsigned SetMask = WidthMask << Offset; - unsigned SetDenormOp = 0; - unsigned SetRoundOp = 0; - - // The dedicated instructions can only set the whole denorm or round mode at - // once, not a subset of bits in either. - if (Width == 8 && (SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK | - AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask) { - // If this fully sets both the round and denorm mode, emit the two - // dedicated instructions for these. - assert(Offset == 0); - SetRoundOp = AMDGPU::S_ROUND_MODE; - SetDenormOp = AMDGPU::S_DENORM_MODE; - } else if (Width == 4) { - if ((SetMask & AMDGPU::Hwreg::FP_ROUND_MASK) == SetMask) { + + if (getSubtarget()->hasDenormModeInst()) { + unsigned SetDenormOp = 0; + unsigned SetRoundOp = 0; + + // The dedicated instructions can only set the whole denorm or round mode + // at once, not a subset of bits in either. + if (SetMask == + (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) { + // If this fully sets both the round and denorm mode, emit the two + // dedicated instructions for these. + SetRoundOp = AMDGPU::S_ROUND_MODE; + SetDenormOp = AMDGPU::S_DENORM_MODE; + } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) { SetRoundOp = AMDGPU::S_ROUND_MODE; - assert(Offset == 0); - } else if ((SetMask & AMDGPU::Hwreg::FP_DENORM_MASK) == SetMask) { + } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) { SetDenormOp = AMDGPU::S_DENORM_MODE; - assert(Offset == 4); } - } - if (SetRoundOp || SetDenormOp) { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg()); - if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) { - unsigned ImmVal = Def->getOperand(1).getImm(); - if (SetRoundOp) { - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp)) - .addImm(ImmVal & 0xf); + if (SetRoundOp || SetDenormOp) { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg()); + if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) { + unsigned ImmVal = Def->getOperand(1).getImm(); + if (SetRoundOp) { + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp)) + .addImm(ImmVal & 0xf); + + // If we also have the denorm mode, get just the denorm mode bits. + ImmVal >>= 4; + } - // If we also have the denorm mode, get just the denorm mode bits. - ImmVal >>= 4; - } + if (SetDenormOp) { + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp)) + .addImm(ImmVal & 0xf); + } - if (SetDenormOp) { - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp)) - .addImm(ImmVal & 0xf); + MI.eraseFromParent(); + return BB; } - - MI.eraseFromParent(); } } + // If only FP bits are touched, use the no side effects pseudo.
+ if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK | + AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask) + MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode)); + return BB; } default: @@ -4555,15 +4595,27 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } +// Used for D16: Casts the result of an instruction into the right vector, +// packs values if loads return unpacked values. static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked) { if (!LoadVT.isVector()) return Result; + // Cast back to the original packed type or to a larger type that is a + // multiple of 32 bits for D16. Widening the return type is required for + // legalization. + EVT FittingLoadVT = LoadVT; + if ((LoadVT.getVectorNumElements() % 2) == 1) { + FittingLoadVT = + EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(), + LoadVT.getVectorNumElements() + 1); + } + if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16. // Truncate to v2i16/v4i16. - EVT IntLoadVT = LoadVT.changeTypeToInteger(); + EVT IntLoadVT = FittingLoadVT.changeTypeToInteger(); // Workaround legalizer not scalarizing truncate after vector op // legalization but not creating intermediate vector trunc. @@ -4572,14 +4624,18 @@ static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, for (SDValue &Elt : Elts) Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt); + // Pad illegal v1i16/v3f16 to v4i16 + if ((LoadVT.getVectorNumElements() % 2) == 1) + Elts.push_back(DAG.getUNDEF(MVT::i16)); + Result = DAG.getBuildVector(IntLoadVT, DL, Elts); // Bitcast to original type (v2f16/v4f16). - return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result); + return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result); } // Cast back to the original packed type. - return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result); + return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result); } SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, @@ -4593,10 +4649,16 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, EVT LoadVT = M->getValueType(0); EVT EquivLoadVT = LoadVT; - if (Unpacked && LoadVT.isVector()) { - EquivLoadVT = LoadVT.isVector() ? - EVT::getVectorVT(*DAG.getContext(), MVT::i32, - LoadVT.getVectorNumElements()) : LoadVT; + if (LoadVT.isVector()) { + if (Unpacked) { + EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + LoadVT.getVectorNumElements()); + } else if ((LoadVT.getVectorNumElements() % 2) == 1) { + // Widen v3f16 to legal type + EquivLoadVT = + EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(), + LoadVT.getVectorNumElements() + 1); + } } // Change from v4f16/v2f16 to EquivLoadVT. @@ -4607,8 +4669,6 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops, M->getMemoryVT(), M->getMemOperand()); - if (!Unpacked) // Just adjusted the opcode.
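The D16 hunks above make the same choice in two places: what type should the load node actually carry so that it is legal. A standalone sketch of that choice, using the hypothetical helper equivalentLoadType that returns type names as strings rather than EVTs:

```cpp
#include <cstdio>
#include <string>

// Pick the type the hardware load is emitted with, following the
// adjustLoadValueType logic above: unpacked D16 subtargets load one 32-bit
// lane per element, packed subtargets widen odd element counts to stay
// legal and pad the extra lane with undef.
std::string equivalentLoadType(unsigned NumElts, bool Unpacked) {
  if (Unpacked)
    return "v" + std::to_string(NumElts) + "i32";
  if (NumElts % 2 == 1)
    return "v" + std::to_string(NumElts + 1) + "f16"; // widen v3f16 -> v4f16
  return "v" + std::to_string(NumElts) + "f16";
}

int main() {
  printf("%s\n", equivalentLoadType(3, true).c_str());  // v3i32
  printf("%s\n", equivalentLoadType(3, false).c_str()); // v4f16
  printf("%s\n", equivalentLoadType(4, false).c_str()); // v4f16
}
```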
- return Load; SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked); @@ -4812,8 +4872,9 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) { if (Res.getOpcode() == ISD::MERGE_VALUES) { // FIXME: Hacky - Results.push_back(Res.getOperand(0)); - Results.push_back(Res.getOperand(1)); + for (unsigned I = 0; I < Res.getNumOperands(); I++) { + Results.push_back(Res.getOperand(I)); + } } else { Results.push_back(Res); Results.push_back(Res.getValue(1)); @@ -5232,7 +5293,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, // be available and how do we get it? MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo, - MinAlign(64, StructOffset), + commonAlignment(Align(64), StructOffset), MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); } @@ -5843,10 +5904,18 @@ static SDValue constructRetValue(SelectionDAG &DAG, if (IsD16) Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked); - if (!ReqRetVT.isVector()) + EVT LegalReqRetVT = ReqRetVT; + if (!ReqRetVT.isVector()) { Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data); - - Data = DAG.getNode(ISD::BITCAST, DL, ReqRetVT, Data); + } else { + // We need to widen the return vector to a legal type + if ((ReqRetVT.getVectorNumElements() % 2) == 1) { + LegalReqRetVT = + EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(), + ReqRetVT.getVectorNumElements() + 1); + } + } + Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data); if (TexFail) return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL); @@ -7035,7 +7104,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_buffer_atomic_umax: case Intrinsic::amdgcn_buffer_atomic_and: case Intrinsic::amdgcn_buffer_atomic_or: - case Intrinsic::amdgcn_buffer_atomic_xor: { + case Intrinsic::amdgcn_buffer_atomic_xor: + case Intrinsic::amdgcn_buffer_atomic_fadd: { unsigned Slc = cast(Op.getOperand(6))->getZExtValue(); unsigned IdxEn = 1; if (auto Idx = dyn_cast(Op.getOperand(4))) @@ -7095,6 +7165,17 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_buffer_atomic_xor: Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; + case Intrinsic::amdgcn_buffer_atomic_fadd: + if (!Op.getValue(0).use_empty()) { + DiagnosticInfoUnsupported + NoFpRet(DAG.getMachineFunction().getFunction(), + "return versions of fp atomics not supported", + DL.getDebugLoc(), DS_Error); + DAG.getContext()->diagnose(NoFpRet); + return SDValue(); + } + Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD; + break; default: llvm_unreachable("unhandled atomic opcode"); } @@ -7102,6 +7183,10 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); } + case Intrinsic::amdgcn_raw_buffer_atomic_fadd: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); + case Intrinsic::amdgcn_struct_buffer_atomic_fadd: + return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); case Intrinsic::amdgcn_raw_buffer_atomic_swap: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP); case Intrinsic::amdgcn_raw_buffer_atomic_add: @@ -7227,6 +7312,97 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, 
M->getMemOperand()); } + case Intrinsic::amdgcn_global_atomic_fadd: { + if (!Op.getValue(0).use_empty()) { + DiagnosticInfoUnsupported + NoFpRet(DAG.getMachineFunction().getFunction(), + "return versions of fp atomics not supported", + DL.getDebugLoc(), DS_Error); + DAG.getContext()->diagnose(NoFpRet); + return SDValue(); + } + MemSDNode *M = cast(Op); + SDValue Ops[] = { + M->getOperand(0), // Chain + M->getOperand(2), // Ptr + M->getOperand(3) // Value + }; + + EVT VT = Op.getOperand(3).getValueType(); + return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT, + DAG.getVTList(VT, MVT::Other), Ops, + M->getMemOperand()); + } + case Intrinsic::amdgcn_image_bvh_intersect_ray: { + SDLoc DL(Op); + MemSDNode *M = cast(Op); + SDValue NodePtr = M->getOperand(2); + SDValue RayExtent = M->getOperand(3); + SDValue RayOrigin = M->getOperand(4); + SDValue RayDir = M->getOperand(5); + SDValue RayInvDir = M->getOperand(6); + SDValue TDescr = M->getOperand(7); + + assert(NodePtr.getValueType() == MVT::i32 || + NodePtr.getValueType() == MVT::i64); + assert(RayDir.getValueType() == MVT::v4f16 || + RayDir.getValueType() == MVT::v4f32); + + bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16; + bool Is64 = NodePtr.getValueType() == MVT::i64; + unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa + : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa + : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa + : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; + + SmallVector Ops; + + auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) { + SmallVector Lanes; + DAG.ExtractVectorElements(Op, Lanes, 0, 3); + if (Lanes[0].getValueSizeInBits() == 32) { + for (unsigned I = 0; I < 3; ++I) + Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I])); + } else { + if (IsAligned) { + Ops.push_back( + DAG.getBitcast(MVT::i32, + DAG.getBuildVector(MVT::v2f16, DL, + { Lanes[0], Lanes[1] }))); + Ops.push_back(Lanes[2]); + } else { + SDValue Elt0 = Ops.pop_back_val(); + Ops.push_back( + DAG.getBitcast(MVT::i32, + DAG.getBuildVector(MVT::v2f16, DL, + { Elt0, Lanes[0] }))); + Ops.push_back( + DAG.getBitcast(MVT::i32, + DAG.getBuildVector(MVT::v2f16, DL, + { Lanes[1], Lanes[2] }))); + } + } + }; + + if (Is64) + DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0, 2); + else + Ops.push_back(NodePtr); + + Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent)); + packLanes(RayOrigin, true); + packLanes(RayDir, true); + packLanes(RayInvDir, false); + Ops.push_back(TDescr); + if (IsA16) + Ops.push_back(DAG.getTargetConstant(1, DL, MVT::i1)); + Ops.push_back(M->getChain()); + + auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops); + MachineMemOperand *MemRef = M->getMemOperand(); + DAG.setNodeMemRefs(NewNode, {MemRef}); + return SDValue(NewNode, 0); + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrID)) @@ -7277,17 +7453,28 @@ SDValue SITargetLowering::handleD16VData(SDValue VData, return VData; SDLoc DL(VData); - assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16"); + unsigned NumElements = StoreVT.getVectorNumElements(); if (Subtarget->hasUnpackedD16VMem()) { // We need to unpack the packed data to store. 
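The packLanes lambda above is easiest to see with concrete bit patterns. Below is a standalone C++ model of its packing discipline, not the SelectionDAG code: lanes are plain uint16_t values instead of f16 SDValues, and the Ops vector stands in for the instruction's address operands.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Pack three 16-bit lanes into 32-bit dwords. An "aligned" triple starts a
// fresh dword and leaves its third lane half-filled; an unaligned triple
// (the ray_inv_dir operand) first completes that half-filled dword.
void packLanes(std::vector<uint32_t> &Ops, const uint16_t Lanes[3],
               bool IsAligned) {
  if (IsAligned) {
    Ops.push_back(uint32_t(Lanes[0]) | uint32_t(Lanes[1]) << 16);
    Ops.push_back(Lanes[2]); // high half left open for the next packer
  } else {
    uint32_t Prev = Ops.back();
    Ops.pop_back();
    Ops.push_back(Prev | uint32_t(Lanes[0]) << 16); // finish previous dword
    Ops.push_back(uint32_t(Lanes[1]) | uint32_t(Lanes[2]) << 16);
  }
}

int main() {
  std::vector<uint32_t> Ops;
  const uint16_t Origin[3] = {0x1, 0x2, 0x3};
  const uint16_t Dir[3] = {0x4, 0x5, 0x6};
  const uint16_t InvDir[3] = {0x7, 0x8, 0x9};
  packLanes(Ops, Origin, true);
  packLanes(Ops, Dir, true);     // leaves Dir.z half-filled
  packLanes(Ops, InvDir, false); // InvDir.x completes it
  for (uint32_t D : Ops)
    printf("0x%08x\n", D);
}
```

This is why only the A16 form needs the IsAligned distinction at all; in the 32-bit case every lane is already a full dword and the lambda simply bitcasts each one.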
EVT IntStoreVT = StoreVT.changeTypeToInteger(); SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); - EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, - StoreVT.getVectorNumElements()); + EVT EquivStoreVT = + EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements); SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData); return DAG.UnrollVectorOp(ZExt.getNode()); + } else if (NumElements == 3) { + EVT IntStoreVT = + EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits()); + SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); + + EVT WidenedStoreVT = EVT::getVectorVT( + *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1); + EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(), + WidenedStoreVT.getStoreSizeInBits()); + SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData); + return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt); } assert(isTypeLegal(StoreVT)); @@ -7467,8 +7654,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, EVT VDataVT = VData.getValueType(); EVT EltType = VDataVT.getScalarType(); bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); - if (IsD16) + if (IsD16) { VData = handleD16VData(VData, DAG); + VDataVT = VData.getValueType(); + } if (!isTypeLegal(VDataVT)) { VData = @@ -7512,8 +7701,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, EVT EltType = VDataVT.getScalarType(); bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); - if (IsD16) + if (IsD16) { VData = handleD16VData(VData, DAG); + VDataVT = VData.getValueType(); + } if (!isTypeLegal(VDataVT)) { VData = @@ -7548,39 +7739,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } - case Intrinsic::amdgcn_raw_buffer_atomic_fadd: - return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); - case Intrinsic::amdgcn_struct_buffer_atomic_fadd: - return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); - case Intrinsic::amdgcn_buffer_atomic_fadd: { - unsigned Slc = cast(Op.getOperand(6))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast(Op.getOperand(4))) - IdxEn = Idx->getZExtValue() != 0; - SDValue Ops[] = { - Chain, - Op.getOperand(2), // vdata - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - SDValue(), // voffset -- will be set by setBufferOffsets - SDValue(), // soffset -- will be set by setBufferOffsets - SDValue(), // offset -- will be set by setBufferOffsets - DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen - }; - unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); - // We don't know the offset if vindex is non-zero, so clear it. 
- if (IdxEn) - Offset = 0; - EVT VT = Op.getOperand(2).getValueType(); - - auto *M = cast(Op); - M->getMemOperand()->setOffset(Offset); - - return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_FADD, DL, - Op->getVTList(), Ops, VT, - M->getMemOperand()); - } case Intrinsic::amdgcn_end_cf: return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, Op->getOperand(2), Chain), 0); @@ -10890,7 +11048,8 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, unsigned Opcode = Node->getMachineOpcode(); if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() && - !TII->isGather4(Opcode)) { + !TII->isGather4(Opcode) && + AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) != -1) { return adjustWritemask(Node, DAG); } @@ -11330,6 +11489,60 @@ bool SITargetLowering::checkAsmConstraintValA(SDValue Op, return false; } +// Lower a COPY from SGPR to VGPR into a real move instruction, as it is a +// real data transfer rather than a plain COPY. +static void lowerSGPRToVGPRCopy(MachineFunction &MF, MachineRegisterInfo &MRI, + const SIRegisterInfo &TRI, + const SIInstrInfo &TII) { + for (MachineBasicBlock &MBB : MF) { + for (auto BI = MBB.begin(), BE = MBB.end(); BI != BE; /*EMPTY*/) { + MachineInstr &MI = *BI++; + + auto IsSGPRToVGPRCopy = [&MRI, &TRI](const MachineInstr &MI) { + if (!MI.isCopy()) + return false; + + auto DstReg = MI.getOperand(0).getReg(); + auto SrcReg = MI.getOperand(1).getReg(); + const auto *DstRC = DstReg.isVirtual() ? MRI.getRegClass(DstReg) + : TRI.getPhysRegClass(DstReg); + const auto *SrcRC = SrcReg.isVirtual() ? MRI.getRegClass(SrcReg) + : TRI.getPhysRegClass(SrcReg); + return (DstRC == &AMDGPU::VGPR_32RegClass || + DstRC == &AMDGPU::VReg_64RegClass) && + (SrcRC == &AMDGPU::SGPR_32RegClass || + SrcRC == &AMDGPU::SGPR_64RegClass); + }; + + // Skip if it's not a copy from SGPR to VGPR. + if (!IsSGPRToVGPRCopy(MI)) + continue; + + const MachineOperand &Src = MI.getOperand(1); + // FIXME: Need subreg support. + if (Src.getSubReg() != AMDGPU::NoSubRegister) + continue; + // FIXME: Need undef support. + if (Src.getReg().isVirtual()) { + auto *DefMI = MRI.getVRegDef(Src.getReg()); + if (!DefMI || DefMI->isImplicitDef()) + continue; + } + + LLVM_DEBUG(dbgs() << "Lower COPY: " << MI); + unsigned Opcode = (TRI.getRegSizeInBits(Src.getReg(), MRI) == 64) + ? AMDGPU::V_MOV_B64_PSEUDO + : AMDGPU::V_MOV_B32_e32; + auto DstReg = MI.getOperand(0).getReg(); + auto MIB = BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(Opcode), DstReg) + .add(MI.getOperand(1)); + (void)MIB; + LLVM_DEBUG(dbgs() << " to: " << *MIB.getInstr()); + MI.eraseFromParent(); + } + } +} + // Figure out which registers should be reserved for stack access. Only after // the function is legalized do we know all of the non-spill stack objects or if // calls are present. @@ -11338,6 +11551,10 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { SIMachineFunctionInfo *Info = MF.getInfo(); const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + + if (EnableLowerSGPRToVGPRCopy) + lowerSGPRToVGPRCopy(MF, MRI, *TRI, *TII); if (Info->isEntryFunction()) { // Callable functions have fixed registers used for stack access.
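The shape of lowerSGPRToVGPRCopy above reduces to two small decisions: which copies qualify, and which move opcode replaces them. A standalone sketch with register classes reduced to an enum (isLowerableCopy and movOpcode are hypothetical names):

```cpp
#include <cstdio>

enum RegClass { SGPR_32, SGPR_64, VGPR_32, VReg_64, OtherRC };

// Filter mirroring IsSGPRToVGPRCopy above: only full-register 32/64-bit
// SGPR-to-VGPR copies qualify; subregister copies are skipped (the patch
// leaves those as a FIXME).
bool isLowerableCopy(RegClass Dst, RegClass Src, bool HasSubReg) {
  if (HasSubReg)
    return false;
  return (Dst == VGPR_32 || Dst == VReg_64) &&
         (Src == SGPR_32 || Src == SGPR_64);
}

// The replacement opcode is chosen purely by source width.
const char *movOpcode(unsigned SrcSizeInBits) {
  return SrcSizeInBits == 64 ? "V_MOV_B64_PSEUDO" : "V_MOV_B32_e32";
}

int main() {
  printf("%d %s\n", isLowerableCopy(VGPR_32, SGPR_32, false), movOpcode(32));
  printf("%d %s\n", isLowerableCopy(VReg_64, SGPR_64, false), movOpcode(64));
  printf("%d\n", isLowerableCopy(VGPR_32, SGPR_32, true)); // subreg: skipped
}
```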
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 3e8220ad9db22..6bfa33cef7ced 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -90,7 +90,6 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 87ef8bcaa92e4..ae1f6e212d98e 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -855,7 +855,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( setForceEmitWaitcnt(); bool IsForceEmitWaitcnt = isForceEmitWaitcnt(); - if (MI.isDebugInstr()) + if (MI.isMetaInstruction()) return false; AMDGPU::Waitcnt Wait; @@ -1026,8 +1026,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( continue; RegInterval Interval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I); + + const bool IsVGPR = TRI->isVGPR(*MRI, Op.getReg()); for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - if (TRI->isVGPR(*MRI, Op.getReg())) { + if (IsVGPR) { // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the // previous write and this write are the same type of VMEM // instruction, in which case they're guaranteed to write their diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 9aa28cff10868..21ad82d546612 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3070,9 +3070,6 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, // Target-independent instructions do not have an implicit-use of EXEC, even // when they operate on VGPRs. Treating EXEC modifications as scheduling // boundaries prevents incorrect movements of such instructions. - - // TODO: Don't treat setreg with known constant that only changes MODE as - // barrier. 
return MI.modifiesRegister(AMDGPU::EXEC, &RI) || MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || MI.getOpcode() == AMDGPU::S_SETREG_B32 || diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 13957a6c1f628..7fdbe2afa033c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -173,18 +173,6 @@ class SDBufferAtomic : SDNode ; -class SDBufferAtomicNoRtn : SDNode , // rsrc - SDTCisVT<2, i32>, // vindex(VGPR) - SDTCisVT<3, i32>, // voffset(VGPR) - SDTCisVT<4, i32>, // soffset(SGPR) - SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // cachepolicy(imm) - SDTCisVT<7, i1>]>, // idxen(imm) - [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] ->; - def SIbuffer_atomic_swap : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SWAP">; def SIbuffer_atomic_add : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_ADD">; def SIbuffer_atomic_sub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SUB">; @@ -198,7 +186,7 @@ def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">; def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">; def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">; def SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">; -def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD">; +def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">; def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", SDTypeProfile<1, 9, @@ -316,18 +304,6 @@ defm atomic_load_fmax_#as : binary_atomic_op; } // End let AddressSpaces = ... } // End foreach AddrSpace -def atomic_fadd_global_noret_impl : PatFrag< - (ops node:$ptr, node:$value), - (atomic_load_fadd node:$ptr, node:$value)> { - // FIXME: Move this - let MemoryVT = f32; - let IsAtomic = 1; - let AddressSpaces = StoreAddress_global.AddrSpaces; -} - -def atomic_fadd_global_noret : PatFrags<(ops node:$src0, node:$src1), - [(int_amdgcn_global_atomic_fadd node:$src0, node:$src1), - (atomic_fadd_global_noret_impl node:$src0, node:$src1)]>; //===----------------------------------------------------------------------===// // SDNodes PatFrags for loads/stores with a glue input. @@ -562,6 +538,48 @@ def si_setcc_uniform : PatFrag < return true; }]>; +//===----------------------------------------------------------------------===// +// SDNodes PatFrags for a16 loads and stores with 3 components. +// v3f16/v3i16 is widened to v4f16/v4i16, so we need to match on the memory +// load/store size. 
+//===----------------------------------------------------------------------===// + +class mubuf_intrinsic_load : PatFrag < + (ops node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, + node:$auxiliary, node:$idxen), + (name node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, + node:$auxiliary, node:$idxen)> { + let IsLoad = 1; + let MemoryVT = vt; +} + +class mubuf_intrinsic_store : PatFrag < + (ops node:$vdata, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, + node:$auxiliary, node:$idxen), + (name node:$vdata, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, + node:$auxiliary, node:$idxen)> { + let IsStore = 1; + let MemoryVT = vt; +} + +class mtbuf_intrinsic_load : PatFrag < + (ops node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, + node:$format, node:$auxiliary, node:$idxen), + (name node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, + node:$format, node:$auxiliary, node:$idxen)> { + let IsLoad = 1; + let MemoryVT = vt; +} + +class mtbuf_intrinsic_store : PatFrag < + (ops node:$vdata, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, + node:$format, node:$auxiliary, node:$idxen), + (name node:$vdata, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, + node:$format, node:$auxiliary, node:$idxen)> { + let IsStore = 1; + let MemoryVT = vt; +} + //===----------------------------------------------------------------------===// // SDNodes PatFrags for d16 loads //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 2ac5f6be65802..47b27d63408dd 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2040,8 +2040,6 @@ def : GCNPat < SRCMODS.NONE, $src2) >; -// COPY is workaround tablegen bug from multiple outputs -// from S_LSHL_B32's multiple outputs from implicit scc def. 
def : GCNPat < (v2i16 (build_vector (i16 0), (i16 SReg_32:$src1))), (S_LSHL_B32 SReg_32:$src1, (i16 16)) @@ -2435,7 +2433,7 @@ def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction; -def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction<1/*NoRtn*/>; +def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 3d612d56a9663..576828c9c8dfd 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -393,6 +393,15 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::DS_WRITE_B64: case AMDGPU::DS_WRITE_B64_gfx9: return DS_WRITE; + case AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa: + case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa: + case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa: + case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa: + case AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa: + case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa: + case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa: + case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa: + return UNKNOWN; } } diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp index 0e162ac42c111..a2e1486e4b9a6 100644 --- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp +++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp @@ -242,8 +242,10 @@ void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB, Status IPChange; for (MachineInstr &MI : MBB) { Status InstrMode = getInstructionMode(MI, TII); - if ((MI.getOpcode() == AMDGPU::S_SETREG_B32) || - (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32)) { + if (MI.getOpcode() == AMDGPU::S_SETREG_B32 || + MI.getOpcode() == AMDGPU::S_SETREG_B32_mode || + MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || + MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32_mode) { // We preserve any explicit mode register setreg instruction we encounter, // as we assume it has been inserted by a higher authority (this is // likely to be a very rare occurrence). @@ -267,7 +269,8 @@ void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB, // If this is an immediate then we know the value being set, but if it is // not an immediate then we treat the modified bits of the mode register // as unknown. 
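Both the custom inserter earlier in the patch and the SIModeRegister changes here lean on AMDGPU::Hwreg::decodeHwreg, and the pass then recomputes the known MODE value as (Val << Offset) & Mask. A standalone sketch of that decoding, assuming the documented AMDGPU hwreg immediate layout (register ID in bits [5:0], bit offset in [10:6], width minus one in [15:11]):

```cpp
#include <cstdio>

// Decode an s_setreg-style hwreg immediate under the assumed layout.
void decodeHwreg(unsigned Simm16, unsigned &Id, unsigned &Offset,
                 unsigned &Width) {
  Id = Simm16 & 0x3f;                  // bits [5:0]
  Offset = (Simm16 >> 6) & 0x1f;       // bits [10:6]
  Width = ((Simm16 >> 11) & 0x1f) + 1; // bits [15:11] hold width-1
}

int main() {
  unsigned Id, Offset, Width;
  // hwreg(HW_REG_MODE, 0, 4): MODE is ID 1; select the 4-bit round field.
  decodeHwreg((3u << 11) | (0u << 6) | 1u, Id, Offset, Width);
  unsigned Mask = ((1u << Width) - 1) << Offset;
  unsigned Mode = (0xeu << Offset) & Mask; // known value: (Val << Offset) & Mask
  printf("id=%u offset=%u width=%u mask=0x%x mode=0x%x\n", Id, Offset, Width,
         Mask, Mode);
}
```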
- if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32) { + if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || + MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32_mode) { unsigned Val = TII->getNamedOperand(MI, AMDGPU::OpName::imm)->getImm(); unsigned Mode = (Val << Offset) & Mask; Status Setreg = Status(Mask, Mode); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 8a9899988b4c9..c3ffd5b7d6147 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -503,8 +503,10 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, #endif assert(FIOp && FIOp->isFI() && "frame index must be address operand"); assert(TII->isMUBUF(MI)); - assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() == - MF->getInfo()->getStackPtrOffsetReg() && + + MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset); + assert(SOffset->getReg() == + MF->getInfo()->getStackPtrOffsetReg() && "should only be seeing stack pointer offset relative FrameIndex"); MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); @@ -513,6 +515,10 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, FIOp->ChangeToRegister(BaseReg, false); OffsetOp->setImm(NewOffset); + + // The move materializing the base address will be an absolute stack address, + // so clear the base offset. + SOffset->ChangeToImmediate(0); } bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index 932381c99e0b0..d6dff4b9c8899 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -104,6 +104,9 @@ def HWVALU : ProcResource<1> { def HWRC : ProcResource<1> { // Register destination cache let BufferSize = 1; } +def HWXDL : ProcResource<1> { // MFMA CU + let BufferSize = 0; +} class HWWriteRes resources, int latency> : WriteRes { @@ -138,9 +141,13 @@ multiclass SICommonWriteRes { def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; - def : HWVALUWriteRes; - def : HWVALUWriteRes; - def : HWVALUWriteRes; + + let ResourceCycles = [2] in + def : HWWriteRes; + let ResourceCycles = [8] in + def : HWWriteRes; + let ResourceCycles = [16] in + def : HWWriteRes; def : ReadAdvance; def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32$")>; diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 8f718ce6cb466..0be245f7698e6 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -272,8 +272,8 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { // enabled int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe); int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe); - unsigned TFEVal = MI.getOperand(TFEIdx).getImm(); - unsigned LWEVal = MI.getOperand(LWEIdx).getImm(); + unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm(); + unsigned LWEVal = (LWEIdx == -1) ? 
0 : MI.getOperand(LWEIdx).getImm(); int ToUntie = -1; if (TFEVal || LWEVal) { // TFE/LWE is enabled so we need to deal with an implicit tied operand diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index df2e18fd44146..e65096b7448b4 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -813,8 +813,6 @@ def S_CBRANCH_I_FORK : SOPK_Pseudo < "$sdst, $simm16" >; -let hasSideEffects = 1 in { - let mayLoad = 1 in { // s_getreg_b32 should use hasSideEffects = 1 for tablegen to allow // its use in the readcyclecounter selection. @@ -825,40 +823,55 @@ def S_GETREG_B32 : SOPK_Pseudo < "$sdst, $simm16", [(set i32:$sdst, (int_amdgcn_s_getreg (i32 timm:$simm16)))]> { let SOPKZext = 1; + let hasSideEffects = 1; } -} +} // End mayLoad = 1 -let mayLoad = 0, mayStore =0 in { +let mayLoad = 0, mayStore = 0, Defs = [MODE], Uses = [MODE] in { // FIXME: Need to truncate immediate to 16-bits. -def S_SETREG_B32 : SOPK_Pseudo < +class S_SETREG_B32_Pseudo pattern=[]> : SOPK_Pseudo < "s_setreg_b32", (outs), (ins SReg_32:$sdst, hwreg:$simm16), "$simm16, $sdst", - [(int_amdgcn_s_setreg (i32 timm:$simm16), i32:$sdst)]> { + pattern>; +def S_SETREG_B32 : S_SETREG_B32_Pseudo < + [(int_amdgcn_s_setreg (i32 timm:$simm16), i32:$sdst)]> { // Use custom inserter to optimize some cases to - // S_DENORM_MODE/S_ROUND_MODE. + // S_DENORM_MODE/S_ROUND_MODE/S_SETREG_B32_mode. let usesCustomInserter = 1; - let Defs = [MODE]; - let Uses = [MODE]; + let hasSideEffects = 1; +} + +// Variant of SETREG that is guaranteed to only touch FP bits in the MODE +// register, so doesn't have unmodeled side effects. +def S_SETREG_B32_mode : S_SETREG_B32_Pseudo { + let hasSideEffects = 0; } // FIXME: Not on SI? //def S_GETREG_REGRD_B32 : SOPK_32 , "s_getreg_regrd_b32">; -def S_SETREG_IMM32_B32 : SOPK_Pseudo < +class S_SETREG_IMM32_B32_Pseudo : SOPK_Pseudo < "s_setreg_imm32_b32", (outs), (ins i32imm:$imm, hwreg:$simm16), "$simm16, $imm"> { let Size = 8; // Unlike every other SOPK instruction. let has_sdst = 0; - let Defs = [MODE]; - let Uses = [MODE]; } +def S_SETREG_IMM32_B32 : S_SETREG_IMM32_B32_Pseudo { + let hasSideEffects = 1; } -} // End hasSideEffects = 1 + +// Variant of SETREG_IMM32 that is guaranteed to only touch FP bits in the MODE +// register, so doesn't have unmodeled side effects. 
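The _mode pseudos defined here pair up with the SIFoldOperands change earlier in this patch: when the setreg's source is an immediate, each register-input form is rewritten to its immediate form, and the _mode flavor must map to the _mode immediate form so the "only touches FP MODE bits" guarantee survives the fold. A small sketch of that mapping (immediateForm is a hypothetical name; the opcode strings stand in for the enum values):

```cpp
#include <cstdio>
#include <string>

// Map a register-input setreg to its immediate form, keeping the _mode
// suffix intact, as tryAddToFoldList does above.
std::string immediateForm(const std::string &Opc) {
  if (Opc == "S_SETREG_B32")
    return "S_SETREG_IMM32_B32";
  if (Opc == "S_SETREG_B32_mode")
    return "S_SETREG_IMM32_B32_mode";
  return ""; // no immediate form: nothing to fold
}

int main() {
  printf("%s\n", immediateForm("S_SETREG_B32").c_str());
  printf("%s\n", immediateForm("S_SETREG_B32_mode").c_str());
}
```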
+def S_SETREG_IMM32_B32_mode : S_SETREG_IMM32_B32_Pseudo { + let hasSideEffects = 0; +} + +} // End mayLoad = 0, mayStore = 0, Defs = [MODE], Uses = [MODE] class SOPK_WAITCNT pat=[]> : SOPK_Pseudo< diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index dd662d9d06f24..92cbbf336f937 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1380,6 +1380,19 @@ bool isInlinableIntLiteralV216(int32_t Literal) { return Lo16 == Hi16 && isInlinableIntLiteral(Lo16); } +bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) { + assert(HasInv2Pi); + + int16_t Lo16 = static_cast(Literal); + if (isInt<16>(Literal) || isUInt<16>(Literal)) + return true; + + int16_t Hi16 = static_cast(Literal >> 16); + if (!(Literal & 0xffff)) + return true; + return Lo16 == Hi16; +} + bool isArgPassedInSGPR(const Argument *A) { const Function *F = A->getParent(); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 9c66b27733dbe..c5feadb98f13e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -693,6 +693,9 @@ bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi); LLVM_READNONE bool isInlinableIntLiteralV216(int32_t Literal); +LLVM_READNONE +bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi); + bool isArgPassedInSGPR(const Argument *Arg); LLVM_READONLY diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 3048bcc610c76..c4546f989c70d 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -605,16 +605,24 @@ class ThreeOpFrag : PatFrag< let PredicateCodeUsesOperands = 1; // The divergence predicate is irrelevant in GlobalISel, as we have - // proper register bank checks. We also force all VOP instruction - // operands to VGPR, so we should not need to check the constant bus - // restriction. + // proper register bank checks. We just need to verify the constant + // bus restriction when all the sources are considered. // // FIXME: With unlucky SGPR operands, we could penalize code by // blocking folding SGPR->VGPR copies later. // FIXME: There's no register bank verifier - // FIXME: Should add a way for the emitter to recognize this is a - // trivially true predicate to eliminate the check. - let GISelPredicateCode = [{return true;}]; + let GISelPredicateCode = [{ + const int ConstantBusLimit = Subtarget->getConstantBusLimit(AMDGPU::V_ADD3_U32); + int ConstantBusUses = 0; + for (unsigned i = 0; i < 3; ++i) { + const RegisterBank *RegBank = RBI.getRegBank(Operands[i]->getReg(), MRI, TRI); + if (RegBank->getID() == AMDGPU::SGPRRegBankID) { + if (++ConstantBusUses > ConstantBusLimit) + return false; + } + } + return true; + }]; } let SubtargetPredicate = isGFX9Plus in { diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index dd7b520effa86..d81c8efa1597d 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -5678,6 +5678,7 @@ struct OutlinerCosts { const int FrameRegSave; const int CallDefault; const int FrameDefault; + const int SaveRestoreLROnStack; OutlinerCosts(const ARMSubtarget &target) : CallTailCall(target.isThumb() ? 4 : 4), @@ -5689,7 +5690,8 @@ struct OutlinerCosts { CallRegSave(target.isThumb() ? 8 : 12), FrameRegSave(target.isThumb() ? 
2 : 4), CallDefault(target.isThumb() ? 8 : 12), - FrameDefault(target.isThumb() ? 2 : 4) {} + FrameDefault(target.isThumb() ? 2 : 4), + SaveRestoreLROnStack(target.isThumb() ? 8 : 8) {} }; unsigned @@ -5830,10 +5832,28 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( C.setCallInfo(MachineOutlinerDefault, Costs.CallDefault); SetCandidateCallInfo(MachineOutlinerDefault, Costs.CallDefault); CandidatesWithoutStackFixups.push_back(C); - } - else + } else return outliner::OutlinedFunction(); } + + // Does every candidate's MBB contain a call? If so, then we might have a + // call in the range. + if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { + // Check if the range contains a call. These require a save + restore of + // the link register. + if (std::any_of(FirstCand.front(), FirstCand.back(), + [](const MachineInstr &MI) { return MI.isCall(); })) + NumBytesToCreateFrame += Costs.SaveRestoreLROnStack; + + // Handle the last instruction separately. If it is a tail call, then the + // last instruction is a call, and we don't want to save + restore in this + // case. However, it could be possible that the last instruction is a + // call without it being valid to tail call this sequence. We should + // consider this as well. + else if (FrameID != MachineOutlinerThunk && + FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall()) + NumBytesToCreateFrame += Costs.SaveRestoreLROnStack; + } RepeatedSequenceLocs = CandidatesWithoutStackFixups; } @@ -5973,6 +5993,23 @@ ARMBaseInstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, return outliner::InstrType::Illegal; if (MI.isCall()) { + // Get the function associated with the call. Look at each operand and find + // the one that represents the callee and get its name. + const Function *Callee = nullptr; + for (const MachineOperand &MOP : MI.operands()) { + if (MOP.isGlobal()) { + Callee = dyn_cast(MOP.getGlobal()); + break; + } + } + + // Don't outline calls to "mcount"-like functions; in particular, Linux + // kernel function tracing relies on them. + if (Callee && + (Callee->getName() == "\01__gnu_mcount_nc" || + Callee->getName() == "\01mcount" || Callee->getName() == "__mcount")) + return outliner::InstrType::Illegal; + // If we don't know anything about the callee, assume it depends on the // stack layout of the caller. In that case, it's only legal to outline // as a tail-call. Explicitly list the call instructions we know about so @@ -5982,7 +6019,29 @@ ARMBaseInstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, Opc == ARM::tBLXr || Opc == ARM::tBLXi) UnknownCallOutlineType = outliner::InstrType::LegalTerminator; - return UnknownCallOutlineType; + if (!Callee) + return UnknownCallOutlineType; + + // We have a function we have information about. Check if it's something we + // can safely outline. + MachineFunction *MF = MI.getParent()->getParent(); + MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee); + + // We don't know what's going on with the callee at all. Don't touch it. + if (!CalleeMF) + return UnknownCallOutlineType; + + // Check if we know anything about the callee saves on the function. If we + // don't, then don't touch it, since that implies that we haven't computed + // anything about its stack frame yet.
+ MachineFrameInfo &MFI = CalleeMF->getFrameInfo(); + if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 || + MFI.getNumObjects() > 0) + return UnknownCallOutlineType; + + // At this point, we can say that CalleeMF ought to not pass anything on the + // stack. Therefore, we can outline it. + return outliner::InstrType::Legal; } // Since calls are handled, don't touch LR or PC @@ -6045,10 +6104,6 @@ void ARMBaseInstrInfo::restoreLRFromStack( void ARMBaseInstrInfo::buildOutlinedFrame( MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const { - // Nothing is needed for tail-calls. - if (OF.FrameConstructionID == MachineOutlinerTailCall) - return; - // For thunk outlining, rewrite the last instruction from a call to a // tail-call. if (OF.FrameConstructionID == MachineOutlinerThunk) { @@ -6065,9 +6120,57 @@ void ARMBaseInstrInfo::buildOutlinedFrame( if (isThumb && !Call->getOperand(FuncOp).isReg()) MIB.add(predOps(ARMCC::AL)); Call->eraseFromParent(); - return; } + // Is there a call in the outlined range? + auto IsNonTailCall = [](MachineInstr &MI) { + return MI.isCall() && !MI.isReturn(); + }; + if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) { + MachineBasicBlock::iterator It = MBB.begin(); + MachineBasicBlock::iterator Et = MBB.end(); + + if (OF.FrameConstructionID == MachineOutlinerTailCall || + OF.FrameConstructionID == MachineOutlinerThunk) + Et = std::prev(MBB.end()); + + // We have to save and restore LR, so we need to add it to the liveins if it + // is not already part of the set. This is sufficient since outlined + // functions only have one block. + if (!MBB.isLiveIn(ARM::LR)) + MBB.addLiveIn(ARM::LR); + + // Insert a save before the outlined region + saveLROnStack(MBB, It); + + unsigned StackAlignment = Subtarget.getStackAlignment().value(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const MCRegisterInfo *MRI = STI.getRegisterInfo(); + unsigned DwarfReg = MRI->getDwarfRegNum(ARM::LR, true); + // Add a CFI saying the stack was moved down. + int64_t StackPosEntry = MF.addFrameInst( + MCCFIInstruction::cfiDefCfaOffset(nullptr, StackAlignment)); + BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) + .addCFIIndex(StackPosEntry) + .setMIFlags(MachineInstr::FrameSetup); + + // Add a CFI saying that the LR that we want to find is now higher than + // before. + int64_t LRPosEntry = MF.addFrameInst( + MCCFIInstruction::createOffset(nullptr, DwarfReg, StackAlignment)); + BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) + .addCFIIndex(LRPosEntry) + .setMIFlags(MachineInstr::FrameSetup); + + // Insert a restore before the terminator for the function. Restore LR. + restoreLRFromStack(MBB, Et); + } + + // If this is a tail call outlined function, then there's already a return. + if (OF.FrameConstructionID == MachineOutlinerTailCall || + OF.FrameConstructionID == MachineOutlinerThunk) + return; + // Here we have to insert the return ourselves. Get the correct opcode from // current feature set. BuildMI(MBB, MBB.end(), DebugLoc(), get(Subtarget.getReturnOpcode())) @@ -6134,3 +6237,12 @@ bool ARMBaseInstrInfo::shouldOutlineFromFunctionByDefault( MachineFunction &MF) const { return Subtarget.isMClass() && MF.getFunction().hasMinSize(); } + +bool ARMBaseInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, + AAResults *AA) const { + // Try hard to rematerialize any VCTPs because if we spill P0, it will block + // the tail predication conversion.
This means that the element count + // register has to be live for longer, but that has to be better than + // spill/restore and VPT predication. + return isVCTP(&MI) && !isPredicated(MI); +} diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 53c627c209343..5bf6e880056de 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -452,6 +452,9 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { MachineInstr *canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI, const TargetInstrInfo *TII) const; + bool isReallyTriviallyReMaterializable(const MachineInstr &MI, + AAResults *AA) const override; + private: /// Modeling special VFP / NEON fp MLA / MLS hazards. @@ -635,8 +638,7 @@ static inline unsigned getTailPredVectorWidth(unsigned Opcode) { return 0; } -static inline -bool isVCTP(MachineInstr *MI) { +static inline bool isVCTP(const MachineInstr *MI) { switch (MI->getOpcode()) { default: break; diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index 204e57fefb9a5..86da5a24d3407 100644 --- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -775,15 +775,25 @@ initializeFunctionInfo(const std::vector &CPEMIs) { // Taking the address of a CP entry. case ARM::LEApcrel: - case ARM::LEApcrelJT: - // This takes a SoImm, which is 8 bit immediate rotated. We'll - // pretend the maximum offset is 255 * 4. Since each instruction - // 4 byte wide, this is always correct. We'll check for other - // displacements that fits in a SoImm as well. - Bits = 8; - Scale = 4; - NegOk = true; - IsSoImm = true; + case ARM::LEApcrelJT: { + // This takes a SoImm, which is an 8-bit immediate rotated. We'll + // pretend the maximum offset is 255 * 4. Since each instruction is + // 4 bytes wide, this is always correct. We'll check for other + // displacements that fit in a SoImm as well. + Bits = 8; + NegOk = true; + IsSoImm = true; + unsigned CPI = I.getOperand(op).getIndex(); + MachineInstr *CPEMI = CPEMIs[CPI]; + const Align CPEAlign = getCPEAlign(CPEMI); + const unsigned LogCPEAlign = Log2(CPEAlign); + if (LogCPEAlign >= 2) + Scale = 4; + else + // For constants with less than 4-byte alignment, + // we'll pretend the maximum offset is 255 * 1.
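The constant-island hunk above changes only the scale of the reachable range, not its 8-bit form. A standalone sketch of the resulting range computation (maxLEApcrelOffset is a hypothetical name):

```cpp
#include <cstdio>

// Reachable displacement for LEApcrel/LEApcrelJT: an 8-bit quantity times a
// scale that the hunk above now derives from the constant-pool entry's
// alignment instead of assuming 4.
unsigned maxLEApcrelOffset(unsigned LogCPEAlign) {
  unsigned Bits = 8;
  unsigned Scale = (LogCPEAlign >= 2) ? 4 : 1;
  return ((1u << Bits) - 1) * Scale;
}

int main() {
  printf("4-byte aligned CPE: %u bytes\n", maxLEApcrelOffset(2)); // 1020
  printf("2-byte aligned CPE: %u bytes\n", maxLEApcrelOffset(1)); // 255
}
```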
+ Scale = 1; + } break; case ARM::t2LEApcrel: case ARM::t2LEApcrelJT: diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 1239e6bbf6843..d2e755b38ca97 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -2517,9 +2517,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), - /* Alignment = */ 0, MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(), + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); } else if (Subtarget->isTargetCOFF()) { assert(Subtarget->isTargetWindows() && "Windows is the only supported COFF target"); @@ -3328,8 +3328,7 @@ ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op, SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad( MVT::i32, DL, Chain, DescAddr, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), - /* Alignment = */ 4, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4), MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); Chain = FuncTLVGet.getValue(1); @@ -4998,16 +4997,6 @@ static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))); } -// Similar to isLowerSaturate(), but checks for upper-saturating conditions. -static bool isUpperSaturate(const SDValue LHS, const SDValue RHS, - const SDValue TrueVal, const SDValue FalseVal, - const ISD::CondCode CC, const SDValue K) { - return (isGTorGE(CC) && - ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) || - (isLTorLE(CC) && - ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))); -} - // Check if two chained conditionals could be converted into SSAT or USAT. // // SSAT can replace a set of two conditional selectors that bound a number to an @@ -5019,6 +5008,10 @@ static bool isUpperSaturate(const SDValue LHS, const SDValue RHS, // x < k ? (x < -k ? -k : x) : k // etc. // +// LLVM canonicalizes these to either a min(max()) or a max(min()) +// pattern. This function tries to match one of these and will return true +// if successful. +// // USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 is // a power of 2. 
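The saturation matching that follows reduces to a simple constant test once the min/max structure has been found: SSAT bounds are one's complements of each other (Val1 == ~Val2), USAT bounds have a zero lower bound, and in both cases PosVal + 1 must be a power of two. A standalone worked check of just that rule (classifySaturation is a hypothetical name, not the LLVM function):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

bool isPow2(uint64_t X) { return X && !(X & (X - 1)); }

// Returns 1 for SSAT, 2 for USAT, 0 if the constant pair saturates nothing.
int classifySaturation(int64_t Val1, int64_t Val2) {
  int64_t PosVal = std::max(Val1, Val2);
  int64_t NegVal = std::min(Val1, Val2);
  if (PosVal < 0 || !isPow2(uint64_t(PosVal) + 1))
    return 0;
  if (Val1 == ~Val2)
    return 1; // e.g. 127 / -128 -> ssat #8
  if (NegVal == 0)
    return 2; // e.g. 255 / 0 -> usat #8
  return 0;
}

int main() {
  printf("%d\n", classifySaturation(127, -128)); // 1: ssat
  printf("%d\n", classifySaturation(255, 0));    // 2: usat
  printf("%d\n", classifySaturation(100, -100)); // 0: neither
}
```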
//
@@ -5026,9 +5019,9 @@ static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
 // Additionally, the variable is returned in parameter V, the constant in K and
 // usat is set to true if the conditional represents an unsigned saturation
 static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
-                                    uint64_t &K, bool &usat) {
-  SDValue LHS1 = Op.getOperand(0);
-  SDValue RHS1 = Op.getOperand(1);
+                                    uint64_t &K, bool &Usat) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue K1 = Op.getOperand(1);
   SDValue TrueVal1 = Op.getOperand(2);
   SDValue FalseVal1 = Op.getOperand(3);
   ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
@@ -5037,82 +5030,57 @@ static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
   if (Op2.getOpcode() != ISD::SELECT_CC)
     return false;
-  SDValue LHS2 = Op2.getOperand(0);
-  SDValue RHS2 = Op2.getOperand(1);
+  SDValue V2 = Op2.getOperand(0);
+  SDValue K2 = Op2.getOperand(1);
   SDValue TrueVal2 = Op2.getOperand(2);
   SDValue FalseVal2 = Op2.getOperand(3);
   ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
-  // Find out which are the constants and which are the variables
-  // in each conditional
-  SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1)
-                                                        ? &RHS1
-                                                        : nullptr;
-  SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2)
-                                                        ? &RHS2
-                                                        : nullptr;
-  SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
-  SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
-  SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
-  SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;
-
-  // We must detect cases where the original operations worked with 16- or
-  // 8-bit values. In such case, V2Tmp != V2 because the comparison operations
-  // must work with sign-extended values but the select operations return
-  // the original non-extended value.
-  SDValue V2TmpReg = V2Tmp;
-  if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
-    V2TmpReg = V2Tmp->getOperand(0);
-
-  // Check that the registers and the constants have the correct values
-  // in both conditionals
-  if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp ||
-      V2TmpReg != V2)
-    return false;
+  SDValue V1Tmp = V1;
+  SDValue V2Tmp = V2;
-  // Figure out which conditional is saturating the lower/upper bound.
-  const SDValue *LowerCheckOp =
-      isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
-          ? &Op
-          : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
-                ? &Op2
-                : nullptr;
-  const SDValue *UpperCheckOp =
-      isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
-          ? &Op
-          : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
-                ? &Op2
-                : nullptr;
-
-  if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
-    return false;
+  if (V1.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+      V2.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    V1Tmp = V1.getOperand(0);
+    V2Tmp = V2.getOperand(0);
+  }
+
+  // Check that the registers and the constants match a max(min()) or min(max())
+  // pattern
+  if (V1Tmp == TrueVal1 && V2Tmp == TrueVal2 && K1 == FalseVal1 &&
+      K2 == FalseVal2 &&
+      ((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2)))) {
-    // Check that the constant in the lower-bound check is
-    // the opposite of the constant in the upper-bound check
-    // in 1's complement.
-  int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
-  int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
-  int64_t PosVal = std::max(Val1, Val2);
-  int64_t NegVal = std::min(Val1, Val2);
+    // Check that the constant in the lower-bound check is
+    // the opposite of the constant in the upper-bound check
+    // in 1's complement.
+    if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2))
+      return false;
+
+    int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
+    int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
+    int64_t PosVal = std::max(Val1, Val2);
+    int64_t NegVal = std::min(Val1, Val2);
 
-  if (((Val1 > Val2 && UpperCheckOp == &Op) ||
-       (Val1 < Val2 && UpperCheckOp == &Op2)) &&
-      isPowerOf2_64(PosVal + 1)) {
+    if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
+        !isPowerOf2_64(PosVal + 1))
+      return false;
 
-    // Handle the difference between USAT (unsigned) and SSAT (signed) saturation
+    // Handle the difference between USAT (unsigned) and SSAT (signed)
+    // saturation
     if (Val1 == ~Val2)
-      usat = false;
+      Usat = false;
     else if (NegVal == 0)
-      usat = true;
+      Usat = true;
     else
       return false;
 
-    V = V2;
-    K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive
+    V = V2Tmp;
+    // At this point, PosVal is guaranteed to be positive
+    K = (uint64_t) PosVal;
 
     return true;
   }
-
   return false;
 }
@@ -14765,10 +14733,25 @@ static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
   };
   auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
                      SDValue &A, SDValue &B) {
-    if (ResVT != RetTy || N0->getOpcode() != ISD::MUL)
+    // For a vmla we are trying to match a larger pattern:
+    // ExtA = sext/zext A
+    // ExtB = sext/zext B
+    // Mul = mul ExtA, ExtB
+    // vecreduce.add Mul
+    // There might also be an extra extend between the mul and the addreduce, so
+    // long as the bitwidth is high enough to make them equivalent (for example
+    // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
+    if (ResVT != RetTy)
+      return false;
+    SDValue Mul = N0;
+    if (Mul->getOpcode() == ExtendCode &&
+        Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
+            ResVT.getScalarSizeInBits())
+      Mul = Mul->getOperand(0);
+    if (Mul->getOpcode() != ISD::MUL)
       return false;
-    SDValue ExtA = N0->getOperand(0);
-    SDValue ExtB = N0->getOperand(1);
+    SDValue ExtA = Mul->getOperand(0);
+    SDValue ExtB = Mul->getOperand(1);
     if (ExtA->getOpcode() != ExtendCode && ExtB->getOpcode() != ExtendCode)
       return false;
     A = ExtA->getOperand(0);
@@ -14780,11 +14763,21 @@ static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
   };
   auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
                          SDValue &A, SDValue &B, SDValue &Mask) {
+    // Same as the pattern above with a select for the zero predicated lanes
+    // ExtA = sext/zext A
+    // ExtB = sext/zext B
+    // Mul = mul ExtA, ExtB
+    // N0 = select Mask, Mul, 0
+    // vecreduce.add N0
     if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
         !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
      return false;
     Mask = N0->getOperand(0);
     SDValue Mul = N0->getOperand(1);
+    if (Mul->getOpcode() == ExtendCode &&
+        Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
+            ResVT.getScalarSizeInBits())
+      Mul = Mul->getOperand(0);
     if (Mul->getOpcode() != ISD::MUL)
       return false;
     SDValue ExtA = Mul->getOperand(0);
@@ -14865,6 +14858,26 @@ static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
   if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
     return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                        DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
+
+  // Some complications. We can get a case where the two inputs of the mul are
+  // the same, in which case the output sext will have been helpfully converted
+  // to a zext. Turn it back.
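A rough scalar model of the vmlav pattern matched above (illustrative only; vmlav_s16 is a made-up name): the signed v8i16 form computes a widening multiply-accumulate across lanes, which is why extends around the mul are interchangeable as long as the accumulator is wide enough.

#include <cstdint>
#include <cstdio>

// Scalar equivalent of vecreduce.add(mul(sext(a), sext(b))) at i32.
static int32_t vmlav_s16(const int16_t *A, const int16_t *B, int N) {
  int32_t Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += int32_t(A[I]) * int32_t(B[I]);  // sext, sext, mul, reduce-add
  return Sum;
}

int main() {
  int16_t A[8] = {1, -2, 3, -4, 5, -6, 7, -8};
  int16_t B[8] = {1, 1, 1, 1, 1, 1, 1, 1};
  std::printf("%d\n", vmlav_s16(A, B, 8));  // prints -4
}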
+ SDValue Op = N0; + if (Op->getOpcode() == ISD::VSELECT) + Op = Op->getOperand(1); + if (Op->getOpcode() == ISD::ZERO_EXTEND && + Op->getOperand(0)->getOpcode() == ISD::MUL) { + SDValue Mul = Op->getOperand(0); + if (Mul->getOperand(0) == Mul->getOperand(1) && + Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) { + SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul); + if (Op != N0) + Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0), + N0->getOperand(0), Ext, N0->getOperand(2)); + return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext); + } + } + return SDValue(); } @@ -15322,7 +15335,7 @@ static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, - Alignment.value(), MMOFlags, AAInfo); + Alignment, MMOFlags, AAInfo); Loads.push_back(NewLoad); Chains.push_back(SDValue(NewLoad.getNode(), 1)); } @@ -16433,6 +16446,19 @@ bool ARMTargetLowering::shouldSinkOperands(Instruction *I, switch (II->getIntrinsicID()) { case Intrinsic::fma: return !IsFMS(I); + case Intrinsic::arm_mve_add_predicated: + case Intrinsic::arm_mve_mul_predicated: + case Intrinsic::arm_mve_qadd_predicated: + case Intrinsic::arm_mve_hadd_predicated: + case Intrinsic::arm_mve_vqdmull_predicated: + case Intrinsic::arm_mve_qdmulh_predicated: + case Intrinsic::arm_mve_qrdmulh_predicated: + case Intrinsic::arm_mve_fma_predicated: + return true; + case Intrinsic::arm_mve_sub_predicated: + case Intrinsic::arm_mve_qsub_predicated: + case Intrinsic::arm_mve_hsub_predicated: + return Operand == 1; default: return false; } diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index 75543093bcbfe..6c3d3be58c72f 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -450,7 +450,7 @@ class MVE_ScalarShift { let Inst{31-20} = 0b111010100101; let Inst{8} = 0b1; - + let validForTailPredication=1; } class MVE_ScalarShiftSingleReg size, bit rounding, let Inst{12-8} = 0b01011; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } multiclass MVE_VQxDMULH_m; def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>; def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>; +let isReMaterializable = 1 in class MVE_VCTPInst size, list pattern=[]> : MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix, "$Rn", vpred_n, "", pattern> { diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td index aea137ac0ddb4..cf4bcc743d8fb 100644 --- a/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -2490,7 +2490,8 @@ let DecoderMethod = "DecodeForVMRSandVMSR" in { "vmrs", "\t$Rt, fpcxts", []>; } - let Predicates = [HasV8_1MMainline, HasMVEInt] in { + let Predicates = [HasV8_1MMainline, HasMVEInt], + D=MVEDomain, validForTailPredication=1 in { // System level VPR/P0 -> GPR let Uses = [VPR] in def VMRS_VPR : MovFromVFP<0b1100 /* vpr */, (outs GPR:$Rt), (ins), @@ -2845,12 +2846,19 @@ let Defs = [FPSCR] in { } } -let Predicates = [HasV8_1MMainline, HasMVEInt] in { +let Predicates = [HasV8_1MMainline, HasMVEInt], + D=MVEDomain, validForTailPredication=1 in { let Uses = [VPR] in { defm VSTR_VPR : vfp_vstrldr_sysreg<0b0,0b1100, "vpr">; } defm VSTR_P0 : vfp_vstrldr_sysreg<0b0,0b1101, "p0", (outs), (ins VCCR:$P0)>; + + let Defs = [VPR] in { + defm VLDR_VPR : vfp_vstrldr_sysreg<0b1,0b1100, "vpr">; + } + defm VLDR_P0 : 
vfp_vstrldr_sysreg<0b1,0b1101, "p0", + (outs VCCR:$P0), (ins)>; } let Uses = [FPSCR] in { @@ -2862,11 +2870,3 @@ let Uses = [FPSCR] in { defm VLDR_FPCXTS : vfp_vstrldr_sysreg<0b1,0b1111, "fpcxts">; } } - -let Predicates = [HasV8_1MMainline, HasMVEInt] in { - let Defs = [VPR] in { - defm VLDR_VPR : vfp_vstrldr_sysreg<0b1,0b1100, "vpr">; - } - defm VLDR_P0 : vfp_vstrldr_sysreg<0b1,0b1101, "p0", - (outs VCCR:$P0), (ins)>; -} diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 09bb3b3c6f728..a5da506080878 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -2570,10 +2570,85 @@ static int getBaseOperandIndex(MachineInstr &MI) { case ARM::t2STRHi8: case ARM::t2STRHi12: return 1; + case ARM::MVE_VLDRBS16_post: + case ARM::MVE_VLDRBS32_post: + case ARM::MVE_VLDRBU16_post: + case ARM::MVE_VLDRBU32_post: + case ARM::MVE_VLDRHS32_post: + case ARM::MVE_VLDRHU32_post: + case ARM::MVE_VLDRBU8_post: + case ARM::MVE_VLDRHU16_post: + case ARM::MVE_VLDRWU32_post: + case ARM::MVE_VSTRB16_post: + case ARM::MVE_VSTRB32_post: + case ARM::MVE_VSTRH32_post: + case ARM::MVE_VSTRBU8_post: + case ARM::MVE_VSTRHU16_post: + case ARM::MVE_VSTRWU32_post: + case ARM::MVE_VLDRBS16_pre: + case ARM::MVE_VLDRBS32_pre: + case ARM::MVE_VLDRBU16_pre: + case ARM::MVE_VLDRBU32_pre: + case ARM::MVE_VLDRHS32_pre: + case ARM::MVE_VLDRHU32_pre: + case ARM::MVE_VLDRBU8_pre: + case ARM::MVE_VLDRHU16_pre: + case ARM::MVE_VLDRWU32_pre: + case ARM::MVE_VSTRB16_pre: + case ARM::MVE_VSTRB32_pre: + case ARM::MVE_VSTRH32_pre: + case ARM::MVE_VSTRBU8_pre: + case ARM::MVE_VSTRHU16_pre: + case ARM::MVE_VSTRWU32_pre: + return 2; } return -1; } +static bool isPostIndex(MachineInstr &MI) { + switch (MI.getOpcode()) { + case ARM::MVE_VLDRBS16_post: + case ARM::MVE_VLDRBS32_post: + case ARM::MVE_VLDRBU16_post: + case ARM::MVE_VLDRBU32_post: + case ARM::MVE_VLDRHS32_post: + case ARM::MVE_VLDRHU32_post: + case ARM::MVE_VLDRBU8_post: + case ARM::MVE_VLDRHU16_post: + case ARM::MVE_VLDRWU32_post: + case ARM::MVE_VSTRB16_post: + case ARM::MVE_VSTRB32_post: + case ARM::MVE_VSTRH32_post: + case ARM::MVE_VSTRBU8_post: + case ARM::MVE_VSTRHU16_post: + case ARM::MVE_VSTRWU32_post: + return true; + } + return false; +} + +static bool isPreIndex(MachineInstr &MI) { + switch (MI.getOpcode()) { + case ARM::MVE_VLDRBS16_pre: + case ARM::MVE_VLDRBS32_pre: + case ARM::MVE_VLDRBU16_pre: + case ARM::MVE_VLDRBU32_pre: + case ARM::MVE_VLDRHS32_pre: + case ARM::MVE_VLDRHU32_pre: + case ARM::MVE_VLDRBU8_pre: + case ARM::MVE_VLDRHU16_pre: + case ARM::MVE_VLDRWU32_pre: + case ARM::MVE_VSTRB16_pre: + case ARM::MVE_VSTRB32_pre: + case ARM::MVE_VSTRH32_pre: + case ARM::MVE_VSTRBU8_pre: + case ARM::MVE_VSTRHU16_pre: + case ARM::MVE_VSTRWU32_pre: + return true; + } + return false; +} + // Given a memory access Opcode, check that the give Imm would be a valid Offset // for this instruction (same as isLegalAddressImm), Or if the instruction // could be easily converted to one where that was valid. For example converting @@ -2703,19 +2778,26 @@ static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset, } // Given a Base Register, optimise the load/store uses to attempt to create more -// post-inc accesses. We do this by taking zero offset loads/stores with an add, -// and convert them to a postinc load/store of the same type. Any subsequent -// accesses will be adjusted to use and account for the post-inc value. 
+// post-inc accesses and fewer register moves. We do this by taking zero offset
+// loads/stores with an add, and converting them to a postinc load/store of the
+// same type. Any subsequent accesses will be adjusted to use and account for
+// the post-inc value.
 // For example:
 // LDR #0            LDR_POSTINC #16
 // LDR #4            LDR #-12
 // LDR #8            LDR #-8
 // LDR #12           LDR #-4
 // ADD #16
+//
+// At the same time, if we do not find an increment but do find an existing
+// pre/post inc instruction, we can still adjust the offsets of subsequent
+// instructions to save the register move that would otherwise be needed for the
+// in-place increment.
 bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) {
   // We are looking for:
   // One zero offset load/store that can become postinc
   MachineInstr *BaseAccess = nullptr;
+  MachineInstr *PrePostInc = nullptr;
   // An increment that can be folded in
   MachineInstr *Increment = nullptr;
   // Other accesses after BaseAccess that will need to be updated to use the
@@ -2734,40 +2816,62 @@ bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) {
     if (!Use.getOperand(BaseOp).isReg() ||
         Use.getOperand(BaseOp).getReg() != Base)
       return false;
-    if (Use.getOperand(BaseOp + 1).getImm() == 0)
+    if (isPreIndex(Use) || isPostIndex(Use))
+      PrePostInc = &Use;
+    else if (Use.getOperand(BaseOp + 1).getImm() == 0)
       BaseAccess = &Use;
     else
       OtherAccesses.insert(&Use);
   }
 
-  if (!BaseAccess || !Increment ||
-      BaseAccess->getParent() != Increment->getParent())
-    return false;
-  Register PredReg;
-  if (Increment->definesRegister(ARM::CPSR) ||
-      getInstrPredicate(*Increment, PredReg) != ARMCC::AL)
-    return false;
+  int IncrementOffset;
+  Register NewBaseReg;
+  if (BaseAccess && Increment) {
+    if (PrePostInc || BaseAccess->getParent() != Increment->getParent())
+      return false;
+    Register PredReg;
+    if (Increment->definesRegister(ARM::CPSR) ||
+        getInstrPredicate(*Increment, PredReg) != ARMCC::AL)
+      return false;
+
+    LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on VirtualReg "
+                      << Base.virtRegIndex() << "\n");
-  LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on VirtualReg "
-                    << Base.virtRegIndex() << "\n");
+    // Make sure that Increment has no uses before BaseAccess.
+    for (MachineInstr &Use :
+         MRI->use_nodbg_instructions(Increment->getOperand(0).getReg())) {
+      if (!DT->dominates(BaseAccess, &Use) || &Use == BaseAccess) {
+        LLVM_DEBUG(dbgs() << "  BaseAccess doesn't dominate use of increment\n");
+        return false;
+      }
+    }
-  // Make sure that Increment has no uses before BaseAccess.
-  for (MachineInstr &Use :
-       MRI->use_nodbg_instructions(Increment->getOperand(0).getReg())) {
-    if (!DT->dominates(BaseAccess, &Use) || &Use == BaseAccess) {
-      LLVM_DEBUG(dbgs() << "  BaseAccess doesn't dominate use of increment\n");
      return false;
    }
  }
+    // Make sure that Increment can be folded into Base
+    IncrementOffset = getAddSubImmediate(*Increment);
+    unsigned NewPostIncOpcode = getPostIndexedLoadStoreOpcode(
+        BaseAccess->getOpcode(), IncrementOffset > 0 ? ARM_AM::add : ARM_AM::sub);
+    if (!isLegalAddressImm(NewPostIncOpcode, IncrementOffset, TII)) {
+      LLVM_DEBUG(dbgs() << "  Illegal addressing mode immediate on postinc\n");
      return false;
    }
  }
+  else if (PrePostInc) {
+    // If we already have a pre/post index load/store then set BaseAccess,
+    // IncrementOffset and NewBaseReg to the values it already produces,
+    // allowing us to update any subsequent uses of the BaseOp reg with the
+    // incremented value.
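The rebasing that both paths apply to the remaining accesses is plain offset arithmetic; a toy illustration of the example table above (not the pass itself):

#include <cstdio>
#include <initializer_list>

// After the base access becomes LDR_POSTINC #16, every later access that was
// at offset Off from the old base value is at Off - 16 from the new one.
int main() {
  const int Increment = 16;
  for (int Off : {4, 8, 12})
    std::printf("LDR #%d  ->  LDR #%d\n", Off, Off - Increment);
}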
+    if (Increment)
+      return false;
 
-  // Make sure that Increment can be folded into Base
-  int IncrementOffset = getAddSubImmediate(*Increment);
-  unsigned NewPostIncOpcode = getPostIndexedLoadStoreOpcode(
-      BaseAccess->getOpcode(), IncrementOffset > 0 ? ARM_AM::add : ARM_AM::sub);
-  if (!isLegalAddressImm(NewPostIncOpcode, IncrementOffset, TII)) {
-    LLVM_DEBUG(dbgs() << "  Illegal addressing mode immediate on postinc\n");
-    return false;
+    LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on already "
+                      << "indexed VirtualReg " << Base.virtRegIndex() << "\n");
+    int BaseOp = getBaseOperandIndex(*PrePostInc);
+    IncrementOffset = PrePostInc->getOperand(BaseOp+1).getImm();
+    BaseAccess = PrePostInc;
+    NewBaseReg = PrePostInc->getOperand(0).getReg();
   }
+  else
+    return false;
 
   // And make sure that the negative value of increment can be added to all
   // other offsets after the BaseAccess. We rely on either
@@ -2801,16 +2905,18 @@ bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) {
     return false;
   }
 
-  // Replace BaseAccess with a post inc
-  LLVM_DEBUG(dbgs() << "Changing: "; BaseAccess->dump());
-  LLVM_DEBUG(dbgs() << "  And   : "; Increment->dump());
-  Register NewBaseReg = Increment->getOperand(0).getReg();
-  MachineInstr *BaseAccessPost =
-      createPostIncLoadStore(BaseAccess, IncrementOffset, NewBaseReg, TII, TRI);
-  BaseAccess->eraseFromParent();
-  Increment->eraseFromParent();
-  (void)BaseAccessPost;
-  LLVM_DEBUG(dbgs() << "  To    : "; BaseAccessPost->dump());
+  if (!PrePostInc) {
+    // Replace BaseAccess with a post inc
+    LLVM_DEBUG(dbgs() << "Changing: "; BaseAccess->dump());
+    LLVM_DEBUG(dbgs() << "  And   : "; Increment->dump());
+    NewBaseReg = Increment->getOperand(0).getReg();
+    MachineInstr *BaseAccessPost =
+        createPostIncLoadStore(BaseAccess, IncrementOffset, NewBaseReg, TII, TRI);
+    BaseAccess->eraseFromParent();
+    Increment->eraseFromParent();
+    (void)BaseAccessPost;
+    LLVM_DEBUG(dbgs() << "  To    : "; BaseAccessPost->dump());
+  }
 
   for (auto *Use : SuccessorAccesses) {
     LLVM_DEBUG(dbgs() << "Changing: "; Use->dump());
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index a98590fd79c68..abfd339903c22 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -527,7 +527,12 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
   };
 
   MBB = VCTP->getParent();
-  if (auto *Def = RDA.getUniqueReachingMIDef(&MBB->back(), NumElements)) {
+  // Remove modifications to the element count since they have no purpose in a
+  // tail predicated loop. Explicitly refer to the vctp operand no matter which
+  // register NumElements has been assigned to, since that is what the
+  // modifications will be using.
+  if (auto *Def = RDA.getUniqueReachingMIDef(&MBB->back(),
+                                             VCTP->getOperand(1).getReg())) {
     SmallPtrSet ElementChain;
     SmallPtrSet Ignore = { VCTP };
     unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());
@@ -718,7 +723,7 @@ bool LowOverheadLoop::ValidateLiveOuts() {
       continue;
     else if (!isPredicated && retainsOrReduces)
       return false;
-    else
+    else if (!isPredicated)
       FalseLanesUnknown.insert(&MI);
   }
 
@@ -849,6 +854,24 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
   if (CannotTailPredicate)
     return false;
 
+  const MCInstrDesc &MCID = MI->getDesc();
+  uint64_t Flags = MCID.TSFlags;
+  if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE)
+    return true;
+
+  if (MI->getOpcode() == ARM::MVE_VPSEL ||
+      MI->getOpcode() == ARM::MVE_VPNOT) {
+    // TODO: Allow VPSEL and VPNOT, we currently cannot because:
+    // 1) It will use the VPR as a predicate operand, but doesn't have to be
+    // instead a VPT block, which means we can assert while building up
+    // the VPT block because we don't find another VPT or VPST to begin a new
+    // one.
+    // 2) VPSEL still requires a VPR operand even after tail predicating,
+    // which means we can't remove it unless there is another
+    // instruction, such as vcmp, that can provide the VPR def.
+    return false;
+  }
+
   if (isVCTP(MI)) {
     // If we find another VCTP, check whether it uses the same value as the main VCTP.
     // If it does, store it in the SecondaryVCTPs set, else refuse it.
@@ -869,28 +892,17 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
     if (MI->getOpcode() != ARM::MVE_VPST) {
       assert(MI->findRegisterDefOperandIdx(ARM::VPR) != -1 &&
              "VPT does not implicitly define VPR?!");
+      CurrentPredicate.clear();
       CurrentPredicate.insert(MI);
     }
     VPTBlocks.emplace_back(MI, CurrentPredicate);
     CurrentBlock = &VPTBlocks.back();
     return true;
-  } else if (MI->getOpcode() == ARM::MVE_VPSEL ||
-             MI->getOpcode() == ARM::MVE_VPNOT) {
-    // TODO: Allow VPSEL and VPNOT, we currently cannot because:
-    // 1) It will use the VPR as a predicate operand, but doesn't have to be
-    // instead a VPT block, which means we can assert while building up
-    // the VPT block because we don't find another VPT or VPST to being a new
-    // one.
-    // 2) VPSEL still requires a VPR operand even after tail predicating,
-    // which means we can't remove it unless there is another
-    // instruction, such as vcmp, that can provide the VPR def.
-    return false;
   }
 
   bool IsUse = false;
   bool IsDef = false;
-  const MCInstrDesc &MCID = MI->getDesc();
   for (int i = MI->getNumOperands() - 1; i >= 0; --i) {
     const MachineOperand &MO = MI->getOperand(i);
     if (!MO.isReg() || MO.getReg() != ARM::VPR)
@@ -908,6 +920,16 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
     }
   }
 
+  // If this instruction defines the VPR, update the predicate for the
+  // following instructions.
+  if (IsDef) {
+    // Clear the existing predicate when we're not in VPT Active state.
+    if (!isVectorPredicated(MI))
+      CurrentPredicate.clear();
+    CurrentPredicate.insert(MI);
+    LLVM_DEBUG(dbgs() << "ARM Loops: Adding Predicate: " << *MI);
+  }
+
   // If we find a vpr def that is not already predicated on the vctp, we've
   // got disjoint predicates that may not be equivalent when we do the
   // conversion.
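For context, a rough functional model of what a vctp produces (an illustration, not LLVM code; the 8-lane case corresponds to vctp16). It shows why the element count must step down by exactly the vector width each iteration for the validation above to hold:

#include <cstdio>

// Lanes 0..min(Count, Lanes)-1 are active in the returned mask.
static unsigned vctp(unsigned Count, unsigned Lanes) {
  unsigned Mask = 0;
  for (unsigned I = 0; I < Lanes && I < Count; ++I)
    Mask |= 1u << I;
  return Mask;
}

int main() {
  // 19 elements, 8 lanes per vector: full, full, then a 3-lane tail.
  for (unsigned Remaining = 19; Remaining > 0;
       Remaining -= Remaining < 8 ? Remaining : 8)
    std::printf("vctp16(%u) = 0x%02x\n", Remaining, vctp(Remaining, 8));
}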
@@ -916,16 +938,12 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
     return false;
   }
 
-  uint64_t Flags = MCID.TSFlags;
-  if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE)
-    return true;
-
   // If we find an instruction that has been marked as not valid for tail
   // predication, only allow the instruction if it's contained within a valid
   // VPT block.
-  if ((Flags & ARMII::ValidForTailPredication) == 0 && !IsUse) {
+  if ((Flags & ARMII::ValidForTailPredication) == 0) {
     LLVM_DEBUG(dbgs() << "ARM Loops: Can't tail predicate: " << *MI);
-    return false;
+    return IsUse;
   }
 
   // If the instruction is already explicitly predicated, then the conversion
@@ -1293,6 +1311,12 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
                E = ++MachineBasicBlock::iterator(Divergent->MI);
            I != E; ++I)
         RemovePredicate(&*I);
 
+      // Check if the instruction defining the VPR is a VCMP so it can be
+      // combined with the VPST. This should be the divergent instruction.
+      MachineInstr *VCMP = VCMPOpcodeToVPT(Divergent->MI->getOpcode()) != 0
+                               ? Divergent->MI
+                               : nullptr;
+
       unsigned Size = 0;
       auto E = MachineBasicBlock::reverse_iterator(Divergent->MI);
       auto I = MachineBasicBlock::reverse_iterator(Insts.back().MI);
@@ -1302,13 +1326,32 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
         ++Size;
         ++I;
       }
-      // Create a VPST (with a null mask for now, we'll recompute it later).
-      MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt,
-                                        InsertAt->getDebugLoc(),
-                                        TII->get(ARM::MVE_VPST));
-      MIB.addImm(0);
-      LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen());
-      LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
+      MachineInstrBuilder MIB;
+      LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: "
+                        << *Block.getPredicateThen());
+      if (VCMP) {
+        // Combine the VPST and VCMP into a VPT
+        MIB =
+            BuildMI(*InsertAt->getParent(), InsertAt, InsertAt->getDebugLoc(),
+                    TII->get(VCMPOpcodeToVPT(VCMP->getOpcode())));
+        MIB.addImm(ARMVCC::Then);
+        // Register one
+        MIB.add(VCMP->getOperand(1));
+        // Register two
+        MIB.add(VCMP->getOperand(2));
+        // The comparison code, e.g. ge, eq, lt
+        MIB.add(VCMP->getOperand(3));
+        LLVM_DEBUG(dbgs()
                   << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
+        LoLoop.ToRemove.insert(VCMP);
+      } else {
+        // Create a VPST (with a null mask for now, we'll recompute it later);
+        // the VCMP-into-VPT case was handled above.
+        MIB = BuildMI(*InsertAt->getParent(), InsertAt,
+                      InsertAt->getDebugLoc(), TII->get(ARM::MVE_VPST));
+        MIB.addImm(0);
+        LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
+      }
       LoLoop.ToRemove.insert(Block.getPredicateThen());
       LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
     }
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 55ac332e2c6a6..cf4115f77fec5 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -407,7 +407,8 @@ void ARMPassConfig::addIRPasses() {
   // ldrex/strex loops to simplify this, but it needs tidying up.
if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) addPass(createCFGSimplificationPass( - SimplifyCFGOptions().sinkCommonInsts(true), [this](const Function &F) { + SimplifyCFGOptions().hoistCommonInsts(true).sinkCommonInsts(true), + [this](const Function &F) { const auto &ST = this->TM->getSubtarget(F); return ST.hasAnyDataBarrier() && !ST.isThumb1Only(); })); @@ -469,7 +470,7 @@ bool ARMPassConfig::addInstSelector() { } bool ARMPassConfig::addIRTranslator() { - addPass(new IRTranslator()); + addPass(new IRTranslator(getOptLevel())); return false; } diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index c789b35f32af5..ce3910754e5b2 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/Type.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" @@ -1861,6 +1862,20 @@ bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, return ST->hasMVEIntegerOps(); } +bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const { + if (!ST->hasMVEIntegerOps()) + return false; + + unsigned ScalarBits = Ty->getScalarSizeInBits(); + switch (Opcode) { + case Instruction::Add: + return ScalarBits <= 32; + default: + return false; + } +} + bool ARMTTIImpl::preferPredicatedReductionSelect( unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { if (!ST->hasMVEIntegerOps()) diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index cc2019b47a076..3ffe31ba883c4 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -186,6 +186,9 @@ class ARMTTIImpl : public BasicTTIImplBase { bool useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const; + bool preferInLoopReduction(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const; + bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const; @@ -194,18 +197,7 @@ class ARMTTIImpl : public BasicTTIImplBase { case Intrinsic::experimental_vector_reduce_v2_fadd: case Intrinsic::experimental_vector_reduce_v2_fmul: // We don't have legalization support for ordered FP reductions. - if (!II->getFastMathFlags().allowReassoc()) - return true; - // Can't legalize reductions with soft floats. - return TLI->useSoftFloat() || !TLI->getSubtarget()->hasFPRegs(); - - case Intrinsic::experimental_vector_reduce_fmin: - case Intrinsic::experimental_vector_reduce_fmax: - // Can't legalize reductions with soft floats, and NoNan will create - // fminimum which we do not know how to lower. - return TLI->useSoftFloat() || !TLI->getSubtarget()->hasFPRegs() || - !II->getFastMathFlags().noNaNs(); - + return !II->getFastMathFlags().allowReassoc(); default: // Don't expand anything else, let legalization deal with it. 
      return false;
diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index d8008320696c3..f36b341157036 100644
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -1062,6 +1062,7 @@ static Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP,
     FixSummands(YElType, X);
     XElType = cast<FixedVectorType>(X->getType());
   }
+  assert(XElType && YElType && "Unknown vector types");
   // Check that the summands are of compatible types
   if (XElType != YElType) {
     LLVM_DEBUG(dbgs() << "masked gathers/scatters: incompatible gep offsets\n");
diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index ef83e36381104..a99fefefdf25d 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -119,10 +119,10 @@ class MVETailPredication : public LoopPass {
   /// load/stores.
   bool IsPredicatedVectorLoop();
 
-  /// Perform checks on the arguments of @llvm.get.active.lane.mask
-  /// intrinsic: check if the first is a loop induction variable, and for the
-  /// the second check that no overflow can occur in the expression that use
-  /// this backedge-taken count.
+  /// Perform several checks on the arguments of @llvm.get.active.lane.mask
+  /// intrinsic. E.g., check that the loop induction variable and the element
+  /// count are of the form we expect, and also perform overflow checks for
+  /// the new expressions that are created.
   bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount,
                         FixedVectorType *VecTy);
 
@@ -373,10 +373,73 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
       EnableTailPredication == TailPredication::ForceEnabledNoReductions ||
       EnableTailPredication == TailPredication::ForceEnabled;
 
-  // 1) TODO: Check that the TripCount (TC) belongs to this loop (originally).
+  // 1) Check that the original scalar loop TripCount (TC) belongs to this loop.
   // The scalar tripcount corresponds the number of elements processed by the
   // loop, so we will refer to that from this point on.
-  auto *ElemCountVal = ActiveLaneMask->getOperand(1);
+  Value *ElemCount = ActiveLaneMask->getOperand(1);
+  auto *EC = SE->getSCEV(ElemCount);
+  auto *TC = SE->getSCEV(TripCount);
+  int VectorWidth = VecTy->getNumElements();
+  ConstantInt *ConstElemCount = nullptr;
+
+  if (!SE->isLoopInvariant(EC, L)) {
+    LLVM_DEBUG(dbgs() << "ARM TP: element count must be loop invariant.\n");
+    return false;
+  }
+
+  if ((ConstElemCount = dyn_cast<ConstantInt>(ElemCount))) {
+    ConstantInt *TC = dyn_cast<ConstantInt>(TripCount);
+    if (!TC) {
+      LLVM_DEBUG(dbgs() << "ARM TP: Constant tripcount expected in "
+                           "set.loop.iterations\n");
+      return false;
+    }
+
+    // Calculate 2 tripcount values and check that they are consistent with
+    // each other:
+    // i) The number of loop iterations extracted from the set.loop.iterations
+    //    intrinsic, multiplied by the vector width:
+    uint64_t TC1 = TC->getZExtValue() * VectorWidth;
+
+    // ii) TC1 has to be equal to TC + 1, with the + 1 to compensate for start
+    //     counting from 0.
+    uint64_t TC2 = ConstElemCount->getZExtValue() + 1;
+
+    if (TC1 != TC2) {
+      LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: "
+                        << TC1 << " from set.loop.iterations, and "
+                        << TC2 << " from get.active.lane.mask\n");
+      return false;
+    }
+  } else if (!ForceTailPredication) {
+    // Smoke tests if the element count is a runtime value. I.e., this isn't
+    // fully generic because that would require a full SCEV visitor here. It
+    // would require extracting the variable from the elementcount SCEV
+    // expression, and match this up with the tripcount SCEV expression. If
+    // this matches up, we know both expressions are bound by the same
+    // variable, and thus we know this tripcount belongs to this loop. The
+    // checks below will catch most cases though.
+    if (isa<SCEVAddExpr>(EC) || isa<SCEVUnknown>(EC)) {
+      // If the element count is a simple AddExpr or SCEVUnknown, which is e.g.
+      // the case when the element count is just a variable %N, we can just see
+      // if it is an operand in the tripcount scev expression.
+      if (isa<SCEVAddExpr>(TC) && !SE->hasOperand(TC, EC)) {
+        LLVM_DEBUG(dbgs() << "ARM TP: Can't verify the element counter\n");
+        return false;
+      }
+    } else if (const SCEVAddRecExpr *AddRecExpr = dyn_cast<SCEVAddRecExpr>(EC)) {
+      // For more complicated AddRecExpr, check that the corresponding loop and
+      // its loop hierarchy contains the trip count loop.
+      if (!AddRecExpr->getLoop()->contains(L)) {
+        LLVM_DEBUG(dbgs() << "ARM TP: Can't verify the element counter\n");
+        return false;
+      }
+    } else {
+      LLVM_DEBUG(dbgs() << "ARM TP: Unsupported SCEV type, can't verify the "
+                           "element counter\n");
+      return false;
+    }
+  }
 
   // 2) Prove that the sub expression is non-negative, i.e. it doesn't overflow:
   //
@@ -393,16 +456,11 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
   //
   //      upperbound(TC) <= UINT_MAX - VectorWidth
   //
-  auto *TC = SE->getSCEV(TripCount);
   unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits();
-  int VectorWidth = VecTy->getNumElements();
-  auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);
-  uint64_t MaxMinusVW = Diff.getZExtValue();
-  // FIXME: since ranges can be negative we work with signed ranges here, but
-  // we shouldn't extract the zext'ed values for them.
-  uint64_t UpperboundTC = SE->getSignedRange(TC).getUpper().getZExtValue();
+  auto MaxMinusVW = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);
+  APInt UpperboundTC = SE->getUnsignedRangeMax(TC);
 
-  if (UpperboundTC > MaxMinusVW && !ForceTailPredication) {
+  if (UpperboundTC.ugt(MaxMinusVW) && !ForceTailPredication) {
     LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in tripcount rounding:\n";
                dbgs() << "upperbound(TC) <= UINT_MAX - VectorWidth\n";
                dbgs() << UpperboundTC << " <= " << MaxMinusVW << " == false\n";);
@@ -432,16 +490,16 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
   // 1. Thus, if the ranges of Ceil and TC are not a single constant but a set,
   // we first add 0 to TC such that we can do the <= comparison on both sets.
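The two conditions above boil down to simple unsigned arithmetic; a quick numeric sanity check (illustration only):

#include <cstdint>
#include <cstdio>
#include <initializer_list>

int main() {
  const uint32_t VW = 4;                  // vector width, e.g. v4i32
  const uint32_t Max = UINT32_MAX;
  for (uint32_t EC : {16u, 17u, 19u}) {   // element counts
    uint32_t Ceil = (EC + VW - 1) / VW;   // rounded-up iteration count
    std::printf("elements=%u iterations=%u no-overflow=%d\n", EC, Ceil,
                (int)(EC <= Max - VW));   // the rounding add cannot wrap
  }
}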
// - auto *ElementCount = SE->getSCEV(ElemCountVal); + // Tmp = ElementCount + (VW-1) - auto *ECPlusVWMinus1 = SE->getAddExpr(ElementCount, + auto *ECPlusVWMinus1 = SE->getAddExpr(EC, SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1))); // Ceil = ElementCount + (VW-1) / VW auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1, SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth))); - ConstantRange RangeCeil = SE->getSignedRange(Ceil) ; - ConstantRange RangeTC = SE->getSignedRange(TC) ; + ConstantRange RangeCeil = SE->getUnsignedRange(Ceil) ; + ConstantRange RangeTC = SE->getUnsignedRange(TC) ; if (!RangeTC.isSingleElement()) { auto ZeroRange = ConstantRange(APInt(TripCount->getType()->getScalarSizeInBits(), 0)); diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp index bf9b32e1278e3..a816c2412b08c 100644 --- a/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -676,7 +676,7 @@ SDValue AVRTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { SDValue FI = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), getPointerTy(DL)); return DAG.getStore(Op.getOperand(0), dl, FI, Op.getOperand(1), - MachinePointerInfo(SV), 0); + MachinePointerInfo(SV)); } SDValue AVRTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { @@ -1096,8 +1096,7 @@ SDValue AVRTargetLowering::LowerFormalArguments( // from this parameter. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DL)); InVals.push_back(DAG.getLoad(LocVT, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(MF, FI), - 0)); + MachinePointerInfo::getFixedStack(MF, FI))); } } @@ -1230,8 +1229,7 @@ SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getStore(Chain, DL, Arg, PtrOff, - MachinePointerInfo::getStack(MF, VA.getLocMemOffset()), - 0); + MachinePointerInfo::getStack(MF, VA.getLocMemOffset())); } } diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp index 230bc7adc07ab..0abe42d221207 100644 --- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp +++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp @@ -166,13 +166,13 @@ class AVROperand : public MCParsedAsmOperand { assert(N == 1 && "Invalid number of operands!"); // The operand is actually a imm8, but we have its bitwise // negation in the assembly source, so twiddle it here. - const MCConstantExpr *CE = dyn_cast(getImm()); + const auto *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(~(uint8_t)CE->getValue())); } bool isImmCom8() const { if (!isImm()) return false; - const MCConstantExpr *CE = dyn_cast(getImm()); + const auto *CE = dyn_cast(getImm()); if (!CE) return false; int64_t Value = CE->getValue(); return isUInt<8>(Value); diff --git a/llvm/lib/Target/Hexagon/Hexagon.h b/llvm/lib/Target/Hexagon/Hexagon.h index 58dadf012da56..98e5710d4fc1d 100644 --- a/llvm/lib/Target/Hexagon/Hexagon.h +++ b/llvm/lib/Target/Hexagon/Hexagon.h @@ -14,12 +14,9 @@ #ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGON_H #define LLVM_LIB_TARGET_HEXAGON_HEXAGON_H -#include "MCTargetDesc/HexagonMCTargetDesc.h" -#include "llvm/CodeGen/TargetLowering.h" -#include "llvm/Target/TargetMachine.h" - namespace llvm { class HexagonTargetMachine; + class ImmutablePass; /// Creates a Hexagon-specific Target Transformation Info pass. 
ImmutablePass *createHexagonTargetTransformInfoPass(const HexagonTargetMachine *TM); diff --git a/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp index 11a455ce43470..b456cf139c55c 100644 --- a/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "Hexagon.h" +#include "MCTargetDesc/HexagonMCTargetDesc.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFunction.h" diff --git a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp index 587527d8c32cb..23d0cc829e52a 100644 --- a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp @@ -10,6 +10,7 @@ // to move them together. If we can move them next to each other we do so and // replace them with a combine instruction. //===----------------------------------------------------------------------===// + #include "HexagonInstrInfo.h" #include "HexagonSubtarget.h" #include "llvm/ADT/DenseMap.h" @@ -26,6 +27,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" using namespace llvm; diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.h b/llvm/lib/Target/Hexagon/HexagonFrameLowering.h index 87d385e1ce3c4..c8871cc56c486 100644 --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.h @@ -11,6 +11,7 @@ #include "Hexagon.h" #include "HexagonBlockRanges.h" +#include "MCTargetDesc/HexagonMCTargetDesc.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index b4b389a7b9568..bdd5c7dd151e2 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -231,10 +231,10 @@ SDNode *HexagonDAGToDAGISel::StoreInstrForLoadIntrinsic(MachineSDNode *LoadN, if (Size >= 4) TS = CurDAG->getStore(SDValue(LoadN, 2), dl, SDValue(LoadN, 0), Loc, PI, - Size); + Align(Size)); else TS = CurDAG->getTruncStore(SDValue(LoadN, 2), dl, SDValue(LoadN, 0), Loc, - PI, MVT::getIntegerVT(Size * 8), Size); + PI, MVT::getIntegerVT(Size * 8), Align(Size)); SDNode *StoreN; { diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 645d28de2b20d..20e5e5a91b124 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1863,6 +1863,8 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::VALIGN: return "HexagonISD::VALIGN"; case HexagonISD::VALIGNADDR: return "HexagonISD::VALIGNADDR"; case HexagonISD::VPACKL: return "HexagonISD::VPACKL"; + case HexagonISD::VUNPACK: return "HexagonISD::VUNPACK"; + case HexagonISD::VUNPACKU: return "HexagonISD::VUNPACKU"; case HexagonISD::OP_END: break; } return nullptr; @@ -2650,6 +2652,28 @@ HexagonTargetLowering::getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG) llvm_unreachable("Invalid type for zero"); } +SDValue +HexagonTargetLowering::appendUndef(SDValue Val, MVT ResTy, SelectionDAG &DAG) + const { + MVT ValTy = ty(Val); + 
assert(ValTy.getVectorElementType() == ResTy.getVectorElementType()); + + unsigned ValLen = ValTy.getVectorNumElements(); + unsigned ResLen = ResTy.getVectorNumElements(); + if (ValLen == ResLen) + return Val; + + const SDLoc &dl(Val); + assert(ValLen < ResLen); + assert(ResLen % ValLen == 0); + + SmallVector Concats = {Val}; + for (unsigned i = 1, e = ResLen / ValLen; i < e; ++i) + Concats.push_back(DAG.getUNDEF(ValTy)); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResTy, Concats); +} + SDValue HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { MVT VecTy = ty(Op); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h index 8473515b3c758..cc34a4cd03963 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_HEXAGON_HEXAGONISELLOWERING_H #include "Hexagon.h" +#include "MCTargetDesc/HexagonMCTargetDesc.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -93,6 +94,8 @@ enum NodeType : unsigned { // the low halfwords and pack them into the first 32 // halfwords of the output. The rest of the output is // unspecified. + VUNPACK, // Unpacking into low elements with sign extension. + VUNPACKU, // Unpacking into low elements with zero extension. OP_END }; @@ -366,6 +369,7 @@ class HexagonTargetLowering : public TargetLowering { SDValue contractPredicate(SDValue Vec64, const SDLoc &dl, SelectionDAG &DAG) const; SDValue getVectorShiftByInt(SDValue Op, SelectionDAG &DAG) const; + SDValue appendUndef(SDValue Val, MVT ResTy, SelectionDAG &DAG) const; bool isUndef(SDValue Op) const { if (Op.isMachineOpcode()) @@ -480,13 +484,16 @@ class HexagonTargetLowering : public TargetLowering { SDValue SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const; SDValue SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const; + SDValue WidenHvxLoad(SDValue Op, SelectionDAG &DAG) const; SDValue WidenHvxStore(SDValue Op, SelectionDAG &DAG) const; + SDValue WidenHvxExtend(SDValue Op, SelectionDAG &DAG) const; SDValue WidenHvxTruncate(SDValue Op, SelectionDAG &DAG) const; std::pair findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override; + bool shouldWidenToHvx(MVT Ty, SelectionDAG &DAG) const; bool isHvxOperation(SDNode *N, SelectionDAG &DAG) const; SDValue LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const; void LowerHvxOperationWrapper(SDNode *N, SmallVectorImpl &Results, diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index e5d05cfe64c47..a61d79ab3364a 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -234,8 +234,12 @@ HexagonTargetLowering::initializeHVXLowering() { MVT VecTy = MVT::getVectorVT(ElemTy, N); auto Action = getPreferredVectorAction(VecTy); if (Action == TargetLoweringBase::TypeWidenVector) { - setOperationAction(ISD::STORE, VecTy, Custom); - setOperationAction(ISD::TRUNCATE, VecTy, Custom); + setOperationAction(ISD::LOAD, VecTy, Custom); + setOperationAction(ISD::STORE, VecTy, Custom); + setOperationAction(ISD::TRUNCATE, VecTy, Custom); + setOperationAction(ISD::ANY_EXTEND, VecTy, Custom); + setOperationAction(ISD::SIGN_EXTEND, VecTy, Custom); + setOperationAction(ISD::ZERO_EXTEND, VecTy, Custom); } } } @@ -1886,6 +1890,38 @@ HexagonTargetLowering::SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const { 
llvm_unreachable(Name.c_str()); } +SDValue +HexagonTargetLowering::WidenHvxLoad(SDValue Op, SelectionDAG &DAG) const { + const SDLoc &dl(Op); + auto *LoadN = cast(Op.getNode()); + assert(LoadN->isUnindexed() && "Not widening indexed loads yet"); + assert(LoadN->getMemoryVT().getVectorElementType() != MVT::i1 && + "Not widening loads of i1 yet"); + + SDValue Chain = LoadN->getChain(); + SDValue Base = LoadN->getBasePtr(); + SDValue Offset = DAG.getUNDEF(MVT::i32); + + MVT ResTy = ty(Op); + unsigned HwLen = Subtarget.getVectorLength(); + unsigned ResLen = ResTy.getStoreSize(); + assert(ResLen < HwLen && "vsetq(v1) prerequisite"); + + MVT BoolTy = MVT::getVectorVT(MVT::i1, HwLen); + SDValue Mask = getInstr(Hexagon::V6_pred_scalar2, dl, BoolTy, + {DAG.getConstant(ResLen, dl, MVT::i32)}, DAG); + + MVT LoadTy = MVT::getVectorVT(MVT::i8, HwLen); + MachineFunction &MF = DAG.getMachineFunction(); + auto *MemOp = MF.getMachineMemOperand(LoadN->getMemOperand(), 0, HwLen); + + SDValue Load = DAG.getMaskedLoad(LoadTy, dl, Chain, Base, Offset, Mask, + DAG.getUNDEF(LoadTy), LoadTy, MemOp, + ISD::UNINDEXED, ISD::NON_EXTLOAD, false); + SDValue Value = opCastElem(Load, ResTy.getVectorElementType(), DAG); + return DAG.getMergeValues({Value, Chain}, dl); +} + SDValue HexagonTargetLowering::WidenHvxStore(SDValue Op, SelectionDAG &DAG) const { const SDLoc &dl(Op); @@ -1912,23 +1948,33 @@ HexagonTargetLowering::WidenHvxStore(SDValue Op, SelectionDAG &DAG) const { assert(ValueLen < HwLen && "vsetq(v1) prerequisite"); MVT BoolTy = MVT::getVectorVT(MVT::i1, HwLen); - SDValue StoreQ = getInstr(Hexagon::V6_pred_scalar2, dl, BoolTy, - {DAG.getConstant(ValueLen, dl, MVT::i32)}, DAG); + SDValue Mask = getInstr(Hexagon::V6_pred_scalar2, dl, BoolTy, + {DAG.getConstant(ValueLen, dl, MVT::i32)}, DAG); MachineFunction &MF = DAG.getMachineFunction(); - auto *MOp = MF.getMachineMemOperand(StoreN->getMemOperand(), 0, HwLen); - return DAG.getMaskedStore(Chain, dl, Value, Base, Offset, StoreQ, ty(Value), - MOp, ISD::UNINDEXED, false, false); + auto *MemOp = MF.getMachineMemOperand(StoreN->getMemOperand(), 0, HwLen); + return DAG.getMaskedStore(Chain, dl, Value, Base, Offset, Mask, ty(Value), + MemOp, ISD::UNINDEXED, false, false); } SDValue -HexagonTargetLowering::WidenHvxTruncate(SDValue Op, SelectionDAG &DAG) const { +HexagonTargetLowering::WidenHvxExtend(SDValue Op, SelectionDAG &DAG) const { const SDLoc &dl(Op); unsigned HwWidth = 8*Subtarget.getVectorLength(); + SDValue Op0 = Op.getOperand(0); + MVT ResTy = ty(Op); + MVT OpTy = ty(Op0); + if (!Subtarget.isHVXElementType(OpTy) || !Subtarget.isHVXElementType(ResTy)) + return SDValue(); + + // .-res, op-> ScalarVec Illegal HVX + // Scalar ok - - + // Illegal widen(insert) widen - + // HVX - widen ok + auto getFactor = [HwWidth](MVT Ty) { unsigned Width = Ty.getSizeInBits(); - assert(HwWidth % Width == 0); - return HwWidth / Width; + return HwWidth > Width ? HwWidth / Width : 1; }; auto getWideTy = [getFactor](MVT Ty) { @@ -1936,19 +1982,60 @@ HexagonTargetLowering::WidenHvxTruncate(SDValue Op, SelectionDAG &DAG) const { return MVT::getVectorVT(Ty.getVectorElementType(), WideLen); }; + unsigned Opcode = Op.getOpcode() == ISD::SIGN_EXTEND ? 
HexagonISD::VUNPACK
+                                                      : HexagonISD::VUNPACKU;
+  SDValue WideOp = appendUndef(Op0, getWideTy(OpTy), DAG);
+  SDValue WideRes = DAG.getNode(Opcode, dl, getWideTy(ResTy), WideOp);
+  return WideRes;
+}
+
+SDValue
+HexagonTargetLowering::WidenHvxTruncate(SDValue Op, SelectionDAG &DAG) const {
+  const SDLoc &dl(Op);
+  unsigned HwWidth = 8*Subtarget.getVectorLength();
+
   SDValue Op0 = Op.getOperand(0);
   MVT ResTy = ty(Op);
   MVT OpTy = ty(Op0);
+  if (!Subtarget.isHVXElementType(OpTy) || !Subtarget.isHVXElementType(ResTy))
+    return SDValue();
+
+  // .-res, op->  ScalarVec      Illegal    HVX
+  // Scalar       ok             extract(widen)  -
+  // Illegal      -              widen      widen
+  // HVX          -              -          ok
+
+  auto getFactor = [HwWidth](MVT Ty) {
+    unsigned Width = Ty.getSizeInBits();
+    assert(HwWidth % Width == 0);
+    return HwWidth / Width;
+  };
+
+  auto getWideTy = [getFactor](MVT Ty) {
+    unsigned WideLen = Ty.getVectorNumElements() * getFactor(Ty);
+    return MVT::getVectorVT(Ty.getVectorElementType(), WideLen);
+  };
+
   if (Subtarget.isHVXVectorType(OpTy))
     return DAG.getNode(HexagonISD::VPACKL, dl, getWideTy(ResTy), Op0);
 
-  MVT WideOpTy = getWideTy(OpTy);
-  SmallVector Concats = {Op0};
-  for (int i = 0, e = getFactor(OpTy) - 1; i != e; ++i)
-    Concats.push_back(DAG.getUNDEF(OpTy));
+  assert(!isTypeLegal(OpTy) && "HVX-widening a truncate of scalar?");
 
-  SDValue Cat = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideOpTy, Concats);
-  return DAG.getNode(HexagonISD::VPACKL, dl, getWideTy(ResTy), Cat);
+  SDValue WideOp = appendUndef(Op0, getWideTy(OpTy), DAG);
+  SDValue WideRes = DAG.getNode(HexagonISD::VPACKL, dl, getWideTy(ResTy),
+                                WideOp);
+  // If the original result wasn't legal and was supposed to be widened,
+  // we're done.
+  if (shouldWidenToHvx(ResTy, DAG))
+    return WideRes;
+
+  // The original result type wasn't meant to be widened to HVX, so
+  // leave it as it is. Standard legalization should be able to deal
+  // with it (since now it's a result of a target-independent ISD
+  // node).
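A worked example of the getFactor/getWideTy arithmetic above (assuming the 128-byte HVX vector length; illustration only, not HVX lowering code):

#include <cstdio>

int main() {
  const unsigned HwWidth = 8 * 128;   // bits in one 128-byte HVX register
  unsigned NumElts = 8, EltBits = 16; // a v8i16 that needs widening
  unsigned Width = NumElts * EltBits; // 128 bits
  unsigned Factor = HwWidth / Width;  // 8
  std::printf("v%ui%u widens to v%ui%u\n", NumElts, EltBits,
              NumElts * Factor, EltBits); // v8i16 -> v64i16
}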
+ assert(ResTy.isVector()); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResTy, + {WideRes, getZero(dl, MVT::i32, DAG)}); } SDValue @@ -1965,6 +2052,8 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const { break; case ISD::LOAD: case ISD::STORE: + case ISD::MLOAD: + case ISD::MSTORE: return SplitHvxMemOp(Op, DAG); case ISD::CTPOP: case ISD::CTLZ: @@ -2029,11 +2118,21 @@ HexagonTargetLowering::LowerHvxOperationWrapper(SDNode *N, SDValue Op(N, 0); switch (Opc) { + case ISD::ANY_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + assert(shouldWidenToHvx(ty(Op.getOperand(0)), DAG) && "Not widening?"); + if (SDValue T = WidenHvxExtend(Op, DAG)) + Results.push_back(T); + break; + case ISD::TRUNCATE: + assert(shouldWidenToHvx(ty(Op.getOperand(0)), DAG) && "Not widening?"); + if (SDValue T = WidenHvxTruncate(Op, DAG)) + Results.push_back(T); + break; case ISD::STORE: { - assert( - getPreferredHvxVectorAction(ty(cast(N)->getValue())) == - TargetLoweringBase::TypeWidenVector && - "Not widening?"); + assert(shouldWidenToHvx(ty(cast(N)->getValue()), DAG) && + "Not widening?"); SDValue Store = WidenHvxStore(SDValue(N, 0), DAG); Results.push_back(Store); break; @@ -2061,12 +2160,26 @@ HexagonTargetLowering::ReplaceHvxNodeResults(SDNode *N, unsigned Opc = N->getOpcode(); SDValue Op(N, 0); switch (Opc) { + case ISD::ANY_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + assert(shouldWidenToHvx(ty(Op), DAG) && "Not widening?"); + if (SDValue T = WidenHvxExtend(Op, DAG)) + Results.push_back(T); + break; case ISD::TRUNCATE: - if (!Subtarget.isHVXVectorType(ty(Op), false)) { - SDValue T = WidenHvxTruncate(Op, DAG); + assert(shouldWidenToHvx(ty(Op), DAG) && "Not widening?"); + if (SDValue T = WidenHvxTruncate(Op, DAG)) Results.push_back(T); - } break; + case ISD::LOAD: { + assert(shouldWidenToHvx(ty(Op), DAG) && "Not widening?"); + SDValue Load = WidenHvxLoad(Op, DAG); + assert(Load->getOpcode() == ISD::MERGE_VALUES); + Results.push_back(Load.getOperand(0)); + Results.push_back(Load.getOperand(1)); + break; + } case ISD::BITCAST: if (isHvxBoolTy(ty(N->getOperand(0)))) { SDValue Op(N, 0); @@ -2084,27 +2197,59 @@ HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (DCI.isBeforeLegalizeOps()) return SDValue(); + const SDLoc &dl(N); + SelectionDAG &DAG = DCI.DAG; SDValue Op(N, 0); unsigned Opc = Op.getOpcode(); - if (Opc == ISD::VSELECT) { - // (vselect (xor x, qtrue), v0, v1) -> (vselect x, v1, v0) - SDValue Cond = Op.getOperand(0); - if (Cond->getOpcode() == ISD::XOR) { - SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1); - if (C1->getOpcode() == HexagonISD::QTRUE) { - SDValue VSel = DCI.DAG.getNode(ISD::VSELECT, dl, ty(Op), C0, - Op.getOperand(2), Op.getOperand(1)); - return VSel; + switch (Opc) { + case ISD::VSELECT: { + // (vselect (xor x, qtrue), v0, v1) -> (vselect x, v1, v0) + SDValue Cond = Op.getOperand(0); + if (Cond->getOpcode() == ISD::XOR) { + SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1); + if (C1->getOpcode() == HexagonISD::QTRUE) + return DAG.getNode(ISD::VSELECT, dl, ty(Op), C0, + Op.getOperand(2), Op.getOperand(1)); } + break; + } + case HexagonISD::VINSERTW0: + if (isUndef(Op.getOperand(1))) + return Op.getOperand(0); + break; + case HexagonISD::VROR: { + SDValue Op0 = Op.getOperand(0); + if (Op0.getOpcode() == HexagonISD::VROR) { + SDValue Vec = Op0.getOperand(0); + SDValue Rot0 = Op.getOperand(1), Rot1 = Op0.getOperand(1); + SDValue Rot = DAG.getNode(ISD::ADD, dl, ty(Rot0), {Rot0, Rot1}); + 
return DAG.getNode(HexagonISD::VROR, dl, ty(Op), {Vec, Rot}); + } + break; } } + return SDValue(); } +bool +HexagonTargetLowering::shouldWidenToHvx(MVT Ty, SelectionDAG &DAG) const { + assert(!Subtarget.isHVXVectorType(Ty, true)); + auto Action = getPreferredHvxVectorAction(Ty); + if (Action == TargetLoweringBase::TypeWidenVector) { + EVT WideTy = getTypeToTransformTo(*DAG.getContext(), Ty); + assert(WideTy.isSimple()); + return Subtarget.isHVXVectorType(WideTy.getSimpleVT(), true); + } + return false; +} + bool HexagonTargetLowering::isHvxOperation(SDNode *N, SelectionDAG &DAG) const { + if (!Subtarget.useHVXOps()) + return false; // If the type of any result, or any operand type are HVX vector types, // this is an HVX operation. auto IsHvxTy = [this](EVT Ty) { @@ -2122,15 +2267,7 @@ HexagonTargetLowering::isHvxOperation(SDNode *N, SelectionDAG &DAG) const { if (!Op.getValueType().isSimple()) return false; MVT ValTy = ty(Op); - if (ValTy.isVector()) { - auto Action = getPreferredVectorAction(ValTy); - if (Action == TargetLoweringBase::TypeWidenVector) { - EVT WideTy = getTypeToTransformTo(*DAG.getContext(), ValTy); - assert(WideTy.isSimple()); - return Subtarget.isHVXVectorType(WideTy.getSimpleVT(), true); - } - } - return false; + return ValTy.isVector() && shouldWidenToHvx(ValTy, DAG); }; for (int i = 0, e = N->getNumValues(); i != e; ++i) { diff --git a/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp b/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp index d818e0897f750..e026bb6d601d0 100644 --- a/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp +++ b/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp @@ -11,7 +11,9 @@ // //===----------------------------------------------------------------------===// +#include "Hexagon.h" #include "llvm/CodeGen/StackProtector.h" +#include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -19,8 +21,6 @@ #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" -#include "Hexagon.h" - using namespace llvm; namespace llvm { diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index b656a845b1526..b84c6eb27fe2a 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -41,6 +41,8 @@ def HexagonQCAT: SDNode<"HexagonISD::QCAT", SDTVecBinOp>; def HexagonQTRUE: SDNode<"HexagonISD::QTRUE", SDTVecLeaf>; def HexagonQFALSE: SDNode<"HexagonISD::QFALSE", SDTVecLeaf>; def HexagonVPACKL: SDNode<"HexagonISD::VPACKL", SDTVecUnaryOp>; +def HexagonVUNPACK: SDNode<"HexagonISD::VUNPACK", SDTVecUnaryOp>; +def HexagonVUNPACKU: SDNode<"HexagonISD::VUNPACKU", SDTVecUnaryOp>; def vzero: PatFrag<(ops), (HexagonVZERO)>; def qtrue: PatFrag<(ops), (HexagonQTRUE)>; @@ -48,8 +50,10 @@ def qfalse: PatFrag<(ops), (HexagonQFALSE)>; def qcat: PatFrag<(ops node:$Qs, node:$Qt), (HexagonQCAT node:$Qs, node:$Qt)>; -def qnot: PatFrag<(ops node:$Qs), (xor node:$Qs, qtrue)>; -def vpackl: PatFrag<(ops node:$Vs), (HexagonVPACKL node:$Vs)>; +def qnot: PatFrag<(ops node:$Qs), (xor node:$Qs, qtrue)>; +def vpackl: PatFrag<(ops node:$Vs), (HexagonVPACKL node:$Vs)>; +def vunpack: PatFrag<(ops node:$Vs), (HexagonVUNPACK node:$Vs)>; +def vunpacku: PatFrag<(ops node:$Vs), (HexagonVUNPACKU node:$Vs)>; def VSxtb: OutPatFrag<(ops node:$Vs), (V6_vunpackb $Vs)>; def VSxth: OutPatFrag<(ops node:$Vs), (V6_vunpackh $Vs)>; @@ -406,9 +410,30 @@ let Predicates = [UseHVX] in { def: Pat<(srl HVI16:$Vs, HVI16:$Vt), 
(V6_vlsrhv HvxVR:$Vs, HvxVR:$Vt)>; def: Pat<(srl HVI32:$Vs, HVI32:$Vt), (V6_vlsrwv HvxVR:$Vs, HvxVR:$Vt)>; + // Vpackl is a pseudo-op that is used when legalizing widened truncates. + // It should never be produced with a register pair in the output, but + // it can happen to have a pair as an input. def: Pat<(VecI8 (vpackl HVI16:$Vs)), (V6_vdealb HvxVR:$Vs)>; - def: Pat<(VecI8 (vpackl HVI32:$Vs)), (V6_vdealb4w HvxVR:$Vs, (IMPLICIT_DEF))>; + def: Pat<(VecI8 (vpackl HVI32:$Vs)), (V6_vdealb4w (IMPLICIT_DEF), HvxVR:$Vs)>; def: Pat<(VecI16 (vpackl HVI32:$Vs)), (V6_vdealh HvxVR:$Vs)>; + def: Pat<(VecI8 (vpackl HWI16:$Vs)), (V6_vpackeb (HiVec $Vs), (LoVec $Vs))>; + def: Pat<(VecI8 (vpackl HWI32:$Vs)), + (V6_vpackeb (IMPLICIT_DEF), (V6_vpackeh (HiVec $Vs), (LoVec $Vs)))>; + def: Pat<(VecI16 (vpackl HWI32:$Vs)), (V6_vpackeh (HiVec $Vs), (LoVec $Vs))>; + + def: Pat<(VecI16 (vunpack HVI8:$Vs)), (LoVec (VSxtb $Vs))>; + def: Pat<(VecI32 (vunpack HVI8:$Vs)), (LoVec (VSxth (LoVec (VSxtb $Vs))))>; + def: Pat<(VecI32 (vunpack HVI16:$Vs)), (LoVec (VSxth $Vs))>; + def: Pat<(VecPI16 (vunpack HVI8:$Vs)), (VSxtb $Vs)>; + def: Pat<(VecPI32 (vunpack HVI8:$Vs)), (VSxth (LoVec (VSxtb $Vs)))>; + def: Pat<(VecPI32 (vunpack HVI32:$Vs)), (VSxth $Vs)>; + + def: Pat<(VecI16 (vunpacku HVI8:$Vs)), (LoVec (VZxtb $Vs))>; + def: Pat<(VecI32 (vunpacku HVI8:$Vs)), (LoVec (VZxth (LoVec (VZxtb $Vs))))>; + def: Pat<(VecI32 (vunpacku HVI16:$Vs)), (LoVec (VZxth $Vs))>; + def: Pat<(VecPI16 (vunpacku HVI8:$Vs)), (VZxtb $Vs)>; + def: Pat<(VecPI32 (vunpacku HVI8:$Vs)), (VZxth (LoVec (VZxtb $Vs)))>; + def: Pat<(VecPI32 (vunpacku HVI32:$Vs)), (VZxth $Vs)>; def: Pat<(VecI16 (bswap HVI16:$Vs)), (V6_vdelta HvxVR:$Vs, (V6_lvsplatw (A2_tfrsi 0x01010101)))>; diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index b1d06b0c3937a..60792929be918 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -10,10 +10,10 @@ // //===----------------------------------------------------------------------===// +#include "HexagonSubtarget.h" #include "Hexagon.h" #include "HexagonInstrInfo.h" #include "HexagonRegisterInfo.h" -#include "HexagonSubtarget.h" #include "MCTargetDesc/HexagonMCTargetDesc.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" @@ -26,6 +26,7 @@ #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetMachine.h" #include #include #include @@ -38,7 +39,6 @@ using namespace llvm; #define GET_SUBTARGETINFO_TARGET_DESC #include "HexagonGenSubtargetInfo.inc" - static cl::opt EnableBSBSched("enable-bsb-sched", cl::Hidden, cl::ZeroOrMore, cl::init(true)); diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index c47b95c5ad2aa..5b71784bac260 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -275,6 +275,17 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo { return makeArrayRef(Types); } + bool isHVXElementType(MVT Ty, bool IncludeBool = false) const { + if (!useHVXOps()) + return false; + if (Ty.isVector()) + Ty = Ty.getVectorElementType(); + if (IncludeBool && Ty == MVT::i1) + return true; + ArrayRef ElemTypes = getHVXElementTypes(); + return llvm::find(ElemTypes, Ty) != ElemTypes.end(); + } + bool isHVXVectorType(MVT VecTy, bool IncludeBool = false) const { if (!VecTy.isVector() || !useHVXOps() || 
VecTy.isScalableVector()) return false; @@ -298,7 +309,7 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo { unsigned VecWidth = VecTy.getSizeInBits(); if (VecWidth != 8*HwLen && VecWidth != 16*HwLen) return false; - return llvm::any_of(ElemTypes, [ElemTy] (MVT T) { return ElemTy == T; }); + return llvm::find(ElemTypes, ElemTy) != ElemTypes.end(); } unsigned getTypeAlignment(MVT Ty) const { diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 6728306db3d57..37cf391c99838 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -327,6 +327,7 @@ void HexagonPassConfig::addIRPasses() { .forwardSwitchCondToPhi(true) .convertSwitchToLookupTable(true) .needCanonicalLoops(false) + .hoistCommonInsts(true) .sinkCommonInsts(true))); if (EnableLoopPrefetch) addPass(createLoopDataPrefetchPass()); diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index 2da35020006e2..3416a56a1de18 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -3025,8 +3025,8 @@ SDValue MipsTargetLowering::passArgOnStack(SDValue StackPtr, unsigned Offset, MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); int FI = MFI.CreateFixedObject(Arg.getValueSizeInBits() / 8, Offset, false); SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - return DAG.getStore(Chain, DL, Arg, FIN, MachinePointerInfo(), - /* Alignment = */ 0, MachineMemOperand::MOVolatile); + return DAG.getStore(Chain, DL, Arg, FIN, MachinePointerInfo(), MaybeAlign(), + MachineMemOperand::MOVolatile); } void MipsTargetLowering:: @@ -4404,7 +4404,7 @@ void MipsTargetLowering::passByValArg( SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg, DAG.getConstant(OffsetInBytes, DL, PtrTy)); SDValue LoadVal = DAG.getLoad(RegTy, DL, Chain, LoadPtr, - MachinePointerInfo(), Alignment.value()); + MachinePointerInfo(), Alignment); MemOpChains.push_back(LoadVal.getValue(1)); unsigned ArgReg = ArgRegs[FirstReg + I]; RegsToPass.push_back(std::make_pair(ArgReg, LoadVal)); @@ -4431,7 +4431,7 @@ void MipsTargetLowering::passByValArg( PtrTy)); SDValue LoadVal = DAG.getExtLoad( ISD::ZEXTLOAD, DL, RegTy, Chain, LoadPtr, MachinePointerInfo(), - MVT::getIntegerVT(LoadSizeInBytes * 8), Alignment.value()); + MVT::getIntegerVT(LoadSizeInBytes * 8), Alignment); MemOpChains.push_back(LoadVal.getValue(1)); // Shift the loaded value. 
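The Mips hunks on either side of this point swap raw integer alignment arguments for the typed Align/MaybeAlign API. A minimal sketch of that idiom, illustrative only and not part of the patch, assuming just llvm/Support/Alignment.h:

#include <cstdint>
#include "llvm/Support/Alignment.h"

void alignmentIdiom() {
  llvm::MaybeAlign Unset;          // replaces the old "Alignment = 0" sentinel
  llvm::Align Sixteen(16);         // a known 16-byte alignment, as in lowerMSALoadIntr
  uint64_t Raw = Sixteen.value();  // back to a plain integer where an API still wants one
  (void)Unset;
  (void)Raw;
}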
diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp index bdf29c53cbd54..4a448a5f7c681 100644 --- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp @@ -2307,7 +2307,7 @@ static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr, Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset); return DAG.getLoad(ResTy, DL, ChainIn, Address, MachinePointerInfo(), - /* Alignment = */ 16); + Align(16)); } SDValue MipsSETargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op, @@ -2382,7 +2382,7 @@ static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr, Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset); return DAG.getStore(ChainIn, DL, Value, Address, MachinePointerInfo(), - /* Alignment = */ 16); + Align(16)); } SDValue MipsSETargetLowering::lowerINTRINSIC_VOID(SDValue Op, diff --git a/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/llvm/lib/Target/Mips/MipsTargetMachine.cpp index 5433b29f3f089..7e2c43164d52f 100644 --- a/llvm/lib/Target/Mips/MipsTargetMachine.cpp +++ b/llvm/lib/Target/Mips/MipsTargetMachine.cpp @@ -316,7 +316,7 @@ void MipsPassConfig::addPreEmitPass() { } bool MipsPassConfig::addIRTranslator() { - addPass(new IRTranslator()); + addPass(new IRTranslator(getOptLevel())); return false; } diff --git a/llvm/lib/Target/PowerPC/CMakeLists.txt b/llvm/lib/Target/PowerPC/CMakeLists.txt index 5a06faa16be19..882fb0a5b7e2b 100644 --- a/llvm/lib/Target/PowerPC/CMakeLists.txt +++ b/llvm/lib/Target/PowerPC/CMakeLists.txt @@ -11,10 +11,13 @@ tablegen(LLVM PPCGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM PPCGenRegisterInfo.inc -gen-register-info) tablegen(LLVM PPCGenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM PPCGenExegesis.inc -gen-exegesis) +tablegen(LLVM PPCGenRegisterBank.inc -gen-register-bank) +tablegen(LLVM PPCGenGlobalISel.inc -gen-global-isel) add_public_tablegen_target(PowerPCCommonTableGen) add_llvm_target(PowerPCCodeGen + GISel/PPCInstructionSelector.cpp PPCBoolRetToInt.cpp PPCAsmPrinter.cpp PPCBranchSelector.cpp @@ -49,6 +52,9 @@ add_llvm_target(PowerPCCodeGen PPCExpandISEL.cpp PPCPreEmitPeephole.cpp PPCLowerMASSVEntries.cpp + GISel/PPCCallLowering.cpp + GISel/PPCRegisterBankInfo.cpp + GISel/PPCLegalizerInfo.cpp ) add_subdirectory(AsmParser) diff --git a/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp b/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp new file mode 100644 index 0000000000000..dea28e971fedd --- /dev/null +++ b/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp @@ -0,0 +1,51 @@ +//===-- PPCCallLowering.h - Call lowering for GlobalISel -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the lowering of LLVM calls to machine code calls for +/// GlobalISel. 
+///
+//===----------------------------------------------------------------------===//
+
+#include "PPCCallLowering.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "ppc-call-lowering"
+
+using namespace llvm;
+
+PPCCallLowering::PPCCallLowering(const PPCTargetLowering &TLI)
+    : CallLowering(&TLI) {}
+
+bool PPCCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+                                  const Value *Val, ArrayRef<Register> VRegs,
+                                  Register SwiftErrorVReg) const {
+  assert(((Val && !VRegs.empty()) || (!Val && VRegs.empty())) &&
+         "Return value without a vreg");
+  if (VRegs.size() > 0)
+    return false;
+
+  MIRBuilder.buildInstr(PPC::BLR8);
+  return true;
+}
+
+bool PPCCallLowering::lowerFormalArguments(
+    MachineIRBuilder &MIRBuilder, const Function &F,
+    ArrayRef<ArrayRef<Register>> VRegs) const {
+
+  // If VRegs is empty, there are no formal arguments to lower, so we can
+  // always return true. If there are formal arguments, we currently do not
+  // handle them and thus return false.
+  return VRegs.empty();
+}
+
+bool PPCCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+                                CallLoweringInfo &Info) const {
+  return false;
+}
diff --git a/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h b/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h
new file mode 100644
index 0000000000000..ef078aa8ed838
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h
@@ -0,0 +1,39 @@
+//===-- PPCCallLowering.h - Call lowering for GlobalISel -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes how to lower LLVM calls to machine code calls.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_GISEL_PPCCALLLOWERING_H
+#define LLVM_LIB_TARGET_POWERPC_GISEL_PPCCALLLOWERING_H
+
+#include "PPCISelLowering.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+class PPCTargetLowering;
+
+class PPCCallLowering : public CallLowering {
+public:
+  PPCCallLowering(const PPCTargetLowering &TLI);
+
+  bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
+                   ArrayRef<Register> VRegs,
+                   Register SwiftErrorVReg) const override;
+  bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+                            ArrayRef<ArrayRef<Register>> VRegs) const override;
+  bool lowerCall(MachineIRBuilder &MIRBuilder,
+                 CallLoweringInfo &Info) const override;
+};
+} // end namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp b/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp
new file mode 100644
index 0000000000000..7d64816ed6c7f
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp
@@ -0,0 +1,92 @@
+//===- PPCInstructionSelector.cpp --------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the InstructionSelector class for
+/// PowerPC.
+//===----------------------------------------------------------------------===// + +#include "PPCInstrInfo.h" +#include "PPCRegisterBankInfo.h" +#include "PPCSubtarget.h" +#include "PPCTargetMachine.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/IntrinsicsPowerPC.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "ppc-gisel" + +using namespace llvm; + +namespace { + +#define GET_GLOBALISEL_PREDICATE_BITSET +#include "PPCGenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATE_BITSET + +class PPCInstructionSelector : public InstructionSelector { +public: + PPCInstructionSelector(const PPCTargetMachine &TM, const PPCSubtarget &STI, + const PPCRegisterBankInfo &RBI); + + bool select(MachineInstr &I) override; + static const char *getName() { return DEBUG_TYPE; } + +private: + /// tblgen generated 'select' implementation that is used as the initial + /// selector for the patterns that do not require complex C++. + bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; + + const PPCInstrInfo &TII; + const PPCRegisterInfo &TRI; + const PPCRegisterBankInfo &RBI; + +#define GET_GLOBALISEL_PREDICATES_DECL +#include "PPCGenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATES_DECL + +#define GET_GLOBALISEL_TEMPORARIES_DECL +#include "PPCGenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_DECL +}; + +} // end anonymous namespace + +#define GET_GLOBALISEL_IMPL +#include "PPCGenGlobalISel.inc" +#undef GET_GLOBALISEL_IMPL + +PPCInstructionSelector::PPCInstructionSelector(const PPCTargetMachine &TM, + const PPCSubtarget &STI, + const PPCRegisterBankInfo &RBI) + : InstructionSelector(), TII(*STI.getInstrInfo()), + TRI(*STI.getRegisterInfo()), RBI(RBI), +#define GET_GLOBALISEL_PREDICATES_INIT +#include "PPCGenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATES_INIT +#define GET_GLOBALISEL_TEMPORARIES_INIT +#include "PPCGenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_INIT +{ +} + +bool PPCInstructionSelector::select(MachineInstr &I) { + if (selectImpl(I, *CoverageInfo)) + return true; + return false; +} + +namespace llvm { +InstructionSelector * +createPPCInstructionSelector(const PPCTargetMachine &TM, + const PPCSubtarget &Subtarget, + const PPCRegisterBankInfo &RBI) { + return new PPCInstructionSelector(TM, Subtarget, RBI); +} +} // end namespace llvm diff --git a/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.cpp b/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.cpp new file mode 100644 index 0000000000000..c16bcaea592bf --- /dev/null +++ b/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.cpp @@ -0,0 +1,20 @@ +//===- PPCLegalizerInfo.h ----------------------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the MachineLegalizer class for
+/// PowerPC.
+//===----------------------------------------------------------------------===//
+
+#include "PPCLegalizerInfo.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "ppc-legalinfo"
+
+using namespace llvm;
+using namespace LegalizeActions;
+
+PPCLegalizerInfo::PPCLegalizerInfo(const PPCSubtarget &ST) { computeTables(); }
diff --git a/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.h b/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.h
new file mode 100644
index 0000000000000..c73186d3d0c11
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.h
@@ -0,0 +1,28 @@
+//===- PPCLegalizerInfo.h ----------------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the MachineLegalizer class for PowerPC.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_GISEL_PPCMACHINELEGALIZER_H
+#define LLVM_LIB_TARGET_POWERPC_GISEL_PPCMACHINELEGALIZER_H
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+namespace llvm {
+
+class PPCSubtarget;
+
+/// This class provides the information for the PowerPC target legalizer for
+/// GlobalISel.
+class PPCLegalizerInfo : public LegalizerInfo {
+public:
+  PPCLegalizerInfo(const PPCSubtarget &ST);
+};
+} // namespace llvm
+#endif
diff --git a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp
new file mode 100644
index 0000000000000..6af79324919cc
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp
@@ -0,0 +1,27 @@
+//===- PPCRegisterBankInfo.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for
+/// PowerPC.
+//===----------------------------------------------------------------------===//
+
+#include "PPCRegisterBankInfo.h"
+#include "PPCRegisterInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "ppc-reg-bank-info"
+
+#define GET_TARGET_REGBANK_IMPL
+#include "PPCGenRegisterBank.inc"
+
+using namespace llvm;
+
+PPCRegisterBankInfo::PPCRegisterBankInfo(const TargetRegisterInfo &TRI)
+    : PPCGenRegisterBankInfo() {}
diff --git a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h
new file mode 100644
index 0000000000000..358d5ed3cf14e
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h
@@ -0,0 +1,39 @@
+//===-- PPCRegisterBankInfo.h -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares the targeting of the RegisterBankInfo class for PowerPC. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_PPC_GISEL_PPCREGISTERBANKINFO_H +#define LLVM_LIB_TARGET_PPC_GISEL_PPCREGISTERBANKINFO_H + +#include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" + +#define GET_REGBANK_DECLARATIONS +#include "PPCGenRegisterBank.inc" + +namespace llvm { +class TargetRegisterInfo; + +class PPCGenRegisterBankInfo : public RegisterBankInfo { +protected: +#define GET_TARGET_REGBANK_CLASS +#include "PPCGenRegisterBank.inc" +}; + +class PPCRegisterBankInfo final : public PPCGenRegisterBankInfo { +public: + PPCRegisterBankInfo(const TargetRegisterInfo &TRI); +}; +} // namespace llvm + +#endif diff --git a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBanks.td b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBanks.td new file mode 100644 index 0000000000000..0e8a4b7061c5a --- /dev/null +++ b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBanks.td @@ -0,0 +1,15 @@ +//===-- PPCRegisterBanks.td - Describe the PPC Banks -------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Define the PPC register banks used for GlobalISel. 
+/// +//===----------------------------------------------------------------------===// + +/// General Purpose Registers +def GPRRegBank : RegisterBank<"GPR", [G8RC]>; diff --git a/llvm/lib/Target/PowerPC/LLVMBuild.txt b/llvm/lib/Target/PowerPC/LLVMBuild.txt index 34c295731697c..ed38d2a402141 100644 --- a/llvm/lib/Target/PowerPC/LLVMBuild.txt +++ b/llvm/lib/Target/PowerPC/LLVMBuild.txt @@ -30,5 +30,5 @@ has_jit = 1 type = Library name = PowerPCCodeGen parent = PowerPC -required_libraries = Analysis AsmPrinter CodeGen Core MC PowerPCDesc PowerPCInfo Scalar SelectionDAG Support Target TransformUtils +required_libraries = Analysis AsmPrinter CodeGen Core MC PowerPCDesc PowerPCInfo Scalar SelectionDAG Support Target TransformUtils GlobalISel add_to_library_groups = PowerPC diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index 006cd57f517e9..601e11d4ee8e5 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -419,7 +419,13 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, } break; case PPC::fixup_ppc_imm34: - report_fatal_error("Unsupported Modifier for fixup_ppc_imm34."); + switch (Modifier) { + default: + report_fatal_error("Unsupported Modifier for fixup_ppc_imm34."); + case MCSymbolRefExpr::VK_TPREL: + Type = ELF::R_PPC64_TPREL34; + break; + } break; case FK_Data_8: switch (Modifier) { diff --git a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h index e8a9032bfbeec..e242d319470bc 100644 --- a/llvm/lib/Target/PowerPC/PPC.h +++ b/llvm/lib/Target/PowerPC/PPC.h @@ -20,17 +20,20 @@ #undef PPC namespace llvm { - class PPCTargetMachine; - class PassRegistry; - class FunctionPass; - class MachineInstr; - class MachineOperand; - class AsmPrinter; - class MCInst; - class MCOperand; - class ModulePass; - - FunctionPass *createPPCCTRLoops(); +class PPCRegisterBankInfo; +class PPCSubtarget; +class PPCTargetMachine; +class PassRegistry; +class FunctionPass; +class InstructionSelector; +class MachineInstr; +class MachineOperand; +class AsmPrinter; +class MCInst; +class MCOperand; +class ModulePass; + +FunctionPass *createPPCCTRLoops(); #ifndef NDEBUG FunctionPass *createPPCCTRLoopsVerify(); #endif @@ -78,7 +81,10 @@ namespace llvm { ModulePass *createPPCLowerMASSVEntriesPass(); void initializePPCLowerMASSVEntriesPass(PassRegistry &); extern char &PPCLowerMASSVEntriesID; - + + InstructionSelector * + createPPCInstructionSelector(const PPCTargetMachine &, const PPCSubtarget &, + const PPCRegisterBankInfo &); namespace PPCII { /// Target Operand Flag enum. 
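With the scaffolding above, the PowerPC GlobalISel path is deliberately minimal: lowerReturn emits a bare BLR8 and only succeeds for functions returning void, and lowerFormalArguments succeeds only for an empty argument list. Assuming the usual GlobalISel driver flags, the path can likely be smoke-tested with something like `llc -global-isel -O0 -mtriple=ppc64le-unknown-linux-gnu` on a module containing just `define void @f() { ret void }`.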
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index a617715d4bd86..81e5b3859a1f5 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -174,6 +174,9 @@ def FeatureAddisLoadFusion : SubtargetFeature<"fuse-addis-load", "HasAddisLoadFusion", "true", "Power8 Addis-Load fusion", [FeatureFusion]>; +def FeatureStoreFusion : SubtargetFeature<"fuse-store", "HasStoreFusion", "true", + "Target supports store clustering", + [FeatureFusion]>; def FeatureUnalignedFloats : SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess", "true", "CPU does not trap on unaligned FP access">; @@ -325,6 +328,8 @@ def ProcessorFeatures { [DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, + FeaturePPCPreRASched, + FeaturePPCPostRASched, FeatureISA3_0, FeaturePredictableSelectIsExpensive ]; @@ -334,9 +339,7 @@ def ProcessorFeatures { // dispatch for vector operations than scalar ones. For the time being, // this list also includes scheduling-related features since we do not have // enough info to create custom scheduling strategies for future CPUs. - list P9SpecificFeatures = [FeatureVectorsUseTwoUnits, - FeaturePPCPreRASched, - FeaturePPCPostRASched]; + list P9SpecificFeatures = [FeatureVectorsUseTwoUnits]; list P9InheritableFeatures = !listconcat(P8InheritableFeatures, P9AdditionalFeatures); list P9Features = @@ -345,10 +348,12 @@ def ProcessorFeatures { // Power10 // For P10 CPU we assume that all of the existing features from Power9 // still exist with the exception of those we know are Power9 specific. + list FusionFeatures = [FeatureStoreFusion]; list P10AdditionalFeatures = - [DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs, - FeaturePCRelativeMemops, FeatureP10Vector, FeatureMMA, - FeaturePairedVectorMemops]; + !listconcat(FusionFeatures, [ + DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs, + FeaturePCRelativeMemops, FeatureP10Vector, FeatureMMA, + FeaturePairedVectorMemops]); list P10SpecificFeatures = []; list P10InheritableFeatures = !listconcat(P9InheritableFeatures, P10AdditionalFeatures); @@ -433,6 +438,7 @@ def getAltVSXFMAOpcode : InstrMapping { include "PPCRegisterInfo.td" include "PPCSchedule.td" +include "GISel/PPCRegisterBanks.td" //===----------------------------------------------------------------------===// // PowerPC processors supported. @@ -558,7 +564,7 @@ def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.P7Features>; def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.P8Features>; def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.P9Features>; // No scheduler model yet. -def : ProcessorModel<"pwr10", NoSchedModel, ProcessorFeatures.P10Features>; +def : ProcessorModel<"pwr10", P9Model, ProcessorFeatures.P10Features>; // No scheduler model for future CPU. 
def : ProcessorModel<"future", NoSchedModel, ProcessorFeatures.FutureFeatures>; diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 8f1477012bfdd..f950e748158f5 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -579,6 +579,38 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { } } #endif + + auto getTOCRelocAdjustedExprForXCOFF = [this](const MCExpr *Expr, + ptrdiff_t OriginalOffset) { + // Apply an offset to the TOC-based expression such that the adjusted + // notional offset from the TOC base (to be encoded into the instruction's D + // or DS field) is the signed 16-bit truncation of the original notional + // offset from the TOC base. + // This is consistent with the treatment used both by XL C/C++ and + // by AIX ld -r. + ptrdiff_t Adjustment = + OriginalOffset - llvm::SignExtend32<16>(OriginalOffset); + return MCBinaryExpr::createAdd( + Expr, MCConstantExpr::create(-Adjustment, OutContext), OutContext); + }; + + auto getTOCEntryLoadingExprForXCOFF = + [IsPPC64, getTOCRelocAdjustedExprForXCOFF, + this](const MCSymbol *MOSymbol, const MCExpr *Expr) -> const MCExpr * { + const unsigned EntryByteSize = IsPPC64 ? 8 : 4; + const auto TOCEntryIter = TOC.find(MOSymbol); + assert(TOCEntryIter != TOC.end() && + "Could not find the TOC entry for this symbol."); + const ptrdiff_t EntryDistanceFromTOCBase = + (TOCEntryIter - TOC.begin()) * EntryByteSize; + constexpr int16_t PositiveTOCRange = INT16_MAX; + + if (EntryDistanceFromTOCBase > PositiveTOCRange) + return getTOCRelocAdjustedExprForXCOFF(Expr, EntryDistanceFromTOCBase); + + return Expr; + }; + // Lower multi-instruction pseudo operations. switch (MI->getOpcode()) { default: break; @@ -725,6 +757,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { assert( TM.getCodeModel() == CodeModel::Small && "This pseudo should only be selected for 32-bit small code model."); + Exp = getTOCEntryLoadingExprForXCOFF(MOSymbol, Exp); TmpInst.getOperand(1) = MCOperand::createExpr(Exp); EmitToStreamer(*OutStreamer, TmpInst); return; @@ -753,17 +786,20 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) && "Invalid operand!"); + // Map the operand to its corresponding MCSymbol. + const MCSymbol *const MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + // Map the machine operand to its corresponding MCSymbol, then map the // global address operand to be a reference to the TOC entry we will // synthesize later. - MCSymbol *TOCEntry = - lookUpOrCreateTOCEntry(getMCSymbolForTOCPseudoMO(MO, *this)); + MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol); const MCSymbolRefExpr::VariantKind VK = IsAIX ? MCSymbolRefExpr::VK_None : MCSymbolRefExpr::VK_PPC_TOC; const MCExpr *Exp = MCSymbolRefExpr::create(TOCEntry, VK, OutContext); - TmpInst.getOperand(1) = MCOperand::createExpr(Exp); + TmpInst.getOperand(1) = MCOperand::createExpr( + IsAIX ? getTOCEntryLoadingExprForXCOFF(MOSymbol, Exp) : Exp); EmitToStreamer(*OutStreamer, TmpInst); return; } @@ -1821,16 +1857,6 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) { PPCTargetStreamer *TS = static_cast(OutStreamer->getTargetStreamer()); - const unsigned EntryByteSize = Subtarget->isPPC64() ? 8 : 4; - const unsigned TOCEntriesByteSize = TOC.size() * EntryByteSize; - // TODO: If TOC entries' size is larger than 32768, then we run out of - // positive displacement to reach the TOC entry. 
We need to decide how to - // handle entries' size larger than that later. - if (TOCEntriesByteSize > 32767) { - report_fatal_error("Handling of TOC entry displacement larger than 32767 " - "is not yet implemented."); - } - for (auto &I : TOC) { // Setup the csect for the current TC entry. MCSectionXCOFF *TCEntry = cast( diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 62bb5cc1e8062..a70e7468a15b2 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -691,6 +691,8 @@ bool PPCDAGToDAGISel::tryTLSXFormLoad(LoadSDNode *LD) { SDValue Offset = LD->getOffset(); if (!Offset.isUndef()) return false; + if (Base.getOperand(1).getOpcode() == PPCISD::TLS_LOCAL_EXEC_MAT_ADDR) + return false; SDLoc dl(LD); EVT MemVT = LD->getMemoryVT(); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index b213abb57aa83..6bdebf9111d6e 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -316,8 +316,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal); setOperationAction(ISD::STRICT_FMA, MVT::f64, Legal); - if (Subtarget.hasVSX()) - setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f64, Legal); + if (Subtarget.hasVSX()) { + setOperationAction(ISD::STRICT_FRINT, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FRINT, MVT::f64, Legal); + } if (Subtarget.hasFSQRT()) { setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal); @@ -886,6 +888,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::SREM, MVT::v2i64, Legal); setOperationAction(ISD::UREM, MVT::v4i32, Legal); setOperationAction(ISD::SREM, MVT::v4i32, Legal); + setOperationAction(ISD::UDIV, MVT::v1i128, Legal); + setOperationAction(ISD::SDIV, MVT::v1i128, Legal); } setOperationAction(ISD::MUL, MVT::v8i16, Legal); @@ -1059,7 +1063,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FMAXNUM, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FMINNUM, MVT::v4f32, Legal); - setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FRINT, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FFLOOR, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FCEIL, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FTRUNC, MVT::v4f32, Legal); @@ -1073,7 +1077,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FMAXNUM, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FMINNUM, MVT::v2f64, Legal); - setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FRINT, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FFLOOR, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FCEIL, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FTRUNC, MVT::v2f64, Legal); @@ -1199,6 +1203,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setLibcallName(RTLIB::SRA_I128, nullptr); } + if (!isPPC64) + setMaxAtomicSizeInBitsSupported(32); + setStackPointerRegisterToSaveRestore(isPPC64 ? 
PPC::X1 : PPC::R1); // We have target-specific dag combine patterns for the following nodes: @@ -1315,6 +1322,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, MaxLoadsPerMemcmpOptSize = 4; } + IsStrictFPEnabled = true; + // Let the subtarget (CPU) decide if a predictable select is more expensive // than the corresponding branch. This information is used in CGP to decide // when to convert selects into branches. @@ -1505,6 +1514,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::MAT_PCREL_ADDR: return "PPCISD::MAT_PCREL_ADDR"; case PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR: return "PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR"; + case PPCISD::TLS_LOCAL_EXEC_MAT_ADDR: + return "PPCISD::TLS_LOCAL_EXEC_MAT_ADDR"; case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT"; case PPCISD::FNMSUB: return "PPCISD::FNMSUB"; case PPCISD::STRICT_FADDRTZ: @@ -3008,6 +3019,15 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, TLSModel::Model Model = TM.getTLSModel(GV); if (Model == TLSModel::LocalExec) { + if (Subtarget.isUsingPCRelativeCalls()) { + SDValue TLSReg = DAG.getRegister(PPC::X13, MVT::i64); + SDValue TGA = DAG.getTargetGlobalAddress( + GV, dl, PtrVT, 0, (PPCII::MO_PCREL_FLAG | PPCII::MO_TPREL_FLAG)); + SDValue MatAddr = + DAG.getNode(PPCISD::TLS_LOCAL_EXEC_MAT_ADDR, dl, PtrVT, TGA); + return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TLSReg, MatAddr); + } + SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_HA); SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, @@ -8219,8 +8239,8 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT); EVT DstSetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), DstVT); - SDValue Sel = - DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT, Chain, true); + SDValue Sel = DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT, + SDNodeFlags(), Chain, true); Chain = Sel.getValue(1); SDValue FltOfs = DAG.getSelect( @@ -14074,8 +14094,7 @@ SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N, EVT Op1VT = N->getOperand(1).getValueType(); EVT ResVT = Val.getValueType(); - // Floating point types smaller than 32 bits are not legal on Power. - if (ResVT.getScalarSizeInBits() < 32) + if (!isTypeLegal(ResVT)) return SDValue(); // Only perform combine for conversion to i64/i32 or power9 i16/i8. diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 05c9a5d314133..3e900e2ce2999 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -441,6 +441,11 @@ namespace llvm { /// through an add like PADDI. TLS_DYNAMIC_MAT_PCREL_ADDR, + /// TLS_LOCAL_EXEC_MAT_ADDR = Materialize an address for TLS global address + /// when using local exec access models, and when prefixed instructions are + /// available. This is used with ADD_TLS to produce an add like PADDI. 
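+  /// For example (illustrative): for a local-exec variable x, the combination
+  ///   ADD_TLS(X13, TLS_LOCAL_EXEC_MAT_ADDR(x@tprel))
+  /// is intended to select to a single "paddi 3, 13, x@TPREL", adding the
+  /// thread pointer in X13 to the tprel offset folded into the paddi.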
+  TLS_LOCAL_EXEC_MAT_ADDR,
+
   // Constrained conversion from floating point to int
   STRICT_FCTIDZ = ISD::FIRST_TARGET_STRICTFP_OPCODE,
   STRICT_FCTIWZ,
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 0732e0f0ace36..7e5e42fdf47e8 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -2222,6 +2222,112 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
   return true;
 }
 
+bool PPCInstrInfo::getMemOperandsWithOffsetWidth(
+    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
+    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+    const TargetRegisterInfo *TRI) const {
+  const MachineOperand *BaseOp;
+  OffsetIsScalable = false;
+  if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI))
+    return false;
+  BaseOps.push_back(BaseOp);
+  return true;
+}
+
+static bool isLdStSafeToCluster(const MachineInstr &LdSt,
+                                const TargetRegisterInfo *TRI) {
+  // If this is a volatile load/store, don't mess with it.
+  if (LdSt.hasOrderedMemoryRef() || LdSt.getNumExplicitOperands() != 3)
+    return false;
+
+  if (LdSt.getOperand(2).isFI())
+    return true;
+
+  assert(LdSt.getOperand(2).isReg() && "Expected a reg operand.");
+  // Can't cluster if the instruction modifies the base register
+  // or is in update form, e.g. ld r2, 3(r2).
+  if (LdSt.modifiesRegister(LdSt.getOperand(2).getReg(), TRI))
+    return false;
+
+  return true;
+}
+
+// Only cluster an instruction pair if the two instructions have the same
+// opcode and that opcode is clusterable according to the PowerPC
+// specification.
+static bool isClusterableLdStOpcPair(unsigned FirstOpc, unsigned SecondOpc,
+                                     const PPCSubtarget &Subtarget) {
+  switch (FirstOpc) {
+  default:
+    return false;
+  case PPC::STD:
+  case PPC::STFD:
+  case PPC::STXSD:
+  case PPC::DFSTOREf64:
+    return FirstOpc == SecondOpc;
+  // The PowerPC backend has the opcodes STW/STW8 for the instruction "stw" to
+  // handle 32-bit and 64-bit instruction selection. They form a clusterable
+  // pair even though their opcodes differ.
+  case PPC::STW:
+  case PPC::STW8:
+    return SecondOpc == PPC::STW || SecondOpc == PPC::STW8;
+  }
+}
+
+bool PPCInstrInfo::shouldClusterMemOps(
+    ArrayRef<const MachineOperand *> BaseOps1,
+    ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
+    unsigned NumBytes) const {
+
+  assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
+  const MachineOperand &BaseOp1 = *BaseOps1.front();
+  const MachineOperand &BaseOp2 = *BaseOps2.front();
+  assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
+         "Only base registers and frame indices are supported.");
+
+  // NumLoads is the number of memory ops that have already been clustered.
+  // Don't cluster another memory op if at least two are already clustered.
+  if (NumLoads > 2)
+    return false;
+
+  // Cluster the load/store only when they have the same base
+  // register or FI.
+  if ((BaseOp1.isReg() != BaseOp2.isReg()) ||
+      (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) ||
+      (BaseOp1.isFI() && BaseOp1.getIndex() != BaseOp2.getIndex()))
+    return false;
+
+  // Check if the loads/stores are clusterable according to the PowerPC
+  // specification.
+  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
+  const MachineInstr &SecondLdSt = *BaseOp2.getParent();
+  unsigned FirstOpc = FirstLdSt.getOpcode();
+  unsigned SecondOpc = SecondLdSt.getOpcode();
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  // Cluster the load/store only when the two instructions share an opcode
+  // pair that is clusterable according to the PowerPC specification.
+  if (!isClusterableLdStOpcPair(FirstOpc, SecondOpc, Subtarget))
+    return false;
+
+  // Can't cluster loads/stores that have an ordered or volatile memory
+  // reference.
+  if (!isLdStSafeToCluster(FirstLdSt, TRI) ||
+      !isLdStSafeToCluster(SecondLdSt, TRI))
+    return false;
+
+  int64_t Offset1 = 0, Offset2 = 0;
+  unsigned Width1 = 0, Width2 = 0;
+  const MachineOperand *Base1 = nullptr, *Base2 = nullptr;
+  if (!getMemOperandWithOffsetWidth(FirstLdSt, Base1, Offset1, Width1, TRI) ||
+      !getMemOperandWithOffsetWidth(SecondLdSt, Base2, Offset2, Width2, TRI) ||
+      Width1 != Width2)
+    return false;
+
+  assert(Base1 == &BaseOp1 && Base2 == &BaseOp2 &&
+         "getMemOperandWithOffsetWidth returned an incorrect base op");
+  // The caller should already have ordered FirstMemOp/SecondMemOp by offset.
+  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
+  return Offset1 + Width1 == Offset2;
+}
+
 /// GetInstSize - Return the number of bytes of code the specified
 /// instruction may be.  This returns the maximum number of bytes.
 ///
@@ -4660,11 +4766,12 @@ MachineInstr *PPCInstrInfo::findLoopInstr(
 bool PPCInstrInfo::getMemOperandWithOffsetWidth(
   const MachineInstr &LdSt, const MachineOperand *&BaseReg, int64_t &Offset,
   unsigned &Width, const TargetRegisterInfo *TRI) const {
-  if (!LdSt.mayLoadOrStore())
+  if (!LdSt.mayLoadOrStore() || LdSt.getNumExplicitOperands() != 3)
     return false;
 
   // Handle only loads/stores with base register followed by immediate offset.
-  if (LdSt.getNumExplicitOperands() != 3)
+  if (!LdSt.getOperand(1).isImm() ||
+      (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()))
     return false;
   if (!LdSt.getOperand(1).isImm() || (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()))
     return false;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index 75e8224892f4c..77ee236020a8a 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -494,6 +494,20 @@ class PPCInstrInfo : public PPCGenInstrInfo {
                                     int64_t &Offset, unsigned &Width,
                                     const TargetRegisterInfo *TRI) const;
 
+  /// Get the base operand and byte offset of an instruction that reads/writes
+  /// memory.
+  bool getMemOperandsWithOffsetWidth(
+      const MachineInstr &LdSt,
+      SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
+      bool &OffsetIsScalable, unsigned &Width,
+      const TargetRegisterInfo *TRI) const override;
+
+  /// Returns true if the two given memory operations should be scheduled
+  /// adjacent.
+  bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                           ArrayRef<const MachineOperand *> BaseOps2,
+                           unsigned NumLoads, unsigned NumBytes) const override;
+
   /// Return true if two MIs access different memory addresses and false
   /// otherwise
   bool
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index a6932005d5ad1..30605a22ea399 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -368,6 +368,8 @@ def PPCprobedalloca : SDNode<"PPCISD::PROBED_ALLOCA", SDTDynOp, [SDNPHasChain]>;
 def PPCmatpcreladdr : SDNode<"PPCISD::MAT_PCREL_ADDR", SDTIntUnaryOp, []>;
 def PPCtlsdynamatpcreladdr : SDNode<"PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR",
                                     SDTIntUnaryOp, []>;
+def PPCtlslocalexecmataddr : SDNode<"PPCISD::TLS_LOCAL_EXEC_MAT_ADDR",
+                                    SDTIntUnaryOp, []>;
 
 //===----------------------------------------------------------------------===//
 // PowerPC specific transformation functions and pattern fragments.
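Once the base-operand, opcode, and safety checks in shouldClusterMemOps above pass, the decision reduces to a width/offset adjacency test. A compact restatement as a hypothetical helper (not part of the patch; the caller is assumed to have ordered the two ops by offset):

#include <cstdint>

static bool clusterAdjacent(int64_t Offset1, unsigned Width1,
                            int64_t Offset2, unsigned Width2) {
  // e.g. "std 3, 16(1)" and "std 4, 24(1)": both 8 bytes wide, and
  // 16 + 8 == 24, so the pair is clusterable.
  return Width1 == Width2 && Offset1 + Width1 == Offset2;
}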
@@ -2624,7 +2626,7 @@ let isCompare = 1, hasSideEffects = 0 in { } } let PPC970_Unit = 3, Predicates = [HasFPU] in { // FPU Operations. -let isCompare = 1, hasSideEffects = 0 in { +let isCompare = 1, mayRaiseFPException = 1, hasSideEffects = 0 in { def FCMPUS : XForm_17<63, 0, (outs crrc:$crD), (ins f4rc:$fA, f4rc:$fB), "fcmpu $crD, $fA, $fB", IIC_FPCompare>; def FCMPOS : XForm_17<63, 32, (outs crrc:$crD), (ins f4rc:$fA, f4rc:$fB), @@ -3477,7 +3479,7 @@ def : Pat<(f64 (extloadf32 iaddr:$src)), def : Pat<(f64 (extloadf32 xaddr:$src)), (COPY_TO_REGCLASS (LFSX xaddr:$src), F8RC)>; -def : Pat<(f64 (fpextend f32:$src)), +def : Pat<(f64 (any_fpextend f32:$src)), (COPY_TO_REGCLASS $src, F8RC)>; } diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td index 73321dec99d37..553bcdea9bce7 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -481,6 +481,13 @@ class XX2_BF3_XO5_XB6_XO9 opcode, bits<5> xo2, bits<9> xo, dag OOL, let Inst{31} = 0; } +// X-Form: [ PO RT BI /// XO / ] +class XForm_XT5_BI5 opcode, bits<10> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, list pattern> + : XForm_base_r3xo { + let B = 0; +} + multiclass MLS_DForm_R_SI34_RTA5_MEM_p opcode, dag OOL, dag IOL, dag PCRel_IOL, string asmstr, InstrItinClass itin> { @@ -829,6 +836,10 @@ let Predicates = [PCRelativeMemops], AddedComplexity = 500 in { // PPCtlsdynamatpcreladdr node is used for TLS dynamic models to materialize // tls global address with paddi instruction. def : Pat<(PPCtlsdynamatpcreladdr pcreladdr:$addr), (PADDI8pc 0, $addr)>; + // PPCtlslocalexecmataddr node is used for TLS local exec models to + // materialize tls global address with paddi instruction. + def : Pat<(PPCaddTls i64:$in, (PPCtlslocalexecmataddr tglobaltlsaddr:$addr)), + (PADDI8 $in, $addr)>; } let Predicates = [PrefixInstrs] in { @@ -873,6 +884,26 @@ let Predicates = [PrefixInstrs] in { } let Predicates = [IsISA3_1] in { + def SETBC : XForm_XT5_BI5<31, 384, (outs gprc:$RT), (ins crbitrc:$BI), + "setbc $RT, $BI", IIC_IntCompare, []>; + def SETBCR : XForm_XT5_BI5<31, 416, (outs gprc:$RT), (ins crbitrc:$BI), + "setbcr $RT, $BI", IIC_IntCompare, []>; + def SETNBC : XForm_XT5_BI5<31, 448, (outs gprc:$RT), (ins crbitrc:$BI), + "setnbc $RT, $BI", IIC_IntCompare, []>; + def SETNBCR : XForm_XT5_BI5<31, 480, (outs gprc:$RT), (ins crbitrc:$BI), + "setnbcr $RT, $BI", IIC_IntCompare, []>; + + let Interpretation64Bit = 1, isCodeGenOnly = 1 in { + def SETBC8 : XForm_XT5_BI5<31, 384, (outs g8rc:$RT), (ins crbitrc:$BI), + "setbc $RT, $BI", IIC_IntCompare, []>; + def SETBCR8 : XForm_XT5_BI5<31, 416, (outs g8rc:$RT), (ins crbitrc:$BI), + "setbcr $RT, $BI", IIC_IntCompare, []>; + def SETNBC8 : XForm_XT5_BI5<31, 448, (outs g8rc:$RT), (ins crbitrc:$BI), + "setnbc $RT, $BI", IIC_IntCompare, []>; + def SETNBCR8 : XForm_XT5_BI5<31, 480, (outs g8rc:$RT), (ins crbitrc:$BI), + "setnbcr $RT, $BI", IIC_IntCompare, []>; + } + def VSLDBI : VNForm_VTAB5_SD3<22, 0, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB, u3imm:$SH), "vsldbi $VRT, $VRA, $VRB, $SH", @@ -1042,19 +1073,23 @@ let Predicates = [IsISA3_1] in { def VCNTMBB : VXForm_RD5_MP_VB5<1602, 12, (outs g8rc:$rD), (ins vrrc:$vB, u1imm:$MP), "vcntmbb $rD, $vB, $MP", IIC_VecGeneral, - []>; + [(set i64:$rD, (int_ppc_altivec_vcntmbb + v16i8:$vB, timm:$MP))]>; def VCNTMBH : VXForm_RD5_MP_VB5<1602, 13, (outs g8rc:$rD), (ins vrrc:$vB, u1imm:$MP), "vcntmbh $rD, $vB, $MP", IIC_VecGeneral, - []>; + [(set i64:$rD, (int_ppc_altivec_vcntmbh + 
v8i16:$vB, timm:$MP))]>; def VCNTMBW : VXForm_RD5_MP_VB5<1602, 14, (outs g8rc:$rD), (ins vrrc:$vB, u1imm:$MP), "vcntmbw $rD, $vB, $MP", IIC_VecGeneral, - []>; + [(set i64:$rD, (int_ppc_altivec_vcntmbw + v4i32:$vB, timm:$MP))]>; def VCNTMBD : VXForm_RD5_MP_VB5<1602, 15, (outs g8rc:$rD), (ins vrrc:$vB, u1imm:$MP), "vcntmbd $rD, $vB, $MP", IIC_VecGeneral, - []>; + [(set i64:$rD, (int_ppc_altivec_vcntmbd + v2i64:$vB, timm:$MP))]>; def VEXTDUBVLX : VAForm_1a<24, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, gprc:$rC), "vextdubvlx $vD, $vA, $vB, $rC", @@ -1281,9 +1316,11 @@ let Predicates = [IsISA3_1] in { [(set v1i128:$vD, (int_ppc_altivec_vmsumcud v2i64:$vA, v2i64:$vB, v1i128:$vC))]>; def VDIVSQ : VXForm_1<267, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vdivsq $vD, $vA, $vB", IIC_VecGeneral, []>; + "vdivsq $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (sdiv v1i128:$vA, v1i128:$vB))]>; def VDIVUQ : VXForm_1<11, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vdivuq $vD, $vA, $vB", IIC_VecGeneral, []>; + "vdivuq $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (udiv v1i128:$vA, v1i128:$vB))]>; def VDIVESQ : VXForm_1<779, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vdivesq $vD, $vA, $vB", IIC_VecGeneral, []>; def VDIVEUQ : VXForm_1<523, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index c3ee1c7ea18a4..9003b1eb089b6 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -890,15 +890,15 @@ let hasSideEffects = 0 in { def XSRDPIC : XX2Form<60, 107, (outs vsfrc:$XT), (ins vsfrc:$XB), "xsrdpic $XT, $XB", IIC_VecFP, - [(set f64:$XT, (any_fnearbyint f64:$XB))]>; + [(set f64:$XT, (fnearbyint f64:$XB))]>; def XVRDPIC : XX2Form<60, 235, (outs vsrc:$XT), (ins vsrc:$XB), "xvrdpic $XT, $XB", IIC_VecFP, - [(set v2f64:$XT, (any_fnearbyint v2f64:$XB))]>; + [(set v2f64:$XT, (fnearbyint v2f64:$XB))]>; def XVRSPIC : XX2Form<60, 171, (outs vsrc:$XT), (ins vsrc:$XB), "xvrspic $XT, $XB", IIC_VecFP, - [(set v4f32:$XT, (any_fnearbyint v4f32:$XB))]>; + [(set v4f32:$XT, (fnearbyint v4f32:$XB))]>; // Max/Min Instructions let isCommutable = 1 in { def XSMAXDP : XX3Form<60, 160, @@ -2681,7 +2681,7 @@ def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X xoaddr:$src)>; def : Pat<(f32 (any_fround f32:$S)), (f32 (COPY_TO_REGCLASS (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; -def : Pat<(f32 (any_fnearbyint f32:$S)), +def : Pat<(f32 (fnearbyint f32:$S)), (f32 (COPY_TO_REGCLASS (XSRDPIC (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; def : Pat<(f32 (any_ffloor f32:$S)), @@ -2696,11 +2696,11 @@ def : Pat<(f32 (any_ftrunc f32:$S)), def : Pat<(f32 (any_frint f32:$S)), (f32 (COPY_TO_REGCLASS (XSRDPIC (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; -def : Pat<(v4f32 (frint v4f32:$S)), (v4f32 (XVRSPIC $S))>; +def : Pat<(v4f32 (any_frint v4f32:$S)), (v4f32 (XVRSPIC $S))>; // Rounding for double precision. 
-def : Pat<(f64 (frint f64:$S)), (f64 (XSRDPIC $S))>; -def : Pat<(v2f64 (frint v2f64:$S)), (v2f64 (XVRDPIC $S))>; +def : Pat<(f64 (any_frint f64:$S)), (f64 (XSRDPIC $S))>; +def : Pat<(v2f64 (any_frint v2f64:$S)), (v2f64 (XVRDPIC $S))>; // Materialize a zero-vector of long long def : Pat<(v2i64 immAllZerosV), diff --git a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp index 795abed413e04..1358bec8e36f8 100644 --- a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -86,6 +86,8 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, RefKind = MCSymbolRefExpr::VK_PCREL; else if (MO.getTargetFlags() == (PPCII::MO_PCREL_FLAG | PPCII::MO_GOT_FLAG)) RefKind = MCSymbolRefExpr::VK_PPC_GOT_PCREL; + else if (MO.getTargetFlags() == (PPCII::MO_PCREL_FLAG | PPCII::MO_TPREL_FLAG)) + RefKind = MCSymbolRefExpr::VK_TPREL; else if (MO.getTargetFlags() == PPCII::MO_GOT_TLSGD_PCREL_FLAG) RefKind = MCSymbolRefExpr::VK_PPC_GOT_TLSGD_PCREL; else if (MO.getTargetFlags() == PPCII::MO_GOT_TPREL_PCREL_FLAG) diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index 8021cfa4a18c6..1afed172e143b 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -11,9 +11,13 @@ //===----------------------------------------------------------------------===// #include "PPCSubtarget.h" +#include "GISel/PPCCallLowering.h" +#include "GISel/PPCLegalizerInfo.h" +#include "GISel/PPCRegisterBankInfo.h" #include "PPC.h" #include "PPCRegisterInfo.h" #include "PPCTargetMachine.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/Attributes.h" @@ -53,7 +57,15 @@ PPCSubtarget::PPCSubtarget(const Triple &TT, const std::string &CPU, IsPPC64(TargetTriple.getArch() == Triple::ppc64 || TargetTriple.getArch() == Triple::ppc64le), TM(TM), FrameLowering(initializeSubtargetDependencies(CPU, FS)), - InstrInfo(*this), TLInfo(TM, *this) {} + InstrInfo(*this), TLInfo(TM, *this) { + CallLoweringInfo.reset(new PPCCallLowering(*getTargetLowering())); + Legalizer.reset(new PPCLegalizerInfo(*this)); + auto *RBI = new PPCRegisterBankInfo(*getRegisterInfo()); + RegBankInfo.reset(RBI); + + InstSelector.reset(createPPCInstructionSelector( + *static_cast(&TM), *this, *RBI)); +} void PPCSubtarget::initializeEnvironment() { StackAlignment = Align(16); @@ -108,6 +120,7 @@ void PPCSubtarget::initializeEnvironment() { HasHTM = false; HasFloat128 = false; HasFusion = false; + HasStoreFusion = false; HasAddiLoadFusion = false; HasAddisLoadFusion = false; IsISA3_0 = false; @@ -227,3 +240,20 @@ bool PPCSubtarget::isUsingPCRelativeCalls() const { return isPPC64() && hasPCRelativeMemops() && isELFv2ABI() && CodeModel::Medium == getTargetMachine().getCodeModel(); } + +// GlobalISEL +const CallLowering *PPCSubtarget::getCallLowering() const { + return CallLoweringInfo.get(); +} + +const RegisterBankInfo *PPCSubtarget::getRegBankInfo() const { + return RegBankInfo.get(); +} + +const LegalizerInfo *PPCSubtarget::getLegalizerInfo() const { + return Legalizer.get(); +} + +InstructionSelector *PPCSubtarget::getInstructionSelector() const { + return InstSelector.get(); +} diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index 76b43dfc7a723..4552defd657e5 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ 
b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -17,6 +17,9 @@ #include "PPCISelLowering.h" #include "PPCInstrInfo.h" #include "llvm/ADT/Triple.h" +#include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" @@ -137,6 +140,7 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool HasHTM; bool HasFloat128; bool HasFusion; + bool HasStoreFusion; bool HasAddiLoadFusion; bool HasAddisLoadFusion; bool IsISA3_0; @@ -157,6 +161,12 @@ class PPCSubtarget : public PPCGenSubtargetInfo { PPCTargetLowering TLInfo; SelectionDAGTargetInfo TSInfo; + /// GlobalISel related APIs. + std::unique_ptr CallLoweringInfo; + std::unique_ptr Legalizer; + std::unique_ptr RegBankInfo; + std::unique_ptr InstSelector; + public: /// This constructor initializes the data members to match that /// of the specified triple. @@ -308,6 +318,7 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool isISA3_1() const { return IsISA3_1; } bool useLongCalls() const { return UseLongCalls; } bool hasFusion() const { return HasFusion; } + bool hasStoreFusion() const { return HasStoreFusion; } bool hasAddiLoadFusion() const { return HasAddiLoadFusion; } bool hasAddisLoadFusion() const { return HasAddisLoadFusion; } bool needsSwapsForVSXMemOps() const { @@ -394,6 +405,12 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool isPredictableSelectIsExpensive() const { return PredictableSelectIsExpensive; } + + // GlobalISEL + const CallLowering *getCallLowering() const override; + const RegisterBankInfo *getRegBankInfo() const override; + const LegalizerInfo *getLegalizerInfo() const override; + InstructionSelector *getInstructionSelector() const override; }; } // End llvm namespace diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index ea9b37de6ff39..6a15b0219252c 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -24,12 +24,18 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/GlobalISel/IRTranslator.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/Localizer.h" +#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" +#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" @@ -116,6 +122,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() { initializePPCTLSDynamicCallPass(PR); initializePPCMIPeepholePass(PR); initializePPCLowerMASSVEntriesPass(PR); + initializeGlobalISel(PR); } /// Return the datalayout string of a subtarget. @@ -271,6 +278,8 @@ static ScheduleDAGInstrs *createPPCMachineScheduler(MachineSchedContext *C) { std::make_unique(C)); // add DAG Mutations here. 
DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI)); + if (ST.hasStoreFusion()) + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.hasFusion()) DAG->addMutation(createPowerPCMacroFusionDAGMutation()); @@ -285,6 +294,8 @@ static ScheduleDAGInstrs *createPPCPostMachineScheduler( std::make_unique(C) : std::make_unique(C), true); // add DAG Mutations here. + if (ST.hasStoreFusion()) + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.hasFusion()) DAG->addMutation(createPowerPCMacroFusionDAGMutation()); return DAG; @@ -381,6 +392,12 @@ class PPCPassConfig : public TargetPassConfig { void addPreRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; + // GlobalISEL + bool addIRTranslator() override; + bool addLegalizeMachineIR() override; + bool addRegBankSelect() override; + bool addGlobalInstructionSelect() override; + ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override { return createPPCMachineScheduler(C); @@ -531,3 +548,24 @@ static MachineSchedRegistry PPCPostRASchedRegistry("ppc-postra", "Run PowerPC PostRA specific scheduler", createPPCPostMachineScheduler); + +// Global ISEL +bool PPCPassConfig::addIRTranslator() { + addPass(new IRTranslator()); + return false; +} + +bool PPCPassConfig::addLegalizeMachineIR() { + addPass(new Legalizer()); + return false; +} + +bool PPCPassConfig::addRegBankSelect() { + addPass(new RegBankSelect()); + return false; +} + +bool PPCPassConfig::addGlobalInstructionSelect() { + addPass(new InstructionSelect()); + return false; +} diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index f3529718b8653..5db5ab47f29e4 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -17,6 +17,7 @@ #include "llvm/IR/IntrinsicsPowerPC.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Utils/Local.h" diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 43adc7426c79d..a6054a465399d 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -23,6 +23,105 @@ using namespace llvm; +// For now we use x18, a.k.a s2, as pointer to shadow call stack. +// User should explicitly set -ffixed-x18 and not use x18 in their asm. +static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const DebugLoc &DL) { + if (!MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) + return; + + const auto &STI = MF.getSubtarget(); + Register RAReg = STI.getRegisterInfo()->getRARegister(); + + // Do not save RA to the SCS if it's not saved to the regular stack, + // i.e. RA is not at risk of being overwritten. 
+ std::vector<CalleeSavedInfo> &CSI = MF.getFrameInfo().getCalleeSavedInfo(); + if (std::none_of(CSI.begin(), CSI.end(), + [&](CalleeSavedInfo &CSR) { return CSR.getReg() == RAReg; })) + return; + + Register SCSPReg = RISCVABI::getSCSPReg(); + + auto &Ctx = MF.getFunction().getContext(); + if (!STI.isRegisterReservedByUser(SCSPReg)) { + Ctx.diagnose(DiagnosticInfoUnsupported{ + MF.getFunction(), "x18 not reserved by user for Shadow Call Stack."}); + return; + } + + const auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>(); + if (RVFI->useSaveRestoreLibCalls(MF)) { + Ctx.diagnose(DiagnosticInfoUnsupported{ + MF.getFunction(), + "Shadow Call Stack cannot be combined with Save/Restore LibCalls."}); + return; + } + + const RISCVInstrInfo *TII = STI.getInstrInfo(); + bool IsRV64 = STI.hasFeature(RISCV::Feature64Bit); + int64_t SlotSize = STI.getXLen() / 8; + // Store return address to shadow call stack + // s[w|d] ra, 0(s2) + // addi s2, s2, [4|8] + BuildMI(MBB, MI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW)) + .addReg(RAReg) + .addReg(SCSPReg) + .addImm(0); + BuildMI(MBB, MI, DL, TII->get(RISCV::ADDI)) + .addReg(SCSPReg, RegState::Define) + .addReg(SCSPReg) + .addImm(SlotSize); +} + +static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const DebugLoc &DL) { + if (!MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) + return; + + const auto &STI = MF.getSubtarget<RISCVSubtarget>(); + Register RAReg = STI.getRegisterInfo()->getRARegister(); + + // See emitSCSPrologue() above. + std::vector<CalleeSavedInfo> &CSI = MF.getFrameInfo().getCalleeSavedInfo(); + if (std::none_of(CSI.begin(), CSI.end(), + [&](CalleeSavedInfo &CSR) { return CSR.getReg() == RAReg; })) + return; + + Register SCSPReg = RISCVABI::getSCSPReg(); + + auto &Ctx = MF.getFunction().getContext(); + if (!STI.isRegisterReservedByUser(SCSPReg)) { + Ctx.diagnose(DiagnosticInfoUnsupported{ + MF.getFunction(), "x18 not reserved by user for Shadow Call Stack."}); + return; + } + + const auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>(); + if (RVFI->useSaveRestoreLibCalls(MF)) { + Ctx.diagnose(DiagnosticInfoUnsupported{ + MF.getFunction(), + "Shadow Call Stack cannot be combined with Save/Restore LibCalls."}); + return; + } + + const RISCVInstrInfo *TII = STI.getInstrInfo(); + bool IsRV64 = STI.hasFeature(RISCV::Feature64Bit); + int64_t SlotSize = STI.getXLen() / 8; + // Load return address from shadow call stack + // l[w|d] ra, -[4|8](s2) + // addi s2, s2, -[4|8] + BuildMI(MBB, MI, DL, TII->get(IsRV64 ? RISCV::LD : RISCV::LW)) + .addReg(RAReg, RegState::Define) + .addReg(SCSPReg) + .addImm(-SlotSize); + BuildMI(MBB, MI, DL, TII->get(RISCV::ADDI)) + .addReg(SCSPReg, RegState::Define) + .addReg(SCSPReg) + .addImm(-SlotSize); +} + // Get the ID of the libcall used for spilling and restoring callee saved // registers. The ID is representative of the number of registers saved or // restored by the libcall, except it is zero-indexed - ID 0 corresponds to a @@ -222,15 +321,18 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, Register SPReg = getSPReg(STI); Register BPReg = RISCVABI::getBPReg(); + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc DL; + + // Emit prologue for shadow call stack.
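// Net effect in the emitted code (a sketch, assuming RV64, the
// shadowcallstack function attribute, and x18/s2 reserved via -ffixed-x18):
//   sd   ra, 0(s2)      # push the return address onto the shadow stack
//   addi s2, s2, 8      # advance the shadow call stack pointer
// The epilogue emitted further down performs the mirror-image reload of ra
// and decrement of s2.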
+ emitSCSPrologue(MF, MBB, MBBI, DL); + // Since spillCalleeSavedRegisters may have inserted a libcall, skip past // any instructions marked as FrameSetup while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) ++MBBI; - // Debug location must be unknown since the first debug location is used - // to determine the end of the prologue. - DebugLoc DL; - // Determine the correct frame layout determineFrameLayout(MF); @@ -457,6 +559,9 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, // Deallocate stack adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy); + + // Emit epilogue for shadow call stack. + emitSCSEpilogue(MF, MBB, MBBI, DL); } int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index eeb0cabc2f8bd..1b305eac74876 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -147,7 +147,7 @@ bool RISCVPassConfig::addInstSelector() { } bool RISCVPassConfig::addIRTranslator() { - addPass(new IRTranslator()); + addPass(new IRTranslator(getOptLevel())); return false; } diff --git a/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp b/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp index 43b1f8b80c5fd..9b1899a759f42 100644 --- a/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp +++ b/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp @@ -67,6 +67,9 @@ ABI getTargetABI(StringRef ABIName) { // saved registers and X8 will be used as fp. So we choose X9 as bp. Register getBPReg() { return RISCV::X9; } +// Returns the register holding the shadow call stack pointer. +Register getSCSPReg() { return RISCV::X18; } + } // namespace RISCVABI namespace RISCVFeatures { diff --git a/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.h index 4e6cdd8606b16..1b498b3c0102c 100644 --- a/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.h @@ -208,6 +208,9 @@ ABI getTargetABI(StringRef ABIName); // Returns the register used to hold the stack pointer after realignment. Register getBPReg(); +// Returns the register holding the shadow call stack pointer.
+Register getSCSPReg(); + } // namespace RISCVABI namespace RISCVFeatures { diff --git a/llvm/lib/Target/RISCV/Utils/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/Utils/RISCVMatInt.cpp index f390ddb89e3c9..1f3dead610112 100644 --- a/llvm/lib/Target/RISCV/Utils/RISCVMatInt.cpp +++ b/llvm/lib/Target/RISCV/Utils/RISCVMatInt.cpp @@ -8,10 +8,8 @@ #include "RISCVMatInt.h" #include "MCTargetDesc/RISCVMCTargetDesc.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Support/MachineValueType.h" +#include "llvm/ADT/APInt.h" #include "llvm/Support/MathExtras.h" -#include <cstdint> namespace llvm { diff --git a/llvm/lib/Target/RISCV/Utils/RISCVMatInt.h b/llvm/lib/Target/RISCV/Utils/RISCVMatInt.h index b12ae2eade999..17ca57458b493 100644 --- a/llvm/lib/Target/RISCV/Utils/RISCVMatInt.h +++ b/llvm/lib/Target/RISCV/Utils/RISCVMatInt.h @@ -9,12 +9,11 @@ #ifndef LLVM_LIB_TARGET_RISCV_MATINT_H #define LLVM_LIB_TARGET_RISCV_MATINT_H -#include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Support/MachineValueType.h" #include <cstdint> namespace llvm { +class APInt; namespace RISCVMatInt { struct Inst { diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 116352e083829..c0c79b6f59c61 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -2139,7 +2139,7 @@ SDValue SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain, int FI = MFI.CreateStackObject(16, Align(8), false); SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); Chain = DAG.getStore(Chain, DL, Entry.Node, FIPtr, MachinePointerInfo(), - /* Alignment = */ 8); + Align(8)); Entry.Node = FIPtr; Entry.Ty = PointerType::getUnqual(ArgTy); @@ -2198,7 +2198,7 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG, // Load RetPtr to get the return value. return DAG.getLoad(Op.getValueType(), SDLoc(Op), Chain, RetPtr, - MachinePointerInfo(), /* Alignment = */ 8); + MachinePointerInfo(), Align(8)); } SDValue SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS, diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index 6b4f35e5ba2b4..ca5ca7257bab2 100644 --- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -117,9 +117,8 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset( return Chain1; SDValue Dst2 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst, DAG.getConstant(1, DL, PtrVT)); - SDValue Chain2 = - DAG.getStore(Chain, DL, Byte, Dst2, DstPtrInfo.getWithOffset(1), - /* Alignment = */ 1); + SDValue Chain2 = DAG.getStore(Chain, DL, Byte, Dst2, + DstPtrInfo.getWithOffset(1), Align(1)); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chain1, Chain2); } } diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index b0137384971cb..0e6c95d5dd3b1 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -689,11 +689,24 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { auto Type = parseType(TypeName); if (!Type) return error("Unknown type in .globaltype directive: ", TypeTok); + // Optional mutable modifier. Default to mutable for historical reasons. + // Ideally we would have gone with immutable as the default and used `mut` + // as the modifier to match the `.wat` format.
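// Directive forms accepted after this change (a sketch derived from the
// parsing logic below; the symbol and type are illustrative):
//   .globaltype sym, i32              # mutable global (the default)
//   .globaltype sym, i32, immutable   # immutable global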
+ bool Mutable = true; + if (isNext(AsmToken::Comma)) { + TypeTok = Lexer.getTok(); + auto Id = expectIdent(); + if (Id == "immutable") + Mutable = false; + else + // Should we also allow `mutable` and `mut` here for clarity? + return error("Unknown type in .globaltype modifier: ", TypeTok); + } // Now set this symbol with the correct type. auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName)); WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); WasmSym->setGlobalType( - wasm::WasmGlobalType{uint8_t(Type.getValue()), true}); + wasm::WasmGlobalType{uint8_t(Type.getValue()), Mutable}); // And emit the directive again. TOut.emitGlobalType(WasmSym); return expect(AsmToken::EndOfStatement, "EOL"); diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp index e954eeaebb141..d2b2de0dca1f4 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp @@ -71,8 +71,10 @@ void WebAssemblyTargetAsmStreamer::emitGlobalType(const MCSymbolWasm *Sym) { assert(Sym->isGlobal()); OS << "\t.globaltype\t" << Sym->getName() << ", " << WebAssembly::typeToString( - static_cast<wasm::ValType>(Sym->getGlobalType().Type)) - << '\n'; + static_cast<wasm::ValType>(Sym->getGlobalType().Type)); + if (!Sym->getGlobalType().Mutable) + OS << ", immutable"; + OS << '\n'; } void WebAssemblyTargetAsmStreamer::emitEventType(const MCSymbolWasm *Sym) { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index 02330a2dd4afa..d5ee4b3b9440e 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -178,6 +178,28 @@ getLatestInsertPos(MachineBasicBlock *MBB, return InsertPos; } +// Find a catch instruction and its destination register within an EH pad. +static MachineInstr *findCatch(MachineBasicBlock *EHPad, Register &ExnReg) { + assert(EHPad->isEHPad()); + MachineInstr *Catch = nullptr; + for (auto &MI : *EHPad) { + switch (MI.getOpcode()) { + case WebAssembly::CATCH: + Catch = &MI; + ExnReg = Catch->getOperand(0).getReg(); + break; + } + } + assert(Catch && "EH pad does not have a catch"); + assert(ExnReg != 0 && "Invalid register"); + return Catch; +} + +static MachineInstr *findCatch(MachineBasicBlock *EHPad) { + Register Dummy; + return findCatch(EHPad, Dummy); +} + void WebAssemblyCFGStackify::registerScope(MachineInstr *Begin, MachineInstr *End) { BeginToEnd[Begin] = End; @@ -1101,25 +1123,8 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) { continue; MachineBasicBlock *EHPad = P.first; - - // Find 'catch' and 'local.set' or 'drop' instruction that follows the - // 'catch'. If -wasm-disable-explicit-locals is not set, 'catch' should be - // always followed by either 'local.set' or a 'drop', because 'br_on_exn' is - // generated after 'catch' in LateEHPrepare and we don't support blocks - // taking values yet.
- MachineInstr *Catch = nullptr; - unsigned ExnReg = 0; - for (auto &MI : *EHPad) { - switch (MI.getOpcode()) { - case WebAssembly::CATCH: - Catch = &MI; - ExnReg = Catch->getOperand(0).getReg(); - break; - } - } - assert(Catch && "EH pad does not have a catch"); - assert(ExnReg != 0 && "Invalid register"); - + Register ExnReg = 0; + MachineInstr *Catch = findCatch(EHPad, ExnReg); auto SplitPos = std::next(Catch->getIterator()); // Create a new BB that's gonna be the destination for branches from the @@ -1371,22 +1376,41 @@ void WebAssemblyCFGStackify::fixEndsAtEndOfFunction(MachineFunction &MF) { : WebAssembly::BlockType( WebAssembly::toValType(MFI.getResults().front())); - for (MachineBasicBlock &MBB : reverse(MF)) { - for (MachineInstr &MI : reverse(MBB)) { + SmallVector<MachineBasicBlock::reverse_iterator, 4> Worklist; + Worklist.push_back(MF.rbegin()->rbegin()); + + auto Process = [&](MachineBasicBlock::reverse_iterator It) { + auto *MBB = It->getParent(); + while (It != MBB->rend()) { + MachineInstr &MI = *It++; if (MI.isPosition() || MI.isDebugInstr()) continue; switch (MI.getOpcode()) { + case WebAssembly::END_TRY: { + // If a 'try''s return type is fixed, both its try body and catch body + // should satisfy the return type, so we need to search 'end' + // instructions before its corresponding 'catch' too. + auto *EHPad = TryToEHPad.lookup(EndToBegin[&MI]); + assert(EHPad); + Worklist.push_back(std::next(findCatch(EHPad)->getReverseIterator())); + LLVM_FALLTHROUGH; + } case WebAssembly::END_BLOCK: case WebAssembly::END_LOOP: - case WebAssembly::END_TRY: EndToBegin[&MI]->getOperand(0).setImm(int32_t(RetType)); continue; default: - // Something other than an `end`. We're done. + // Something other than an `end`. We're done for this BB. return; } } - } + // We've reached the beginning of a BB. Continue the search in the previous + // BB.
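// A sketch of why the scan forks: for a function body shaped like
//   block ... try ... catch ... end_try end_block end_function
// the return type fixed on 'end_try' constrains both the try body and the
// catch body, so in addition to continuing past 'end_try' the walk also
// resumes just before the matching 'catch' to patch any trailing 'end'
// markers inside the try body.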
+ Worklist.push_back(MBB->getPrevNode()->rbegin()); + }; + + while (!Worklist.empty()) + Process(Worklist.pop_back_val()); } // WebAssembly functions end with an end instruction, as if the function body diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 8f5b7301e6532..425f8b86c9fbc 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -904,7 +904,7 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getConstant(Offset, DL, PtrVT)); Chains.push_back( DAG.getStore(Chain, DL, Arg, Add, - MachinePointerInfo::getFixedStack(MF, FI, Offset), 0)); + MachinePointerInfo::getFixedStack(MF, FI, Offset))); } if (!Chains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); @@ -1331,7 +1331,7 @@ SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op, SDValue ArgN = DAG.getCopyFromReg(DAG.getEntryNode(), DL, MFI->getVarargBufferVreg(), PtrVT); return DAG.getStore(Op.getOperand(0), DL, ArgN, Op.getOperand(1), - MachinePointerInfo(SV), 0); + MachinePointerInfo(SV)); } SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td index 171dd9a67beb5..63aeb1b467379 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -103,7 +103,7 @@ defm FALLTHROUGH_RETURN : I<(outs), (ins variable_ops), (outs), (ins), []>; } // isReturn = 1 -let isTrap = 1 in +let IsCanonical = 1, isTrap = 1 in defm UNREACHABLE : NRI<(outs), (ins), [(trap)], "unreachable", 0x00>; } // isTerminator = 1 diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp index a2da0ea849e04..6bfed1a7195c1 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp @@ -97,7 +97,7 @@ bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction( // values through live-range splitting and stackification, it will have to // do. MF.getInfo<WebAssemblyFunctionInfo>()->setFrameBaseVreg( - SplitLIs.back()->reg); + SplitLIs.back()->reg()); } SplitLIs.clear(); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp index 20fe2b2b7bfc5..fe127dec8aede 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp @@ -106,8 +106,8 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { continue; LiveInterval *LI = &Liveness->getInterval(VReg); - assert(LI->weight == 0.0f); - LI->weight = computeWeight(MRI, MBFI, VReg); + assert(LI->weight() == 0.0f); + LI->setWeight(computeWeight(MRI, MBFI, VReg)); LLVM_DEBUG(LI->dump()); SortedIntervals.push_back(LI); } @@ -118,10 +118,10 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { // TODO: Investigate more intelligent sorting heuristics. For starters, we // should try to coalesce adjacent live intervals before non-adjacent ones.
llvm::sort(SortedIntervals, [MRI](LiveInterval *LHS, LiveInterval *RHS) { - if (MRI->isLiveIn(LHS->reg) != MRI->isLiveIn(RHS->reg)) - return MRI->isLiveIn(LHS->reg); - if (LHS->weight != RHS->weight) - return LHS->weight > RHS->weight; + if (MRI->isLiveIn(LHS->reg()) != MRI->isLiveIn(RHS->reg())) + return MRI->isLiveIn(LHS->reg()); + if (LHS->weight() != RHS->weight()) + return LHS->weight() > RHS->weight(); if (LHS->empty() || RHS->empty()) return !LHS->empty() && RHS->empty(); return *LHS < *RHS; @@ -135,14 +135,14 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; for (size_t I = 0, E = SortedIntervals.size(); I < E; ++I) { LiveInterval *LI = SortedIntervals[I]; - unsigned Old = LI->reg; + unsigned Old = LI->reg(); size_t Color = I; const TargetRegisterClass *RC = MRI->getRegClass(Old); // Check if it's possible to reuse any of the used colors. if (!MRI->isLiveIn(Old)) for (unsigned C : UsedColors.set_bits()) { - if (MRI->getRegClass(SortedIntervals[C]->reg) != RC) + if (MRI->getRegClass(SortedIntervals[C]->reg()) != RC) continue; for (LiveInterval *OtherLI : Assignments[C]) if (!OtherLI->empty() && OtherLI->overlaps(*LI)) @@ -152,7 +152,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { continue_outer:; } - unsigned New = SortedIntervals[Color]->reg; + unsigned New = SortedIntervals[Color]->reg(); SlotMapping[I] = New; Changed |= Old != New; UsedColors.set(Color); @@ -160,7 +160,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { // If we reassigned the stack pointer, update the debug frame base info. if (Old != New && MFI.isFrameBaseVirtual() && MFI.getFrameBaseVreg() == Old) MFI.setFrameBaseVreg(New); - LLVM_DEBUG(dbgs() << "Assigning vreg" << Register::virtReg2Index(LI->reg) + LLVM_DEBUG(dbgs() << "Assigning vreg" << Register::virtReg2Index(LI->reg()) << " to vreg" << Register::virtReg2Index(New) << "\n"); } if (!Changed) @@ -168,7 +168,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { // Rewrite register operands. 
for (size_t I = 0, E = SortedIntervals.size(); I < E; ++I) { - unsigned Old = SortedIntervals[I]->reg; + unsigned Old = SortedIntervals[I]->reg(); unsigned New = SlotMapping[I]; if (Old != New) MRI->replaceRegWith(Old, New); diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 5694105dcbd11..3270932a76d08 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -32,6 +32,7 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" @@ -150,6 +151,13 @@ class X86AsmParser : public MCTargetAsmParser { IOK_TYPE, }; + enum MasmOperatorKind { + MOK_INVALID = 0, + MOK_LENGTHOF, + MOK_SIZEOF, + MOK_TYPE, + }; + class InfixCalculator { typedef std::pair< InfixCalculatorTok, int64_t > ICToken; SmallVector<InfixCalculatorTok, 4> InfixOperatorStack; @@ -367,7 +375,7 @@ class X86AsmParser : public MCTargetAsmParser { bool MemExpr; bool OffsetOperator; SMLoc OffsetOperatorLoc; - StringRef CurType; + AsmTypeInfo CurType; bool setSymRef(const MCExpr *Val, StringRef ID, StringRef &ErrMsg) { if (Sym) { @@ -395,7 +403,10 @@ class X86AsmParser : public MCTargetAsmParser { unsigned getScale() { return Scale; } const MCExpr *getSym() { return Sym; } StringRef getSymName() { return SymName; } - StringRef getType() { return CurType; } + StringRef getType() { return CurType.Name; } + unsigned getSize() { return CurType.Size; } + unsigned getElementSize() { return CurType.ElementSize; } + unsigned getLength() { return CurType.Length; } int64_t getImm() { return Imm + IC.execute(); } bool isValidEndState() { return State == IES_RBRAC || State == IES_INTEGER; @@ -628,7 +639,8 @@ class X86AsmParser : public MCTargetAsmParser { } bool onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName, const InlineAsmIdentifierInfo &IDInfo, - bool ParsingMSInlineAsm, StringRef &ErrMsg) { + const AsmTypeInfo &Type, bool ParsingMSInlineAsm, + StringRef &ErrMsg) { // InlineAsm: Treat an enum value as an integer if (ParsingMSInlineAsm) if (IDInfo.isKind(InlineAsmIdentifierInfo::IK_EnumVal)) @@ -647,6 +659,7 @@ class X86AsmParser : public MCTargetAsmParser { case IES_NOT: case IES_INIT: case IES_LBRAC: + case IES_LPAREN: if (setSymRef(SymRef, SymRefName, ErrMsg)) return true; MemExpr = true; @@ -654,6 +667,7 @@ class X86AsmParser : public MCTargetAsmParser { IC.pushOperand(IC_IMM); if (ParsingMSInlineAsm) Info = IDInfo; + setTypeInfo(Type); break; } return false; } @@ -752,6 +766,8 @@ class X86AsmParser : public MCTargetAsmParser { case IES_RPAREN: State = IES_PLUS; IC.pushOperator(IC_PLUS); + CurType.Length = 1; + CurType.Size = CurType.ElementSize; break; case IES_INIT: case IES_CAST: @@ -835,8 +851,8 @@ class X86AsmParser : public MCTargetAsmParser { } } bool onOffset(const MCExpr *Val, SMLoc OffsetLoc, StringRef ID, - const InlineAsmIdentifierInfo &IDInfo, bool ParsingMSInlineAsm, - StringRef &ErrMsg) { + const InlineAsmIdentifierInfo &IDInfo, + bool ParsingMSInlineAsm, StringRef &ErrMsg) { PrevState = State; switch (State) { default: @@ -860,19 +876,19 @@ } return false; } - void onCast(StringRef Type) { + void onCast(AsmTypeInfo Info) { PrevState = State; switch (State) { default: State = IES_ERROR; break; case IES_LPAREN: - setType(Type); + setTypeInfo(Info); State = IES_CAST; break; } }
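// An assumed reading of the MC-layer AsmTypeInfo fields used above: Name is
// the type's spelling, Size its total byte size, ElementSize the byte size of
// one element, and Length the element count; a MASM variable declared as
// 'DWORD 8 DUP (?)' would then carry Size=32, ElementSize=4, Length=8.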
- void setType(StringRef Type) { CurType = Type; } + void setTypeInfo(AsmTypeInfo Type) { CurType = Type; } }; bool Error(SMLoc L, const Twine &Msg, SMRange Range = None, @@ -909,6 +925,8 @@ class X86AsmParser : public MCTargetAsmParser { bool ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End); unsigned IdentifyIntelInlineAsmOperator(StringRef Name); unsigned ParseIntelInlineAsmOperator(unsigned OpKind); + unsigned IdentifyMasmOperator(StringRef Name); + bool ParseMasmOperator(unsigned OpKind, int64_t &Val); bool ParseRoundingModeOp(SMLoc Start, OperandVector &Operands); bool ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM, bool &ParseError, SMLoc &End); @@ -1653,6 +1671,13 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { if (ParseIntelDotOperator(SM, End)) return true; break; + case AsmToken::Dollar: + if (!Parser.isParsingMasm()) { + if ((Done = SM.isValidEndState())) + break; + return Error(Tok.getLoc(), "unknown token in expression"); + } + LLVM_FALLTHROUGH; case AsmToken::At: case AsmToken::String: case AsmToken::Identifier: { @@ -1664,7 +1689,10 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { const AsmToken &NextTok = getLexer().peekTok(); if (NextTok.is(AsmToken::Identifier) && NextTok.getIdentifier().equals_lower("ptr")) { - SM.onCast(Identifier); + AsmTypeInfo Info; + if (Parser.lookUpType(Identifier, Info)) + return Error(Tok.getLoc(), "unknown type"); + SM.onCast(Info); // Eat type and PTR. consumeToken(); End = consumeToken(); @@ -1689,16 +1717,15 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { if (SM.onRegister(Reg, ErrMsg)) return Error(IdentLoc, ErrMsg); - StringRef Type; - unsigned Offset = 0; + AsmFieldInfo Info; SMLoc FieldStartLoc = SMLoc::getFromPointer(Field.data()); - if (Parser.lookUpField(Field, Type, Offset)) + if (Parser.lookUpField(Field, Info)) return Error(FieldStartLoc, "unknown offset"); else if (SM.onPlus(ErrMsg)) return Error(getTok().getLoc(), ErrMsg); - else if (SM.onInteger(Offset, ErrMsg)) + else if (SM.onInteger(Info.Offset, ErrMsg)) return Error(IdentLoc, ErrMsg); - SM.setType(Type); + SM.setTypeInfo(Info.Type); End = consumeToken(); break; @@ -1714,6 +1741,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { } // Symbol reference, when parsing assembly content InlineAsmIdentifierInfo Info; + AsmTypeInfo Type; const MCExpr *Val; if (isParsingMSInlineAsm() || Parser.isParsingMasm()) { // MS Dot Operator expression @@ -1740,13 +1768,24 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { return Error(IdentLoc, "expected identifier"); if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, false, End)) return true; - else if (SM.onIdentifierExpr(Val, Identifier, Info, true, ErrMsg)) + else if (SM.onIdentifierExpr(Val, Identifier, Info, Type, true, ErrMsg)) return Error(IdentLoc, ErrMsg); break; } - if (getParser().parsePrimaryExpr(Val, End)) { + if (Parser.isParsingMasm()) { + if (unsigned OpKind = IdentifyMasmOperator(Identifier)) { + int64_t Val; + if (ParseMasmOperator(OpKind, Val)) + return true; + if (SM.onInteger(Val, ErrMsg)) + return Error(IdentLoc, ErrMsg); + break; + } + } + if (getParser().parsePrimaryExpr(Val, End, &Type)) { return Error(Tok.getLoc(), "Unexpected identifier!"); - } else if (SM.onIdentifierExpr(Val, Identifier, Info, false, ErrMsg)) { + } else if (SM.onIdentifierExpr(Val, Identifier, Info, Type, false, + ErrMsg)) { return 
Error(IdentLoc, ErrMsg); } break; } @@ -1769,8 +1808,9 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { return Error(Loc, "invalid reference to undefined symbol"); StringRef Identifier = Sym->getName(); InlineAsmIdentifierInfo Info; - if (SM.onIdentifierExpr(Val, Identifier, Info, isParsingMSInlineAsm(), - ErrMsg)) + AsmTypeInfo Type; + if (SM.onIdentifierExpr(Val, Identifier, Info, Type, + isParsingMSInlineAsm(), ErrMsg)) return Error(Loc, ErrMsg); End = consumeToken(); } else { @@ -1957,8 +1997,7 @@ bool X86AsmParser::ParseRoundingModeOp(SMLoc Start, OperandVector &Operands) { bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End) { const AsmToken &Tok = getTok(); - StringRef Type; - unsigned Offset = 0; + AsmFieldInfo Info; // Drop the optional '.'. StringRef DotDispStr = Tok.getString(); @@ -1969,27 +2008,28 @@ bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, if (Tok.is(AsmToken::Real)) { APInt DotDisp; DotDispStr.getAsInteger(10, DotDisp); - Offset = DotDisp.getZExtValue(); + Info.Offset = DotDisp.getZExtValue(); } else if ((isParsingMSInlineAsm() || getParser().isParsingMasm()) && Tok.is(AsmToken::Identifier)) { const std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.'); const StringRef Base = BaseMember.first, Member = BaseMember.second; - if (getParser().lookUpField(SM.getType(), DotDispStr, Type, Offset) && - getParser().lookUpField(SM.getSymName(), DotDispStr, Type, Offset) && - getParser().lookUpField(DotDispStr, Type, Offset) && + if (getParser().lookUpField(SM.getType(), DotDispStr, Info) && + getParser().lookUpField(SM.getSymName(), DotDispStr, Info) && + getParser().lookUpField(DotDispStr, Info) && (!SemaCallback || - SemaCallback->LookupInlineAsmField(Base, Member, Offset))) + SemaCallback->LookupInlineAsmField(Base, Member, Info.Offset))) return Error(Tok.getLoc(), "Unable to lookup field reference!"); - } else + } else { return Error(Tok.getLoc(), "Unexpected token type!"); + } // Eat the DotExpression and update End End = SMLoc::getFromPointer(DotDispStr.data()); const char *DotExprEndLoc = DotDispStr.data() + DotDispStr.size(); while (Tok.getLoc().getPointer() < DotExprEndLoc) Lex(); - SM.addImm(Offset); - SM.setType(Type); + SM.addImm(Info.Offset); + SM.setTypeInfo(Info.Type); return false; } @@ -2004,7 +2044,7 @@ bool X86AsmParser::ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID, if (!isParsingMSInlineAsm()) { if ((getTok().isNot(AsmToken::Identifier) && getTok().isNot(AsmToken::String)) || - getParser().parsePrimaryExpr(Val, End) + getParser().parsePrimaryExpr(Val, End, nullptr)) return Error(Start, "unexpected token!"); } else if (ParseIntelInlineAsmIdentifier(Val, ID, Info, false, End, true)) { return Error(Start, "unable to lookup expression"); @@ -2059,6 +2099,73 @@ unsigned X86AsmParser::ParseIntelInlineAsmOperator(unsigned OpKind) { return CVal; } +// Query a candidate string for being a MASM operator. +// Report back its kind, or MOK_INVALID if it is not recognized as a known one. +unsigned X86AsmParser::IdentifyMasmOperator(StringRef Name) { + return StringSwitch<unsigned>(Name.lower()) + .Case("type", MOK_TYPE) + .Cases("size", "sizeof", MOK_SIZEOF) + .Cases("length", "lengthof", MOK_LENGTHOF) + .Default(MOK_INVALID); +} + +/// Parse the 'LENGTHOF', 'SIZEOF', and 'TYPE' operators. The LENGTHOF operator +/// returns the number of elements in an array. It returns the value 1 for +/// non-array variables. The SIZEOF operator returns the size of a type or +/// variable in bytes.
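/// For example (a sketch in MASM syntax): given 'arr DWORD 8 DUP (?)',
/// LENGTHOF arr evaluates to 8, TYPE arr to 4, and SIZEOF arr to 32.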
A variable's size is the product of its LENGTH and TYPE. +/// The TYPE operator returns the size of a variable. If the variable is an +/// array, TYPE returns the size of a single element. +bool X86AsmParser::ParseMasmOperator(unsigned OpKind, int64_t &Val) { + MCAsmParser &Parser = getParser(); + SMLoc OpLoc = Parser.getTok().getLoc(); + Parser.Lex(); // Eat operator. + + Val = 0; + if (OpKind == MOK_SIZEOF || OpKind == MOK_TYPE) { + // Check for SIZEOF() and TYPE(). + bool InParens = Parser.getTok().is(AsmToken::LParen); + const AsmToken &IDTok = InParens ? getLexer().peekTok() : Parser.getTok(); + AsmTypeInfo Type; + if (IDTok.is(AsmToken::Identifier) && + !Parser.lookUpType(IDTok.getIdentifier(), Type)) { + Val = Type.Size; + + // Eat tokens. + if (InParens) + parseToken(AsmToken::LParen); + parseToken(AsmToken::Identifier); + if (InParens) + parseToken(AsmToken::RParen); + } + } + + if (!Val) { + IntelExprStateMachine SM; + SMLoc End, Start = Parser.getTok().getLoc(); + if (ParseIntelExpression(SM, End)) + return true; + + switch (OpKind) { + default: + llvm_unreachable("Unexpected operand kind!"); + case MOK_SIZEOF: + Val = SM.getSize(); + break; + case MOK_LENGTHOF: + Val = SM.getLength(); + break; + case MOK_TYPE: + Val = SM.getElementSize(); + break; + } + + if (!Val) + return Error(OpLoc, "expression has unknown type", SMRange(Start, End)); + } + + return false; +} + bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) { Size = StringSwitch<unsigned>(getTok().getString()) .Cases("BYTE", "byte", 8) @@ -2161,6 +2268,8 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) { unsigned BaseReg = SM.getBaseReg(); unsigned IndexReg = SM.getIndexReg(); unsigned Scale = SM.getScale(); + if (!PtrInOperand) + Size = SM.getElementSize() << 3; if (Scale == 0 && BaseReg != X86::ESP && BaseReg != X86::RSP && (IndexReg == X86::ESP || IndexReg == X86::RSP)) @@ -2617,7 +2726,7 @@ bool X86AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { Res = X86MCExpr::create(RegNo, Parser.getContext()); return false; } - return Parser.parsePrimaryExpr(Res, EndLoc); + return Parser.parsePrimaryExpr(Res, EndLoc, nullptr); } bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, @@ -4063,15 +4172,20 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { return parseDirectiveFPOEndPrologue(DirectiveID.getLoc()); else if (IDVal == ".cv_fpo_endproc") return parseDirectiveFPOEndProc(DirectiveID.getLoc()); - else if (IDVal == ".seh_pushreg") + else if (IDVal == ".seh_pushreg" || + (Parser.isParsingMasm() && IDVal.equals_lower(".pushreg"))) return parseDirectiveSEHPushReg(DirectiveID.getLoc()); - else if (IDVal == ".seh_setframe") + else if (IDVal == ".seh_setframe" || + (Parser.isParsingMasm() && IDVal.equals_lower(".setframe"))) return parseDirectiveSEHSetFrame(DirectiveID.getLoc()); - else if (IDVal == ".seh_savereg") + else if (IDVal == ".seh_savereg" || + (Parser.isParsingMasm() && IDVal.equals_lower(".savereg"))) return parseDirectiveSEHSaveReg(DirectiveID.getLoc()); - else if (IDVal == ".seh_savexmm") + else if (IDVal == ".seh_savexmm" || + (Parser.isParsingMasm() && IDVal.equals_lower(".savexmm128"))) return parseDirectiveSEHSaveXMM(DirectiveID.getLoc()); - else if (IDVal == ".seh_pushframe") + else if (IDVal == ".seh_pushframe" || + (Parser.isParsingMasm() && IDVal.equals_lower(".pushframe"))) return parseDirectiveSEHPushFrame(DirectiveID.getLoc()); return true; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 0de94cda2d739..533145e57ca59 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -161,13 +161,11 @@ static bool is16BitMemOperand(const MCInst &MI, unsigned Op, const MCSubtargetInfo &STI) { const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg); const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg); - const MCOperand &Disp = MI.getOperand(Op + X86::AddrDisp); unsigned BaseReg = Base.getReg(); unsigned IndexReg = Index.getReg(); - if (STI.hasFeature(X86::Mode16Bit) && BaseReg == 0 && IndexReg == 0 && - Disp.isImm() && Disp.getImm() < 0x10000) + if (STI.hasFeature(X86::Mode16Bit) && BaseReg == 0 && IndexReg == 0) return true; if ((BaseReg != 0 && X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) || diff --git a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp index caa1f79524750..6125845a337f9 100644 --- a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp @@ -202,7 +202,7 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, Align StackAlign = TFL->getStackAlign(); int64_t Advantage = 0; - for (auto CC : CallSeqVector) { + for (const auto &CC : CallSeqVector) { // Call sites where no parameters are passed on the stack // do not affect the cost, since there needs to be no // stack adjustment. @@ -265,7 +265,7 @@ bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { if (!isProfitable(MF, CallSeqVector)) return false; - for (auto CC : CallSeqVector) { + for (const auto &CC : CallSeqVector) { if (CC.UsePush) { adjustCallSequence(MF, CC); Changed = true; @@ -288,13 +288,13 @@ X86CallFrameOptimization::classifyInstruction( case X86::AND16mi8: case X86::AND32mi8: case X86::AND64mi8: { - MachineOperand ImmOp = MI->getOperand(X86::AddrNumOperands); + const MachineOperand &ImmOp = MI->getOperand(X86::AddrNumOperands); return ImmOp.getImm() == 0 ? Convert : Exit; } case X86::OR16mi8: case X86::OR32mi8: case X86::OR64mi8: { - MachineOperand ImmOp = MI->getOperand(X86::AddrNumOperands); + const MachineOperand &ImmOp = MI->getOperand(X86::AddrNumOperands); return ImmOp.getImm() == -1 ? Convert : Exit; } case X86::MOV32mi: @@ -506,7 +506,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, // replace uses. 
for (int Idx = (Context.ExpectedDist >> Log2SlotSize) - 1; Idx >= 0; --Idx) { MachineBasicBlock::iterator Store = *Context.ArgStoreVector[Idx]; - MachineOperand PushOp = Store->getOperand(X86::AddrNumOperands); + const MachineOperand &PushOp = Store->getOperand(X86::AddrNumOperands); MachineBasicBlock::iterator Push = nullptr; unsigned PushOpcode; switch (Store->getOpcode()) { diff --git a/llvm/lib/Target/X86/X86CallLowering.cpp b/llvm/lib/Target/X86/X86CallLowering.cpp index 0286482ac9af8..8342cad45dfd0 100644 --- a/llvm/lib/Target/X86/X86CallLowering.cpp +++ b/llvm/lib/Target/X86/X86CallLowering.cpp @@ -148,9 +148,9 @@ struct X86OutgoingValueHandler : public CallLowering::IncomingValueHandler { MachineFunction &MF = MIRBuilder.getMF(); Register ExtReg = extendRegister(ValVReg, VA); - auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, - VA.getLocVT().getStoreSize(), - inferAlignFromPtrInfo(MF, MPO)); + auto *MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, + VA.getLocVT().getStoreSize(), + inferAlignFromPtrInfo(MF, MPO)); MIRBuilder.buildStore(ExtReg, Addr, *MMO); } @@ -194,7 +194,7 @@ bool X86CallLowering::lowerReturn( MachineFunction &MF = MIRBuilder.getMF(); const Function &F = MF.getFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); - auto &DL = MF.getDataLayout(); + const DataLayout &DL = MF.getDataLayout(); LLVMContext &Ctx = Val->getType()->getContext(); const X86TargetLowering &TLI = *getTLI<X86TargetLowering>(); @@ -245,7 +245,7 @@ struct X86IncomingValueHandler : public CallLowering::IncomingValueHandler { void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, MachinePointerInfo &MPO, CCValAssign &VA) override { MachineFunction &MF = MIRBuilder.getMF(); - auto MMO = MF.getMachineMemOperand( + auto *MMO = MF.getMachineMemOperand( MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, inferAlignFromPtrInfo(MF, MPO)); MIRBuilder.buildLoad(ValVReg, Addr, *MMO); @@ -337,8 +337,7 @@ bool X86CallLowering::lowerFormalArguments( SmallVector<ArgInfo, 8> SplitArgs; unsigned Idx = 0; - for (auto &Arg : F.args()) { - + for (const auto &Arg : F.args()) { // TODO: handle not simple cases. if (Arg.hasAttribute(Attribute::ByVal) || Arg.hasAttribute(Attribute::InReg) || @@ -377,10 +376,10 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, MachineFunction &MF = MIRBuilder.getMF(); const Function &F = MF.getFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); - auto &DL = F.getParent()->getDataLayout(); + const DataLayout &DL = F.getParent()->getDataLayout(); const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); const TargetInstrInfo &TII = *STI.getInstrInfo(); - auto TRI = STI.getRegisterInfo(); + const X86RegisterInfo *TRI = STI.getRegisterInfo(); // Handle only Linux C, X86_64_SysV calling conventions for now. if (!STI.isTargetLinux() || !(Info.CallConv == CallingConv::C || diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp index 488ee51f1d89b..3a0d6a52ef463 100644 --- a/llvm/lib/Target/X86/X86DomainReassignment.cpp +++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp @@ -141,7 +141,7 @@ class InstrReplacer : public InstrConverterBase { return false; // It's illegal to replace an instruction that implicitly defines a register // with an instruction that doesn't, unless that register is dead.
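// Concrete case (illustrative; the opcode pair is assumed from how this pass
// converts GPR operations to mask-register ones): replacing an AND32rr, which
// implicitly defines EFLAGS, with a KANDDrr, which does not, is only safe
// when that EFLAGS def is dead; the loop below verifies exactly that.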
- for (auto &MO : MI->implicit_operands()) + for (const auto &MO : MI->implicit_operands()) if (MO.isReg() && MO.isDef() && !MO.isDead() && !TII->get(DstOpcode).hasImplicitDefOfPhysReg(MO.getReg())) return false; @@ -180,7 +180,7 @@ class InstrReplacerDstCOPY : public InstrConverterBase { MachineRegisterInfo *MRI) const override { assert(isLegal(MI, TII) && "Cannot convert instruction"); MachineBasicBlock *MBB = MI->getParent(); - auto &DL = MI->getDebugLoc(); + const DebugLoc &DL = MI->getDebugLoc(); Register Reg = MRI->createVirtualRegister( TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo(), @@ -237,7 +237,7 @@ class InstrCOPYReplacer : public InstrReplacer { MachineRegisterInfo *MRI) const override { assert(MI->getOpcode() == TargetOpcode::COPY && "Expected a COPY"); - for (auto &MO : MI->operands()) { + for (const auto &MO : MI->operands()) { // Physical registers will not be converted. Assume that converting the // COPY to the destination domain will eventually result in an actual // instruction. @@ -517,7 +517,7 @@ void X86DomainReassignment::reassign(const Closure &C, RegDomain Domain) const { } } - for (auto MI : ToErase) + for (auto *MI : ToErase) MI->eraseFromParent(); } @@ -537,7 +537,7 @@ static bool usedAsAddr(const MachineInstr &MI, unsigned Reg, for (unsigned MemOpIdx = MemOpStart, MemOpEnd = MemOpStart + X86::AddrNumOperands; MemOpIdx < MemOpEnd; ++MemOpIdx) { - auto &Op = MI.getOperand(MemOpIdx); + const MachineOperand &Op = MI.getOperand(MemOpIdx); if (Op.isReg() && Op.getReg() == Reg) return true; } diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 7437c2e978af2..90265ddf344a1 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2919,7 +2919,6 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int Offset) const { - if (Offset <= 0) return false; @@ -2942,14 +2941,13 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB, unsigned Regs[2]; unsigned FoundRegs = 0; - auto &MRI = MBB.getParent()->getRegInfo(); - auto RegMask = Prev->getOperand(1); + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const MachineOperand &RegMask = Prev->getOperand(1); auto &RegClass = Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass; // Try to find up to NumPops free registers. for (auto Candidate : RegClass) { - // Poor man's liveness: // Since we're immediately after a call, any register that is clobbered // by the call and not defined by it can be considered dead. diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 840f132ec6664..3b5a29ef31fcf 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -3502,6 +3502,7 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { // Shift NBits left by 8 bits, thus producing 'control'. // This makes the low 8 bits zero.
SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8); + insertDAGNode(*CurDAG, SDValue(Node, 0), C8); SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8); insertDAGNode(*CurDAG, SDValue(Node, 0), Control); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index ad8704f686c16..2480e395e0a4a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -193,10 +193,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.hasCMov()) { setOperationAction(ISD::ABS , MVT::i16 , Custom); setOperationAction(ISD::ABS , MVT::i32 , Custom); + if (Subtarget.is64Bit()) + setOperationAction(ISD::ABS , MVT::i64 , Custom); } - setOperationAction(ISD::ABS , MVT::i64 , Custom); - if (Subtarget.is64Bit()) - setOperationAction(ISD::ABS , MVT::i128 , Custom); // Funnel shifts. for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) { @@ -386,6 +385,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::f80, MVT::f16, Expand); setTruncStoreAction(MVT::f128, MVT::f16, Expand); + setOperationAction(ISD::PARITY, MVT::i8, Custom); if (Subtarget.hasPOPCNT()) { setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32); } else { @@ -396,6 +396,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTPOP , MVT::i64 , Expand); else setOperationAction(ISD::CTPOP , MVT::i64 , Custom); + + setOperationAction(ISD::PARITY, MVT::i16, Custom); + setOperationAction(ISD::PARITY, MVT::i32, Custom); + if (Subtarget.is64Bit()) + setOperationAction(ISD::PARITY, MVT::i64, Custom); } setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); @@ -3110,7 +3115,7 @@ argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) { static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, const SDLoc &dl) { - SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); + SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl); return DAG.getMemcpy( Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(), @@ -12121,23 +12126,32 @@ static SDValue lowerShuffleAsByteRotateAndPermute( /// This matches the extremely common pattern for handling combined /// shuffle+blend operations on newer X86 ISAs where we have very fast blend /// operations. It will try to pick the best arrangement of shuffles and -/// blends. +/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend. -static SDValue lowerShuffleAsDecomposedShuffleBlend( +static SDValue lowerShuffleAsDecomposedShuffleMerge( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { + int NumElts = Mask.size(); + int NumLanes = VT.getSizeInBits() / 128; + int NumEltsPerLane = NumElts / NumLanes; + // Shuffle the input elements into the desired positions in V1 and V2 and - // blend them together. - SmallVector<int, 32> V1Mask(Mask.size(), -1); - SmallVector<int, 32> V2Mask(Mask.size(), -1); - SmallVector<int, 32> BlendMask(Mask.size(), -1); - for (int i = 0, Size = Mask.size(); i < Size; ++i) - if (Mask[i] >= 0 && Mask[i] < Size) { - V1Mask[i] = Mask[i]; - BlendMask[i] = i; - } else if (Mask[i] >= Size) { - V2Mask[i] = Mask[i] - Size; - BlendMask[i] = i + Size; + // unpack/blend them together.
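// Worked example (a sketch): for a v8i16 mask <0,9,1,10,2,11,3,12>, every
// even output lane reads V1 and every odd lane reads V2. The loop below
// records that as IsAlternating; the remap further down then packs each
// source's elements into its low half so that a single unpcklwd can
// interleave them, rather than shuffling both sources into place and
// blending.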
+ bool IsAlternating = true; + SmallVector<int, 32> V1Mask(NumElts, -1); + SmallVector<int, 32> V2Mask(NumElts, -1); + SmallVector<int, 32> FinalMask(NumElts, -1); + for (int i = 0; i < NumElts; ++i) { + int M = Mask[i]; + if (M >= 0 && M < NumElts) { + V1Mask[i] = M; + FinalMask[i] = i; + IsAlternating &= (i & 1) == 0; + } else if (M >= NumElts) { + V2Mask[i] = M - NumElts; + FinalMask[i] = i + NumElts; + IsAlternating &= (i & 1) == 1; + } + } // Try to lower with the simpler initial blend/unpack/rotate strategies unless // one of the input shuffles would be a no-op. We prefer to shuffle inputs as @@ -12161,9 +12175,30 @@ static SDValue lowerShuffleAsDecomposedShuffleBlend( return BlendPerm; } + // If the final mask is an alternating blend of vXi8/vXi16, convert to an + // UNPCKL(SHUFFLE, SHUFFLE) pattern. + // TODO: It doesn't have to be alternating - but each lane mustn't have more + // than half the elements coming from each source. + if (IsAlternating && VT.getScalarSizeInBits() < 32) { + V1Mask.assign(NumElts, -1); + V2Mask.assign(NumElts, -1); + FinalMask.assign(NumElts, -1); + for (int i = 0; i != NumElts; i += NumEltsPerLane) + for (int j = 0; j != NumEltsPerLane; ++j) { + int M = Mask[i + j]; + if (M >= 0 && M < NumElts) { + V1Mask[i + (j / 2)] = M; + FinalMask[i + j] = i + (j / 2); + } else if (M >= NumElts) { + V2Mask[i + (j / 2)] = M - NumElts; + FinalMask[i + j] = i + (j / 2) + NumElts; + } + } + } + V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); - return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); + return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask); } /// Try to lower a vector shuffle as a bit rotation. @@ -13902,7 +13937,7 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) - return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG); // We implement this with SHUFPD which is pretty lame because it will likely @@ -13996,6 +14031,12 @@ static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, NewMask[2] = Mask[2] < 4 ? 1 : 3; NewMask[3] = Mask[2] < 4 ? 3 : 1; } + } else if (NumV2Elements == 3) { + // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but + // we can get here due to other paths (e.g. repeated mask matching) where we + // don't want to do another round of lowerVECTOR_SHUFFLE. + ShuffleVectorSDNode::commuteMask(NewMask); + return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG); } return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, getV4X86ShuffleImm8ForMask(NewMask, DL, DAG)); @@ -14194,7 +14235,7 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) - return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG); // Try to lower by permuting the inputs into an unpack instruction. @@ -14944,8 +14985,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, } // We can always bit-blend if we have to so the fallback strategy is to - // decompose into single-input permutes and blends.
- return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, + // decompose into single-input permutes and blends/unpacks. + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG); } @@ -15282,9 +15323,9 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Result; } - // Handle multi-input cases by blending single-input shuffles. + // Handle multi-input cases by blending/unpacking single-input shuffles. if (NumV2Elements > 0) - return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG); // The fallback path for single-input shuffles widens this into two v8i16 @@ -15464,7 +15505,7 @@ } /// Either split a vector in halves or decompose the shuffles and the -/// blend. +/// blend/unpack. /// /// This is provided as a good fallback for many lowerings of non-single-input /// shuffles with more than one 128-bit lane. In those cases, we want to select @@ -15499,8 +15540,8 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, return true; }; if (DoBothBroadcast()) - return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget, + DAG); // If the inputs all stem from a single 128-bit lane of each input, then we // split them rather than blending because the split will decompose to @@ -15516,9 +15557,9 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); - // Otherwise, just fall back to decomposed shuffles and a blend. This requires - // that the decomposed single-input shuffles don't end up here. - return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget, + // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This + // requires that the decomposed single-input shuffles don't end up here. + return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget, DAG); } @@ -16570,7 +16611,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have one input in place, then we can permute the other input and // blend the result. if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) - return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle the @@ -16598,7 +16639,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have AVX2 then we always want to lower with a blend because at v4 we // can fully permute the elements. if (Subtarget.hasAVX2()) - return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG); // Otherwise fall back on generic lowering. @@ -16680,7 +16721,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have one input in place, then we can permute the other input and // blend the result.
if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) - return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle the @@ -16700,7 +16741,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Result; // Otherwise fall back on generic blend lowering. - return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG); } @@ -16789,14 +16830,13 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // since after split we get more efficient code using vpunpcklwd and // vpunpckhwd instrs than vblend. if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32)) - if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, - Subtarget, DAG)) - return V; + return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget, + DAG); // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. if (Subtarget.hasAVX2()) - return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG); // Otherwise fall back on generic lowering. @@ -16829,9 +16869,8 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // vpunpcklwd and vpunpckhwd instrs. if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() && !Subtarget.hasAVX512()) - if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, - Subtarget, DAG)) - return V; + return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget, + DAG); if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) @@ -16916,7 +16955,7 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Result; // Otherwise fall back on generic blend lowering. - return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG); } @@ -19231,7 +19270,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { else IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo()); - auto &DL = DAG.getDataLayout(); + const DataLayout &DL = DAG.getDataLayout(); SDValue Scale = DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8); IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale); @@ -19802,17 +19841,15 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, // Load the 64-bit value into an XMM register.
SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo)); - SDValue CLod0 = - DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - /* Alignment = */ 16); + SDValue CLod0 = DAG.getLoad( + MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16)); SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0); - SDValue CLod1 = - DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - /* Alignment = */ 16); + SDValue CLod1 = DAG.getLoad( + MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16)); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); SDValue Sub; SDValue Chain; @@ -20178,17 +20215,17 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // Make a 64-bit buffer, and use it to build an FILD. SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8); int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); + Align SlotAlign(8); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI); if (SrcVT == MVT::i32) { SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl); - SDValue Store1 = - DAG.getStore(Chain, dl, Src, StackSlot, MPI, 8 /*Align*/); + SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign); SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), - OffsetSlot, MPI.getWithOffset(4), 4); + OffsetSlot, MPI.getWithOffset(4), SlotAlign); std::pair<SDValue, SDValue> Tmp = - BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, Align(8), DAG); + BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG); if (IsStrict) return DAG.getMergeValues({Tmp.first, Tmp.second}, dl); @@ -20204,7 +20241,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); } SDValue Store = - DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Align(8)); + DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. We must be careful to do the computation in x87 extended // precision, not in SSE. @@ -20212,7 +20249,7 @@ SDValue Ops[] = { Store, StackSlot }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI, - Align(8), MachineMemOperand::MOLoad); + SlotAlign, MachineMemOperand::MOLoad); Chain = Fild.getValue(1); @@ -20346,7 +20383,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, *DAG.getContext(), TheVT); SDValue Cmp; if (IsStrict) { - Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT, + Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT, SDNodeFlags(), Chain, /*IsSignaling*/ true); Chain = Cmp.getValue(1); } else { @@ -26265,9 +26302,8 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(2, dl, MVT::i64)); - OutChains[1] = - DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2), - /* Alignment = */ 2); + OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, + MachinePointerInfo(TrmpAddr, 2), Align(2)); // Load the 'nest' parameter value into R10.
// R10 is specified in X86CallingConv.td @@ -26279,9 +26315,8 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(12, dl, MVT::i64)); - OutChains[3] = - DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12), - /* Alignment = */ 2); + OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, + MachinePointerInfo(TrmpAddr, 12), Align(2)); // Jump to the nested function. OpCode = (JMP64r << 8) | REX_WB; // jmpq *... @@ -26323,7 +26358,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, for (FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I, ++Idx) if (Attrs.hasAttribute(Idx, Attribute::InReg)) { - auto &DL = DAG.getDataLayout(); + const DataLayout &DL = DAG.getDataLayout(); // FIXME: should only count parameters that are lowered to integers. InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; } @@ -26361,22 +26396,20 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(1, dl, MVT::i32)); - OutChains[1] = - DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1), - /* Alignment = */ 1); + OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, + MachinePointerInfo(TrmpAddr, 1), Align(1)); const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(5, dl, MVT::i32)); - OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), - Addr, MachinePointerInfo(TrmpAddr, 5), - /* Alignment = */ 1); + OutChains[2] = + DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr, + MachinePointerInfo(TrmpAddr, 5), Align(1)); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(6, dl, MVT::i32)); - OutChains[3] = - DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6), - /* Alignment = */ 1); + OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, + MachinePointerInfo(TrmpAddr, 6), Align(1)); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } @@ -27164,8 +27197,8 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); Entry.Node = StackPtr; - InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, - MPI, /* Alignment = */ 16); + InChain = + DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16)); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Ty = PointerType::get(ArgTy,0); Entry.IsSExt = false; @@ -28838,6 +28871,58 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::OR, DL, VT, Lo, Hi); } +static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + SDValue X = Op.getOperand(0); + MVT VT = Op.getSimpleValueType(); + + // Special case. If the input fits in 8-bits we can use a single 8-bit TEST. + if (VT == MVT::i8 || + DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) { + X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X); + SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X, + DAG.getConstant(0, DL, MVT::i8)); + // Copy the inverse of the parity flag into a register with setcc. + SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG); + // Extend to the original type. 
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp); + } + + if (VT == MVT::i64) { + // Xor the high and low 32-bit halves together using a 32-bit operation. + SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, + DAG.getNode(ISD::SRL, DL, MVT::i64, X, + DAG.getConstant(32, DL, MVT::i8))); + SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); + X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi); + } + + if (VT != MVT::i16) { + // Xor the high and low 16-bits together using a 32-bit operation. + SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X, + DAG.getConstant(16, DL, MVT::i8)); + X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16); + } else { + // If the input is 16-bits, we need to extend to use an i32 shift below. + X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X); + } + + // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor. + // This should allow an h-reg to be used to save a shift. + SDValue Hi = DAG.getNode( + ISD::TRUNCATE, DL, MVT::i8, + DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8))); + SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X); + SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32); + SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1); + + // Copy the inverse of the parity flag into a register with setcc. + SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG); + // Extend to the original type. + return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp); +} + static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned NewOpc = 0; @@ -28974,7 +29059,7 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); Chain = DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr, - MPI, /*Align*/ 0, MachineMemOperand::MOStore); + MPI, MaybeAlign(), MachineMemOperand::MOStore); SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue LdOps[] = {Chain, StackPtr}; SDValue Value = @@ -29456,6 +29541,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget); case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG); + case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG); @@ -29720,31 +29806,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Res); return; } - case ISD::ABS: { - assert((Subtarget.is64Bit() || N->getValueType(0) == MVT::i64) && - "Unexpected type (!= i64) on ABS."); - assert((!Subtarget.is64Bit() || N->getValueType(0) == MVT::i128) && - "Unexpected type (!= i128) on ABS."); - MVT VT = N->getSimpleValueType(0); - MVT HalfT = VT == MVT::i128 ?
MVT::i64 : MVT::i32; - SDValue Lo, Hi, Tmp; - SDVTList VTList = DAG.getVTList(HalfT, MVT::i1); - - Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), - DAG.getConstant(0, dl, HalfT)); - Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), - DAG.getConstant(1, dl, HalfT)); - Tmp = DAG.getNode( - ISD::SRA, dl, HalfT, Hi, - DAG.getShiftAmountConstant(HalfT.getSizeInBits() - 1, HalfT, dl)); - Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); - Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, - SDValue(Lo.getNode(), 1)); - Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); - Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, VT, Lo, Hi)); - return; - } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. case X86ISD::FMINC: case X86ISD::FMIN: @@ -31238,7 +31299,7 @@ static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr, /// Utility function to emit xbegin specifying the start of an RTM region. static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII) { - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); @@ -31364,7 +31425,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); // struct va_list { // i32 gp_offset @@ -31611,7 +31672,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( // Now add the instructions. const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); Register CountReg = MI.getOperand(0).getReg(); int64_t RegSaveFrameIndex = MI.getOperand(1).getImm(); @@ -31923,7 +31984,7 @@ MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, MachineBasicBlock *ThisMBB) const { const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); // To "insert" a SELECT_CC instruction, we actually have to insert the // diamond control-flow pattern. 
The incoming instruction knows the @@ -32078,7 +32139,7 @@ X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI, MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); const X86FrameLowering &TFI = *Subtarget.getFrameLowering(); - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); const unsigned ProbeSize = getStackProbeSize(*MF); @@ -32171,7 +32232,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); assert(MF->shouldSplitStack()); @@ -32206,7 +32267,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, const TargetRegisterClass *AddrRegClass = getRegClassFor(getPointerTy(MF->getDataLayout())); - unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), + Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), @@ -32306,7 +32367,7 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI, MachineFunction *MF = BB->getParent(); const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB(); - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); assert(!isAsynchronousEHPersonality( classifyEHPersonality(MF->getFunction().getPersonalityFn())) && @@ -32344,7 +32405,7 @@ X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI, // inside MC, therefore without the two markers shrink-wrapping // may push the prologue/epilogue pass them. const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); MachineFunction &MF = *BB->getParent(); // Emit CALLSEQ_START right before the instruction. @@ -32373,7 +32434,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI, // be in the normal return register. MachineFunction *F = BB->getParent(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?"); assert(MI.getOperand(3).isGlobal() && "This should be a global"); @@ -32512,7 +32573,7 @@ X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI, MachineBasicBlock *BB) const { // Copy the virtual register into the R11 physical register and // call the retpoline thunk. - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); Register CalleeVReg = MI.getOperand(0).getReg(); unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode()); @@ -32574,7 +32635,7 @@ X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI, /// \param [in] MBB The Machine Basic Block that will be modified. 
void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock *MBB) const { - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -32617,7 +32678,7 @@ void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock * X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); @@ -32777,7 +32838,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock * X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock *MBB) const { - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -32958,7 +33019,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock * X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -33042,7 +33103,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const { - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); @@ -33091,7 +33152,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock * X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *BB) const { - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); @@ -33321,7 +33382,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); auto TMMImmToTMMReg = [](unsigned Imm) { assert (Imm < 8 && "Illegal tmm index"); @@ -33656,7 +33717,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTDPBUSD: case X86::PTDPBUUD: case X86::PTDPBF16PS: { - const DebugLoc &DL = MI.getDebugLoc(); unsigned Opc; switch (MI.getOpcode()) { case X86::PTDPBSSD: Opc = X86::TDPBSSD; break; @@ -33676,7 +33736,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return BB; } case X86::PTILEZERO: { - const DebugLoc &DL = MI.getDebugLoc(); unsigned Imm = MI.getOperand(0).getImm(); BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm)); MI.eraseFromParent(); // The pseudo is gone now. 
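A note on the recurring `DebugLoc DL = MI.getDebugLoc();` to `const DebugLoc &DL = MI.getDebugLoc();` changes in the hunks above: a minimal sketch of what the reference form buys, assuming only that DebugLoc wraps a tracking metadata reference (as it does in LLVM of this vintage; the helper function here is hypothetical, for illustration):

// Sketch only, not part of the patch. Copying a DebugLoc copies its
// TrackingMDNodeRef, which must re-register itself with the metadata
// tracking machinery on every copy; a const reference skips that work.
static void emitExampleFence(MachineInstr &MI, MachineBasicBlock &MBB,
                             const TargetInstrInfo *TII) {
  const DebugLoc &DL = MI.getDebugLoc(); // bind, don't copy
  BuildMI(MBB, MI, DL, TII->get(X86::LFENCE));
}

The same reasoning explains the removals just below: once the function-scope `DL` in EmitInstrWithCustomInserter is a const reference, the PTDPBSSD/PTILEZERO/PTILELOADD cases no longer need their local `const DebugLoc &DL` bindings.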
@@ -33685,7 +33744,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTILELOADD: case X86::PTILELOADDT1: case X86::PTILESTORED: { - const DebugLoc &DL = MI.getDebugLoc(); unsigned Opc; switch (MI.getOpcode()) { case X86::PTILELOADD: Opc = X86::TILELOADD; break; @@ -35852,9 +35910,9 @@ static SDValue combineX86ShufflesRecursively( SDValue Op = SrcOps[SrcOpIndex]; Op = peekThroughOneUseBitcasts(Op); - MVT VT = Op.getSimpleValueType(); - if (!VT.isVector()) - return SDValue(); // Bail if we hit a non-vector. + EVT VT = Op.getValueType(); + if (!VT.isVector() || !VT.isSimple()) + return SDValue(); // Bail if we hit a non-simple non-vector. assert(VT.getSizeInBits() == RootSizeInBits && "Can only combine shuffles of the same vector register size."); @@ -36657,6 +36715,27 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, } } + // Pull subvector inserts into undef through VZEXT_MOVL by making it an + // insert into a zero vector. This helps get VZEXT_MOVL closer to + // scalar_to_vectors where 256/512 are canonicalized to an insert and a + // 128-bit scalar_to_vector. This reduces the number of isel patterns. + if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) { + SDValue V = peekThroughOneUseBitcasts(N0); + + if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() && + isNullConstant(V.getOperand(2))) { + SDValue In = V.getOperand(1); + MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), + In.getValueSizeInBits() / + VT.getScalarSizeInBits()); + In = DAG.getBitcast(SubVT, In); + SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, + getZeroVector(VT, Subtarget, DAG, DL), Movl, + V.getOperand(2)); + } + } + return SDValue(); } case X86ISD::BLENDI: { @@ -37335,32 +37414,11 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, // TODO - merge this into combineX86ShufflesRecursively. APInt KnownUndef, KnownZero; APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); - if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, DCI)) + if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, + DCI)) return SDValue(N, 0); } - // Pull subvector inserts into undef through VZEXT_MOVL by making it an - // insert into a zero vector. This helps get VZEXT_MOVL closer to - // scalar_to_vectors where 256/512 are canonicalized to an insert and a - // 128-bit scalar_to_vector. This reduces the number of isel patterns. - if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() && - N->getOperand(0).hasOneUse()) { - SDValue V = peekThroughOneUseBitcasts(N->getOperand(0)); - - if (V.getOpcode() == ISD::INSERT_SUBVECTOR && - V.getOperand(0).isUndef() && isNullConstant(V.getOperand(2))) { - SDValue In = V.getOperand(1); - MVT SubVT = - MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(), - In.getValueSizeInBits() / VT.getScalarSizeInBits()); - In = DAG.getBitcast(SubVT, In); - SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, SubVT, In); - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, - getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl), - Movl, V.getOperand(2)); - } - } - return SDValue(); } @@ -37805,7 +37863,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::BLENDI: - // Saturated Packs. + // Integer ops. + case X86ISD::AVG: case X86ISD::PACKSS: case X86ISD::PACKUS: // Horizontal Ops. 
@@ -39312,10 +39371,8 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32; if (BinOp == ISD::XOR) { - // parity -> (AND (CTPOP(MOVMSK X)), 1) - SDValue Mask = DAG.getConstant(1, DL, CmpVT); - SDValue Result = DAG.getNode(ISD::CTPOP, DL, CmpVT, Movmsk); - Result = DAG.getNode(ISD::AND, DL, CmpVT, Result, Mask); + // parity -> (PARITY(MOVMSK X)) + SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk); return DAG.getZExtOrTrunc(Result, DL, ExtractVT); } @@ -43283,89 +43340,6 @@ static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, return SDValue(); } -// Look for (and (ctpop X), 1) which is the IR form of __builtin_parity. -// Turn it into series of XORs and a setnp. -static SDValue combineParity(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - // RHS needs to be 1. - if (!isOneConstant(N1)) - return SDValue(); - - // Popcnt may be truncated. - if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse()) - N0 = N0.getOperand(0); - - // LHS needs to be a single use CTPOP. - if (N0.getOpcode() != ISD::CTPOP || !N0.hasOneUse()) - return SDValue(); - - EVT VT = N0.getValueType(); - - // We only support 64-bit and 32-bit. 64-bit requires special handling - // unless the 64-bit popcnt instruction is legal. - if (VT != MVT::i32 && VT != MVT::i64) - return SDValue(); - - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT)) - return SDValue(); - - SDLoc DL(N); - SDValue X = N0.getOperand(0); - - // Special case. If the input fits in 8-bits we can use a single 8-bit TEST. - if (DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) { - X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X); - SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X, - DAG.getConstant(0, DL, MVT::i8)); - // Copy the inverse of the parity flag into a register with setcc. - SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG); - // Extend or truncate to the original type. - return DAG.getZExtOrTrunc(Setnp, DL, N->getValueType(0)); - } - - // If this is 64-bit, its always best to xor the two 32-bit pieces together - // even if we have popcnt. - if (VT == MVT::i64) { - SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, - DAG.getNode(ISD::SRL, DL, VT, X, - DAG.getConstant(32, DL, MVT::i8))); - SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); - X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi); - // Generate a 32-bit parity idiom. This will bring us back here if we need - // to expand it too. - SDValue Parity = DAG.getNode(ISD::AND, DL, MVT::i32, - DAG.getNode(ISD::CTPOP, DL, MVT::i32, X), - DAG.getConstant(1, DL, MVT::i32)); - return DAG.getZExtOrTrunc(Parity, DL, N->getValueType(0)); - } - assert(VT == MVT::i32 && "Unexpected VT!"); - - // Xor the high and low 16-bits together using a 32-bit operation. - SDValue Hi16 = DAG.getNode(ISD::SRL, DL, VT, X, - DAG.getConstant(16, DL, MVT::i8)); - X = DAG.getNode(ISD::XOR, DL, VT, X, Hi16); - - // Finally xor the low 2 bytes together and use a 8-bit flag setting xor. - // This should allow an h-reg to be used to save a shift. - // FIXME: We only get an h-reg in 32-bit mode. 
- SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, - DAG.getNode(ISD::SRL, DL, VT, X, - DAG.getConstant(8, DL, MVT::i8))); - SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X); - SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32); - SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1); - - // Copy the inverse of the parity flag into a register with setcc. - SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG); - // Extend or truncate to the original type. - return DAG.getZExtOrTrunc(Setnp, DL, N->getValueType(0)); -} - - // Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C) // Where C is a mask containing the same number of bits as the setcc and // where the setcc will freely 0 upper bits of k-register. We can replace the @@ -43457,10 +43431,6 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, } } - // This must be done before legalization has expanded the ctpop. - if (SDValue V = combineParity(N, DAG, Subtarget)) - return V; - // Match all-of bool scalar reductions into a bitcast/movmsk + cmp. // TODO: Support multiple SrcOps. if (VT == MVT::i1) { @@ -44217,8 +44187,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, unsigned NumElems = VT.getVectorNumElements(); EVT ScalarVT = VT.getVectorElementType(); - if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && - NumElems >= 2 && isPowerOf2_32(NumElems))) + if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2)) return SDValue(); // InScalarVT is the intermediate type in AVG pattern and it should be greater @@ -44269,6 +44238,29 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops); }; + auto AVGSplitter = [&](SDValue Op0, SDValue Op1) { + // Pad to a power-of-2 vector, split+apply and extract the original vector. + unsigned NumElemsPow2 = PowerOf2Ceil(NumElems); + EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2); + if (NumElemsPow2 != NumElems) { + SmallVector Ops0(NumElemsPow2, DAG.getUNDEF(ScalarVT)); + SmallVector Ops1(NumElemsPow2, DAG.getUNDEF(ScalarVT)); + for (unsigned i = 0; i != NumElems; ++i) { + SDValue Idx = DAG.getIntPtrConstant(i, DL); + Ops0[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op0, Idx); + Ops1[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op1, Idx); + } + Op0 = DAG.getBuildVector(Pow2VT, DL, Ops0); + Op1 = DAG.getBuildVector(Pow2VT, DL, Ops1); + } + SDValue Res = + SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, {Op0, Op1}, AVGBuilder); + if (NumElemsPow2 == NumElems) + return Res; + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); + }; + // Take care of the case when one of the operands is a constant vector whose // element is in the range [1, 256]. if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) && @@ -44279,9 +44271,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, SDValue VecOnes = DAG.getConstant(1, DL, InVT); Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes); Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]); - return SplitOpsAndApply(DAG, Subtarget, DL, VT, - { Operands[0].getOperand(0), Operands[1] }, - AVGBuilder); + return AVGSplitter(Operands[0].getOperand(0), Operands[1]); } // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)). 
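To make the AVGSplitter lambda above concrete, here is a schematic trace using an assumed element count of six (v6i8 is purely an illustrative type, not taken from the patch); it shows why the old `isPowerOf2_32(NumElems)` bail-out is no longer needed:

// Sketch only: averaging two v6i8 vectors via power-of-2 padding.
//   %a, %b : v6i8                            ; NumElems = 6, PowerOf2Ceil(6) == 8
//   %a8 = build_vector a0..a5, undef, undef  ; pad both inputs to Pow2VT = v8i8
//   %b8 = build_vector b0..b5, undef, undef
//   %avg = X86ISD::AVG %a8, %b8              ; SplitOpsAndApply emits the PAVG
//   %res = extract_subvector %avg, 0         ; trim back to the original v6i8

The undef padding lanes are harmless: PAVG operates lane-wise, and the trailing lanes are discarded by the final extract.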
@@ -44328,8 +44318,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, } // The pattern is detected, emit X86ISD::AVG instruction(s). - return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Operands[0], Operands[1]}, - AVGBuilder); + return AVGSplitter(Operands[0], Operands[1]); } return SDValue(); @@ -44454,7 +44443,8 @@ static int getOneTrueElt(SDValue V) { /// scalar element, and the alignment for the scalar memory access. static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, - SDValue &Index, unsigned &Alignment) { + SDValue &Index, Align &Alignment, + unsigned &Offset) { int TrueMaskElt = getOneTrueElt(MaskedOp->getMask()); if (TrueMaskElt < 0) return false; @@ -44462,15 +44452,17 @@ static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, // Get the address of the one scalar element that is specified by the mask // using the appropriate offset from the base pointer. EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType(); + Offset = 0; Addr = MaskedOp->getBasePtr(); if (TrueMaskElt != 0) { - unsigned Offset = TrueMaskElt * EltVT.getStoreSize(); + Offset = TrueMaskElt * EltVT.getStoreSize(); Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset), SDLoc(MaskedOp)); } Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp)); - Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize()); + Alignment = commonAlignment(MaskedOp->getOriginalAlign(), + EltVT.getStoreSize()); return true; } @@ -44487,8 +44479,9 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, // is profitable. Endianness would also have to be considered. SDValue Addr, VecIndex; - unsigned Alignment; - if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment)) + Align Alignment; + unsigned Offset; + if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset)) return SDValue(); // Load the one scalar element that is specified by the mask using the @@ -44497,7 +44490,8 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, EVT VT = ML->getValueType(0); EVT EltVT = VT.getVectorElementType(); SDValue Load = - DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(), + DAG.getLoad(EltVT, DL, ML->getChain(), Addr, + ML->getPointerInfo().getWithOffset(Offset), Alignment, ML->getMemOperand()->getFlags()); // Insert the loaded element into the appropriate place in the vector. @@ -44608,8 +44602,9 @@ static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, // is profitable. Endianness would also have to be considered. SDValue Addr, VecIndex; - unsigned Alignment; - if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment)) + Align Alignment; + unsigned Offset; + if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset)) return SDValue(); // Extract the one scalar element that is actually being stored. @@ -44620,7 +44615,8 @@ static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, MS->getValue(), VecIndex); // Store that element at the appropriate offset from the base pointer. - return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(), + return DAG.getStore(MS->getChain(), DL, Extract, Addr, + MS->getPointerInfo().getWithOffset(Offset), Alignment, MS->getMemOperand()->getFlags()); } @@ -50622,23 +50618,27 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // Not found as a standard register? 
if (!Res.second) { - // Map st(0) -> st(7) -> ST0 - if (Constraint.size() == 7 && Constraint[0] == '{' && - tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' && - Constraint[3] == '(' && - (Constraint[4] >= '0' && Constraint[4] <= '7') && - Constraint[5] == ')' && Constraint[6] == '}') { - // st(7) is not allocatable and thus not a member of RFP80. Return - // singleton class in cases where we have a reference to it. - if (Constraint[4] == '7') - return std::make_pair(X86::FP7, &X86::RFP80_7RegClass); - return std::make_pair(X86::FP0 + Constraint[4] - '0', - &X86::RFP80RegClass); - } - - // GCC allows "st(0)" to be called just plain "st". - if (StringRef("{st}").equals_lower(Constraint)) - return std::make_pair(X86::FP0, &X86::RFP80RegClass); + // Only match x87 registers if the VT is one SelectionDAGBuilder can convert + // to/from f80. + if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) { + // Map st(0) -> st(7) -> ST0 + if (Constraint.size() == 7 && Constraint[0] == '{' && + tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' && + Constraint[3] == '(' && + (Constraint[4] >= '0' && Constraint[4] <= '7') && + Constraint[5] == ')' && Constraint[6] == '}') { + // st(7) is not allocatable and thus not a member of RFP80. Return + // singleton class in cases where we have a reference to it. + if (Constraint[4] == '7') + return std::make_pair(X86::FP7, &X86::RFP80_7RegClass); + return std::make_pair(X86::FP0 + Constraint[4] - '0', + &X86::RFP80RegClass); + } + + // GCC allows "st(0)" to be called just plain "st". + if (StringRef("{st}").equals_lower(Constraint)) + return std::make_pair(X86::FP0, &X86::RFP80RegClass); + } // flags -> EFLAGS if (StringRef("{flags}").equals_lower(Constraint)) diff --git a/llvm/lib/Target/X86/X86InsertWait.cpp b/llvm/lib/Target/X86/X86InsertWait.cpp index a82d98d88b306..56d2709f59374 100644 --- a/llvm/lib/Target/X86/X86InsertWait.cpp +++ b/llvm/lib/Target/X86/X86InsertWait.cpp @@ -27,7 +27,6 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/Support/Debug.h" @@ -48,9 +47,6 @@ class WaitInsert : public MachineFunctionPass { StringRef getPassName() const override { return "X86 insert wait instruction"; } - -private: - const TargetInstrInfo *TII; // Machine instruction info. }; } // namespace @@ -119,7 +115,7 @@ bool WaitInsert::runOnMachineFunction(MachineFunction &MF) { return false; const X86Subtarget &ST = MF.getSubtarget(); - TII = ST.getInstrInfo(); + const X86InstrInfo *TII = ST.getInstrInfo(); bool Changed = false; for (MachineBasicBlock &MBB : MF) { diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index e2582bae3010c..94ee799010756 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -16,6 +16,7 @@ #include "X86TargetTransformInfo.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" using namespace llvm; @@ -24,19 +25,29 @@ using namespace llvm; /// Return a constant boolean vector that has true elements in all positions /// where the input constant data vector has an element with the sign bit set. 
-static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) { - SmallVector BoolVec; - IntegerType *BoolTy = Type::getInt1Ty(V->getContext()); - for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) { - Constant *Elt = V->getElementAsConstant(I); - assert((isa(Elt) || isa(Elt)) && - "Unexpected constant data vector element type"); - bool Sign = V->getElementType()->isIntegerTy() - ? cast(Elt)->isNegative() - : cast(Elt)->isNegative(); - BoolVec.push_back(ConstantInt::get(BoolTy, Sign)); - } - return ConstantVector::get(BoolVec); +static Constant *getNegativeIsTrueBoolVec(Constant *V) { + VectorType *IntTy = VectorType::getInteger(cast(V->getType())); + V = ConstantExpr::getBitCast(V, IntTy); + V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy), + V); + return V; +} + +/// Convert the x86 XMM integer vector mask to a vector of bools based on +/// each element's most significant bit (the sign bit). +static Value *getBoolVecFromMask(Value *Mask) { + // Fold Constant Mask. + if (auto *ConstantMask = dyn_cast(Mask)) + return getNegativeIsTrueBoolVec(ConstantMask); + + // Mask was extended from a boolean vector. + Value *ExtMask; + if (PatternMatch::match( + Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) && + ExtMask->getType()->isIntOrIntVectorTy(1)) + return ExtMask; + + return nullptr; } // TODO: If the x86 backend knew how to convert a bool vector mask back to an @@ -47,32 +58,26 @@ static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) { Value *Mask = II.getOperand(1); Constant *ZeroVec = Constant::getNullValue(II.getType()); - // Special case a zero mask since that's not a ConstantDataVector. - // This masked load instruction creates a zero vector. + // Zero Mask - masked load instruction creates a zero vector. if (isa(Mask)) return IC.replaceInstUsesWith(II, ZeroVec); - auto *ConstMask = dyn_cast(Mask); - if (!ConstMask) - return nullptr; - - // The mask is constant. Convert this x86 intrinsic to the LLVM instrinsic - // to allow target-independent optimizations. - - // First, cast the x86 intrinsic scalar pointer to a vector pointer to match - // the LLVM intrinsic definition for the pointer argument. - unsigned AddrSpace = cast(Ptr->getType())->getAddressSpace(); - PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace); - Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); - - // Second, convert the x86 XMM integer vector mask to a vector of bools based - // on each element's most significant bit (the sign bit). - Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask); + // The mask is constant or extended from a bool vector. Convert this x86 + // intrinsic to the LLVM intrinsic to allow target-independent optimizations. + if (Value *BoolMask = getBoolVecFromMask(Mask)) { + // First, cast the x86 intrinsic scalar pointer to a vector pointer to match + // the LLVM intrinsic definition for the pointer argument. + unsigned AddrSpace = cast(Ptr->getType())->getAddressSpace(); + PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace); + Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); + + // The pass-through vector for an x86 masked load is a zero vector. + CallInst *NewMaskedLoad = + IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec); + return IC.replaceInstUsesWith(II, NewMaskedLoad); + } - // The pass-through vector for an x86 masked load is a zero vector. 
- CallInst *NewMaskedLoad = - IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec); - return IC.replaceInstUsesWith(II, NewMaskedLoad); + return nullptr; } // TODO: If the x86 backend knew how to convert a bool vector mask back to an @@ -83,8 +88,7 @@ static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { Value *Mask = II.getOperand(1); Value *Vec = II.getOperand(2); - // Special case a zero mask since that's not a ConstantDataVector: - // this masked store instruction does nothing. + // Zero Mask - this masked store instruction does nothing. if (isa(Mask)) { IC.eraseInstFromFunction(II); return true; } @@ -95,28 +99,21 @@ static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu) return false; - auto *ConstMask = dyn_cast(Mask); - if (!ConstMask) - return false; - - // The mask is constant. Convert this x86 intrinsic to the LLVM instrinsic - // to allow target-independent optimizations. + // The mask is constant or extended from a bool vector. Convert this x86 + // intrinsic to the LLVM intrinsic to allow target-independent optimizations. + if (Value *BoolMask = getBoolVecFromMask(Mask)) { + unsigned AddrSpace = cast(Ptr->getType())->getAddressSpace(); + PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace); + Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); - // First, cast the x86 intrinsic scalar pointer to a vector pointer to match - // the LLVM intrinsic definition for the pointer argument. - unsigned AddrSpace = cast(Ptr->getType())->getAddressSpace(); - PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace); - Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); + IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask); - // Second, convert the x86 XMM integer vector mask to a vector of bools based - // on each element's most significant bit (the sign bit). - Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask); - - IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask); + // 'Replace uses' doesn't work for stores. Erase the original masked store. + IC.eraseInstFromFunction(II); + return true; + } - // 'Replace uses' doesn't work for stores. Erase the original masked store. - IC.eraseInstFromFunction(II); - return true; + return false; } static Value *simplifyX86immShift(const IntrinsicInst &II, diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 5aac29e21d6f9..1f4bf30cc1d02 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -3663,6 +3663,34 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, } } +bool X86InstrInfo::preservesZeroValueInReg( + const MachineInstr *MI, const Register NullValueReg, + const TargetRegisterInfo *TRI) const { + if (!MI->modifiesRegister(NullValueReg, TRI)) + return true; + switch (MI->getOpcode()) { + // Shift right/left of a null onto itself is still a null, i.e. + // rax = shl rax, X. + case X86::SHR64ri: + case X86::SHR32ri: + case X86::SHL64ri: + case X86::SHL32ri: + assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() && + "expected for shift opcode!"); + return MI->getOperand(0).getReg() == NullValueReg && + MI->getOperand(1).getReg() == NullValueReg; + // Zero extend of a sub-reg of NullValueReg into itself does not change the + // null value.
+ case X86::MOV32rr: + return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) { + return TRI->isSubRegisterEq(NullValueReg, MO.getReg()); + }); + default: + return false; + } + llvm_unreachable("Should be handled above!"); +} + bool X86InstrInfo::getMemOperandsWithOffsetWidth( const MachineInstr &MemOp, SmallVectorImpl &BaseOps, int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index cd91144c829af..215318105de45 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -317,6 +317,10 @@ class X86InstrInfo final : public X86GenInstrInfo { SmallVectorImpl &Cond, bool AllowModify) const override; + bool preservesZeroValueInReg(const MachineInstr *MI, + const Register NullValueReg, + const TargetRegisterInfo *TRI) const override; + bool getMemOperandsWithOffsetWidth( const MachineInstr &LdSt, SmallVectorImpl &BaseOps, int64_t &Offset, diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp index ce8d1d464da97..e76908ef4bc40 100644 --- a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -24,6 +24,10 @@ using namespace llvm; #define DEBUG_TYPE "x86-selectiondag-info" +static cl::opt + UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false), + cl::desc("Use fast short rep mov in memcpy lowering")); + bool X86SelectionDAGInfo::isBaseRegConflictPossible( SelectionDAG &DAG, ArrayRef ClobberSet) const { // We cannot use TRI->hasBasePointer() until *after* we select all basic @@ -306,6 +310,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( const X86Subtarget &Subtarget = DAG.getMachineFunction().getSubtarget(); + // If enabled and available, use fast short rep mov. + if (UseFSRMForMemcpy && Subtarget.hasFSRM()) + return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8); + /// Handle constant sizes, if (ConstantSDNode *ConstantSize = dyn_cast(Size)) return emitConstantSizeRepmov( diff --git a/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp b/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp index 7e91c37367d2f..d57871130b0cb 100644 --- a/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp +++ b/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp @@ -161,6 +161,7 @@ bool X86SpeculativeExecutionSideEffectSuppression::runOnMachineFunction( // This branch requires adding an LFENCE. if (!PrevInstIsLFENCE) { + assert(FirstTerminator && "Unknown terminator instruction"); BuildMI(MBB, FirstTerminator, DebugLoc(), TII->get(X86::LFENCE)); NumLFENCEsInserted++; Modified = true; diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index 4cf17e46a598a..d50c552a65b6f 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -258,12 +258,13 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, report_fatal_error("64-bit code requested on a subtarget that doesn't " "support it!"); - // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and Solaris (both - // 32 and 64 bit) and for all 64-bit targets. + // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and for all + // 64-bit targets. On Solaris (32-bit), stack alignment is 4 bytes + // following the i386 psABI, while on Illumos it is always 16 bytes. 
if (StackAlignOverride) stackAlignment = *StackAlignOverride; - else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() || - isTargetKFreeBSD() || In64BitMode) + else if (isTargetDarwin() || isTargetLinux() || isTargetKFreeBSD() || + In64BitMode) stackAlignment = Align(16); // Consume the vector width attribute or apply any target specific limit. diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 7616b2ea7d998..34bc72a2e69f3 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -444,7 +444,7 @@ bool X86PassConfig::addInstSelector() { } bool X86PassConfig::addIRTranslator() { - addPass(new IRTranslator()); + addPass(new IRTranslator(getOptLevel())); return false; } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index c9179742bcb9c..8ce9749dc2d66 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -321,6 +321,11 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand. { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand. { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb. + + { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence + { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence + { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence + { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence }; if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && @@ -336,6 +341,11 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb. { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. + + { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence + { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence + { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence + { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence }; if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && @@ -353,6 +363,15 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split. { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split. { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split. + + { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split. + { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split. + { ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence + { ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence + { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split. + { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split. + { ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence + { ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence }; // XOP has faster vXi8 shifts. @@ -4264,7 +4283,7 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { // scalarize it. 
if (auto *DataVTy = dyn_cast(DataTy)) { unsigned NumElts = DataVTy->getNumElements(); - if (NumElts == 1 || !isPowerOf2_32(NumElts)) + if (NumElts == 1) return false; } Type *ScalarTy = DataTy->getScalarType(); diff --git a/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/llvm/lib/Target/XCore/XCoreISelLowering.cpp index 573aee02533db..db3dd7fb14383 100644 --- a/llvm/lib/Target/XCore/XCoreISelLowering.cpp +++ b/llvm/lib/Target/XCore/XCoreISelLowering.cpp @@ -443,16 +443,15 @@ SDValue XCoreTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { } if (LD->getAlignment() == 2) { - SDValue Low = - DAG.getExtLoad(ISD::ZEXTLOAD, DL, MVT::i32, Chain, BasePtr, - LD->getPointerInfo(), MVT::i16, - /* Alignment = */ 2, LD->getMemOperand()->getFlags()); + SDValue Low = DAG.getExtLoad(ISD::ZEXTLOAD, DL, MVT::i32, Chain, BasePtr, + LD->getPointerInfo(), MVT::i16, Align(2), + LD->getMemOperand()->getFlags()); SDValue HighAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, DAG.getConstant(2, DL, MVT::i32)); SDValue High = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, HighAddr, LD->getPointerInfo().getWithOffset(2), MVT::i16, - /* Alignment = */ 2, LD->getMemOperand()->getFlags()); + Align(2), LD->getMemOperand()->getFlags()); SDValue HighShifted = DAG.getNode(ISD::SHL, DL, MVT::i32, High, DAG.getConstant(16, DL, MVT::i32)); SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, Low, HighShifted); @@ -502,14 +501,14 @@ SDValue XCoreTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDValue Low = Value; SDValue High = DAG.getNode(ISD::SRL, dl, MVT::i32, Value, DAG.getConstant(16, dl, MVT::i32)); - SDValue StoreLow = DAG.getTruncStore( - Chain, dl, Low, BasePtr, ST->getPointerInfo(), MVT::i16, - /* Alignment = */ 2, ST->getMemOperand()->getFlags()); + SDValue StoreLow = + DAG.getTruncStore(Chain, dl, Low, BasePtr, ST->getPointerInfo(), + MVT::i16, Align(2), ST->getMemOperand()->getFlags()); SDValue HighAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, BasePtr, DAG.getConstant(2, dl, MVT::i32)); SDValue StoreHigh = DAG.getTruncStore( Chain, dl, High, HighAddr, ST->getPointerInfo().getWithOffset(2), - MVT::i16, /* Alignment = */ 2, ST->getMemOperand()->getFlags()); + MVT::i16, Align(2), ST->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StoreLow, StoreHigh); } diff --git a/llvm/lib/Transforms/CMakeLists.txt b/llvm/lib/Transforms/CMakeLists.txt index dda5f6de11e32..2a0abebdf19b5 100644 --- a/llvm/lib/Transforms/CMakeLists.txt +++ b/llvm/lib/Transforms/CMakeLists.txt @@ -6,6 +6,7 @@ add_subdirectory(Scalar) add_subdirectory(IPO) add_subdirectory(Vectorize) add_subdirectory(Hello) +add_subdirectory(HelloNew) add_subdirectory(ObjCARC) add_subdirectory(Coroutines) add_subdirectory(CFGuard) diff --git a/llvm/lib/Transforms/Coroutines/CMakeLists.txt b/llvm/lib/Transforms/Coroutines/CMakeLists.txt index c1f6d6c8d8d8f..783093c16e60e 100644 --- a/llvm/lib/Transforms/Coroutines/CMakeLists.txt +++ b/llvm/lib/Transforms/Coroutines/CMakeLists.txt @@ -6,6 +6,9 @@ add_llvm_component_library(LLVMCoroutines CoroFrame.cpp CoroSplit.cpp + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/Coroutines + DEPENDS intrinsics_gen ) diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index b2677b4572e47..04afd6fe4f54d 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -625,7 +625,22 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape, // We 
use a pointer use visitor to discover if there are any writes into an // alloca that dominates CoroBegin. If that is the case, insertSpills will copy // the value from the alloca into the coroutine frame spill slot corresponding -// to that alloca. +// to that alloca. We also collect any aliases pointing to the alloca created +// before CoroBegin but used after CoroBegin. These aliases will be recreated +// after CoroBegin from the frame address so that later references point to +// the frame instead of the stack. +// Note: We are repurposing PtrUseVisitor's isEscaped() to mean whether the +// pointer is potentially written into. +// TODO: If the pointer is really escaped, we are in big trouble because we +// will be escaping a pointer to a stack address that would no longer exist +// soon. However, most escape analyses aren't precise enough to tell, so we +// assume that if a pointer is escaped, it is written into. +// TODO: Another potential issue is if we are creating an alias through +// a function call, e.g.: +// %a = AllocaInst ... +// %b = call @computeAddress(... %a) +// If %b is an alias of %a and will be used after CoroBegin, this will be broken +// and there is nothing we can do about it. namespace { struct AllocaUseVisitor : PtrUseVisitor { using Base = PtrUseVisitor; @@ -633,49 +648,83 @@ struct AllocaUseVisitor : PtrUseVisitor { const CoroBeginInst &CB) : PtrUseVisitor(DL), DT(DT), CoroBegin(CB) {} - // We are only interested in uses that dominate coro.begin. + // We are only interested in uses that are not dominated by coro.begin. void visit(Instruction &I) { - if (DT.dominates(&I, &CoroBegin)) + if (!DT.dominates(&CoroBegin, &I)) Base::visit(I); } // We need to provide this overload as PtrUseVisitor uses a pointer based // visiting function. void visit(Instruction *I) { return visit(*I); } - void visitLoadInst(LoadInst &) {} // Good. Nothing to do. + // We cannot handle PHI nodes and SelectInsts because they could be selecting + // between two addresses that point to different Allocas. + void visitPHINode(PHINode &I) { + assert(!usedAfterCoroBegin(I) && + "Unable to handle PHI node of aliases created before CoroBegin but " + "used after CoroBegin"); + } + + void visitSelectInst(SelectInst &I) { + assert(!usedAfterCoroBegin(I) && + "Unable to handle Select of aliases created before CoroBegin but " + "used after CoroBegin"); + } + + void visitLoadInst(LoadInst &) {} // If the use is an operand, the pointer escaped and anything can write into // that memory. If the use is the pointer, we are definitely writing into the // alloca and therefore we need to copy. - void visitStoreInst(StoreInst &SI) { PI.setAborted(&SI); } + void visitStoreInst(StoreInst &SI) { PI.setEscaped(&SI); } - // Any other instruction that is not filtered out by PtrUseVisitor, will - // result in the copy. - void visitInstruction(Instruction &I) { PI.setAborted(&I); } + // All mem intrinsics modify the data. + void visitMemIntrinsic(MemIntrinsic &MI) { PI.setEscaped(&MI); } + + void visitBitCastInst(BitCastInst &BC) { + Base::visitBitCastInst(BC); + handleAlias(BC); + } + + void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) { + Base::visitAddrSpaceCastInst(ASC); + handleAlias(ASC); + } + + void visitGetElementPtrInst(GetElementPtrInst &GEPI) { + // The base visitor will adjust Offset accordingly.
+ Base::visitGetElementPtrInst(GEPI); + handleAlias(GEPI); + } + + const SmallVector, 1> &getAliases() const { + return Aliases; + } private: const DominatorTree &DT; const CoroBeginInst &CoroBegin; + // All aliases of the original AllocaInst that are used after CoroBegin. + // Each entry contains the instruction and the offset in the original Alloca. + SmallVector, 1> Aliases{}; + + bool usedAfterCoroBegin(Instruction &I) { + for (auto &U : I.uses()) + if (DT.dominates(&CoroBegin, U)) + return true; + return false; + } + + void handleAlias(Instruction &I) { + if (!usedAfterCoroBegin(I)) + return; + + assert(IsOffsetKnown && "Can only handle alias with known offset created " + "before CoroBegin and used after"); + Aliases.emplace_back(&I, Offset); + } }; } // namespace -static bool mightWriteIntoAllocaPtr(AllocaInst &A, const DominatorTree &DT, - const CoroBeginInst &CB) { - const DataLayout &DL = A.getModule()->getDataLayout(); - AllocaUseVisitor Visitor(DL, DT, CB); - auto PtrI = Visitor.visitPtr(A); - if (PtrI.isEscaped() || PtrI.isAborted()) { - auto *PointerEscapingInstr = PtrI.getEscapingInst() - ? PtrI.getEscapingInst() - : PtrI.getAbortingInst(); - if (PointerEscapingInstr) { - LLVM_DEBUG( - dbgs() << "AllocaInst copy was triggered by instruction: " - << *PointerEscapingInstr << "\n"); - } - return true; - } - return false; -} // We need to make room to insert a spill after initial PHIs, but before // catchswitch instruction. Placing it before violates the requirement that @@ -821,33 +870,34 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) { Arg->getParent()->removeParamAttr(Arg->getArgNo(), Attribute::NoCapture); - } else if (auto *II = dyn_cast(CurrentValue)) { - // If we are spilling the result of the invoke instruction, split the - // normal edge and insert the spill in the new block. - auto NewBB = SplitEdge(II->getParent(), II->getNormalDest()); - InsertPt = NewBB->getTerminator(); - } else if (isa(CurrentValue)) { - // Skip the PHINodes and EH pads instructions. - BasicBlock *DefBlock = cast(E.def())->getParent(); - if (auto *CSI = dyn_cast(DefBlock->getTerminator())) - InsertPt = splitBeforeCatchSwitch(CSI); - else - InsertPt = &*DefBlock->getFirstInsertionPt(); } else if (auto CSI = dyn_cast(CurrentValue)) { // Don't spill immediately after a suspend; splitting assumes // that the suspend will be followed by a branch. InsertPt = CSI->getParent()->getSingleSuccessor()->getFirstNonPHI(); } else { - auto *I = cast(E.def()); - assert(!I->isTerminator() && "unexpected terminator"); - // For all other values, the spill is placed immediately after - // the definition. - if (DT.dominates(CB, I)) { - InsertPt = I->getNextNode(); - } else { - // Unless, it is not dominated by CoroBegin, then it will be + auto *I = cast(CurrentValue); + if (!DT.dominates(CB, I)) { + // If it is not dominated by CoroBegin, then the spill should be + // inserted immediately after CoroFrame is computed. + InsertPt = FramePtr->getNextNode(); + } else if (auto *II = dyn_cast(I)) { + // If we are spilling the result of the invoke instruction, split + // the normal edge and insert the spill in the new block. + auto *NewBB = SplitEdge(II->getParent(), II->getNormalDest()); + InsertPt = NewBB->getTerminator(); + } else if (isa(I)) { + // Skip PHI nodes and EH pad instructions.
+ BasicBlock *DefBlock = I->getParent(); + if (auto *CSI = + dyn_cast(DefBlock->getTerminator())) + InsertPt = splitBeforeCatchSwitch(CSI); + else + InsertPt = &*DefBlock->getFirstInsertionPt(); + } else { + assert(!I->isTerminator() && "unexpected terminator"); + // For all other values, the spill is placed immediately after + // the definition. + InsertPt = I->getNextNode(); } } @@ -955,7 +1005,11 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) { for (auto &P : Allocas) { AllocaInst *const A = P.first; - if (mightWriteIntoAllocaPtr(*A, DT, *CB)) { + AllocaUseVisitor Visitor(A->getModule()->getDataLayout(), DT, *CB); + auto PtrI = Visitor.visitPtr(*A); + assert(!PtrI.isAborted()); + if (PtrI.isEscaped()) { + // isEscaped really means potentially modified before CoroBegin. if (A->isArrayAllocation()) report_fatal_error( "Coroutines cannot handle copying of array allocas yet"); @@ -964,6 +1018,20 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) { auto *Value = Builder.CreateLoad(A->getAllocatedType(), A); Builder.CreateStore(Value, G); } + // For each alias of the alloca created before CoroBegin but used after + // CoroBegin, we recreate it after CoroBegin by applying the offset + // to the pointer in the frame. + for (const auto &Alias : Visitor.getAliases()) { + auto *FramePtr = GetFramePointer(P.second, A); + auto *FramePtrRaw = + Builder.CreateBitCast(FramePtr, Type::getInt8PtrTy(C)); + auto *AliasPtr = Builder.CreateGEP( + FramePtrRaw, ConstantInt::get(Type::getInt64Ty(C), Alias.second)); + auto *AliasPtrTyped = + Builder.CreateBitCast(AliasPtr, Alias.first->getType()); + Alias.first->replaceUsesWithIf( + AliasPtrTyped, [&](Use &U) { return DT.dominates(CB, U); }); + } } } return FramePtr; diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 9c4392e7999b6..ad93ae7cf1aca 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -1563,6 +1563,42 @@ static void createDevirtTriggerFunc(CallGraph &CG, CallGraphSCC &SCC) { SCC.initialize(Nodes); } +/// Replace a call to llvm.coro.prepare.retcon. +static void replacePrepare(CallInst *Prepare, LazyCallGraph &CG, + LazyCallGraph::SCC &C) { + auto CastFn = Prepare->getArgOperand(0); // as an i8* + auto Fn = CastFn->stripPointerCasts(); // as its original type + + // Attempt to peephole this pattern: + // %0 = bitcast [[TYPE]] @some_function to i8* + // %1 = call @llvm.coro.prepare.retcon(i8* %0) + // %2 = bitcast %1 to [[TYPE]] + // ==> + // %2 = @some_function + for (auto UI = Prepare->use_begin(), UE = Prepare->use_end(); UI != UE;) { + // Look for bitcasts back to the original function type. + auto *Cast = dyn_cast((UI++)->getUser()); + if (!Cast || Cast->getType() != Fn->getType()) + continue; + + // Replace and remove the cast. + Cast->replaceAllUsesWith(Fn); + Cast->eraseFromParent(); + } + + // Replace any remaining uses with the function as an i8*. + // This can never directly be a callee, so we don't need to update CG. + Prepare->replaceAllUsesWith(CastFn); + Prepare->eraseFromParent(); + + // Kill dead bitcasts. + while (auto *Cast = dyn_cast(CastFn)) { + if (!Cast->use_empty()) + break; + CastFn = Cast->getOperand(0); + Cast->eraseFromParent(); + } +} /// Replace a call to llvm.coro.prepare.retcon.
static void replacePrepare(CallInst *Prepare, CallGraph &CG) { auto CastFn = Prepare->getArgOperand(0); // as an i8* @@ -1618,6 +1654,19 @@ static void replacePrepare(CallInst *Prepare, CallGraph &CG) { } } +static bool replaceAllPrepares(Function *PrepareFn, LazyCallGraph &CG, + LazyCallGraph::SCC &C) { + bool Changed = false; + for (auto PI = PrepareFn->use_begin(), PE = PrepareFn->use_end(); PI != PE;) { + // Intrinsics can only be used in calls. + auto *Prepare = cast((PI++)->getUser()); + replacePrepare(Prepare, CG, C); + Changed = true; + } + + return Changed; +} + /// Remove calls to llvm.coro.prepare.retcon, a barrier meant to prevent /// IPO from operating on calls to a retcon coroutine before it's been /// split. This is only safe to do after we've split all retcon @@ -1656,7 +1705,7 @@ PreservedAnalyses CoroSplitPass::run(LazyCallGraph::SCC &C, return PreservedAnalyses::all(); // Check for uses of llvm.coro.prepare.retcon. - const auto *PrepareFn = M.getFunction("llvm.coro.prepare.retcon"); + auto *PrepareFn = M.getFunction("llvm.coro.prepare.retcon"); if (PrepareFn && PrepareFn->use_empty()) PrepareFn = nullptr; @@ -1670,8 +1719,7 @@ PreservedAnalyses CoroSplitPass::run(LazyCallGraph::SCC &C, return PreservedAnalyses::all(); if (Coroutines.empty()) - llvm_unreachable("new pass manager cannot yet handle " - "'llvm.coro.prepare.retcon'"); + replaceAllPrepares(PrepareFn, CG, C); // Split all the coroutines. for (LazyCallGraph::Node *N : Coroutines) { @@ -1704,8 +1752,7 @@ PreservedAnalyses CoroSplitPass::run(LazyCallGraph::SCC &C, } if (PrepareFn) - llvm_unreachable("new pass manager cannot yet handle " - "'llvm.coro.prepare.retcon'"); + replaceAllPrepares(PrepareFn, CG, C); return PreservedAnalyses::none(); } diff --git a/llvm/lib/Transforms/HelloNew/CMakeLists.txt b/llvm/lib/Transforms/HelloNew/CMakeLists.txt new file mode 100644 index 0000000000000..a7a1a5b93b062 --- /dev/null +++ b/llvm/lib/Transforms/HelloNew/CMakeLists.txt @@ -0,0 +1,6 @@ +add_llvm_component_library(LLVMHelloNew + HelloWorld.cpp + + DEPENDS + intrinsics_gen + ) diff --git a/llvm/lib/Transforms/HelloNew/HelloWorld.cpp b/llvm/lib/Transforms/HelloNew/HelloWorld.cpp new file mode 100644 index 0000000000000..dea94f8a8f627 --- /dev/null +++ b/llvm/lib/Transforms/HelloNew/HelloWorld.cpp @@ -0,0 +1,17 @@ +//===-- HelloWorld.cpp - Example Transformations --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/HelloNew/HelloWorld.h" + +using namespace llvm; + +PreservedAnalyses HelloWorldPass::run(Function &F, + FunctionAnalysisManager &AM) { + errs() << F.getName() << "\n"; + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Transforms/HelloNew/LLVMBuild.txt b/llvm/lib/Transforms/HelloNew/LLVMBuild.txt new file mode 100644 index 0000000000000..06d3c81333b78 --- /dev/null +++ b/llvm/lib/Transforms/HelloNew/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/Transforms/HelloNew/LLVMBuild.txt ------------------*- Conf -*--===; +; +; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +; See https://llvm.org/LICENSE.txt for license information. 
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = HelloNew +parent = Transforms +library_name = HelloNew +required_libraries = Core Support diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index d511ad2729abc..348717ec5618a 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -215,9 +215,11 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, Function *NF = Function::Create(NFTy, F->getLinkage(), F->getAddressSpace(), F->getName()); NF->copyAttributesFrom(F); + NF->copyMetadata(F, 0); - // Patch the pointer to LLVM function in debug info descriptor. - NF->setSubprogram(F->getSubprogram()); + // The new function will have the !dbg metadata copied from the original + // function. The original function may not be deleted, and dbg metadata needs + // to be unique, so we need to drop it. F->setSubprogram(nullptr); LLVM_DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n" diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index ea285b51982c1..9927bca995552 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -73,6 +73,14 @@ static cl::opt MaxFixpointIterations("attributor-max-iterations", cl::Hidden, cl::desc("Maximal number of fixpoint iterations."), cl::init(32)); + +static cl::opt MaxInitializationChainLengthX( + "attributor-max-initialization-chain-length", cl::Hidden, + cl::desc( + "Maximal number of chained initializations (to avoid stack overflows)"), + cl::location(MaxInitializationChainLength), cl::init(1024)); +unsigned llvm::MaxInitializationChainLength; + static cl::opt VerifyMaxFixpointIterations( "attributor-max-iterations-verify", cl::Hidden, cl::desc("Verify that max-iterations is a tight bound for a fixpoint"), @@ -132,11 +140,11 @@ static cl::opt PrintDependencies("attributor-print-dep", cl::Hidden, /// Logic operators for the change status enum class. /// ///{ -ChangeStatus llvm::operator|(ChangeStatus l, ChangeStatus r) { - return l == ChangeStatus::CHANGED ? l : r; +ChangeStatus llvm::operator|(ChangeStatus L, ChangeStatus R) { + return L == ChangeStatus::CHANGED ? L : R; } -ChangeStatus llvm::operator&(ChangeStatus l, ChangeStatus r) { - return l == ChangeStatus::UNCHANGED ? l : r; +ChangeStatus llvm::operator&(ChangeStatus L, ChangeStatus R) { + return L == ChangeStatus::UNCHANGED ? L : R; } ///} @@ -189,7 +197,7 @@ Argument *IRPosition::getAssociatedArgument() const { // Not an Argument and no argument number means this is not a call site // argument, thus we cannot find a callback argument to return. - int ArgNo = getArgNo(); + int ArgNo = getCallSiteArgNo(); if (ArgNo < 0) return nullptr; @@ -317,6 +325,13 @@ const IRPosition SubsumingPositionIterator::SubsumingPositionIterator(const IRPosition &IRP) { IRPositions.emplace_back(IRP); + // Helper to determine if operand bundles on a call site are benign or + // potentially problematic. We handle only llvm.assume for now.
+ auto CanIgnoreOperandBundles = [](const CallBase &CB) { + return (isa(CB) && + cast(CB).getIntrinsicID() == Intrinsic ::assume); + }; + const auto *CB = dyn_cast(&IRP.getAnchorValue()); switch (IRP.getPositionKind()) { case IRPosition::IRP_INVALID: @@ -331,7 +346,7 @@ SubsumingPositionIterator::SubsumingPositionIterator(const IRPosition &IRP) { assert(CB && "Expected call site!"); // TODO: We need to look at the operand bundles similar to the redirection // in CallBase. - if (!CB->hasOperandBundles()) + if (!CB->hasOperandBundles() || CanIgnoreOperandBundles(*CB)) if (const Function *Callee = CB->getCalledFunction()) IRPositions.emplace_back(IRPosition::function(*Callee)); return; @@ -339,7 +354,7 @@ SubsumingPositionIterator::SubsumingPositionIterator(const IRPosition &IRP) { assert(CB && "Expected call site!"); // TODO: We need to look at the operand bundles similar to the redirection // in CallBase. - if (!CB->hasOperandBundles()) { + if (!CB->hasOperandBundles() || CanIgnoreOperandBundles(*CB)) { if (const Function *Callee = CB->getCalledFunction()) { IRPositions.emplace_back(IRPosition::returned(*Callee)); IRPositions.emplace_back(IRPosition::function(*Callee)); @@ -356,17 +371,17 @@ SubsumingPositionIterator::SubsumingPositionIterator(const IRPosition &IRP) { IRPositions.emplace_back(IRPosition::callsite_function(*CB)); return; case IRPosition::IRP_CALL_SITE_ARGUMENT: { - int ArgNo = IRP.getArgNo(); - assert(CB && ArgNo >= 0 && "Expected call site!"); + assert(CB && "Expected call site!"); // TODO: We need to look at the operand bundles similar to the redirection // in CallBase. - if (!CB->hasOperandBundles()) { + if (!CB->hasOperandBundles() || CanIgnoreOperandBundles(*CB)) { const Function *Callee = CB->getCalledFunction(); - if (Callee && Callee->arg_size() > unsigned(ArgNo)) - IRPositions.emplace_back(IRPosition::argument(*Callee->getArg(ArgNo))); - if (Callee) + if (Callee) { + if (Argument *Arg = IRP.getAssociatedArgument()) + IRPositions.emplace_back(IRPosition::argument(*Arg)); IRPositions.emplace_back(IRPosition::function(*Callee)); } + } IRPositions.emplace_back(IRPosition::value(IRP.getAssociatedValue())); return; } @@ -503,7 +518,7 @@ void IRPosition::verify() { "Expected call base argument operand for a 'call site argument' " "position"); assert(cast(U->getUser())->getArgOperandNo(U) == - unsigned(getArgNo()) && + unsigned(getCallSiteArgNo()) && "Argument number mismatch!"); assert(U->get() == &getAssociatedValue() && "Associated value mismatch!"); return; @@ -1306,9 +1321,27 @@ ChangeStatus Attributor::cleanupIR() { CGUpdater.removeFunction(*Fn); } + if (!ToBeChangedUses.empty()) + ManifestChange = ChangeStatus::CHANGED; + + if (!ToBeChangedToUnreachableInsts.empty()) + ManifestChange = ChangeStatus::CHANGED; + if (!ToBeDeletedFunctions.empty()) ManifestChange = ChangeStatus::CHANGED; + if (!ToBeDeletedBlocks.empty()) + ManifestChange = ChangeStatus::CHANGED; + + if (!ToBeDeletedInsts.empty()) + ManifestChange = ChangeStatus::CHANGED; + + if (!InvokeWithDeadSuccessor.empty()) + ManifestChange = ChangeStatus::CHANGED; + + if (!DeadInsts.empty()) + ManifestChange = ChangeStatus::CHANGED; + NumFnDeleted += ToBeDeletedFunctions.size(); LLVM_DEBUG(dbgs() << "[Attributor] Deleted " << NumFnDeleted @@ -1431,7 +1464,7 @@ static void createShallowWrapper(Function &F) { BasicBlock *EntryBB = BasicBlock::Create(Ctx, "entry", Wrapper); SmallVector Args; - auto FArgIt = F.arg_begin(); + Argument *FArgIt = F.arg_begin(); for (Argument &Arg : Wrapper->args()) { Args.push_back(&Arg); 
Arg.setName((FArgIt++)->getName()); @@ -1463,9 +1496,8 @@ static Function *internalizeFunction(Function &F) { FunctionType *FnTy = F.getFunctionType(); // create a copy of the current function - Function *Copied = - Function::Create(FnTy, GlobalValue::PrivateLinkage, F.getAddressSpace(), - F.getName() + ".internalized"); + Function *Copied = Function::Create(FnTy, F.getLinkage(), F.getAddressSpace(), + F.getName() + ".internalized"); ValueToValueMapTy VMap; auto *NewFArgIt = Copied->arg_begin(); for (auto &Arg : F.args()) { @@ -1478,6 +1510,11 @@ static Function *internalizeFunction(Function &F) { // Copy the body of the original function to the new one CloneFunctionInto(Copied, &F, VMap, /* ModuleLevelChanges */ false, Returns); + // Set the linkage and visibility late as CloneFunctionInto has some implicit + // requirements. + Copied->setVisibility(GlobalValue::DefaultVisibility); + Copied->setLinkage(GlobalValue::PrivateLinkage); + // Copy metadata SmallVector, 1> MDs; F.getAllMetadata(MDs); @@ -1755,8 +1792,8 @@ ChangeStatus Attributor::rewriteFunctionSignatures( assert(Success && "Assumed call site replacement to succeed!"); // Rewire the arguments. - auto OldFnArgIt = OldFn->arg_begin(); - auto NewFnArgIt = NewFn->arg_begin(); + Argument *OldFnArgIt = OldFn->arg_begin(); + Argument *NewFnArgIt = NewFn->arg_begin(); for (unsigned OldArgNum = 0; OldArgNum < ARIs.size(); ++OldArgNum, ++OldFnArgIt) { if (const std::unique_ptr &ARI = @@ -2152,7 +2189,8 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, IRPosition::Kind AP) { raw_ostream &llvm::operator<<(raw_ostream &OS, const IRPosition &Pos) { const Value &AV = Pos.getAssociatedValue(); return OS << "{" << Pos.getPositionKind() << ":" << AV.getName() << " [" - << Pos.getAnchorValue().getName() << "@" << Pos.getArgNo() << "]}"; + << Pos.getAnchorValue().getName() << "@" << Pos.getCallSiteArgNo() + << "]}"; } raw_ostream &llvm::operator<<(raw_ostream &OS, const IntegerRangeState &S) { diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index b76e83def6e80..7bec970597038 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -500,7 +500,7 @@ static void clampCallSiteArgumentStates(Attributor &A, const AAType &QueryingAA, Optional T; // The argument number which is also the call site argument number. - unsigned ArgNo = QueryingAA.getIRPosition().getArgNo(); + unsigned ArgNo = QueryingAA.getIRPosition().getCallSiteArgNo(); auto CallSiteCheck = [&](AbstractCallSite ACS) { const IRPosition &ACSArgPos = IRPosition::callsite_argument(ACS, ArgNo); @@ -736,7 +736,7 @@ struct AANoUnwindCallSite final : AANoUnwindImpl { void initialize(Attributor &A) override { AANoUnwindImpl::initialize(A); Function *F = getAssociatedFunction(); - if (!F) + if (!F || F->isDeclaration()) indicatePessimisticFixpoint(); } @@ -795,7 +795,7 @@ class AAReturnedValuesImpl : public AAReturnedValues, public AbstractState { ReturnedValues.clear(); Function *F = getAssociatedFunction(); - if (!F) { + if (!F || F->isDeclaration()) { indicatePessimisticFixpoint(); return; } @@ -1141,11 +1141,13 @@ ChangeStatus AAReturnedValuesImpl::updateImpl(Attributor &A) { RVState RVS({NewRVsMap, Unused, RetValAAIt.second}); VisitReturnedValue(*CB->getArgOperand(Arg->getArgNo()), RVS, CB); continue; - } else if (isa(RetVal)) { + } + if (isa(RetVal)) { // Call sites are resolved by the callee attribute over time, no need to // do anything for us.
continue; - } else if (isa(RetVal)) { + } + if (isa(RetVal)) { // Constants are valid everywhere, we can simply take them. NewRVsMap[RetVal].insert(RIs.begin(), RIs.end()); continue; @@ -1386,7 +1388,7 @@ struct AANoSyncCallSite final : AANoSyncImpl { void initialize(Attributor &A) override { AANoSyncImpl::initialize(A); Function *F = getAssociatedFunction(); - if (!F) + if (!F || F->isDeclaration()) indicatePessimisticFixpoint(); } @@ -1451,7 +1453,7 @@ struct AANoFreeCallSite final : AANoFreeImpl { void initialize(Attributor &A) override { AANoFreeImpl::initialize(A); Function *F = getAssociatedFunction(); - if (!F) + if (!F || F->isDeclaration()) indicatePessimisticFixpoint(); } @@ -1898,7 +1900,7 @@ struct AANoRecurseCallSite final : AANoRecurseImpl { void initialize(Attributor &A) override { AANoRecurseImpl::initialize(A); Function *F = getAssociatedFunction(); - if (!F) + if (!F || F->isDeclaration()) indicatePessimisticFixpoint(); } @@ -2274,7 +2276,7 @@ struct AAWillReturnImpl : public AAWillReturn { AAWillReturn::initialize(A); Function *F = getAnchorScope(); - if (!F || !A.isFunctionIPOAmendable(*F) || mayContainUnboundedCycle(*F, A)) + if (!F || F->isDeclaration() || mayContainUnboundedCycle(*F, A)) indicatePessimisticFixpoint(); } @@ -2318,9 +2320,9 @@ struct AAWillReturnCallSite final : AAWillReturnImpl { /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { - AAWillReturnImpl::initialize(A); + AAWillReturn::initialize(A); Function *F = getAssociatedFunction(); - if (!F) + if (!F || !A.isFunctionIPOAmendable(*F)) indicatePessimisticFixpoint(); } @@ -2493,7 +2495,7 @@ struct AANoAliasCallSiteArgument final : AANoAliasImpl { void initialize(Attributor &A) override { // See callsite argument attribute and callee argument attribute. const auto &CB = cast(getAnchorValue()); - if (CB.paramHasAttr(getArgNo(), Attribute::NoAlias)) + if (CB.paramHasAttr(getCallSiteArgNo(), Attribute::NoAlias)) indicateOptimisticFixpoint(); Value &Val = getAssociatedValue(); if (isa(Val) && @@ -2508,7 +2510,7 @@ struct AANoAliasCallSiteArgument final : AANoAliasImpl { const AAMemoryBehavior &MemBehaviorAA, const CallBase &CB, unsigned OtherArgNo) { // We do not need to worry about aliasing with the underlying IRP. - if (this->getArgNo() == (int)OtherArgNo) + if (this->getCalleeArgNo() == (int)OtherArgNo) return false; // If it is not a pointer or pointer vector we do not alias. @@ -2673,6 +2675,14 @@ struct AANoAliasReturned final : AANoAliasImpl { AANoAliasReturned(const IRPosition &IRP, Attributor &A) : AANoAliasImpl(IRP, A) {} + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoAliasImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F || F->isDeclaration()) + indicatePessimisticFixpoint(); + } + /// See AbstractAttribute::updateImpl(...). virtual ChangeStatus updateImpl(Attributor &A) override { @@ -2714,7 +2724,7 @@ struct AANoAliasCallSiteReturned final : AANoAliasImpl { void initialize(Attributor &A) override { AANoAliasImpl::initialize(A); Function *F = getAssociatedFunction(); - if (!F) + if (!F || F->isDeclaration()) indicatePessimisticFixpoint(); } @@ -2923,7 +2933,7 @@ struct AAIsDeadCallSiteArgument : public AAIsDeadValueImpl { /// See AbstractAttribute::manifest(...). 
ChangeStatus manifest(Attributor &A) override { CallBase &CB = cast(getAnchorValue()); - Use &U = CB.getArgOperandUse(getArgNo()); + Use &U = CB.getArgOperandUse(getCallSiteArgNo()); assert(!isa(U.get()) && "Expected undef values to be filtered out!"); UndefValue &UV = *UndefValue::get(U->getType()); @@ -3863,8 +3873,16 @@ struct AAAlignFloating : AAAlignImpl { /// Align attribute for function return value. struct AAAlignReturned final : AAReturnedFromReturnedValues { - AAAlignReturned(const IRPosition &IRP, Attributor &A) - : AAReturnedFromReturnedValues(IRP, A) {} + using Base = AAReturnedFromReturnedValues; + AAAlignReturned(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + Base::initialize(A); + Function *F = getAssociatedFunction(); + if (!F || F->isDeclaration()) + indicatePessimisticFixpoint(); + } /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(aligned) } @@ -3938,7 +3956,7 @@ struct AAAlignCallSiteReturned final void initialize(Attributor &A) override { Base::initialize(A); Function *F = getAssociatedFunction(); - if (!F) + if (!F || F->isDeclaration()) indicatePessimisticFixpoint(); } @@ -3954,7 +3972,7 @@ struct AANoReturnImpl : public AANoReturn { void initialize(Attributor &A) override { AANoReturn::initialize(A); Function *F = getAssociatedFunction(); - if (!F) + if (!F || F->isDeclaration()) indicatePessimisticFixpoint(); } @@ -4028,7 +4046,7 @@ struct AANoCaptureImpl : public AANoCapture { return; } - const Function *F = getArgNo() >= 0 ? getAssociatedFunction() : AnchorScope; + const Function *F = isArgumentPosition() ? getAssociatedFunction() : AnchorScope; // Check what state the associated function can actually capture. if (F) @@ -4047,7 +4065,7 @@ struct AANoCaptureImpl : public AANoCapture { if (!isAssumedNoCaptureMaybeReturned()) return; - if (getArgNo() >= 0) { + if (isArgumentPosition()) { if (isAssumedNoCapture()) Attrs.emplace_back(Attribute::get(Ctx, Attribute::NoCapture)); else if (ManifestInternal) @@ -4083,7 +4101,7 @@ struct AANoCaptureImpl : public AANoCapture { State.addKnownBits(NOT_CAPTURED_IN_RET); // Check existing "returned" attributes. - int ArgNo = IRP.getArgNo(); + int ArgNo = IRP.getCalleeArgNo(); if (F.doesNotThrow() && ArgNo >= 0) { for (unsigned u = 0, e = F.arg_size(); u < e; ++u) if (F.hasParamAttribute(u, Attribute::Returned)) { @@ -4260,12 +4278,12 @@ struct AACaptureUseTracker final : public CaptureTracker { ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) { const IRPosition &IRP = getIRPosition(); const Value *V = - getArgNo() >= 0 ? IRP.getAssociatedArgument() : &IRP.getAssociatedValue(); + isArgumentPosition() ? IRP.getAssociatedArgument() : &IRP.getAssociatedValue(); if (!V) return indicatePessimisticFixpoint(); const Function *F = - getArgNo() >= 0 ? IRP.getAssociatedFunction() : IRP.getAnchorScope(); + isArgumentPosition() ? 
IRP.getAssociatedFunction() : IRP.getAnchorScope(); assert(F && "Expected a function!"); const IRPosition &FnPos = IRPosition::function(*F); const auto &IsDeadAA = @@ -4611,7 +4629,7 @@ struct AAValueSimplifyArgument final : AAValueSimplifyImpl { auto PredForCallSite = [&](AbstractCallSite ACS) { const IRPosition &ACSArgPos = - IRPosition::callsite_argument(ACS, getArgNo()); + IRPosition::callsite_argument(ACS, getCallSiteArgNo()); // Check if a corresponding argument was found or if it is not // associated (which can happen for callback calls). if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID) @@ -4892,7 +4910,8 @@ struct AAValueSimplifyCallSiteArgument : AAValueSimplifyFloating { ? dyn_cast(SimplifiedAssociatedValue.getValue()) : UndefValue::get(V.getType()); if (C) { - Use &U = cast(&getAnchorValue())->getArgOperandUse(getArgNo()); + Use &U = cast(&getAnchorValue()) + ->getArgOperandUse(getCallSiteArgNo()); // We can replace the AssociatedValue with the constant. if (&V != C && V.getType() == C->getType()) { if (A.changeUseAfterManifest(U, *C)) @@ -5211,7 +5230,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { return getAssociatedValue().getType()->getPointerElementType(); Optional Ty; - unsigned ArgNo = getIRPosition().getArgNo(); + unsigned ArgNo = getIRPosition().getCallSiteArgNo(); // Make sure the associated call site argument has the same type at all call // sites and it is an allocation we know is safe to privatize, for now that @@ -5747,7 +5766,7 @@ struct AAMemoryBehaviorImpl : public AAMemoryBehavior { void initialize(Attributor &A) override { intersectAssumedBits(BEST_STATE); getKnownStateFromValue(getIRPosition(), getState()); - IRAttribute::initialize(A); + AAMemoryBehavior::initialize(A); } /// Return the memory behavior information encoded in the IR for \p IRP. @@ -5933,14 +5952,21 @@ struct AAMemoryBehaviorCallSiteArgument final : AAMemoryBehaviorArgument { /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { - if (Argument *Arg = getAssociatedArgument()) { - if (Arg->hasByValAttr()) { - addKnownBits(NO_WRITES); - removeKnownBits(NO_READS); - removeAssumedBits(NO_READS); - } + // If we don't have an associated argument this is either a variadic call + // or an indirect call; either way, nothing to do here. + Argument *Arg = getAssociatedArgument(); + if (!Arg) { + indicatePessimisticFixpoint(); + return; + } + if (Arg->hasByValAttr()) { + addKnownBits(NO_WRITES); + removeKnownBits(NO_READS); + removeAssumedBits(NO_READS); } AAMemoryBehaviorArgument::initialize(A); + if (getAssociatedFunction()->isDeclaration()) + indicatePessimisticFixpoint(); } /// See AbstractAttribute::updateImpl(...). @@ -5971,6 +5997,14 @@ struct AAMemoryBehaviorCallSiteReturned final : AAMemoryBehaviorFloating { AAMemoryBehaviorCallSiteReturned(const IRPosition &IRP, Attributor &A) : AAMemoryBehaviorFloating(IRP, A) {} + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AAMemoryBehaviorImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F || F->isDeclaration()) + indicatePessimisticFixpoint(); + } + /// See AbstractAttribute::manifest(...). ChangeStatus manifest(Attributor &A) override { // We do not annotate returned values.
@@ -6020,10 +6054,8 @@ struct AAMemoryBehaviorCallSite final : AAMemoryBehaviorImpl { void initialize(Attributor &A) override { AAMemoryBehaviorImpl::initialize(A); Function *F = getAssociatedFunction(); - if (!F || !A.isFunctionIPOAmendable(*F)) { + if (!F || F->isDeclaration()) indicatePessimisticFixpoint(); - return; - } } /// See AbstractAttribute::updateImpl(...). @@ -6300,7 +6332,7 @@ struct AAMemoryLocationImpl : public AAMemoryLocation { void initialize(Attributor &A) override { intersectAssumedBits(BEST_STATE); getKnownStateFromValue(A, getIRPosition(), getState()); - IRAttribute::initialize(A); + AAMemoryLocation::initialize(A); } /// Return the memory behavior information encoded in the IR for \p IRP. @@ -6763,10 +6795,8 @@ struct AAMemoryLocationCallSite final : AAMemoryLocationImpl { void initialize(Attributor &A) override { AAMemoryLocationImpl::initialize(A); Function *F = getAssociatedFunction(); - if (!F || !A.isFunctionIPOAmendable(*F)) { + if (!F || F->isDeclaration()) indicatePessimisticFixpoint(); - return; - } } /// See AbstractAttribute::updateImpl(...). diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp index e1dc036ae413c..a185e964d1b63 100644 --- a/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -226,10 +226,13 @@ struct PartialInlinerImpl { // multi-region outlining. FunctionCloner(Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE, - function_ref LookupAC); + function_ref LookupAC, + function_ref GetTTI); FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI, OptimizationRemarkEmitter &ORE, - function_ref LookupAC); + function_ref LookupAC, + function_ref GetTTI); + ~FunctionCloner(); // Prepare for function outlining: making sure there is only @@ -266,6 +269,7 @@ struct PartialInlinerImpl { std::unique_ptr ClonedFuncBFI = nullptr; OptimizationRemarkEmitter &ORE; function_ref LookupAC; + function_ref GetTTI; }; private: @@ -334,7 +338,7 @@ struct PartialInlinerImpl { // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to // approximate both the size and runtime cost (Note that in the current // inline cost analysis, there is no clear distinction there either). - static int computeBBInlineCost(BasicBlock *BB); + static int computeBBInlineCost(BasicBlock *BB, TargetTransformInfo *TTI); std::unique_ptr computeOutliningInfo(Function *F); std::unique_ptr @@ -448,9 +452,10 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F, // Use the same computeBBInlineCost function to compute the cost savings of // outlining the candidate region. + TargetTransformInfo *FTTI = &GetTTI(*F); int OverallFunctionCost = 0; for (auto &BB : *F) - OverallFunctionCost += computeBBInlineCost(&BB); + OverallFunctionCost += computeBBInlineCost(&BB, FTTI); #ifndef NDEBUG if (TracePartialInlining) @@ -509,7 +514,7 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F, continue; int OutlineRegionCost = 0; for (auto *BB : DominateVector) - OutlineRegionCost += computeBBInlineCost(BB); + OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent())); #ifndef NDEBUG if (TracePartialInlining) @@ -843,7 +848,8 @@ bool PartialInlinerImpl::shouldPartialInline( // TODO: Ideally we should share Inliner's InlineCost Analysis code. // For now use a simplified version. The returned 'InlineCost' will be used // to estimate the size cost as well as runtime cost of the BB.
-int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) { +int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB, + TargetTransformInfo *TTI) { int InlineCost = 0; const DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); for (Instruction &I : BB->instructionsWithoutDebug()) { @@ -866,6 +872,21 @@ int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) { if (I.isLifetimeStartOrEnd()) continue; + if (auto *II = dyn_cast(&I)) { + Intrinsic::ID IID = II->getIntrinsicID(); + SmallVector Tys; + FastMathFlags FMF; + for (Value *Val : II->args()) + Tys.push_back(Val->getType()); + + if (auto *FPMO = dyn_cast(II)) + FMF = FPMO->getFastMathFlags(); + + IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF); + InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency); + continue; + } + if (CallInst *CI = dyn_cast(&I)) { InlineCost += getCallsiteCost(*CI, DL); continue; @@ -893,11 +914,13 @@ PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) { BasicBlock* OutliningCallBB = FuncBBPair.second; // Now compute the cost of the call sequence to the outlined function // 'OutlinedFunction' in BB 'OutliningCallBB': - OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB); + auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc); + OutliningFuncCallCost += + computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI); // Now compute the cost of the extracted/outlined function itself: for (BasicBlock &BB : *OutlinedFunc) - OutlinedFunctionCost += computeBBInlineCost(&BB); + OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI); } assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost && "Outlined function cost should be no less than the outlined region"); @@ -962,8 +985,9 @@ void PartialInlinerImpl::computeCallsiteToProfCountMap( PartialInlinerImpl::FunctionCloner::FunctionCloner( Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE, - function_ref LookupAC) - : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) { + function_ref LookupAC, + function_ref GetTTI) + : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) { ClonedOI = std::make_unique(); // Clone the function, so that we can hack away on it. @@ -987,8 +1011,9 @@ PartialInlinerImpl::FunctionCloner::FunctionCloner( PartialInlinerImpl::FunctionCloner::FunctionCloner( Function *F, FunctionOutliningMultiRegionInfo *OI, OptimizationRemarkEmitter &ORE, - function_ref LookupAC) - : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) { + function_ref LookupAC, + function_ref GetTTI) + : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) { ClonedOMRI = std::make_unique(); // Clone the function, so that we can hack away on it. @@ -1099,10 +1124,10 @@ void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() { bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { - auto ComputeRegionCost = [](SmallVectorImpl &Region) { + auto ComputeRegionCost = [&](SmallVectorImpl &Region) { int Cost = 0; for (BasicBlock* BB : Region) - Cost += computeBBInlineCost(BB); + Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent())); return Cost; }; @@ -1196,9 +1221,10 @@ PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() { // Gather up the blocks that we're going to extract. 
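// Note that the block costs gathered below come from the TTI-aware // computeBBInlineCost, which now prices intrinsic calls through // getIntrinsicInstrCost with TCK_SizeAndLatency rather than the generic // call-site estimate.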
std::vector ToExtract; + auto *ClonedFuncTTI = &GetTTI(*ClonedFunc); ToExtract.push_back(ClonedOI->NonReturnBlock); - OutlinedRegionCost += - PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock); + OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost( + ClonedOI->NonReturnBlock, ClonedFuncTTI); for (BasicBlock &BB : *ClonedFunc) if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) { ToExtract.push_back(&BB); @@ -1206,7 +1232,7 @@ PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() { // into the outlined function which may make the outlining // overhead (the difference of the outlined function cost // and OutliningRegionCost) look larger. - OutlinedRegionCost += computeBBInlineCost(&BB); + OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI); } // Extract the body of the if. @@ -1276,7 +1302,7 @@ std::pair PartialInlinerImpl::unswitchFunction(Function *F) { std::unique_ptr OMRI = computeOutliningColdRegionsInfo(F, ORE); if (OMRI) { - FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache); + FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI); #ifndef NDEBUG if (TracePartialInlining) { @@ -1309,7 +1335,7 @@ std::pair PartialInlinerImpl::unswitchFunction(Function *F) { if (!OI) return {false, nullptr}; - FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache); + FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache, GetTTI); Cloner.NormalizeReturnBlock(); Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining(); diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 50e87f0ab684f..396165da690c1 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -157,6 +157,11 @@ cl::opt EnableMatrix( "enable-matrix", cl::init(false), cl::Hidden, cl::desc("Enable lowering of the matrix intrinsics")); +cl::opt EnableConstraintElimination( + "enable-constraint-elimination", cl::init(false), cl::Hidden, + cl::desc( + "Enable pass to eliminate conditions based on linear constraints.")); + cl::opt AttributorRun( "attributor-enable", cl::Hidden, cl::init(AttributorRunOption::NONE), cl::desc("Enable the attributor inter-procedural deduction pass."), @@ -385,6 +390,9 @@ void PassManagerBuilder::addFunctionSimplificationPasses( } } + if (EnableConstraintElimination) + MPM.add(createConstraintEliminationPass()); + if (OptLevel > 1) { // Speculative execution if the target has divergent branches; otherwise nop. MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass()); @@ -793,10 +801,13 @@ void PassManagerBuilder::populateModulePassManager( // convert to more optimized IR using more aggressive simplify CFG options. // The extra sinking transform can create larger basic blocks, so do this // before SLP vectorization. + // FIXME: study whether hoisting and/or sinking of common instructions should + // be delayed until after SLP vectorizer. MPM.add(createCFGSimplificationPass(SimplifyCFGOptions() .forwardSwitchCondToPhi(true) .convertSwitchToLookupTable(true) .needCanonicalLoops(false) + .hoistCommonInsts(true) .sinkCommonInsts(true))); if (SLPVectorize) { @@ -1004,7 +1015,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // The IPO passes may leave cruft around. Clean up after them. 
PM.add(createInstructionCombiningPass()); addExtensionsToPM(EP_Peephole, PM); - PM.add(createJumpThreadingPass()); + PM.add(createJumpThreadingPass(/*FreezeSelectCond*/ true)); // Break up allocas PM.add(createSROAPass()); @@ -1067,7 +1078,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createInstructionCombiningPass()); addExtensionsToPM(EP_Peephole, PM); - PM.add(createJumpThreadingPass()); + PM.add(createJumpThreadingPass(/*FreezeSelectCond*/ true)); } void PassManagerBuilder::addLateLTOOptimizationPasses( diff --git a/llvm/lib/Transforms/IPO/PruneEH.cpp b/llvm/lib/Transforms/IPO/PruneEH.cpp index a16dc664db64d..3f3b18771cd5f 100644 --- a/llvm/lib/Transforms/IPO/PruneEH.cpp +++ b/llvm/lib/Transforms/IPO/PruneEH.cpp @@ -13,6 +13,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CallGraph.h" @@ -27,8 +28,10 @@ #include "llvm/InitializePasses.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/CallGraphUpdater.h" #include "llvm/Transforms/Utils/Local.h" #include + using namespace llvm; #define DEBUG_TYPE "prune-eh" @@ -45,11 +48,10 @@ namespace { // runOnSCC - Analyze the SCC, performing the transformation if possible. bool runOnSCC(CallGraphSCC &SCC) override; - }; } -static bool SimplifyFunction(Function *F, CallGraph &CG); -static void DeleteBasicBlock(BasicBlock *BB, CallGraph &CG); +static bool SimplifyFunction(Function *F, CallGraphUpdater &CGU); +static void DeleteBasicBlock(BasicBlock *BB, CallGraphUpdater &CGU); char PruneEH::ID = 0; INITIALIZE_PASS_BEGIN(PruneEH, "prune-eh", @@ -60,20 +62,17 @@ INITIALIZE_PASS_END(PruneEH, "prune-eh", Pass *llvm::createPruneEHPass() { return new PruneEH(); } -static bool runImpl(CallGraphSCC &SCC, CallGraph &CG) { - SmallPtrSet SCCNodes; +static bool runImpl(CallGraphUpdater &CGU, SetVector &Functions) { +#ifndef NDEBUG + for (auto *F : Functions) + assert(F && "null Function"); +#endif bool MadeChange = false; - // Fill SCCNodes with the elements of the SCC. Used for quickly - // looking up whether a given CallGraphNode is in this SCC. - for (CallGraphNode *I : SCC) - SCCNodes.insert(I); - // First pass, scan all of the functions in the SCC, simplifying them // according to what we know. - for (CallGraphNode *I : SCC) - if (Function *F = I->getFunction()) - MadeChange |= SimplifyFunction(F, CG); + for (Function *F : Functions) + MadeChange |= SimplifyFunction(F, CGU); // Next, check to see if any callees might throw or if there are any external // functions in this SCC: if so, we cannot prune any functions in this SCC. @@ -83,13 +82,8 @@ static bool runImpl(CallGraphSCC &SCC, CallGraph &CG) { // obviously the SCC might throw. 
// bool SCCMightUnwind = false, SCCMightReturn = false; - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); - (!SCCMightUnwind || !SCCMightReturn) && I != E; ++I) { - Function *F = (*I)->getFunction(); - if (!F) { - SCCMightUnwind = true; - SCCMightReturn = true; - } else if (!F->hasExactDefinition()) { + for (Function *F : Functions) { + if (!F->hasExactDefinition()) { SCCMightUnwind |= !F->doesNotThrow(); SCCMightReturn |= !F->doesNotReturn(); } else { @@ -125,10 +119,9 @@ static bool runImpl(CallGraphSCC &SCC, CallGraph &CG) { bool InstMightUnwind = true; if (const auto *CI = dyn_cast(&I)) { if (Function *Callee = CI->getCalledFunction()) { - CallGraphNode *CalleeNode = CG[Callee]; // If the callee is outside our current SCC then we may throw // because it might. If it is inside, do nothing. - if (SCCNodes.count(CalleeNode) > 0) + if (Functions.contains(Callee)) InstMightUnwind = false; } } @@ -140,18 +133,15 @@ static bool runImpl(CallGraphSCC &SCC, CallGraph &CG) { if (IA->hasSideEffects()) SCCMightReturn = true; } - + } if (SCCMightUnwind && SCCMightReturn) break; - } } } // If the SCC doesn't unwind or doesn't throw, note this fact. if (!SCCMightUnwind || !SCCMightReturn) - for (CallGraphNode *I : SCC) { - Function *F = I->getFunction(); - + for (Function *F : Functions) { if (!SCCMightUnwind && !F->hasFnAttribute(Attribute::NoUnwind)) { F->addFnAttr(Attribute::NoUnwind); MadeChange = true; @@ -163,30 +153,35 @@ static bool runImpl(CallGraphSCC &SCC, CallGraph &CG) { } } - for (CallGraphNode *I : SCC) { + for (Function *F : Functions) { // Convert any invoke instructions to non-throwing functions in this node // into call instructions with a branch. This makes the exception blocks // dead. - if (Function *F = I->getFunction()) - MadeChange |= SimplifyFunction(F, CG); + MadeChange |= SimplifyFunction(F, CGU); } return MadeChange; } - bool PruneEH::runOnSCC(CallGraphSCC &SCC) { if (skipSCC(SCC)) return false; + SetVector Functions; + for (auto &N : SCC) { + if (auto *F = N->getFunction()) + Functions.insert(F); + } CallGraph &CG = getAnalysis().getCallGraph(); - return runImpl(SCC, CG); + CallGraphUpdater CGU; + CGU.initialize(CG, SCC); + return runImpl(CGU, Functions); } // SimplifyFunction - Given information about callees, simplify the specified // function if we have invokes to non-unwinding functions or code after calls to // no-return functions. -static bool SimplifyFunction(Function *F, CallGraph &CG) { +static bool SimplifyFunction(Function *F, CallGraphUpdater &CGU) { bool MadeChange = false; for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { if (InvokeInst *II = dyn_cast(BB->getTerminator())) @@ -196,7 +191,7 @@ static bool SimplifyFunction(Function *F, CallGraph &CG) { // If the unwind block is now dead, nuke it. if (pred_empty(UnwindBlock)) - DeleteBasicBlock(UnwindBlock, CG); // Delete the new BB. + DeleteBasicBlock(UnwindBlock, CGU); // Delete the new BB. ++NumRemoved; MadeChange = true; @@ -216,7 +211,7 @@ static bool SimplifyFunction(Function *F, CallGraph &CG) { BB->getInstList().pop_back(); new UnreachableInst(BB->getContext(), &*BB); - DeleteBasicBlock(New, CG); // Delete the new BB. + DeleteBasicBlock(New, CGU); // Delete the new BB. MadeChange = true; ++NumUnreach; break; @@ -229,12 +224,11 @@ static bool SimplifyFunction(Function *F, CallGraph &CG) { /// DeleteBasicBlock - remove the specified basic block from the program, /// updating the callgraph to reflect any now-obsolete edges due to calls that /// exist in the BB. 
-static void DeleteBasicBlock(BasicBlock *BB, CallGraph &CG) { +static void DeleteBasicBlock(BasicBlock *BB, CallGraphUpdater &CGU) { assert(pred_empty(BB) && "BB is not dead!"); Instruction *TokenInst = nullptr; - CallGraphNode *CGN = CG[BB->getParent()]; for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; ) { --I; @@ -246,9 +240,9 @@ static void DeleteBasicBlock(BasicBlock *BB, CallGraph &CG) { if (auto *Call = dyn_cast(&*I)) { const Function *Callee = Call->getCalledFunction(); if (!Callee || !Intrinsic::isLeaf(Callee->getIntrinsicID())) - CGN->removeCallEdgeFor(*Call); + CGU.removeCallSite(*Call); else if (!Callee->isIntrinsic()) - CGN->removeCallEdgeFor(*Call); + CGU.removeCallSite(*Call); } if (!I->use_empty()) diff --git a/llvm/lib/Transforms/IPO/StripSymbols.cpp b/llvm/lib/Transforms/IPO/StripSymbols.cpp index 088091df770f9..4fc71847a0707 100644 --- a/llvm/lib/Transforms/IPO/StripSymbols.cpp +++ b/llvm/lib/Transforms/IPO/StripSymbols.cpp @@ -19,18 +19,21 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/IPO/StripSymbols.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/TypeFinder.h" #include "llvm/IR/ValueSymbolTable.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/Local.h" + using namespace llvm; namespace { @@ -249,9 +252,7 @@ bool StripNonDebugSymbols::runOnModule(Module &M) { return StripSymbolNames(M, true); } -bool StripDebugDeclare::runOnModule(Module &M) { - if (skipModule(M)) - return false; +static bool stripDebugDeclareImpl(Module &M) { Function *Declare = M.getFunction("llvm.dbg.declare"); std::vector DeadConstants; @@ -289,17 +290,13 @@ bool StripDebugDeclare::runOnModule(Module &M) { return true; } -/// Remove any debug info for global variables/functions in the given module for -/// which said global variable/function no longer exists (i.e. is null). -/// -/// Debugging information is encoded in llvm IR using metadata. This is designed -/// such a way that debug info for symbols preserved even if symbols are -/// optimized away by the optimizer. This special pass removes debug info for -/// such symbols. -bool StripDeadDebugInfo::runOnModule(Module &M) { +bool StripDebugDeclare::runOnModule(Module &M) { if (skipModule(M)) return false; + return stripDebugDeclareImpl(M); +} +static bool stripDeadDebugInfoImpl(Module &M) { bool Changed = false; LLVMContext &C = M.getContext(); @@ -380,3 +377,40 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { return Changed; } + +/// Remove any debug info for global variables/functions in the given module for +/// which said global variable/function no longer exists (i.e. is null). +/// +/// Debugging information is encoded in llvm IR using metadata. This is designed +/// in such a way that debug info for symbols is preserved even if symbols are +/// optimized away by the optimizer. This special pass removes debug info for +/// such symbols.
+bool StripDeadDebugInfo::runOnModule(Module &M) { + if (skipModule(M)) + return false; + return stripDeadDebugInfoImpl(M); +} + +PreservedAnalyses StripSymbolsPass::run(Module &M, ModuleAnalysisManager &AM) { + StripDebugInfo(M); + StripSymbolNames(M, false); + return PreservedAnalyses::all(); +} + +PreservedAnalyses StripNonDebugSymbolsPass::run(Module &M, + ModuleAnalysisManager &AM) { + StripSymbolNames(M, true); + return PreservedAnalyses::all(); +} + +PreservedAnalyses StripDebugDeclarePass::run(Module &M, + ModuleAnalysisManager &AM) { + stripDebugDeclareImpl(M); + return PreservedAnalyses::all(); +} + +PreservedAnalyses StripDeadDebugInfoPass::run(Module &M, + ModuleAnalysisManager &AM) { + stripDeadDebugInfoImpl(M); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 40f6e9e147d76..90571bd033670 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -319,11 +319,14 @@ Instruction *InstCombinerImpl::simplifyMaskedStore(IntrinsicInst &II) { return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment); } + if (isa(ConstMask->getType())) + return nullptr; + // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask); APInt UndefElts(DemandedElts.getBitWidth(), 0); - if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0), - DemandedElts, UndefElts)) + if (Value *V = + SimplifyDemandedVectorElts(II.getOperand(0), DemandedElts, UndefElts)) return replaceOperand(II, 0, V); return nullptr; @@ -355,14 +358,17 @@ Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) { if (ConstMask->isNullValue()) return eraseInstFromFunction(II); + if (isa(ConstMask->getType())) + return nullptr; + // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask); APInt UndefElts(DemandedElts.getBitWidth(), 0); - if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0), - DemandedElts, UndefElts)) + if (Value *V = + SimplifyDemandedVectorElts(II.getOperand(0), DemandedElts, UndefElts)) return replaceOperand(II, 0, V); - if (Value *V = SimplifyDemandedVectorElts(II.getOperand(1), - DemandedElts, UndefElts)) + if (Value *V = + SimplifyDemandedVectorElts(II.getOperand(1), DemandedElts, UndefElts)) return replaceOperand(II, 1, V); return nullptr; @@ -657,6 +663,19 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) { return nullptr; } +static Optional getKnownSign(Value *Op, Instruction *CxtI, + const DataLayout &DL, AssumptionCache *AC, + DominatorTree *DT) { + KnownBits Known = computeKnownBits(Op, DL, 0, AC, CxtI, DT); + if (Known.isNonNegative()) + return false; + if (Known.isNegative()) + return true; + + return isImpliedByDomCondition( + ICmpInst::ICMP_SLT, Op, Constant::getNullValue(Op->getType()), CxtI, DL); +} + /// CallInst simplification. This mostly only handles folding of intrinsic /// instructions. For normal calls, it allows visitCallBase to do the heavy /// lifting. 
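As an editorial sketch (not part of the patch), the tri-state contract that getKnownSign implements above is: true when the operand is provably negative, false when provably non-negative, and None when neither is known, at which point the patch falls back to isImpliedByDomCondition. The standalone C++ below mirrors that contract with std::optional over toy known-bit masks; toyKnownSign and its parameters are hypothetical names standing in for LLVM's KnownBits.

#include <cstdint>
#include <iostream>
#include <optional>

// true = known negative, false = known non-negative, nullopt = unknown.
std::optional<bool> toyKnownSign(uint32_t KnownOne, uint32_t KnownZero) {
  const uint32_t SignBit = 1u << 31;
  if (KnownOne & SignBit)
    return true;         // sign bit known set
  if (KnownZero & SignBit)
    return false;        // sign bit known clear
  return std::nullopt;   // unknown; would consult a dominating condition next
}

int main() {
  // Sign known false (non-negative): the abs fold rewrites abs(x) -> x.
  if (auto Sign = toyKnownSign(/*KnownOne=*/0, /*KnownZero=*/1u << 31))
    std::cout << (*Sign ? "abs(x) -> -x" : "abs(x) -> x") << '\n';
}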
@@ -791,11 +810,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { if (match(IIOperand, m_Select(m_Value(), m_Neg(m_Value(X)), m_Deferred(X)))) return replaceOperand(*II, 0, X); - if (Optional Imp = isImpliedByDomCondition( - ICmpInst::ICMP_SGE, IIOperand, - Constant::getNullValue(IIOperand->getType()), II, DL)) { + if (Optional Sign = getKnownSign(IIOperand, II, DL, &AC, &DT)) { // abs(x) -> x if x >= 0 - if (*Imp) + if (!*Sign) return replaceInstUsesWith(*II, IIOperand); // abs(x) -> -x if x < 0 @@ -1444,11 +1461,16 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { break; case Intrinsic::assume: { Value *IIOperand = II->getArgOperand(0); + SmallVector OpBundles; + II->getOperandBundlesAsDefs(OpBundles); + bool HasOpBundles = !OpBundles.empty(); // Remove an assume if it is followed by an identical assume. // TODO: Do we need this? Unless there are conflicting assumptions, the // computeKnownBits(IIOperand) below here eliminates redundant assumes. Instruction *Next = II->getNextNonDebugInstruction(); - if (match(Next, m_Intrinsic(m_Specific(IIOperand)))) + if (HasOpBundles && + match(Next, m_Intrinsic(m_Specific(IIOperand))) && + !cast(Next)->hasOperandBundles()) return eraseInstFromFunction(CI); // Canonicalize assume(a && b) -> assume(a); assume(b); @@ -1458,14 +1480,15 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { Value *AssumeIntrinsic = II->getCalledOperand(); Value *A, *B; if (match(IIOperand, m_And(m_Value(A), m_Value(B)))) { - Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, A, II->getName()); + Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, A, OpBundles, + II->getName()); Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, B, II->getName()); return eraseInstFromFunction(*II); } // assume(!(a || b)) -> assume(!a); assume(!b); if (match(IIOperand, m_Not(m_Or(m_Value(A), m_Value(B))))) { Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, - Builder.CreateNot(A), II->getName()); + Builder.CreateNot(A), OpBundles, II->getName()); Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, Builder.CreateNot(B), II->getName()); return eraseInstFromFunction(*II); @@ -1481,7 +1504,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { isValidAssumeForContext(II, LHS, &DT)) { MDNode *MD = MDNode::get(II->getContext(), None); LHS->setMetadata(LLVMContext::MD_nonnull, MD); - return eraseInstFromFunction(*II); + if (!HasOpBundles) + return eraseInstFromFunction(*II); // TODO: apply nonnull return attributes to calls and invokes // TODO: apply range metadata for range check patterns? 
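As another editorial aside (again not part of the patch), the two abs rewrites driven by the known sign, abs(x) -> x for a known non-negative x and abs(x) -> -x for a known negative x, can be sanity-checked exhaustively at a narrow width. The harness below is a sketch under that framing; note that negating INT8_MIN wraps back to INT8_MIN, which is the case the int-min-is-poison flag on llvm.abs (used by the select canonicalization in InstCombineSelect.cpp below) exists to rule out.

#include <cassert>
#include <cstdint>

int main() {
  for (int V = INT8_MIN; V <= INT8_MAX; ++V) {
    auto X = static_cast<int8_t>(V);
    // Wrapping abs, matching llvm.abs with int-min-is-poison = false.
    auto Abs = static_cast<int8_t>(X < 0 ? 0 - X : X);
    if (X >= 0)
      assert(Abs == X); // known non-negative: abs(x) == x
    else
      assert(Abs == static_cast<int8_t>(-X)); // known negative: abs(x) == -x
  }
  return 0;
}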
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 608017b6dca25..74e9525e8ed46 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -3090,9 +3090,10 @@ Instruction *InstCombinerImpl::foldICmpEqIntrinsicWithConstant( switch (II->getIntrinsicID()) { case Intrinsic::abs: // abs(A) == 0 -> A == 0 - if (C.isNullValue()) + // abs(A) == INT_MIN -> A == INT_MIN + if (C.isNullValue() || C.isMinSignedValue()) return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0), - Constant::getNullValue(Ty)); + ConstantInt::get(Ty, C)); break; case Intrinsic::bswap: diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index c05c16b4bdb16..a08f5371f948b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1064,105 +1064,29 @@ static Instruction *canonicalizeMinMaxWithConstant(SelectInst &Sel, return &Sel; } -/// There are many select variants for each of ABS/NABS. -/// In matchSelectPattern(), there are different compare constants, compare -/// predicates/operands and select operands. -/// In isKnownNegation(), there are different formats of negated operands. -/// Canonicalize all these variants to 1 pattern. -/// This makes CSE more likely. +/// Canonicalize select-based abs/nabs to llvm.abs() intrinsic. static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp, InstCombinerImpl &IC) { if (!Cmp.hasOneUse() || !isa(Cmp.getOperand(1))) return nullptr; - // Choose a sign-bit check for the compare (likely simpler for codegen). - // ABS: (X hasOneUse() || (RHS->hasNUses(2) && CmpUsesNegatedOp))) - return nullptr; - - // Create the canonical compare: icmp slt LHS 0. - if (!CmpCanonicalized) { - Cmp.setPredicate(ICmpInst::ICMP_SLT); - Cmp.setOperand(1, ConstantInt::getNullValue(Cmp.getOperand(0)->getType())); - if (CmpUsesNegatedOp) - Cmp.setOperand(0, LHS); - } - - // Create the canonical RHS: RHS = sub (0, LHS). - if (!RHSCanonicalized) { - assert(RHS->hasOneUse() && "RHS use number is not right"); - RHS = IC.Builder.CreateNeg(LHS); - if (TVal == LHS) { - // Replace false value. - IC.replaceOperand(Sel, 2, RHS); - FVal = RHS; - } else { - // Replace true value. - IC.replaceOperand(Sel, 1, RHS); - TVal = RHS; - } - } - - // If the select operands do not change, we're done. - if (SPF == SelectPatternFlavor::SPF_NABS) { - if (TVal == LHS) - return &Sel; - assert(FVal == LHS && "Unexpected results from matchSelectPattern"); - } else { - if (FVal == LHS) - return &Sel; - assert(TVal == LHS && "Unexpected results from matchSelectPattern"); - } + bool IntMinIsPoison = match(RHS, m_NSWNeg(m_Specific(LHS))); + Constant *IntMinIsPoisonC = + ConstantInt::get(Type::getInt1Ty(Sel.getContext()), IntMinIsPoison); + Instruction *Abs = + IC.Builder.CreateBinaryIntrinsic(Intrinsic::abs, LHS, IntMinIsPoisonC); - // We are swapping the select operands, so swap the metadata too. - Sel.swapValues(); - Sel.swapProfMetadata(); - return &Sel; -} + if (SPF == SelectPatternFlavor::SPF_NABS) + return IntMinIsPoison ? BinaryOperator::CreateNSWNeg(Abs) + : BinaryOperator::CreateNeg(Abs); -static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *ReplaceOp, - const SimplifyQuery &Q) { - // If this is a binary operator, try to simplify it with the replaced op - // because we know Op and ReplaceOp are equivalant. 
- // For example: V = X + 1, Op = X, ReplaceOp = 42 - // Simplifies as: add(42, 1) --> 43 - if (auto *BO = dyn_cast(V)) { - if (BO->getOperand(0) == Op) - return SimplifyBinOp(BO->getOpcode(), ReplaceOp, BO->getOperand(1), Q); - if (BO->getOperand(1) == Op) - return SimplifyBinOp(BO->getOpcode(), BO->getOperand(0), ReplaceOp, Q); - } - - return nullptr; + return IC.replaceInstUsesWith(Sel, Abs); } /// If we have a select with an equality comparison, then we know the value in @@ -1181,30 +1105,71 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *ReplaceOp, /// /// We can't replace %sel with %add unless we strip away the flags. /// TODO: Wrapping flags could be preserved in some cases with better analysis. -static Value *foldSelectValueEquivalence(SelectInst &Sel, ICmpInst &Cmp, - const SimplifyQuery &Q) { +static Instruction *foldSelectValueEquivalence(SelectInst &Sel, ICmpInst &Cmp, + const SimplifyQuery &Q, + InstCombiner &IC) { if (!Cmp.isEquality()) return nullptr; // Canonicalize the pattern to ICMP_EQ by swapping the select operands. Value *TrueVal = Sel.getTrueValue(), *FalseVal = Sel.getFalseValue(); - if (Cmp.getPredicate() == ICmpInst::ICMP_NE) + bool Swapped = false; + if (Cmp.getPredicate() == ICmpInst::ICMP_NE) { std::swap(TrueVal, FalseVal); + Swapped = true; + } + + // In X == Y ? f(X) : Z, try to evaluate f(X) and replace the operand. + // Take care to avoid replacing X == Y ? X : Z with X == Y ? Y : Z, as that + // would lead to an infinite replacement cycle. + Value *CmpLHS = Cmp.getOperand(0), *CmpRHS = Cmp.getOperand(1); + if (TrueVal != CmpLHS) + if (Value *V = SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q, + /* AllowRefinement */ true)) + return IC.replaceOperand(Sel, Swapped ? 2 : 1, V); + if (TrueVal != CmpRHS) + if (Value *V = SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q, + /* AllowRefinement */ true)) + return IC.replaceOperand(Sel, Swapped ? 2 : 1, V); + + auto *FalseInst = dyn_cast(FalseVal); + if (!FalseInst) + return nullptr; + + // InstSimplify already performed this fold if it was possible subject to + // current poison-generating flags. Try the transform again with + // poison-generating flags temporarily dropped. + bool WasNUW = false, WasNSW = false, WasExact = false; + if (auto *OBO = dyn_cast(FalseVal)) { + WasNUW = OBO->hasNoUnsignedWrap(); + WasNSW = OBO->hasNoSignedWrap(); + FalseInst->setHasNoUnsignedWrap(false); + FalseInst->setHasNoSignedWrap(false); + } + if (auto *PEO = dyn_cast(FalseVal)) { + WasExact = PEO->isExact(); + FalseInst->setIsExact(false); + } // Try each equivalence substitution possibility. // We have an 'EQ' comparison, so the select's false value will propagate. // Example: // (X == 42) ? 43 : (X + 1) --> (X == 42) ? (X + 1) : (X + 1) --> X + 1 - // (X == 42) ? (X + 1) : 43 --> (X == 42) ? 
(42 + 1) : 43 --> 43 - Value *CmpLHS = Cmp.getOperand(0), *CmpRHS = Cmp.getOperand(1); - if (simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q) == TrueVal || - simplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q) == TrueVal || - simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q) == FalseVal || - simplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q) == FalseVal) { - if (auto *FalseInst = dyn_cast(FalseVal)) - FalseInst->dropPoisonGeneratingFlags(); - return FalseVal; + if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q, + /* AllowRefinement */ false) == TrueVal || + SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q, + /* AllowRefinement */ false) == TrueVal) { + return IC.replaceInstUsesWith(Sel, FalseVal); } + + // Restore poison-generating flags if the transform did not apply. + if (WasNUW) + FalseInst->setHasNoUnsignedWrap(); + if (WasNSW) + FalseInst->setHasNoSignedWrap(); + if (WasExact) + FalseInst->setIsExact(); + return nullptr; } @@ -1430,8 +1395,8 @@ tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp, /// Visit a SelectInst that has an ICmpInst as its first operand. Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI) { - if (Value *V = foldSelectValueEquivalence(SI, *ICI, SQ)) - return replaceInstUsesWith(SI, V); + if (Instruction *NewSel = foldSelectValueEquivalence(SI, *ICI, SQ, *this)) + return NewSel; if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, *this)) return NewSel; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index ef56cb77447aa..55c6ce6eb7832 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -2037,8 +2037,7 @@ static Instruction *foldTruncShuffle(ShuffleVectorInst &Shuf, if (Mask[i] == UndefMaskElem) continue; uint64_t LSBIndex = IsBigEndian ? (i + 1) * TruncRatio - 1 : i * TruncRatio; - assert(LSBIndex <= std::numeric_limits::max() && - "Overflowed 32-bits"); + assert(LSBIndex <= INT32_MAX && "Overflowed 32-bits"); if (Mask[i] != (int)LSBIndex) return nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 0ca256860c596..63ba7eb85c663 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2805,6 +2805,14 @@ Instruction *InstCombinerImpl::visitUnreachableInst(UnreachableInst &I) { Instruction *Prev = I.getPrevNonDebugInstruction(); if (Prev && !Prev->isEHPad() && isGuaranteedToTransferExecutionToSuccessor(Prev)) { + // Temporarily disable removal of volatile stores preceding unreachable, + // pending a potential LangRef change permitting volatile stores to trap. + // TODO: Either remove this code, or properly integrate the check into + // isGuaranteedToTransferExecutionToSuccessor(). 
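+ // For example, in: + // store volatile i32 1, i32* @flag + // unreachable + // the volatile store is now left in place, while a non-volatile store in + // the same position is still removed as before.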
+    if (auto *SI = dyn_cast<StoreInst>(Prev))
+      if (SI->isVolatile())
+        return nullptr;
+
     eraseInstFromFunction(*Prev);
     return &I;
   }
diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
index 1fc0b140be035..63bc57ac9c440 100644
--- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -5,7 +5,7 @@ add_llvm_component_library(LLVMInstrumentation
   ControlHeightReduction.cpp
   DataFlowSanitizer.cpp
   GCOVProfiling.cpp
-  HeapProfiler.cpp
+  MemProfiler.cpp
   MemorySanitizer.cpp
   IndirectCallPromotion.cpp
   Instrumentation.cpp
diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 3773c3e19ef69..c72c44809acc7 100644
--- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -13,6 +13,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "CFGMST.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/STLExtras.h"
@@ -20,6 +21,8 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/CFG.h"
@@ -53,6 +56,8 @@ namespace endian = llvm::support::endian;
 
 #define DEBUG_TYPE "insert-gcov-profiling"
 
 enum : uint32_t {
+  GCOV_ARC_ON_TREE = 1 << 0,
+
   GCOV_TAG_FUNCTION = 0x01000000,
   GCOV_TAG_BLOCKS = 0x01410000,
   GCOV_TAG_ARCS = 0x01430000,
@@ -95,7 +100,8 @@ class GCOVProfiler {
   GCOVProfiler() : GCOVProfiler(GCOVOptions::getDefault()) {}
   GCOVProfiler(const GCOVOptions &Opts) : Options(Opts) {}
   bool
-  runOnModule(Module &M,
+  runOnModule(Module &M, function_ref<BlockFrequencyInfo *(Function &F)> GetBFI,
+              function_ref<BranchProbabilityInfo *(Function &F)> GetBPI,
               std::function<const TargetLibraryInfo &(Function &F)> GetTLI);
 
   void write(uint32_t i) {
@@ -112,11 +118,14 @@ class GCOVProfiler {
 
 private:
   // Create the .gcno files for the Module based on DebugInfo.
-  void emitProfileNotes();
+  bool
+  emitProfileNotes(NamedMDNode *CUNode, bool HasExecOrFork,
+                   function_ref<BlockFrequencyInfo *(Function &F)> GetBFI,
+                   function_ref<BranchProbabilityInfo *(Function &F)> GetBPI,
+                   function_ref<const TargetLibraryInfo &(Function &F)> GetTLI);
 
-  // Modify the program to track transitions along edges and call into the
-  // profiling runtime to emit .gcda files when run.
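GCOV_ARC_ON_TREE is the wire-format flag behind the new spanning-tree
optimization: arcs placed on the minimum spanning tree are still written to the
.gcno file but carry no counter, and gcov recovers their counts from flow
conservation. A minimal sketch of the record layout (assuming the
two-words-per-arc encoding used by writeOut further down; emitArcsRecord is a
hypothetical helper, not part of the patch):

    #include <cstdint>
    #include <utility>
    #include <vector>

    enum : uint32_t { GCOV_ARC_ON_TREE = 1u << 0 };

    // Append one ARCS record: tag, length, source block, then one
    // (destination, flags) pair per out-edge. On-tree arcs carry the flag
    // and receive no counter; gcov rebuilds their counts at read time.
    void emitArcsRecord(std::vector<uint32_t> &Words, uint32_t SrcBlock,
                        const std::vector<std::pair<uint32_t, bool>> &Edges) {
      Words.push_back(0x01430000); // GCOV_TAG_ARCS
      Words.push_back(static_cast<uint32_t>(Edges.size() * 2 + 1));
      Words.push_back(SrcBlock);
      for (const auto &[Dest, OnTree] : Edges) {
        Words.push_back(Dest);
        Words.push_back(OnTree ? GCOV_ARC_ON_TREE : 0);
      }
    }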
- bool emitProfileArcs(); + void emitGlobalConstructor( + SmallVectorImpl> &CountersBySP); bool isFunctionInstrumented(const Function &F); std::vector createRegexesFromString(StringRef RegexesStr); @@ -154,6 +163,7 @@ class GCOVProfiler { SmallVector, 16> Funcs; std::vector FilterRe; std::vector ExcludeRe; + DenseSet ExecBlocks; StringMap InstrumentedFiles; }; @@ -169,24 +179,68 @@ class GCOVProfilerLegacyPass : public ModulePass { StringRef getPassName() const override { return "GCOV Profiler"; } bool runOnModule(Module &M) override { - return Profiler.runOnModule(M, [this](Function &F) -> TargetLibraryInfo & { - return getAnalysis().getTLI(F); - }); + auto GetBFI = [this](Function &F) { + return &this->getAnalysis(F).getBFI(); + }; + auto GetBPI = [this](Function &F) { + return &this->getAnalysis(F).getBPI(); + }; + auto GetTLI = [this](Function &F) -> const TargetLibraryInfo & { + return this->getAnalysis().getTLI(F); + }; + return Profiler.runOnModule(M, GetBFI, GetBPI, GetTLI); } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); AU.addRequired(); } private: GCOVProfiler Profiler; }; + +struct BBInfo { + BBInfo *Group; + uint32_t Index; + uint32_t Rank = 0; + + BBInfo(unsigned Index) : Group(this), Index(Index) {} + const std::string infoString() const { + return (Twine("Index=") + Twine(Index)).str(); + } +}; + +struct Edge { + // This class implements the CFG edges. Note the CFG can be a multi-graph. + // So there might be multiple edges with same SrcBB and DestBB. + const BasicBlock *SrcBB; + const BasicBlock *DestBB; + uint64_t Weight; + BasicBlock *Place = nullptr; + uint32_t SrcNumber, DstNumber; + bool InMST = false; + bool Removed = false; + bool IsCritical = false; + + Edge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W = 1) + : SrcBB(Src), DestBB(Dest), Weight(W) {} + + // Return the information string of an edge. + const std::string infoString() const { + return (Twine(Removed ? "-" : " ") + (InMST ? " " : "*") + + (IsCritical ? "c" : " ") + " W=" + Twine(Weight)) + .str(); + } +}; } char GCOVProfilerLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN( GCOVProfilerLegacyPass, "insert-gcov-profiling", "Insert instrumentation for GCOV profiling", false, false) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END( GCOVProfilerLegacyPass, "insert-gcov-profiling", @@ -271,8 +325,8 @@ namespace { return LinesByFile.try_emplace(Filename, P, Filename).first->second; } - void addEdge(GCOVBlock &Successor) { - OutEdges.push_back(&Successor); + void addEdge(GCOVBlock &Successor, uint32_t Flags) { + OutEdges.emplace_back(&Successor, Flags); } void writeOut() { @@ -306,9 +360,9 @@ namespace { } uint32_t Number; - SmallVector OutEdges; + SmallVector, 4> OutEdges; - private: + private: friend class GCOVFunction; GCOVBlock(GCOVProfiler *P, uint32_t Number) @@ -325,16 +379,12 @@ namespace { GCOVFunction(GCOVProfiler *P, Function *F, const DISubprogram *SP, unsigned EndLine, uint32_t Ident, int Version) : GCOVRecord(P), SP(SP), EndLine(EndLine), Ident(Ident), - Version(Version), ReturnBlock(P, 1) { + Version(Version), EntryBlock(P, 0), ReturnBlock(P, 1) { LLVM_DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n"); bool ExitBlockBeforeBody = Version >= 48; - uint32_t i = 0; - for (auto &BB : *F) { - // Skip index 1 if it's assigned to the ReturnBlock. 
- if (i == 1 && ExitBlockBeforeBody) - ++i; + uint32_t i = ExitBlockBeforeBody ? 2 : 1; + for (BasicBlock &BB : *F) Blocks.insert(std::make_pair(&BB, GCOVBlock(P, i++))); - } if (!ExitBlockBeforeBody) ReturnBlock.Number = i; @@ -345,10 +395,11 @@ namespace { FuncChecksum = hash_value(FunctionNameAndLine); } - GCOVBlock &getBlock(BasicBlock *BB) { + GCOVBlock &getBlock(const BasicBlock *BB) { return Blocks.find(BB)->second; } + GCOVBlock &getEntryBlock() { return EntryBlock; } GCOVBlock &getReturnBlock() { return ReturnBlock; } @@ -391,44 +442,58 @@ namespace { // Emit count of blocks. write(GCOV_TAG_BLOCKS); if (Version < 80) { - write(Blocks.size() + 1); - for (int i = Blocks.size() + 1; i; --i) + write(Blocks.size() + 2); + for (int i = Blocks.size() + 2; i; --i) write(0); } else { write(1); - write(Blocks.size() + 1); + write(Blocks.size() + 2); } LLVM_DEBUG(dbgs() << (Blocks.size() + 1) << " blocks\n"); // Emit edges between blocks. - Function *F = Blocks.begin()->first->getParent(); - for (BasicBlock &I : *F) { - GCOVBlock &Block = getBlock(&I); + const uint32_t Outgoing = EntryBlock.OutEdges.size(); + if (Outgoing) { + write(GCOV_TAG_ARCS); + write(Outgoing * 2 + 1); + write(EntryBlock.Number); + for (const auto &E : EntryBlock.OutEdges) { + write(E.first->Number); + write(E.second); + } + } + std::vector Sorted; + Sorted.reserve(Blocks.size()); + for (auto &It : Blocks) + Sorted.push_back(&It.second); + llvm::sort(Sorted, [](GCOVBlock *x, GCOVBlock *y) { + return x->Number < y->Number; + }); + for (GCOVBlock &Block : make_pointee_range(Sorted)) { if (Block.OutEdges.empty()) continue; write(GCOV_TAG_ARCS); write(Block.OutEdges.size() * 2 + 1); write(Block.Number); - for (int i = 0, e = Block.OutEdges.size(); i != e; ++i) { - LLVM_DEBUG(dbgs() << Block.Number << " -> " - << Block.OutEdges[i]->Number << "\n"); - write(Block.OutEdges[i]->Number); - write(0); // no flags + for (const auto &E : Block.OutEdges) { + write(E.first->Number); + write(E.second); } } // Emit lines for each block. 
- for (BasicBlock &I : *F) - getBlock(&I).writeOut(); + for (GCOVBlock &Block : make_pointee_range(Sorted)) + Block.writeOut(); } - private: + public: const DISubprogram *SP; unsigned EndLine; uint32_t Ident; uint32_t FuncChecksum; int Version; DenseMap Blocks; + GCOVBlock EntryBlock; GCOVBlock ReturnBlock; }; } @@ -542,20 +607,23 @@ std::string GCOVProfiler::mangleName(const DICompileUnit *CU, } bool GCOVProfiler::runOnModule( - Module &M, std::function GetTLI) { + Module &M, function_ref GetBFI, + function_ref GetBPI, + std::function GetTLI) { this->M = &M; this->GetTLI = std::move(GetTLI); Ctx = &M.getContext(); - bool Modified = AddFlushBeforeForkAndExec(); + NamedMDNode *CUNode = M.getNamedMetadata("llvm.dbg.cu"); + if (!CUNode || (!Options.EmitNotes && !Options.EmitData)) + return false; + + bool HasExecOrFork = AddFlushBeforeForkAndExec(); FilterRe = createRegexesFromString(Options.Filter); ExcludeRe = createRegexesFromString(Options.Exclude); - - if (Options.EmitNotes) emitProfileNotes(); - if (Options.EmitData) - Modified |= emitProfileArcs(); - return Modified; + emitProfileNotes(CUNode, HasExecOrFork, GetBFI, GetBPI, this->GetTLI); + return true; } PreservedAnalyses GCOVProfilerPass::run(Module &M, @@ -565,9 +633,17 @@ PreservedAnalyses GCOVProfilerPass::run(Module &M, FunctionAnalysisManager &FAM = AM.getResult(M).getManager(); - if (!Profiler.runOnModule(M, [&](Function &F) -> TargetLibraryInfo & { - return FAM.getResult(F); - })) + auto GetBFI = [&FAM](Function &F) { + return &FAM.getResult(F); + }; + auto GetBPI = [&FAM](Function &F) { + return &FAM.getResult(F); + }; + auto GetTLI = [&FAM](Function &F) -> const TargetLibraryInfo & { + return FAM.getResult(F); + }; + + if (!Profiler.runOnModule(M, GetBFI, GetBPI, GetTLI)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); @@ -604,16 +680,6 @@ static bool isUsingScopeBasedEH(Function &F) { return isScopedEHPersonality(Personality); } -static bool shouldKeepInEntry(BasicBlock::iterator It) { - if (isa(*It)) return true; - if (isa(*It)) return true; - if (auto *II = dyn_cast(It)) { - if (II->getIntrinsicID() == llvm::Intrinsic::localescape) return true; - } - - return false; -} - bool GCOVProfiler::AddFlushBeforeForkAndExec() { SmallVector Forks; SmallVector Execs; @@ -683,6 +749,7 @@ bool GCOVProfiler::AddFlushBeforeForkAndExec() { // dumped FunctionCallee ResetF = M->getOrInsertFunction("llvm_reset_counters", FTy); Builder.CreateCall(ResetF)->setDebugLoc(Loc); + ExecBlocks.insert(Parent); Parent->splitBasicBlock(NextInst); Parent->back().setDebugLoc(Loc); } @@ -690,10 +757,67 @@ bool GCOVProfiler::AddFlushBeforeForkAndExec() { return !Forks.empty() || !Execs.empty(); } -void GCOVProfiler::emitProfileNotes() { - NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); - if (!CU_Nodes) return; +static BasicBlock *getInstrBB(CFGMST &MST, Edge &E, + const DenseSet &ExecBlocks) { + if (E.InMST || E.Removed) + return nullptr; + + BasicBlock *SrcBB = const_cast(E.SrcBB); + BasicBlock *DestBB = const_cast(E.DestBB); + // For a fake edge, instrument the real BB. + if (SrcBB == nullptr) + return DestBB; + if (DestBB == nullptr) + return SrcBB; + + auto CanInstrument = [](BasicBlock *BB) -> BasicBlock * { + // There are basic blocks (such as catchswitch) cannot be instrumented. + // If the returned first insertion point is the end of BB, skip this BB. 
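The catchswitch case is worth a concrete picture (illustrative IR in comments):

    // dispatch:
    //   %cs = catchswitch within none [label %handler] unwind to caller
    //
    // The catchswitch is both the block's terminator and its only non-PHI
    // instruction, so there is no program point before the terminator at
    // which a counter update could be inserted: getFirstInsertionPt() is
    // already end(), and the block is skipped.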
+ if (BB->getFirstInsertionPt() == BB->end()) + return nullptr; + return BB; + }; + // Instrument the SrcBB if it has a single successor, + // otherwise, the DestBB if this is not a critical edge. + Instruction *TI = SrcBB->getTerminator(); + if (TI->getNumSuccessors() <= 1 && !ExecBlocks.count(SrcBB)) + return CanInstrument(SrcBB); + if (!E.IsCritical) + return CanInstrument(DestBB); + + // Some IndirectBr critical edges cannot be split by the previous + // SplitIndirectBrCriticalEdges call. Bail out. + const unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB); + BasicBlock *InstrBB = + isa(TI) ? nullptr : SplitCriticalEdge(TI, SuccNum); + if (!InstrBB) + return nullptr; + + MST.addEdge(SrcBB, InstrBB, 0); + MST.addEdge(InstrBB, DestBB, 0).InMST = true; + E.Removed = true; + + return CanInstrument(InstrBB); +} + +#ifndef NDEBUG +static void dumpEdges(CFGMST &MST, GCOVFunction &GF) { + size_t ID = 0; + for (auto &E : make_pointee_range(MST.AllEdges)) { + GCOVBlock &Src = E.SrcBB ? GF.getBlock(E.SrcBB) : GF.getEntryBlock(); + GCOVBlock &Dst = E.DestBB ? GF.getBlock(E.DestBB) : GF.getReturnBlock(); + dbgs() << " Edge " << ID++ << ": " << Src.Number << "->" << Dst.Number + << E.infoString() << "\n"; + } +} +#endif + +bool GCOVProfiler::emitProfileNotes( + NamedMDNode *CUNode, bool HasExecOrFork, + function_ref GetBFI, + function_ref GetBPI, + function_ref GetTLI) { int Version; { uint8_t c3 = Options.Version[0]; @@ -703,27 +827,20 @@ void GCOVProfiler::emitProfileNotes() { : (c3 - '0') * 10 + c1 - '0'; } - for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { + bool EmitGCDA = Options.EmitData; + for (unsigned i = 0, e = CUNode->getNumOperands(); i != e; ++i) { // Each compile unit gets its own .gcno file. This means that whether we run // this pass over the original .o's as they're produced, or run it after // LTO, we'll generate the same .gcno files. - auto *CU = cast(CU_Nodes->getOperand(i)); + auto *CU = cast(CUNode->getOperand(i)); // Skip module skeleton (and module) CUs. if (CU->getDWOId()) continue; - std::error_code EC; - raw_fd_ostream out(mangleName(CU, GCovFileType::GCNO), EC, - sys::fs::OF_None); - if (EC) { - Ctx->emitError(Twine("failed to open coverage notes file for writing: ") + - EC.message()); - continue; - } - std::vector EdgeDestinations; + SmallVector, 8> CountersBySP; Endian = M->getDataLayout().isLittleEndian() ? support::endianness::little : support::endianness::big; @@ -737,39 +854,79 @@ void GCOVProfiler::emitProfileNotes() { // TODO: Functions using scope-based EH are currently not supported. if (isUsingScopeBasedEH(F)) continue; - // gcov expects every function to start with an entry block that has a - // single successor, so split the entry block to make sure of that. - BasicBlock &EntryBlock = F.getEntryBlock(); - BasicBlock::iterator It = EntryBlock.begin(); - while (shouldKeepInEntry(It)) - ++It; - EntryBlock.splitBasicBlock(It); + // Add the function line number to the lines of the entry block + // to have a counter for the function definition. + uint32_t Line = SP->getLine(); + auto Filename = getFilename(SP); + + BranchProbabilityInfo *BPI = GetBPI(F); + BlockFrequencyInfo *BFI = GetBFI(F); + + // Split indirectbr critical edges here before computing the MST rather + // than later in getInstrBB() to avoid invalidating it. + SplitIndirectBrCriticalEdges(F, BPI, BFI); + CFGMST MST(F, /*InstrumentFuncEntry_=*/false, BPI, BFI); + + // getInstrBB can split basic blocks and push elements to AllEdges. 
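Iterating by index via llvm::seq is deliberate: getInstrBB may call
SplitCriticalEdge and MST.addEdge, both of which append to AllEdges and can
reallocate it, invalidating any iterators a range-for would hold. Because the
bound is captured up front, edges appended mid-loop are also not revisited.
The hazard in miniature (standalone sketch, not LLVM code):

    #include <cstddef>
    #include <vector>

    // UB once the vector reallocates:
    //   for (int E : Edges)
    //     if (E < 0) Edges.push_back(-E);
    //
    // Safe: index over the size captured at loop entry; entries appended
    // during the loop are deliberately not revisited.
    void mirrorNegatives(std::vector<int> &Edges) {
      for (std::size_t I = 0, N = Edges.size(); I != N; ++I)
        if (Edges[I] < 0)
          Edges.push_back(-Edges[I]);
    }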
+ for (size_t I : llvm::seq(0, MST.AllEdges.size())) { + auto &E = *MST.AllEdges[I]; + // For now, disable spanning tree optimization when fork or exec* is + // used. + if (HasExecOrFork) + E.InMST = false; + E.Place = getInstrBB(MST, E, ExecBlocks); + } + // Basic blocks in F are finalized at this point. + BasicBlock &EntryBlock = F.getEntryBlock(); Funcs.push_back(std::make_unique(this, &F, SP, EndLine, FunctionIdent++, Version)); GCOVFunction &Func = *Funcs.back(); - // Add the function line number to the lines of the entry block - // to have a counter for the function definition. - uint32_t Line = SP->getLine(); - auto Filename = getFilename(SP); + // Some non-tree edges are IndirectBr which cannot be split. Ignore them + // as well. + llvm::erase_if(MST.AllEdges, [](std::unique_ptr &E) { + return E->Removed || (!E->InMST && !E->Place); + }); + const size_t Measured = + llvm::partition(MST.AllEdges, + [](std::unique_ptr &E) { return E->Place; }) - + MST.AllEdges.begin(); + for (size_t I : llvm::seq(0, Measured)) { + Edge &E = *MST.AllEdges[I]; + GCOVBlock &Src = + E.SrcBB ? Func.getBlock(E.SrcBB) : Func.getEntryBlock(); + GCOVBlock &Dst = + E.DestBB ? Func.getBlock(E.DestBB) : Func.getReturnBlock(); + E.SrcNumber = Src.Number; + E.DstNumber = Dst.Number; + } + std::stable_sort( + MST.AllEdges.begin(), MST.AllEdges.begin() + Measured, + [](const std::unique_ptr &L, const std::unique_ptr &R) { + return L->SrcNumber != R->SrcNumber ? L->SrcNumber < R->SrcNumber + : L->DstNumber < R->DstNumber; + }); + + for (const Edge &E : make_pointee_range(MST.AllEdges)) { + GCOVBlock &Src = + E.SrcBB ? Func.getBlock(E.SrcBB) : Func.getEntryBlock(); + GCOVBlock &Dst = + E.DestBB ? Func.getBlock(E.DestBB) : Func.getReturnBlock(); + Src.addEdge(Dst, E.Place ? 0 : uint32_t(GCOV_ARC_ON_TREE)); + } // Artificial functions such as global initializers if (!SP->isArtificial()) Func.getBlock(&EntryBlock).getFile(Filename).addLine(Line); - for (auto &BB : F) { - GCOVBlock &Block = Func.getBlock(&BB); - Instruction *TI = BB.getTerminator(); - if (int successors = TI->getNumSuccessors()) { - for (int i = 0; i != successors; ++i) { - Block.addEdge(Func.getBlock(TI->getSuccessor(i))); - } - } else if (isa(TI)) { - Block.addEdge(Func.getReturnBlock()); - } - for (GCOVBlock *Succ : Block.OutEdges) { - uint32_t Idx = Succ->Number; + LLVM_DEBUG(dumpEdges(MST, Func)); + + for (auto &GB : Func.Blocks) { + const BasicBlock &BB = *GB.first; + auto &Block = GB.second; + for (auto Succ : Block.OutEdges) { + uint32_t Idx = Succ.first->Number; do EdgeDestinations.push_back(Idx & 255); while ((Idx >>= 8) > 0); } @@ -797,160 +954,110 @@ void GCOVProfiler::emitProfileNotes() { } Line = 0; } + if (EmitGCDA) { + DISubprogram *SP = F.getSubprogram(); + ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(*Ctx), Measured); + GlobalVariable *Counters = new GlobalVariable( + *M, CounterTy, false, GlobalValue::InternalLinkage, + Constant::getNullValue(CounterTy), "__llvm_gcov_ctr"); + CountersBySP.emplace_back(Counters, SP); + + for (size_t I : llvm::seq(0, Measured)) { + const Edge &E = *MST.AllEdges[I]; + IRBuilder<> Builder(E.Place, E.Place->getFirstInsertionPt()); + Value *V = Builder.CreateConstInBoundsGEP2_64( + Counters->getValueType(), Counters, 0, I); + if (Options.Atomic) { + Builder.CreateAtomicRMW(AtomicRMWInst::Add, V, Builder.getInt64(1), + AtomicOrdering::Monotonic); + } else { + Value *Count = + Builder.CreateLoad(Builder.getInt64Ty(), V, "gcov_ctr"); + Count = Builder.CreateAdd(Count, Builder.getInt64(1)); + 
Builder.CreateStore(Count, V); + } + } + } } char Tmp[4]; JamCRC JC; JC.update(EdgeDestinations); - os = &out; uint32_t Stamp = JC.getCRC(); FileChecksums.push_back(Stamp); - if (Endian == support::endianness::big) { - out.write("gcno", 4); - out.write(Options.Version, 4); - } else { - out.write("oncg", 4); - std::reverse_copy(Options.Version, Options.Version + 4, Tmp); - out.write(Tmp, 4); - } - write(Stamp); - if (Version >= 90) - writeString(""); // unuseful current_working_directory - if (Version >= 80) - write(0); // unuseful has_unexecuted_blocks - - for (auto &Func : Funcs) - Func->writeOut(Stamp); - - write(0); - write(0); - out.close(); - } -} - -bool GCOVProfiler::emitProfileArcs() { - NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); - if (!CU_Nodes) return false; - bool Result = false; - for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { - SmallVector, 8> CountersBySP; - for (auto &F : M->functions()) { - DISubprogram *SP = F.getSubprogram(); - unsigned EndLine; - if (!SP) continue; - if (!functionHasLines(F, EndLine) || !isFunctionInstrumented(F)) + if (Options.EmitNotes) { + std::error_code EC; + raw_fd_ostream out(mangleName(CU, GCovFileType::GCNO), EC, + sys::fs::OF_None); + if (EC) { + Ctx->emitError( + Twine("failed to open coverage notes file for writing: ") + + EC.message()); continue; - // TODO: Functions using scope-based EH are currently not supported. - if (isUsingScopeBasedEH(F)) continue; - - DenseMap, unsigned> EdgeToCounter; - unsigned Edges = 0; - for (auto &BB : F) { - Instruction *TI = BB.getTerminator(); - if (isa(TI)) { - EdgeToCounter[{&BB, nullptr}] = Edges++; - } else { - for (BasicBlock *Succ : successors(TI)) { - EdgeToCounter[{&BB, Succ}] = Edges++; - } - } } + os = &out; + if (Endian == support::endianness::big) { + out.write("gcno", 4); + out.write(Options.Version, 4); + } else { + out.write("oncg", 4); + std::reverse_copy(Options.Version, Options.Version + 4, Tmp); + out.write(Tmp, 4); + } + write(Stamp); + if (Version >= 90) + writeString(""); // unuseful current_working_directory + if (Version >= 80) + write(0); // unuseful has_unexecuted_blocks - ArrayType *CounterTy = - ArrayType::get(Type::getInt64Ty(*Ctx), Edges); - GlobalVariable *Counters = - new GlobalVariable(*M, CounterTy, false, - GlobalValue::InternalLinkage, - Constant::getNullValue(CounterTy), - "__llvm_gcov_ctr"); - CountersBySP.push_back(std::make_pair(Counters, SP)); - - // If a BB has several predecessors, use a PHINode to select - // the correct counter. - for (auto &BB : F) { - const unsigned EdgeCount = - std::distance(pred_begin(&BB), pred_end(&BB)); - if (EdgeCount) { - // The phi node must be at the begin of the BB. - IRBuilder<> BuilderForPhi(&*BB.begin()); - Type *Int64PtrTy = Type::getInt64PtrTy(*Ctx); - PHINode *Phi = BuilderForPhi.CreatePHI(Int64PtrTy, EdgeCount); - for (BasicBlock *Pred : predecessors(&BB)) { - auto It = EdgeToCounter.find({Pred, &BB}); - assert(It != EdgeToCounter.end()); - const unsigned Edge = It->second; - Value *EdgeCounter = BuilderForPhi.CreateConstInBoundsGEP2_64( - Counters->getValueType(), Counters, 0, Edge); - Phi->addIncoming(EdgeCounter, Pred); - } + for (auto &Func : Funcs) + Func->writeOut(Stamp); - // Skip phis, landingpads. 
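The inline counter bump emitted above comes in two forms, selected by
Options.Atomic; the trade-off in plain C++ terms (minimal sketch):

    #include <atomic>
    #include <cstdint>

    uint64_t PlainCtr;              // plain load/add/store: cheapest, but
    void hitPlain() { ++PlainCtr; } // concurrent increments can be lost

    std::atomic<uint64_t> AtomicCtr; // matches the monotonic atomicrmw add:
    void hitAtomic() {               // no ordering guarantees, no lost updates
      AtomicCtr.fetch_add(1, std::memory_order_relaxed);
    }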
- IRBuilder<> Builder(&*BB.getFirstInsertionPt()); - if (Options.Atomic) { - Builder.CreateAtomicRMW(AtomicRMWInst::Add, Phi, - Builder.getInt64(1), - AtomicOrdering::Monotonic); - } else { - Value *Count = Builder.CreateLoad(Builder.getInt64Ty(), Phi); - Count = Builder.CreateAdd(Count, Builder.getInt64(1)); - Builder.CreateStore(Count, Phi); - } + write(0); + write(0); + out.close(); + } - Instruction *TI = BB.getTerminator(); - if (isa(TI)) { - auto It = EdgeToCounter.find({&BB, nullptr}); - assert(It != EdgeToCounter.end()); - const unsigned Edge = It->second; - Value *Counter = Builder.CreateConstInBoundsGEP2_64( - Counters->getValueType(), Counters, 0, Edge); - if (Options.Atomic) { - Builder.CreateAtomicRMW(AtomicRMWInst::Add, Counter, - Builder.getInt64(1), - AtomicOrdering::Monotonic); - } else { - Value *Count = Builder.CreateLoad(Builder.getInt64Ty(), Counter); - Count = Builder.CreateAdd(Count, Builder.getInt64(1)); - Builder.CreateStore(Count, Counter); - } - } - } - } + if (EmitGCDA) { + emitGlobalConstructor(CountersBySP); + EmitGCDA = false; } + } + return true; +} - Function *WriteoutF = insertCounterWriteout(CountersBySP); - Function *ResetF = insertReset(CountersBySP); - - // Create a small bit of code that registers the "__llvm_gcov_writeout" to - // be executed at exit and the "__llvm_gcov_flush" function to be executed - // when "__gcov_flush" is called. - FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); - Function *F = Function::Create(FTy, GlobalValue::InternalLinkage, - "__llvm_gcov_init", M); - F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - F->setLinkage(GlobalValue::InternalLinkage); - F->addFnAttr(Attribute::NoInline); - if (Options.NoRedZone) - F->addFnAttr(Attribute::NoRedZone); - - BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F); - IRBuilder<> Builder(BB); - - FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); - auto *PFTy = PointerType::get(FTy, 0); - FTy = FunctionType::get(Builder.getVoidTy(), {PFTy, PFTy}, false); - - // Initialize the environment and register the local writeout, flush and - // reset functions. - FunctionCallee GCOVInit = M->getOrInsertFunction("llvm_gcov_init", FTy); - Builder.CreateCall(GCOVInit, {WriteoutF, ResetF}); - Builder.CreateRetVoid(); +void GCOVProfiler::emitGlobalConstructor( + SmallVectorImpl> &CountersBySP) { + Function *WriteoutF = insertCounterWriteout(CountersBySP); + Function *ResetF = insertReset(CountersBySP); - appendToGlobalCtors(*M, F, 0); - Result = true; - } + // Create a small bit of code that registers the "__llvm_gcov_writeout" to + // be executed at exit and the "__llvm_gcov_flush" function to be executed + // when "__gcov_flush" is called. + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + Function *F = Function::Create(FTy, GlobalValue::InternalLinkage, + "__llvm_gcov_init", M); + F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + F->setLinkage(GlobalValue::InternalLinkage); + F->addFnAttr(Attribute::NoInline); + if (Options.NoRedZone) + F->addFnAttr(Attribute::NoRedZone); + + BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F); + IRBuilder<> Builder(BB); + + FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + auto *PFTy = PointerType::get(FTy, 0); + FTy = FunctionType::get(Builder.getVoidTy(), {PFTy, PFTy}, false); + + // Initialize the environment and register the local writeout, flush and + // reset functions. 
+ FunctionCallee GCOVInit = M->getOrInsertFunction("llvm_gcov_init", FTy); + Builder.CreateCall(GCOVInit, {WriteoutF, ResetF}); + Builder.CreateRetVoid(); - return Result; + appendToGlobalCtors(*M, F, 0); } FunctionCallee GCOVProfiler::getStartFileFunc(const TargetLibraryInfo *TLI) { @@ -1037,15 +1144,19 @@ Function *GCOVProfiler::insertCounterWriteout( // Collect the relevant data into a large constant data structure that we can // walk to write out everything. StructType *StartFileCallArgsTy = StructType::create( - {Builder.getInt8PtrTy(), Builder.getInt32Ty(), Builder.getInt32Ty()}); + {Builder.getInt8PtrTy(), Builder.getInt32Ty(), Builder.getInt32Ty()}, + "start_file_args_ty"); StructType *EmitFunctionCallArgsTy = StructType::create( - {Builder.getInt32Ty(), Builder.getInt32Ty(), Builder.getInt32Ty()}); + {Builder.getInt32Ty(), Builder.getInt32Ty(), Builder.getInt32Ty()}, + "emit_function_args_ty"); StructType *EmitArcsCallArgsTy = StructType::create( - {Builder.getInt32Ty(), Builder.getInt64Ty()->getPointerTo()}); + {Builder.getInt32Ty(), Builder.getInt64Ty()->getPointerTo()}, + "emit_arcs_args_ty"); StructType *FileInfoTy = StructType::create({StartFileCallArgsTy, Builder.getInt32Ty(), EmitFunctionCallArgsTy->getPointerTo(), - EmitArcsCallArgsTy->getPointerTo()}); + EmitArcsCallArgsTy->getPointerTo()}, + "file_info"); Constant *Zero32 = Builder.getInt32(0); // Build an explicit array of two zeros for use in ConstantExpr GEP building. @@ -1155,41 +1266,46 @@ Function *GCOVProfiler::insertCounterWriteout( // The index into the files structure is our loop induction variable. Builder.SetInsertPoint(FileLoopHeader); - PHINode *IV = - Builder.CreatePHI(Builder.getInt32Ty(), /*NumReservedValues*/ 2); + PHINode *IV = Builder.CreatePHI(Builder.getInt32Ty(), /*NumReservedValues*/ 2, + "file_idx"); IV->addIncoming(Builder.getInt32(0), BB); auto *FileInfoPtr = Builder.CreateInBoundsGEP( FileInfoArrayTy, FileInfoArrayGV, {Builder.getInt32(0), IV}); auto *StartFileCallArgsPtr = - Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 0); + Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 0, "start_file_args"); auto *StartFileCall = Builder.CreateCall( StartFile, {Builder.CreateLoad(StartFileCallArgsTy->getElementType(0), Builder.CreateStructGEP(StartFileCallArgsTy, - StartFileCallArgsPtr, 0)), + StartFileCallArgsPtr, 0), + "filename"), Builder.CreateLoad(StartFileCallArgsTy->getElementType(1), Builder.CreateStructGEP(StartFileCallArgsTy, - StartFileCallArgsPtr, 1)), + StartFileCallArgsPtr, 1), + "version"), Builder.CreateLoad(StartFileCallArgsTy->getElementType(2), Builder.CreateStructGEP(StartFileCallArgsTy, - StartFileCallArgsPtr, 2))}); + StartFileCallArgsPtr, 2), + "stamp")}); if (auto AK = TLI->getExtAttrForI32Param(false)) StartFileCall->addParamAttr(2, AK); - auto *NumCounters = - Builder.CreateLoad(FileInfoTy->getElementType(1), - Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 1)); + auto *NumCounters = Builder.CreateLoad( + FileInfoTy->getElementType(1), + Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 1), "num_ctrs"); auto *EmitFunctionCallArgsArray = Builder.CreateLoad(FileInfoTy->getElementType(2), - Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 2)); - auto *EmitArcsCallArgsArray = - Builder.CreateLoad(FileInfoTy->getElementType(3), - Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 3)); + Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 2), + "emit_function_args"); + auto *EmitArcsCallArgsArray = Builder.CreateLoad( + FileInfoTy->getElementType(3), + 
Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 3), "emit_arcs_args"); auto *EnterCounterLoopCond = Builder.CreateICmpSLT(Builder.getInt32(0), NumCounters); Builder.CreateCondBr(EnterCounterLoopCond, CounterLoopHeader, FileLoopLatch); Builder.SetInsertPoint(CounterLoopHeader); - auto *JV = Builder.CreatePHI(Builder.getInt32Ty(), /*NumReservedValues*/ 2); + auto *JV = Builder.CreatePHI(Builder.getInt32Ty(), /*NumReservedValues*/ 2, + "ctr_idx"); JV->addIncoming(Builder.getInt32(0), FileLoopHeader); auto *EmitFunctionCallArgsPtr = Builder.CreateInBoundsGEP( EmitFunctionCallArgsTy, EmitFunctionCallArgsArray, JV); @@ -1197,14 +1313,16 @@ Function *GCOVProfiler::insertCounterWriteout( EmitFunction, {Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(0), Builder.CreateStructGEP(EmitFunctionCallArgsTy, - EmitFunctionCallArgsPtr, 0)), + EmitFunctionCallArgsPtr, 0), + "ident"), Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(1), Builder.CreateStructGEP(EmitFunctionCallArgsTy, - EmitFunctionCallArgsPtr, 1)), + EmitFunctionCallArgsPtr, 1), + "func_checkssum"), Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(2), Builder.CreateStructGEP(EmitFunctionCallArgsTy, - EmitFunctionCallArgsPtr, - 2))}); + EmitFunctionCallArgsPtr, 2), + "cfg_checksum")}); if (auto AK = TLI->getExtAttrForI32Param(false)) { EmitFunctionCall->addParamAttr(0, AK); EmitFunctionCall->addParamAttr(1, AK); @@ -1216,10 +1334,12 @@ Function *GCOVProfiler::insertCounterWriteout( EmitArcs, {Builder.CreateLoad( EmitArcsCallArgsTy->getElementType(0), - Builder.CreateStructGEP(EmitArcsCallArgsTy, EmitArcsCallArgsPtr, 0)), - Builder.CreateLoad(EmitArcsCallArgsTy->getElementType(1), - Builder.CreateStructGEP(EmitArcsCallArgsTy, - EmitArcsCallArgsPtr, 1))}); + Builder.CreateStructGEP(EmitArcsCallArgsTy, EmitArcsCallArgsPtr, 0), + "num_counters"), + Builder.CreateLoad( + EmitArcsCallArgsTy->getElementType(1), + Builder.CreateStructGEP(EmitArcsCallArgsTy, EmitArcsCallArgsPtr, 1), + "counters")}); if (auto AK = TLI->getExtAttrForI32Param(false)) EmitArcsCall->addParamAttr(0, AK); auto *NextJV = Builder.CreateAdd(JV, Builder.getInt32(1)); @@ -1230,7 +1350,7 @@ Function *GCOVProfiler::insertCounterWriteout( Builder.SetInsertPoint(FileLoopLatch); Builder.CreateCall(SummaryInfo, {}); Builder.CreateCall(EndFile, {}); - auto *NextIV = Builder.CreateAdd(IV, Builder.getInt32(1)); + auto *NextIV = Builder.CreateAdd(IV, Builder.getInt32(1), "next_file_idx"); auto *FileLoopCond = Builder.CreateICmpSLT(NextIV, Builder.getInt32(FileInfos.size())); Builder.CreateCondBr(FileLoopCond, FileLoopHeader, ExitBB); diff --git a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp index 5cf3c2e3e11b3..cfdf3cad97f73 100644 --- a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -105,8 +105,8 @@ Comdat *llvm::GetOrCreateFunctionComdat(Function &F, Triple &T, void llvm::initializeInstrumentation(PassRegistry &Registry) { initializeAddressSanitizerLegacyPassPass(Registry); initializeModuleAddressSanitizerLegacyPassPass(Registry); - initializeHeapProfilerLegacyPassPass(Registry); - initializeModuleHeapProfilerLegacyPassPass(Registry); + initializeMemProfilerLegacyPassPass(Registry); + initializeModuleMemProfilerLegacyPassPass(Registry); initializeBoundsCheckingLegacyPassPass(Registry); initializeControlHeightReductionLegacyPassPass(Registry); initializeGCOVProfilerLegacyPassPass(Registry); diff --git 
a/llvm/lib/Transforms/Instrumentation/HeapProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp similarity index 68% rename from llvm/lib/Transforms/Instrumentation/HeapProfiler.cpp rename to llvm/lib/Transforms/Instrumentation/MemProfiler.cpp index 5f8671d7d88fc..7f2a5ae1a189a 100644 --- a/llvm/lib/Transforms/Instrumentation/HeapProfiler.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp @@ -1,4 +1,4 @@ -//===- HeapProfiler.cpp - heap allocation and access profiler -------------===// +//===- MemProfiler.cpp - memory allocation and access profiler ------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,15 +6,15 @@ // //===----------------------------------------------------------------------===// // -// This file is a part of HeapProfiler. Memory accesses are instrumented +// This file is a part of MemProfiler. Memory accesses are instrumented // to increment the access count held in a shadow memory location, or // alternatively to call into the runtime. Memory intrinsic calls (memmove, -// memcpy, memset) are changed to call the heap profiling runtime version +// memcpy, memset) are changed to call the memory profiling runtime version // instead. // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Instrumentation/HeapProfiler.h" +#include "llvm/Transforms/Instrumentation/MemProfiler.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" @@ -39,9 +39,9 @@ using namespace llvm; -#define DEBUG_TYPE "heapprof" +#define DEBUG_TYPE "memprof" -constexpr int LLVM_HEAP_PROFILER_VERSION = 1; +constexpr int LLVM_MEM_PROFILER_VERSION = 1; // Size of memory mapped to a single shadow location. constexpr uint64_t DefaultShadowGranularity = 64; @@ -49,74 +49,74 @@ constexpr uint64_t DefaultShadowGranularity = 64; // Scale from granularity down to shadow size. constexpr uint64_t DefaultShadowScale = 3; -constexpr char HeapProfModuleCtorName[] = "heapprof.module_ctor"; -constexpr uint64_t HeapProfCtorAndDtorPriority = 1; +constexpr char MemProfModuleCtorName[] = "memprof.module_ctor"; +constexpr uint64_t MemProfCtorAndDtorPriority = 1; // On Emscripten, the system needs more than one priorities for constructors. -constexpr uint64_t HeapProfEmscriptenCtorAndDtorPriority = 50; -constexpr char HeapProfInitName[] = "__heapprof_init"; -constexpr char HeapProfVersionCheckNamePrefix[] = - "__heapprof_version_mismatch_check_v"; +constexpr uint64_t MemProfEmscriptenCtorAndDtorPriority = 50; +constexpr char MemProfInitName[] = "__memprof_init"; +constexpr char MemProfVersionCheckNamePrefix[] = + "__memprof_version_mismatch_check_v"; -constexpr char HeapProfShadowMemoryDynamicAddress[] = - "__heapprof_shadow_memory_dynamic_address"; +constexpr char MemProfShadowMemoryDynamicAddress[] = + "__memprof_shadow_memory_dynamic_address"; // Command-line flags. static cl::opt ClInsertVersionCheck( - "heapprof-guard-against-version-mismatch", + "memprof-guard-against-version-mismatch", cl::desc("Guard against compiler/runtime version mismatch."), cl::Hidden, cl::init(true)); // This flag may need to be replaced with -f[no-]memprof-reads. 
-static cl::opt ClInstrumentReads("heapprof-instrument-reads", +static cl::opt ClInstrumentReads("memprof-instrument-reads", cl::desc("instrument read instructions"), cl::Hidden, cl::init(true)); static cl::opt - ClInstrumentWrites("heapprof-instrument-writes", + ClInstrumentWrites("memprof-instrument-writes", cl::desc("instrument write instructions"), cl::Hidden, cl::init(true)); static cl::opt ClInstrumentAtomics( - "heapprof-instrument-atomics", + "memprof-instrument-atomics", cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden, cl::init(true)); static cl::opt ClUseCalls( - "heapprof-use-callbacks", + "memprof-use-callbacks", cl::desc("Use callbacks instead of inline instrumentation sequences."), cl::Hidden, cl::init(false)); static cl::opt - ClMemoryAccessCallbackPrefix("heapprof-memory-access-callback-prefix", + ClMemoryAccessCallbackPrefix("memprof-memory-access-callback-prefix", cl::desc("Prefix for memory access callbacks"), - cl::Hidden, cl::init("__heapprof_")); + cl::Hidden, cl::init("__memprof_")); // These flags allow to change the shadow mapping. // The shadow mapping looks like // Shadow = ((Mem & mask) >> scale) + offset -static cl::opt ClMappingScale("heapprof-mapping-scale", - cl::desc("scale of heapprof shadow mapping"), +static cl::opt ClMappingScale("memprof-mapping-scale", + cl::desc("scale of memprof shadow mapping"), cl::Hidden, cl::init(DefaultShadowScale)); static cl::opt - ClMappingGranularity("heapprof-mapping-granularity", - cl::desc("granularity of heapprof shadow mapping"), + ClMappingGranularity("memprof-mapping-granularity", + cl::desc("granularity of memprof shadow mapping"), cl::Hidden, cl::init(DefaultShadowGranularity)); // Debug flags. -static cl::opt ClDebug("heapprof-debug", cl::desc("debug"), cl::Hidden, +static cl::opt ClDebug("memprof-debug", cl::desc("debug"), cl::Hidden, cl::init(0)); -static cl::opt ClDebugFunc("heapprof-debug-func", cl::Hidden, +static cl::opt ClDebugFunc("memprof-debug-func", cl::Hidden, cl::desc("Debug func")); -static cl::opt ClDebugMin("heapprof-debug-min", cl::desc("Debug min inst"), +static cl::opt ClDebugMin("memprof-debug-min", cl::desc("Debug min inst"), cl::Hidden, cl::init(-1)); -static cl::opt ClDebugMax("heapprof-debug-max", cl::desc("Debug max inst"), +static cl::opt ClDebugMax("memprof-debug-max", cl::desc("Debug max inst"), cl::Hidden, cl::init(-1)); STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); @@ -139,8 +139,8 @@ struct ShadowMapping { }; static uint64_t getCtorAndDtorPriority(Triple &TargetTriple) { - return TargetTriple.isOSEmscripten() ? HeapProfEmscriptenCtorAndDtorPriority - : HeapProfCtorAndDtorPriority; + return TargetTriple.isOSEmscripten() ? MemProfEmscriptenCtorAndDtorPriority + : MemProfCtorAndDtorPriority; } struct InterestingMemoryAccess { @@ -151,10 +151,10 @@ struct InterestingMemoryAccess { Value *MaybeMask = nullptr; }; -/// Instrument the code in module to profile heap accesses. -class HeapProfiler { +/// Instrument the code in module to profile memory accesses. 
+class MemProfiler { public: - HeapProfiler(Module &M) { + MemProfiler(Module &M) { C = &(M.getContext()); LongSize = M.getDataLayout().getPointerSizeInBits(); IntptrTy = Type::getIntNTy(*C, LongSize); @@ -177,7 +177,7 @@ class HeapProfiler { void instrumentMemIntrinsic(MemIntrinsic *MI); Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); bool instrumentFunction(Function &F); - bool maybeInsertHeapProfInitAtFunctionEntry(Function &F); + bool maybeInsertMemProfInitAtFunctionEntry(Function &F); bool insertDynamicShadowAtFunctionEntry(Function &F); private: @@ -189,68 +189,67 @@ class HeapProfiler { ShadowMapping Mapping; // These arrays is indexed by AccessIsWrite - FunctionCallee HeapProfMemoryAccessCallback[2]; - FunctionCallee HeapProfMemoryAccessCallbackSized[2]; + FunctionCallee MemProfMemoryAccessCallback[2]; + FunctionCallee MemProfMemoryAccessCallbackSized[2]; - FunctionCallee HeapProfMemmove, HeapProfMemcpy, HeapProfMemset; + FunctionCallee MemProfMemmove, MemProfMemcpy, MemProfMemset; Value *DynamicShadowOffset = nullptr; }; -class HeapProfilerLegacyPass : public FunctionPass { +class MemProfilerLegacyPass : public FunctionPass { public: static char ID; - explicit HeapProfilerLegacyPass() : FunctionPass(ID) { - initializeHeapProfilerLegacyPassPass(*PassRegistry::getPassRegistry()); + explicit MemProfilerLegacyPass() : FunctionPass(ID) { + initializeMemProfilerLegacyPassPass(*PassRegistry::getPassRegistry()); } - StringRef getPassName() const override { return "HeapProfilerFunctionPass"; } + StringRef getPassName() const override { return "MemProfilerFunctionPass"; } bool runOnFunction(Function &F) override { - HeapProfiler Profiler(*F.getParent()); + MemProfiler Profiler(*F.getParent()); return Profiler.instrumentFunction(F); } }; -class ModuleHeapProfiler { +class ModuleMemProfiler { public: - ModuleHeapProfiler(Module &M) { TargetTriple = Triple(M.getTargetTriple()); } + ModuleMemProfiler(Module &M) { TargetTriple = Triple(M.getTargetTriple()); } bool instrumentModule(Module &); private: Triple TargetTriple; ShadowMapping Mapping; - Function *HeapProfCtorFunction = nullptr; + Function *MemProfCtorFunction = nullptr; }; -class ModuleHeapProfilerLegacyPass : public ModulePass { +class ModuleMemProfilerLegacyPass : public ModulePass { public: static char ID; - explicit ModuleHeapProfilerLegacyPass() : ModulePass(ID) { - initializeModuleHeapProfilerLegacyPassPass( - *PassRegistry::getPassRegistry()); + explicit ModuleMemProfilerLegacyPass() : ModulePass(ID) { + initializeModuleMemProfilerLegacyPassPass(*PassRegistry::getPassRegistry()); } - StringRef getPassName() const override { return "ModuleHeapProfiler"; } + StringRef getPassName() const override { return "ModuleMemProfiler"; } void getAnalysisUsage(AnalysisUsage &AU) const override {} bool runOnModule(Module &M) override { - ModuleHeapProfiler HeapProfiler(M); - return HeapProfiler.instrumentModule(M); + ModuleMemProfiler MemProfiler(M); + return MemProfiler.instrumentModule(M); } }; } // end anonymous namespace -HeapProfilerPass::HeapProfilerPass() {} +MemProfilerPass::MemProfilerPass() {} -PreservedAnalyses HeapProfilerPass::run(Function &F, - AnalysisManager &AM) { +PreservedAnalyses MemProfilerPass::run(Function &F, + AnalysisManager &AM) { Module &M = *F.getParent(); - HeapProfiler Profiler(M); + MemProfiler Profiler(M); if (Profiler.instrumentFunction(F)) return PreservedAnalyses::none(); return PreservedAnalyses::all(); @@ -258,41 +257,41 @@ PreservedAnalyses HeapProfilerPass::run(Function &F, return 
PreservedAnalyses::all(); } -ModuleHeapProfilerPass::ModuleHeapProfilerPass() {} +ModuleMemProfilerPass::ModuleMemProfilerPass() {} -PreservedAnalyses ModuleHeapProfilerPass::run(Module &M, - AnalysisManager &AM) { - ModuleHeapProfiler Profiler(M); +PreservedAnalyses ModuleMemProfilerPass::run(Module &M, + AnalysisManager &AM) { + ModuleMemProfiler Profiler(M); if (Profiler.instrumentModule(M)) return PreservedAnalyses::none(); return PreservedAnalyses::all(); } -char HeapProfilerLegacyPass::ID = 0; +char MemProfilerLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN(HeapProfilerLegacyPass, "heapprof", - "HeapProfiler: profile heap allocations and accesses.", +INITIALIZE_PASS_BEGIN(MemProfilerLegacyPass, "memprof", + "MemProfiler: profile memory allocations and accesses.", false, false) -INITIALIZE_PASS_END(HeapProfilerLegacyPass, "heapprof", - "HeapProfiler: profile heap allocations and accesses.", +INITIALIZE_PASS_END(MemProfilerLegacyPass, "memprof", + "MemProfiler: profile memory allocations and accesses.", false, false) -FunctionPass *llvm::createHeapProfilerFunctionPass() { - return new HeapProfilerLegacyPass(); +FunctionPass *llvm::createMemProfilerFunctionPass() { + return new MemProfilerLegacyPass(); } -char ModuleHeapProfilerLegacyPass::ID = 0; +char ModuleMemProfilerLegacyPass::ID = 0; -INITIALIZE_PASS(ModuleHeapProfilerLegacyPass, "heapprof-module", - "HeapProfiler: profile heap allocations and accesses." +INITIALIZE_PASS(ModuleMemProfilerLegacyPass, "memprof-module", + "MemProfiler: profile memory allocations and accesses." "ModulePass", false, false) -ModulePass *llvm::createModuleHeapProfilerLegacyPassPass() { - return new ModuleHeapProfilerLegacyPass(); +ModulePass *llvm::createModuleMemProfilerLegacyPassPass() { + return new ModuleMemProfilerLegacyPass(); } -Value *HeapProfiler::memToShadow(Value *Shadow, IRBuilder<> &IRB) { +Value *MemProfiler::memToShadow(Value *Shadow, IRBuilder<> &IRB) { // (Shadow & mask) >> scale Shadow = IRB.CreateAnd(Shadow, Mapping.Mask); Shadow = IRB.CreateLShr(Shadow, Mapping.Scale); @@ -302,17 +301,17 @@ Value *HeapProfiler::memToShadow(Value *Shadow, IRBuilder<> &IRB) { } // Instrument memset/memmove/memcpy -void HeapProfiler::instrumentMemIntrinsic(MemIntrinsic *MI) { +void MemProfiler::instrumentMemIntrinsic(MemIntrinsic *MI) { IRBuilder<> IRB(MI); if (isa(MI)) { IRB.CreateCall( - isa(MI) ? HeapProfMemmove : HeapProfMemcpy, + isa(MI) ? MemProfMemmove : MemProfMemcpy, {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()), IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)}); } else if (isa(MI)) { IRB.CreateCall( - HeapProfMemset, + MemProfMemset, {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false), IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)}); @@ -321,7 +320,7 @@ void HeapProfiler::instrumentMemIntrinsic(MemIntrinsic *MI) { } Optional -HeapProfiler::isInterestingMemoryAccess(Instruction *I) const { +MemProfiler::isInterestingMemoryAccess(Instruction *I) const { // Do not instrument the load fetching the dynamic shadow address. 
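memToShadow above is the usual sanitizer-style mapping; with this file's
defaults (64-byte granularity, scale 3) every 64-byte window of application
memory shares one 8-byte shadow counter. The arithmetic as a standalone sketch
(the mask derivation from the granularity is an assumption the defaults imply):

    #include <cstdint>

    constexpr uint64_t Granularity = 64;          // bytes per shadow cell
    constexpr uint64_t Scale = 3;                 // >> 3: 8 shadow bytes/cell
    constexpr uint64_t Mask = ~(Granularity - 1); // align down to the window

    uint64_t memToShadow(uint64_t Addr, uint64_t DynamicShadowBase) {
      return ((Addr & Mask) >> Scale) + DynamicShadowBase;
    }
    // Every address in [0x1000, 0x103f] lands on the same cell:
    //   ((0x1000 & ~63ull) >> 3) + Base == 0x200 + Base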
if (DynamicShadowOffset == I) return None; @@ -409,11 +408,10 @@ HeapProfiler::isInterestingMemoryAccess(Instruction *I) const { return Access; } -void HeapProfiler::instrumentMaskedLoadOrStore(const DataLayout &DL, - Value *Mask, Instruction *I, - Value *Addr, unsigned Alignment, - uint32_t TypeSize, - bool IsWrite) { +void MemProfiler::instrumentMaskedLoadOrStore(const DataLayout &DL, Value *Mask, + Instruction *I, Value *Addr, + unsigned Alignment, + uint32_t TypeSize, bool IsWrite) { auto *VTy = cast( cast(Addr->getType())->getElementType()); uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType()); @@ -446,8 +444,8 @@ void HeapProfiler::instrumentMaskedLoadOrStore(const DataLayout &DL, } } -void HeapProfiler::instrumentMop(Instruction *I, const DataLayout &DL, - InterestingMemoryAccess &Access) { +void MemProfiler::instrumentMop(Instruction *I, const DataLayout &DL, + InterestingMemoryAccess &Access) { if (Access.IsWrite) NumInstrumentedWrites++; else @@ -465,14 +463,14 @@ void HeapProfiler::instrumentMop(Instruction *I, const DataLayout &DL, } } -void HeapProfiler::instrumentAddress(Instruction *OrigIns, - Instruction *InsertBefore, Value *Addr, - uint32_t TypeSize, bool IsWrite) { +void MemProfiler::instrumentAddress(Instruction *OrigIns, + Instruction *InsertBefore, Value *Addr, + uint32_t TypeSize, bool IsWrite) { IRBuilder<> IRB(InsertBefore); Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); if (ClUseCalls) { - IRB.CreateCall(HeapProfMemoryAccessCallback[IsWrite], AddrLong); + IRB.CreateCall(MemProfMemoryAccessCallback[IsWrite], AddrLong); return; } @@ -488,24 +486,24 @@ void HeapProfiler::instrumentAddress(Instruction *OrigIns, IRB.CreateStore(ShadowValue, ShadowAddr); } -bool ModuleHeapProfiler::instrumentModule(Module &M) { +bool ModuleMemProfiler::instrumentModule(Module &M) { // Create a module constructor. - std::string HeapProfVersion = std::to_string(LLVM_HEAP_PROFILER_VERSION); + std::string MemProfVersion = std::to_string(LLVM_MEM_PROFILER_VERSION); std::string VersionCheckName = - ClInsertVersionCheck ? (HeapProfVersionCheckNamePrefix + HeapProfVersion) + ClInsertVersionCheck ? 
(MemProfVersionCheckNamePrefix + MemProfVersion) : ""; - std::tie(HeapProfCtorFunction, std::ignore) = - createSanitizerCtorAndInitFunctions(M, HeapProfModuleCtorName, - HeapProfInitName, /*InitArgTypes=*/{}, + std::tie(MemProfCtorFunction, std::ignore) = + createSanitizerCtorAndInitFunctions(M, MemProfModuleCtorName, + MemProfInitName, /*InitArgTypes=*/{}, /*InitArgs=*/{}, VersionCheckName); const uint64_t Priority = getCtorAndDtorPriority(TargetTriple); - appendToGlobalCtors(M, HeapProfCtorFunction, Priority); + appendToGlobalCtors(M, MemProfCtorFunction, Priority); return true; } -void HeapProfiler::initializeCallbacks(Module &M) { +void MemProfiler::initializeCallbacks(Module &M) { IRBuilder<> IRB(*C); for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) { @@ -513,68 +511,68 @@ void HeapProfiler::initializeCallbacks(Module &M) { SmallVector Args2 = {IntptrTy, IntptrTy}; SmallVector Args1{1, IntptrTy}; - HeapProfMemoryAccessCallbackSized[AccessIsWrite] = + MemProfMemoryAccessCallbackSized[AccessIsWrite] = M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + TypeStr + "N", FunctionType::get(IRB.getVoidTy(), Args2, false)); - HeapProfMemoryAccessCallback[AccessIsWrite] = + MemProfMemoryAccessCallback[AccessIsWrite] = M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + TypeStr, FunctionType::get(IRB.getVoidTy(), Args1, false)); } - HeapProfMemmove = M.getOrInsertFunction( + MemProfMemmove = M.getOrInsertFunction( ClMemoryAccessCallbackPrefix + "memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy); - HeapProfMemcpy = M.getOrInsertFunction( - ClMemoryAccessCallbackPrefix + "memcpy", IRB.getInt8PtrTy(), - IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy); - HeapProfMemset = M.getOrInsertFunction( - ClMemoryAccessCallbackPrefix + "memset", IRB.getInt8PtrTy(), - IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy); + MemProfMemcpy = M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + "memcpy", + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IntptrTy); + MemProfMemset = M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + "memset", + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt32Ty(), IntptrTy); } -bool HeapProfiler::maybeInsertHeapProfInitAtFunctionEntry(Function &F) { +bool MemProfiler::maybeInsertMemProfInitAtFunctionEntry(Function &F) { // For each NSObject descendant having a +load method, this method is invoked // by the ObjC runtime before any of the static constructors is called. - // Therefore we need to instrument such methods with a call to __heapprof_init + // Therefore we need to instrument such methods with a call to __memprof_init // at the beginning in order to initialize our runtime before any access to // the shadow memory. // We cannot just ignore these methods, because they may call other // instrumented functions. 
if (F.getName().find(" load]") != std::string::npos) { - FunctionCallee HeapProfInitFunction = - declareSanitizerInitFunction(*F.getParent(), HeapProfInitName, {}); + FunctionCallee MemProfInitFunction = + declareSanitizerInitFunction(*F.getParent(), MemProfInitName, {}); IRBuilder<> IRB(&F.front(), F.front().begin()); - IRB.CreateCall(HeapProfInitFunction, {}); + IRB.CreateCall(MemProfInitFunction, {}); return true; } return false; } -bool HeapProfiler::insertDynamicShadowAtFunctionEntry(Function &F) { +bool MemProfiler::insertDynamicShadowAtFunctionEntry(Function &F) { IRBuilder<> IRB(&F.front().front()); Value *GlobalDynamicAddress = F.getParent()->getOrInsertGlobal( - HeapProfShadowMemoryDynamicAddress, IntptrTy); + MemProfShadowMemoryDynamicAddress, IntptrTy); DynamicShadowOffset = IRB.CreateLoad(IntptrTy, GlobalDynamicAddress); return true; } -bool HeapProfiler::instrumentFunction(Function &F) { +bool MemProfiler::instrumentFunction(Function &F) { if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false; if (ClDebugFunc == F.getName()) return false; - if (F.getName().startswith("__heapprof_")) + if (F.getName().startswith("__memprof_")) return false; bool FunctionModified = false; - // If needed, insert __heapprof_init. + // If needed, insert __memprof_init. // This function needs to be called even if the function body is not // instrumented. - if (maybeInsertHeapProfInitAtFunctionEntry(F)) + if (maybeInsertMemProfInitAtFunctionEntry(F)) FunctionModified = true; - LLVM_DEBUG(dbgs() << "HEAPPROF instrumenting:\n" << F << "\n"); + LLVM_DEBUG(dbgs() << "MEMPROF instrumenting:\n" << F << "\n"); initializeCallbacks(*F.getParent()); @@ -607,8 +605,8 @@ bool HeapProfiler::instrumentFunction(Function &F) { if (NumInstrumented > 0) FunctionModified = true; - LLVM_DEBUG(dbgs() << "HEAPPROF done instrumenting: " << FunctionModified - << " " << F << "\n"); + LLVM_DEBUG(dbgs() << "MEMPROF done instrumenting: " << FunctionModified << " " + << F << "\n"); return FunctionModified; } diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index be2e091e8c08f..dd70c1f77d9c1 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -807,8 +807,11 @@ BasicBlock *FuncPGOInstrumentation::getInstrBB(Edge *E) { if (!E->IsCritical) return canInstrument(DestBB); + // Some IndirectBr critical edges cannot be split by the previous + // SplitIndirectBrCriticalEdges call. Bail out. unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB); - BasicBlock *InstrBB = SplitCriticalEdge(TI, SuccNum); + BasicBlock *InstrBB = + isa(TI) ? 
nullptr : SplitCriticalEdge(TI, SuccNum); if (!InstrBB) { LLVM_DEBUG( dbgs() << "Fail to split critical edge: not instrument this edge.\n"); diff --git a/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp b/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp index 6f785687b5045..fc5267261851d 100644 --- a/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp +++ b/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp @@ -295,7 +295,7 @@ static bool rewrite(Function &F) { } SmallVector Checks; - if (propagatesPoison(&I)) + if (propagatesPoison(cast(&I))) for (Value *V : I.operands()) Checks.push_back(getPoisonFor(ValToPoison, V)); diff --git a/llvm/lib/Transforms/LLVMBuild.txt b/llvm/lib/Transforms/LLVMBuild.txt index 5fb5efcc068c8..6c6a6bb317fa8 100644 --- a/llvm/lib/Transforms/LLVMBuild.txt +++ b/llvm/lib/Transforms/LLVMBuild.txt @@ -15,7 +15,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = AggressiveInstCombine Coroutines IPO InstCombine Instrumentation Scalar Utils Vectorize ObjCARC CFGuard +subdirectories = AggressiveInstCombine Coroutines HelloNew IPO InstCombine Instrumentation Scalar Utils Vectorize ObjCARC CFGuard [component_0] type = Group diff --git a/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h index 8fd842fd42d64..9e18052641a13 100644 --- a/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h +++ b/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h @@ -26,12 +26,12 @@ #define LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H #include "llvm/ADT/DenseMap.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/IR/ValueHandle.h" #include namespace llvm { +class AAResults; class DataLayout; class PHINode; class SelectInst; @@ -49,7 +49,7 @@ namespace objcarc { /// not two pointers have the same provenance source and thus could /// potentially be related. class ProvenanceAnalysis { - AliasAnalysis *AA; + AAResults *AA; using ValuePairTy = std::pair; using CachedResultsTy = DenseMap; @@ -67,9 +67,9 @@ class ProvenanceAnalysis { ProvenanceAnalysis(const ProvenanceAnalysis &) = delete; ProvenanceAnalysis &operator=(const ProvenanceAnalysis &) = delete; - void setAA(AliasAnalysis *aa) { AA = aa; } + void setAA(AAResults *aa) { AA = aa; } - AliasAnalysis *getAA() const { return AA; } + AAResults *getAA() const { return AA; } bool related(const Value *A, const Value *B, const DataLayout &DL); diff --git a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index 5c008585869cd..bccf94fc217fe 100644 --- a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -15,6 +15,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/IR/Instructions.h" #include "llvm/InitializePasses.h" #define AA_NAME "alignment-from-assumptions" #define DEBUG_TYPE AA_NAME @@ -203,103 +204,33 @@ static Align getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV, } bool AlignmentFromAssumptionsPass::extractAlignmentInfo(CallInst *I, + unsigned Idx, Value *&AAPtr, const SCEV *&AlignSCEV, const SCEV *&OffSCEV) { - // An alignment assume must be a statement about the least-significant - // bits of the pointer being zero, possibly with some offset. 
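The rewrite below drops the old pattern-match over an
icmp eq (and (ptrtoint %p), mask), 0 chain: alignment assumptions now arrive as
an "align" operand bundle whose inputs are the pointer, the alignment, and an
optional offset, as the Inputs[0..2] accesses indicate (illustrative IR in a
comment):

    // call void @llvm.assume(i1 true) ["align"(i8* %p, i64 32, i64 24)]
    //                                   inputs: pointer, alignment,
    //                                           optional byte offset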
- ICmpInst *ICI = dyn_cast(I->getArgOperand(0)); - if (!ICI) + Type *Int64Ty = Type::getInt64Ty(I->getContext()); + OperandBundleUse AlignOB = I->getOperandBundleAt(Idx); + if (AlignOB.getTagName() != "align") return false; - - // This must be an expression of the form: x & m == 0. - if (ICI->getPredicate() != ICmpInst::ICMP_EQ) - return false; - - // Swap things around so that the RHS is 0. - Value *CmpLHS = ICI->getOperand(0); - Value *CmpRHS = ICI->getOperand(1); - const SCEV *CmpLHSSCEV = SE->getSCEV(CmpLHS); - const SCEV *CmpRHSSCEV = SE->getSCEV(CmpRHS); - if (CmpLHSSCEV->isZero()) - std::swap(CmpLHS, CmpRHS); - else if (!CmpRHSSCEV->isZero()) - return false; - - BinaryOperator *CmpBO = dyn_cast(CmpLHS); - if (!CmpBO || CmpBO->getOpcode() != Instruction::And) - return false; - - // Swap things around so that the right operand of the and is a constant - // (the mask); we cannot deal with variable masks. - Value *AndLHS = CmpBO->getOperand(0); - Value *AndRHS = CmpBO->getOperand(1); - const SCEV *AndLHSSCEV = SE->getSCEV(AndLHS); - const SCEV *AndRHSSCEV = SE->getSCEV(AndRHS); - if (isa(AndLHSSCEV)) { - std::swap(AndLHS, AndRHS); - std::swap(AndLHSSCEV, AndRHSSCEV); - } - - const SCEVConstant *MaskSCEV = dyn_cast(AndRHSSCEV); - if (!MaskSCEV) - return false; - - // The mask must have some trailing ones (otherwise the condition is - // trivial and tells us nothing about the alignment of the left operand). - unsigned TrailingOnes = MaskSCEV->getAPInt().countTrailingOnes(); - if (!TrailingOnes) - return false; - - // Cap the alignment at the maximum with which LLVM can deal (and make sure - // we don't overflow the shift). - uint64_t Alignment; - TrailingOnes = std::min(TrailingOnes, - unsigned(sizeof(unsigned) * CHAR_BIT - 1)); - Alignment = std::min(1u << TrailingOnes, +Value::MaximumAlignment); - - Type *Int64Ty = Type::getInt64Ty(I->getParent()->getParent()->getContext()); - AlignSCEV = SE->getConstant(Int64Ty, Alignment); - - // The LHS might be a ptrtoint instruction, or it might be the pointer - // with an offset. - AAPtr = nullptr; - OffSCEV = nullptr; - if (PtrToIntInst *PToI = dyn_cast(AndLHS)) { - AAPtr = PToI->getPointerOperand(); + assert(AlignOB.Inputs.size() >= 2); + AAPtr = AlignOB.Inputs[0].get(); + // TODO: Consider accumulating the offset to the base. + AAPtr = AAPtr->stripPointerCastsSameRepresentation(); + AlignSCEV = SE->getSCEV(AlignOB.Inputs[1].get()); + AlignSCEV = SE->getTruncateOrZeroExtend(AlignSCEV, Int64Ty); + if (AlignOB.Inputs.size() == 3) + OffSCEV = SE->getSCEV(AlignOB.Inputs[2].get()); + else OffSCEV = SE->getZero(Int64Ty); - } else if (const SCEVAddExpr* AndLHSAddSCEV = - dyn_cast(AndLHSSCEV)) { - // Try to find the ptrtoint; subtract it and the rest is the offset. - for (SCEVAddExpr::op_iterator J = AndLHSAddSCEV->op_begin(), - JE = AndLHSAddSCEV->op_end(); J != JE; ++J) - if (const SCEVUnknown *OpUnk = dyn_cast(*J)) - if (PtrToIntInst *PToI = dyn_cast(OpUnk->getValue())) { - AAPtr = PToI->getPointerOperand(); - OffSCEV = SE->getMinusSCEV(AndLHSAddSCEV, *J); - break; - } - } - - if (!AAPtr) - return false; - - // Sign extend the offset to 64 bits (so that it is like all of the other - // expressions). 
- unsigned OffSCEVBits = OffSCEV->getType()->getPrimitiveSizeInBits(); - if (OffSCEVBits < 64) - OffSCEV = SE->getSignExtendExpr(OffSCEV, Int64Ty); - else if (OffSCEVBits > 64) - return false; - - AAPtr = AAPtr->stripPointerCasts(); + OffSCEV = SE->getTruncateOrZeroExtend(OffSCEV, Int64Ty); return true; } -bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) { +bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall, + unsigned Idx) { Value *AAPtr; const SCEV *AlignSCEV, *OffSCEV; - if (!extractAlignmentInfo(ACall, AAPtr, AlignSCEV, OffSCEV)) + if (!extractAlignmentInfo(ACall, Idx, AAPtr, AlignSCEV, OffSCEV)) return false; // Skip ConstantPointerNull and UndefValue. Assumptions on these shouldn't @@ -317,13 +248,14 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) { continue; if (Instruction *K = dyn_cast(J)) - if (isValidAssumeForContext(ACall, K, DT)) WorkList.push_back(K); } while (!WorkList.empty()) { Instruction *J = WorkList.pop_back_val(); if (LoadInst *LI = dyn_cast(J)) { + if (!isValidAssumeForContext(ACall, J, DT)) + continue; Align NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV, LI->getPointerOperand(), SE); if (NewAlignment > LI->getAlign()) { @@ -331,6 +263,8 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) { ++NumLoadAlignChanged; } } else if (StoreInst *SI = dyn_cast(J)) { + if (!isValidAssumeForContext(ACall, J, DT)) + continue; Align NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV, SI->getPointerOperand(), SE); if (NewAlignment > SI->getAlign()) { @@ -338,6 +272,8 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) { ++NumStoreAlignChanged; } } else if (MemIntrinsic *MI = dyn_cast(J)) { + if (!isValidAssumeForContext(ACall, J, DT)) + continue; Align NewDestAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV, MI->getDest(), SE); @@ -369,7 +305,7 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) { Visited.insert(J); for (User *UJ : J->users()) { Instruction *K = cast(UJ); - if (!Visited.count(K) && isValidAssumeForContext(ACall, K, DT)) + if (!Visited.count(K)) WorkList.push_back(K); } } @@ -396,8 +332,11 @@ bool AlignmentFromAssumptionsPass::runImpl(Function &F, AssumptionCache &AC, bool Changed = false; for (auto &AssumeVH : AC.assumptions()) - if (AssumeVH) - Changed |= processAssumption(cast(AssumeVH)); + if (AssumeVH) { + CallInst *Call = cast(AssumeVH); + for (unsigned Idx = 0; Idx < Call->getNumOperandBundles(); Idx++) + Changed |= processAssumption(Call, Idx); + } return Changed; } diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt index 89173414c16b1..ae62aa0220724 100644 --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -4,6 +4,7 @@ add_llvm_component_library(LLVMScalarOpts BDCE.cpp CallSiteSplitting.cpp ConstantHoisting.cpp + ConstraintElimination.cpp CorrelatedValuePropagation.cpp DCE.cpp DeadStoreElimination.cpp diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp new file mode 100644 index 0000000000000..8500b831fda6a --- /dev/null +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -0,0 +1,310 @@ +//===-- ConstraintElimination.cpp - Eliminate conds using constraints. ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Eliminate conditions based on constraints collected from dominating +// conditions. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ConstraintSystem.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/DebugCounter.h" +#include "llvm/Transforms/Scalar.h" + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "constraint-elimination" + +STATISTIC(NumCondsRemoved, "Number of instructions removed"); +DEBUG_COUNTER(EliminatedCounter, "conds-eliminated", + "Controls which conditions are eliminated"); + +static int64_t MaxConstraintValue = std::numeric_limits::max(); + +Optional> decompose(Value *V) { + if (auto *CI = dyn_cast(V)) { + if (CI->isNegative() || CI->uge(MaxConstraintValue)) + return {}; + return {{CI->getSExtValue(), nullptr}}; + } + auto *GEP = dyn_cast(V); + if (GEP && GEP->getNumOperands() == 2 && + isa(GEP->getOperand(GEP->getNumOperands() - 1))) { + return {{cast(GEP->getOperand(GEP->getNumOperands() - 1)) + ->getSExtValue(), + GEP->getPointerOperand()}}; + } + return {{0, V}}; +} + +/// Turn a condition \p CmpI into a constraint vector, using indices from \p +/// Value2Index. If \p ShouldAdd is true, new indices are added for values not +/// yet in \p Value2Index. +static SmallVector +getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, + DenseMap &Value2Index, bool ShouldAdd) { + Value *A, *B; + + int64_t Offset1 = 0; + int64_t Offset2 = 0; + + auto TryToGetIndex = [ShouldAdd, + &Value2Index](Value *V) -> Optional { + if (ShouldAdd) { + Value2Index.insert({V, Value2Index.size() + 1}); + return Value2Index[V]; + } + auto I = Value2Index.find(V); + if (I == Value2Index.end()) + return None; + return I->second; + }; + + if (Pred == CmpInst::ICMP_UGT || Pred == CmpInst::ICMP_UGE) + return getConstraint(CmpInst::getSwappedPredicate(Pred), Op1, Op0, + Value2Index, ShouldAdd); + + if (Pred == CmpInst::ICMP_ULE || Pred == CmpInst::ICMP_ULT) { + auto ADec = decompose(Op0); + auto BDec = decompose(Op1); + if (!ADec || !BDec) + return {}; + std::tie(Offset1, A) = *ADec; + std::tie(Offset2, B) = *BDec; + Offset1 *= -1; + + if (!A && !B) + return {}; + + auto AIdx = A ? TryToGetIndex(A) : None; + auto BIdx = B ? TryToGetIndex(B) : None; + if ((A && !AIdx) || (B && !BIdx)) + return {}; + + SmallVector R(Value2Index.size() + 1, 0); + if (AIdx) + R[*AIdx] = 1; + if (BIdx) + R[*BIdx] = -1; + R[0] = Offset1 + Offset2 + (Pred == CmpInst::ICMP_ULT ? -1 : 0); + return R; + } + + return {}; +} + +static SmallVector +getConstraint(CmpInst *Cmp, DenseMap &Value2Index, + bool ShouldAdd) { + return getConstraint(Cmp->getPredicate(), Cmp->getOperand(0), + Cmp->getOperand(1), Value2Index, ShouldAdd); +} + +/// Represents either a condition that holds on entry to a block or a basic +/// block, with their respective Dominator DFS in and out numbers. 
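+/// (The DFS numbers give a cheap dominance test: an entry E dominates CB
+/// when E.NumIn <= CB.NumIn and CB.NumOut <= E.NumOut, which is how
+/// out-of-scope constraints are popped off the stack below.)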
+struct ConstraintOrBlock { + unsigned NumIn; + unsigned NumOut; + bool IsBlock; + bool Not; + union { + BasicBlock *BB; + CmpInst *Condition; + }; + + ConstraintOrBlock(DomTreeNode *DTN) + : NumIn(DTN->getDFSNumIn()), NumOut(DTN->getDFSNumOut()), IsBlock(true), + BB(DTN->getBlock()) {} + ConstraintOrBlock(DomTreeNode *DTN, CmpInst *Condition, bool Not) + : NumIn(DTN->getDFSNumIn()), NumOut(DTN->getDFSNumOut()), IsBlock(false), + Not(Not), Condition(Condition) {} +}; + +struct StackEntry { + unsigned NumIn; + unsigned NumOut; + CmpInst *Condition; + bool IsNot; + + StackEntry(unsigned NumIn, unsigned NumOut, CmpInst *Condition, bool IsNot) + : NumIn(NumIn), NumOut(NumOut), Condition(Condition), IsNot(IsNot) {} +}; + +static bool eliminateConstraints(Function &F, DominatorTree &DT) { + bool Changed = false; + DT.updateDFSNumbers(); + ConstraintSystem CS; + + SmallVector WorkList; + + // First, collect conditions implied by branches and blocks with their + // Dominator DFS in and out numbers. + for (BasicBlock &BB : F) { + if (!DT.getNode(&BB)) + continue; + WorkList.emplace_back(DT.getNode(&BB)); + + auto *Br = dyn_cast(BB.getTerminator()); + if (!Br || !Br->isConditional()) + continue; + auto *CmpI = dyn_cast(Br->getCondition()); + if (!CmpI) + continue; + if (Br->getSuccessor(0)->getSinglePredecessor()) + WorkList.emplace_back(DT.getNode(Br->getSuccessor(0)), CmpI, false); + if (Br->getSuccessor(1)->getSinglePredecessor()) + WorkList.emplace_back(DT.getNode(Br->getSuccessor(1)), CmpI, true); + } + + // Next, sort worklist by dominance, so that dominating blocks and conditions + // come before blocks and conditions dominated by them. If a block and a + // condition have the same numbers, the condition comes before the block, as + // it holds on entry to the block. + sort(WorkList.begin(), WorkList.end(), + [](const ConstraintOrBlock &A, const ConstraintOrBlock &B) { + return std::tie(A.NumIn, A.IsBlock) < std::tie(B.NumIn, B.IsBlock); + }); + + // Finally, process ordered worklist and eliminate implied conditions. + SmallVector DFSInStack; + DenseMap Value2Index; + for (ConstraintOrBlock &CB : WorkList) { + // First, pop entries from the stack that are out-of-scope for CB. Remove + // the corresponding entry from the constraint system. + while (!DFSInStack.empty()) { + auto &E = DFSInStack.back(); + LLVM_DEBUG(dbgs() << "Top of stack : " << E.NumIn << " " << E.NumOut + << "\n"); + LLVM_DEBUG(dbgs() << "CB: " << CB.NumIn << " " << CB.NumOut << "\n"); + bool IsDom = CB.NumIn >= E.NumIn && CB.NumOut <= E.NumOut; + if (IsDom) + break; + LLVM_DEBUG(dbgs() << "Removing " << *E.Condition << " " << E.IsNot + << "\n"); + DFSInStack.pop_back(); + CS.popLastConstraint(); + } + + LLVM_DEBUG({ + dbgs() << "Processing "; + if (CB.IsBlock) + dbgs() << *CB.BB; + else + dbgs() << *CB.Condition; + dbgs() << "\n"; + }); + + // For a block, check if any CmpInsts become known based on the current set + // of constraints. 
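+    // A sketch of the intended effect, with hypothetical names: if a
+    // dominating branch established %x u<= %y, the system holds a row
+    // equivalent to x - y <= 0. An `icmp ule %x, %y` in the dominated block
+    // yields the same row, so isConditionImplied returns true and the icmp
+    // folds to true; if instead its negation is implied, it folds to false.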
+ if (CB.IsBlock) { + for (Instruction &I : *CB.BB) { + auto *Cmp = dyn_cast(&I); + if (!Cmp) + continue; + auto R = getConstraint(Cmp, Value2Index, false); + if (R.empty()) + continue; + if (CS.isConditionImplied(R)) { + if (!DebugCounter::shouldExecute(EliminatedCounter)) + continue; + + LLVM_DEBUG(dbgs() << "Condition " << *Cmp + << " implied by dominating constraints\n"); + LLVM_DEBUG({ + for (auto &E : reverse(DFSInStack)) + dbgs() << " C " << *E.Condition << " " << E.IsNot << "\n"; + }); + Cmp->replaceAllUsesWith( + ConstantInt::getTrue(F.getParent()->getContext())); + NumCondsRemoved++; + Changed = true; + } + if (CS.isConditionImplied(ConstraintSystem::negate(R))) { + if (!DebugCounter::shouldExecute(EliminatedCounter)) + continue; + + LLVM_DEBUG(dbgs() << "Condition !" << *Cmp + << " implied by dominating constraints\n"); + LLVM_DEBUG({ + for (auto &E : reverse(DFSInStack)) + dbgs() << " C " << *E.Condition << " " << E.IsNot << "\n"; + }); + Cmp->replaceAllUsesWith( + ConstantInt::getFalse(F.getParent()->getContext())); + NumCondsRemoved++; + Changed = true; + } + } + continue; + } + + // Otherwise, add the condition to the system and stack, if we can transform + // it into a constraint. + auto R = getConstraint(CB.Condition, Value2Index, true); + if (R.empty()) + continue; + + LLVM_DEBUG(dbgs() << "Adding " << *CB.Condition << " " << CB.Not << "\n"); + if (CB.Not) + R = ConstraintSystem::negate(R); + + CS.addVariableRowFill(R); + DFSInStack.emplace_back(CB.NumIn, CB.NumOut, CB.Condition, CB.Not); + } + + return Changed; +} + +namespace { + +class ConstraintElimination : public FunctionPass { +public: + static char ID; + + ConstraintElimination() : FunctionPass(ID) { + initializeConstraintEliminationPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + auto &DT = getAnalysis().getDomTree(); + return eliminateConstraints(F, DT); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + } +}; + +} // end anonymous namespace + +char ConstraintElimination::ID = 0; + +INITIALIZE_PASS_BEGIN(ConstraintElimination, "constraint-elimination", + "Constraint Elimination", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass) +INITIALIZE_PASS_END(ConstraintElimination, "constraint-elimination", + "Constraint Elimination", false, false) + +FunctionPass *llvm::createConstraintEliminationPass() { + return new ConstraintElimination(); +} diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 109e15d6d7cfc..261043743b7de 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -114,9 +114,9 @@ static cl::opt cl::desc("The number of memory instructions to scan for " "dead store elimination (default = 100)")); static cl::opt MemorySSAUpwardsStepLimit( - "dse-memoryssa-walklimit", cl::init(70), cl::Hidden, + "dse-memoryssa-walklimit", cl::init(90), cl::Hidden, cl::desc("The maximum number of steps while walking upwards to find " - "MemoryDefs that may be killed (default = 70)")); + "MemoryDefs that may be killed (default = 90)")); static cl::opt MemorySSAPartialStoreLimit( "dse-memoryssa-partial-store-limit", cl::init(5), cl::Hidden, @@ -229,11 +229,13 @@ static bool hasAnalyzableMemoryWrite(Instruction *I, case Intrinsic::memset: case Intrinsic::memmove: case 
Intrinsic::memcpy: + case Intrinsic::memcpy_inline: case Intrinsic::memcpy_element_unordered_atomic: case Intrinsic::memmove_element_unordered_atomic: case Intrinsic::memset_element_unordered_atomic: case Intrinsic::init_trampoline: case Intrinsic::lifetime_end: + case Intrinsic::masked_store: return true; } } @@ -257,8 +259,8 @@ static bool hasAnalyzableMemoryWrite(Instruction *I, /// Return a Location stored to by the specified instruction. If isRemovable /// returns true, this function and getLocForRead completely describe the memory /// operations for this instruction. -static MemoryLocation getLocForWrite(Instruction *Inst) { - +static MemoryLocation getLocForWrite(Instruction *Inst, + const TargetLibraryInfo &TLI) { if (StoreInst *SI = dyn_cast(Inst)) return MemoryLocation::get(SI); @@ -274,6 +276,8 @@ static MemoryLocation getLocForWrite(Instruction *Inst) { return MemoryLocation(); // Unhandled intrinsic. case Intrinsic::init_trampoline: return MemoryLocation(II->getArgOperand(0)); + case Intrinsic::masked_store: + return MemoryLocation::getForArgument(II, 1, TLI); case Intrinsic::lifetime_end: { uint64_t Len = cast(II->getArgOperand(0))->getZExtValue(); return MemoryLocation(II->getArgOperand(1), Len); @@ -320,11 +324,13 @@ static bool isRemovable(Instruction *I) { case Intrinsic::memset: case Intrinsic::memmove: case Intrinsic::memcpy: + case Intrinsic::memcpy_inline: // Don't remove volatile memory intrinsics. return !cast(II)->isVolatile(); case Intrinsic::memcpy_element_unordered_atomic: case Intrinsic::memmove_element_unordered_atomic: case Intrinsic::memset_element_unordered_atomic: + case Intrinsic::masked_store: return true; } } @@ -370,9 +376,10 @@ static bool isShortenableAtTheBeginning(Instruction *I) { } /// Return the pointer that is being written to. -static Value *getStoredPointerOperand(Instruction *I) { +static Value *getStoredPointerOperand(Instruction *I, + const TargetLibraryInfo &TLI) { //TODO: factor this to reuse getLocForWrite - MemoryLocation Loc = getLocForWrite(I); + MemoryLocation Loc = getLocForWrite(I, TLI); assert(Loc.Ptr && "unable to find pointer written for analyzable instruction?"); // TODO: most APIs don't expect const Value * @@ -404,22 +411,53 @@ enum OverwriteResult { } // end anonymous namespace -/// Return 'OW_Complete' if a store to the 'Later' location completely -/// overwrites a store to the 'Earlier' location. Return OW_MaybePartial -/// if \p Later does not completely overwrite \p Earlier, but they both -/// write to the same underlying object. In that case, use isPartialOverwrite to -/// check if \p Later partially overwrites \p Earlier. Returns 'OW_Unknown' if -/// nothing can be determined. +/// Check if two instruction are masked stores that completely +/// overwrite one another. More specifically, \p Later has to +/// overwrite \p Earlier. +template +static OverwriteResult isMaskedStoreOverwrite(const Instruction *Later, + const Instruction *Earlier, + AATy &AA) { + const auto *IIL = dyn_cast(Later); + const auto *IIE = dyn_cast(Earlier); + if (IIL == nullptr || IIE == nullptr) + return OW_Unknown; + if (IIL->getIntrinsicID() != Intrinsic::masked_store || + IIE->getIntrinsicID() != Intrinsic::masked_store) + return OW_Unknown; + // Pointers. + Value *LP = IIL->getArgOperand(1)->stripPointerCasts(); + Value *EP = IIE->getArgOperand(1)->stripPointerCasts(); + if (LP != EP && !AA.isMustAlias(LP, EP)) + return OW_Unknown; + // Masks. + // TODO: check that Later's mask is a superset of the Earlier's mask. 
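+  // Only the trivially identical-mask case is handled so far. A sketch of
+  // the pattern recognized here (hypothetical IR): the earlier store is dead
+  // because the later one writes the same lanes to the same address:
+  //   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %a, <4 x i32>* %p, i32 4, <4 x i1> %m)
+  //   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %b, <4 x i32>* %p, i32 4, <4 x i1> %m)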
+ if (IIL->getArgOperand(3) != IIE->getArgOperand(3)) + return OW_Unknown; + return OW_Complete; +} + +/// Return 'OW_Complete' if a store to the 'Later' location (by \p LaterI +/// instruction) completely overwrites a store to the 'Earlier' location. +/// (by \p EarlierI instruction). +/// Return OW_MaybePartial if \p Later does not completely overwrite +/// \p Earlier, but they both write to the same underlying object. In that +/// case, use isPartialOverwrite to check if \p Later partially overwrites +/// \p Earlier. Returns 'OW_Unknown' if nothing can be determined. template static OverwriteResult -isOverwrite(const MemoryLocation &Later, const MemoryLocation &Earlier, +isOverwrite(const Instruction *LaterI, const Instruction *EarlierI, + const MemoryLocation &Later, const MemoryLocation &Earlier, const DataLayout &DL, const TargetLibraryInfo &TLI, int64_t &EarlierOff, int64_t &LaterOff, AATy &AA, const Function *F) { // FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll // get imprecise values here, though (except for unknown sizes). - if (!Later.Size.isPrecise() || !Earlier.Size.isPrecise()) - return OW_Unknown; + if (!Later.Size.isPrecise() || !Earlier.Size.isPrecise()) { + // Masked stores have imprecise locations, but we can reason about them + // to some extent. + return isMaskedStoreOverwrite(LaterI, EarlierI, AA); + } const uint64_t LaterSize = Later.Size.getValue(); const uint64_t EarlierSize = Earlier.Size.getValue(); @@ -796,7 +834,7 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA, break; Value *DepPointer = - getUnderlyingObject(getStoredPointerOperand(Dependency)); + getUnderlyingObject(getStoredPointerOperand(Dependency, *TLI)); // Check for aliasing. if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) @@ -902,7 +940,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, if (hasAnalyzableMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) { // See through pointer-to-pointer bitcasts SmallVector Pointers; - getUnderlyingObjects(getStoredPointerOperand(&*BBI), Pointers); + getUnderlyingObjects(getStoredPointerOperand(&*BBI, *TLI), Pointers); // Stores to stack values are valid candidates for removal. bool AllDead = true; @@ -1119,11 +1157,12 @@ static bool tryToShortenBegin(Instruction *EarlierWrite, } static bool removePartiallyOverlappedStores(const DataLayout &DL, - InstOverlapIntervalsTy &IOL) { + InstOverlapIntervalsTy &IOL, + const TargetLibraryInfo &TLI) { bool Changed = false; for (auto OI : IOL) { Instruction *EarlierWrite = OI.first; - MemoryLocation Loc = getLocForWrite(EarlierWrite); + MemoryLocation Loc = getLocForWrite(EarlierWrite, TLI); assert(isRemovable(EarlierWrite) && "Expect only removable instruction"); const Value *Ptr = Loc.Ptr->stripPointerCasts(); @@ -1284,7 +1323,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, continue; // Figure out what location is being stored to. - MemoryLocation Loc = getLocForWrite(Inst); + MemoryLocation Loc = getLocForWrite(Inst, *TLI); // If we didn't get a useful location, fail. if (!Loc.Ptr) @@ -1308,7 +1347,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, Instruction *DepWrite = InstDep.getInst(); if (!hasAnalyzableMemoryWrite(DepWrite, *TLI)) break; - MemoryLocation DepLoc = getLocForWrite(DepWrite); + MemoryLocation DepLoc = getLocForWrite(DepWrite, *TLI); // If we didn't get a useful location, or if it isn't a size, bail out. 
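      // (Masked stores are the exception: their location has an imprecise
      // size, and isOverwrite above now falls back to isMaskedStoreOverwrite
      // for that case instead of giving up.)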
if (!DepLoc.Ptr) break; @@ -1350,8 +1389,9 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, if (isRemovable(DepWrite) && !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) { int64_t InstWriteOffset, DepWriteOffset; - OverwriteResult OR = isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, - InstWriteOffset, *AA, BB.getParent()); + OverwriteResult OR = isOverwrite(Inst, DepWrite, Loc, DepLoc, DL, *TLI, + DepWriteOffset, InstWriteOffset, *AA, + BB.getParent()); if (OR == OW_MaybePartial) OR = isPartialOverwrite(Loc, DepLoc, DepWriteOffset, InstWriteOffset, DepWrite, IOL); @@ -1433,7 +1473,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, } if (EnablePartialOverwriteTracking) - MadeChange |= removePartiallyOverlappedStores(DL, IOL); + MadeChange |= removePartiallyOverlappedStores(DL, IOL, *TLI); // If this block ends in a return, unwind, or unreachable, all allocas are // dead at its end, which means stores to them are also dead. @@ -1676,6 +1716,8 @@ struct DSEState { switch (CB->getIntrinsicID()) { case Intrinsic::init_trampoline: return {MemoryLocation(CB->getArgOperand(0))}; + case Intrinsic::masked_store: + return {MemoryLocation::getForArgument(CB, 1, TLI)}; default: break; } @@ -1685,8 +1727,10 @@ struct DSEState { return MemoryLocation::getOrNone(I); } - /// Returns true if \p Use completely overwrites \p DefLoc. - bool isCompleteOverwrite(MemoryLocation DefLoc, Instruction *UseInst) { + /// Returns true if \p UseInst completely overwrites \p DefLoc + /// (stored by \p DefInst). + bool isCompleteOverwrite(MemoryLocation DefLoc, Instruction *DefInst, + Instruction *UseInst) { // UseInst has a MemoryDef associated in MemorySSA. It's possible for a // MemoryDef to not write to memory, e.g. a volatile load is modeled as a // MemoryDef. @@ -1698,9 +1742,10 @@ struct DSEState { return false; int64_t InstWriteOffset, DepWriteOffset; - auto CC = getLocForWriteEx(UseInst); - return CC && isOverwrite(*CC, DefLoc, DL, TLI, DepWriteOffset, - InstWriteOffset, BatchAA, &F) == OW_Complete; + if (auto CC = getLocForWriteEx(UseInst)) + return isOverwrite(UseInst, DefInst, *CC, DefLoc, DL, TLI, DepWriteOffset, + InstWriteOffset, BatchAA, &F) == OW_Complete; + return false; } /// Returns true if \p Def is not read before returning from the function. @@ -1731,10 +1776,12 @@ struct DSEState { } MemoryAccess *UseAccess = WorkList[I]; - if (isa(UseAccess)) { - PushMemUses(UseAccess); - continue; - } + // Simply adding the users of MemoryPhi to the worklist is not enough, + // because we might miss read clobbers in different iterations of a loop, + // for example. + // TODO: Add support for phi translation to handle the loop case. + if (isa(UseAccess)) + return false; // TODO: Checking for aliasing is expensive. Consider reducing the amount // of times this is called and/or caching it. @@ -1795,6 +1842,11 @@ struct DSEState { // Returns true if \p Use may read from \p DefLoc. bool isReadClobber(MemoryLocation DefLoc, Instruction *UseInst) { + // Monotonic or weaker atomic stores can be re-ordered and do not need to be + // treated as read clobber. + if (auto SI = dyn_cast(UseInst)) + return isStrongerThan(SI->getOrdering(), AtomicOrdering::Monotonic); + if (!UseInst->mayReadFromMemory()) return false; @@ -1809,6 +1861,32 @@ struct DSEState { return isRefSet(BatchAA.getModRefInfo(UseInst, DefLoc)); } + /// Returns true if \p Ptr is guaranteed to be loop invariant for any possible + /// loop. 
In particular, this guarantees that it only references a single + /// MemoryLocation during execution of the containing function. + bool IsGuaranteedLoopInvariant(Value *Ptr) { + auto IsGuaranteedLoopInvariantBase = [this](Value *Ptr) { + Ptr = Ptr->stripPointerCasts(); + if (auto *I = dyn_cast(Ptr)) { + if (isa(Ptr)) + return true; + + if (isAllocLikeFn(I, &TLI)) + return true; + + return false; + } + return true; + }; + + Ptr = Ptr->stripPointerCasts(); + if (auto *GEP = dyn_cast(Ptr)) { + return IsGuaranteedLoopInvariantBase(GEP->getPointerOperand()) && + GEP->hasAllConstantIndices(); + } + return IsGuaranteedLoopInvariantBase(Ptr); + } + // Find a MemoryDef writing to \p DefLoc and dominating \p StartAccess, with // no read access between them or on any other path to a function exit block // if \p DefLoc is not accessible after the function returns. If there is no @@ -1901,6 +1979,18 @@ struct DSEState { return None; } + // Quick check if there are direct uses that are read-clobbers. + if (any_of(Current->uses(), [this, &DefLoc, StartAccess](Use &U) { + if (auto *UseOrDef = dyn_cast(U.getUser())) + return !MSSA.dominates(StartAccess, UseOrDef) && + isReadClobber(DefLoc, UseOrDef->getMemoryInst()); + return false; + })) { + Cache.KnownReads.insert(Current); + LLVM_DEBUG(dbgs() << " ... found a read clobber\n"); + return None; + } + // If Current cannot be analyzed or is not removable, check the next // candidate. if (!hasAnalyzableMemoryWrite(CurrentI, TLI) || !isRemovable(CurrentI)) { @@ -1928,9 +2018,20 @@ struct DSEState { } continue; } else { + // AliasAnalysis does not account for loops. Limit elimination to + // candidates for which we can guarantee they always store to the same + // memory location and not multiple locations in a loop. + if (Current->getBlock() != KillingDef->getBlock() && + !IsGuaranteedLoopInvariant(const_cast(CurrentLoc->Ptr))) { + StepAgain = true; + Current = CurrentDef->getDefiningAccess(); + WalkerStepLimit -= 1; + continue; + } + int64_t InstWriteOffset, DepWriteOffset; - auto OR = isOverwrite(DefLoc, *CurrentLoc, DL, TLI, DepWriteOffset, - InstWriteOffset, BatchAA, &F); + auto OR = isOverwrite(KillingI, CurrentI, DefLoc, *CurrentLoc, DL, TLI, + DepWriteOffset, InstWriteOffset, BatchAA, &F); // If Current does not write to the same object as KillingDef, check // the next candidate. if (OR == OW_Unknown) { @@ -2074,7 +2175,7 @@ struct DSEState { // 3 = Def(1) ; <---- Current (3, 2) = NoAlias, (3,1) = MayAlias, // stores [0,1] if (MemoryDef *UseDef = dyn_cast(UseAccess)) { - if (isCompleteOverwrite(DefLoc, UseInst)) { + if (isCompleteOverwrite(DefLoc, KillingI, UseInst)) { if (!isInvisibleToCallerAfterRet(DefUO) && UseAccess != EarlierAccess) { BasicBlock *MaybeKillingBlock = UseInst->getParent(); @@ -2431,7 +2532,7 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA, // Check if NI overwrites SI. 
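        // (SI and NI are now passed down so isOverwrite can also reason about
        // masked stores, whose MemoryLocations have imprecise sizes.)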
int64_t InstWriteOffset, DepWriteOffset; OverwriteResult OR = - isOverwrite(SILoc, NILoc, State.DL, TLI, DepWriteOffset, + isOverwrite(SI, NI, SILoc, NILoc, State.DL, TLI, DepWriteOffset, InstWriteOffset, State.BatchAA, &F); if (OR == OW_MaybePartial) { auto Iter = State.IOLs.insert( @@ -2482,7 +2583,7 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA, if (EnablePartialOverwriteTracking) for (auto &KV : State.IOLs) - MadeChange |= removePartiallyOverlappedStores(State.DL, KV.second); + MadeChange |= removePartiallyOverlappedStores(State.DL, KV.second, TLI); MadeChange |= State.eliminateDeadWritesAtEndOfFunction(); return MadeChange; diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 51da10fc48790..86dd4d54d558d 100644 --- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -196,6 +196,11 @@ static bool matchSelectWithOptionalNotCond(Value *V, Value *&Cond, Value *&A, case CmpInst::ICMP_ULT: Flavor = SPF_UMIN; break; case CmpInst::ICMP_SGT: Flavor = SPF_SMAX; break; case CmpInst::ICMP_SLT: Flavor = SPF_SMIN; break; + // Non-strict inequalities. + case CmpInst::ICMP_ULE: Flavor = SPF_UMIN; break; + case CmpInst::ICMP_UGE: Flavor = SPF_UMAX; break; + case CmpInst::ICMP_SLE: Flavor = SPF_SMIN; break; + case CmpInst::ICMP_SGE: Flavor = SPF_SMAX; break; default: break; } @@ -1463,6 +1468,7 @@ class EarlyCSELegacyCommonPass : public FunctionPass { AU.addRequired(); AU.addRequired(); if (UseMemorySSA) { + AU.addRequired(); AU.addRequired(); AU.addPreserved(); } @@ -1504,6 +1510,7 @@ INITIALIZE_PASS_BEGIN(EarlyCSEMemSSALegacyPass, "early-cse-memssa", "Early CSE w/ MemorySSA", false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index c71038d66f995..f8e8e2c773f9f 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -410,9 +410,12 @@ uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) { } if (local_dep.isDef()) { - CallInst* local_cdep = cast(local_dep.getInst()); + // For masked load/store intrinsics, the local_dep may actully be + // a normal load or store instruction. + CallInst *local_cdep = dyn_cast(local_dep.getInst()); - if (local_cdep->getNumArgOperands() != C->getNumArgOperands()) { + if (!local_cdep || + local_cdep->getNumArgOperands() != C->getNumArgOperands()) { valueNumbering[C] = nextValueNumber; return nextValueNumber++; } @@ -1609,6 +1612,11 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true ReplaceOperandsWithMap[V] = True; + // Similarly, after assume(!NotV) we know that NotV == false. + Value *NotV; + if (match(V, m_Not(m_Value(NotV)))) + ReplaceOperandsWithMap[NotV] = ConstantInt::getFalse(V->getContext()); + // If we find an equality fact, canonicalize all dominated uses in this block // to one of the two values. We heuristically choice the "oldest" of the // two where age is determined by value number. 
(Note that propagateEquality @@ -2850,7 +2858,6 @@ class llvm::gvn::GVNLegacyPass : public FunctionPass { if (Impl.isMemDepEnabled()) AU.addRequired(); AU.addRequired(); - AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 51d12faf712ad..f5a74b86ae9d1 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1824,7 +1824,7 @@ static bool mustExecuteUBIfPoisonOnPathTo(Instruction *Root, // If we can't analyze propagation through this instruction, just skip it // and transitive users. Safe as false is a conservative result. - if (!propagatesPoison(I) && I != Root) + if (!propagatesPoison(cast(I)) && I != Root) continue; if (KnownPoison.insert(I).second) @@ -2329,36 +2329,6 @@ bool IndVarSimplify::sinkUnusedInvariants(Loop *L) { return MadeAnyChanges; } -/// Return a symbolic upper bound for the backedge taken count of the loop. -/// This is more general than getConstantMaxBackedgeTakenCount as it returns -/// an arbitrary expression as opposed to only constants. -/// TODO: Move into the ScalarEvolution class. -static const SCEV* getMaxBackedgeTakenCount(ScalarEvolution &SE, - DominatorTree &DT, Loop *L) { - SmallVector ExitingBlocks; - L->getExitingBlocks(ExitingBlocks); - - // Form an expression for the maximum exit count possible for this loop. We - // merge the max and exact information to approximate a version of - // getConstantMaxBackedgeTakenCount which isn't restricted to just constants. - SmallVector ExitCounts; - for (BasicBlock *ExitingBB : ExitingBlocks) { - const SCEV *ExitCount = SE.getExitCount(L, ExitingBB); - if (isa(ExitCount)) - ExitCount = SE.getExitCount(L, ExitingBB, - ScalarEvolution::ConstantMaximum); - if (!isa(ExitCount)) { - assert(DT.dominates(ExitingBB, L->getLoopLatch()) && - "We should only have known counts for exiting blocks that " - "dominate latch!"); - ExitCounts.push_back(ExitCount); - } - } - if (ExitCounts.empty()) - return SE.getCouldNotCompute(); - return SE.getUMinFromMismatchedTypes(ExitCounts); -} - bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { SmallVector ExitingBlocks; L->getExitingBlocks(ExitingBlocks); @@ -2391,7 +2361,7 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { return false; // Get a symbolic upper bound on the loop backedge taken count. - const SCEV *MaxExitCount = getMaxBackedgeTakenCount(*SE, *DT, L); + const SCEV *MaxExitCount = SE->computeMaxBackedgeTakenCount(L); if (isa(MaxExitCount)) return false; diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index db9cc58bbfc40..0ed6b593a91c7 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -997,6 +997,12 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces( SmallVector UndefUsesToFix; for (Value* V : Postorder) { unsigned NewAddrSpace = InferredAddrSpace.lookup(V); + + // In some degenerate cases (e.g. invalid IR in unreachable code), we may + // not even infer the value to have its original address space. 
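+      // (The check below skips such values instead of cloning them into a
+      // bogus address space.)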
+ if (NewAddrSpace == UninitializedAddressSpace) + continue; + if (V->getType()->getPointerAddressSpace() != NewAddrSpace) { Value *New = cloneValueWithNewAddressSpace( V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix); diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 311ca11de84e7..8b1ad336c8a59 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -104,6 +104,11 @@ static cl::opt PrintLVIAfterJumpThreading( cl::desc("Print the LazyValueInfo cache after JumpThreading"), cl::init(false), cl::Hidden); +static cl::opt JumpThreadingFreezeSelectCond( + "jump-threading-freeze-select-cond", + cl::desc("Freeze the condition when unfolding select"), cl::init(false), + cl::Hidden); + static cl::opt ThreadAcrossLoopHeaders( "jump-threading-across-loop-headers", cl::desc("Allow JumpThreading to thread across loop headers, for testing"), @@ -133,7 +138,8 @@ namespace { public: static char ID; // Pass identification - JumpThreading(int T = -1) : FunctionPass(ID), Impl(T) { + JumpThreading(bool InsertFreezeWhenUnfoldingSelect = false, int T = -1) + : FunctionPass(ID), Impl(InsertFreezeWhenUnfoldingSelect, T) { initializeJumpThreadingPass(*PassRegistry::getPassRegistry()); } @@ -166,11 +172,12 @@ INITIALIZE_PASS_END(JumpThreading, "jump-threading", "Jump Threading", false, false) // Public interface to the Jump Threading pass -FunctionPass *llvm::createJumpThreadingPass(int Threshold) { - return new JumpThreading(Threshold); +FunctionPass *llvm::createJumpThreadingPass(bool InsertFr, int Threshold) { + return new JumpThreading(InsertFr, Threshold); } -JumpThreadingPass::JumpThreadingPass(int T) { +JumpThreadingPass::JumpThreadingPass(bool InsertFr, int T) { + InsertFreezeWhenUnfoldingSelect = JumpThreadingFreezeSelectCond | InsertFr; DefaultBBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T); } @@ -1040,6 +1047,9 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { return false; // Must be an invoke or callbr. } + // Keep track if we constant folded the condition in this invocation. + bool ConstantFolded = false; + // Run constant folding to see if we can reduce the condition to a simple // constant. if (Instruction *I = dyn_cast(Condition)) { @@ -1050,6 +1060,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { if (isInstructionTriviallyDead(I, TLI)) I->eraseFromParent(); Condition = SimpleVal; + ConstantFolded = true; } } @@ -1100,7 +1111,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { // FIXME: Unify this with code below. if (ProcessThreadableEdges(Condition, BB, Preference, Terminator)) return true; - return false; + return ConstantFolded; } if (CmpInst *CondCmp = dyn_cast(CondInst)) { @@ -2798,13 +2809,8 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { /// select is not jump-threaded, it will be folded again in the later /// optimizations. bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) { - // This transform can introduce a UB (a conditional branch that depends on a - // poison value) that was not present in the original program. See - // @TryToUnfoldSelectInCurrBB test in test/Transforms/JumpThreading/select.ll. + // This transform would reduce the quality of msan diagnostics. // Disable this transform under MemorySanitizer. - // FIXME: either delete it or replace with a valid transform. 
This issue is - // not limited to MemorySanitizer (but has only been observed as an MSan false - // positive in practice so far). if (BB->getParent()->hasFnAttribute(Attribute::SanitizeMemory)) return false; @@ -2852,8 +2858,11 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) { if (!SI) continue; // Expand the select. - Instruction *Term = - SplitBlockAndInsertIfThen(SI->getCondition(), SI, false); + Value *Cond = SI->getCondition(); + if (InsertFreezeWhenUnfoldingSelect && + !isGuaranteedNotToBeUndefOrPoison(Cond, SI, &DTU->getDomTree())) + Cond = new FreezeInst(Cond, "cond.fr", SI); + Instruction *Term = SplitBlockAndInsertIfThen(Cond, SI, false); BasicBlock *SplitBB = SI->getParent(); BasicBlock *NewBB = Term->getParent(); PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI); diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 4bf39ba8f151c..a8fe8280a9ce6 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -35,10 +35,12 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/GuardUtils.h" +#include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" @@ -98,6 +100,11 @@ static cl::opt ControlFlowHoisting( "licm-control-flow-hoisting", cl::Hidden, cl::init(false), cl::desc("Enable control flow (and PHI) hoisting in LICM")); +static cl::opt HoistSinkColdnessThreshold( + "licm-coldness-threshold", cl::Hidden, cl::init(4), + cl::desc("Relative coldness Threshold of hoisting/sinking destination " + "block for LICM to be considered beneficial")); + static cl::opt MaxNumUsesTraversed( "licm-max-num-uses-traversed", cl::Hidden, cl::init(8), cl::desc("Max num uses visited for identifying load " @@ -143,8 +150,9 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, MemorySSAUpdater *MSSAU, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE); static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, - const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE); + BlockFrequencyInfo *BFI, const Loop *CurLoop, + ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, + OptimizationRemarkEmitter *ORE); static bool isSafeToExecuteUnconditionally(Instruction &Inst, const DominatorTree *DT, const Loop *CurLoop, @@ -171,8 +179,8 @@ static void moveInstructionBefore(Instruction &I, Instruction &Dest, namespace { struct LoopInvariantCodeMotion { bool runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT, - TargetLibraryInfo *TLI, TargetTransformInfo *TTI, - ScalarEvolution *SE, MemorySSA *MSSA, + BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, + TargetTransformInfo *TTI, ScalarEvolution *SE, MemorySSA *MSSA, OptimizationRemarkEmitter *ORE); LoopInvariantCodeMotion(unsigned LicmMssaOptCap, @@ -208,19 +216,23 @@ struct LegacyLICMPass : public LoopPass { MemorySSA *MSSA = EnableMSSALoopDependency ? (&getAnalysis().getMSSA()) : nullptr; + bool hasProfileData = L->getHeader()->getParent()->hasProfileData(); + BlockFrequencyInfo *BFI = + hasProfileData ? 
&getAnalysis().getBFI() + : nullptr; // For the old PM, we can't use OptimizationRemarkEmitter as an analysis - // pass. Function analyses need to be preserved across loop transformations + // pass. Function analyses need to be preserved across loop transformations // but ORE cannot be preserved (see comment before the pass definition). OptimizationRemarkEmitter ORE(L->getHeader()->getParent()); - return LICM.runOnLoop(L, - &getAnalysis().getAAResults(), - &getAnalysis().getLoopInfo(), - &getAnalysis().getDomTree(), - &getAnalysis().getTLI( - *L->getHeader()->getParent()), - &getAnalysis().getTTI( - *L->getHeader()->getParent()), - SE ? &SE->getSE() : nullptr, MSSA, &ORE); + return LICM.runOnLoop( + L, &getAnalysis().getAAResults(), + &getAnalysis().getLoopInfo(), + &getAnalysis().getDomTree(), BFI, + &getAnalysis().getTLI( + *L->getHeader()->getParent()), + &getAnalysis().getTTI( + *L->getHeader()->getParent()), + SE ? &SE->getSE() : nullptr, MSSA, &ORE); } /// This transformation requires natural loop information & requires that @@ -236,6 +248,9 @@ struct LegacyLICMPass : public LoopPass { } AU.addRequired(); getLoopAnalysisUsage(AU); + LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); + AU.addPreserved(); + AU.addPreserved(); } private: @@ -251,8 +266,8 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, OptimizationRemarkEmitter ORE(L.getHeader()->getParent()); LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap); - if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.TTI, &AR.SE, - AR.MSSA, &ORE)) + if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, AR.BFI, &AR.TLI, &AR.TTI, + &AR.SE, AR.MSSA, &ORE)) return PreservedAnalyses::all(); auto PA = getLoopPassPreservedAnalyses(); @@ -272,6 +287,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LazyBFIPass) INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false, false) @@ -286,8 +302,8 @@ Pass *llvm::createLICMPass(unsigned LicmMssaOptCap, /// times on one loop. bool LoopInvariantCodeMotion::runOnLoop( Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT, - TargetLibraryInfo *TLI, TargetTransformInfo *TTI, ScalarEvolution *SE, - MemorySSA *MSSA, OptimizationRemarkEmitter *ORE) { + BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, + ScalarEvolution *SE, MemorySSA *MSSA, OptimizationRemarkEmitter *ORE) { bool Changed = false; assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); @@ -347,12 +363,13 @@ bool LoopInvariantCodeMotion::runOnLoop( LicmMssaOptCap, LicmMssaNoAccForPromotionCap, /*IsSink=*/true}; if (L->hasDedicatedExits()) - Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L, - CurAST.get(), MSSAU.get(), &SafetyInfo, Flags, ORE); + Changed |= + sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, TTI, L, + CurAST.get(), MSSAU.get(), &SafetyInfo, Flags, ORE); Flags.IsSink = false; if (Preheader) Changed |= - hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, + hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, L, CurAST.get(), MSSAU.get(), SE, &SafetyInfo, Flags, ORE); // Now that all loop invariants have been removed from the loop, promote any @@ -449,10 +466,10 @@ bool LoopInvariantCodeMotion::runOnLoop( /// definitions, allowing us to sink a loop body in one pass without iteration. 
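/// When profile data is available, the BlockFrequencyInfo parameter added
/// here lets sinking skip destination blocks that are substantially hotter
/// than the source block (see worthSinkOrHoistInst below).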
/// bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, - DominatorTree *DT, TargetLibraryInfo *TLI, - TargetTransformInfo *TTI, Loop *CurLoop, - AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU, - ICFLoopSafetyInfo *SafetyInfo, + DominatorTree *DT, BlockFrequencyInfo *BFI, + TargetLibraryInfo *TLI, TargetTransformInfo *TTI, + Loop *CurLoop, AliasSetTracker *CurAST, + MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE) { @@ -501,7 +518,7 @@ bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI, FreeInLoop) && canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags, ORE)) { - if (sink(I, LI, DT, CurLoop, SafetyInfo, MSSAU, ORE)) { + if (sink(I, LI, DT, BFI, CurLoop, SafetyInfo, MSSAU, ORE)) { if (!FreeInLoop) { ++II; salvageDebugInfo(I); @@ -746,13 +763,43 @@ class ControlFlowHoister { }; } // namespace +// Hoisting/sinking instruction out of a loop isn't always beneficial. It's only +// only worthwhile if the destination block is actually colder than current +// block. +static bool worthSinkOrHoistInst(Instruction &I, BasicBlock *DstBlock, + OptimizationRemarkEmitter *ORE, + BlockFrequencyInfo *BFI) { + // Check block frequency only when runtime profile is available + // to avoid pathological cases. With static profile, lean towards + // hosting because it helps canonicalize the loop for vectorizer. + if (!DstBlock->getParent()->hasProfileData()) + return true; + + if (!HoistSinkColdnessThreshold || !BFI) + return true; + + BasicBlock *SrcBlock = I.getParent(); + if (BFI->getBlockFreq(DstBlock).getFrequency() / HoistSinkColdnessThreshold > + BFI->getBlockFreq(SrcBlock).getFrequency()) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "SinkHoistInst", &I) + << "failed to sink or hoist instruction because containing block " + "has lower frequency than destination block"; + }); + return false; + } + + return true; +} + /// Walk the specified region of the CFG (defined by all blocks dominated by /// the specified block, and that are in the current loop) in depth first /// order w.r.t the DominatorTree. This allows us to visit definitions before /// uses, allowing us to hoist a loop body in one pass without iteration. /// bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, - DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, + DominatorTree *DT, BlockFrequencyInfo *BFI, + TargetLibraryInfo *TLI, Loop *CurLoop, AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU, ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, @@ -803,13 +850,15 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, // Try hoisting the instruction out to the preheader. We can only do // this if all of the operands of the instruction are loop invariant and - // if it is safe to hoist the instruction. + // if it is safe to hoist the instruction. We also check block frequency + // to make sure instruction only gets hoisted into colder blocks. // TODO: It may be safe to hoist if we are hoisting to a conditional block // and we have accurately duplicated the control flow from the loop header // to that block. 
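+      // Worked example of the coldness check (numbers hypothetical): with
+      // the default licm-coldness-threshold of 4, an instruction sitting in
+      // a rarely taken conditional block with frequency 50 is not hoisted
+      // into a preheader with frequency 400, since 400 / 4 = 100 > 50;
+      // hoisting would make it execute more often, not less.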
if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags, ORE) && + worthSinkOrHoistInst(I, CurLoop->getLoopPreheader(), ORE, BFI) && isSafeToExecuteUnconditionally( I, DT, CurLoop, SafetyInfo, ORE, CurLoop->getLoopPreheader()->getTerminator())) { @@ -940,7 +989,19 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT, Loop *CurLoop) { Value *Addr = LI->getOperand(0); const DataLayout &DL = LI->getModule()->getDataLayout(); - const uint32_t LocSizeInBits = DL.getTypeSizeInBits(LI->getType()); + const TypeSize LocSizeInBits = DL.getTypeSizeInBits(LI->getType()); + + // It is not currently possible for clang to generate an invariant.start + // intrinsic with scalable vector types because we don't support thread local + // sizeless types and we don't permit sizeless types in structs or classes. + // Furthermore, even if support is added for this in future the intrinsic + // itself is defined to have a size of -1 for variable sized objects. This + // makes it impossible to verify if the intrinsic envelops our region of + // interest. For example, both and + // types would have a -1 parameter, but the former is clearly double the size + // of the latter. + if (LocSizeInBits.isScalable()) + return false; // if the type is i8 addrspace(x)*, we know this is the type of // llvm.invariant.start operand @@ -970,13 +1031,17 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT, if (!II || II->getIntrinsicID() != Intrinsic::invariant_start || !II->use_empty()) continue; - unsigned InvariantSizeInBits = - cast(II->getArgOperand(0))->getSExtValue() * 8; + ConstantInt *InvariantSize = cast(II->getArgOperand(0)); + // The intrinsic supports having a -1 argument for variable sized objects + // so we should check for that here. + if (InvariantSize->isNegative()) + continue; + uint64_t InvariantSizeInBits = InvariantSize->getSExtValue() * 8; // Confirm the invariant.start location size contains the load operand size // in bits. Also, the invariant.start should dominate the load, and we // should not hoist the load out of a loop that contains this dominating // invariant.start. - if (LocSizeInBits <= InvariantSizeInBits && + if (LocSizeInBits.getFixedSize() <= InvariantSizeInBits && DT->properlyDominates(II->getParent(), CurLoop->getHeader())) return true; } @@ -1529,8 +1594,9 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT, /// position, and may either delete it or move it to outside of the loop. /// static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, - const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE) { + BlockFrequencyInfo *BFI, const Loop *CurLoop, + ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, + OptimizationRemarkEmitter *ORE) { LLVM_DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); ORE->emit([&]() { return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I) @@ -1606,7 +1672,10 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, // If this instruction is only used outside of the loop, then all users are // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of // the instruction. + // First check if I is worth sinking for all uses. Sink only when it is worth + // across all uses. 
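+  // (Since the loop is in LCSSA form, every out-of-loop user is a PHI in an
+  // exit block; the first pass below bails out if any such exit block is too
+  // hot, and only then are the PHIs collected in ExitPNs actually rewritten.)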
SmallSetVector Users(I.user_begin(), I.user_end()); + SmallVector ExitPNs; for (auto *UI : Users) { auto *User = cast(UI); @@ -1616,6 +1685,15 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, PHINode *PN = cast(User); assert(ExitBlockSet.count(PN->getParent()) && "The LCSSA PHI is not in an exit block!"); + if (!worthSinkOrHoistInst(I, PN->getParent(), ORE, BFI)) { + return Changed; + } + + ExitPNs.push_back(PN); + } + + for (auto *PN : ExitPNs) { + // The PHI must be trivially replaceable. Instruction *New = sinkThroughTriviallyReplaceablePHI( PN, &I, LI, SunkCopies, SafetyInfo, CurLoop, MSSAU); diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 7867a5468891b..04b7254e4cdba 100644 --- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -1058,7 +1058,8 @@ PreservedAnalyses LoopDistributePass::run(Function &F, auto &LAM = AM.getResult(F).getManager(); std::function GetLAA = [&](Loop &L) -> const LoopAccessInfo & { - LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr}; + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, + TLI, TTI, nullptr, nullptr}; return LAM.getResult(L, AR); }; diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 011d6f487742d..147ccc939ac9f 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -468,8 +468,11 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) { Value *StorePtr = SI->getPointerOperand(); // Reject stores that are so large that they overflow an unsigned. - uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType()); - if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) + // When storing out scalable vectors we bail out for now, since the code + // below currently only works for constant strides. + TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType()); + if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) || + (SizeInBits.getFixedSize() >> 32) != 0) return LegalStoreKind::None; // See if the pointer expression is an AddRec like {base,+,1} on the current diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index 3b70695640414..ce010c9bacacf 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -486,7 +486,6 @@ class LoadEliminationForLoop { // Filter the candidates further. SmallVector Candidates; - unsigned NumForwarding = 0; for (const StoreToLoadForwardingCandidate &Cand : StoreToLoadDependences) { LLVM_DEBUG(dbgs() << "Candidate " << Cand); @@ -506,12 +505,17 @@ class LoadEliminationForLoop { if (!Cand.isDependenceDistanceOfOne(PSE, L)) continue; - ++NumForwarding; + assert(isa(PSE.getSCEV(Cand.Load->getPointerOperand())) && + "Loading from something other than indvar?"); + assert( + isa(PSE.getSCEV(Cand.Store->getPointerOperand())) && + "Storing to something other than indvar?"); + + Candidates.push_back(Cand); LLVM_DEBUG( dbgs() - << NumForwarding + << Candidates.size() << ". 
Valid store-to-load forwarding across the loop backedge\n"); - Candidates.push_back(Cand); } if (Candidates.empty()) return false; @@ -563,6 +567,17 @@ class LoadEliminationForLoop { LV.setAliasChecks(std::move(Checks)); LV.setSCEVChecks(LAI.getPSE().getUnionPredicate()); LV.versionLoop(); + + // After versioning, some of the candidates' pointers could stop being + // SCEVAddRecs. We need to filter them out. + auto NoLongerGoodCandidate = [this]( + const StoreToLoadForwardingCandidate &Cand) { + return !isa( + PSE.getSCEV(Cand.Load->getPointerOperand())) || + !isa( + PSE.getSCEV(Cand.Store->getPointerOperand())); + }; + llvm::erase_if(Candidates, NoLongerGoodCandidate); } // Next, propagate the value stored by the store to the users of the load. @@ -571,7 +586,7 @@ class LoadEliminationForLoop { "storeforward"); for (const auto &Cand : Candidates) propagateStoredValueToLoadUsers(Cand, SEE); - NumLoopLoadEliminted += NumForwarding; + NumLoopLoadEliminted += Candidates.size(); return true; } @@ -705,7 +720,8 @@ PreservedAnalyses LoopLoadEliminationPass::run(Function &F, auto &LAM = AM.getResult(F).getManager(); bool Changed = eliminateLoadsAcrossLoops( F, LI, DT, BFI, PSI, [&](Loop &L) -> const LoopAccessInfo & { - LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, + TLI, TTI, nullptr, MSSA}; return LAM.getResult(L, AR); }); diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index c3e46c1fadef3..47329fa1f043e 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -3834,10 +3834,14 @@ void LSRInstance::GenerateConstantOffsetsImpl( F.BaseOffset = (uint64_t)F.BaseOffset + Imm; if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) return; - if (IsScaledReg) + if (IsScaledReg) { F.ScaledReg = G; - else + } else { F.BaseRegs[Idx] = G; + // We may generate non canonical Formula if G is a recurrent expr reg + // related with current loop while F.ScaledReg is not. + F.canonicalize(*L); + } (void)InsertFormula(LU, LUIdx, F); } diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index bd62419323065..495906e1a7630 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -288,6 +288,13 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, None, None, None, None, None); TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(L, SE, TTI, None, None); + + TransformationMode EnableMode = hasUnrollAndJamTransformation(L); + if (EnableMode & TM_Disable) + return LoopUnrollResult::Unmodified; + if (EnableMode & TM_ForcedByUser) + UP.UnrollAndJam = true; + if (AllowUnrollAndJam.getNumOccurrences() > 0) UP.UnrollAndJam = AllowUnrollAndJam; if (UnrollAndJamThreshold.getNumOccurrences() > 0) @@ -300,10 +307,6 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, << L->getHeader()->getParent()->getName() << "] Loop %" << L->getHeader()->getName() << "\n"); - TransformationMode EnableMode = hasUnrollAndJamTransformation(L); - if (EnableMode & TM_Disable) - return LoopUnrollResult::Unmodified; - // A loop with any unroll pragma (enabling/disabling/count/etc) is left for // the unroller, so long as it does not explicitly have unroll_and_jam // metadata. 
This means #pragma nounroll will disable unroll and jam as well diff --git a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index d83b7b05f88b5..00b242c16f384 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -32,6 +32,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" @@ -217,6 +218,10 @@ namespace { /// loop preheaders be inserted into the CFG. /// void getAnalysisUsage(AnalysisUsage &AU) const override { + // Lazy BFI and BPI are marked as preserved here so Loop Unswitching + // can remain part of the same loop pass as LICM + AU.addPreserved(); + AU.addPreserved(); AU.addRequired(); AU.addRequired(); if (EnableMSSALoopDependency) { diff --git a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index 0fe7dd9cfb39f..33f73f6e163af 100644 --- a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -24,7 +24,6 @@ #include "llvm/IR/Metadata.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/MisExpect.h" @@ -48,10 +47,10 @@ STATISTIC(ExpectIntrinsicsHandled, // 'select' instructions. It may be worthwhile to hoist these values to some // shared space, so they can be used directly by other passes. -static cl::opt LikelyBranchWeight( +cl::opt llvm::LikelyBranchWeight( "likely-branch-weight", cl::Hidden, cl::init(2000), cl::desc("Weight of the branch likely to be taken (default = 2000)")); -static cl::opt UnlikelyBranchWeight( +cl::opt llvm::UnlikelyBranchWeight( "unlikely-branch-weight", cl::Hidden, cl::init(1), cl::desc("Weight of the branch unlikely to be taken (default = 1)")); diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index 2afc778ed8214..33ab2907906e0 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -1350,6 +1350,25 @@ void SCCPSolver::handleCallResult(CallBase &CB) { return (void)mergeInValue(IV, &CB, CopyOfVal); } + + if (ConstantRange::isIntrinsicSupported(II->getIntrinsicID())) { + // Compute result range for intrinsics supported by ConstantRange. + // Do this even if we don't know a range for all operands, as we may + // still know something about the result range, e.g. of abs(x). 
+ SmallVector OpRanges; + for (Value *Op : II->args()) { + const ValueLatticeElement &State = getValueState(Op); + if (State.isConstantRange()) + OpRanges.push_back(State.getConstantRange()); + else + OpRanges.push_back( + ConstantRange::getFull(Op->getType()->getScalarSizeInBits())); + } + + ConstantRange Result = + ConstantRange::intrinsic(II->getIntrinsicID(), OpRanges); + return (void)mergeInValue(II, ValueLatticeElement::getRange(Result)); + } } // The common case is that we aren't tracking the callee, either because we diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp index f4dc6f2996b98..8a740295b19c4 100644 --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -38,6 +38,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeAlignmentFromAssumptionsPass(Registry); initializeCallSiteSplittingLegacyPassPass(Registry); initializeConstantHoistingLegacyPassPass(Registry); + initializeConstraintEliminationPass(Registry); initializeCorrelatedValuePropagationPass(Registry); initializeDCELegacyPassPass(Registry); initializeDeadInstEliminationPass(Registry); diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 3bc0cbde8c19d..c7fe21f2a3dac 100644 --- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -398,7 +398,8 @@ void ScalarizerVisitor::gather(Instruction *Op, const ValueVector &CV) { continue; Instruction *Old = cast(V); - CV[I]->takeName(Old); + if (isa(CV[I])) + CV[I]->takeName(Old); Old->replaceAllUsesWith(CV[I]); PotentiallyDeadInstrs.emplace_back(Old); } diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index db5211df397a8..b0435bf6e4eac 100644 --- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -63,8 +63,8 @@ static cl::opt UserForwardSwitchCond( cl::desc("Forward switch condition to phi ops (default = false)")); static cl::opt UserHoistCommonInsts( - "hoist-common-insts", cl::Hidden, cl::init(true), - cl::desc("hoist common instructions (default = true)")); + "hoist-common-insts", cl::Hidden, cl::init(false), + cl::desc("hoist common instructions (default = false)")); static cl::opt UserSinkCommonInsts( "sink-common-insts", cl::Hidden, cl::init(false), diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index c20e57b02c1a5..688900a1c20f8 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -343,7 +343,7 @@ char StructurizeCFG::ID = 0; INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG", false, false) INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) -INITIALIZE_PASS_DEPENDENCY(LowerSwitch) +INITIALIZE_PASS_DEPENDENCY(LowerSwitchLegacyPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(RegionInfoPass) INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG", diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index d4d2957efab4c..09ed68a5f6782 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -262,6 +262,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { return Changed; case LibFunc_setbuf: case LibFunc_setvbuf: + Changed |= 
setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); return Changed; @@ -274,6 +275,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { return Changed; case LibFunc_stat: case LibFunc_statvfs: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); Changed |= setDoesNotCapture(F, 1); @@ -304,6 +306,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setOnlyReadsMemory(F, 2); return Changed; case LibFunc_setitimer: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 1); Changed |= setDoesNotCapture(F, 2); @@ -311,6 +314,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { return Changed; case LibFunc_system: // May throw; "system" is a valid pthread cancellation point. + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotCapture(F, 0); Changed |= setOnlyReadsMemory(F, 0); return Changed; @@ -369,11 +373,13 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setRetDoesNotAlias(F); return Changed; case LibFunc_mkdir: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); Changed |= setOnlyReadsMemory(F, 0); return Changed; case LibFunc_mktime: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); return Changed; @@ -395,11 +401,13 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { case LibFunc_rmdir: case LibFunc_remove: case LibFunc_realpath: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); Changed |= setOnlyReadsMemory(F, 0); return Changed; case LibFunc_rename: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); Changed |= setDoesNotCapture(F, 1); @@ -407,6 +415,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setOnlyReadsMemory(F, 1); return Changed; case LibFunc_readlink: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); Changed |= setDoesNotCapture(F, 1); @@ -445,6 +454,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { return Changed; case LibFunc_chmod: case LibFunc_chown: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); Changed |= setOnlyReadsMemory(F, 0); @@ -452,6 +462,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { case LibFunc_ctermid: case LibFunc_clearerr: case LibFunc_closedir: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); return Changed; @@ -464,6 +475,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 0); return Changed; case LibFunc_access: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); Changed |= setOnlyReadsMemory(F, 0); @@ -583,6 +595,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 0); return Changed; case LibFunc_getlogin_r: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); return Changed; @@ -592,6 +605,7 @@ bool 
llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 0); return Changed; case LibFunc_getenv: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setOnlyReadsMemory(F); Changed |= setDoesNotCapture(F, 0); @@ -603,10 +617,12 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotThrow(F); return Changed; case LibFunc_getitimer: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 1); return Changed; case LibFunc_getpwnam: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); Changed |= setOnlyReadsMemory(F, 0); @@ -617,21 +633,25 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 1); return Changed; case LibFunc_uname: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); return Changed; case LibFunc_unlink: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); Changed |= setOnlyReadsMemory(F, 0); return Changed; case LibFunc_unsetenv: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); Changed |= setOnlyReadsMemory(F, 0); return Changed; case LibFunc_utime: case LibFunc_utimes: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); Changed |= setDoesNotCapture(F, 1); @@ -669,6 +689,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotThrow(F); return Changed; case LibFunc_popen: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setRetDoesNotAlias(F); Changed |= setDoesNotCapture(F, 0); @@ -677,6 +698,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setOnlyReadsMemory(F, 1); return Changed; case LibFunc_pclose: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); return Changed; @@ -733,16 +755,19 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setOnlyReadsMemory(F, 0); return Changed; case LibFunc_opendir: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setRetDoesNotAlias(F); Changed |= setDoesNotCapture(F, 0); Changed |= setOnlyReadsMemory(F, 0); return Changed; case LibFunc_tmpfile: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setRetDoesNotAlias(F); return Changed; case LibFunc_times: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); return Changed; @@ -754,18 +779,22 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotAccessMemory(F); return Changed; case LibFunc_lstat: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); Changed |= setDoesNotCapture(F, 1); Changed |= setOnlyReadsMemory(F, 0); return Changed; case LibFunc_lchown: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); Changed |= setOnlyReadsMemory(F, 0); return Changed; case LibFunc_qsort: // May throw; places call through function pointer. 
+ // Cannot give undef pointer/size + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotCapture(F, 3); return Changed; case LibFunc_dunder_strdup: @@ -799,6 +828,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { case LibFunc_stat64: case LibFunc_lstat64: case LibFunc_statvfs64: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); Changed |= setDoesNotCapture(F, 1); @@ -828,6 +858,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 0); return Changed; case LibFunc_tmpfile64: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setRetDoesNotAlias(F); return Changed; @@ -847,6 +878,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { // Currently some platforms have the restrict keyword on the arguments to // gettimeofday. To be conservative, do not add noalias to gettimeofday's // arguments. + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); Changed |= setDoesNotCapture(F, 1); @@ -874,6 +906,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { return Changed; // int __nvvm_reflect(const char *) case LibFunc_nvvm_reflect: + Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotAccessMemory(F); Changed |= setDoesNotThrow(F); return Changed; diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp index 5a47c1fd0b6cb..7141e4b1e879e 100644 --- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp +++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp @@ -430,10 +430,11 @@ bool llvm::isLegalToPromote(const CallBase &CB, Function *Callee, } } for (; I < NumArgs; I++) { - // Vararg functions can have more arguments than paramters. + // Vararg functions can have more arguments than parameters. assert(Callee->isVarArg()); if (CB.paramHasAttr(I, Attribute::StructRet)) { - *FailureReason = "SRet arg to vararg function"; + if (FailureReason) + *FailureReason = "SRet arg to vararg function"; return false; } } diff --git a/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp b/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp index cae9d9ee6d709..dca58bcdc0b73 100644 --- a/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp +++ b/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp @@ -41,7 +41,27 @@ IRBuilder<> *EscapeEnumerator::Next() { if (!isa<ReturnInst>(TI) && !isa<ResumeInst>(TI)) continue; - Builder.SetInsertPoint(TI); + // If the ret instruction is followed by a musttail call, + // or a bitcast instruction and then a musttail call, we should return + // the musttail call as the insertion point to not break the musttail + // contract. + auto AdjustMustTailCall = [&](Instruction *I) -> Instruction * { + auto *RI = dyn_cast<ReturnInst>(I); + if (!RI || !RI->getPrevNode()) + return I; + auto *CI = dyn_cast<CallInst>(RI->getPrevNode()); + if (CI && CI->isMustTailCall()) + return CI; + auto *BI = dyn_cast<BitCastInst>(RI->getPrevNode()); + if (!BI || !BI->getPrevNode()) + return I; + CI = dyn_cast<CallInst>(BI->getPrevNode()); + if (CI && CI->isMustTailCall()) + return CI; + return I; + }; + + Builder.SetInsertPoint(AdjustMustTailCall(TI)); return &Builder; } @@ -54,11 +74,12 @@ IRBuilder<> *EscapeEnumerator::Next() { return nullptr; // Find all 'call' instructions that may throw. + // We cannot transform calls with musttail tag. 
SmallVector Calls; for (BasicBlock &BB : F) for (Instruction &II : BB) if (CallInst *CI = dyn_cast(&II)) - if (!CI->doesNotThrow()) + if (!CI->doesNotThrow() && !CI->isMustTailCall()) Calls.push_back(CI); if (Calls.empty()) diff --git a/llvm/lib/Transforms/Utils/FixIrreducible.cpp b/llvm/lib/Transforms/Utils/FixIrreducible.cpp index 460ba9e97fc6e..8d75eea25ba85 100644 --- a/llvm/lib/Transforms/Utils/FixIrreducible.cpp +++ b/llvm/lib/Transforms/Utils/FixIrreducible.cpp @@ -104,7 +104,7 @@ FunctionPass *llvm::createFixIrreduciblePass() { return new FixIrreducible(); } INITIALIZE_PASS_BEGIN(FixIrreducible, "fix-irreducible", "Convert irreducible control-flow into natural loops", false /* Only looks at CFG */, false /* Analysis Pass */) -INITIALIZE_PASS_DEPENDENCY(LowerSwitch) +INITIALIZE_PASS_DEPENDENCY(LowerSwitchLegacyPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(FixIrreducible, "fix-irreducible", diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 30726627bc829..7ff21d7ee9ef6 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -2061,7 +2061,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, dyn_cast(AI->getArraySize())) { auto &DL = Caller->getParent()->getDataLayout(); Type *AllocaType = AI->getAllocatedType(); - uint64_t AllocaTypeSize = DL.getTypeAllocSize(AllocaType); + TypeSize AllocaTypeSize = DL.getTypeAllocSize(AllocaType); uint64_t AllocaArraySize = AIArraySize->getLimitedValue(); // Don't add markers for zero-sized allocas. @@ -2070,9 +2070,10 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // Check that array size doesn't saturate uint64_t and doesn't // overflow when it's multiplied by type size. - if (AllocaArraySize != std::numeric_limits::max() && + if (!AllocaTypeSize.isScalable() && + AllocaArraySize != std::numeric_limits::max() && std::numeric_limits::max() / AllocaArraySize >= - AllocaTypeSize) { + AllocaTypeSize.getFixedSize()) { AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()), AllocaArraySize * AllocaTypeSize); } diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 41349457e2b95..51e8251b22800 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -104,6 +104,12 @@ static cl::opt PHICSEDebugHash( cl::desc("Perform extra assertion checking to verify that PHINodes's hash " "function is well-behaved w.r.t. its isEqual predicate")); +static cl::opt PHICSENumPHISmallSize( + "phicse-num-phi-smallsize", cl::init(32), cl::Hidden, + cl::desc( + "When the basic block contains not more than this number of PHI nodes, " + "perform a (faster!) exhaustive search instead of set-driven one.")); + // Max recursion depth for collectBitParts used when detecting bswap and // bitreverse idioms static const unsigned BitPartRecursionMaxDepth = 64; @@ -1132,9 +1138,39 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, return true; } -// WARNING: this logic must be kept in sync with -// Instruction::isIdenticalToWhenDefined()! -bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { +static bool EliminateDuplicatePHINodesNaiveImpl(BasicBlock *BB) { + // This implementation doesn't currently consider undef operands + // specially. 
Theoretically, two phis which are identical except for + // one having an undef where the other doesn't could be collapsed. + + bool Changed = false; + + // Examine each PHI. + // Note that increment of I must *NOT* be in the iteration_expression, since + // we don't want to immediately advance when we restart from the beginning. + for (auto I = BB->begin(); PHINode *PN = dyn_cast(I);) { + ++I; + // Is there an identical PHI node in this basic block? + // Note that we only look in the upper square's triangle, + // we already checked that the lower triangle PHI's aren't identical. + for (auto J = I; PHINode *DuplicatePN = dyn_cast(J); ++J) { + if (!DuplicatePN->isIdenticalToWhenDefined(PN)) + continue; + // A duplicate. Replace this PHI with the base PHI. + ++NumPHICSEs; + DuplicatePN->replaceAllUsesWith(PN); + DuplicatePN->eraseFromParent(); + Changed = true; + + // The RAUW can change PHIs that we already visited. + I = BB->begin(); + break; // Start over from the beginning. + } + } + return Changed; +} + +static bool EliminateDuplicatePHINodesSetBasedImpl(BasicBlock *BB) { // This implementation doesn't currently consider undef operands // specially. Theoretically, two phis which are identical except for // one having an undef where the other doesn't could be collapsed. @@ -1152,6 +1188,8 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { return PN == getEmptyKey() || PN == getTombstoneKey(); } + // WARNING: this logic must be kept in sync with + // Instruction::isIdenticalToWhenDefined()! static unsigned getHashValueImpl(PHINode *PN) { // Compute a hash value on the operands. Instcombine will likely have // sorted them, which helps expose duplicates, but we have to check all @@ -1191,6 +1229,7 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { // Set of unique PHINodes. DenseSet PHISet; + PHISet.reserve(4 * PHICSENumPHISmallSize); // Examine each PHI. bool Changed = false; @@ -1213,6 +1252,16 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { return Changed; } +bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { + if ( +#ifndef NDEBUG + !PHICSEDebugHash && +#endif + hasNItemsOrLess(BB->phis(), PHICSENumPHISmallSize)) + return EliminateDuplicatePHINodesNaiveImpl(BB); + return EliminateDuplicatePHINodesSetBasedImpl(BB); +} + /// enforceKnownAlignment - If the specified pointer points to an object that /// we control, modify the object's alignment to PrefAlign. This isn't /// often possible though. If alignment is important, a more reliable approach @@ -2795,10 +2844,10 @@ collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals, if (Instruction *I = dyn_cast(V)) { // If this is an or instruction, it may be an inner node of the bswap. 
if (I->getOpcode() == Instruction::Or) { - auto &A = collectBitParts(I->getOperand(0), MatchBSwaps, - MatchBitReversals, BPS, Depth + 1); - auto &B = collectBitParts(I->getOperand(1), MatchBSwaps, - MatchBitReversals, BPS, Depth + 1); + const auto &A = collectBitParts(I->getOperand(0), MatchBSwaps, + MatchBitReversals, BPS, Depth + 1); + const auto &B = collectBitParts(I->getOperand(1), MatchBSwaps, + MatchBitReversals, BPS, Depth + 1); if (!A || !B) return Result; @@ -2830,8 +2879,8 @@ collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals, if (BitShift > BitWidth) return Result; - auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps, - MatchBitReversals, BPS, Depth + 1); + const auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps, + MatchBitReversals, BPS, Depth + 1); if (!Res) return Result; Result = Res; @@ -2862,8 +2911,8 @@ collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals, if (!MatchBitReversals && NumMaskedBits % 8 != 0) return Result; - auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps, - MatchBitReversals, BPS, Depth + 1); + const auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps, + MatchBitReversals, BPS, Depth + 1); if (!Res) return Result; Result = Res; @@ -2877,8 +2926,8 @@ collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals, // If this is a zext instruction zero extend the result. if (I->getOpcode() == Instruction::ZExt) { - auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps, - MatchBitReversals, BPS, Depth + 1); + const auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps, + MatchBitReversals, BPS, Depth + 1); if (!Res) return Result; diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp index b4925064bc6b9..fe8fb90d140ab 100644 --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -357,7 +357,8 @@ PreservedAnalyses LoopVersioningPass::run(Function &F, auto &LAM = AM.getResult(F).getManager(); auto GetLAA = [&](Loop &L) -> const LoopAccessInfo & { - LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, + TLI, TTI, nullptr, MSSA}; return LAM.getResult(L, AR); }; diff --git a/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/llvm/lib/Transforms/Utils/LowerSwitch.cpp index 34e836d9660f3..10a4420b1753b 100644 --- a/llvm/lib/Transforms/Utils/LowerSwitch.cpp +++ b/llvm/lib/Transforms/Utils/LowerSwitch.cpp @@ -12,6 +12,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Utils/LowerSwitch.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" @@ -26,6 +27,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -55,9 +57,9 @@ namespace { } // end anonymous namespace +namespace { // Return true iff R is covered by Ranges. -static bool IsInRanges(const IntRange &R, - const std::vector &Ranges) { +bool IsInRanges(const IntRange &R, const std::vector &Ranges) { // Note: Ranges must be sorted, non-overlapping and non-adjacent. 
// Find the first range whose High field is >= R.High, @@ -68,120 +70,34 @@ static bool IsInRanges(const IntRange &R, return I != Ranges.end() && I->Low <= R.Low; } -namespace { - - /// Replace all SwitchInst instructions with chained branch instructions. - class LowerSwitch : public FunctionPass { - public: - // Pass identification, replacement for typeid - static char ID; - - LowerSwitch() : FunctionPass(ID) { - initializeLowerSwitchPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - } - - struct CaseRange { - ConstantInt* Low; - ConstantInt* High; - BasicBlock* BB; - - CaseRange(ConstantInt *low, ConstantInt *high, BasicBlock *bb) - : Low(low), High(high), BB(bb) {} - }; - - using CaseVector = std::vector; - using CaseItr = std::vector::iterator; - - private: - void processSwitchInst(SwitchInst *SI, - SmallPtrSetImpl &DeleteList, - AssumptionCache *AC, LazyValueInfo *LVI); - - BasicBlock *switchConvert(CaseItr Begin, CaseItr End, - ConstantInt *LowerBound, ConstantInt *UpperBound, - Value *Val, BasicBlock *Predecessor, - BasicBlock *OrigBlock, BasicBlock *Default, - const std::vector &UnreachableRanges); - BasicBlock *newLeafBlock(CaseRange &Leaf, Value *Val, - ConstantInt *LowerBound, ConstantInt *UpperBound, - BasicBlock *OrigBlock, BasicBlock *Default); - unsigned Clusterify(CaseVector &Cases, SwitchInst *SI); - }; - - /// The comparison function for sorting the switch case values in the vector. - /// WARNING: Case ranges should be disjoint! - struct CaseCmp { - bool operator()(const LowerSwitch::CaseRange& C1, - const LowerSwitch::CaseRange& C2) { - const ConstantInt* CI1 = cast(C1.Low); - const ConstantInt* CI2 = cast(C2.High); - return CI1->getValue().slt(CI2->getValue()); - } - }; - -} // end anonymous namespace - -char LowerSwitch::ID = 0; - -// Publicly exposed interface to pass... -char &llvm::LowerSwitchID = LowerSwitch::ID; - -INITIALIZE_PASS_BEGIN(LowerSwitch, "lowerswitch", - "Lower SwitchInst's to branches", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass) -INITIALIZE_PASS_END(LowerSwitch, "lowerswitch", - "Lower SwitchInst's to branches", false, false) - -// createLowerSwitchPass - Interface to this file... -FunctionPass *llvm::createLowerSwitchPass() { - return new LowerSwitch(); -} - -bool LowerSwitch::runOnFunction(Function &F) { - LazyValueInfo *LVI = &getAnalysis().getLVI(); - auto *ACT = getAnalysisIfAvailable(); - AssumptionCache *AC = ACT ? &ACT->getAssumptionCache(F) : nullptr; - - bool Changed = false; - SmallPtrSet DeleteList; - - for (Function::iterator I = F.begin(), E = F.end(); I != E; ) { - BasicBlock *Cur = &*I++; // Advance over block so we don't traverse new blocks - - // If the block is a dead Default block that will be deleted later, don't - // waste time processing it. - if (DeleteList.count(Cur)) - continue; - - if (SwitchInst *SI = dyn_cast(Cur->getTerminator())) { - Changed = true; - processSwitchInst(SI, DeleteList, AC, LVI); - } - } - - for (BasicBlock* BB: DeleteList) { - LVI->eraseBlock(BB); - DeleteDeadBlock(BB); +struct CaseRange { + ConstantInt *Low; + ConstantInt *High; + BasicBlock *BB; + + CaseRange(ConstantInt *low, ConstantInt *high, BasicBlock *bb) + : Low(low), High(high), BB(bb) {} +}; + +using CaseVector = std::vector; +using CaseItr = std::vector::iterator; + +/// The comparison function for sorting the switch case values in the vector. 
+/// WARNING: Case ranges should be disjoint! +struct CaseCmp { + bool operator()(const CaseRange &C1, const CaseRange &C2) { + const ConstantInt *CI1 = cast(C1.Low); + const ConstantInt *CI2 = cast(C2.High); + return CI1->getValue().slt(CI2->getValue()); } - - return Changed; -} +}; /// Used for debugging purposes. LLVM_ATTRIBUTE_USED -static raw_ostream &operator<<(raw_ostream &O, - const LowerSwitch::CaseVector &C) { +raw_ostream &operator<<(raw_ostream &O, const CaseVector &C) { O << "["; - for (LowerSwitch::CaseVector::const_iterator B = C.begin(), E = C.end(); - B != E;) { + for (CaseVector::const_iterator B = C.begin(), E = C.end(); B != E;) { O << "[" << B->Low->getValue() << ", " << B->High->getValue() << "]"; if (++B != E) O << ", "; @@ -200,9 +116,9 @@ static raw_ostream &operator<<(raw_ostream &O, /// 2) Removed if subsequent incoming values now share the same case, i.e., /// multiple outcome edges are condensed into one. This is necessary to keep the /// number of phi values equal to the number of branches to SuccBB. -static void -fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB, - const unsigned NumMergedCases = std::numeric_limits::max()) { +void FixPhis( + BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB, + const unsigned NumMergedCases = std::numeric_limits::max()) { for (BasicBlock::iterator I = SuccBB->begin(), IE = SuccBB->getFirstNonPHI()->getIterator(); I != IE; ++I) { @@ -233,17 +149,80 @@ fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB, } } +/// Create a new leaf block for the binary lookup tree. It checks if the +/// switch's value == the case's value. If not, then it jumps to the default +/// branch. At this point in the tree, the value can't be another valid case +/// value, so the jump to the "default" branch is warranted. +BasicBlock *NewLeafBlock(CaseRange &Leaf, Value *Val, ConstantInt *LowerBound, + ConstantInt *UpperBound, BasicBlock *OrigBlock, + BasicBlock *Default) { + Function *F = OrigBlock->getParent(); + BasicBlock *NewLeaf = BasicBlock::Create(Val->getContext(), "LeafBlock"); + F->getBasicBlockList().insert(++OrigBlock->getIterator(), NewLeaf); + + // Emit comparison + ICmpInst *Comp = nullptr; + if (Leaf.Low == Leaf.High) { + // Make the seteq instruction... + Comp = + new ICmpInst(*NewLeaf, ICmpInst::ICMP_EQ, Val, Leaf.Low, "SwitchLeaf"); + } else { + // Make range comparison + if (Leaf.Low == LowerBound) { + // Val >= Min && Val <= Hi --> Val <= Hi + Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_SLE, Val, Leaf.High, + "SwitchLeaf"); + } else if (Leaf.High == UpperBound) { + // Val <= Max && Val >= Lo --> Val >= Lo + Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_SGE, Val, Leaf.Low, + "SwitchLeaf"); + } else if (Leaf.Low->isZero()) { + // Val >= 0 && Val <= Hi --> Val <=u Hi + Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Val, Leaf.High, + "SwitchLeaf"); + } else { + // Emit V-Lo <=u Hi-Lo + Constant *NegLo = ConstantExpr::getNeg(Leaf.Low); + Instruction *Add = BinaryOperator::CreateAdd( + Val, NegLo, Val->getName() + ".off", NewLeaf); + Constant *UpperBound = ConstantExpr::getAdd(NegLo, Leaf.High); + Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Add, UpperBound, + "SwitchLeaf"); + } + } + + // Make the conditional branch... + BasicBlock *Succ = Leaf.BB; + BranchInst::Create(Succ, Default, Comp, NewLeaf); + + // If there were any PHI nodes in this successor, rewrite one entry + // from OrigBlock to come from NewLeaf. 
+ for (BasicBlock::iterator I = Succ->begin(); isa(I); ++I) { + PHINode *PN = cast(I); + // Remove all but one incoming entries from the cluster + uint64_t Range = Leaf.High->getSExtValue() - Leaf.Low->getSExtValue(); + for (uint64_t j = 0; j < Range; ++j) { + PN->removeIncomingValue(OrigBlock); + } + + int BlockIdx = PN->getBasicBlockIndex(OrigBlock); + assert(BlockIdx != -1 && "Switch didn't go to this successor??"); + PN->setIncomingBlock((unsigned)BlockIdx, NewLeaf); + } + + return NewLeaf; +} + /// Convert the switch statement into a binary lookup of the case values. /// The function recursively builds this tree. LowerBound and UpperBound are /// used to keep track of the bounds for Val that have already been checked by /// a block emitted by one of the previous calls to switchConvert in the call /// stack. -BasicBlock * -LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound, - ConstantInt *UpperBound, Value *Val, - BasicBlock *Predecessor, BasicBlock *OrigBlock, - BasicBlock *Default, - const std::vector &UnreachableRanges) { +BasicBlock *SwitchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound, + ConstantInt *UpperBound, Value *Val, + BasicBlock *Predecessor, BasicBlock *OrigBlock, + BasicBlock *Default, + const std::vector &UnreachableRanges) { assert(LowerBound && UpperBound && "Bounds must be initialized"); unsigned Size = End - Begin; @@ -255,10 +234,10 @@ LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound, if (Begin->Low == LowerBound && Begin->High == UpperBound) { unsigned NumMergedCases = 0; NumMergedCases = UpperBound->getSExtValue() - LowerBound->getSExtValue(); - fixPhis(Begin->BB, OrigBlock, Predecessor, NumMergedCases); + FixPhis(Begin->BB, OrigBlock, Predecessor, NumMergedCases); return Begin->BB; } - return newLeafBlock(*Begin, Val, LowerBound, UpperBound, OrigBlock, + return NewLeafBlock(*Begin, Val, LowerBound, UpperBound, OrigBlock, Default); } @@ -305,12 +284,12 @@ LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound, ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT, Val, Pivot.Low, "Pivot"); - BasicBlock *LBranch = switchConvert(LHS.begin(), LHS.end(), LowerBound, - NewUpperBound, Val, NewNode, OrigBlock, - Default, UnreachableRanges); - BasicBlock *RBranch = switchConvert(RHS.begin(), RHS.end(), NewLowerBound, - UpperBound, Val, NewNode, OrigBlock, - Default, UnreachableRanges); + BasicBlock *LBranch = + SwitchConvert(LHS.begin(), LHS.end(), LowerBound, NewUpperBound, Val, + NewNode, OrigBlock, Default, UnreachableRanges); + BasicBlock *RBranch = + SwitchConvert(RHS.begin(), RHS.end(), NewLowerBound, UpperBound, Val, + NewNode, OrigBlock, Default, UnreachableRanges); F->getBasicBlockList().insert(++OrigBlock->getIterator(), NewNode); NewNode->getInstList().push_back(Comp); @@ -319,78 +298,10 @@ LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound, return NewNode; } -/// Create a new leaf block for the binary lookup tree. It checks if the -/// switch's value == the case's value. If not, then it jumps to the default -/// branch. At this point in the tree, the value can't be another valid case -/// value, so the jump to the "default" branch is warranted. 
-BasicBlock *LowerSwitch::newLeafBlock(CaseRange &Leaf, Value *Val, - ConstantInt *LowerBound, - ConstantInt *UpperBound, - BasicBlock *OrigBlock, - BasicBlock *Default) { - Function* F = OrigBlock->getParent(); - BasicBlock* NewLeaf = BasicBlock::Create(Val->getContext(), "LeafBlock"); - F->getBasicBlockList().insert(++OrigBlock->getIterator(), NewLeaf); - - // Emit comparison - ICmpInst* Comp = nullptr; - if (Leaf.Low == Leaf.High) { - // Make the seteq instruction... - Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_EQ, Val, - Leaf.Low, "SwitchLeaf"); - } else { - // Make range comparison - if (Leaf.Low == LowerBound) { - // Val >= Min && Val <= Hi --> Val <= Hi - Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_SLE, Val, Leaf.High, - "SwitchLeaf"); - } else if (Leaf.High == UpperBound) { - // Val <= Max && Val >= Lo --> Val >= Lo - Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_SGE, Val, Leaf.Low, - "SwitchLeaf"); - } else if (Leaf.Low->isZero()) { - // Val >= 0 && Val <= Hi --> Val <=u Hi - Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Val, Leaf.High, - "SwitchLeaf"); - } else { - // Emit V-Lo <=u Hi-Lo - Constant* NegLo = ConstantExpr::getNeg(Leaf.Low); - Instruction* Add = BinaryOperator::CreateAdd(Val, NegLo, - Val->getName()+".off", - NewLeaf); - Constant *UpperBound = ConstantExpr::getAdd(NegLo, Leaf.High); - Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Add, UpperBound, - "SwitchLeaf"); - } - } - - // Make the conditional branch... - BasicBlock* Succ = Leaf.BB; - BranchInst::Create(Succ, Default, Comp, NewLeaf); - - // If there were any PHI nodes in this successor, rewrite one entry - // from OrigBlock to come from NewLeaf. - for (BasicBlock::iterator I = Succ->begin(); isa(I); ++I) { - PHINode* PN = cast(I); - // Remove all but one incoming entries from the cluster - uint64_t Range = Leaf.High->getSExtValue() - - Leaf.Low->getSExtValue(); - for (uint64_t j = 0; j < Range; ++j) { - PN->removeIncomingValue(OrigBlock); - } - - int BlockIdx = PN->getBasicBlockIndex(OrigBlock); - assert(BlockIdx != -1 && "Switch didn't go to this successor??"); - PN->setIncomingBlock((unsigned)BlockIdx, NewLeaf); - } - - return NewLeaf; -} - /// Transform simple list of \p SI's cases into list of CaseRange's \p Cases. /// \post \p Cases wouldn't contain references to \p SI's default BB. /// \returns Number of \p SI's cases that do not reference \p SI's default BB. -unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) { +unsigned Clusterify(CaseVector &Cases, SwitchInst *SI) { unsigned NumSimpleCases = 0; // Start with "simple" cases @@ -431,9 +342,9 @@ unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) { /// Replace the specified switch instruction with a sequence of chained if-then /// insts in a balanced binary search. -void LowerSwitch::processSwitchInst(SwitchInst *SI, - SmallPtrSetImpl &DeleteList, - AssumptionCache *AC, LazyValueInfo *LVI) { +void ProcessSwitchInst(SwitchInst *SI, + SmallPtrSetImpl &DeleteList, + AssumptionCache *AC, LazyValueInfo *LVI) { BasicBlock *OrigBlock = SI->getParent(); Function *F = OrigBlock->getParent(); Value *Val = SI->getCondition(); // The value we are switching on... @@ -458,7 +369,7 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI, if (Cases.empty()) { BranchInst::Create(Default, OrigBlock); // Remove all the references from Default's PHIs to OrigBlock, but one. 
- fixPhis(Default, OrigBlock, OrigBlock); + FixPhis(Default, OrigBlock, OrigBlock); SI->eraseFromParent(); return; } @@ -592,12 +503,12 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI, BranchInst::Create(Default, NewDefault); BasicBlock *SwitchBlock = - switchConvert(Cases.begin(), Cases.end(), LowerBound, UpperBound, Val, + SwitchConvert(Cases.begin(), Cases.end(), LowerBound, UpperBound, Val, OrigBlock, OrigBlock, NewDefault, UnreachableRanges); // If there are entries in any PHI nodes for the default edge, make sure // to update them as well. - fixPhis(Default, OrigBlock, NewDefault); + FixPhis(Default, OrigBlock, NewDefault); // Branch to our shiny new if-then stuff... BranchInst::Create(SwitchBlock, OrigBlock); @@ -610,3 +521,81 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI, if (pred_begin(OldDefault) == pred_end(OldDefault)) DeleteList.insert(OldDefault); } + +bool LowerSwitch(Function &F, LazyValueInfo *LVI, AssumptionCache *AC) { + bool Changed = false; + SmallPtrSet DeleteList; + + for (Function::iterator I = F.begin(), E = F.end(); I != E;) { + BasicBlock *Cur = + &*I++; // Advance over block so we don't traverse new blocks + + // If the block is a dead Default block that will be deleted later, don't + // waste time processing it. + if (DeleteList.count(Cur)) + continue; + + if (SwitchInst *SI = dyn_cast(Cur->getTerminator())) { + Changed = true; + ProcessSwitchInst(SI, DeleteList, AC, LVI); + } + } + + for (BasicBlock *BB : DeleteList) { + LVI->eraseBlock(BB); + DeleteDeadBlock(BB); + } + + return Changed; +} + +/// Replace all SwitchInst instructions with chained branch instructions. +class LowerSwitchLegacyPass : public FunctionPass { +public: + // Pass identification, replacement for typeid + static char ID; + + LowerSwitchLegacyPass() : FunctionPass(ID) { + initializeLowerSwitchLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + } +}; + +} // end anonymous namespace + +char LowerSwitchLegacyPass::ID = 0; + +// Publicly exposed interface to pass... +char &llvm::LowerSwitchID = LowerSwitchLegacyPass::ID; + +INITIALIZE_PASS_BEGIN(LowerSwitchLegacyPass, "lowerswitch", + "Lower SwitchInst's to branches", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass) +INITIALIZE_PASS_END(LowerSwitchLegacyPass, "lowerswitch", + "Lower SwitchInst's to branches", false, false) + +// createLowerSwitchPass - Interface to this file... +FunctionPass *llvm::createLowerSwitchPass() { + return new LowerSwitchLegacyPass(); +} + +bool LowerSwitchLegacyPass::runOnFunction(Function &F) { + LazyValueInfo *LVI = &getAnalysis().getLVI(); + auto *ACT = getAnalysisIfAvailable(); + AssumptionCache *AC = ACT ? &ACT->getAssumptionCache(F) : nullptr; + return LowerSwitch(F, LVI, AC); +} + +PreservedAnalyses LowerSwitchPass::run(Function &F, + FunctionAnalysisManager &AM) { + LazyValueInfo *LVI = &AM.getResult(F); + AssumptionCache *AC = AM.getCachedResult(F); + return LowerSwitch(F, LVI, AC) ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); } diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 1bb827cd3057b..165030c6d2f1b 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -2184,26 +2184,37 @@ template <typename T> static int costAndCollectOperands( const T *S = cast<T>(WorkItem.S); int Cost = 0; - // Collect the opcodes of all the instructions that will be needed to expand - // the SCEVExpr. This is so that when we come to cost the operands, we know - // what the generated user(s) will be. - SmallVector Opcodes; + // Object to help map SCEV operands to expanded IR instructions. + struct OperationIndices { + OperationIndices(unsigned Opc, size_t min, size_t max) : + Opcode(Opc), MinIdx(min), MaxIdx(max) { } + unsigned Opcode; + size_t MinIdx; + size_t MaxIdx; + }; + + // Collect the operations of all the instructions that will be needed to + // expand the SCEVExpr. This is so that when we come to cost the operands, + // we know what the generated user(s) will be. + SmallVector Operations; auto CastCost = [&](unsigned Opcode) { - Opcodes.push_back(Opcode); + Operations.emplace_back(Opcode, 0, 0); return TTI.getCastInstrCost(Opcode, S->getType(), S->getOperand(0)->getType(), TTI::CastContextHint::None, CostKind); }; - auto ArithCost = [&](unsigned Opcode, unsigned NumRequired) { - Opcodes.push_back(Opcode); + auto ArithCost = [&](unsigned Opcode, unsigned NumRequired, + unsigned MinIdx = 0, unsigned MaxIdx = 1) { + Operations.emplace_back(Opcode, MinIdx, MaxIdx); return NumRequired * TTI.getArithmeticInstrCost(Opcode, S->getType(), CostKind); }; - auto CmpSelCost = [&](unsigned Opcode, unsigned NumRequired) { - Opcodes.push_back(Opcode); + auto CmpSelCost = [&](unsigned Opcode, unsigned NumRequired, + unsigned MinIdx, unsigned MaxIdx) { + Operations.emplace_back(Opcode, MinIdx, MaxIdx); Type *OpType = S->getOperand(0)->getType(); return NumRequired * TTI.getCmpSelInstrCost(Opcode, OpType, @@ -2246,8 +2257,8 @@ template <typename T> static int costAndCollectOperands( case scUMaxExpr: case scSMinExpr: case scUMinExpr: { - Cost += CmpSelCost(Instruction::ICmp, S->getNumOperands() - 1); - Cost += CmpSelCost(Instruction::Select, S->getNumOperands() - 1); + Cost += CmpSelCost(Instruction::ICmp, S->getNumOperands() - 1, 0, 1); + Cost += CmpSelCost(Instruction::Select, S->getNumOperands() - 1, 0, 2); break; } case scAddRecExpr: { @@ -2270,7 +2281,8 @@ template <typename T> static int costAndCollectOperands( // Much like with normal add expr, the polynomial will require // one less addition than the number of its terms. - int AddCost = ArithCost(Instruction::Add, NumTerms - 1); + int AddCost = ArithCost(Instruction::Add, NumTerms - 1, + /*MinIdx*/1, /*MaxIdx*/1); // Here, *each* one of those will require a multiplication. int MulCost = ArithCost(Instruction::Mul, NumNonZeroDegreeNonOneTerms); Cost = AddCost + MulCost; @@ -2286,12 +2298,18 @@ template <typename T> static int costAndCollectOperands( // x ^ {PolyDegree} will give us x ^ {2} .. x ^ {PolyDegree-1} for free. // FIXME: this is conservatively correct, but might be overly pessimistic. Cost += MulCost * (PolyDegree - 1); + break; } } - for (unsigned Opc : Opcodes) - for (auto I : enumerate(S->operands())) - Worklist.emplace_back(Opc, I.index(), I.value()); + for (auto &CostOp : Operations) { + for (auto SCEVOp : enumerate(S->operands())) { + // Clamp the index to account for multiple IR operations being chained. 
+ size_t MinIdx = std::max(SCEVOp.index(), CostOp.MinIdx); + size_t OpIdx = std::min(MinIdx, CostOp.MaxIdx); + Worklist.emplace_back(CostOp.Opcode, OpIdx, SCEVOp.value()); + } + } return Cost; } @@ -2305,7 +2323,7 @@ bool SCEVExpander::isHighCostExpansionHelper( const SCEV *S = WorkItem.S; // Was the cost of expansion of this expression already accounted for? - if (!Processed.insert(S).second) + if (!isa<SCEVConstant>(S) && !Processed.insert(S).second) return false; // We have already accounted for this expression. // If we can find an existing value for this scev available at the point "At" @@ -2313,16 +2331,26 @@ bool SCEVExpander::isHighCostExpansionHelper( if (getRelatedExistingExpansion(S, &At, L)) return false; // Consider the expression to be free. - switch (S->getSCEVType()) { - case scUnknown: - case scConstant: - return false; // Assume to be zero-cost. - } + // Assume to be zero-cost. + if (isa<SCEVUnknown>(S)) + return false; TargetTransformInfo::TargetCostKind CostKind = - TargetTransformInfo::TCK_RecipThroughput; - - if (isa(S)) { + L->getHeader()->getParent()->hasMinSize() + ? TargetTransformInfo::TCK_CodeSize + : TargetTransformInfo::TCK_RecipThroughput; + + if (auto *Constant = dyn_cast<SCEVConstant>(S)) { + // Only evaluate the costs of constants when optimizing for size. + if (CostKind != TargetTransformInfo::TCK_CodeSize) + return 0; + const APInt &Imm = Constant->getAPInt(); + Type *Ty = S->getType(); + BudgetRemaining -= + TTI.getIntImmCostInst(WorkItem.ParentOpcode, WorkItem.OperandIdx, + Imm, Ty, CostKind); + return BudgetRemaining < 0; + } else if (isa(S)) { int Cost = costAndCollectOperands(WorkItem, TTI, CostKind, Worklist); BudgetRemaining -= Cost; diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 34eb9e1b8124f..60b7da7e64feb 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1748,6 +1748,8 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) { Sqrt = getSqrtCall(Base, Pow->getCalledFunction()->getAttributes(), Pow->doesNotAccessMemory(), M, B, TLI); + if (!Sqrt) + return nullptr; } // We will memoize intermediate products of the Addition Chain. diff --git a/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp index 9af39d9a0dd1c..621e944741b14 100644 --- a/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp +++ b/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp @@ -6,10 +6,8 @@ // //===----------------------------------------------------------------------===// // -// This pass is used to ensure that functions have at most one return -// instruction in them. Additionally, it keeps track of which node is the new -// exit node of the CFG. If there are no exit nodes in the CFG, the getExitNode -// method will return a null pointer. +// This pass is used to ensure that functions have at most one return and one +// unreachable instruction in them. // //===----------------------------------------------------------------------===// @@ -42,53 +40,41 @@ void UnifyFunctionExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{ AU.addPreservedID(LowerSwitchID); } -// UnifyAllExitNodes - Unify all exit nodes of the CFG by creating a new -// BasicBlock, and converting all returns to unconditional branches to this -// new basic block. The singular exit node is returned. -// -// If there are no return stmts in the Function, a null pointer is returned. 
-// -bool UnifyFunctionExitNodes::runOnFunction(Function &F) { - // Loop over all of the blocks in a function, tracking all of the blocks that - // return. - // - std::vector ReturningBlocks; +bool UnifyFunctionExitNodes::unifyUnreachableBlocks(Function &F) { std::vector UnreachableBlocks; + for (BasicBlock &I : F) - if (isa(I.getTerminator())) - ReturningBlocks.push_back(&I); - else if (isa(I.getTerminator())) + if (isa(I.getTerminator())) UnreachableBlocks.push_back(&I); - // Then unreachable blocks. - if (UnreachableBlocks.empty()) { - UnreachableBlock = nullptr; - } else if (UnreachableBlocks.size() == 1) { - UnreachableBlock = UnreachableBlocks.front(); - } else { - UnreachableBlock = BasicBlock::Create(F.getContext(), - "UnifiedUnreachableBlock", &F); - new UnreachableInst(F.getContext(), UnreachableBlock); - - for (BasicBlock *BB : UnreachableBlocks) { - BB->getInstList().pop_back(); // Remove the unreachable inst. - BranchInst::Create(UnreachableBlock, BB); - } + if (UnreachableBlocks.size() <= 1) + return false; + + BasicBlock *UnreachableBlock = + BasicBlock::Create(F.getContext(), "UnifiedUnreachableBlock", &F); + new UnreachableInst(F.getContext(), UnreachableBlock); + + for (BasicBlock *BB : UnreachableBlocks) { + BB->getInstList().pop_back(); // Remove the unreachable inst. + BranchInst::Create(UnreachableBlock, BB); } - // Now handle return blocks. - if (ReturningBlocks.empty()) { - ReturnBlock = nullptr; - return false; // No blocks return - } else if (ReturningBlocks.size() == 1) { - ReturnBlock = ReturningBlocks.front(); // Already has a single return block + return true; +} + +bool UnifyFunctionExitNodes::unifyReturnBlocks(Function &F) { + std::vector ReturningBlocks; + + for (BasicBlock &I : F) + if (isa(I.getTerminator())) + ReturningBlocks.push_back(&I); + + if (ReturningBlocks.size() <= 1) return false; - } - // Otherwise, we need to insert a new basic block into the function, add a PHI - // nodes (if the function returns values), and convert all of the return - // instructions into unconditional branches. - // + // Insert a new basic block into the function, add PHI nodes (if the function + // returns values), and convert all of the return instructions into + // unconditional branches. BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), "UnifiedReturnBlock", &F); @@ -105,7 +91,6 @@ bool UnifyFunctionExitNodes::runOnFunction(Function &F) { // Loop over all of the blocks, replacing the return instruction with an // unconditional branch. - // for (BasicBlock *BB : ReturningBlocks) { // Add an incoming element to the PHI node for every return instruction that // is merging into this new block... @@ -115,6 +100,16 @@ bool UnifyFunctionExitNodes::runOnFunction(Function &F) { BB->getInstList().pop_back(); // Remove the return insn BranchInst::Create(NewRetBlock, BB); } - ReturnBlock = NewRetBlock; + return true; } + +// Unify all exit nodes of the CFG by creating a new BasicBlock, and converting +// all returns to unconditional branches to this new basic block. Also, unify +// all unreachable blocks. 
+bool UnifyFunctionExitNodes::runOnFunction(Function &F) { + bool Changed = false; + Changed |= unifyUnreachableBlocks(F); + Changed |= unifyReturnBlocks(F); + return Changed; +} diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp index b10deee3907c7..7017ee7bea957 100644 --- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp +++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp @@ -16,6 +16,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/MapVector.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/InitializePasses.h" @@ -53,7 +54,7 @@ FunctionPass *llvm::createUnifyLoopExitsPass() { return new UnifyLoopExits(); } INITIALIZE_PASS_BEGIN(UnifyLoopExits, "unify-loop-exits", "Fixup each natural loop to have a single exit block", false /* Only looks at CFG */, false /* Analysis Pass */) -INITIALIZE_PASS_DEPENDENCY(LowerSwitch) +INITIALIZE_PASS_DEPENDENCY(LowerSwitchLegacyPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(UnifyLoopExits, "unify-loop-exits", @@ -80,7 +81,7 @@ static void restoreSSA(const DominatorTree &DT, const Loop *L, const SetVector &Incoming, BasicBlock *LoopExitBlock) { using InstVector = SmallVector; - using IIMap = DenseMap; + using IIMap = MapVector; IIMap ExternalUsers; for (auto BB : L->blocks()) { for (auto &I : *BB) { diff --git a/llvm/lib/Transforms/Utils/Utils.cpp b/llvm/lib/Transforms/Utils/Utils.cpp index ce98a739bea88..1638635440a95 100644 --- a/llvm/lib/Transforms/Utils/Utils.cpp +++ b/llvm/lib/Transforms/Utils/Utils.cpp @@ -34,7 +34,7 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) { initializeLibCallsShrinkWrapLegacyPassPass(Registry); initializeLoopSimplifyPass(Registry); initializeLowerInvokeLegacyPassPass(Registry); - initializeLowerSwitchPass(Registry); + initializeLowerSwitchLegacyPassPass(Registry); initializeNameAnonGlobalLegacyPassPass(Registry); initializePromoteLegacyPassPass(Registry); initializeStripNonLineTableDebugInfoPass(Registry); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b9f7ae71d0cf2..b203dd88eb3dd 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6883,7 +6883,7 @@ void LoopVectorizationCostModel::collectInLoopReductions() { // For the moment, without predicated reduction instructions, we do not // support inloop reductions whilst folding the tail, and hence in those cases // all reductions are currently out of the loop. - if (!PreferInLoopReductions || foldTailByMasking()) + if (foldTailByMasking()) return; for (auto &Reduction : Legal->getReductionVars()) { @@ -6894,6 +6894,14 @@ void LoopVectorizationCostModel::collectInLoopReductions() { if (RdxDesc.getRecurrenceType() != Phi->getType()) continue; + // If the target would prefer this reduction to happen "in-loop", then we + // want to record it as such. + unsigned Opcode = RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()); + if (!PreferInLoopReductions && + !TTI.preferInLoopReduction(Opcode, Phi->getType(), + TargetTransformInfo::ReductionFlags())) + continue; + // Check that we can correctly put the reductions into the loop, by // finding the chain of operations that leads from the phi to the loop // exit value. 
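
The collectInLoopReductions hunk above changes the gate from a single command-line flag to flag-or-target: a reduction is recorded as in-loop when either -prefer-inloop-reductions is set or TTI.preferInLoopReduction reports a preference for that opcode and type, while tail folding still keeps every reduction out of the loop. A minimal sketch of the resulting selection logic; Reduction, its fields, and collectInLoopReductions below are illustrative stand-ins for the real RecurrenceDescriptor and TTI hook, not LLVM API:

#include <vector>

struct Reduction {
  unsigned Opcode;    // binary opcode of the recurrence (stand-in)
  bool TargetPrefers; // stand-in for TTI.preferInLoopReduction(...)
};

static bool PreferInLoopReductions = false; // stand-in for the cl::opt

static std::vector<Reduction>
collectInLoopReductions(const std::vector<Reduction> &Candidates,
                        bool FoldTailByMasking) {
  std::vector<Reduction> InLoop;
  // Tail folding still forces all reductions out of the loop, as before.
  if (FoldTailByMasking)
    return InLoop;
  for (const Reduction &R : Candidates) {
    // New gate: skip only when neither the flag nor the target asks for an
    // in-loop reduction of this opcode/type.
    if (!PreferInLoopReductions && !R.TargetPrefers)
      continue;
    InLoop.push_back(R);
  }
  return InLoop;
}

Under these assumptions the flag keeps its old meaning as a global override, while targets can now opt in per reduction.
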
@@ -8613,7 +8621,8 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &LAM = AM.getResult(F).getManager(); std::function GetLAA = [&](Loop &L) -> const LoopAccessInfo & { - LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, + TLI, TTI, nullptr, MSSA}; return LAM.getResult(L, AR); }; auto &MAMProxy = AM.getResult(F); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index ec138bf2b7c88..c487301177c14 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -17,11 +17,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/SLPVectorizer.h" -#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/MapVector.h" -#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" @@ -30,7 +27,6 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" @@ -67,7 +63,6 @@ #include "llvm/IR/Module.h" #include "llvm/IR/NoFolder.h" #include "llvm/IR/Operator.h" -#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" @@ -507,7 +502,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, } /// \returns the AA location that is being access by the instruction. -static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) { +static MemoryLocation getLocation(Instruction *I, AAResults *AA) { if (StoreInst *SI = dyn_cast(I)) return MemoryLocation::get(SI); if (LoadInst *LI = dyn_cast(I)) @@ -544,7 +539,7 @@ class BoUpSLP { MapVector>; BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, - TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, + TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE) : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), @@ -2240,7 +2235,7 @@ class BoUpSLP { ScalarEvolution *SE; TargetTransformInfo *TTI; TargetLibraryInfo *TLI; - AliasAnalysis *AA; + AAResults *AA; LoopInfo *LI; DominatorTree *DT; AssumptionCache *AC; @@ -3694,11 +3689,13 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI) { // Look past the root to find a source value. Arbitrarily follow the // path through operand 0 of any 'or'. Also, peek through optional - // shift-left-by-constant. + // shift-left-by-multiple-of-8-bits. Value *ZextLoad = Root; + const APInt *ShAmtC; while (!isa(ZextLoad) && (match(ZextLoad, m_Or(m_Value(), m_Value())) || - match(ZextLoad, m_Shl(m_Value(), m_Constant())))) + (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) && + ShAmtC->urem(8) == 0))) ZextLoad = cast(ZextLoad)->getOperand(0); // Check if the input is an extended load of the required or/shift expression. 
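
The isLoadCombineCandidateImpl hunk above tightens the shl matcher from any constant to amounts with ShAmtC->urem(8) == 0. Load combining rebuilds one wide integer out of byte-sized pieces, which only works when every piece lands on a whole-byte boundary; a shift by a non-multiple of 8 splits a loaded byte across byte lanes. A small standalone illustration of the byte-lane view (plain C++, not the pass code):

#include <cstddef>
#include <cstdint>

// Assemble a 32-bit little-endian value from four bytes. Every shift amount
// is a multiple of 8 (0, 8, 16, 24), so each byte occupies exactly one byte
// lane and the whole or/shl tree is equivalent to a single 32-bit load.
uint32_t loadLE32(const uint8_t *P) {
  uint32_t V = 0;
  for (size_t I = 0; I < 4; ++I)
    V |= uint32_t(P[I]) << (8 * I);
  return V;
}
// A tree using, say, "<< 4" would straddle byte lanes, so no wide load can
// reproduce it; that is what the urem(8) == 0 guard filters out.
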
@@ -5706,7 +5703,7 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, - TargetLibraryInfo *TLI_, AliasAnalysis *AA_, + TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_) { @@ -6256,9 +6253,9 @@ class HorizontalReduction { enum ReductionKind { RK_None, /// Not a reduction. RK_Arithmetic, /// Binary reduction data. - RK_Min, /// Minimum reduction data. + RK_SMin, /// Signed minimum reduction data. RK_UMin, /// Unsigned minimum reduction data. - RK_Max, /// Maximum reduction data. + RK_SMax, /// Signed maximum reduction data. RK_UMax, /// Unsigned maximum reduction data. }; @@ -6276,9 +6273,6 @@ class HorizontalReduction { /// Kind of the reduction operation. ReductionKind Kind = RK_None; - /// True if float point min/max reduction has no NaNs. - bool NoNaN = false; - /// Checks if the reduction operation can be vectorized. bool isVectorizable() const { return LHS && RHS && @@ -6288,10 +6282,9 @@ class HorizontalReduction { Opcode == Instruction::Mul || Opcode == Instruction::FMul || Opcode == Instruction::And || Opcode == Instruction::Or || Opcode == Instruction::Xor)) || - ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && - (Kind == RK_Min || Kind == RK_Max)) || (Opcode == Instruction::ICmp && - (Kind == RK_UMin || Kind == RK_UMax))); + (Kind == RK_SMin || Kind == RK_SMax || + Kind == RK_UMin || Kind == RK_UMax))); } /// Creates reduction operation with the current opcode. @@ -6303,13 +6296,13 @@ class HorizontalReduction { case RK_Arithmetic: return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS, Name); - case RK_Min: - Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS) - : Builder.CreateFCmpOLT(LHS, RHS); + case RK_SMin: + assert(Opcode == Instruction::ICmp && "Expected integer types."); + Cmp = Builder.CreateICmpSLT(LHS, RHS); return Builder.CreateSelect(Cmp, LHS, RHS, Name); - case RK_Max: - Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS) - : Builder.CreateFCmpOGT(LHS, RHS); + case RK_SMax: + assert(Opcode == Instruction::ICmp && "Expected integer types."); + Cmp = Builder.CreateICmpSGT(LHS, RHS); return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_UMin: assert(Opcode == Instruction::ICmp && "Expected integer types."); @@ -6337,9 +6330,8 @@ class HorizontalReduction { /// Constructor for reduction operations with opcode and its left and /// right operands. 
- OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind, - bool NoNaN = false) - : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) { + OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind) + : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind) { assert(Kind != RK_None && "One of the reduction operations is expected."); } @@ -6350,8 +6342,8 @@ class HorizontalReduction { switch (Kind) { case RK_Arithmetic: return false; - case RK_Min: - case RK_Max: + case RK_SMin: + case RK_SMax: case RK_UMin: case RK_UMax: return true; @@ -6433,10 +6425,8 @@ class HorizontalReduction { switch (Kind) { case RK_Arithmetic: return I->isAssociative(); - case RK_Min: - case RK_Max: - return Opcode == Instruction::ICmp || - cast(I->getOperand(0))->isFast(); + case RK_SMin: + case RK_SMax: case RK_UMin: case RK_UMax: assert(Opcode == Instruction::ICmp && @@ -6466,7 +6456,6 @@ class HorizontalReduction { LHS = nullptr; RHS = nullptr; Kind = RK_None; - NoNaN = false; } /// Get the opcode of the reduction operation. @@ -6494,8 +6483,8 @@ class HorizontalReduction { case RK_Arithmetic: propagateIRFlags(Op, ReductionOps[0]); return Op; - case RK_Min: - case RK_Max: + case RK_SMin: + case RK_SMax: case RK_UMin: case RK_UMax: if (auto *SI = dyn_cast(Op)) @@ -6518,8 +6507,8 @@ class HorizontalReduction { case RK_Arithmetic: propagateIRFlags(Op, I); return Op; - case RK_Min: - case RK_Max: + case RK_SMin: + case RK_SMax: case RK_UMin: case RK_UMax: if (auto *SI = dyn_cast(Op)) { @@ -6536,16 +6525,15 @@ class HorizontalReduction { TargetTransformInfo::ReductionFlags getFlags() const { TargetTransformInfo::ReductionFlags Flags; - Flags.NoNaN = NoNaN; switch (Kind) { case RK_Arithmetic: break; - case RK_Min: - Flags.IsSigned = Opcode == Instruction::ICmp; + case RK_SMin: + Flags.IsSigned = true; Flags.IsMaxOp = false; break; - case RK_Max: - Flags.IsSigned = Opcode == Instruction::ICmp; + case RK_SMax: + Flags.IsSigned = true; Flags.IsMaxOp = true; break; case RK_UMin: @@ -6610,21 +6598,11 @@ class HorizontalReduction { if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) { return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin); } else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) { - return OperationData(Instruction::ICmp, LHS, RHS, RK_Min); - } else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) || - m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) { - return OperationData( - Instruction::FCmp, LHS, RHS, RK_Min, - cast(Select->getCondition())->hasNoNaNs()); + return OperationData(Instruction::ICmp, LHS, RHS, RK_SMin); } else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) { return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax); } else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) { - return OperationData(Instruction::ICmp, LHS, RHS, RK_Max); - } else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) || - m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) { - return OperationData( - Instruction::FCmp, LHS, RHS, RK_Max, - cast(Select->getCondition())->hasNoNaNs()); + return OperationData(Instruction::ICmp, LHS, RHS, RK_SMax); } else { // Try harder: look for min/max pattern based on instructions producing // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2). 
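The net effect of the RK_Min/RK_Max split above: floating-point min/max reductions are dropped from this path entirely (along with the NoNaN bookkeeping), and the four surviving kinds are integer-only, each lowered by createOp to exactly one compare plus one select. A scalar model of that lowering, with plain C++ arithmetic standing in for the IRBuilder calls:

#include <cstdint>

// The four integer-only reduction kinds left after the split.
enum class RK { SMin, SMax, UMin, UMax };

// Scalar analogue of createOp(): one compare, one select.
int64_t createOp(RK Kind, int64_t L, int64_t R) {
  bool Cmp = false;
  switch (Kind) {
  case RK::SMin: Cmp = L < R; break;                     // icmp slt
  case RK::SMax: Cmp = L > R; break;                     // icmp sgt
  case RK::UMin: Cmp = (uint64_t)L < (uint64_t)R; break; // icmp ult
  case RK::UMax: Cmp = (uint64_t)L > (uint64_t)R; break; // icmp ugt
  }
  return Cmp ? L : R;                                    // select Cmp, L, R
}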
@@ -6672,14 +6650,7 @@ class HorizontalReduction { case CmpInst::ICMP_SLT: case CmpInst::ICMP_SLE: - return OperationData(Instruction::ICmp, LHS, RHS, RK_Min); - - case CmpInst::FCMP_OLT: - case CmpInst::FCMP_OLE: - case CmpInst::FCMP_ULT: - case CmpInst::FCMP_ULE: - return OperationData(Instruction::FCmp, LHS, RHS, RK_Min, - cast(Cond)->hasNoNaNs()); + return OperationData(Instruction::ICmp, LHS, RHS, RK_SMin); case CmpInst::ICMP_UGT: case CmpInst::ICMP_UGE: @@ -6687,14 +6658,7 @@ class HorizontalReduction { case CmpInst::ICMP_SGT: case CmpInst::ICMP_SGE: - return OperationData(Instruction::ICmp, LHS, RHS, RK_Max); - - case CmpInst::FCMP_OGT: - case CmpInst::FCMP_OGE: - case CmpInst::FCMP_UGT: - case CmpInst::FCMP_UGE: - return OperationData(Instruction::FCmp, LHS, RHS, RK_Max, - cast(Cond)->hasNoNaNs()); + return OperationData(Instruction::ICmp, LHS, RHS, RK_SMax); } } } @@ -6832,35 +6796,26 @@ class HorizontalReduction { return true; } - /// Attempt to vectorize the tree found by - /// matchAssociativeReduction. + /// Attempt to vectorize the tree found by matchAssociativeReduction. bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { - if (ReducedVals.empty()) - return false; - - // If there is a sufficient number of reduction values, reduce - // to a nearby power-of-2. Can safely generate oversized + // If there are a sufficient number of reduction values, reduce + // to a nearby power-of-2. We can safely generate oversized // vectors and rely on the backend to split them to legal sizes. unsigned NumReducedVals = ReducedVals.size(); if (NumReducedVals < 4) return false; - unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); - - Value *VectorizedTree = nullptr; - // FIXME: Fast-math-flags should be set based on the instructions in the // reduction (not all of 'fast' are required). IRBuilder<> Builder(cast(ReductionRoot)); FastMathFlags Unsafe; Unsafe.setFast(); Builder.setFastMathFlags(Unsafe); - unsigned i = 0; BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; - // The same extra argument may be used several time, so log each attempt + // The same extra argument may be used several times, so log each attempt // to use it. - for (auto &Pair : ExtraArgs) { + for (std::pair &Pair : ExtraArgs) { assert(Pair.first && "DebugLoc must be set."); ExternallyUsedValues[Pair.second].push_back(Pair.first); } @@ -6880,10 +6835,42 @@ class HorizontalReduction { // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; SmallVector IgnoreList; - for (auto &V : ReductionOps) - IgnoreList.append(V.begin(), V.end()); + for (ReductionOpsType &RdxOp : ReductionOps) + IgnoreList.append(RdxOp.begin(), RdxOp.end()); + + unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); + if (NumReducedVals > ReduxWidth) { + // In the loop below, we are building a tree based on a window of + // 'ReduxWidth' values. + // If the operands of those values have common traits (compare predicate, + // constant operand, etc), then we want to group those together to + // minimize the cost of the reduction. + + // TODO: This should be extended to count common operands for + // compares and binops. + + // Step 1: Count the number of times each compare predicate occurs. + SmallDenseMap PredCountMap; + for (Value *RdxVal : ReducedVals) { + CmpInst::Predicate Pred; + if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value()))) + ++PredCountMap[Pred]; + } + // Step 2: Sort the values so the most common predicates come first. 
+ stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) {
+ CmpInst::Predicate PredA, PredB;
+ if (match(A, m_Cmp(PredA, m_Value(), m_Value())) &&
+ match(B, m_Cmp(PredB, m_Value(), m_Value()))) {
+ return PredCountMap[PredA] > PredCountMap[PredB];
+ }
+ return false;
+ });
+ }
+
+ Value *VectorizedTree = nullptr;
+ unsigned i = 0;
 while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
- auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
+ ArrayRef VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
 V.buildTree(VL, ExternallyUsedValues, IgnoreList);
 Optional> Order = V.bestOrder();
 // TODO: Handle orders of size less than number of elements in the vector.
@@ -6906,25 +6893,25 @@ class HorizontalReduction {
 int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth);
 int Cost = TreeCost + ReductionCost;
 if (Cost >= -SLPCostThreshold) {
- V.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "HorSLPNotBeneficial", cast(VL[0]))
- << "Vectorizing horizontal reduction is possible"
- << "but not beneficial with cost "
- << ore::NV("Cost", Cost) << " and threshold "
- << ore::NV("Threshold", -SLPCostThreshold);
- });
- break;
+ V.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
+ cast(VL[0]))
+ << "Vectorizing horizontal reduction is possible "
+ << "but not beneficial with cost " << ore::NV("Cost", Cost)
+ << " and threshold "
+ << ore::NV("Threshold", -SLPCostThreshold);
+ });
+ break;
 }
 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
 << Cost << ". (HorRdx)\n");
 V.getORE()->emit([&]() {
- return OptimizationRemark(
- SV_NAME, "VectorizedHorizontalReduction", cast(VL[0]))
- << "Vectorized horizontal reduction with cost "
- << ore::NV("Cost", Cost) << " and with tree size "
- << ore::NV("TreeSize", V.getTreeSize());
+ return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
+ cast(VL[0]))
+ << "Vectorized horizontal reduction with cost "
+ << ore::NV("Cost", Cost) << " and with tree size "
+ << ore::NV("TreeSize", V.getTreeSize());
 });
 // Vectorize a tree.
@@ -6941,15 +6928,19 @@ class HorizontalReduction {
 Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
- if (VectorizedTree) {
+
+ if (!VectorizedTree) {
+ // Initialize the final value in the reduction.
+ VectorizedTree = ReducedSubTree;
+ } else {
+ // Update the final value in the reduction.
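The reordering step added above deserves a second look: before the values are carved into power-of-2 windows, they are stable-sorted by compare-predicate frequency so that compares sharing a predicate tend to land in the same vectorization window, which usually yields a cheaper, more uniform tree. A simplified standalone version follows; unlike the hunk, which leaves compare/non-compare pairs unordered, this variant treats non-compares as frequency 0, a slightly stronger (and strict-weak-order-safe) comparator.

#include <algorithm>
#include <map>
#include <vector>

// Toy reduction value: either a compare with some predicate id, or not.
struct Val {
  bool IsCmp = false;
  int Pred = 0; // stand-in for CmpInst::Predicate
};

// Step 1: count predicate occurrences. Step 2: stable-sort so the
// most common predicates come first; stability preserves the original
// order within each predicate group.
void groupByPredicate(std::vector<Val> &Vals) {
  std::map<int, unsigned> Count;
  for (const Val &V : Vals)
    if (V.IsCmp)
      ++Count[V.Pred];
  auto Freq = [&](const Val &V) { return V.IsCmp ? Count[V.Pred] : 0u; };
  std::stable_sort(Vals.begin(), Vals.end(),
                   [&](const Val &A, const Val &B) { return Freq(A) > Freq(B); });
}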
Builder.SetCurrentDebugLocation(Loc);
 OperationData VectReductionData(ReductionData.getOpcode(), VectorizedTree,
 ReducedSubTree, ReductionData.getKind());
 VectorizedTree =
 VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
- } else
- VectorizedTree = ReducedSubTree;
+ }
 i += ReduxWidth;
 ReduxWidth = PowerOf2Floor(NumReducedVals - i);
 }
@@ -7017,8 +7008,8 @@ class HorizontalReduction {
 TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
 /*IsPairwiseForm=*/false);
 break;
- case RK_Min:
- case RK_Max:
+ case RK_SMin:
+ case RK_SMax:
 case RK_UMin:
 case RK_UMax: {
 auto *VecCondTy = cast(CmpInst::makeCmpResultType(VecTy));
@@ -7045,8 +7036,8 @@ class HorizontalReduction {
 ScalarReduxCost =
 TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
 break;
- case RK_Min:
- case RK_Max:
+ case RK_SMin:
+ case RK_SMax:
 case RK_UMin:
 case RK_UMax:
 ScalarReduxCost =
@@ -7544,6 +7535,11 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
 SmallVector PostProcessInstructions;
 SmallDenseSet KeyNodes;
 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ // Skip instructions with scalable type. The number of elements is unknown
+ // at compile time for scalable types.
+ if (isa(it->getType()))
+ continue;
+
 // Skip instructions marked for the deletion.
 if (R.isDeleted(&*it))
 continue;
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 29e9b92040d43..1bac16b92a9d9 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -92,24 +92,29 @@ static void replaceValue(Value &Old, Value &New) {
 }
 bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
- // Match insert of scalar load.
+ // Match insert into fixed vector of scalar load.
+ auto *Ty = dyn_cast(I.getType());
 Value *Scalar;
- if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
+ if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
+ !Scalar->hasOneUse())
 return false;
+
+ // Do not vectorize scalar load (widening) if atomic/volatile or under
+ // asan/hwasan/memtag/tsan. The widened load may load data from dirty regions
+ // or create data races non-existent in the source.
 auto *Load = dyn_cast(Scalar);
- Type *ScalarTy = Scalar->getType();
- if (!Load || !Load->isSimple())
- return false;
- auto *Ty = dyn_cast(I.getType());
- if (!Ty)
+ if (!Load || !Load->isSimple() ||
+ Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
+ mustSuppressSpeculation(*Load))
 return false;
 // TODO: Extend this to match GEP with constant offsets.
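Pulling together the vectorizeLoadInsert checks introduced above (the function continues below): the rewrite now fires only for an insert into a fixed-width vector, of a single-use scalar that is a simple (non-atomic, non-volatile) load, and not under memtag or any condition that must suppress speculation; the size check a few lines further down additionally requires the minimum vector register width to be a whole multiple of the scalar width. A toy predicate bundling those conditions, with plain booleans replacing the LLVM queries:

// All fields are stand-ins for the queries made in the hunk above.
struct WidenCandidate {
  bool InsertsIntoFixedVector;  // I.getType() is a fixed-width vector
  bool ScalarHasOneUse;         // Scalar->hasOneUse()
  bool IsSimpleLoad;            // neither atomic nor volatile
  bool SanitizerForbidsWiden;   // e.g. the memtag attribute check
  bool SpeculationSuppressed;   // mustSuppressSpeculation(*Load)
  unsigned ScalarSizeInBits;    // scalar type width
  unsigned MinVectorSizeInBits; // TTI.getMinVectorRegisterBitWidth()
};

bool mayWidenLoad(const WidenCandidate &C) {
  return C.InsertsIntoFixedVector && C.ScalarHasOneUse && C.IsSimpleLoad &&
         !C.SanitizerForbidsWiden && !C.SpeculationSuppressed &&
         C.ScalarSizeInBits != 0 && C.MinVectorSizeInBits != 0 &&
         C.MinVectorSizeInBits % C.ScalarSizeInBits == 0;
}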
Value *PtrOp = Load->getPointerOperand()->stripPointerCasts(); assert(isa(PtrOp->getType()) && "Expected a pointer type"); - unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth(); + Type *ScalarTy = Scalar->getType(); uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits(); + unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth(); if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0) return false; diff --git a/llvm/lib/WindowsManifest/CMakeLists.txt b/llvm/lib/WindowsManifest/CMakeLists.txt index 7ccc17ad577d3..0f597af3c36f8 100644 --- a/llvm/lib/WindowsManifest/CMakeLists.txt +++ b/llvm/lib/WindowsManifest/CMakeLists.txt @@ -1,23 +1,28 @@ +include(GetLibraryName) + +if(LLVM_ENABLE_LIBXML2) + set(imported_libs LibXml2::LibXml2) +endif() + add_llvm_component_library(LLVMWindowsManifest WindowsManifestMerger.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/WindowsManifest - ${Backtrace_INCLUDE_DIRS}) + ${Backtrace_INCLUDE_DIRS} + LINK_LIBS ${imported_libs}) -if(LIBXML2_LIBRARIES) - target_link_libraries(LLVMWindowsManifest PUBLIC ${LIBXML2_LIBRARIES}) - - get_filename_component(xml2_library ${LIBXML2_LIBRARIES} NAME) - if (CMAKE_STATIC_LIBRARY_PREFIX AND - xml2_library MATCHES "^${CMAKE_STATIC_LIBRARY_PREFIX}.*${CMAKE_STATIC_LIBRARY_SUFFIX}$") - string(REGEX REPLACE "^${CMAKE_STATIC_LIBRARY_PREFIX}" "" xml2_library ${xml2_library}) - string(REGEX REPLACE "${CMAKE_STATIC_LIBRARY_SUFFIX}$" "" xml2_library ${xml2_library}) - elseif (CMAKE_SHARED_LIBRARY_PREFIX AND - xml2_library MATCHES "^${CMAKE_SHARED_LIBRARY_PREFIX}.*${CMAKE_SHARED_LIBRARY_SUFFIX}$") - string(REGEX REPLACE "^${CMAKE_SHARED_LIBRARY_PREFIX}" "" xml2_library ${xml2_library}) - string(REGEX REPLACE "${CMAKE_SHARED_LIBRARY_SUFFIX}$" "" xml2_library ${xml2_library}) +# This block is only needed for llvm-config. When we deprecate llvm-config and +# move to using CMake export, this block can be removed. +if(LLVM_ENABLE_LIBXML2) + # CMAKE_BUILD_TYPE is only meaningful to single-configuration generators. 
+ if(CMAKE_BUILD_TYPE)
+ string(TOUPPER ${CMAKE_BUILD_TYPE} build_type)
+ get_property(libxml2_library TARGET LibXml2::LibXml2 PROPERTY LOCATION_${build_type})
+ endif()
+ if(NOT libxml2_library)
+ get_property(libxml2_library TARGET LibXml2::LibXml2 PROPERTY LOCATION)
 endif()
- set_property(TARGET LLVMWindowsManifest PROPERTY
- LLVM_SYSTEM_LIBS ${xml2_library})
+ get_library_name(${libxml2_library} libxml2_library)
+ set_property(TARGET LLVMWindowsManifest PROPERTY LLVM_SYSTEM_LIBS ${libxml2_library})
 endif()
diff --git a/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp b/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp
index 031a963cd3b0c..6af7bc699d056 100644
--- a/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp
+++ b/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp
@@ -16,7 +16,7 @@
 #include
-#if LLVM_LIBXML2_ENABLED
+#if LLVM_ENABLE_LIBXML2
 #include
 #endif
@@ -41,7 +41,7 @@ class WindowsManifestMerger::WindowsManifestMergerImpl {
 private:
 static void errorCallback(void *Ctx, const char *Format, ...);
 Error getParseError();
-#if LLVM_LIBXML2_ENABLED
+#if LLVM_ENABLE_LIBXML2
 xmlDocPtr CombinedDoc = nullptr;
 std::vector MergedDocs;
@@ -56,7 +56,7 @@ class WindowsManifestMerger::WindowsManifestMergerImpl {
 bool ParseErrorOccurred = false;
 };
-#if LLVM_LIBXML2_ENABLED
+#if LLVM_ENABLE_LIBXML2
 static constexpr std::pair MtNsHrefsPrefixes[] = {
 {"urn:schemas-microsoft-com:asm.v1", "ms_asmv1"},
diff --git a/llvm/test/Analysis/AliasSet/guards.ll b/llvm/test/Analysis/AliasSet/guards.ll
index 3a162b5c21c8d..f822290917c85 100644
--- a/llvm/test/Analysis/AliasSet/guards.ll
+++ b/llvm/test/Analysis/AliasSet/guards.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -basic-aa -print-alias-sets -S -o - < %s 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes=print-alias-sets -S -o - < %s 2>&1 | FileCheck %s
 declare void @llvm.experimental.guard(i1, ...)
; CHECK: Alias sets for function 'test0': diff --git a/llvm/test/Analysis/BasicAA/intrinsics.ll b/llvm/test/Analysis/BasicAA/intrinsics.ll index 9cc55ca7a3dec..679beefac5284 100644 --- a/llvm/test/Analysis/BasicAA/intrinsics.ll +++ b/llvm/test/Analysis/BasicAA/intrinsics.ll @@ -23,5 +23,5 @@ declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) nounwind ; CHECK: attributes #0 = { argmemonly nounwind readonly willreturn } -; CHECK: attributes #1 = { argmemonly nounwind willreturn } +; CHECK: attributes #1 = { argmemonly nounwind willreturn writeonly } ; CHECK: attributes [[ATTR]] = { nounwind } diff --git a/llvm/test/Analysis/CostModel/X86/div.ll b/llvm/test/Analysis/CostModel/X86/div.ll index fb3b705fd186d..4bead926bb90b 100644 --- a/llvm/test/Analysis/CostModel/X86/div.ll +++ b/llvm/test/Analysis/CostModel/X86/div.ll @@ -450,62 +450,24 @@ define i32 @udiv_const() { } define i32 @sdiv_uniformconst() { -; SSE2-LABEL: 'sdiv_uniformconst' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4i32 = sdiv <4 x i32> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8i32 = sdiv <8 x i32> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16i32 = sdiv <16 x i32> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSSE3-LABEL: 'sdiv_uniformconst' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4i32 = sdiv <4 x i32> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8i32 = sdiv <8 x i32> undef, -; 
SSSE3-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16i32 = sdiv <16 x i32> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'sdiv_uniformconst' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8i32 = sdiv <8 x i32> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16i32 = sdiv <16 x i32> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'sdiv_uniformconst' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = sdiv <4 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8i32 = sdiv <8 x i32> undef, +; SSE-NEXT: 
Cost Model: Found an estimated cost of 24 for instruction: %V16i32 = sdiv <16 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'sdiv_uniformconst' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7 @@ -513,9 +475,9 @@ define i32 @sdiv_uniformconst() { ; AVX1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = sdiv <8 x i32> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i32 = sdiv <16 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = sdiv <4 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8i32 = sdiv <8 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16i32 = sdiv <16 x i32> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 ; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = sdiv <16 x i16> undef, @@ -532,9 +494,9 @@ define i32 @sdiv_uniformconst() { ; AVX2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = sdiv <8 x i32> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16i32 = sdiv <16 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = sdiv <4 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = sdiv <8 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i32 = sdiv <16 x i32> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for 
instruction: %V8i16 = sdiv <8 x i16> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, @@ -551,9 +513,9 @@ define i32 @sdiv_uniformconst() { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = sdiv <8 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = sdiv <16 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = sdiv <4 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = sdiv <8 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = sdiv <16 x i32> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, @@ -570,9 +532,9 @@ define i32 @sdiv_uniformconst() { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = sdiv <8 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = sdiv <16 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = sdiv <4 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = sdiv <8 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = sdiv <16 x i32> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, @@ -589,9 +551,9 @@ define i32 @sdiv_uniformconst() { ; SLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 -; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8i32 = sdiv <8 x i32> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16i32 = sdiv <16 x i32> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = sdiv <4 x 
i32> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8i32 = sdiv <8 x i32> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16i32 = sdiv <16 x i32> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 ; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, @@ -608,9 +570,9 @@ define i32 @sdiv_uniformconst() { ; BTVER2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = sdiv <8 x i32> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i32 = sdiv <16 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = sdiv <4 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8i32 = sdiv <8 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16i32 = sdiv <16 x i32> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = sdiv <16 x i16> undef, @@ -651,9 +613,9 @@ define i32 @udiv_uniformconst() { ; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7 -; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, -; SSE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8i32 = udiv <8 x i32> undef, -; SSE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16i32 = udiv <16 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = udiv <4 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8i32 = udiv <8 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16i32 = udiv <16 x i32> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7 ; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = udiv <16 x i16> undef, @@ -670,9 +632,9 @@ define i32 @udiv_uniformconst() { ; AVX1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x 
i32> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = udiv <8 x i32> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i32 = udiv <16 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = udiv <4 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8i32 = udiv <8 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16i32 = udiv <16 x i32> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7 ; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = udiv <16 x i16> undef, @@ -689,9 +651,9 @@ define i32 @udiv_uniformconst() { ; AVX2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = udiv <8 x i32> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16i32 = udiv <16 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = udiv <4 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = udiv <8 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16i32 = udiv <16 x i32> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, @@ -708,9 +670,9 @@ define i32 @udiv_uniformconst() { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = udiv <8 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = udiv <16 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = udiv <4 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = udiv <8 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16i32 = udiv <16 x i32> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, @@ -727,9 +689,9 @@ define i32 @udiv_uniformconst() { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 
80 for instruction: %V4i64 = udiv <4 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = udiv <8 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = udiv <16 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = udiv <4 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = udiv <8 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16i32 = udiv <16 x i32> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, @@ -746,9 +708,9 @@ define i32 @udiv_uniformconst() { ; SLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7 -; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8i32 = udiv <8 x i32> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16i32 = udiv <16 x i32> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = udiv <4 x i32> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8i32 = udiv <8 x i32> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16i32 = udiv <16 x i32> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7 ; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = udiv <16 x i16> undef, @@ -765,9 +727,9 @@ define i32 @udiv_uniformconst() { ; BTVER2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = udiv <8 x i32> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i32 = udiv <16 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = udiv <4 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8i32 = udiv <8 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16i32 = udiv <16 x i32> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %I16 = udiv i16 undef, 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = udiv <16 x i16> undef, diff --git a/llvm/test/Analysis/CostModel/X86/rem.ll b/llvm/test/Analysis/CostModel/X86/rem.ll index 7942cda3725f3..30dd9a7a4f13f 100644 --- a/llvm/test/Analysis/CostModel/X86/rem.ll +++ b/llvm/test/Analysis/CostModel/X86/rem.ll @@ -450,62 +450,24 @@ define i32 @urem_const() { } define i32 @srem_uniformconst() { -; SSE2-LABEL: 'srem_uniformconst' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4i32 = srem <4 x i32> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8i32 = srem <8 x i32> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16i32 = srem <16 x i32> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSSE3-LABEL: 'srem_uniformconst' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4i32 = srem <4 x i32> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8i32 = srem <8 x i32> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16i32 = srem <16 x i32> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, -; SSSE3-NEXT: Cost Model: Found an estimated 
cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'srem_uniformconst' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = srem <8 x i32> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = srem <16 x i32> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'srem_uniformconst' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = srem <4 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i32 = srem <8 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i32 = srem <16 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 32 for 
instruction: %V32i16 = srem <32 x i16> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'srem_uniformconst' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7 @@ -513,9 +475,9 @@ define i32 @srem_uniformconst() { ; AVX1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8i32 = srem <8 x i32> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16i32 = srem <16 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = srem <4 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8i32 = srem <8 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16i32 = srem <16 x i32> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 ; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = srem <16 x i16> undef, @@ -532,9 +494,9 @@ define i32 @srem_uniformconst() { ; AVX2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = srem <8 x i32> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V16i32 = srem <16 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = srem <4 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i32 = srem <8 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i32 = srem <16 x i32> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, @@ -551,9 +513,9 @@ define i32 @srem_uniformconst() { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %I32 = srem i32 undef, 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = srem <8 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16i32 = srem <16 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = srem <4 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i32 = srem <8 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i32 = srem <16 x i32> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, @@ -570,9 +532,9 @@ define i32 @srem_uniformconst() { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = srem <8 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16i32 = srem <16 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = srem <4 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i32 = srem <8 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i32 = srem <16 x i32> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, @@ -583,53 +545,15 @@ define i32 @srem_uniformconst() { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64i8 = srem <64 x i8> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; SLM-LABEL: 'srem_uniformconst' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7 -; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 -; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = srem <8 x i32> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = srem <16 x i32> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 -; SLM-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7 -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; GLM-LABEL: 'srem_uniformconst' -; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7 -; GLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 -; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = srem <8 x i32> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = srem <16 x i32> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 -; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7 -; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; ; BTVER2-LABEL: 'srem_uniformconst' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8i32 = srem <8 x i32> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16i32 = srem <16 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = srem <4 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated 
cost of 18 for instruction: %V8i32 = srem <8 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16i32 = srem <16 x i32> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = srem <16 x i16> undef, @@ -670,9 +594,9 @@ define i32 @urem_uniformconst() { ; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7 -; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, -; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = urem <8 x i32> undef, -; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = urem <16 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = urem <4 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8i32 = urem <8 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16i32 = urem <16 x i32> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7 ; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = urem <16 x i16> undef, @@ -689,9 +613,9 @@ define i32 @urem_uniformconst() { ; AVX1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8i32 = urem <8 x i32> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V16i32 = urem <16 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = urem <4 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i32 = urem <8 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i32 = urem <16 x i32> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7 ; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = urem <16 x i16> undef, @@ -708,9 +632,9 @@ define i32 @urem_uniformconst() { ; AVX2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 19 
for instruction: %V8i32 = urem <8 x i32> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V16i32 = urem <16 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = urem <4 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = urem <8 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i32 = urem <16 x i32> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, @@ -727,9 +651,9 @@ define i32 @urem_uniformconst() { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = urem <8 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16i32 = urem <16 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = urem <4 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = urem <8 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16i32 = urem <16 x i32> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, @@ -746,9 +670,9 @@ define i32 @urem_uniformconst() { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = urem <8 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16i32 = urem <16 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = urem <4 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = urem <8 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16i32 = urem <16 x i32> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, @@ -765,9 +689,9 @@ define i32 @urem_uniformconst() { ; BTVER2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 
= urem <4 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8i32 = urem <8 x i32> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V16i32 = urem <16 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = urem <4 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i32 = urem <8 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i32 = urem <16 x i32> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = urem <16 x i16> undef, diff --git a/llvm/test/Analysis/CostModel/X86/vdiv-cost.ll b/llvm/test/Analysis/CostModel/X86/vdiv-cost.ll index d87d21c487d84..8552509daeced 100644 --- a/llvm/test/Analysis/CostModel/X86/vdiv-cost.ll +++ b/llvm/test/Analysis/CostModel/X86/vdiv-cost.ll @@ -10,7 +10,7 @@ define <4 x i32> @test1(<4 x i32> %a) { ; CHECK-LABEL: 'test1' -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %div = udiv <4 x i32> %a, +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %div = udiv <4 x i32> %a, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %div ; %div = udiv <4 x i32> %a, @@ -19,19 +19,19 @@ define <4 x i32> @test1(<4 x i32> %a) { define <8 x i32> @test2(<8 x i32> %a) { ; SSE-LABEL: 'test2' -; SSE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %div = udiv <8 x i32> %a, +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %div = udiv <8 x i32> %a, ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div ; ; AVX1-LABEL: 'test2' -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %div = udiv <8 x i32> %a, +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %div = udiv <8 x i32> %a, ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div ; ; AVX2-LABEL: 'test2' -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %div = udiv <8 x i32> %a, +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %div = udiv <8 x i32> %a, ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div ; ; AVX512-LABEL: 'test2' -; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %div = udiv <8 x i32> %a, +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %div = udiv <8 x i32> %a, ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div ; %div = udiv <8 x i32> %a, @@ -108,53 +108,29 @@ define <16 x i8> @test7(<16 x i8> %a) { } define <4 x i32> @test8(<4 x i32> %a) { -; SSE2-LABEL: 'test8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %div = sdiv <4 x i32> %a, -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %div -; -; SSSE3-LABEL: 'test8' -; 
SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %div = sdiv <4 x i32> %a, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %div -; -; SSE42-LABEL: 'test8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %div = sdiv <4 x i32> %a, -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %div -; -; AVX-LABEL: 'test8' -; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %div = sdiv <4 x i32> %a, -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %div -; -; AVX512-LABEL: 'test8' -; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %div = sdiv <4 x i32> %a, -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %div +; CHECK-LABEL: 'test8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %div = sdiv <4 x i32> %a, +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %div ; %div = sdiv <4 x i32> %a, ret <4 x i32> %div } define <8 x i32> @test9(<8 x i32> %a) { -; SSE2-LABEL: 'test9' -; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %div = sdiv <8 x i32> %a, -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div -; -; SSSE3-LABEL: 'test9' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %div = sdiv <8 x i32> %a, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div -; -; SSE42-LABEL: 'test9' -; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %div = sdiv <8 x i32> %a, -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div +; SSE-LABEL: 'test9' +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %div = sdiv <8 x i32> %a, +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div ; ; AVX1-LABEL: 'test9' -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %div = sdiv <8 x i32> %a, +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %div = sdiv <8 x i32> %a, ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div ; ; AVX2-LABEL: 'test9' -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %div = sdiv <8 x i32> %a, +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %div = sdiv <8 x i32> %a, ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div ; ; AVX512-LABEL: 'test9' -; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %div = sdiv <8 x i32> %a, +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %div = sdiv <8 x i32> %a, ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div ; %div = sdiv <8 x i32> %a, diff --git a/llvm/test/Analysis/DemandedBits/add.ll b/llvm/test/Analysis/DemandedBits/add.ll index 01673f82c2b36..dfd54525d0740 100644 --- a/llvm/test/Analysis/DemandedBits/add.ll +++ b/llvm/test/Analysis/DemandedBits/add.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -demanded-bits -analyze < %s | FileCheck %s +; RUN: opt -S -demanded-bits -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -S -disable-output -passes="print" < %s 2>&1 | FileCheck %s ; CHECK-DAG: DemandedBits: 0x1e for %1 = and i32 %a, 9 diff --git a/llvm/test/Analysis/DemandedBits/basic.ll 
b/llvm/test/Analysis/DemandedBits/basic.ll index 6f44465315e63..a05d3804156a3 100644 --- a/llvm/test/Analysis/DemandedBits/basic.ll +++ b/llvm/test/Analysis/DemandedBits/basic.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -demanded-bits -analyze < %s | FileCheck %s +; RUN: opt -S -demanded-bits -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -S -disable-output -passes="print" < %s 2>&1 | FileCheck %s ; CHECK-DAG: DemandedBits: 0xff for %1 = add nsw i32 %a, 5 diff --git a/llvm/test/Analysis/DemandedBits/intrinsics.ll b/llvm/test/Analysis/DemandedBits/intrinsics.ll index 6987f14f8b1ba..ec78178ea22dc 100644 --- a/llvm/test/Analysis/DemandedBits/intrinsics.ll +++ b/llvm/test/Analysis/DemandedBits/intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -demanded-bits -analyze < %s | FileCheck %s +; RUN: opt -S -demanded-bits -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -S -disable-output -passes="print" < %s 2>&1 | FileCheck %s ; CHECK-DAG: DemandedBits: 0xff000000 for %1 = or i32 %x, 1 diff --git a/llvm/test/Analysis/DemandedBits/vectors.ll b/llvm/test/Analysis/DemandedBits/vectors.ll index 36cde05fb7c62..a7835ca799bca 100644 --- a/llvm/test/Analysis/DemandedBits/vectors.ll +++ b/llvm/test/Analysis/DemandedBits/vectors.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -demanded-bits -analyze < %s | FileCheck %s +; RUN: opt -S -demanded-bits -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -S -disable-output -passes="print" < %s 2>&1 | FileCheck %s ; CHECK-DAG: DemandedBits: 0xff00 for %x = or <2 x i32> %a, zeroinitializer diff --git a/llvm/test/Analysis/DependenceAnalysis/AA.ll b/llvm/test/Analysis/DependenceAnalysis/AA.ll index efb5c8d1ef031..f74c331668453 100644 --- a/llvm/test/Analysis/DependenceAnalysis/AA.ll +++ b/llvm/test/Analysis/DependenceAnalysis/AA.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" \ ; RUN: "-aa-pipeline=basic-aa,tbaa" 2>&1 | FileCheck %s -; RUN: opt < %s -analyze -basic-aa -tbaa -da | FileCheck %s +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -tbaa -da | FileCheck %s ; CHECK-LABEL: 'Dependence Analysis' for function 'test_no_noalias' ; CHECK: da analyze - none! 
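A note on the cost-model hunks above (the srem/urem 'uniformconst' functions and vdiv-cost.ll): the lower numbers reflect the X86 TTI now pricing vector division and remainder by a uniform constant via the multiply-by-magic-constant expansion rather than as scalarized divisions. A minimal standalone sketch of the new expectation follows; the RUN line, triple, and function name are illustrative and not part of this patch, while the cost of 5 matches the updated vdiv-cost.ll check.

; Sketch (not from the patch): reproduces the updated <4 x i32> udiv-by-splat
; expectation from vdiv-cost.ll under SSE2.
; RUN: opt < %s -cost-model -analyze -enable-new-pm=0 -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s
define <4 x i32> @udiv_by_7(<4 x i32> %a) {
; CHECK: Found an estimated cost of 5 for instruction: %div
; A splat divisor is lowered with pmuludq/shift "magic number" arithmetic,
; roughly five vector ops, instead of four extracted scalar divisions.
  %div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %div
}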
diff --git a/llvm/test/Analysis/DependenceAnalysis/Banerjee.ll b/llvm/test/Analysis/DependenceAnalysis/Banerjee.ll index 06fa7ad06983f..9f1a2de727e2a 100644 --- a/llvm/test/Analysis/DependenceAnalysis/Banerjee.ll +++ b/llvm/test/Analysis/DependenceAnalysis/Banerjee.ll @@ -1,9 +1,9 @@ ; RUN: opt < %s -disable-output -da-delinearize=false "-passes=print" \ ; RUN: -aa-pipeline=basic-aa 2>&1 | FileCheck %s -; RUN: opt < %s -analyze -basic-aa -da -da-delinearize=false | FileCheck %s +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da -da-delinearize=false | FileCheck %s ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s -check-prefix=DELIN -; RUN: opt < %s -analyze -basic-aa -da | FileCheck %s -check-prefix=DELIN +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s -check-prefix=DELIN target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.6.0" diff --git a/llvm/test/Analysis/DependenceAnalysis/BasePtrBug.ll b/llvm/test/Analysis/DependenceAnalysis/BasePtrBug.ll index 7d1e8e22b956c..08a497c87a4ad 100644 --- a/llvm/test/Analysis/DependenceAnalysis/BasePtrBug.ll +++ b/llvm/test/Analysis/DependenceAnalysis/BasePtrBug.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s -; RUN: opt < %s -analyze -basic-aa -da | FileCheck %s +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s ; Test that the dependence analysis generates the correct results when using ; an aliased object that points to a different element in the same array. diff --git a/llvm/test/Analysis/DependenceAnalysis/Constraints.ll b/llvm/test/Analysis/DependenceAnalysis/Constraints.ll index d086bf37bb894..130e248ba7f83 100644 --- a/llvm/test/Analysis/DependenceAnalysis/Constraints.ll +++ b/llvm/test/Analysis/DependenceAnalysis/Constraints.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 -; RUN: opt < %s -analyze -basic-aa -da +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da ;; Check that this code doesn't abort. Test case is reduced version of lnt Polybench benchmark test case dynprog. 
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Analysis/DependenceAnalysis/Coupled.ll b/llvm/test/Analysis/DependenceAnalysis/Coupled.ll index 4e81589d3bd9c..3a24813e98def 100644 --- a/llvm/test/Analysis/DependenceAnalysis/Coupled.ll +++ b/llvm/test/Analysis/DependenceAnalysis/Coupled.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s -; RUN: opt < %s -analyze -basic-aa -da | FileCheck %s +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.6.0" diff --git a/llvm/test/Analysis/DependenceAnalysis/DADelin.ll b/llvm/test/Analysis/DependenceAnalysis/DADelin.ll index 40054aa2187ea..6faa1bccc9008 100644 --- a/llvm/test/Analysis/DependenceAnalysis/DADelin.ll +++ b/llvm/test/Analysis/DependenceAnalysis/DADelin.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s -; RUN: opt < %s -analyze -basic-aa -da | FileCheck %s +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8m.main-arm-none-eabi" diff --git a/llvm/test/Analysis/DependenceAnalysis/ExactRDIV.ll b/llvm/test/Analysis/DependenceAnalysis/ExactRDIV.ll index 40e12a784b18a..4c22e86ac8c80 100644 --- a/llvm/test/Analysis/DependenceAnalysis/ExactRDIV.ll +++ b/llvm/test/Analysis/DependenceAnalysis/ExactRDIV.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s -; RUN: opt < %s -analyze -basic-aa -da | FileCheck %s +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s ; ModuleID = 'ExactRDIV.bc' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Analysis/DependenceAnalysis/ExactSIV.ll b/llvm/test/Analysis/DependenceAnalysis/ExactSIV.ll index 720d4166ed1a5..b5f13ebe99161 100644 --- a/llvm/test/Analysis/DependenceAnalysis/ExactSIV.ll +++ b/llvm/test/Analysis/DependenceAnalysis/ExactSIV.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s -; RUN: opt < %s -analyze -basic-aa -da | FileCheck %s +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.6.0" diff --git a/llvm/test/Analysis/DependenceAnalysis/GCD.ll b/llvm/test/Analysis/DependenceAnalysis/GCD.ll index a3564b7f89553..99c5cef969785 100644 --- a/llvm/test/Analysis/DependenceAnalysis/GCD.ll +++ b/llvm/test/Analysis/DependenceAnalysis/GCD.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s -check-prefix=DELIN -; RUN: opt < %s -analyze -basic-aa -da | FileCheck %s -check-prefix=DELIN +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s -check-prefix=DELIN target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" 
target triple = "x86_64-apple-macosx10.6.0" diff --git a/llvm/test/Analysis/DependenceAnalysis/Invariant.ll b/llvm/test/Analysis/DependenceAnalysis/Invariant.ll index 5aaa3868cf9af..20358768bc827 100644 --- a/llvm/test/Analysis/DependenceAnalysis/Invariant.ll +++ b/llvm/test/Analysis/DependenceAnalysis/Invariant.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s -; RUN: opt < %s -analyze -basic-aa -da | FileCheck %s +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s ; Test for a bug, which caused an assert when an invalid ; SCEVAddRecExpr is created in addToCoefficient. diff --git a/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll b/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll index e222755dd8e45..5642c845a2902 100644 --- a/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll +++ b/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -analyze -basic-aa -da +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da ; RUN: opt < %s -passes="print" ; Test that the dependence analysis pass does seg-fault due to a null pointer diff --git a/llvm/test/Analysis/DependenceAnalysis/NonAffineExpr.ll b/llvm/test/Analysis/DependenceAnalysis/NonAffineExpr.ll index 2561df503913e..642cf67f394d4 100644 --- a/llvm/test/Analysis/DependenceAnalysis/NonAffineExpr.ll +++ b/llvm/test/Analysis/DependenceAnalysis/NonAffineExpr.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 -; RUN: opt < %s -analyze -basic-aa -da +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da ; ; CHECK: da analyze - consistent input [S S]! ; CHECK: da analyze - confused! diff --git a/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll b/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll index d1df4ef63b542..10f57d0fd0fa9 100644 --- a/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll +++ b/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s -check-prefix=DELIN -; RUN: opt < %s -analyze -basic-aa -da | FileCheck %s -check-prefix=DELIN +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s -check-prefix=DELIN target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.6.0" diff --git a/llvm/test/Analysis/DependenceAnalysis/PR21585.ll b/llvm/test/Analysis/DependenceAnalysis/PR21585.ll index 6dd1403cd1354..d76e37a70dfea 100644 --- a/llvm/test/Analysis/DependenceAnalysis/PR21585.ll +++ b/llvm/test/Analysis/DependenceAnalysis/PR21585.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" \ ; RUN: "-aa-pipeline=basic-aa,globals-aa" 2>&1 | FileCheck %s -; RUN: opt < %s -analyze -basic-aa -globals-aa -da | FileCheck %s +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -globals-aa -da | FileCheck %s define void @i32_subscript(i32* %a) { entry: br label %for.body diff --git a/llvm/test/Analysis/DependenceAnalysis/Preliminary.ll b/llvm/test/Analysis/DependenceAnalysis/Preliminary.ll index 05848a61a7378..ef2757fbc0662 100644 --- a/llvm/test/Analysis/DependenceAnalysis/Preliminary.ll +++ b/llvm/test/Analysis/DependenceAnalysis/Preliminary.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" 
-aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s -; RUN: opt < %s -analyze -basic-aa -da | FileCheck %s +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.6.0" diff --git a/llvm/test/Analysis/DependenceAnalysis/Propagating.ll b/llvm/test/Analysis/DependenceAnalysis/Propagating.ll index 41640a0b4b657..fe8f40a4fc428 100644 --- a/llvm/test/Analysis/DependenceAnalysis/Propagating.ll +++ b/llvm/test/Analysis/DependenceAnalysis/Propagating.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s -; RUN: opt < %s -analyze -basic-aa -da | FileCheck %s +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.6.0" diff --git a/llvm/test/Analysis/DependenceAnalysis/Separability.ll b/llvm/test/Analysis/DependenceAnalysis/Separability.ll index bbbc0db4a609f..93803cf5c0694 100644 --- a/llvm/test/Analysis/DependenceAnalysis/Separability.ll +++ b/llvm/test/Analysis/DependenceAnalysis/Separability.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s -; RUN: opt < %s -analyze -basic-aa -da | FileCheck %s +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.6.0" diff --git a/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll b/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll index 7063f20cd0c30..e6ddafdad96dd 100644 --- a/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll +++ b/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output -passes="print" \ ; RUN: -da-disable-delinearization-checks 2>&1 | FileCheck %s -; RUN: opt < %s -da -analyze -da-disable-delinearization-checks | FileCheck %s +; RUN: opt < %s -da -analyze -enable-new-pm=0 -da-disable-delinearization-checks | FileCheck %s ; CHECK-LABEL: t1 ; CHECK: da analyze - none! diff --git a/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheckFixedSize.ll b/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheckFixedSize.ll index d783d2ec163fc..5dcba2252e303 100644 --- a/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheckFixedSize.ll +++ b/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheckFixedSize.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output -passes="print" \ ; RUN: -da-disable-delinearization-checks 2>&1 | FileCheck %s -; RUN: opt < %s -da -analyze -da-disable-delinearization-checks | FileCheck %s +; RUN: opt < %s -da -analyze -enable-new-pm=0 -da-disable-delinearization-checks | FileCheck %s ; CHECK-LABEL: t1 ; CHECK: da analyze - none! 
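Every DependenceAnalysis RUN-line edit in this stretch is the same mechanical migration: opt's -analyze mode exists only in the legacy pass manager, so those invocations are pinned with -enable-new-pm=0, and new-pass-manager coverage comes from the printer pass requested through -passes (print<da> for this analysis). A self-contained sketch of the resulting dual-RUN pattern, with an illustrative loop body that is not taken from any of these tests:

; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s
; RUN: opt < %s -disable-output "-passes=print<da>" -aa-pipeline=basic-aa 2>&1 | FileCheck %s
; CHECK: 'Dependence Analysis' for function 'f'
define void @f(i32* %a) {
entry:
  br label %loop
loop:
  ; Read-modify-write of a[i]: a loop-independent flow dependence.
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %gep = getelementptr inbounds i32, i32* %a, i64 %i
  %v = load i32, i32* %gep
  %v1 = add i32 %v, 1
  store i32 %v1, i32* %gep
  %i.next = add nuw nsw i64 %i, 1
  %cmp = icmp ult i64 %i.next, 64
  br i1 %cmp, label %loop, label %exit
exit:
  ret void
}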
diff --git a/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll b/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll index 397ef8a2d3a03..be6b19ead51f7 100644 --- a/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll +++ b/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s -; RUN: opt < %s -analyze -basic-aa -da | FileCheck %s +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.6.0" diff --git a/llvm/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll b/llvm/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll index 0151c7c78404e..6cdb0cacb4913 100644 --- a/llvm/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll +++ b/llvm/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s -; RUN: opt < %s -analyze -basic-aa -da | FileCheck %s +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s ; ModuleID = 'SymbolicRDIV.bc' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Analysis/DependenceAnalysis/SymbolicSIV.ll b/llvm/test/Analysis/DependenceAnalysis/SymbolicSIV.ll index 7a37107baf913..46a0c27b5c5f1 100644 --- a/llvm/test/Analysis/DependenceAnalysis/SymbolicSIV.ll +++ b/llvm/test/Analysis/DependenceAnalysis/SymbolicSIV.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s -; RUN: opt < %s -analyze -basic-aa -da | FileCheck %s +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.6.0" diff --git a/llvm/test/Analysis/DependenceAnalysis/UsefulGEP.ll b/llvm/test/Analysis/DependenceAnalysis/UsefulGEP.ll index c2d7765b03230..9b3896fa395d7 100644 --- a/llvm/test/Analysis/DependenceAnalysis/UsefulGEP.ll +++ b/llvm/test/Analysis/DependenceAnalysis/UsefulGEP.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 -; RUN: opt < %s -analyze -basic-aa -da +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da ;; Check this doesn't crash. 
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Analysis/DependenceAnalysis/WeakCrossingSIV.ll b/llvm/test/Analysis/DependenceAnalysis/WeakCrossingSIV.ll index 449cffc7cd036..8e0f516a6d5cd 100644 --- a/llvm/test/Analysis/DependenceAnalysis/WeakCrossingSIV.ll +++ b/llvm/test/Analysis/DependenceAnalysis/WeakCrossingSIV.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s -; RUN: opt < %s -analyze -basic-aa -da | FileCheck %s +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s ; ModuleID = 'WeakCrossingSIV.bc' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Analysis/DependenceAnalysis/WeakZeroDstSIV.ll b/llvm/test/Analysis/DependenceAnalysis/WeakZeroDstSIV.ll index af9c0bd8f2bb1..9007910b2e36a 100644 --- a/llvm/test/Analysis/DependenceAnalysis/WeakZeroDstSIV.ll +++ b/llvm/test/Analysis/DependenceAnalysis/WeakZeroDstSIV.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s -; RUN: opt < %s -analyze -basic-aa -da | FileCheck %s +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s ; ModuleID = 'WeakZeroDstSIV.bc' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Analysis/DependenceAnalysis/WeakZeroSrcSIV.ll b/llvm/test/Analysis/DependenceAnalysis/WeakZeroSrcSIV.ll index 70612a4b5c1c2..8b87c068edb3c 100644 --- a/llvm/test/Analysis/DependenceAnalysis/WeakZeroSrcSIV.ll +++ b/llvm/test/Analysis/DependenceAnalysis/WeakZeroSrcSIV.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s -; RUN: opt < %s -analyze -basic-aa -da | FileCheck %s +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s ; ModuleID = 'WeakZeroSrcSIV.bc' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Analysis/DependenceAnalysis/ZIV.ll b/llvm/test/Analysis/DependenceAnalysis/ZIV.ll index 4e1ea0834e9b5..fe7d9c433f5d9 100644 --- a/llvm/test/Analysis/DependenceAnalysis/ZIV.ll +++ b/llvm/test/Analysis/DependenceAnalysis/ZIV.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ ; RUN: | FileCheck %s -; RUN: opt < %s -analyze -basic-aa -da | FileCheck %s +; RUN: opt < %s -analyze -enable-new-pm=0 -basic-aa -da | FileCheck %s ; ModuleID = 'ZIV.bc' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Analysis/Dominators/2006-10-02-BreakCritEdges.ll b/llvm/test/Analysis/Dominators/2006-10-02-BreakCritEdges.ll index c036fe22ab87e..6fa3fec0359e5 100644 --- a/llvm/test/Analysis/Dominators/2006-10-02-BreakCritEdges.ll +++ b/llvm/test/Analysis/Dominators/2006-10-02-BreakCritEdges.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -domtree -break-crit-edges -analyze -domtree | FileCheck %s +; RUN: opt < %s -domtree -break-crit-edges -analyze -domtree -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='require,break-crit-edges,print' 
-disable-output 2>&1| FileCheck %s ; PR932 diff --git a/llvm/test/Analysis/Dominators/basic.ll b/llvm/test/Analysis/Dominators/basic.ll index 353c3397b5da7..afa6f1e9a9b6b 100644 --- a/llvm/test/Analysis/Dominators/basic.ll +++ b/llvm/test/Analysis/Dominators/basic.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -domtree -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-OLDPM +; RUN: opt < %s -domtree -analyze -enable-new-pm=0 | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-OLDPM ; RUN: opt < %s -disable-output -passes='print' 2>&1 | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-NEWPM define void @test1() { diff --git a/llvm/test/Analysis/GlobalsModRef/comdat-ipo.ll b/llvm/test/Analysis/GlobalsModRef/comdat-ipo.ll index f251e01ca69ca..aeeebfd3aede3 100644 --- a/llvm/test/Analysis/GlobalsModRef/comdat-ipo.ll +++ b/llvm/test/Analysis/GlobalsModRef/comdat-ipo.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -basic-aa -globals-aa -gvn -S | FileCheck %s +; RUN: opt < %s -basic-aa -globals-aa -gvn -enable-new-pm=0 -S | FileCheck %s +; RUN: opt < %s -basic-aa -globals-aa -gvn -enable-new-pm=1 -S | FileCheck %s ; See PR26774 diff --git a/llvm/test/Analysis/GlobalsModRef/no-escape.ll b/llvm/test/Analysis/GlobalsModRef/no-escape.ll index 9d0f1053902f0..fc95b6ad63147 100644 --- a/llvm/test/Analysis/GlobalsModRef/no-escape.ll +++ b/llvm/test/Analysis/GlobalsModRef/no-escape.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -basic-aa -globals-aa -S -licm | FileCheck %s +; RUN: opt < %s -basic-aa -globals-aa -S -licm -enable-new-pm=0 | FileCheck %s +; RUN: opt < %s -basic-aa -globals-aa -S -licm -enable-new-pm=1 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.10.0" diff --git a/llvm/test/Analysis/Lint/get-active-lane-mask.ll b/llvm/test/Analysis/Lint/get-active-lane-mask.ll new file mode 100644 index 0000000000000..4ee344afe6665 --- /dev/null +++ b/llvm/test/Analysis/Lint/get-active-lane-mask.ll @@ -0,0 +1,39 @@ +; RUN: opt -lint -disable-output < %s 2>&1 | FileCheck %s + +define <4 x i1> @t1(i32 %IV) { +; +; CHECK: get_active_lane_mask: operand #2 must be greater than 0 +; CHECK-NEXT: %res = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %IV, i32 0) +; + %res = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %IV, i32 0) + ret <4 x i1> %res +} + +define <4 x i1> @t2(i32 %IV) { +; +; CHECK-NOT: get_active_lane_mask +; CHECK-NOT: call <4 x i1> @llvm.get.active.lane.mask +; + %res = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %IV, i32 1) + ret <4 x i1> %res +} + +define <4 x i1> @t3(i32 %IV) { +; +; CHECK-NOT: get_active_lane_mask +; CHECK-NOT: call <4 x i1> @llvm.get.active.lane.mask +; + %res = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %IV, i32 -1) + ret <4 x i1> %res +} + +define <4 x i1> @t4(i32 %IV, i32 %TC) { +; +; CHECK-NOT: get_active_lane_mask +; CHECK-NOT: call <4 x i1> @llvm.get.active.lane.mask +; + %res = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %IV, i32 %TC) + ret <4 x i1> %res +} + +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) diff --git a/llvm/test/Analysis/LoopAccessAnalysis/backward-dep-different-types.ll b/llvm/test/Analysis/LoopAccessAnalysis/backward-dep-different-types.ll index d8040a31a8dc3..7471adfb62399 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/backward-dep-different-types.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/backward-dep-different-types.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-accesses -analyze < %s | FileCheck %s +; RUN: opt -loop-accesses 
-analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s ; In this loop just because we access A through different types (int, float) diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll index 7d3ac09dbb9c4..8d3bfca58eb33 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-accesses -analyze < %s | FileCheck %s +; RUN: opt -loop-accesses -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s ; for (unsigned i = 0; i < 100; i++) { diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll index 41e2a2904fb2f..8ad02e15ed73e 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-accesses -analyze < %s | FileCheck %s +; RUN: opt -loop-accesses -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s ; Check that loop-indepedent forward dependences are discovered properly. diff --git a/llvm/test/Analysis/LoopAccessAnalysis/independent-interleaved.ll b/llvm/test/Analysis/LoopAccessAnalysis/independent-interleaved.ll index fe56ea9ab5939..c4acdf248f93c 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/independent-interleaved.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/independent-interleaved.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -store-to-load-forwarding-conflict-detection=false -loop-accesses -analyze | FileCheck %s +; RUN: opt < %s -store-to-load-forwarding-conflict-detection=false -loop-accesses -analyze -enable-new-pm=0 | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -store-to-load-forwarding-conflict-detection=false -disable-output < %s 2>&1 | FileCheck %s ; This test checks that we prove the strided accesses to be independent before diff --git a/llvm/test/Analysis/LoopAccessAnalysis/memcheck-for-loop-invariant.ll b/llvm/test/Analysis/LoopAccessAnalysis/memcheck-for-loop-invariant.ll index f06bb00ec64aa..0a592488f1534 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/memcheck-for-loop-invariant.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/memcheck-for-loop-invariant.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-accesses -analyze < %s | FileCheck %s +; RUN: opt -loop-accesses -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s ; Handle memchecks involving loop-invariant addresses: diff --git a/llvm/test/Analysis/LoopAccessAnalysis/memcheck-off-by-one-error.ll b/llvm/test/Analysis/LoopAccessAnalysis/memcheck-off-by-one-error.ll index 01813c8a81041..6114b453fa911 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/memcheck-off-by-one-error.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/memcheck-off-by-one-error.ll @@ -1,4 +1,5 @@ -; RUN: opt -analyze --loop-accesses %s | FileCheck %s +; RUN: opt -analyze --loop-accesses %s -enable-new-pm=0 | FileCheck %s +; RUN: opt -passes=print-access-info %s -disable-output 2>&1 | FileCheck %s ; This test verifies run-time boundary check of memory 
accesses. ; The original loop: @@ -18,7 +19,7 @@ ; The loop was vectorized to 4, 32 byte memory access ( <4 x i64> ), ; store a value at *%op touched memory under *%src. -;CHECK: Printing analysis 'Loop Access Analysis' for function 'fastCopy' +;CHECK: function 'fastCopy': ;CHECK: (Low: %op High: (32 + %op)) ;CHECK: (Low: %src High: (32 + %src)) diff --git a/llvm/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll b/llvm/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll index 484f2b47b22a1..94034bfd6fbc0 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll @@ -1,4 +1,5 @@ -; RUN: opt -basic-aa -loop-accesses -analyze < %s | FileCheck %s +; RUN: opt -basic-aa -loop-accesses -analyze -enable-new-pm=0 < %s | FileCheck %s +; RUN: opt -passes=print-access-info %s -disable-output 2>&1 | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll b/llvm/test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll index 60c2a3930b5c0..362a1f48be1e8 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-accesses -analyze -S < %s | FileCheck %s +; RUN: opt -loop-accesses -analyze -enable-new-pm=0 -S < %s | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s ; This is the test case from PR26314. diff --git a/llvm/test/Analysis/LoopAccessAnalysis/non-wrapping-pointer.ll b/llvm/test/Analysis/LoopAccessAnalysis/non-wrapping-pointer.ll index 99ba107ed09ea..73a981705c0d1 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/non-wrapping-pointer.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/non-wrapping-pointer.ll @@ -1,4 +1,4 @@ -; RUN: opt -basic-aa -loop-accesses -analyze < %s | FileCheck %s +; RUN: opt -basic-aa -loop-accesses -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -passes='require,require,require,loop(print-access-info)' -aa-pipeline='basic-aa' -disable-output < %s 2>&1 | FileCheck %s ; For this loop: diff --git a/llvm/test/Analysis/LoopAccessAnalysis/nullptr.ll b/llvm/test/Analysis/LoopAccessAnalysis/nullptr.ll index 8fbf47304e800..1c2ac0c9b3b38 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/nullptr.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/nullptr.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-accesses -analyze %s | FileCheck %s +; RUN: opt -loop-accesses -analyze -enable-new-pm=0 %s | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s ; Test that the loop accesses are proven safe in this case. 
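The LoopAccessAnalysis updates follow the same recipe, with one twist visible in memcheck-off-by-one-error.ll above: the legacy -analyze printer emits a banner of the form "Printing analysis 'Loop Access Analysis' for function 'fastCopy':", which the new-PM print-access-info pass does not, so the check line is relaxed to the substring both printers produce. A sketch of that pattern, assuming an illustrative copy loop:

; RUN: opt -loop-accesses -analyze -enable-new-pm=0 < %s | FileCheck %s
; RUN: opt -passes=print-access-info -disable-output < %s 2>&1 | FileCheck %s
; CHECK: function 'copy':
define void @copy(i32* noalias %dst, i32* noalias %src) {
entry:
  br label %loop
loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %gep.s = getelementptr inbounds i32, i32* %src, i64 %i
  %v = load i32, i32* %gep.s
  %gep.d = getelementptr inbounds i32, i32* %dst, i64 %i
  store i32 %v, i32* %gep.d
  %i.next = add nuw nsw i64 %i, 1
  %cmp = icmp ult i64 %i.next, 32
  br i1 %cmp, label %loop, label %exit
exit:
  ret void
}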
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll b/llvm/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll index 4528976a09e65..34dddbe5cc1b3 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-accesses -analyze < %s | FileCheck %s +; RUN: opt -loop-accesses -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll b/llvm/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll index a10b851bcd1a2..2109a4d0ec4b1 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-accesses -analyze < %s | FileCheck %s +; RUN: opt -loop-accesses -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Analysis/LoopAccessAnalysis/pr31098.ll b/llvm/test/Analysis/LoopAccessAnalysis/pr31098.ll index 04b73828f5148..399a395e09315 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/pr31098.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/pr31098.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-accesses -analyze < %s | FileCheck %s +; RUN: opt -loop-accesses -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Analysis/LoopAccessAnalysis/resort-to-memchecks-only.ll b/llvm/test/Analysis/LoopAccessAnalysis/resort-to-memchecks-only.ll index 921fd4d06314d..8405b0399ffe3 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/resort-to-memchecks-only.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/resort-to-memchecks-only.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-accesses -analyze < %s | FileCheck %s +; RUN: opt -loop-accesses -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s ; We give up analyzing the dependences in this loop due to non-constant diff --git a/llvm/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll b/llvm/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll index 4285ef0f1170c..8113c8d7106b2 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-accesses -analyze < %s | FileCheck %s +; RUN: opt -loop-accesses -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s ; The runtime memory check code and the access grouping diff --git a/llvm/test/Analysis/LoopAccessAnalysis/safe-no-checks.ll b/llvm/test/Analysis/LoopAccessAnalysis/safe-no-checks.ll index 2a937cbe62f6e..647b509450b56 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/safe-no-checks.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/safe-no-checks.ll @@ -1,4 +1,4 @@ -; RUN: opt -basic-aa -loop-accesses -analyze < %s | FileCheck %s +; RUN: opt -basic-aa -loop-accesses -analyze 
-enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -passes='require,require,require,loop(print-access-info)' -aa-pipeline='basic-aa' -disable-output < %s 2>&1 | FileCheck %s ; If the arrays don't alias this loop is safe with no memchecks: diff --git a/llvm/test/Analysis/LoopAccessAnalysis/safe-with-dep-distance.ll b/llvm/test/Analysis/LoopAccessAnalysis/safe-with-dep-distance.ll index 910d49edbb181..9335a21c170e8 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/safe-with-dep-distance.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/safe-with-dep-distance.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-accesses -analyze < %s | FileCheck %s +; RUN: opt -loop-accesses -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s ; Analyze this loop: diff --git a/llvm/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll b/llvm/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll index 611e957168ffd..1b36ac156d22a 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-accesses -analyze | FileCheck -check-prefix=OLDPM %s +; RUN: opt < %s -loop-accesses -analyze -enable-new-pm=0 | FileCheck -check-prefix=OLDPM %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck -check-prefix=NEWPM %s ; Test to confirm LAA will find multiple stores to an invariant address in the diff --git a/llvm/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll b/llvm/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll index d21cc6926c3b1..123ccd62503b4 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-accesses -analyze | FileCheck %s +; RUN: opt < %s -loop-accesses -analyze -enable-new-pm=0 | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s ; Test to confirm LAA will not find store to invariant address. 
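The store-to-invariant-check{1,2,3}.ll tests probe a specific LAA property: whether a store whose address is loop-invariant is reported, which the loop vectorizer consults when deciding whether such loops are legal to vectorize. The IR shape being tested looks like the sketch below (illustrative, IR only; RUN/CHECK lines are omitted because, as the OLDPM/NEWPM prefixes in check1 show, the two pass managers' printers word the report differently):

define void @accum(i32* %a, i32* %sum) {
entry:
  br label %loop
loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %gep = getelementptr inbounds i32, i32* %a, i64 %i
  %v = load i32, i32* %gep
  %old = load i32, i32* %sum
  %new = add i32 %old, %v
  ; The store address %sum does not depend on %i: this is a store to an
  ; invariant address, which LAA flags for the vectorizer.
  store i32 %new, i32* %sum
  %i.next = add nuw nsw i64 %i, 1
  %cmp = icmp ult i64 %i.next, 64
  br i1 %cmp, label %loop, label %exit
exit:
  ret void
}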
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll b/llvm/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll index b25d79b3d0394..e877ce03d8419 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-accesses -analyze | FileCheck %s +; RUN: opt < %s -loop-accesses -analyze -enable-new-pm=0 | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s ; Inner loop has a store to invariant address, but LAA does not need to identify diff --git a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll index 4fe6f9f704f71..fc9fe3da8e604 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-accesses -analyze < %s | FileCheck %s +; RUN: opt -loop-accesses -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-1.ll b/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-1.ll index 1204e8359a13a..1ac52a7cf8909 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-1.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-1.ll @@ -1,4 +1,4 @@ -; RUN: opt -basic-aa -loop-accesses -analyze < %s | FileCheck %s +; RUN: opt -basic-aa -loop-accesses -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s ; In: diff --git a/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll b/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll index dc2232334a7b0..3fd1f72cdce3e 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll @@ -1,4 +1,4 @@ -; RUN: opt -basic-aa -loop-accesses -analyze < %s | FileCheck %s +; RUN: opt -basic-aa -loop-accesses -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s ; This loop: diff --git a/llvm/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll b/llvm/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll index 7f42e2730c0dc..c05f8a394e2a7 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-accesses -analyze < %s | FileCheck %s +; RUN: opt -loop-accesses -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s ; Analyze this loop: diff --git a/llvm/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks.ll b/llvm/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks.ll index 7fbed6fcc15cf..998e0005aa493 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-accesses -analyze < %s | FileCheck %s +; RUN: opt -loop-accesses -analyze 
-enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -passes='require,require,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s ; Analyze this loop: diff --git a/llvm/test/Analysis/LoopAccessAnalysis/wrapping-pointer-versioning.ll b/llvm/test/Analysis/LoopAccessAnalysis/wrapping-pointer-versioning.ll index 4c058b190d69f..5d26e834e309d 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/wrapping-pointer-versioning.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/wrapping-pointer-versioning.ll @@ -1,4 +1,4 @@ -; RUN: opt -basic-aa -loop-accesses -analyze < %s | FileCheck %s -check-prefix=LAA +; RUN: opt -basic-aa -loop-accesses -analyze -enable-new-pm=0 < %s | FileCheck %s -check-prefix=LAA ; RUN: opt -passes='require,require,require,loop(print-access-info)' -aa-pipeline='basic-aa' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=LAA ; RUN: opt -loop-versioning -S < %s | FileCheck %s -check-prefix=LV diff --git a/llvm/test/Other/2003-02-19-LoopInfoNestingBug.ll b/llvm/test/Analysis/LoopInfo/2003-02-19-LoopInfoNestingBug.ll similarity index 76% rename from llvm/test/Other/2003-02-19-LoopInfoNestingBug.ll rename to llvm/test/Analysis/LoopInfo/2003-02-19-LoopInfoNestingBug.ll index b807c4440008c..caa27b3c58ffd 100644 --- a/llvm/test/Other/2003-02-19-LoopInfoNestingBug.ll +++ b/llvm/test/Analysis/LoopInfo/2003-02-19-LoopInfoNestingBug.ll @@ -2,9 +2,10 @@ ; figure out that loop "Inner" should be nested inside of leep "LoopHeader", ; and instead nests it just inside loop "Top" ; -; RUN: opt < %s -analyze -loops | \ -; RUN: grep " Loop at depth 3 containing: %Inner
    " -; +; RUN: opt < %s -analyze -loops -enable-new-pm=0 | FileCheck %s +; RUN: opt < %s -passes='print' -disable-output 2>&1 | FileCheck %s + +; CHECK: Loop at depth 3 containing: %Inner
    define void @test() { br label %Top diff --git a/llvm/test/Analysis/MemorySSA/optimize-use.ll b/llvm/test/Analysis/MemorySSA/optimize-use.ll index ec0d5c3df1a3f..38ec971dbf539 100644 --- a/llvm/test/Analysis/MemorySSA/optimize-use.ll +++ b/llvm/test/Analysis/MemorySSA/optimize-use.ll @@ -22,22 +22,22 @@ entry: store i32 7, i32* %1, align 4 ; NOLIMIT: MemoryUse(3) MustAlias ; NOLIMIT-NEXT: %2 = load i32, i32* %0, align 4 -; LIMIT: MemoryUse(4) MayAlias +; LIMIT: MemoryUse(4) ; LIMIT-NEXT: %2 = load i32, i32* %0, align 4 %2 = load i32, i32* %0, align 4 ; NOLIMIT: MemoryUse(4) MustAlias ; NOLIMIT-NEXT: %3 = load i32, i32* %1, align 4 -; LIMIT: MemoryUse(4) MayAlias +; LIMIT: MemoryUse(4) ; LIMIT-NEXT: %3 = load i32, i32* %1, align 4 %3 = load i32, i32* %1, align 4 ; NOLIMIT: MemoryUse(3) MustAlias ; NOLIMIT-NEXT: %4 = load i32, i32* %0, align 4 -; LIMIT: MemoryUse(4) MayAlias +; LIMIT: MemoryUse(4) ; LIMIT-NEXT: %4 = load i32, i32* %0, align 4 %4 = load i32, i32* %0, align 4 ; NOLIMIT: MemoryUse(4) MustAlias ; NOLIMIT-NEXT: %5 = load i32, i32* %1, align 4 -; LIMIT: MemoryUse(4) MayAlias +; LIMIT: MemoryUse(4) ; LIMIT-NEXT: %5 = load i32, i32* %1, align 4 %5 = load i32, i32* %1, align 4 %add = add nsw i32 %3, %5 diff --git a/llvm/test/Analysis/MemorySSA/phi-translation.ll b/llvm/test/Analysis/MemorySSA/phi-translation.ll index 3909437b12303..4951c022f9fbd 100644 --- a/llvm/test/Analysis/MemorySSA/phi-translation.ll +++ b/llvm/test/Analysis/MemorySSA/phi-translation.ll @@ -25,7 +25,7 @@ if.end: ; CHECK: 3 = MemoryPhi({entry,1},{if.then,2}) ; NOLIMIT: MemoryUse(1) MayAlias ; NOLIMIT-NEXT: load i8, i8* %local, align 1 -; LIMIT: MemoryUse(3) MayAlias +; LIMIT: MemoryUse(3) ; LIMIT-NEXT: load i8, i8* %local, align 1 load i8, i8* %local, align 1 ret void @@ -68,7 +68,7 @@ phi.1: ; CHECK: 6 = MemoryPhi({phi.2,4},{phi.3,3}) ; NOLIMIT: MemoryUse(1) MayAlias ; NOLIMIT-NEXT: load i8, i8* %local -; LIMIT: MemoryUse(6) MayAlias +; LIMIT: MemoryUse(6) ; LIMIT-NEXT: load i8, i8* %local load i8, i8* %local ret void @@ -81,7 +81,7 @@ define void @cross_phi(i8* noalias %p1, i8* noalias %p2) { store i8 0, i8* %p1 ; NOLIMIT: MemoryUse(1) MustAlias ; NOLIMIT-NEXT: load i8, i8* %p1 -; LIMIT: MemoryUse(1) MayAlias +; LIMIT: MemoryUse(1) ; LIMIT-NEXT: load i8, i8* %p1 load i8, i8* %p1 br i1 undef, label %a, label %b @@ -116,7 +116,7 @@ e: ; 8 = MemoryPhi({c,4},{d,5}) ; NOLIMIT: MemoryUse(1) MustAlias ; NOLIMIT-NEXT: load i8, i8* %p1 -; LIMIT: MemoryUse(8) MayAlias +; LIMIT: MemoryUse(8) ; LIMIT-NEXT: load i8, i8* %p1 load i8, i8* %p1 ret void @@ -150,7 +150,7 @@ loop.3: store i8 2, i8* %p2 ; NOLIMIT: MemoryUse(1) MayAlias ; NOLIMIT-NEXT: load i8, i8* %p1 -; LIMIT: MemoryUse(4) MayAlias +; LIMIT: MemoryUse(4) ; LIMIT-NEXT: load i8, i8* %p1 load i8, i8* %p1 br i1 undef, label %loop.2, label %loop.1 @@ -179,7 +179,7 @@ if.then2: if.end: ; CHECK: 4 = MemoryPhi({while.cond,5},{if.then,1},{if.then2,2}) -; CHECK: MemoryUse(4) MayAlias +; CHECK: MemoryUse(4) ; CHECK-NEXT: load i8, i8* %p1 load i8, i8* %p1 ; CHECK: 3 = MemoryDef(4) @@ -187,7 +187,7 @@ if.end: store i8 2, i8* %p2 ; NOLIMIT: MemoryUse(4) MayAlias ; NOLIMIT-NEXT: load i8, i8* %p1 -; LIMIT: MemoryUse(3) MayAlias +; LIMIT: MemoryUse(3) ; LIMIT-NEXT: load i8, i8* %p1 load i8, i8* %p1 br label %while.cond @@ -212,11 +212,11 @@ for.body: ; preds = %entry, %for.inc %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %cmp1 = icmp eq i64 %indvars.iv, 0 %arrayidx2 = getelementptr inbounds i32, i32* %m_i_strides, i64 %indvars.iv -; CHECK: MemoryUse(4) MayAlias +; CHECK: 
MemoryUse(4) ; CHECK-NEXT: %0 = load i32, i32* %arrayidx2, align 4 %0 = load i32, i32* %arrayidx2, align 4 %arrayidx4 = getelementptr inbounds i32, i32* %eval_left_dims, i64 %indvars.iv -; CHECK: MemoryUse(4) MayAlias +; CHECK: MemoryUse(4) ; CHECK-NEXT: %1 = load i32, i32* %arrayidx4, align 4 %1 = load i32, i32* %arrayidx4, align 4 %mul = mul nsw i32 %1, %0 @@ -270,7 +270,7 @@ for.main.body: ; preds = %if.end220.if.then185_crit_edge, %for.bod %add199 = add nuw nsw i64 %nocontract_idx.0656, 1 %cmp200 = icmp eq i64 %nocontract_idx.0656, 0 %arrayidx.i559 = getelementptr inbounds %BigStruct, %BigStruct* %this, i64 0, i32 7, i32 0, i64 %nocontract_idx.0656 -; CHECK: MemoryUse(4) MayAlias +; CHECK: MemoryUse(4) ; CHECK-NEXT: %tmp21 = load i64, i64* %arrayidx.i559, align 8 %tmp21 = load i64, i64* %arrayidx.i559, align 8 %mul206 = mul nsw i64 %tmp21, %tmp21 @@ -298,7 +298,7 @@ define i32 @dont_merge_noalias_simple(i32* noalias %ptr) { ; CHECK-NEXT: store i16 1, i16* %s1.ptr, align 2 ; CHECK-LABEL: %for.body -; CHECK: ; MemoryUse(4) MayAlias +; CHECK: ; MemoryUse(4) ; CHECK-NEXT: %lv = load i16, i16* %arrayidx, align 2 entry: @@ -331,7 +331,7 @@ define i32 @dont_merge_noalias_complex(i32* noalias %ptr, i32* noalias %another) ; CHECK-NEXT: store i16 1, i16* %s1.ptr, align 2 ; CHECK-LABEL: %for.body -; CHECK: ; MemoryUse(7) MayAlias +; CHECK: ; MemoryUse(7) ; CHECK-NEXT: %lv = load i16, i16* %arrayidx, align 2 entry: @@ -369,3 +369,149 @@ for.end: ; preds = %for.body ret i32 0 } +declare i1 @should_exit(i32) readnone +declare void @init([32 x i32]*) + +; Test case for PR47498. +; %l.1 may read the result of `store i32 10, i32* %p.1` in %storebb, because +; after %storebb has been executed, %loop.1.header might be executed again. +; Make sure %l.1's defining access is the MemoryPhi in the block. 
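+; In other words: if the walker optimized %l.1's defining access past the
+; MemoryPhi all the way up to the call to @init, it would lose the
+; loop-carried clobber coming from %storebb through the latch phi, so the
+; conservative answer (the block's MemoryPhi itself) is the correct one.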
+define void @dont_merge_noalias_complex_2(i32 %arg, i32 %arg1) { +; CHECK-LABEL: define void @dont_merge_noalias_complex_2( + +; CHECK-LABEL: entry: +; CHECK: ; 1 = MemoryDef(liveOnEntry) +; CHECK-NEXT: call void @init([32 x i32]* %tmp) + +; CHECK-LABEL: loop.1.header: +; CHECK-NEXT: ; 4 = MemoryPhi({entry,1},{loop.1.latch,3}) +; CHECK: ; MemoryUse(4) +; CHECK-NEXT: %l.1 = load i32, i32* %p.1, align 4 + +; CHECK-LABEL: loop.1.latch: +; CHECK-NEXT: ; 3 = MemoryPhi({loop.1.header,4},{storebb,2}) + +; CHECK-LABEL: storebb: +; CHECK-NEXT: %iv.add2 = add nuw nsw i64 %iv, 2 +; CHECK-NEXT: %p.2 = getelementptr inbounds [32 x i32], [32 x i32]* %tmp, i64 0, i64 %iv.add2 +; CHECK-NEXT: ; MemoryUse(4) +; CHECK-NEXT: %l.2 = load i32, i32* %p.2, align 4 +; CHECK-NEXT: ; 2 = MemoryDef(4) +; CHECK-NEXT: store i32 10, i32* %p.1, align 4 +entry: + %tmp = alloca [32 x i32], align 16 + call void @init([32 x i32]* %tmp) + br label %loop.1.header + +loop.1.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.1.latch ] + %iv.next = add nuw nsw i64 %iv, 1 + %p.1 = getelementptr inbounds [32 x i32], [32 x i32]* %tmp, i64 0, i64 %iv.next + %l.1 = load i32, i32* %p.1, align 4 + %tmp244 = icmp ult i64 %iv, 10 + br i1 %tmp244, label %loop.1.latch, label %storebb + +loop.1.latch: + %ec = call i1 @should_exit(i32 %l.1) + br i1 %ec, label %exit, label %loop.1.header + +storebb: + %iv.add2 = add nuw nsw i64 %iv, 2 + %p.2 = getelementptr inbounds [32 x i32], [32 x i32]* %tmp, i64 0, i64 %iv.add2 + %l.2 = load i32, i32* %p.2, align 4 + store i32 10, i32* %p.1, align 4 + br label %loop.1.latch + +exit: + ret void +} + +; CHECK-LABEL: define void @use_clobbered_by_def_in_loop() +define void @use_clobbered_by_def_in_loop() { +entry: + %nodeStack = alloca [12 x i32], align 4 + %0 = bitcast [12 x i32]* %nodeStack to i8* + call void @llvm.lifetime.start.p0i8(i64 48, i8* nonnull %0) + br i1 false, label %cleanup, label %while.cond + +; CHECK-LABEL: while.cond: +; CHECK-NEXT: ; [[NO6:.*]] = MemoryPhi({entry,1},{while.cond.backedge,5}) + +while.cond: ; preds = %entry, %while.cond.backedge + %depth.1 = phi i32 [ %depth.1.be, %while.cond.backedge ], [ 0, %entry ] + %cmp = icmp sgt i32 %depth.1, 0 + br i1 %cmp, label %land.rhs, label %while.end + +; CHECK-LABEL: land.rhs: +; CHECK-NEXT: %sub = add nsw i32 %depth.1, -1 +; CHECK-NEXT: %arrayidx = getelementptr inbounds [12 x i32], [12 x i32]* %nodeStack, i32 0, i32 %sub +; CHECK-NEXT: ; MemoryUse([[NO6]]) +; CHECK-NEXT: %1 = load i32, i32* %arrayidx, align 4 + +land.rhs: ; preds = %while.cond + %sub = add nsw i32 %depth.1, -1 + %arrayidx = getelementptr inbounds [12 x i32], [12 x i32]* %nodeStack, i32 0, i32 %sub + %1 = load i32, i32* %arrayidx, align 4 + br i1 true, label %while.body, label %while.end + +while.body: ; preds = %land.rhs + br i1 true, label %cleanup, label %while.cond.backedge + +while.cond.backedge: ; preds = %while.body, %while.end + %depth.1.be = phi i32 [ %sub, %while.body ], [ %inc, %while.end ] + br label %while.cond + +while.end: ; preds = %while.cond, %land.rhs + %arrayidx10 = getelementptr inbounds [12 x i32], [12 x i32]* %nodeStack, i32 0, i32 %depth.1 + store i32 %depth.1, i32* %arrayidx10, align 4 + %inc = add nsw i32 %depth.1, 1 + br i1 true, label %cleanup, label %while.cond.backedge + +cleanup: ; preds = %while.body, %while.end, %entry + call void @llvm.lifetime.end.p0i8(i64 48, i8* nonnull %0) + ret void +} + +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) + +define 
void @another_loop_clobber() { +; CHECK-LABEL: void @another_loop_clobber +; CHECK-LABEL: loop.header: +; CHECK-NEXT: ; 4 = MemoryPhi({entry,1},{cond.read,3}) + +; CHECK-LABEL: cond.read: +; CHECK: ; MemoryUse(4) +; CHECK-NEXT: %use = load i32, i32* %ptr.1, align 4 +; CHECK-NEXT: ; 2 = MemoryDef(4) +; CHECK-NEXT: %c.2 = call i1 @cond(i32 %use) +; CHECK-NEXT: %ptr.10 = getelementptr inbounds [12 x i32], [12 x i32]* %nodeStack, i32 0, i32 %inc +; CHECK-NEXT: ; 3 = MemoryDef(2) +; CHECK-NEXT: store i32 10, i32* %ptr.2, align 4 + +entry: + %nodeStack = alloca [12 x i32], align 4 + %c.1 = call i1 @cond(i32 1) + br i1 %c.1, label %cleanup, label %loop.header + +loop.header: ; preds = %entry, %while.cond.backedge + %depth.1 = phi i32 [ %inc, %cond.read], [ 1, %entry ] + %cmp = icmp sgt i32 %depth.1, 0 + %inc = add nsw i32 %depth.1, 3 + %inc2 = add nsw i32 %depth.1, 6 + br i1 %cmp, label %cond.read, label %cleanup + +cond.read: ; preds = %while.cond + %ptr.1 = getelementptr inbounds [12 x i32], [12 x i32]* %nodeStack, i32 0, i32 %depth.1 + %ptr.2 = getelementptr inbounds [12 x i32], [12 x i32]* %nodeStack, i32 0, i32 %inc2 + %use = load i32, i32* %ptr.1, align 4 + %c.2 = call i1 @cond(i32 %use) + %ptr.10 = getelementptr inbounds [12 x i32], [12 x i32]* %nodeStack, i32 0, i32 %inc + store i32 10, i32* %ptr.2, align 4 + br i1 %c.2, label %loop.header, label %cleanup + +cleanup: + ret void +} + +declare i1 @cond(i32) diff --git a/llvm/test/Analysis/MemorySSA/pr43427.ll b/llvm/test/Analysis/MemorySSA/pr43427.ll index 3cb571505f730..00a015c98e8fd 100644 --- a/llvm/test/Analysis/MemorySSA/pr43427.ll +++ b/llvm/test/Analysis/MemorySSA/pr43427.ll @@ -20,7 +20,7 @@ ; CHECK-NEXT: [[NO7]] = MemoryPhi({lbl2,[[NO8]]},{for.end,2}) ; CHECK: cleanup: -; CHECK-NEXT: MemoryUse([[NO7]]) MayAlias +; CHECK-NEXT: MemoryUse([[NO7]]) ; CHECK-NEXT: %cleanup.dest = load i32, i32* undef, align 1 ; CHECK: lbl1.backedge: diff --git a/llvm/test/Analysis/MemorySSA/pr45927.ll b/llvm/test/Analysis/MemorySSA/pr45927.ll new file mode 100644 index 0000000000000..2dfa1e43d1f24 --- /dev/null +++ b/llvm/test/Analysis/MemorySSA/pr45927.ll @@ -0,0 +1,73 @@ +; RUN: opt -disable-output -loop-simplify -lcssa -licm -print-memoryssa < %s -enable-new-pm=0 2>&1 | FileCheck %s +; RUN: opt -disable-output -aa-pipeline=basic-aa -passes='loop-mssa(licm),print' < %s 2>&1 | FileCheck %s + + +@a = external dso_local global i16, align 1 +@c = external dso_local global i16, align 1 + +; CHECK-LABEL: @main() + +; CHECK: entry: +; CHECK-NEXT: %res.addr.i = alloca i16 +; CHECK-NEXT: ; MemoryUse(liveOnEntry) +; CHECK-NEXT: %c.promoted = load i16, i16* @c +; CHECK-NEXT: br label %for.cond.i + +; CHECK: for.cond.i: +; CHECK-NEXT: ; [[NO5:.*]] = MemoryPhi({entry,liveOnEntry},{f.exit.i,[[NO5]]}) +; CHECK-NEXT: %inc.i1 = phi i16 [ %inc.i, %f.exit.i ], [ %c.promoted, %entry ] +; CHECK-NEXT: %inc.i = add nsw i16 %inc.i1, 1 +; CHECK-NEXT: br i1 false, label %f.exit.thread.i, label %f.exit.i + +; CHECK: f.exit.thread.i: +; CHECK-NEXT: %inc.i.lcssa = phi i16 [ %inc.i, %for.cond.i ] +; CHECK-NEXT: ; [[NO6:.*]] = MemoryDef([[NO5]]) +; CHECK-NEXT: store i16 %inc.i.lcssa, i16* @c, align 1 +; CHECK-NEXT: ; [[NO2:.*]] = MemoryDef([[NO6]]) +; CHECK-NEXT: store i16 1, i16* @a, align 1 +; CHECK-NEXT: ; MemoryUse([[NO2]]) +; CHECK-NEXT: %tmp2 = load i16, i16* @c, align 1 +; CHECK-NEXT: br label %g.exit + +; CHECK: f.exit.i +; CHECK-NEXT: br i1 false, label %g.exit.loopexit, label %for.cond.i + +; CHECK: g.exit.loopexit: +; CHECK-NEXT: %inc.i.lcssa2 = phi i16 [ %inc.i, 
%f.exit.i ] +; CHECK-NEXT: ; [[NO7:.*]] = MemoryDef([[NO5]]) +; CHECK-NEXT: store i16 %inc.i.lcssa2, i16* @c, align 1 +; CHECK-NEXT: br label %g.exit + +; CHECK: g.exit +; CHECK-NEXT: ; [[NO4:.*]] = MemoryPhi({f.exit.thread.i,[[NO2]]},{g.exit.loopexit,[[NO7]]}) +; CHECK-NEXT: ; MemoryUse([[NO4]]) +; CHECK-NEXT: %tmp1 = load i16, i16* @c, align 1 +; CHECK-NEXT: ; [[NO3:.*]] = MemoryDef([[NO4]]) +; CHECK-NEXT: store i16 %tmp1, i16* %res.addr.i, align 1 +; CHECK-NEXT: ret void + +define dso_local void @main() { +entry: + %res.addr.i = alloca i16, align 1 + br label %for.cond.i + +for.cond.i: ; preds = %f.exit.i, %entry + %tmp0 = load i16, i16* @c, align 1 + %inc.i = add nsw i16 %tmp0, 1 + store i16 %inc.i, i16* @c, align 1 + br i1 false, label %f.exit.thread.i, label %f.exit.i + +f.exit.thread.i: ; preds = %for.cond.i + store i16 1, i16* @a, align 1 + %tmp2 = load i16, i16* @c, align 1 + br label %g.exit + +f.exit.i: ; preds = %for.cond.i + br i1 false, label %g.exit, label %for.cond.i + +g.exit: ; preds = %f.exit.i, %f.exit.thread.i + %tmp1 = load i16, i16* @c, align 1 + store i16 %tmp1, i16* %res.addr.i, align 1 + ret void +} + diff --git a/llvm/test/Analysis/PostDominators/infinite-loop.ll b/llvm/test/Analysis/PostDominators/infinite-loop.ll index 5796b8614dbde..5146fd6e21c0a 100644 --- a/llvm/test/Analysis/PostDominators/infinite-loop.ll +++ b/llvm/test/Analysis/PostDominators/infinite-loop.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -postdomtree -analyze | FileCheck %s +; RUN: opt < %s -postdomtree -analyze -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s @a = external global i32, align 4 diff --git a/llvm/test/Analysis/PostDominators/infinite-loop2.ll b/llvm/test/Analysis/PostDominators/infinite-loop2.ll index 139abb76e9512..de7413e40874f 100644 --- a/llvm/test/Analysis/PostDominators/infinite-loop2.ll +++ b/llvm/test/Analysis/PostDominators/infinite-loop2.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -postdomtree -analyze | FileCheck %s +; RUN: opt < %s -postdomtree -analyze -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s @a = external global i32, align 4 diff --git a/llvm/test/Analysis/PostDominators/infinite-loop3.ll b/llvm/test/Analysis/PostDominators/infinite-loop3.ll index f767df79d3a81..1536004ddc314 100644 --- a/llvm/test/Analysis/PostDominators/infinite-loop3.ll +++ b/llvm/test/Analysis/PostDominators/infinite-loop3.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -postdomtree -analyze | FileCheck %s +; RUN: opt < %s -postdomtree -analyze -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s @a = external global i32, align 4 diff --git a/llvm/test/Analysis/PostDominators/pr1098.ll b/llvm/test/Analysis/PostDominators/pr1098.ll index 1dae0c566f055..62aaf96e0f69f 100644 --- a/llvm/test/Analysis/PostDominators/pr1098.ll +++ b/llvm/test/Analysis/PostDominators/pr1098.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -postdomtree -analyze | FileCheck %s +; RUN: opt < %s -postdomtree -analyze -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s ; PR932 diff --git a/llvm/test/Analysis/PostDominators/pr24415.ll b/llvm/test/Analysis/PostDominators/pr24415.ll index 536c36848b9a5..aaee72758afa6 100644 --- a/llvm/test/Analysis/PostDominators/pr24415.ll +++ b/llvm/test/Analysis/PostDominators/pr24415.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -postdomtree -analyze | FileCheck %s +; RUN: opt < %s -postdomtree -analyze -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s ; Function 
Attrs: nounwind ssp uwtable @@ -15,4 +15,4 @@ define void @foo() { ; CHECK-NEXT: [1] <> ; CHECK-NEXT: [2] %2 ; CHECK-NEXT: [2] %1 -; CHECK-NEXT: [3] %0 \ No newline at end of file +; CHECK-NEXT: [3] %0 diff --git a/llvm/test/Analysis/PostDominators/pr6047_a.ll b/llvm/test/Analysis/PostDominators/pr6047_a.ll index 32ccbe61271f2..08153f9864c6a 100644 --- a/llvm/test/Analysis/PostDominators/pr6047_a.ll +++ b/llvm/test/Analysis/PostDominators/pr6047_a.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -postdomtree -analyze | FileCheck %s +; RUN: opt < %s -postdomtree -analyze -enable-new-pm=0 | FileCheck %s +; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s define internal void @f() { entry: br i1 undef, label %bb35, label %bb3.i diff --git a/llvm/test/Analysis/PostDominators/pr6047_b.ll b/llvm/test/Analysis/PostDominators/pr6047_b.ll index f1fbb648f5396..6b970b5cf7268 100644 --- a/llvm/test/Analysis/PostDominators/pr6047_b.ll +++ b/llvm/test/Analysis/PostDominators/pr6047_b.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -postdomtree -analyze | FileCheck %s +; RUN: opt < %s -postdomtree -analyze -enable-new-pm=0 | FileCheck %s +; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s define internal void @f() { entry: br i1 undef, label %a, label %bb3.i @@ -22,4 +23,4 @@ bb35: ; CHECK-NEXT: [3] %bb35.loopexit3 ; CHECK-NEXT: [2] %a ; CHECK-NEXT: [2] %entry -; CHECK-NEXT: [2] %bb3.i \ No newline at end of file +; CHECK-NEXT: [2] %bb3.i diff --git a/llvm/test/Analysis/PostDominators/pr6047_c.ll b/llvm/test/Analysis/PostDominators/pr6047_c.ll index 0eef023b418ca..d2a9516ce39c7 100644 --- a/llvm/test/Analysis/PostDominators/pr6047_c.ll +++ b/llvm/test/Analysis/PostDominators/pr6047_c.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -postdomtree -analyze | FileCheck %s +; RUN: opt < %s -postdomtree -analyze -enable-new-pm=0 | FileCheck %s +; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s define internal void @f() { entry: br i1 undef, label %bb35, label %bb3.i @@ -194,4 +195,4 @@ bb35: ; CHECK-NEXT: [3] %bb35.loopexit3 ; CHECK-NEXT: [2] %entry ; CHECK-NEXT: [2] %bb3.i -; CHECK-NEXT: Roots: %bb35 %bb3.i \ No newline at end of file +; CHECK-NEXT: Roots: %bb35 %bb3.i diff --git a/llvm/test/Analysis/PostDominators/pr6047_d.ll b/llvm/test/Analysis/PostDominators/pr6047_d.ll index 45ed86c27f869..93434af6ade83 100644 --- a/llvm/test/Analysis/PostDominators/pr6047_d.ll +++ b/llvm/test/Analysis/PostDominators/pr6047_d.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -postdomtree -analyze | FileCheck %s +; RUN: opt < %s -postdomtree -analyze -enable-new-pm=0 | FileCheck %s +; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s define internal void @f() { entry: br i1 1, label %a, label %b @@ -29,4 +30,4 @@ bb35: ; CHECK-NEXT: [3] %a ; CHECK-NEXT: [3] %entry ; CHECK-NEXT: [3] %b -; CHECK-NEXT: [2] %bb3.i \ No newline at end of file +; CHECK-NEXT: [2] %bb3.i diff --git a/llvm/test/Analysis/RegionInfo/bad_node_traversal.ll b/llvm/test/Analysis/RegionInfo/bad_node_traversal.ll index 00dd1207af9f0..7e658f6bda68d 100644 --- a/llvm/test/Analysis/RegionInfo/bad_node_traversal.ll +++ b/llvm/test/Analysis/RegionInfo/bad_node_traversal.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s +; RUN: opt -passes='print' -disable-output < %s 2>&1 | FileCheck %s ; While working on improvements to the region info analysis, this test ; case caused an incorrect region 3 => 8 to be detected. 
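; The pr6047_* hunks above do one extra thing besides pinning the legacy
; pass manager: they add the new-pass-manager RUN line these tests were
; missing. A sketch of the resulting pair on a hypothetical test, assuming
; print<postdomtree> as the new-PM printer name for the legacy
; -postdomtree analysis:
; RUN: opt < %s -postdomtree -analyze -enable-new-pm=0 | FileCheck %s
; RUN: opt < %s -passes='print<postdomtree>' 2>&1 | FileCheck %s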
diff --git a/llvm/test/Analysis/RegionInfo/block_sort.ll b/llvm/test/Analysis/RegionInfo/block_sort.ll index ce1a48132901e..ace6849fc848c 100644 --- a/llvm/test/Analysis/RegionInfo/block_sort.ll +++ b/llvm/test/Analysis/RegionInfo/block_sort.ll @@ -1,10 +1,13 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s | FileCheck %s -; RUN: opt -regions -stats -analyze < %s 2>&1 | FileCheck -check-prefix=STAT %s -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s +; RUN: opt -regions -stats -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -regions -print-region-style=bb -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=RNIT %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s +; RUN: opt < %s -passes='print' -stats 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -passes='print' -print-region-style=bb < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -passes='print' -print-region-style=rn < %s 2>&1 | FileCheck -check-prefix=RNIT %s define void @BZ2_blockSort() nounwind { start: diff --git a/llvm/test/Analysis/RegionInfo/cond_loop.ll b/llvm/test/Analysis/RegionInfo/cond_loop.ll index 7dc311a299ce6..9fb2e22b49f1f 100644 --- a/llvm/test/Analysis/RegionInfo/cond_loop.ll +++ b/llvm/test/Analysis/RegionInfo/cond_loop.ll @@ -1,10 +1,13 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -regions -stats -disable-output < %s 2>&1 | FileCheck -check-prefix=STAT %s -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -print-region-style=bb -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=RNIT %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s +; RUN: opt < %s -passes='print' -stats 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -passes='print' -print-region-style=bb < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -passes='print' -print-region-style=rn < %s 2>&1 | FileCheck -check-prefix=RNIT %s define void @normal_condition() nounwind { "5": diff --git a/llvm/test/Analysis/RegionInfo/condition_complicated.ll b/llvm/test/Analysis/RegionInfo/condition_complicated.ll index e700503f8a48a..3c1507acf2211 100644 --- a/llvm/test/Analysis/RegionInfo/condition_complicated.ll +++ b/llvm/test/Analysis/RegionInfo/condition_complicated.ll @@ -1,10 +1,13 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -regions -stats -disable-output < %s 2>&1 | FileCheck -check-prefix=STAT %s -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -print-region-style=bb -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn -analyze -enable-new-pm=0 < 
%s 2>&1 | FileCheck -check-prefix=RNIT %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s +; RUN: opt < %s -passes='print' -stats 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -passes='print' -print-region-style=bb < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -passes='print' -print-region-style=rn < %s 2>&1 | FileCheck -check-prefix=RNIT %s define internal fastcc zeroext i8 @handle_compress() nounwind { end165: diff --git a/llvm/test/Analysis/RegionInfo/condition_complicated_2.ll b/llvm/test/Analysis/RegionInfo/condition_complicated_2.ll index 584ebba6f04b4..12564b3abc4ea 100644 --- a/llvm/test/Analysis/RegionInfo/condition_complicated_2.ll +++ b/llvm/test/Analysis/RegionInfo/condition_complicated_2.ll @@ -1,10 +1,13 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -regions -stats -disable-output < %s 2>&1 | FileCheck -check-prefix=STAT %s -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -print-region-style=bb -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=RNIT %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s +; RUN: opt < %s -passes='print' -stats 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -passes='print' -print-region-style=bb < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -passes='print' -print-region-style=rn < %s 2>&1 | FileCheck -check-prefix=RNIT %s define internal fastcc void @compress() nounwind { end33: diff --git a/llvm/test/Analysis/RegionInfo/condition_forward_edge.ll b/llvm/test/Analysis/RegionInfo/condition_forward_edge.ll index cc9a3294e1451..76ae02882a036 100644 --- a/llvm/test/Analysis/RegionInfo/condition_forward_edge.ll +++ b/llvm/test/Analysis/RegionInfo/condition_forward_edge.ll @@ -1,10 +1,13 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -regions -stats -disable-output < %s 2>&1 | FileCheck -check-prefix=STAT %s -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -print-region-style=bb -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=RNIT %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s +; RUN: opt < %s -passes='print' -stats 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -passes='print' -print-region-style=bb < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -passes='print' -print-region-style=rn < %s 2>&1 | FileCheck -check-prefix=RNIT %s define void @normal_condition() nounwind { "0": diff --git a/llvm/test/Analysis/RegionInfo/condition_same_exit.ll b/llvm/test/Analysis/RegionInfo/condition_same_exit.ll index f3f443b2ba643..39787409198a5 100644 --- a/llvm/test/Analysis/RegionInfo/condition_same_exit.ll +++ b/llvm/test/Analysis/RegionInfo/condition_same_exit.ll @@ -1,10 +1,13 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt 
-regions -stats -disable-output < %s 2>&1 | FileCheck -check-prefix=STAT %s -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -print-region-style=bb -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=RNIT %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s +; RUN: opt < %s -passes='print' -stats 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -passes='print' -print-region-style=bb < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -passes='print' -print-region-style=rn < %s 2>&1 | FileCheck -check-prefix=RNIT %s define void @normal_condition() nounwind { "0": diff --git a/llvm/test/Analysis/RegionInfo/condition_simple.ll b/llvm/test/Analysis/RegionInfo/condition_simple.ll index 67bdb506702eb..f4456825f797a 100644 --- a/llvm/test/Analysis/RegionInfo/condition_simple.ll +++ b/llvm/test/Analysis/RegionInfo/condition_simple.ll @@ -1,10 +1,13 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -regions -stats -disable-output < %s 2>&1 | FileCheck -check-prefix=STAT %s -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -print-region-style=bb -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=RNIT %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s +; RUN: opt < %s -passes='print' -stats 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -passes='print' -print-region-style=bb < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -passes='print' -print-region-style=rn < %s 2>&1 | FileCheck -check-prefix=RNIT %s define void @normal_condition() nounwind { "0": diff --git a/llvm/test/Analysis/RegionInfo/exit_in_condition.ll b/llvm/test/Analysis/RegionInfo/exit_in_condition.ll index 8a6d208f479ef..a8c3624ff4e65 100644 --- a/llvm/test/Analysis/RegionInfo/exit_in_condition.ll +++ b/llvm/test/Analysis/RegionInfo/exit_in_condition.ll @@ -1,10 +1,13 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -regions -stats -disable-output < %s 2>&1 | FileCheck -check-prefix=STAT %s -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -print-region-style=bb -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=RNIT %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s +; RUN: opt < %s -passes='print' -stats 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -passes='print' -print-region-style=bb < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -passes='print' -print-region-style=rn < %s 2>&1 | FileCheck -check-prefix=RNIT %s define internal fastcc zeroext i8 @handle_compress() nounwind { entry: diff --git 
a/llvm/test/Analysis/RegionInfo/infinite_loop.ll b/llvm/test/Analysis/RegionInfo/infinite_loop.ll index 35c82ce8e0419..f27bb1a461f60 100644 --- a/llvm/test/Analysis/RegionInfo/infinite_loop.ll +++ b/llvm/test/Analysis/RegionInfo/infinite_loop.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s +; RUN: opt -passes='print' -disable-output < %s 2>&1 ; RUN: opt -regions -stats -disable-output < %s 2>&1 | FileCheck -check-prefix=STAT %s define void @normal_condition() nounwind { diff --git a/llvm/test/Analysis/RegionInfo/infinite_loop_2.ll b/llvm/test/Analysis/RegionInfo/infinite_loop_2.ll index 76ecdd833c426..8c2cf2578b06a 100644 --- a/llvm/test/Analysis/RegionInfo/infinite_loop_2.ll +++ b/llvm/test/Analysis/RegionInfo/infinite_loop_2.ll @@ -1,8 +1,12 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s ; RUN: opt -regions -stats -disable-output < %s 2>&1 | FileCheck -check-prefix=STAT %s -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -print-region-style=bb -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -passes='print' -disable-output < %s +; RUN: opt < %s -passes='print' -stats 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -passes='print' -print-region-style=bb < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -passes='print' -print-region-style=rn < %s 2>&1 | FileCheck -check-prefix=RNIT %s define void @normal_condition() nounwind { "0": diff --git a/llvm/test/Analysis/RegionInfo/infinite_loop_3.ll b/llvm/test/Analysis/RegionInfo/infinite_loop_3.ll index 2b1b643005c01..960730766cbd1 100644 --- a/llvm/test/Analysis/RegionInfo/infinite_loop_3.ll +++ b/llvm/test/Analysis/RegionInfo/infinite_loop_3.ll @@ -1,9 +1,14 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s ; RUN: opt -regions -stats -disable-output < %s 2>&1 | FileCheck -check-prefix=STAT %s -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -print-region-style=bb -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=RNIT %s + +; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s +; RUN: opt < %s -passes='print' -stats 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -passes='print' -print-region-style=bb < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -passes='print' -print-region-style=rn < %s 2>&1 | FileCheck -check-prefix=RNIT %s define void @normal_condition() nounwind { "0": diff --git a/llvm/test/Analysis/RegionInfo/infinite_loop_4.ll b/llvm/test/Analysis/RegionInfo/infinite_loop_4.ll index c3ad028b0e558..8ff8e57783732 100644 --- a/llvm/test/Analysis/RegionInfo/infinite_loop_4.ll +++ b/llvm/test/Analysis/RegionInfo/infinite_loop_4.ll @@ -1,8 +1,13 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s ; RUN: opt -regions -stats -disable-output < %s 2>&1 | FileCheck 
-check-prefix=STAT %s -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -print-region-style=bb -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=RNIT %s + +; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s +; RUN: opt < %s -passes='print' -stats 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -passes='print' -print-region-style=bb < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -passes='print' -print-region-style=rn < %s 2>&1 | FileCheck -check-prefix=RNIT %s define void @normal_condition() nounwind { "0": diff --git a/llvm/test/Analysis/RegionInfo/infinite_loop_5_a.ll b/llvm/test/Analysis/RegionInfo/infinite_loop_5_a.ll index bf56add87ac11..76f7b247c9664 100644 --- a/llvm/test/Analysis/RegionInfo/infinite_loop_5_a.ll +++ b/llvm/test/Analysis/RegionInfo/infinite_loop_5_a.ll @@ -1,4 +1,5 @@ -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s +; RUN: opt -passes='print' -disable-output < %s 2>&1 | FileCheck %s define void @normal_condition() nounwind { "0": diff --git a/llvm/test/Analysis/RegionInfo/infinite_loop_5_b.ll b/llvm/test/Analysis/RegionInfo/infinite_loop_5_b.ll index d8602054cd007..9a5ff40cecc42 100644 --- a/llvm/test/Analysis/RegionInfo/infinite_loop_5_b.ll +++ b/llvm/test/Analysis/RegionInfo/infinite_loop_5_b.ll @@ -1,4 +1,5 @@ -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s +; RUN: opt -passes='print' -disable-output < %s 2>&1 | FileCheck %s define void @normal_condition() nounwind { "0": diff --git a/llvm/test/Analysis/RegionInfo/infinite_loop_5_c.ll b/llvm/test/Analysis/RegionInfo/infinite_loop_5_c.ll index 0508d0a45bda5..fe2c29a72613a 100644 --- a/llvm/test/Analysis/RegionInfo/infinite_loop_5_c.ll +++ b/llvm/test/Analysis/RegionInfo/infinite_loop_5_c.ll @@ -1,4 +1,5 @@ -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s +; RUN: opt -passes='print' -disable-output < %s 2>&1 | FileCheck %s define void @normal_condition() nounwind { "0": diff --git a/llvm/test/Analysis/RegionInfo/loop_with_condition.ll b/llvm/test/Analysis/RegionInfo/loop_with_condition.ll index 244f253d25df5..1965fed8ee2a6 100644 --- a/llvm/test/Analysis/RegionInfo/loop_with_condition.ll +++ b/llvm/test/Analysis/RegionInfo/loop_with_condition.ll @@ -1,11 +1,14 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -regions -stats -disable-output < %s 2>&1 | FileCheck -check-prefix=STAT %s -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -print-region-style=bb -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=RNIT %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s +; RUN: opt < %s -passes='print' -stats 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -passes='print' -print-region-style=bb < %s 2>&1 | FileCheck -check-prefix=BBIT 
%s +; RUN: opt -passes='print' -print-region-style=rn < %s 2>&1 | FileCheck -check-prefix=RNIT %s define void @normal_condition() nounwind { "0": diff --git a/llvm/test/Analysis/RegionInfo/loops_1.ll b/llvm/test/Analysis/RegionInfo/loops_1.ll index 91023198ea296..39f59bf197148 100644 --- a/llvm/test/Analysis/RegionInfo/loops_1.ll +++ b/llvm/test/Analysis/RegionInfo/loops_1.ll @@ -1,10 +1,13 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -regions -stats -disable-output < %s 2>&1 | FileCheck -check-prefix=STAT %s -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -print-region-style=bb -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=RNIT %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s +; RUN: opt < %s -passes='print' -stats 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -passes='print' -print-region-style=bb < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -passes='print' -print-region-style=rn < %s 2>&1 | FileCheck -check-prefix=RNIT %s define internal fastcc zeroext i8 @loops_1() nounwind { entry: diff --git a/llvm/test/Analysis/RegionInfo/loops_2.ll b/llvm/test/Analysis/RegionInfo/loops_2.ll index 80cd34251d7e6..3973973381766 100644 --- a/llvm/test/Analysis/RegionInfo/loops_2.ll +++ b/llvm/test/Analysis/RegionInfo/loops_2.ll @@ -1,10 +1,13 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -regions -stats -disable-output < %s 2>&1 | FileCheck -check-prefix=STAT %s -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -print-region-style=bb -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=RNIT %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s +; RUN: opt < %s -passes='print' -stats 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -passes='print' -print-region-style=bb < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -passes='print' -print-region-style=rn < %s 2>&1 | FileCheck -check-prefix=RNIT %s define void @meread_() nounwind { entry: diff --git a/llvm/test/Analysis/RegionInfo/mix_1.ll b/llvm/test/Analysis/RegionInfo/mix_1.ll index a462119575a79..7637f59d1375c 100644 --- a/llvm/test/Analysis/RegionInfo/mix_1.ll +++ b/llvm/test/Analysis/RegionInfo/mix_1.ll @@ -1,11 +1,14 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -regions -stats -disable-output < %s 2>&1 | FileCheck -check-prefix=STAT %s -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -print-region-style=bb -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck 
-check-prefix=RNIT %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s +; RUN: opt < %s -passes='print' -stats 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -passes='print' -print-region-style=bb < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -passes='print' -print-region-style=rn < %s 2>&1 | FileCheck -check-prefix=RNIT %s define void @a_linear_impl_fig_1() nounwind { "0": diff --git a/llvm/test/Analysis/RegionInfo/multiple_exiting_edge.ll b/llvm/test/Analysis/RegionInfo/multiple_exiting_edge.ll index 8de6472299428..0c3860ca3df92 100644 --- a/llvm/test/Analysis/RegionInfo/multiple_exiting_edge.ll +++ b/llvm/test/Analysis/RegionInfo/multiple_exiting_edge.ll @@ -1,5 +1,7 @@ -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -print-region-style=bb -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -passes='print' -print-region-style=bb -disable-output < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -passes='print' -print-region-style=rn -disable-output < %s 2>&1 | FileCheck -check-prefix=RNIT %s define void @normal_condition_0() nounwind { bb38: ; preds = %bb34, %bb34, %bb37 diff --git a/llvm/test/Analysis/RegionInfo/nested_loops.ll b/llvm/test/Analysis/RegionInfo/nested_loops.ll index 5d47d792cd924..980b52460ad40 100644 --- a/llvm/test/Analysis/RegionInfo/nested_loops.ll +++ b/llvm/test/Analysis/RegionInfo/nested_loops.ll @@ -1,11 +1,14 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -regions -stats -disable-output < %s 2>&1 | FileCheck -check-prefix=STAT %s -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -print-region-style=bb -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=RNIT %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s +; RUN: opt < %s -passes='print' -stats 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -passes='print' -print-region-style=bb < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -passes='print' -print-region-style=rn < %s 2>&1 | FileCheck -check-prefix=RNIT %s define internal fastcc zeroext i8 @handle_compress() nounwind { entry: diff --git a/llvm/test/Analysis/RegionInfo/next.ll b/llvm/test/Analysis/RegionInfo/next.ll index 03aa53e59a490..5976ecadad220 100644 --- a/llvm/test/Analysis/RegionInfo/next.ll +++ b/llvm/test/Analysis/RegionInfo/next.ll @@ -1,10 +1,13 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -regions -stats -disable-output < %s 2>&1 | FileCheck -check-prefix=STAT %s -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -print-region-style=bb -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn 
-analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=RNIT %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s +; RUN: opt -passes='print' -stats -disable-output < %s 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -passes='print' -print-region-style=bb < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -passes='print' -print-region-style=rn < %s 2>&1 | FileCheck -check-prefix=RNIT %s define void @MAIN__() nounwind { entry: diff --git a/llvm/test/Analysis/RegionInfo/outgoing_edge.ll b/llvm/test/Analysis/RegionInfo/outgoing_edge.ll index 39e1a39d7e5b5..db4932f831c6a 100644 --- a/llvm/test/Analysis/RegionInfo/outgoing_edge.ll +++ b/llvm/test/Analysis/RegionInfo/outgoing_edge.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s ; While working on improvements to the region info analysis, this test diff --git a/llvm/test/Analysis/RegionInfo/outgoing_edge_1.ll b/llvm/test/Analysis/RegionInfo/outgoing_edge_1.ll index 6f51131a188c5..7f723cd6d4e25 100644 --- a/llvm/test/Analysis/RegionInfo/outgoing_edge_1.ll +++ b/llvm/test/Analysis/RegionInfo/outgoing_edge_1.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s ; While working on improvements to region info analysis, this test diff --git a/llvm/test/Analysis/RegionInfo/paper.ll b/llvm/test/Analysis/RegionInfo/paper.ll index bc0fb18a0e276..31ce58dc7d8c9 100644 --- a/llvm/test/Analysis/RegionInfo/paper.ll +++ b/llvm/test/Analysis/RegionInfo/paper.ll @@ -1,10 +1,13 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -regions -stats -disable-output < %s 2>&1 | FileCheck -check-prefix=STAT %s -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -print-region-style=bb -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=RNIT %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s +; RUN: opt < %s -passes='print' -stats 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -passes='print' -print-region-style=bb < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -passes='print' -print-region-style=rn < %s 2>&1 | FileCheck -check-prefix=RNIT %s define void @a_linear_impl_fig_1() nounwind { "0": diff --git a/llvm/test/Analysis/RegionInfo/two_loops_same_header.ll b/llvm/test/Analysis/RegionInfo/two_loops_same_header.ll index d230d76440f8c..8c6546d2ced5c 100644 --- a/llvm/test/Analysis/RegionInfo/two_loops_same_header.ll +++ b/llvm/test/Analysis/RegionInfo/two_loops_same_header.ll @@ -1,10 +1,13 @@ ; REQUIRES: asserts -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt -regions -stats -disable-output < %s 2>&1 | FileCheck -check-prefix=STAT %s -; RUN: opt -regions -print-region-style=bb -analyze < %s 2>&1 | FileCheck -check-prefix=BBIT %s -; RUN: opt -regions -print-region-style=rn -analyze < %s 2>&1 | FileCheck -check-prefix=RNIT %s +; RUN: opt -regions -print-region-style=bb 
-analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -regions -print-region-style=rn -analyze -enable-new-pm=0 < %s 2>&1 | FileCheck -check-prefix=RNIT %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s +; RUN: opt < %s -passes='print' -stats 2>&1 | FileCheck -check-prefix=STAT %s +; RUN: opt -passes='print' -print-region-style=bb < %s 2>&1 | FileCheck -check-prefix=BBIT %s +; RUN: opt -passes='print' -print-region-style=rn < %s 2>&1 | FileCheck -check-prefix=RNIT %s define internal fastcc zeroext i8 @handle_compress() nounwind { entry: diff --git a/llvm/test/Analysis/RegionInfo/unreachable_bb.ll b/llvm/test/Analysis/RegionInfo/unreachable_bb.ll index 5dd1be958e71a..6268fff522690 100644 --- a/llvm/test/Analysis/RegionInfo/unreachable_bb.ll +++ b/llvm/test/Analysis/RegionInfo/unreachable_bb.ll @@ -1,4 +1,4 @@ -; RUN: opt -regions -analyze < %s | FileCheck %s +; RUN: opt -regions -analyze -enable-new-pm=0 < %s | FileCheck %s ; RUN: opt < %s -passes='print' 2>&1 | FileCheck %s ; We should not crash if there are some bbs that are not reachable. diff --git a/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll new file mode 100644 index 0000000000000..0bbb8aace805f --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll @@ -0,0 +1,55 @@ +; RUN: opt -analyze -scalar-evolution %s | FileCheck %s + +; Test case for PR40961. The loop guard limit the max backedge-taken count. + +define void @test_guard_less_than_16(i32* nocapture %a, i64 %i) { +; CHECK-LABEL: Determining loop execution counts for: @test_guard_less_than_16 +; CHECK-NEXT: Loop %loop: backedge-taken count is (15 + (-1 * %i)) +; CHECK-NEXT: Loop %loop: max backedge-taken count is -1 +; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is (15 + (-1 * %i)) +; +entry: + %cmp3 = icmp ult i64 %i, 16 + br i1 %cmp3, label %loop, label %exit + +loop: + %iv = phi i64 [ %iv.next, %loop ], [ %i, %entry ] + %idx = getelementptr inbounds i32, i32* %a, i64 %iv + store i32 1, i32* %idx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 16 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +; Test case for PR47247. Both the guard condition and the assume limit the +; max backedge-taken count. 
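+; Worked arithmetic for the function below: the assume pins %count to the
+; unsigned range [0, 4] and the entry guard excludes 0, so %count lies in
+; [1, 4] and the backedge-taken count (-1 + %count) lies in [0, 3].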
+ +define void @test_guard_and_assume(i32* nocapture readonly %data, i64 %count) { +; CHECK-LABEL: Determining loop execution counts for: @test_guard_and_assume +; CHECK-NEXT: Loop %loop: backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: max backedge-taken count is -2 +; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is (-1 + %count) +; +entry: + %cmp = icmp ult i64 %count, 5 + tail call void @llvm.assume(i1 %cmp) + %cmp18.not = icmp eq i64 %count, 0 + br i1 %cmp18.not, label %exit, label %loop + +loop: + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + %idx = getelementptr inbounds i32, i32* %data, i64 %iv + store i32 1, i32* %idx, align 4 + %iv.next = add nuw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %count + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + +; Function Attrs: nounwind willreturn +declare void @llvm.assume(i1 noundef) diff --git a/llvm/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll b/llvm/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll index 648fcf707f9f6..116a0ce0f3afa 100644 --- a/llvm/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll +++ b/llvm/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll @@ -23,7 +23,7 @@ declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) nounwind ; CHECK: attributes #0 = { argmemonly nounwind readonly willreturn } -; CHECK: attributes #1 = { argmemonly nounwind willreturn } +; CHECK: attributes #1 = { argmemonly nounwind willreturn writeonly } ; CHECK: attributes [[NUW]] = { nounwind } !0 = !{!"tbaa root"} diff --git a/llvm/test/Assembler/ConstantExprNoFold.ll b/llvm/test/Assembler/ConstantExprNoFold.ll index 42e558eb38657..d91855925c897 100644 --- a/llvm/test/Assembler/ConstantExprNoFold.ll +++ b/llvm/test/Assembler/ConstantExprNoFold.ll @@ -42,6 +42,12 @@ target datalayout = "p:32:32" @empty.2 = external global [0 x i8], align 1 @empty.cmp = global i1 icmp eq ([0 x i8]* @empty.1, [0 x i8]* @empty.2) +; Two unnamed_addr globals can share an address +; CHECK: @unnamed.cmp = global i1 icmp eq ([5 x i8]* @unnamed.1, [5 x i8]* @unnamed.2) +@unnamed.1 = unnamed_addr constant [5 x i8] c"asdf\00" +@unnamed.2 = unnamed_addr constant [5 x i8] c"asdf\00" +@unnamed.cmp = global i1 icmp eq ([5 x i8]* @unnamed.1, [5 x i8]* @unnamed.2) + @addrspace3 = internal addrspace(3) global i32 undef ; CHECK: @no.fold.addrspace.icmp.eq.gv.null = global i1 icmp eq (i32 addrspace(3)* @addrspace3, i32 addrspace(3)* null) diff --git a/llvm/test/BugPoint/unsymbolized.ll b/llvm/test/BugPoint/unsymbolized.ll index d2060ddee168c..55aadc35884cb 100644 --- a/llvm/test/BugPoint/unsymbolized.ll +++ b/llvm/test/BugPoint/unsymbolized.ll @@ -3,7 +3,7 @@ ; RUN: echo "print('args = ' + str(sys.argv))" >> %t.py ; RUN: echo "exit(1)" >> %t.py ; RUN: not bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -opt-command=%python -opt-args %t.py | FileCheck %s -; RUN: not --crash opt -load %llvmshlibdir/BugpointPasses%shlibext %s -bugpoint-crashcalls -disable-symbolication 2>&1 | FileCheck --check-prefix=CRASH %s +; RUN: not --crash opt -enable-new-pm=0 -load %llvmshlibdir/BugpointPasses%shlibext %s -bugpoint-crashcalls -disable-symbolication 2>&1 | FileCheck --check-prefix=CRASH %s ; RUN: not bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -opt-command=%t.non.existent.opt.binary -opt-args %t.py 2>&1 | FileCheck %s 
--check-prefix=BAD-OPT ; Test that bugpoint disables symbolication on the opt tool to reduce runtime overhead when opt crashes diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index a6efd05e21f9b..2fda6cebe33fd 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -7,8 +7,8 @@ llvm_canonicalize_cmake_booleans( LLVM_ENABLE_FFI LLVM_ENABLE_THREADS LLVM_ENABLE_ZLIB + LLVM_ENABLE_LIBXML2 LLVM_INCLUDE_GO_TESTS - LLVM_LIBXML2_ENABLED LLVM_LINK_LLVM_DYLIB LLVM_TOOL_LTO_BUILD LLVM_USE_INTEL_JITEVENTS @@ -17,6 +17,7 @@ llvm_canonicalize_cmake_booleans( LLVM_BYE_LINK_INTO_TOOLS LLVM_HAVE_TF_AOT LLVM_HAVE_TF_API + LLVM_ENABLE_EXPENSIVE_CHECKS ) configure_lit_site_cfg( diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll index 0b3371501ef89..a90d899ec3aa4 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll @@ -107,8 +107,8 @@ end: ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: %{{[0-9]+}}:_(s96) = G_ADD %{{[0-9]+}}:_, %{{[0-9]+}}:_ (in function: nonpow2_add_narrowing) ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_add_narrowing ; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_add_narrowing: -define void @nonpow2_add_narrowing() { - %a = add i128 undef, undef +define void @nonpow2_add_narrowing(i128 %x, i128 %y) { + %a = add i128 %x, %y %b = trunc i128 %a to i96 %dummy = add i96 %b, %b store i96 %dummy, i96* undef diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-switch.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-switch.ll index 485fa62904f0a..64d9e9588eeeb 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-switch.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-switch.ll @@ -1313,10 +1313,8 @@ define i32 @range_test(i32 %x) { ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[C1]] ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ule), [[SUB]](s32), [[C5]] - ; CHECK: [[C6:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; CHECK: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[C6]] - ; CHECK: G_BRCOND [[XOR]](s1), %bb.4 - ; CHECK: G_BR %bb.2 + ; CHECK: G_BRCOND [[ICMP1]](s1), %bb.2 + ; CHECK: G_BR %bb.4 ; CHECK: bb.2.sw.bb: ; CHECK: successors: %bb.4(0x80000000) ; CHECK: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[COPY]], [[C3]] diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-fabs.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-fabs.mir new file mode 100644 index 0000000000000..a543e7cd4c7e4 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-fabs.mir @@ -0,0 +1,102 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s +# RUN: llc -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s + +--- +name: test_combine_fabs_fabs +body: | + bb.1: + liveins: $w0 + ; CHECK-LABEL: name: test_combine_fabs_fabs + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[FABS:%[0-9]+]]:_(s32) = G_FABS [[COPY]] + ; CHECK: $w0 = COPY [[FABS]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = G_FABS %0(s32) + %2:_(s32) = G_FABS %1(s32) + $w0 = COPY %2(s32) +... 
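+# fabs is idempotent (fabs(fabs(x)) == fabs(x)), so the combiner should
+# fold the second G_FABS away and leave a single G_FABS of the copy,
+# which is exactly what the CHECK lines above expect.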
+---
+name: test_combine_fabs_fabs_vec
+body: |
+  bb.1:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_combine_fabs_fabs_vec
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $x0
+    ; CHECK: [[FABS:%[0-9]+]]:_(<2 x s32>) = G_FABS [[COPY]]
+    ; CHECK: $x0 = COPY [[FABS]](<2 x s32>)
+    %0:_(<2 x s32>) = COPY $x0
+    %1:_(<2 x s32>) = G_FABS %0(<2 x s32>)
+    %2:_(<2 x s32>) = G_FABS %1(<2 x s32>)
+    $x0 = COPY %2(<2 x s32>)
+...
+---
+name: test_combine_half_fabs_neg_constant
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_half_fabs_neg_constant
+    ; CHECK: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH4580
+    ; CHECK: $h0 = COPY [[C]](s16)
+    %0:_(s16) = G_FCONSTANT half 0xHC580
+    %1:_(s16) = G_FABS %0
+    $h0 = COPY %1(s16)
+...
+---
+name: test_combine_half_fabs_pos_constant
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_half_fabs_pos_constant
+    ; CHECK: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH4580
+    ; CHECK: $h0 = COPY [[C]](s16)
+    %0:_(s16) = G_FCONSTANT half 0xH4580
+    %1:_(s16) = G_FABS %0
+    $h0 = COPY %1(s16)
+...
+---
+name: test_combine_float_fabs_neg_constant
+body: |
+  bb.1:
+    liveins: $w0
+    ; CHECK-LABEL: name: test_combine_float_fabs_neg_constant
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.500000e+00
+    ; CHECK: $w0 = COPY [[C]](s32)
+    %0:_(s32) = G_FCONSTANT float -5.500000e+00
+    %1:_(s32) = G_FABS %0
+    $w0 = COPY %1(s32)
+...
+---
+name: test_combine_float_fabs_pos_constant
+body: |
+  bb.1:
+    liveins: $w0
+    ; CHECK-LABEL: name: test_combine_float_fabs_pos_constant
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.500000e+00
+    ; CHECK: $w0 = COPY [[C]](s32)
+    %0:_(s32) = G_FCONSTANT float 5.500000e+00
+    %1:_(s32) = G_FABS %0
+    $w0 = COPY %1(s32)
+...
+---
+name: test_combine_double_fabs_neg_constant
+body: |
+  bb.1:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_combine_double_fabs_neg_constant
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 4.200000e+00
+    ; CHECK: $x0 = COPY [[C]](s64)
+    %0:_(s64) = G_FCONSTANT double -4.200000e+00
+    %1:_(s64) = G_FABS %0
+    $x0 = COPY %1(s64)
+...
+---
+name: test_combine_double_fabs_pos_constant
+body: |
+  bb.1:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_combine_double_fabs_pos_constant
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 4.200000e+00
+    ; CHECK: $x0 = COPY [[C]](s64)
+    %0:_(s64) = G_FCONSTANT double 4.200000e+00
+    %1:_(s64) = G_FABS %0
+    $x0 = COPY %1(s64)
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-flog2.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-flog2.mir
new file mode 100644
index 0000000000000..9e7e279e9e1a3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-flog2.mir
@@ -0,0 +1,36 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s
+
+---
+name: test_combine_half_flog2_constant
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_half_flog2_constant
+    ; CHECK: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH4000
+    ; CHECK: $h0 = COPY [[C]](s16)
+    %0:_(s16) = G_FCONSTANT half 4.000000e+00
+    %1:_(s16) = G_FLOG2 %0
+    $h0 = COPY %1(s16)
+...
+---
+name: test_combine_float_flog2_constant
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_float_flog2_constant
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 2.000000e+00
+    ; CHECK: $w0 = COPY [[C]](s32)
+    %0:_(s32) = G_FCONSTANT float 4.000000e+00
+    %1:_(s32) = G_FLOG2 %0
+    $w0 = COPY %1(s32)
+...
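
The flog2 fold evaluates the intrinsic on the constant at compile time; for the 4.0 inputs in this file the result is exactly 2.0, whose binary16 pattern is the 0xH4000 the CHECK lines expect. A quick cross-check in Python (a sketch, not part of the patch):

    import math
    import struct

    assert math.log2(4.0) == 2.0  # G_FLOG2 of the constant 4.0 folds to 2.0
    # binary16 for 2.0: sign 0, biased exponent 16, zero mantissa -> 0x4000
    assert struct.unpack('<H', struct.pack('<e', 2.0))[0] == 0x4000
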
+--- +name: test_combine_double_flog2_constant +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_double_flog2_constant + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 2.000000e+00 + ; CHECK: $x0 = COPY [[C]](s64) + %0:_(s64) = G_FCONSTANT double 4.000000e+00 + %1:_(s64) = G_FLOG2 %0 + $x0 = COPY %1(s64) +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-fneg.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-fneg.mir new file mode 100644 index 0000000000000..1b1077854b4c1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-fneg.mir @@ -0,0 +1,94 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s +--- +name: test_combine_fneg_fneg +body: | + bb.1: + liveins: $w0 + ; CHECK-LABEL: name: test_combine_fneg_fneg + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: $w0 = COPY [[COPY]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = G_FNEG %0(s32) + %2:_(s32) = G_FNEG %1(s32) + $w0 = COPY %2(s32) +... +--- +name: test_combine_fneg_fneg_vec +body: | + bb.1: + liveins: $x0 + ; CHECK-LABEL: name: test_combine_fneg_fneg_vec + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $x0 + ; CHECK: $x0 = COPY [[COPY]](<2 x s32>) + %0:_(<2 x s32>) = COPY $x0 + %1:_(<2 x s32>) = G_FNEG %0(<2 x s32>) + %2:_(<2 x s32>) = G_FNEG %1(<2 x s32>) + $x0 = COPY %2(<2 x s32>) +... +--- +name: test_combine_half_fneg_neg_constant +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_half_fneg_neg_constant + ; CHECK: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH4580 + ; CHECK: $h0 = COPY [[C]](s16) + %0:_(s16) = G_FCONSTANT half 0xHC580 + %1:_(s16) = G_FNEG %0 + $h0 = COPY %1(s16) +... +--- +name: test_combine_half_fneg_pos_constant +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_half_fneg_pos_constant + ; CHECK: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xHC580 + ; CHECK: $h0 = COPY [[C]](s16) + %0:_(s16) = G_FCONSTANT half 0xH4580 + %1:_(s16) = G_FNEG %0 + $h0 = COPY %1(s16) +... +--- +name: test_combine_float_fneg_neg_constant +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_float_fneg_neg_constant + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.500000e+00 + ; CHECK: $w0 = COPY [[C]](s32) + %0:_(s32) = G_FCONSTANT float -5.500000e+00 + %1:_(s32) = G_FNEG %0 + $w0 = COPY %1(s32) +... +--- +name: test_combine_float_fneg_pos_constant +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_float_fneg_pos_constant + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float -5.500000e+00 + ; CHECK: $w0 = COPY [[C]](s32) + %0:_(s32) = G_FCONSTANT float 5.500000e+00 + %1:_(s32) = G_FNEG %0 + $w0 = COPY %1(s32) +... +--- +name: test_combine_double_fneg_neg_constant +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_double_fneg_neg_constant + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 4.200000e+00 + ; CHECK: $x0 = COPY [[C]](s64) + %0:_(s64) = G_FCONSTANT double -4.200000e+00 + %1:_(s64) = G_FNEG %0 + $x0 = COPY %1(s64) +... +--- +name: test_combine_double_fneg_pos_constant +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_double_fneg_pos_constant + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double -4.200000e+00 + ; CHECK: $x0 = COPY [[C]](s64) + %0:_(s64) = G_FCONSTANT double 4.200000e+00 + %1:_(s64) = G_FNEG %0 + $x0 = COPY %1(s64) +... 
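
combine-fneg.mir above relies on the same bit-level picture: G_FNEG flips only the sign bit, so a double negation restores the original pattern and a constant folds by XOR with the sign mask. A sketch in Python (illustrative helpers, assuming IEEE-754 binary32):

    import struct

    def f32_bits(x):
        return struct.unpack('<I', struct.pack('<f', x))[0]

    def f32_from_bits(b):
        return struct.unpack('<f', struct.pack('<I', b))[0]

    SIGN = 0x80000000
    assert f32_from_bits(f32_bits(5.5) ^ SIGN) == -5.5          # fneg folds a constant
    assert f32_from_bits(f32_bits(-5.5) ^ SIGN ^ SIGN) == -5.5  # fneg(fneg(x)) -> x
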
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-fptrunc.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-fptrunc.mir new file mode 100644 index 0000000000000..1fd7f6f39caca --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-fptrunc.mir @@ -0,0 +1,36 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s + +--- +name: test_combine_float_to_half_fptrunc_constant +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_float_to_half_fptrunc_constant + ; CHECK: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH4580 + ; CHECK: $h0 = COPY [[C]](s16) + %0:_(s32) = G_FCONSTANT float 5.500000e+00 + %1:_(s16) = G_FPTRUNC %0(s32) + $h0 = COPY %1(s16) +... +--- +name: test_combine_double_to_half_fptrunc_constant +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_double_to_half_fptrunc_constant + ; CHECK: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH4433 + ; CHECK: $h0 = COPY [[C]](s16) + %0:_(s64) = G_FCONSTANT double 4.200000e+00 + %1:_(s16) = G_FPTRUNC %0(s64) + $h0 = COPY %1(s16) +... +--- +name: test_combine_double_to_foat_fptrunc_constant +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_double_to_foat_fptrunc_constant + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x4010CCCCC0000000 + ; CHECK: $w0 = COPY [[C]](s32) + %0:_(s64) = G_FCONSTANT double 4.200000e+00 + %1:_(s32) = G_FPTRUNC %0(s64) + $w0 = COPY %1(s32) +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-fsqrt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-fsqrt.mir new file mode 100644 index 0000000000000..e114d01793167 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-fsqrt.mir @@ -0,0 +1,39 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s + +--- +name: test_combine_half_fsqrt_constant +body: | + bb.1: + liveins: + ; CHECK-LABEL: name: test_combine_half_fsqrt_constant + ; CHECK: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH4000 + ; CHECK: $h0 = COPY [[C]](s16) + %0:_(s16) = G_FCONSTANT half 4.000000e+00 + %1:_(s16) = G_FSQRT %0 + $h0 = COPY %1 +... +--- +name: test_combine_float_fsqrt_constant +body: | + bb.1: + liveins: + ; CHECK-LABEL: name: test_combine_float_fsqrt_constant + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK: $w0 = COPY [[C]](s32) + %0:_(s32) = G_FCONSTANT float 4.000000e+00 + %1:_(s32) = G_FSQRT %0 + $w0 = COPY %1 +... +--- +name: test_combine_double_fsqrt_constant +body: | + bb.1: + liveins: + ; CHECK-LABEL: name: test_combine_double_fsqrt_constant + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 2.000000e+00 + ; CHECK: $x0 = COPY [[C]](s64) + %0:_(s64) = G_FCONSTANT double 4.000000e+00 + %1:_(s64) = G_FSQRT %0 + $x0 = COPY %1 +... 
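
The folded constants in combine-fptrunc.mir and combine-fsqrt.mir can be reproduced outside LLVM; the only assumption is round-to-nearest-even, which Python's struct packing also uses. In particular, 4.2 rounds to the half pattern 0xH4433 and to the float that LLVM prints as the double hex 0x4010CCCCC0000000 (helper names below are illustrative):

    import math
    import struct

    def to_half_bits(x):
        # Round a Python float (binary64) to binary16, return the bit pattern.
        return struct.unpack('<H', struct.pack('<e', x))[0]

    def to_f32_as_f64_bits(x):
        # Round to binary32, then re-encode as binary64, LLVM's printing scheme.
        f32 = struct.unpack('<f', struct.pack('<f', x))[0]
        return struct.unpack('<Q', struct.pack('<d', f32))[0]

    assert to_half_bits(5.5) == 0x4580                   # float 5.5  -> half 0xH4580
    assert to_half_bits(4.2) == 0x4433                   # double 4.2 -> half 0xH4433
    assert to_f32_as_f64_bits(4.2) == 0x4010CCCCC0000000 # double 4.2 -> float
    assert math.sqrt(4.0) == 2.0                         # the G_FSQRT fold
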
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-mul.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-mul.mir new file mode 100644 index 0000000000000..2f911693fd244 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-mul.mir @@ -0,0 +1,134 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s + +--- +name: mul_by_zero +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: mul_by_zero + ; CHECK: liveins: $x0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK: $x0 = COPY [[C]](s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = G_CONSTANT i64 0 + %2:_(s64) = G_MUL %0, %1(s64) + $x0 = COPY %2(s64) +... +--- +name: mul_vector_by_zero +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $q0 + ; Currently not implemented. + ; CHECK-LABEL: name: mul_vector_by_zero + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) + ; CHECK: [[MUL:%[0-9]+]]:_(<4 x s32>) = G_MUL [[COPY]], [[BUILD_VECTOR]] + ; CHECK: $q0 = COPY [[MUL]](<4 x s32>) + %0:_(<4 x s32>) = COPY $q0 + %1:_(s32) = G_CONSTANT i32 0 + %2:_(<4 x s32>) = G_BUILD_VECTOR %1(s32), %1(s32), %1(s32), %1(s32) + %3:_(<4 x s32>) = G_MUL %0, %2(<4 x s32>) + $q0 = COPY %3(<4 x s32>) +... +--- +name: mul_by_one +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: mul_by_one + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: $x0 = COPY [[COPY]](s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = G_CONSTANT i64 1 + %2:_(s64) = G_MUL %0, %1(s64) + $x0 = COPY %2(s64) +... +--- +name: mul_vector_by_one +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $q0 + ; Currently not implemented. + ; CHECK-LABEL: name: mul_vector_by_one + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) + ; CHECK: [[MUL:%[0-9]+]]:_(<4 x s32>) = G_MUL [[COPY]], [[BUILD_VECTOR]] + ; CHECK: $q0 = COPY [[MUL]](<4 x s32>) + %0:_(<4 x s32>) = COPY $q0 + %1:_(s32) = G_CONSTANT i32 1 + %2:_(<4 x s32>) = G_BUILD_VECTOR %1(s32), %1(s32), %1(s32), %1(s32) + %3:_(<4 x s32>) = G_MUL %0, %2(<4 x s32>) + $q0 = COPY %3(<4 x s32>) +... +--- +name: mul_by_neg_one +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: mul_by_neg_one + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C]], [[COPY]] + ; CHECK: $x0 = COPY [[SUB]](s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = G_CONSTANT i64 -1 + %2:_(s64) = G_MUL %0, %1(s64) + $x0 = COPY %2(s64) +... 
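
The scalar G_MUL identities above are ordinary modular arithmetic on the 64-bit lane; in particular x * -1 equals 0 - x, which is why the combiner's CHECK lines show mul_by_neg_one rewritten to G_SUB. A small check in Python, with -1 spelled as 2**64 - 1:

    MASK = (1 << 64) - 1
    NEG1 = MASK                      # two's-complement -1 in 64 bits
    for x in (0, 1, 7, 2**63 - 1, 2**64 - 5):
        assert (x * 0) & MASK == 0                  # mul_by_zero  -> G_CONSTANT 0
        assert (x * 1) & MASK == x & MASK           # mul_by_one   -> plain copy
        assert (x * NEG1) & MASK == (0 - x) & MASK  # mul_by_neg_one -> G_SUB 0, x
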
+--- +name: mul_vector_by_neg_one +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $q0 + ; Currently not implemented. + ; CHECK-LABEL: name: mul_vector_by_neg_one + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) + ; CHECK: [[MUL:%[0-9]+]]:_(<4 x s32>) = G_MUL [[COPY]], [[BUILD_VECTOR]] + ; CHECK: $q0 = COPY [[MUL]](<4 x s32>) + %0:_(<4 x s32>) = COPY $q0 + %1:_(s32) = G_CONSTANT i32 -1 + %2:_(<4 x s32>) = G_BUILD_VECTOR %1(s32), %1(s32), %1(s32), %1(s32) + %3:_(<4 x s32>) = G_MUL %0, %2(<4 x s32>) + $q0 = COPY %3(<4 x s32>) +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shl.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shl.mir new file mode 100644 index 0000000000000..fe75f9965bc90 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shl.mir @@ -0,0 +1,29 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s +--- +name: test_combine_shl_undef_x_s32 +body: | + bb.1: + liveins: $w0 + ; CHECK-LABEL: name: test_combine_shl_undef_x_s32 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: $w0 = COPY [[C]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = G_IMPLICIT_DEF + %2:_(s32) = G_SHL %1(s32), %0(s32) + $w0 = COPY %2(s32) +... +--- +name: test_combine_shl_undef_x_v2s32 +body: | + bb.1: + liveins: $d0 + ; CHECK-LABEL: name: test_combine_shl_undef_x_v2s32 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32) + ; CHECK: $d0 = COPY [[BUILD_VECTOR]](<2 x s32>) + %0:_(<2 x s32>) = COPY $d0 + %1:_(<2 x s32>) = G_IMPLICIT_DEF + %2:_(<2 x s32>) = G_SHL %1(<2 x s32>), %0(<2 x s32>) + $d0 = COPY %2(<2 x s32>) +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-trunc.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-trunc.mir new file mode 100644 index 0000000000000..eb1652cc0dba0 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-trunc.mir @@ -0,0 +1,142 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s +--- +name: test_combine_trunc_undef +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_undef + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: $w0 = COPY [[DEF]](s32) + %0:_(s64) = G_IMPLICIT_DEF + %1:_(s32) = G_TRUNC %0(s64) + $w0 = COPY %1(s32) +... +--- +name: test_combine_trunc_undef_vec +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_undef_vec + ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF + ; CHECK: $x0 = COPY [[DEF]](<2 x s32>) + %0:_(<2 x s64>) = G_IMPLICIT_DEF + %1:_(<2 x s32>) = G_TRUNC %0(<2 x s64>) + $x0 = COPY %1(<2 x s32>) +... +--- +name: test_combine_trunc_anyext_s32_s16 +body: | + bb.1: + liveins: $h0 + ; CHECK-LABEL: name: test_combine_trunc_anyext_s32_s16 + ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY $h0 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + %0:_(s16) = COPY $h0 + %1:_(s64) = G_ANYEXT %0(s16) + %2:_(s32) = G_TRUNC %1(s64) + $w0 = COPY %2(s32) +... 
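
The G_TRUNC(G_ANYEXT/G_SEXT/G_ZEXT x) rewrites exercised here and in the tests that follow are width bookkeeping: extending and then truncating can only keep or re-narrow the original low bits, so the pair collapses to a single extension, or to x itself when the widths match. Modeled on bit patterns in Python (zext case shown for concreteness):

    def trunc(x, w):
        return x & ((1 << w) - 1)

    x16 = 0xABCD                 # a 16-bit source value
    x64 = x16                    # zext s16 -> s64 leaves the low bits unchanged
    assert trunc(x64, 32) == trunc(x16, 32)  # trunc(zext x) -> zext x at s32
    assert trunc(x64, 16) == x16             # matching widths: the pair vanishes
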
+--- +name: test_combine_trunc_anyext_s32_s16_vec +body: | + bb.1: + liveins: $s0 + ; CHECK-LABEL: name: test_combine_trunc_anyext_s32_s16_vec + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $s0 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(<2 x s32>) = G_ANYEXT [[COPY]](<2 x s16>) + ; CHECK: $x0 = COPY [[ANYEXT]](<2 x s32>) + %0:_(<2 x s16>) = COPY $s0 + %1:_(<2 x s64>) = G_ANYEXT %0(<2 x s16>) + %2:_(<2 x s32>) = G_TRUNC %1(<2 x s64>) + $x0 = COPY %2(<2 x s32>) +... +--- +name: test_combine_trunc_sext_s32_s16 +body: | + bb.1: + liveins: $h0 + ; CHECK-LABEL: name: test_combine_trunc_sext_s32_s16 + ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY $h0 + ; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s16) + ; CHECK: $w0 = COPY [[SEXT]](s32) + %0:_(s16) = COPY $h0 + %1:_(s64) = G_SEXT %0(s16) + %2:_(s32) = G_TRUNC %1(s64) + $w0 = COPY %2(s32) +... +--- +name: test_combine_trunc_zext_s32_s16 +body: | + bb.1: + liveins: $h0 + ; CHECK-LABEL: name: test_combine_trunc_zext_s32_s16 + ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY $h0 + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s16) + ; CHECK: $w0 = COPY [[ZEXT]](s32) + %0:_(s16) = COPY $h0 + %1:_(s64) = G_ZEXT %0(s16) + %2:_(s32) = G_TRUNC %1(s64) + $w0 = COPY %2(s32) +... +--- +name: test_combine_trunc_anyext_s32_s32 +body: | + bb.1: + liveins: $w0 + ; CHECK-LABEL: name: test_combine_trunc_anyext_s32_s32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: $w0 = COPY [[COPY]](s32) + %0:_(s32) = COPY $w0 + %1:_(s64) = G_ANYEXT %0(s32) + %2:_(s32) = G_TRUNC %1(s64) + $w0 = COPY %2(s32) +... +--- +name: test_combine_trunc_anyext_s32_s64 +body: | + bb.1: + liveins: $x0 + ; CHECK-LABEL: name: test_combine_trunc_anyext_s32_s64 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK: $w0 = COPY [[TRUNC]](s32) + %0:_(s64) = COPY $x0 + %1:_(s128) = G_ANYEXT %0(s64) + %2:_(s32) = G_TRUNC %1(s128) + $w0 = COPY %2(s32) +... +--- +name: test_combine_trunc_shl_s32_by_2 +body: | + bb.1: + liveins: $w0 + ; CHECK-LABEL: name: test_combine_trunc_shl_s32_by_2 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32) + ; CHECK: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16) + ; CHECK: $h0 = COPY [[SHL]](s16) + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 2 + %2:_(s32) = G_SHL %0(s32), %1(s32) + %3:_(s16) = G_TRUNC %2(s32) + $h0 = COPY %3(s16) +... +--- +name: test_combine_trunc_shl_s32_by_17 +body: | + bb.1: + liveins: $w0 + ; CHECK-LABEL: name: test_combine_trunc_shl_s32_by_17 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 17 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; CHECK: $h0 = COPY [[TRUNC]](s16) + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 17 + %2:_(s32) = G_SHL %0(s32), %1(s32) + %3:_(s16) = G_TRUNC %2(s32) + $h0 = COPY %3(s16) +... 
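
The last two tests above draw the boundary for trunc(shl): a shift by 2 commutes with truncation to s16 because the low 16 result bits depend only on bits that survive the truncation, while a shift by 17 is not even a valid s16 shift amount and every low bit it produces comes from above the s16 boundary. In plain integers:

    M16 = 0xFFFF
    for x in (0x1234, 0xDEADBEEF, 0xFFFFFFFF):
        assert ((x << 2) & M16) == (((x & M16) << 2) & M16)  # shl 2 narrows fine
        assert ((x << 17) & M16) == 0                        # shl 17: nothing survives
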
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir new file mode 100644 index 0000000000000..53c75b4d84d95 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir @@ -0,0 +1,478 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s + +# Simple unmerge(merge) case with two operands. +# The sources of the merge can be used in place of +# the destinations of the unmerge. +--- +name: test_combine_unmerge_merge +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_unmerge_merge + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: $w0 = COPY [[DEF]](s32) + ; CHECK: $w1 = COPY [[DEF1]](s32) + %0:_(s32) = G_IMPLICIT_DEF + %1:_(s32) = G_IMPLICIT_DEF + %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32) + %3:_(s32), %4:_(s32) = G_UNMERGE_VALUES %2(s64) + $w0 = COPY %3(s32) + $w1 = COPY %4(s32) +... + +# Simple unmerge(merge) case with three operands. +# The sources of the merge can be used in place of +# the destinations of the unmerge. +--- +name: test_combine_unmerge_merge_3ops +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_unmerge_merge_3ops + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: $w0 = COPY [[DEF]](s32) + ; CHECK: $w1 = COPY [[DEF1]](s32) + ; CHECK: $w2 = COPY [[DEF2]](s32) + %0:_(s32) = G_IMPLICIT_DEF + %1:_(s32) = G_IMPLICIT_DEF + %5:_(s32) = G_IMPLICIT_DEF + %2:_(s96) = G_MERGE_VALUES %0(s32), %1(s32), %5(s32) + %3:_(s32), %4:_(s32), %6:_(s32) = G_UNMERGE_VALUES %2(s96) + $w0 = COPY %3(s32) + $w1 = COPY %4(s32) + $w2 = COPY %6(s32) +... + +# Simple unmerge(buildvector) case with two operands. +# The sources of the buildvector can be used in place of +# the destinations of the unmerge. +--- +name: test_combine_unmerge_build_vector +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_unmerge_build_vector + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: $w0 = COPY [[DEF]](s32) + ; CHECK: $w1 = COPY [[DEF1]](s32) + %0:_(s32) = G_IMPLICIT_DEF + %1:_(s32) = G_IMPLICIT_DEF + %2:_(<2 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32) + %3:_(s32), %4:_(s32) = G_UNMERGE_VALUES %2(<2 x s32>) + $w0 = COPY %3(s32) + $w1 = COPY %4(s32) +... + +# Simple unmerge(buildvector) case with three operands. +# The sources of the buildvector can be used in place of +# the destinations of the unmerge. +--- +name: test_combine_unmerge_buildvector_3ops +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_unmerge_buildvector_3ops + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: $w0 = COPY [[DEF]](s32) + ; CHECK: $w1 = COPY [[DEF1]](s32) + ; CHECK: $w2 = COPY [[DEF2]](s32) + %0:_(s32) = G_IMPLICIT_DEF + %1:_(s32) = G_IMPLICIT_DEF + %5:_(s32) = G_IMPLICIT_DEF + %2:_(<3 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %5(s32) + %3:_(s32), %4:_(s32), %6:_(s32) = G_UNMERGE_VALUES %2(<3 x s32>) + $w0 = COPY %3(s32) + $w1 = COPY %4(s32) + $w2 = COPY %6(s32) +... + +# Simple unmerge(concatvectors) case. +# The sources of the concatvectors can be used in place of +# the destinations of the unmerge. 
+---
+name: test_combine_unmerge_concat_vectors
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_unmerge_concat_vectors
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $w0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $w1
+    ; CHECK: $w0 = COPY [[COPY]](<2 x s16>)
+    ; CHECK: $w1 = COPY [[COPY1]](<2 x s16>)
+    %0:_(<2 x s16>) = COPY $w0
+    %1:_(<2 x s16>) = COPY $w1
+    %2:_(<4 x s16>) = G_CONCAT_VECTORS %0(<2 x s16>), %1(<2 x s16>)
+    %3:_(<2 x s16>), %4:_(<2 x s16>) = G_UNMERGE_VALUES %2(<4 x s16>)
+    $w0 = COPY %3(<2 x s16>)
+    $w1 = COPY %4(<2 x s16>)
+...
+
+# Unmerge(merge) case with two operands and a bitcast in the middle.
+# The sources of the merge can be used in place of
+# the destinations of the unmerge.
+---
+name: test_combine_unmerge_bitcast_merge
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_unmerge_bitcast_merge
+    ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; CHECK: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; CHECK: $w0 = COPY [[DEF]](s32)
+    ; CHECK: $w1 = COPY [[DEF1]](s32)
+    %0:_(s32) = G_IMPLICIT_DEF
+    %1:_(s32) = G_IMPLICIT_DEF
+    %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
+    %5:_(<2 x s32>) = G_BITCAST %2(s64)
+    %3:_(s32), %4:_(s32) = G_UNMERGE_VALUES %5(<2 x s32>)
+    $w0 = COPY %3(s32)
+    $w1 = COPY %4(s32)
+...
+
+# Unmerge(merge) with incompatible types: unmerge destTy != merge inputTy.
+# The sources of the merge cannot be used in place of
+# the destinations of the unmerge, since the types don't match.
+---
+name: test_combine_unmerge_merge_incompatible_types
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_unmerge_merge_incompatible_types
+    ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; CHECK: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[DEF]](s32), [[DEF1]](s32)
+    ; CHECK: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[MV]](s64)
+    ; CHECK: $h0 = COPY [[UV]](s16)
+    ; CHECK: $h1 = COPY [[UV1]](s16)
+    ; CHECK: $h2 = COPY [[UV2]](s16)
+    ; CHECK: $h3 = COPY [[UV3]](s16)
+    %0:_(s32) = G_IMPLICIT_DEF
+    %1:_(s32) = G_IMPLICIT_DEF
+    %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
+    %3:_(s16), %4:_(s16), %5:_(s16), %6:_(s16) = G_UNMERGE_VALUES %2(s64)
+    $h0 = COPY %3(s16)
+    $h1 = COPY %4(s16)
+    $h2 = COPY %5(s16)
+    $h3 = COPY %6(s16)
+...
+
+# Unmerge(concatvectors) with incompatible types: unmerge destTy != merge inputTy
+# but destTy.size() == inputTy.size().
+# The sources of the concatvectors can be used in place of
+# the destinations of the unmerge with a bitcast since the sizes
+# match.
+---
+name: test_combine_unmerge_merge_incompatible_types_but_same_size
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_unmerge_merge_incompatible_types_but_same_size
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $w0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $w1
+    ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; CHECK: $w0 = COPY [[BITCAST]](s32)
+    ; CHECK: $w1 = COPY [[BITCAST1]](s32)
+    %0:_(<2 x s16>) = COPY $w0
+    %1:_(<2 x s16>) = COPY $w1
+    %2:_(<4 x s16>) = G_CONCAT_VECTORS %0(<2 x s16>), %1(<2 x s16>)
+    %5:_(s64) = G_BITCAST %2(<4 x s16>)
+    %3:_(s32), %4:_(s32) = G_UNMERGE_VALUES %5(s64)
+    $w0 = COPY %3(s32)
+    $w1 = COPY %4(s32)
+...
+
+# Unmerge a constant into a bunch of smaller constants.
+# Constant is 0x0102030405060708090a0b0c0d0e0f10 and we break it down into
+# bytes:
+# cst1 0x10
+# cst2 0x0f
+# cst3 0x0e
+# ...
+---
+name: test_combine_unmerge_cst
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_unmerge_cst
+    ; CHECK: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 16
+    ; CHECK: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 15
+    ; CHECK: [[C2:%[0-9]+]]:_(s8) = G_CONSTANT i8 14
+    ; CHECK: [[C3:%[0-9]+]]:_(s8) = G_CONSTANT i8 13
+    ; CHECK: [[C4:%[0-9]+]]:_(s8) = G_CONSTANT i8 12
+    ; CHECK: [[C5:%[0-9]+]]:_(s8) = G_CONSTANT i8 11
+    ; CHECK: [[C6:%[0-9]+]]:_(s8) = G_CONSTANT i8 10
+    ; CHECK: [[C7:%[0-9]+]]:_(s8) = G_CONSTANT i8 9
+    ; CHECK: [[C8:%[0-9]+]]:_(s8) = G_CONSTANT i8 8
+    ; CHECK: [[C9:%[0-9]+]]:_(s8) = G_CONSTANT i8 7
+    ; CHECK: [[C10:%[0-9]+]]:_(s8) = G_CONSTANT i8 6
+    ; CHECK: [[C11:%[0-9]+]]:_(s8) = G_CONSTANT i8 5
+    ; CHECK: [[C12:%[0-9]+]]:_(s8) = G_CONSTANT i8 4
+    ; CHECK: [[C13:%[0-9]+]]:_(s8) = G_CONSTANT i8 3
+    ; CHECK: [[C14:%[0-9]+]]:_(s8) = G_CONSTANT i8 2
+    ; CHECK: [[C15:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
+    ; CHECK: $b0 = COPY [[C]](s8)
+    ; CHECK: $b1 = COPY [[C1]](s8)
+    ; CHECK: $b2 = COPY [[C2]](s8)
+    ; CHECK: $b3 = COPY [[C3]](s8)
+    ; CHECK: $b4 = COPY [[C4]](s8)
+    ; CHECK: $b5 = COPY [[C5]](s8)
+    ; CHECK: $b6 = COPY [[C6]](s8)
+    ; CHECK: $b7 = COPY [[C7]](s8)
+    ; CHECK: $b8 = COPY [[C8]](s8)
+    ; CHECK: $b9 = COPY [[C9]](s8)
+    ; CHECK: $b10 = COPY [[C10]](s8)
+    ; CHECK: $b11 = COPY [[C11]](s8)
+    ; CHECK: $b12 = COPY [[C12]](s8)
+    ; CHECK: $b13 = COPY [[C13]](s8)
+    ; CHECK: $b14 = COPY [[C14]](s8)
+    ; CHECK: $b15 = COPY [[C15]](s8)
+    %0:_(s128) = G_CONSTANT i128 1339673755198158349044581307228491536
+    %1:_(s8),%2:_(s8),%3:_(s8),%4:_(s8),%5:_(s8),%6:_(s8),%7:_(s8),%8:_(s8),%9:_(s8),%10:_(s8),%11:_(s8),%12:_(s8),%13:_(s8),%14:_(s8),%15:_(s8),%16:_(s8) = G_UNMERGE_VALUES %0(s128)
+    $b0 = COPY %1(s8)
+    $b1 = COPY %2(s8)
+    $b2 = COPY %3(s8)
+    $b3 = COPY %4(s8)
+    $b4 = COPY %5(s8)
+    $b5 = COPY %6(s8)
+    $b6 = COPY %7(s8)
+    $b7 = COPY %8(s8)
+    $b8 = COPY %9(s8)
+    $b9 = COPY %10(s8)
+    $b10 = COPY %11(s8)
+    $b11 = COPY %12(s8)
+    $b12 = COPY %13(s8)
+    $b13 = COPY %14(s8)
+    $b14 = COPY %15(s8)
+    $b15 = COPY %16(s8)
+...
+
+# Unmerge a constant on a non-power-of-2 type into a bunch of smaller constants.
+# The constant is 3 | 2 | 1 in 13-bit chunks.
+---
+name: test_combine_unmerge_cst_36bit
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_unmerge_cst_36bit
+    ; CHECK: [[C:%[0-9]+]]:_(s13) = G_CONSTANT i13 1
+    ; CHECK: [[C1:%[0-9]+]]:_(s13) = G_CONSTANT i13 2
+    ; CHECK: [[C2:%[0-9]+]]:_(s13) = G_CONSTANT i13 3
+    ; CHECK: [[ZEXT:%[0-9]+]]:_(s16) = G_ZEXT [[C]](s13)
+    ; CHECK: [[ZEXT1:%[0-9]+]]:_(s16) = G_ZEXT [[C1]](s13)
+    ; CHECK: [[ZEXT2:%[0-9]+]]:_(s16) = G_ZEXT [[C2]](s13)
+    ; CHECK: $h0 = COPY [[ZEXT]](s16)
+    ; CHECK: $h1 = COPY [[ZEXT1]](s16)
+    ; CHECK: $h2 = COPY [[ZEXT2]](s16)
+    %0:_(s39) = G_CONSTANT i39 201342977
+    %1:_(s13),%2:_(s13),%3:_(s13) = G_UNMERGE_VALUES %0(s39)
+    %4:_(s16) = G_ZEXT %1(s13)
+    %5:_(s16) = G_ZEXT %2(s13)
+    %6:_(s16) = G_ZEXT %3(s13)
+    $h0 = COPY %4(s16)
+    $h1 = COPY %5(s16)
+    $h2 = COPY %6(s16)
+...
+
+# Unmerge a floating point constant.
+---
+name: test_combine_unmerge_fpcst
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_unmerge_fpcst
+    ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+    ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
+    ; CHECK: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 3
+    ; CHECK: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
+    ; CHECK: $h0 = COPY [[C]](s16)
+    ; CHECK: $h1 = COPY [[C1]](s16)
+    ; CHECK: $h2 = COPY [[C2]](s16)
+    ; CHECK: $h3 = COPY [[C3]](s16)
+    %0:_(s64) = G_FCONSTANT double 0x0004000300020001
+    %1:_(s16),%2:_(s16),%3:_(s16),%4:_(s16) = G_UNMERGE_VALUES %0(s64)
+    $h0 = COPY %1(s16)
+    $h1 = COPY %2(s16)
+    $h2 = COPY %3(s16)
+    $h3 = COPY %4(s16)
+...
+
+# Transform unmerge into trunc when only the first definition is live.
+---
+name: test_combine_unmerge_dead_to_trunc
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_unmerge_dead_to_trunc
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+    ; CHECK: $h0 = COPY [[TRUNC]](s16)
+    %0:_(s64) = COPY $x0
+    %1:_(s16),%2:_(s16),%3:_(s16),%4:_(s16) = G_UNMERGE_VALUES %0(s64)
+    $h0 = COPY %1(s16)
+...
+
+# Don't transform unmerge into trunc when middle lanes are live.
+---
+name: test_dont_combine_unmerge_dead_to_trunc
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_dont_combine_unmerge_dead_to_trunc
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: $h0 = COPY [[UV2]](s16)
+    %0:_(s64) = COPY $x0
+    %1:_(s16),%2:_(s16),%3:_(s16),%4:_(s16) = G_UNMERGE_VALUES %0(s64)
+    $h0 = COPY %3(s16)
+...
+
+# Transform unmerge into trunc when only the first definition is live, even
+# if the input and output types are vectors.
+---
+name: test_combine_unmerge_dead_to_trunc_vec_in_n_out
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_unmerge_dead_to_trunc_vec_in_n_out
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $x0
+    ; CHECK: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[COPY]](<2 x s32>)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
+    ; CHECK: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[TRUNC]](s32)
+    ; CHECK: $w0 = COPY [[BITCAST1]](<2 x s16>)
+    %0:_(<2 x s32>) = COPY $x0
+    %1:_(<2 x s16>),%2:_(<2 x s16>) = G_UNMERGE_VALUES %0(<2 x s32>)
+    $w0 = COPY %1(<2 x s16>)
+...
+
+# Transform unmerge into trunc when only the first definition is live, even
+# if the input type is a vector.
+---
+name: test_combine_unmerge_dead_to_trunc_vec_in
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_unmerge_dead_to_trunc_vec_in
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $x0
+    ; CHECK: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[COPY]](<2 x s32>)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s64)
+    ; CHECK: $h0 = COPY [[TRUNC]](s16)
+    %0:_(<2 x s32>) = COPY $x0
+    %1:_(s16),%2:_(s16),%3:_(s16),%4:_(s16) = G_UNMERGE_VALUES %0(<2 x s32>)
+    $h0 = COPY %1(s16)
+...
+
+# Transform unmerge into trunc when only the first definition is live, even
+# if the output type is a vector.
+---
+name: test_combine_unmerge_dead_to_trunc_vec_out
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_unmerge_dead_to_trunc_vec_out
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[TRUNC]](s32)
+    ; CHECK: $w0 = COPY [[BITCAST]](<2 x s16>)
+    %0:_(s64) = COPY $x0
+    %1:_(<2 x s16>),%2:_(<2 x s16>) = G_UNMERGE_VALUES %0(s64)
+    $w0 = COPY %1(<2 x s16>)
+...
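
The constant-splitting tests earlier in this file unmerge little-endian: definition k takes bits [8k, 8k+8) of the s128 constant, or [13k, 13k+13) of the s39 one. Both constants check out in plain integer arithmetic (Python, purely for illustration):

    c128 = 1339673755198158349044581307228491536
    assert c128 == 0x0102030405060708090A0B0C0D0E0F10
    low_bytes_first = [(c128 >> (8 * k)) & 0xFF for k in range(16)]
    assert low_bytes_first[:4] == [0x10, 0x0F, 0x0E, 0x0D]  # cst1, cst2, cst3, ...
    assert low_bytes_first[-1] == 0x01

    c39 = 201342977
    assert c39 == (3 << 26) | (2 << 13) | 1                 # 3 | 2 | 1 in 13-bit chunks
    assert [(c39 >> (13 * k)) & 0x1FFF for k in range(3)] == [1, 2, 3]
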
+
+# Transform unmerge(zext) into zext.
+# In that test, the source of the zext is the same size as the first definition
+# of the unmerge. Therefore we can just reuse the input of the zext for
+# this definition.
+---
+name: test_combine_unmerge_zext_to_zext_same_size
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_unmerge_zext_to_zext_same_size
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: $w0 = COPY [[COPY]](s32)
+    ; CHECK: $w1 = COPY [[C]](s32)
+    %0:_(s32) = COPY $w0
+    %3:_(s64) = G_ZEXT %0(s32)
+    %1:_(s32),%2:_(s32) = G_UNMERGE_VALUES %3(s64)
+    $w0 = COPY %1(s32)
+    $w1 = COPY %2(s32)
+...
+
+# Transform unmerge(zext) into zext.
+# In that test, the source of the zext is smaller than the first definition
+# of the unmerge. Therefore a G_ZEXT is required.
+---
+name: test_combine_unmerge_zext_to_zext
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_unmerge_zext_to_zext
+    ; CHECK: [[COPY:%[0-9]+]]:_(s8) = COPY $b0
+    ; CHECK: [[ZEXT:%[0-9]+]]:_(s16) = G_ZEXT [[COPY]](s8)
+    ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
+    ; CHECK: $h0 = COPY [[ZEXT]](s16)
+    ; CHECK: $h1 = COPY [[C]](s16)
+    ; CHECK: $h2 = COPY [[C]](s16)
+    ; CHECK: $h3 = COPY [[C]](s16)
+    %0:_(s8) = COPY $b0
+    %3:_(s64) = G_ZEXT %0(s8)
+    %1:_(s16),%2:_(s16),%4:_(s16),%5:_(s16) = G_UNMERGE_VALUES %3(s64)
+    $h0 = COPY %1(s16)
+    $h1 = COPY %2(s16)
+    $h2 = COPY %4(s16)
+    $h3 = COPY %5(s16)
+...
+
+# Check that we don't apply the unmerge(zext) to zext transformation
+# when the first destination of the unmerge is smaller than the source
+# of the zext.
+---
+name: test_dont_combine_unmerge_zext_to_zext_src_bigger
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_dont_combine_unmerge_zext_to_zext_src_bigger
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
+    ; CHECK: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[ZEXT]](s64)
+    ; CHECK: $h0 = COPY [[UV]](s16)
+    ; CHECK: $h1 = COPY [[UV1]](s16)
+    ; CHECK: $h2 = COPY [[UV2]](s16)
+    ; CHECK: $h3 = COPY [[UV3]](s16)
+    %0:_(s32) = COPY $w0
+    %3:_(s64) = G_ZEXT %0(s32)
+    %1:_(s16),%2:_(s16),%4:_(s16),%5:_(s16) = G_UNMERGE_VALUES %3(s64)
+    $h0 = COPY %1(s16)
+    $h1 = COPY %2(s16)
+    $h2 = COPY %4(s16)
+    $h3 = COPY %5(s16)
+...
+
+# Check that we don't apply the unmerge(zext) to zext transformation
+# when the input zext deals with a vector type.
+---
+name: test_dont_combine_unmerge_zext_to_zext_src_vector
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_dont_combine_unmerge_zext_to_zext_src_vector
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $w0
+    ; CHECK: [[ZEXT:%[0-9]+]]:_(<2 x s32>) = G_ZEXT [[COPY]](<2 x s16>)
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ZEXT]](<2 x s32>)
+    ; CHECK: $w0 = COPY [[UV]](s32)
+    ; CHECK: $w1 = COPY [[UV1]](s32)
+    %0:_(<2 x s16>) = COPY $w0
+    %3:_(<2 x s32>) = G_ZEXT %0(<2 x s16>)
+    %1:_(s32),%2:_(s32) = G_UNMERGE_VALUES %3(<2 x s32>)
+    $w0 = COPY %1(s32)
+    $w1 = COPY %2(s32)
+...
+
+# Check that we don't apply the unmerge(zext) to zext transformation
+# when the destination type is a vector type.
+# We could actually handle this case but we would need to insert a cast.
+--- +name: test_dont_combine_unmerge_zext_to_zext_dst_vector +body: | + bb.1: + ; CHECK-LABEL: name: test_dont_combine_unmerge_zext_to_zext_dst_vector + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) + ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[ZEXT]](s64) + ; CHECK: $w0 = COPY [[UV]](<2 x s16>) + ; CHECK: $w1 = COPY [[UV1]](<2 x s16>) + %0:_(s32) = COPY $w0 + %3:_(s64) = G_ZEXT %0(s32) + %1:_(<2 x s16>),%2:_(<2 x s16>) = G_UNMERGE_VALUES %3(s64) + $w0 = COPY %1(<2 x s16>) + $w1 = COPY %2(<2 x s16>) +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/const-0.ll b/llvm/test/CodeGen/AArch64/GlobalISel/const-0.ll deleted file mode 100644 index 89d1ee29b959c..0000000000000 --- a/llvm/test/CodeGen/AArch64/GlobalISel/const-0.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc -mtriple=aarch64-linux-gnu -global-isel -O0 -o - %s | FileCheck %s - -%struct.comp = type { i8*, i32, i8*, [3 x i8], i32 } - -define void @regbranch() { -; CHECK-LABEL: regbranch: -; CHECK: mov {{w[0-9]+}}, #0 -cond_next240.i: - br i1 false, label %cond_true251.i, label %cond_next272.i - -cond_true251.i: - switch i8 0, label %cond_next272.i [ - i8 42, label %bb268.i - i8 43, label %bb268.i - i8 63, label %bb268.i - ] - -bb268.i: - br label %cond_next272.i - -cond_next272.i: - %len.2.i = phi i32 [ 0, %bb268.i ], [ 0, %cond_next240.i ], [ 0, %cond_true251.i ] - %tmp278.i = icmp eq i32 %len.2.i, 1 - ret void -} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-condbr-lower-tree.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-condbr-lower-tree.ll new file mode 100644 index 0000000000000..223fa28d49faa --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-condbr-lower-tree.ll @@ -0,0 +1,234 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -mtriple aarch64 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s + +declare i32 @bar(...) +define void @or_cond(i32 %X, i32 %Y, i32 %Z) nounwind { + ; CHECK-LABEL: name: or_cond + ; CHECK: bb.1.entry: + ; CHECK: successors: %bb.2(0x20000000), %bb.4(0x60000000) + ; CHECK: liveins: $w0, $w1, $w2 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s32), [[C1]] + ; CHECK: [[OR:%[0-9]+]]:_(s1) = G_OR [[ICMP1]], [[ICMP]] + ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s32), [[C1]] + ; CHECK: G_BRCOND [[ICMP2]](s1), %bb.2 + ; CHECK: G_BR %bb.4 + ; CHECK: bb.4.entry: + ; CHECK: successors: %bb.2(0x2aaaaaab), %bb.3(0x55555555) + ; CHECK: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK: G_BRCOND [[ICMP3]](s1), %bb.2 + ; CHECK: G_BR %bb.3 + ; CHECK: bb.2.cond_true: + ; CHECK: TCRETURNdi @bar, 0, csr_aarch64_aapcs, implicit $sp + ; CHECK: bb.3.UnifiedReturnBlock: + ; CHECK: RET_ReallyLR +entry: + %tmp1 = icmp eq i32 %X, 0 + %tmp3 = icmp slt i32 %Y, 5 + %tmp4 = or i1 %tmp3, %tmp1 + br i1 %tmp4, label %cond_true, label %UnifiedReturnBlock + +cond_true: + %tmp5 = tail call i32 (...) 
@bar( ) + ret void + +UnifiedReturnBlock: + ret void +} + +define void @and_cond(i32 %X, i32 %Y, i32 %Z) nounwind { + ; CHECK-LABEL: name: and_cond + ; CHECK: bb.1.entry: + ; CHECK: successors: %bb.4(0x60000000), %bb.3(0x20000000) + ; CHECK: liveins: $w0, $w1, $w2 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s32), [[C1]] + ; CHECK: [[AND:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP]] + ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s32), [[C1]] + ; CHECK: G_BRCOND [[ICMP2]](s1), %bb.4 + ; CHECK: G_BR %bb.3 + ; CHECK: bb.4.entry: + ; CHECK: successors: %bb.2(0x55555555), %bb.3(0x2aaaaaab) + ; CHECK: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK: G_BRCOND [[ICMP3]](s1), %bb.2 + ; CHECK: G_BR %bb.3 + ; CHECK: bb.2.cond_true: + ; CHECK: TCRETURNdi @bar, 0, csr_aarch64_aapcs, implicit $sp + ; CHECK: bb.3.UnifiedReturnBlock: + ; CHECK: RET_ReallyLR +entry: + %tmp1 = icmp eq i32 %X, 0 + %tmp3 = icmp slt i32 %Y, 5 + %tmp4 = and i1 %tmp3, %tmp1 + br i1 %tmp4, label %cond_true, label %UnifiedReturnBlock + +cond_true: + %tmp5 = tail call i32 (...) @bar( ) + ret void + +UnifiedReturnBlock: + ret void +} + +; Don't emit two branches for same operands. +define void @or_cond_same_values_cmp(i32 %X, i32 %Y, i32 %Z) nounwind { + ; CHECK-LABEL: name: or_cond_same_values_cmp + ; CHECK: bb.1.entry: + ; CHECK: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK: liveins: $w0, $w1, $w2 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY]](s32), [[C]] + ; CHECK: [[OR:%[0-9]+]]:_(s1) = G_OR [[ICMP1]], [[ICMP]] + ; CHECK: G_BRCOND [[OR]](s1), %bb.2 + ; CHECK: G_BR %bb.3 + ; CHECK: bb.2.cond_true: + ; CHECK: TCRETURNdi @bar, 0, csr_aarch64_aapcs, implicit $sp + ; CHECK: bb.3.UnifiedReturnBlock: + ; CHECK: RET_ReallyLR +entry: + %tmp1 = icmp eq i32 %X, 5 + %tmp3 = icmp slt i32 %X, 5 + %tmp4 = or i1 %tmp3, %tmp1 + br i1 %tmp4, label %cond_true, label %UnifiedReturnBlock + +cond_true: + %tmp5 = tail call i32 (...) @bar( ) + ret void + +UnifiedReturnBlock: + ret void +} + +; Emit multiple branches for more than 2 cases. 
+define void @or_cond_multiple_cases(i32 %X, i32 %Y, i32 %Z) nounwind { + ; CHECK-LABEL: name: or_cond_multiple_cases + ; CHECK: bb.1.entry: + ; CHECK: successors: %bb.2(0x10000000), %bb.5(0x70000000) + ; CHECK: liveins: $w0, $w1, $w2 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY]](s32), [[C]] + ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] + ; CHECK: [[OR:%[0-9]+]]:_(s1) = G_OR [[ICMP1]], [[ICMP]] + ; CHECK: [[OR1:%[0-9]+]]:_(s1) = G_OR [[OR]], [[ICMP2]] + ; CHECK: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY]](s32), [[C]] + ; CHECK: G_BRCOND [[ICMP3]](s1), %bb.2 + ; CHECK: G_BR %bb.5 + ; CHECK: bb.5.entry: + ; CHECK: successors: %bb.2(0x12492492), %bb.4(0x6db6db6e) + ; CHECK: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK: G_BRCOND [[ICMP4]](s1), %bb.2 + ; CHECK: G_BR %bb.4 + ; CHECK: bb.4.entry: + ; CHECK: successors: %bb.2(0x2aaaaaab), %bb.3(0x55555555) + ; CHECK: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] + ; CHECK: G_BRCOND [[ICMP5]](s1), %bb.2 + ; CHECK: G_BR %bb.3 + ; CHECK: bb.2.cond_true: + ; CHECK: TCRETURNdi @bar, 0, csr_aarch64_aapcs, implicit $sp + ; CHECK: bb.3.UnifiedReturnBlock: + ; CHECK: RET_ReallyLR +entry: + %tmp1 = icmp eq i32 %X, 5 + %tmp3 = icmp slt i32 %X, 5 + %tmpZ = icmp eq i32 %Z, 5 + %tmp4 = or i1 %tmp3, %tmp1 + %final = or i1 %tmp4, %tmpZ + br i1 %final, label %cond_true, label %UnifiedReturnBlock + +cond_true: + %tmp5 = tail call i32 (...) @bar( ) + ret void + +UnifiedReturnBlock: + ret void +} + +; (X != null) | (Y != null) --> (X|Y) != 0 +; Don't emit two branches. +define void @or_cond_ne_null(i32 %X, i32 %Y, i32 %Z) nounwind { + ; CHECK-LABEL: name: or_cond_ne_null + ; CHECK: bb.1.entry: + ; CHECK: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK: liveins: $w0, $w1, $w2 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[C]] + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[C]] + ; CHECK: [[OR:%[0-9]+]]:_(s1) = G_OR [[ICMP1]], [[ICMP]] + ; CHECK: G_BRCOND [[OR]](s1), %bb.2 + ; CHECK: G_BR %bb.3 + ; CHECK: bb.2.cond_true: + ; CHECK: TCRETURNdi @bar, 0, csr_aarch64_aapcs, implicit $sp + ; CHECK: bb.3.UnifiedReturnBlock: + ; CHECK: RET_ReallyLR +entry: + %tmp1 = icmp ne i32 %X, 0 + %tmp3 = icmp ne i32 %Y, 0 + %tmp4 = or i1 %tmp3, %tmp1 + br i1 %tmp4, label %cond_true, label %UnifiedReturnBlock + +cond_true: + %tmp5 = tail call i32 (...) @bar( ) + ret void + +UnifiedReturnBlock: + ret void +} + +; If the branch is unpredictable, don't add another branch +; regardless of whether they are expensive or not. 
+ +define void @unpredictable(i32 %X, i32 %Y, i32 %Z) nounwind { + ; CHECK-LABEL: name: unpredictable + ; CHECK: bb.1.entry: + ; CHECK: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK: liveins: $w0, $w1, $w2 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s32), [[C1]] + ; CHECK: [[OR:%[0-9]+]]:_(s1) = G_OR [[ICMP1]], [[ICMP]] + ; CHECK: G_BRCOND [[OR]](s1), %bb.2 + ; CHECK: G_BR %bb.3 + ; CHECK: bb.2.cond_true: + ; CHECK: TCRETURNdi @bar, 0, csr_aarch64_aapcs, implicit $sp + ; CHECK: bb.3.UnifiedReturnBlock: + ; CHECK: RET_ReallyLR +entry: + %tmp1 = icmp eq i32 %X, 0 + %tmp3 = icmp slt i32 %Y, 5 + %tmp4 = or i1 %tmp3, %tmp1 + br i1 %tmp4, label %cond_true, label %UnifiedReturnBlock, !unpredictable !0 + +cond_true: + %tmp5 = tail call i32 (...) @bar( ) + ret void + +UnifiedReturnBlock: + ret void +} + +!0 = !{} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-switch-bittest.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-switch-bittest.ll index 28756a4ae6175..8dfae82d02a62 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-switch-bittest.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-switch-bittest.ll @@ -4,7 +4,7 @@ define i32 @test_bittest(i16 %p) { ; CHECK-LABEL: name: test_bittest ; CHECK: bb.1 (%ir-block.0): - ; CHECK: successors: %bb.4(0x40000000), %bb.5(0x40000000) + ; CHECK: successors: %bb.4(0x1b6db6db), %bb.5(0x64924925) ; CHECK: liveins: $w0 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) @@ -25,7 +25,7 @@ define i32 @test_bittest(i16 %p) { ; CHECK: G_BRCOND [[ICMP1]](s1), %bb.3 ; CHECK: G_BR %bb.2 ; CHECK: bb.5 (%ir-block.0): - ; CHECK: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; CHECK: successors: %bb.3(0x745d1746), %bb.4(0x0ba2e8ba) ; CHECK: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[C5]], [[ZEXT1]](s64) ; CHECK: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 866239240827043840 @@ -61,7 +61,7 @@ declare void @callee() define void @test_bittest_2_bt(i32 %p) { ; CHECK-LABEL: name: test_bittest_2_bt ; CHECK: bb.1.entry: - ; CHECK: successors: %bb.5(0x40000000), %bb.6(0x40000000) + ; CHECK: successors: %bb.5(0x345d1746), %bb.6(0x4ba2e8ba) ; CHECK: liveins: $w0 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 176 @@ -71,7 +71,7 @@ define void @test_bittest_2_bt(i32 %p) { ; CHECK: G_BRCOND [[ICMP]](s1), %bb.5 ; CHECK: G_BR %bb.6 ; CHECK: bb.5.entry: - ; CHECK: successors: %bb.4(0x40000000), %bb.7(0x40000000) + ; CHECK: successors: %bb.4(0x0ccccccd), %bb.7(0x73333333) ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[C2]] ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[SUB1]](s32) @@ -80,7 +80,7 @@ define void @test_bittest_2_bt(i32 %p) { ; CHECK: G_BRCOND [[ICMP1]](s1), %bb.4 ; CHECK: G_BR %bb.7 ; CHECK: bb.6.entry: - ; CHECK: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; CHECK: successors: %bb.2(0x76276276), %bb.5(0x09d89d8a) ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C4]], [[SUB]](s32) ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 57351 @@ -90,7 +90,7 
@@ define void @test_bittest_2_bt(i32 %p) { ; CHECK: G_BRCOND [[ICMP2]](s1), %bb.2 ; CHECK: G_BR %bb.5 ; CHECK: bb.7.entry: - ; CHECK: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; CHECK: successors: %bb.3(0x71c71c72), %bb.4(0x0e38e38e) ; CHECK: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[C7]], [[ZEXT]](s64) ; CHECK: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 365072220160 @@ -134,7 +134,7 @@ sw.default: ; preds = %entry define i32 @test_bittest_single_bt_only_with_fallthrough(i16 %p) { ; CHECK-LABEL: name: test_bittest_single_bt_only_with_fallthrough ; CHECK: bb.1 (%ir-block.0): - ; CHECK: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; CHECK: successors: %bb.2(0x0aaaaaab), %bb.4(0x75555555) ; CHECK: liveins: $w0 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) @@ -148,7 +148,7 @@ define i32 @test_bittest_single_bt_only_with_fallthrough(i16 %p) { ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ugt), [[SUB]](s32), [[C3]] ; CHECK: G_BRCOND [[ICMP]](s1), %bb.2 ; CHECK: bb.4 (%ir-block.0): - ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: successors: %bb.3(0x745d1746), %bb.2(0x0ba2e8ba) ; CHECK: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[C4]], [[ZEXT1]](s64) ; CHECK: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 866239240827043840 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-build-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-build-vector.mir index 0b69a126f1ae0..bb2bc3372936f 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-build-vector.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-build-vector.mir @@ -56,3 +56,19 @@ body: | $q0 = COPY %2(<2 x p0>) RET_ReallyLR ... +--- +name: legal_v16s8 +body: | + bb.0: + ; CHECK-LABEL: name: legal_v16s8 + ; CHECK: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[DEF]](s8), [[DEF1]](s8), [[DEF]](s8), [[DEF1]](s8), [[DEF]](s8), [[DEF1]](s8), [[DEF]](s8), [[DEF1]](s8), [[DEF]](s8), [[DEF1]](s8), [[DEF]](s8), [[DEF1]](s8), [[DEF]](s8), [[DEF1]](s8), [[DEF]](s8), [[DEF1]](s8) + ; CHECK: $q0 = COPY [[BUILD_VECTOR]](<16 x s8>) + ; CHECK: RET_ReallyLR + %0:_(s8) = G_IMPLICIT_DEF + %1:_(s8) = G_IMPLICIT_DEF + %2:_(<16 x s8>) = G_BUILD_VECTOR %0(s8), %1(s8), %0(s8), %1(s8), %0(s8), %1(s8), %0(s8), %1(s8), %0(s8), %1(s8), %0(s8), %1(s8), %0(s8), %1(s8), %0(s8), %1(s8) + $q0 = COPY %2(<16 x s8>) + RET_ReallyLR +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir index ecba4f226301e..0144df5197b14 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=aarch64-linux-gnu -O0 -run-pass=legalizer %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-linux-gnu -O0 -run-pass=legalizer %s -o - -global-isel-abort=1 | FileCheck %s --- name: test_eve_1 @@ -19,3 +19,115 @@ body: | $x0 = COPY %2(s64) RET_ReallyLR ... 
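
The <N x s1> tests that follow show how the legalizer materializes a boolean lane: the compare result lives in a full-width vector lane, so the bit is shifted up to the sign position and arithmetic-shifted back, yielding an all-zeros or all-ones lane before the extract. The idiom on a 64-bit lane, modeled in Python (ashr is hand-rolled because Python integers are unbounded; names are illustrative):

    W = 64
    MASK = (1 << W) - 1

    def ashr(x, n):
        # Arithmetic shift right on a W-bit two's-complement pattern.
        if x >> (W - 1):
            return ((x >> n) | (MASK ^ (MASK >> n))) & MASK
        return x >> n

    def sext_bit0(x):
        # The legalizer's pattern: G_SHL by W-1, then G_ASHR by W-1.
        return ashr((x << (W - 1)) & MASK, W - 1)

    assert sext_bit0(1) == MASK               # true lane  -> all ones (-1)
    assert sext_bit0(0) == 0                  # false lane -> 0
    assert sext_bit0(0b10100000 | 1) == MASK  # only bit 0 of the lane matters
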
+--- +name: test_eve_v2s1 +body: | + bb.0: + liveins: $q0, $q1, $x0 + ; CHECK-LABEL: name: test_eve_v2s1 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[ICMP:%[0-9]+]]:_(<2 x s64>) = G_ICMP intpred(eq), [[COPY]](<2 x s64>), [[COPY1]] + ; CHECK: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY [[ICMP]](<2 x s64>) + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 63 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64) + ; CHECK: [[SHL:%[0-9]+]]:_(<2 x s64>) = G_SHL [[COPY3]], [[BUILD_VECTOR]](<2 x s64>) + ; CHECK: [[ASHR:%[0-9]+]]:_(<2 x s64>) = G_ASHR [[SHL]], [[BUILD_VECTOR]](<2 x s64>) + ; CHECK: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[ASHR]](<2 x s64>), [[COPY2]](s64) + ; CHECK: [[COPY4:%[0-9]+]]:_(s64) = COPY [[EVEC]](s64) + ; CHECK: $x0 = COPY [[COPY4]](s64) + ; CHECK: RET_ReallyLR + %0:_(<2 x s64>) = COPY $q0 + %1:_(<2 x s64>) = COPY $q1 + %2:_(s64) = COPY $x0 + %3:_(<2 x s1>) = G_ICMP intpred(eq), %0(<2 x s64>), %1 + %4:_(s1) = G_EXTRACT_VECTOR_ELT %3:_(<2 x s1>), %2:_(s64) + %5:_(s64) = G_ANYEXT %4(s1) + $x0 = COPY %5(s64) + RET_ReallyLR +... +--- +name: test_eve_v4s1 +body: | + bb.0: + liveins: $q0, $q1, $x0 + ; CHECK-LABEL: name: test_eve_v4s1 + ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(eq), [[COPY]](<4 x s32>), [[COPY1]] + ; CHECK: [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY [[ICMP]](<4 x s32>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) + ; CHECK: [[SHL:%[0-9]+]]:_(<4 x s32>) = G_SHL [[COPY3]], [[BUILD_VECTOR]](<4 x s32>) + ; CHECK: [[ASHR:%[0-9]+]]:_(<4 x s32>) = G_ASHR [[SHL]], [[BUILD_VECTOR]](<4 x s32>) + ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[ASHR]](<4 x s32>), [[COPY2]](s64) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[EVEC]](s32) + ; CHECK: $x0 = COPY [[ANYEXT]](s64) + ; CHECK: RET_ReallyLR + %0:_(<4 x s32>) = COPY $q0 + %1:_(<4 x s32>) = COPY $q1 + %2:_(s64) = COPY $x0 + %3:_(<4 x s1>) = G_ICMP intpred(eq), %0(<4 x s32>), %1 + %4:_(s1) = G_EXTRACT_VECTOR_ELT %3:_(<4 x s1>), %2:_(s64) + %5:_(s64) = G_ANYEXT %4(s1) + $x0 = COPY %5(s64) + RET_ReallyLR +... 
+--- +name: test_eve_v8s1 +body: | + bb.0: + liveins: $q0, $q1, $x0 + ; CHECK-LABEL: name: test_eve_v8s1 + ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<8 x s16>) = COPY $q1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[ICMP:%[0-9]+]]:_(<8 x s16>) = G_ICMP intpred(eq), [[COPY]](<8 x s16>), [[COPY1]] + ; CHECK: [[COPY3:%[0-9]+]]:_(<8 x s16>) = COPY [[ICMP]](<8 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16) + ; CHECK: [[SHL:%[0-9]+]]:_(<8 x s16>) = G_SHL [[COPY3]], [[BUILD_VECTOR]](<8 x s16>) + ; CHECK: [[ASHR:%[0-9]+]]:_(<8 x s16>) = G_ASHR [[SHL]], [[BUILD_VECTOR]](<8 x s16>) + ; CHECK: [[EVEC:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[ASHR]](<8 x s16>), [[COPY2]](s64) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[EVEC]](s16) + ; CHECK: $x0 = COPY [[ANYEXT]](s64) + ; CHECK: RET_ReallyLR + %0:_(<8 x s16>) = COPY $q0 + %1:_(<8 x s16>) = COPY $q1 + %2:_(s64) = COPY $x0 + %3:_(<8 x s1>) = G_ICMP intpred(eq), %0(<8 x s16>), %1 + %4:_(s1) = G_EXTRACT_VECTOR_ELT %3:_(<8 x s1>), %2:_(s64) + %5:_(s64) = G_ANYEXT %4(s1) + $x0 = COPY %5(s64) + RET_ReallyLR +... +--- +name: test_eve_v16s1 +body: | + bb.0: + liveins: $q0, $q1, $x0 + ; CHECK-LABEL: name: test_eve_v16s1 + ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<16 x s8>) = COPY $q1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[ICMP:%[0-9]+]]:_(<16 x s8>) = G_ICMP intpred(eq), [[COPY]](<16 x s8>), [[COPY1]] + ; CHECK: [[COPY3:%[0-9]+]]:_(<16 x s8>) = COPY [[ICMP]](<16 x s8>) + ; CHECK: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 7 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8) + ; CHECK: [[SHL:%[0-9]+]]:_(<16 x s8>) = G_SHL [[COPY3]], [[BUILD_VECTOR]](<16 x s8>) + ; CHECK: [[ASHR:%[0-9]+]]:_(<16 x s8>) = G_ASHR [[SHL]], [[BUILD_VECTOR]](<16 x s8>) + ; CHECK: [[EVEC:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[ASHR]](<16 x s8>), [[COPY2]](s64) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[EVEC]](s8) + ; CHECK: $x0 = COPY [[ANYEXT]](s64) + ; CHECK: RET_ReallyLR + %0:_(<16 x s8>) = COPY $q0 + %1:_(<16 x s8>) = COPY $q1 + %2:_(s64) = COPY $x0 + %3:_(<16 x s1>) = G_ICMP intpred(eq), %0(<16 x s8>), %1 + %4:_(s1) = G_EXTRACT_VECTOR_ELT %3:_(<16 x s1>), %2:_(s64) + %5:_(s64) = G_ANYEXT %4(s1) + $x0 = COPY %5(s64) + RET_ReallyLR +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fptrunc.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fptrunc.mir new file mode 100644 index 0000000000000..381bd03cf19c7 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fptrunc.mir @@ -0,0 +1,139 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -verify-machineinstrs -mtriple aarch64-unknown-unknown -run-pass=legalizer -O0 -global-isel %s -o - | FileCheck %s +--- +name: fptrunc_s16_s32 +body: | + bb.0: + liveins: $s0 + + ; CHECK-LABEL: name: fptrunc_s16_s32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[COPY]](s32) + ; CHECK: $h0 = COPY [[FPTRUNC]](s16) + ; CHECK: RET_ReallyLR implicit $h0 + %0:_(s32) = COPY $s0 + %1:_(s16) = G_FPTRUNC %0 + $h0 = COPY %1(s16) + RET_ReallyLR implicit $h0 +... 
+--- +name: fptrunc_s16_s64 +body: | + bb.0: + liveins: $d0 + + ; CHECK-LABEL: name: fptrunc_s16_s64 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 + ; CHECK: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[COPY]](s64) + ; CHECK: $h0 = COPY [[FPTRUNC]](s16) + ; CHECK: RET_ReallyLR implicit $h0 + %0:_(s64) = COPY $d0 + %1:_(s16) = G_FPTRUNC %0 + $h0 = COPY %1(s16) + RET_ReallyLR implicit $h0 +... +--- +name: fptrunc_s32_s64 +body: | + bb.0: + liveins: $d0 + + ; CHECK-LABEL: name: fptrunc_s32_s64 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 + ; CHECK: [[FPTRUNC:%[0-9]+]]:_(s32) = G_FPTRUNC [[COPY]](s64) + ; CHECK: $s0 = COPY [[FPTRUNC]](s32) + ; CHECK: RET_ReallyLR implicit $s0 + %0:_(s64) = COPY $d0 + %1:_(s32) = G_FPTRUNC %0 + $s0 = COPY %1(s32) + RET_ReallyLR implicit $s0 +... +--- +name: fptrunc_v4s16_v4s32 +body: | + bb.0: + liveins: $q0 + + ; CHECK-LABEL: name: fptrunc_v4s16_v4s32 + ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK: [[FPTRUNC:%[0-9]+]]:_(<4 x s16>) = G_FPTRUNC [[COPY]](<4 x s32>) + ; CHECK: $d0 = COPY [[FPTRUNC]](<4 x s16>) + ; CHECK: RET_ReallyLR implicit $d0 + %0:_(<4 x s32>) = COPY $q0 + %1:_(<4 x s16>) = G_FPTRUNC %0 + $d0 = COPY %1(<4 x s16>) + RET_ReallyLR implicit $d0 +... +--- +name: fptrunc_v2s16_v2s32 +body: | + bb.0: + liveins: $d0 + + ; CHECK-LABEL: name: fptrunc_v2s16_v2s32 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0 + ; CHECK: [[FPTRUNC:%[0-9]+]]:_(<2 x s16>) = G_FPTRUNC [[COPY]](<2 x s32>) + ; CHECK: $s0 = COPY [[FPTRUNC]](<2 x s16>) + ; CHECK: RET_ReallyLR implicit $s0 + %0:_(<2 x s32>) = COPY $d0 + %1:_(<2 x s16>) = G_FPTRUNC %0 + $s0 = COPY %1(<2 x s16>) + RET_ReallyLR implicit $s0 +... +--- +name: fptrunc_v4s32_v4s64 +body: | + bb.0: + + ; CHECK-LABEL: name: fptrunc_v4s32_v4s64 + ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s64>) = G_IMPLICIT_DEF + ; CHECK: [[FPTRUNC:%[0-9]+]]:_(<2 x s32>) = G_FPTRUNC [[DEF]](<2 x s64>) + ; CHECK: [[FPTRUNC1:%[0-9]+]]:_(<2 x s32>) = G_FPTRUNC [[DEF]](<2 x s64>) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[FPTRUNC]](<2 x s32>), [[FPTRUNC1]](<2 x s32>) + ; CHECK: $q0 = COPY [[CONCAT_VECTORS]](<4 x s32>) + ; CHECK: RET_ReallyLR implicit $q0 + %0:_(<4 x s64>) = G_IMPLICIT_DEF + %1:_(<4 x s32>) = G_FPTRUNC %0 + $q0 = COPY %1(<4 x s32>) + RET_ReallyLR implicit $q0 +... 
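The fptrunc_v4s32_v4s64 case above shows the fewer-elements strategy: there is no <4 x s64> to <4 x s32> truncate, so the source is split into two <2 x s64> halves, each half is truncated on its own, and the results are glued back together with G_CONCAT_VECTORS. A scalar model of the same split, in C++ (fptruncV4 is an illustrative name only):

    #include <array>

    // Truncate a 4-wide double vector in two 2-wide halves, then
    // concatenate, mirroring the two G_FPTRUNCs plus G_CONCAT_VECTORS.
    std::array<float, 4> fptruncV4(const std::array<double, 4> &V) {
      std::array<float, 2> Lo = {static_cast<float>(V[0]),
                                 static_cast<float>(V[1])};
      std::array<float, 2> Hi = {static_cast<float>(V[2]),
                                 static_cast<float>(V[3])};
      return {Lo[0], Lo[1], Hi[0], Hi[1]};
    }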
+--- +name: fptrunc_v8s32_v8s64 +body: | + bb.0: + + liveins: $x0, $q0, $q1, $q2, $q3, $x0 + + ; CHECK-LABEL: name: fptrunc_v8s32_v8s64 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 + ; CHECK: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3 + ; CHECK: [[COPY4:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[FPTRUNC:%[0-9]+]]:_(<2 x s32>) = G_FPTRUNC [[COPY]](<2 x s64>) + ; CHECK: [[FPTRUNC1:%[0-9]+]]:_(<2 x s32>) = G_FPTRUNC [[COPY1]](<2 x s64>) + ; CHECK: [[FPTRUNC2:%[0-9]+]]:_(<2 x s32>) = G_FPTRUNC [[COPY2]](<2 x s64>) + ; CHECK: [[FPTRUNC3:%[0-9]+]]:_(<2 x s32>) = G_FPTRUNC [[COPY3]](<2 x s64>) + ; CHECK: [[COPY5:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: G_STORE [[FPTRUNC]](<2 x s32>), [[COPY5]](p0) :: (store 8, align 32) + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY5]], [[C]](s64) + ; CHECK: G_STORE [[FPTRUNC1]](<2 x s32>), [[PTR_ADD]](p0) :: (store 8 + 8) + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY5]], [[C1]](s64) + ; CHECK: G_STORE [[FPTRUNC2]](<2 x s32>), [[PTR_ADD1]](p0) :: (store 8 + 16, align 16) + ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 + ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY5]], [[C2]](s64) + ; CHECK: G_STORE [[FPTRUNC3]](<2 x s32>), [[PTR_ADD2]](p0) :: (store 8 + 24) + ; CHECK: RET_ReallyLR + %2:_(<2 x s64>) = COPY $q0 + %3:_(<2 x s64>) = COPY $q1 + %4:_(<2 x s64>) = COPY $q2 + %5:_(<2 x s64>) = COPY $q3 + %0:_(<8 x s64>) = G_CONCAT_VECTORS %2(<2 x s64>), %3(<2 x s64>), %4(<2 x s64>), %5(<2 x s64>) + %1:_(p0) = COPY $x0 + %6:_(<8 x s32>) = G_FPTRUNC %0(<8 x s64>) + %7:_(p0) = COPY $x0 + G_STORE %6(<8 x s32>), %7(p0) :: (store 32) + RET_ReallyLR +... 
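In fptrunc_v8s32_v8s64 above, the legalized <8 x s32> result no longer fits in a single register, so the wide G_STORE is split into four 8-byte pieces addressed through G_PTR_ADD with offsets 8, 16 and 24 from the base pointer. Roughly, under the same layout assumptions, the four stores compute this (storeSplit is a made-up helper, C++ for illustration only):

    #include <cstdint>
    #include <cstring>

    // Write four 8-byte chunks at consecutive 8-byte offsets from Base,
    // the way the four split G_STOREs above do.
    void storeSplit(void *Base, const uint64_t (&Chunks)[4]) {
      for (int I = 0; I != 4; ++I)
        std::memcpy(static_cast<char *>(Base) + 8 * I, &Chunks[I], 8);
    }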
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir index 504fb1a12b5d5..61104c6e432e1 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir @@ -1,59 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s - ---- | - target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" - target triple = "aarch64" - - define void @test_load() { ret void } - define void @test_store() { ret void } - - define void @store_4xi16(<4 x i16> %v, <4 x i16>* %ptr) { - store <4 x i16> %v, <4 x i16>* %ptr - ret void - } - - define void @store_4xi32(<4 x i32> %v, <4 x i32>* %ptr) { - store <4 x i32> %v, <4 x i32>* %ptr - ret void - } - - define void @store_8xi16(<8 x i16> %v, <8 x i16>* %ptr) { - store <8 x i16> %v, <8 x i16>* %ptr - ret void - } - - define void @store_16xi8(<16 x i8> %v, <16 x i8>* %ptr) { - store <16 x i8> %v, <16 x i8>* %ptr - ret void - } - - define <4 x i16> @load_4xi16(<4 x i16>* %ptr) { - %res = load <4 x i16>, <4 x i16>* %ptr - ret <4 x i16> %res - } - - define <4 x i32> @load_4xi32(<4 x i32>* %ptr) { - %res = load <4 x i32>, <4 x i32>* %ptr - ret <4 x i32> %res - } - - define <8 x i16> @load_8xi16(<8 x i16>* %ptr) { - %res = load <8 x i16>, <8 x i16>* %ptr - ret <8 x i16> %res - } - - define <16 x i8> @load_16xi8(<16 x i8>* %ptr) { - %res = load <16 x i8>, <16 x i8>* %ptr - ret <16 x i8> %res - } - - define <8 x i8> @load_8xi8(<8 x i8>* %ptr) { - %res = load <8 x i8>, <8 x i8>* %ptr - ret <8 x i8> %res - } - -... +# RUN: llc -O0 -march=aarch64 -run-pass=legalizer -global-isel-abort=1 %s -o - | FileCheck %s --- name: test_load body: | @@ -155,18 +101,18 @@ alignment: 4 tracksRegLiveness: true machineFunctionInfo: {} body: | - bb.1 (%ir-block.0): + bb.1: liveins: $d0, $x0 ; CHECK-LABEL: name: store_4xi16 ; CHECK: liveins: $d0, $x0 ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0 ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p0) :: (store 8 into %ir.ptr) + ; CHECK: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p0) :: (store 8) ; CHECK: RET_ReallyLR %0:_(<4 x s16>) = COPY $d0 %1:_(p0) = COPY $x0 - G_STORE %0(<4 x s16>), %1(p0) :: (store 8 into %ir.ptr) + G_STORE %0(<4 x s16>), %1(p0) :: (store 8) RET_ReallyLR ... @@ -176,18 +122,18 @@ alignment: 4 tracksRegLiveness: true machineFunctionInfo: {} body: | - bb.1 (%ir-block.0): + bb.1: liveins: $q0, $x0 ; CHECK-LABEL: name: store_4xi32 ; CHECK: liveins: $q0, $x0 ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: G_STORE [[COPY]](<4 x s32>), [[COPY1]](p0) :: (store 16 into %ir.ptr) + ; CHECK: G_STORE [[COPY]](<4 x s32>), [[COPY1]](p0) :: (store 16) ; CHECK: RET_ReallyLR %0:_(<4 x s32>) = COPY $q0 %1:_(p0) = COPY $x0 - G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.ptr) + G_STORE %0(<4 x s32>), %1(p0) :: (store 16) RET_ReallyLR ... 
@@ -197,18 +143,18 @@ alignment: 4 tracksRegLiveness: true machineFunctionInfo: {} body: | - bb.1 (%ir-block.0): + bb.1: liveins: $q0, $x0 ; CHECK-LABEL: name: store_8xi16 ; CHECK: liveins: $q0, $x0 ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: G_STORE [[COPY]](<8 x s16>), [[COPY1]](p0) :: (store 16 into %ir.ptr) + ; CHECK: G_STORE [[COPY]](<8 x s16>), [[COPY1]](p0) :: (store 16) ; CHECK: RET_ReallyLR %0:_(<8 x s16>) = COPY $q0 %1:_(p0) = COPY $x0 - G_STORE %0(<8 x s16>), %1(p0) :: (store 16 into %ir.ptr) + G_STORE %0(<8 x s16>), %1(p0) :: (store 16) RET_ReallyLR ... @@ -218,18 +164,18 @@ alignment: 4 tracksRegLiveness: true machineFunctionInfo: {} body: | - bb.1 (%ir-block.0): + bb.1: liveins: $q0, $x0 ; CHECK-LABEL: name: store_16xi8 ; CHECK: liveins: $q0, $x0 ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0 ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: G_STORE [[COPY]](<16 x s8>), [[COPY1]](p0) :: (store 16 into %ir.ptr) + ; CHECK: G_STORE [[COPY]](<16 x s8>), [[COPY1]](p0) :: (store 16) ; CHECK: RET_ReallyLR %0:_(<16 x s8>) = COPY $q0 %1:_(p0) = COPY $x0 - G_STORE %0(<16 x s8>), %1(p0) :: (store 16 into %ir.ptr) + G_STORE %0(<16 x s8>), %1(p0) :: (store 16) RET_ReallyLR ... @@ -239,17 +185,17 @@ alignment: 4 tracksRegLiveness: true machineFunctionInfo: {} body: | - bb.1 (%ir-block.0): + bb.1: liveins: $x0 ; CHECK-LABEL: name: load_4xi16 ; CHECK: liveins: $x0 ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load 8 from %ir.ptr) + ; CHECK: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load 8) ; CHECK: $d0 = COPY [[LOAD]](<4 x s16>) ; CHECK: RET_ReallyLR implicit $d0 %0:_(p0) = COPY $x0 - %1:_(<4 x s16>) = G_LOAD %0(p0) :: (load 8 from %ir.ptr) + %1:_(<4 x s16>) = G_LOAD %0(p0) :: (load 8) $d0 = COPY %1(<4 x s16>) RET_ReallyLR implicit $d0 @@ -260,17 +206,17 @@ alignment: 4 tracksRegLiveness: true machineFunctionInfo: {} body: | - bb.1 (%ir-block.0): + bb.1: liveins: $x0 ; CHECK-LABEL: name: load_4xi32 ; CHECK: liveins: $x0 ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load 16 from %ir.ptr) + ; CHECK: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load 16) ; CHECK: $q0 = COPY [[LOAD]](<4 x s32>) ; CHECK: RET_ReallyLR implicit $q0 %0:_(p0) = COPY $x0 - %1:_(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.ptr) + %1:_(<4 x s32>) = G_LOAD %0(p0) :: (load 16) $q0 = COPY %1(<4 x s32>) RET_ReallyLR implicit $q0 @@ -281,17 +227,17 @@ alignment: 4 tracksRegLiveness: true machineFunctionInfo: {} body: | - bb.1 (%ir-block.0): + bb.1: liveins: $x0 ; CHECK-LABEL: name: load_8xi16 ; CHECK: liveins: $x0 ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p0) :: (load 16 from %ir.ptr) + ; CHECK: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p0) :: (load 16) ; CHECK: $q0 = COPY [[LOAD]](<8 x s16>) ; CHECK: RET_ReallyLR implicit $q0 %0:_(p0) = COPY $x0 - %1:_(<8 x s16>) = G_LOAD %0(p0) :: (load 16 from %ir.ptr) + %1:_(<8 x s16>) = G_LOAD %0(p0) :: (load 16) $q0 = COPY %1(<8 x s16>) RET_ReallyLR implicit $q0 @@ -302,17 +248,17 @@ alignment: 4 tracksRegLiveness: true machineFunctionInfo: {} body: | - bb.1 (%ir-block.0): + bb.1: liveins: $x0 ; CHECK-LABEL: name: load_16xi8 ; CHECK: liveins: $x0 ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[COPY]](p0) :: (load 16 from %ir.ptr) + ; CHECK: 
[[LOAD:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[COPY]](p0) :: (load 16) ; CHECK: $q0 = COPY [[LOAD]](<16 x s8>) ; CHECK: RET_ReallyLR implicit $q0 %0:_(p0) = COPY $x0 - %1:_(<16 x s8>) = G_LOAD %0(p0) :: (load 16 from %ir.ptr) + %1:_(<16 x s8>) = G_LOAD %0(p0) :: (load 16) $q0 = COPY %1(<16 x s8>) RET_ReallyLR implicit $q0 @@ -323,17 +269,36 @@ alignment: 4 tracksRegLiveness: true machineFunctionInfo: {} body: | - bb.1 (%ir-block.0): + bb.1: liveins: $x0 ; CHECK-LABEL: name: load_8xi8 ; CHECK: liveins: $x0 ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<8 x s8>) = G_LOAD [[COPY]](p0) :: (load 8 from %ir.ptr) + ; CHECK: [[LOAD:%[0-9]+]]:_(<8 x s8>) = G_LOAD [[COPY]](p0) :: (load 8) ; CHECK: $d0 = COPY [[LOAD]](<8 x s8>) ; CHECK: RET_ReallyLR implicit $d0 %0:_(p0) = COPY $x0 - %1:_(<8 x s8>) = G_LOAD %0(p0) :: (load 8 from %ir.ptr) + %1:_(<8 x s8>) = G_LOAD %0(p0) :: (load 8) $d0 = COPY %1(<8 x s8>) RET_ReallyLR implicit $d0 ... +--- +name: store_8xi8 +alignment: 4 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.1: + liveins: $x0, $d0 + ; CHECK-LABEL: name: store_8xi8 + ; CHECK: liveins: $x0, $d0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d0 + ; CHECK: G_STORE [[COPY1]](<8 x s8>), [[COPY]](p0) :: (store 8) + ; CHECK: RET_ReallyLR + %0:_(p0) = COPY $x0 + %1:_(<8 x s8>) = COPY $d0 + G_STORE %1(<8 x s8>), %0(p0) :: (store 8) + RET_ReallyLR +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir index 944ac8110ce01..05cb4cb2908a5 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -O0 -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s -# RUN: llc -O0 -debugify-and-strip-all-safe -march=aarch64 -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -O0 -debugify-and-strip-all-safe -march=aarch64 -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --- name: test_shift body: | @@ -284,3 +284,87 @@ body: | RET_ReallyLR implicit $w0 ... +--- +name: test_ashr_v16i8 +body: | + bb.0: + ; CHECK-LABEL: name: test_ashr_v16i8 + ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<16 x s8>) = COPY $q1 + ; CHECK: [[ASHR:%[0-9]+]]:_(<16 x s8>) = G_ASHR [[COPY]], [[COPY1]](<16 x s8>) + ; CHECK: $q0 = COPY [[ASHR]](<16 x s8>) + %0:_(<16 x s8>) = COPY $q0 + %1:_(<16 x s8>) = COPY $q1 + %2:_(<16 x s8>) = G_ASHR %0, %1 + $q0 = COPY %2 +... +--- +name: test_ashr_v8i16 +body: | + bb.0: + ; CHECK-LABEL: name: test_ashr_v8i16 + ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<8 x s16>) = COPY $q1 + ; CHECK: [[ASHR:%[0-9]+]]:_(<8 x s16>) = G_ASHR [[COPY]], [[COPY1]](<8 x s16>) + ; CHECK: $q0 = COPY [[ASHR]](<8 x s16>) + %0:_(<8 x s16>) = COPY $q0 + %1:_(<8 x s16>) = COPY $q1 + %2:_(<8 x s16>) = G_ASHR %0, %1 + $q0 = COPY %2 +... +--- +name: test_shl_v16i8 +body: | + bb.0: + ; CHECK-LABEL: name: test_shl_v16i8 + ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<16 x s8>) = COPY $q1 + ; CHECK: [[SHL:%[0-9]+]]:_(<16 x s8>) = G_SHL [[COPY]], [[COPY1]](<16 x s8>) + ; CHECK: $q0 = COPY [[SHL]](<16 x s8>) + %0:_(<16 x s8>) = COPY $q0 + %1:_(<16 x s8>) = COPY $q1 + %2:_(<16 x s8>) = G_SHL %0, %1 + $q0 = COPY %2 +... 
+--- +name: test_shl_v8i16 +body: | + bb.0: + ; CHECK-LABEL: name: test_shl_v8i16 + ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<8 x s16>) = COPY $q1 + ; CHECK: [[SHL:%[0-9]+]]:_(<8 x s16>) = G_SHL [[COPY]], [[COPY1]](<8 x s16>) + ; CHECK: $q0 = COPY [[SHL]](<8 x s16>) + %0:_(<8 x s16>) = COPY $q0 + %1:_(<8 x s16>) = COPY $q1 + %2:_(<8 x s16>) = G_SHL %0, %1 + $q0 = COPY %2 +... +--- +name: test_lshr_v16i8 +body: | + bb.0: + ; CHECK-LABEL: name: test_lshr_v16i8 + ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<16 x s8>) = COPY $q1 + ; CHECK: [[LSHR:%[0-9]+]]:_(<16 x s8>) = G_LSHR [[COPY]], [[COPY1]](<16 x s8>) + ; CHECK: $q0 = COPY [[LSHR]](<16 x s8>) + %0:_(<16 x s8>) = COPY $q0 + %1:_(<16 x s8>) = COPY $q1 + %2:_(<16 x s8>) = G_LSHR %0, %1 + $q0 = COPY %2 +... +--- +name: test_lshr_v8i16 +body: | + bb.0: + ; CHECK-LABEL: name: test_lshr_v8i16 + ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<8 x s16>) = COPY $q1 + ; CHECK: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[COPY]], [[COPY1]](<8 x s16>) + ; CHECK: $q0 = COPY [[LSHR]](<8 x s16>) + %0:_(<8 x s16>) = COPY $q0 + %1:_(<8 x s16>) = COPY $q1 + %2:_(<8 x s16>) = G_LSHR %0, %1 + $q0 = COPY %2 +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-br.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-br.mir index 051f33dabf4c8..0631ff89ade0d 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-br.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-br.mir @@ -1,5 +1,10 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -debugify-and-strip-all-safe -O0 -run-pass=aarch64-prelegalizer-combiner -global-isel -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -debugify-and-strip-all-safe -O0 -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="opt_brcond_by_inverting_cond" -global-isel -verify-machineinstrs %s -o - | FileCheck %s + +# Need asserts for the only-enable-rule to work. 
+ +# REQUIRES: asserts + --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "arm64-apple-ios5.0.0" @@ -38,8 +43,11 @@ body: | ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[COPY]](s32), [[C]] - ; CHECK: G_BRCOND [[ICMP]](s1), %bb.2 + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY]](s32), [[C]] + ; CHECK: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; CHECK: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[C2]] + ; CHECK: G_BRCOND [[XOR]](s1), %bb.2 + ; CHECK: G_BR %bb.1 ; CHECK: bb.1.if.then: ; CHECK: successors: %bb.3(0x80000000) ; CHECK: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[COPY1]], [[COPY]] diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir index 213b9edf137af..3f1515955d3af 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=aarch64-unknown-unknown -verify-machineinstrs -O0 -run-pass=regbankselect %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-unknown-unknown -verify-machineinstrs -O0 -run-pass=regbankselect -global-isel-abort=1 %s -o - | FileCheck %s name: v2s32_fpr alignment: 4 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-binop.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-binop.mir index 2c53f6df4d4fa..f6aa16784b25e 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-binop.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-binop.mir @@ -330,7 +330,6 @@ body: | ; CHECK: bb.0: ; CHECK: successors: %bb.1(0x80000000) ; CHECK: [[COPY:%[0-9]+]]:gpr32sp = COPY $w0 - ; CHECK: B %bb.1 ; CHECK: bb.1: ; CHECK: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[COPY]], 1, 0 ; CHECK: $w0 = COPY [[ADDWri]] diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-constant.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-constant.mir index e25c84958b9db..c280f000b174e 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-constant.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-constant.mir @@ -8,6 +8,8 @@ define i16 @const_s16() { ret i16 42 } define i32 @const_s32() { ret i32 42 } define i64 @const_s64() { ret i64 1234567890123 } + define i32 @const_s32_zero() { ret i32 0 } + define i64 @const_s64_zero() { ret i64 0 } define i8* @const_p0_0() { ret i8* null } define i32 @fconst_s32() { ret i32 42 } @@ -81,6 +83,38 @@ body: | $x0 = COPY %0(s64) ... +--- +name: const_s32_zero +legalized: true +regBankSelected: true +registers: + - { id: 0, class: gpr } + +body: | + bb.0: + ; CHECK-LABEL: name: const_s32_zero + ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY $wzr + ; CHECK: $w0 = COPY [[COPY]] + %0(s32) = G_CONSTANT i32 0 + $w0 = COPY %0(s32) +... + +--- +name: const_s64_zero +legalized: true +regBankSelected: true +registers: + - { id: 0, class: gpr } + +body: | + bb.0: + ; CHECK-LABEL: name: const_s64_zero + ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $xzr + ; CHECK: $x0 = COPY [[COPY]] + %0(s64) = G_CONSTANT i64 0 + $x0 = COPY %0(s64) +... 
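The two zero cases above pin down a small selection detail: an integer zero is not materialized with a move-immediate, it is copied out of the dedicated zero register, $wzr for 32 bits and $xzr for 64. A toy model of that decision (materializeZero is illustrative only; the real selector of course emits MachineInstrs, not strings):

    #include <string>

    // Pick the zero register that matches the requested width; assumes
    // Bits is 32 or 64, the two cases the tests cover.
    std::string materializeZero(unsigned Bits) {
      return Bits == 64 ? "COPY $xzr" : "COPY $wzr";
    }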
+ --- name: const_p0_0 legalized: true diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-jump-table-brjt-constrain.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-jump-table-brjt-constrain.mir index 082bf43061da4..6df6573b35337 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-jump-table-brjt-constrain.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-jump-table-brjt-constrain.mir @@ -35,7 +35,6 @@ body: | ; CHECK: BR %6 ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x80000000) - ; CHECK: B %bb.3 ; CHECK: bb.3: ; CHECK: RET_ReallyLR bb.1: diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-returnaddress-liveins.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-returnaddress-liveins.mir index a309daab0b4ce..f0ae4f17b2ee3 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-returnaddress-liveins.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-returnaddress-liveins.mir @@ -19,7 +19,6 @@ body: | ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $w0, $x0, $lr ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $lr - ; CHECK: B %bb.1 ; CHECK: bb.1: ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY [[COPY]] ; CHECK: $x0 = COPY [[COPY1]] @@ -47,7 +46,6 @@ body: | ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $w0, $x0, $lr ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $lr - ; CHECK: B %bb.1 ; CHECK: bb.1: ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY [[COPY]] ; CHECK: $x0 = COPY [[COPY1]] @@ -78,7 +76,6 @@ body: | ; CHECK: liveins: $w0, $x0, $lr ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $lr ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY [[COPY]] - ; CHECK: B %bb.1 ; CHECK: bb.1: ; CHECK: $x0 = COPY [[COPY1]] ; CHECK: [[COPY2:%[0-9]+]]:gpr64 = COPY [[COPY]] diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir index db355dfc151f5..05038b40ca365 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir @@ -39,6 +39,9 @@ define void @store_8xi16(<8 x i16> %v, <8 x i16>* %ptr) { ret void } define void @store_16xi8(<16 x i8> %v, <16 x i8>* %ptr) { ret void } + @x = external hidden local_unnamed_addr global i32*, align 8 + define void @store_adrp_add_low() { ret void } + ... --- @@ -600,3 +603,20 @@ body: | RET_ReallyLR ... +--- +name: store_adrp_add_low +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: store_adrp_add_low + ; CHECK: liveins: $x0 + ; CHECK: %copy:gpr64 = COPY $x0 + ; CHECK: %adrp:gpr64common = ADRP target-flags(aarch64-page) @x + ; CHECK: STRXui %copy, %adrp, target-flags(aarch64-pageoff, aarch64-nc) @x :: (store 8 into @x) + %copy:gpr(p0) = COPY $x0 + %adrp:gpr64(p0) = ADRP target-flags(aarch64-page) @x + %add_low:gpr(p0) = G_ADD_LOW %adrp(p0), target-flags(aarch64-pageoff, aarch64-nc) @x + G_STORE %copy(p0), %add_low(p0) :: (store 8 into @x) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-uaddo.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-uaddo.mir index 96f9ad2b0634e..135932bdfb0c4 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-uaddo.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-uaddo.mir @@ -60,3 +60,54 @@ body: | RET_ReallyLR implicit $w0 ... +--- +name: uaddo_s32_imm +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $w0, $w1, $x2 + ; Check that we get ADDSWri when we can fold in a constant. 
+ ; + ; CHECK-LABEL: name: uaddo_s32_imm + ; CHECK: liveins: $w0, $w1, $x2 + ; CHECK: %copy:gpr32sp = COPY $w0 + ; CHECK: %add:gpr32 = ADDSWri %copy, 16, 0, implicit-def $nzcv + ; CHECK: %overflow:gpr32 = CSINCWr $wzr, $wzr, 3, implicit $nzcv + ; CHECK: $w0 = COPY %add + ; CHECK: RET_ReallyLR implicit $w0 + %copy:gpr(s32) = COPY $w0 + %constant:gpr(s32) = G_CONSTANT i32 16 + %add:gpr(s32), %overflow:gpr(s1) = G_UADDO %copy, %constant + $w0 = COPY %add(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: uaddo_s32_shifted +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $w0, $w1, $x2 + ; Check that we get ADDSWrs when we can fold in a shift. + ; + ; CHECK-LABEL: name: uaddo_s32_shifted + ; CHECK: liveins: $w0, $w1, $x2 + ; CHECK: %copy1:gpr32 = COPY $w0 + ; CHECK: %copy2:gpr32 = COPY $w1 + ; CHECK: %add:gpr32 = ADDSWrs %copy1, %copy2, 16, implicit-def $nzcv + ; CHECK: %overflow:gpr32 = CSINCWr $wzr, $wzr, 3, implicit $nzcv + ; CHECK: $w0 = COPY %add + ; CHECK: RET_ReallyLR implicit $w0 + %copy1:gpr(s32) = COPY $w0 + %copy2:gpr(s32) = COPY $w1 + %constant:gpr(s32) = G_CONSTANT i32 16 + %shift:gpr(s32) = G_SHL %copy2(s32), %constant(s32) + %add:gpr(s32), %overflow:gpr(s1) = G_UADDO %copy1, %shift + $w0 = COPY %add(s32) + RET_ReallyLR implicit $w0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-xor.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-xor.mir index cc75386271c86..5b39ade02774b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-xor.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-xor.mir @@ -132,7 +132,6 @@ body: | ; CHECK-LABEL: name: xor_constant_n1_s32_gpr_2bb ; CHECK: bb.0: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: B %bb.1 ; CHECK: bb.1: ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 ; CHECK: [[ORNWrr:%[0-9]+]]:gpr32 = ORNWrr $wzr, [[COPY]] diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll b/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll index a4a1747b05af9..cbfadbdb5d720 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll @@ -131,8 +131,6 @@ define float @foo_loop(%swift_error** swifterror %error_ptr_ref, i32 %cc, float ; CHECK: malloc ; CHECK: mov x21, x0 ; CHECK: strb w{{.*}}, [x0, #8] -; CHECK: fcmp -; CHECK: b.le ; CHECK: ret entry: diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index c3740f1d1e96b..364c58f4acdf7 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -123,6 +123,7 @@ ; CHECK-NEXT: AArch64 Stack Tagging PreRA ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Machine Natural Loop Construction +; CHECK-NEXT: Machine Block Frequency Analysis ; CHECK-NEXT: Early Machine Loop Invariant Code Motion ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Machine Block Frequency Analysis diff --git a/llvm/test/CodeGen/AArch64/arm64-aapcs.ll b/llvm/test/CodeGen/AArch64/arm64-aapcs.ll index 7887facb9accc..ac1678569ecb4 100644 --- a/llvm/test/CodeGen/AArch64/arm64-aapcs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-aapcs.ll @@ -90,8 +90,8 @@ declare void @variadic(i32 %a, ...) ; others. The extra arguments should go in registers rather than on the stack. define void @test_variadic() { call void(i32, ...) 
@variadic(i32 0, i64 1, double 2.0) -; CHECK: fmov d0, #2.0 ; CHECK: mov w1, #1 +; CHECK: fmov d0, #2.0 ; CHECK: bl variadic ret void } diff --git a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll index 7c546936ba27a..392af063eb8a0 100644 --- a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll @@ -4,8 +4,8 @@ define i32 @fptosi_wh(half %a) nounwind ssp { entry: ; CHECK-LABEL: fptosi_wh -; CHECK: fcvt s0, h0 -; CHECK: fcvtzs [[REG:w[0-9]+]], s0 +; CHECK: fcvt s1, h0 +; CHECK: fcvtzs [[REG:w[0-9]+]], s1 ; CHECK: mov w0, [[REG]] %conv = fptosi half %a to i32 ret i32 %conv @@ -15,8 +15,8 @@ entry: define i32 @fptoui_swh(half %a) nounwind ssp { entry: ; CHECK-LABEL: fptoui_swh -; CHECK: fcvt s0, h0 -; CHECK: fcvtzu [[REG:w[0-9]+]], s0 +; CHECK: fcvt s1, h0 +; CHECK: fcvtzu [[REG:w[0-9]+]], s1 ; CHECK: mov w0, [[REG]] %conv = fptoui half %a to i32 ret i32 %conv diff --git a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll index d8abf14c1366b..ed03aec07e7da 100644 --- a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll @@ -54,8 +54,8 @@ entry: ; CHECK: ldrh w8, [sp, #12] ; CHECK: str w8, [sp, #8] ; CHECK: ldr w8, [sp, #8] -; CHECK: ; kill: def $x8 killed $w8 -; CHECK: str x8, [sp] +; CHECK: mov x9, x8 +; CHECK: str x9, [sp] ; CHECK: ldr x0, [sp] ; CHECK: ret %a.addr = alloca i8, align 1 @@ -109,8 +109,8 @@ entry: ; CHECK: strh w8, [sp, #12] ; CHECK: ldrsh w8, [sp, #12] ; CHECK: str w8, [sp, #8] -; CHECK: ldrsw x8, [sp, #8] -; CHECK: str x8, [sp] +; CHECK: ldrsw x9, [sp, #8] +; CHECK: str x9, [sp] ; CHECK: ldr x0, [sp] ; CHECK: ret %a.addr = alloca i8, align 1 diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt.ll index 9ab7247677070..67eba3f4e3075 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vcvt.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vcvt.ll @@ -3,6 +3,7 @@ ; RUN: -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | \ ; RUN: FileCheck %s --check-prefixes=FALLBACK,CHECK +; FALLBACK-NOT: remark{{.*}}fcvtas_2s define <2 x i32> @fcvtas_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtas_2s: ;CHECK-NOT: ld1 @@ -12,6 +13,7 @@ define <2 x i32> @fcvtas_2s(<2 x float> %A) nounwind { ret <2 x i32> %tmp3 } +; FALLBACK-NOT: remark{{.*}}fcvtas_4s define <4 x i32> @fcvtas_4s(<4 x float> %A) nounwind { ;CHECK-LABEL: fcvtas_4s: ;CHECK-NOT: ld1 @@ -21,6 +23,7 @@ define <4 x i32> @fcvtas_4s(<4 x float> %A) nounwind { ret <4 x i32> %tmp3 } +; FALLBACK-NOT: remark{{.*}}fcvtas_2d define <2 x i64> @fcvtas_2d(<2 x double> %A) nounwind { ;CHECK-LABEL: fcvtas_2d: ;CHECK-NOT: ld1 diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll index e1e889b906c01..6b3e8d747d43d 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll @@ -285,11 +285,11 @@ define i16 @to_half(float %in) { ; FAST: // %bb.0: ; FAST-NEXT: sub sp, sp, #16 // =16 ; FAST-NEXT: .cfi_def_cfa_offset 16 -; FAST-NEXT: fcvt h0, s0 +; FAST-NEXT: fcvt h1, s0 ; FAST-NEXT: // implicit-def: $w0 -; FAST-NEXT: fmov s1, w0 -; FAST-NEXT: mov.16b v1, v0 -; FAST-NEXT: fmov w8, s1 +; FAST-NEXT: fmov s0, w0 +; FAST-NEXT: mov.16b v0, v1 +; FAST-NEXT: fmov w8, s0 ; FAST-NEXT: mov w0, w8 ; FAST-NEXT: str w0, [sp, #12] // 4-byte Folded 
Spill ; FAST-NEXT: mov w0, w8 diff --git a/llvm/test/CodeGen/AArch64/convertphitype.ll b/llvm/test/CodeGen/AArch64/convertphitype.ll index bb82ea2905c1c..bc858aa11eb78 100644 --- a/llvm/test/CodeGen/AArch64/convertphitype.ll +++ b/llvm/test/CodeGen/AArch64/convertphitype.ll @@ -70,14 +70,13 @@ define float @convphi3(i32 *%s, i32 *%d, i32 %n, float %f) { ; CHECK-LABEL: @convphi3( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: [[FB:%.*]] = bitcast float [[F:%.*]] to i32 ; CHECK-NEXT: br i1 [[CMP15]], label [[THEN:%.*]], label [[END:%.*]] ; CHECK: then: ; CHECK-NEXT: [[LS:%.*]] = load i32, i32* [[S:%.*]], align 4 ; CHECK-NEXT: [[LS_BC:%.*]] = bitcast i32 [[LS]] to float ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: [[PHI_TC:%.*]] = phi float [ [[LS_BC]], [[THEN]] ], [ [[F]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[PHI_TC:%.*]] = phi float [ [[LS_BC]], [[THEN]] ], [ [[F:%.*]], [[ENTRY:%.*]] ] ; CHECK-NEXT: ret float [[PHI_TC]] ; entry: @@ -99,14 +98,13 @@ define void @convphi4(i32 *%s, i32 *%d, i32 %n, float %f) { ; CHECK-LABEL: @convphi4( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: [[FB:%.*]] = bitcast float [[F:%.*]] to i32 ; CHECK-NEXT: br i1 [[CMP15]], label [[THEN:%.*]], label [[END:%.*]] ; CHECK: then: ; CHECK-NEXT: [[LS:%.*]] = load i32, i32* [[S:%.*]], align 4 ; CHECK-NEXT: [[LS_BC:%.*]] = bitcast i32 [[LS]] to float ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: [[PHI_TC:%.*]] = phi float [ [[LS_BC]], [[THEN]] ], [ [[F]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[PHI_TC:%.*]] = phi float [ [[LS_BC]], [[THEN]] ], [ [[F:%.*]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[BC:%.*]] = bitcast float [[PHI_TC]] to i32 ; CHECK-NEXT: store i32 [[BC]], i32* [[D:%.*]], align 4 ; CHECK-NEXT: ret void @@ -481,6 +479,401 @@ end: ret float %b } +define void @convphi_stop(i32 *%s, i32 *%d, float *%e, i32 %n) { +; CHECK-LABEL: @convphi_stop( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP15]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[LS:%.*]] = load i32, i32* [[S:%.*]], align 4 +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: else: +; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[D:%.*]], align 4 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[LS]], [[THEN]] ], [ [[LD]], [[ELSE]] ] +; CHECK-NEXT: [[B:%.*]] = bitcast i32 [[PHI]] to float +; CHECK-NEXT: store float [[B]], float* [[E:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + %cmp15 = icmp sgt i32 %n, 0 + br i1 %cmp15, label %then, label %else + +then: + %ls = load i32, i32* %s, align 4 + br label %end + +else: + %ld = load i32, i32* %d, align 4 + br label %end + +end: + %phi = phi i32 [ %ls, %then ], [ %ld, %else ] + %b = bitcast i32 %phi to float + store float %b, float* %e, align 4 + ret void +} + +define void @convphi_stop2(i32 *%s, i32 *%d, float *%e, i32 %n) { +; CHECK-LABEL: @convphi_stop2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP15]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[LS:%.*]] = load i32, i32* [[S:%.*]], align 4 +; CHECK-NEXT: [[LSB:%.*]] = bitcast i32 [[LS]] to float +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: else: +; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[D:%.*]], align 4 +; CHECK-NEXT: [[LDB:%.*]] = bitcast i32 [[LD]] to float +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[PHI:%.*]] = phi float [ 
[[LSB]], [[THEN]] ], [ [[LDB]], [[ELSE]] ] +; CHECK-NEXT: store float [[PHI]], float* [[E:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + %cmp15 = icmp sgt i32 %n, 0 + br i1 %cmp15, label %then, label %else + +then: + %ls = load i32, i32* %s, align 4 + %lsb = bitcast i32 %ls to float + br label %end + +else: + %ld = load i32, i32* %d, align 4 + %ldb = bitcast i32 %ld to float + br label %end + +end: + %phi = phi float [ %lsb, %then ], [ %ldb, %else ] + store float %phi, float* %e, align 4 + ret void +} + +define float @convphi_stop3(i32 *%s, i32 *%d, float *%e, i32 %n) { +; CHECK-LABEL: @convphi_stop3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP15]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[LS:%.*]] = load i32, i32* [[S:%.*]], align 4 +; CHECK-NEXT: [[LS_BC:%.*]] = bitcast i32 [[LS]] to float +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: else: +; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[D:%.*]], align 4 +; CHECK-NEXT: [[LD_BC:%.*]] = bitcast i32 [[LD]] to float +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[PHI_TC:%.*]] = phi float [ [[LS_BC]], [[THEN]] ], [ [[LD_BC]], [[ELSE]] ] +; CHECK-NEXT: store float [[PHI_TC]], float* [[E:%.*]], align 4 +; CHECK-NEXT: ret float [[PHI_TC]] +; +entry: + %cmp15 = icmp sgt i32 %n, 0 + br i1 %cmp15, label %then, label %else + +then: + %ls = load i32, i32* %s, align 4 + br label %end + +else: + %ld = load i32, i32* %d, align 4 + br label %end + +end: + %phi = phi i32 [ %ls, %then ], [ %ld, %else ] + %b = bitcast i32 %phi to float + store float %b, float* %e, align 4 + ret float %b +} + +define void @convphi_stop4(i32 *%s, i32 *%d, float *%e, i32 %n) { +; CHECK-LABEL: @convphi_stop4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[D:%.*]], align 4 +; CHECK-NEXT: [[LD_BC:%.*]] = bitcast i32 [[LD]] to float +; CHECK-NEXT: br i1 [[CMP15]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: [[LS:%.*]] = load i32, i32* [[S:%.*]], align 4 +; CHECK-NEXT: [[LS_BC:%.*]] = bitcast i32 [[LS]] to float +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[PHI_TC:%.*]] = phi float [ [[LS_BC]], [[THEN]] ], [ [[LD_BC]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: [[BC:%.*]] = bitcast float [[PHI_TC]] to i32 +; CHECK-NEXT: store i32 [[BC]], i32* [[S]], align 4 +; CHECK-NEXT: br i1 [[TMP0]], label [[THEN2:%.*]], label [[END2:%.*]] +; CHECK: then2: +; CHECK-NEXT: [[LF:%.*]] = load float, float* [[E:%.*]], align 4 +; CHECK-NEXT: br label [[END2]] +; CHECK: end2: +; CHECK-NEXT: [[PHI2:%.*]] = phi float [ [[PHI_TC]], [[END]] ], [ [[LF]], [[THEN2]] ] +; CHECK-NEXT: store float [[PHI2]], float* [[E]], align 4 +; CHECK-NEXT: ret void +; +entry: + %cmp15 = icmp sgt i32 %n, 0 + %ld = load i32, i32* %d, align 4 + br i1 %cmp15, label %then, label %end + +then: + %ls = load i32, i32* %s, align 4 + br label %end + +end: + %phi = phi i32 [ %ls, %then ], [ %ld, %entry ] + %phib = bitcast i32 %phi to float + store i32 %phi, i32* %s, align 4 + br i1 %cmp15, label %then2, label %end2 + +then2: + %lf = load float, float* %e, align 4 + br label %end2 + +end2: + %phi2 = phi float [ %phib, %end ], [ %lf, %then2 ] + store float %phi2, float* %e, align 4 + ret void +} + +define float @multiuse(i32 *%s, i32 *%d, i32 %n) { +; CHECK-LABEL: @multiuse( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br 
i1 [[CMP15]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[LS:%.*]] = load i32, i32* [[S:%.*]], align 4 +; CHECK-NEXT: [[A:%.*]] = add i32 [[LS]], 2 +; CHECK-NEXT: store i32 [[A]], i32* [[D:%.*]], align 4 +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: else: +; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[D]], align 4 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[LS]], [[THEN]] ], [ [[LD]], [[ELSE]] ] +; CHECK-NEXT: [[B:%.*]] = bitcast i32 [[PHI]] to float +; CHECK-NEXT: ret float [[B]] +; +entry: + %cmp15 = icmp sgt i32 %n, 0 + br i1 %cmp15, label %then, label %else + +then: + %ls = load i32, i32* %s, align 4 + %a = add i32 %ls, 2 + store i32 %a, i32* %d, align 4 + br label %end + +else: + %ld = load i32, i32* %d, align 4 + br label %end + +end: + %phi = phi i32 [ %ls, %then ], [ %ld, %else ] + %b = bitcast i32 %phi to float + ret float %b +} + +define float @convphi_volatile(i32 *%s, i32 *%d, i32 %n) { +; CHECK-LABEL: @convphi_volatile( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP15]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[LS:%.*]] = load volatile i32, i32* [[S:%.*]], align 4 +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: else: +; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[D:%.*]], align 4 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[LS]], [[THEN]] ], [ [[LD]], [[ELSE]] ] +; CHECK-NEXT: [[B:%.*]] = bitcast i32 [[PHI]] to float +; CHECK-NEXT: ret float [[B]] +; +; DEBUG-LABEL: @convphi_volatile( +; DEBUG-NEXT: entry: +; DEBUG-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg !358 +; DEBUG-NEXT: call void @llvm.dbg.value(metadata i1 [[CMP15]], metadata !353, metadata !DIExpression()), !dbg !358 +; DEBUG-NEXT: br i1 [[CMP15]], label [[THEN:%.*]], label [[ELSE:%.*]], !dbg !359 +; DEBUG: then: +; DEBUG-NEXT: [[LS:%.*]] = load volatile i32, i32* [[S:%.*]], align 4, !dbg !360 +; DEBUG-NEXT: call void @llvm.dbg.value(metadata i32 [[LS]], metadata !354, metadata !DIExpression()), !dbg !360 +; DEBUG-NEXT: br label [[END:%.*]], !dbg !361 +; DEBUG: else: +; DEBUG-NEXT: [[LD:%.*]] = load i32, i32* [[D:%.*]], align 4, !dbg !362 +; DEBUG-NEXT: call void @llvm.dbg.value(metadata i32 [[LD]], metadata !355, metadata !DIExpression()), !dbg !362 +; DEBUG-NEXT: br label [[END]], !dbg !363 +; DEBUG: end: +; DEBUG-NEXT: [[PHI:%.*]] = phi i32 [ [[LS]], [[THEN]] ], [ [[LD]], [[ELSE]] ], !dbg !364 +; DEBUG-NEXT: call void @llvm.dbg.value(metadata i32 [[PHI]], metadata !356, metadata !DIExpression()), !dbg !364 +; DEBUG-NEXT: [[B:%.*]] = bitcast i32 [[PHI]] to float, !dbg !365 +; DEBUG-NEXT: call void @llvm.dbg.value(metadata float [[B]], metadata !357, metadata !DIExpression()), !dbg !365 +; DEBUG-NEXT: ret float [[B]], !dbg !366 +; +entry: + %cmp15 = icmp sgt i32 %n, 0 + br i1 %cmp15, label %then, label %else + +then: + %ls = load volatile i32, i32* %s, align 4 + br label %end + +else: + %ld = load i32, i32* %d, align 4 + br label %end + +end: + %phi = phi i32 [ %ls, %then ], [ %ld, %else ] + %b = bitcast i32 %phi to float + ret float %b +} + +define void @convphi_volatile2(i32 *%s, i32 *%d, i32 %n, float %f) { +; CHECK-LABEL: @convphi_volatile2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: [[FB:%.*]] = bitcast float [[F:%.*]] to i32 +; CHECK-NEXT: br i1 [[CMP15]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: [[LS:%.*]] = load i32, i32* 
[[S:%.*]], align 4 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[LS]], [[THEN]] ], [ [[FB]], [[ENTRY:%.*]] ] +; CHECK-NEXT: store volatile i32 [[PHI]], i32* [[D:%.*]], align 4 +; CHECK-NEXT: ret void +; +; DEBUG-LABEL: @convphi_volatile2( +; DEBUG-NEXT: entry: +; DEBUG-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg !373 +; DEBUG-NEXT: call void @llvm.dbg.value(metadata i1 [[CMP15]], metadata !369, metadata !DIExpression()), !dbg !373 +; DEBUG-NEXT: [[FB:%.*]] = bitcast float [[F:%.*]] to i32, !dbg !374 +; DEBUG-NEXT: call void @llvm.dbg.value(metadata i32 [[FB]], metadata !370, metadata !DIExpression()), !dbg !374 +; DEBUG-NEXT: br i1 [[CMP15]], label [[THEN:%.*]], label [[END:%.*]], !dbg !375 +; DEBUG: then: +; DEBUG-NEXT: [[LS:%.*]] = load i32, i32* [[S:%.*]], align 4, !dbg !376 +; DEBUG-NEXT: call void @llvm.dbg.value(metadata i32 [[LS]], metadata !371, metadata !DIExpression()), !dbg !376 +; DEBUG-NEXT: br label [[END]], !dbg !377 +; DEBUG: end: +; DEBUG-NEXT: [[PHI:%.*]] = phi i32 [ [[LS]], [[THEN]] ], [ [[FB]], [[ENTRY:%.*]] ], !dbg !378 +; DEBUG-NEXT: call void @llvm.dbg.value(metadata i32 [[PHI]], metadata !372, metadata !DIExpression()), !dbg !378 +; DEBUG-NEXT: store volatile i32 [[PHI]], i32* [[D:%.*]], align 4, !dbg !379 +; DEBUG-NEXT: ret void, !dbg !380 +; +entry: + %cmp15 = icmp sgt i32 %n, 0 + %fb = bitcast float %f to i32 + br i1 %cmp15, label %then, label %end + +then: + %ls = load i32, i32* %s, align 4 + br label %end + +end: + %phi = phi i32 [ %ls, %then ], [ %fb, %entry ] + store volatile i32 %phi, i32 *%d + ret void +} + +define float @convphi_atomic(i32 *%s, i32 *%d, i32 %n) { +; CHECK-LABEL: @convphi_atomic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP15]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[LS:%.*]] = load atomic i32, i32* [[S:%.*]] acquire, align 4 +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: else: +; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[D:%.*]], align 4 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[LS]], [[THEN]] ], [ [[LD]], [[ELSE]] ] +; CHECK-NEXT: [[B:%.*]] = bitcast i32 [[PHI]] to float +; CHECK-NEXT: ret float [[B]] +; +; DEBUG-LABEL: @convphi_atomic( +; DEBUG-NEXT: entry: +; DEBUG-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg !388 +; DEBUG-NEXT: call void @llvm.dbg.value(metadata i1 [[CMP15]], metadata !383, metadata !DIExpression()), !dbg !388 +; DEBUG-NEXT: br i1 [[CMP15]], label [[THEN:%.*]], label [[ELSE:%.*]], !dbg !389 +; DEBUG: then: +; DEBUG-NEXT: [[LS:%.*]] = load atomic i32, i32* [[S:%.*]] acquire, align 4, !dbg !390 +; DEBUG-NEXT: call void @llvm.dbg.value(metadata i32 [[LS]], metadata !384, metadata !DIExpression()), !dbg !390 +; DEBUG-NEXT: br label [[END:%.*]], !dbg !391 +; DEBUG: else: +; DEBUG-NEXT: [[LD:%.*]] = load i32, i32* [[D:%.*]], align 4, !dbg !392 +; DEBUG-NEXT: call void @llvm.dbg.value(metadata i32 [[LD]], metadata !385, metadata !DIExpression()), !dbg !392 +; DEBUG-NEXT: br label [[END]], !dbg !393 +; DEBUG: end: +; DEBUG-NEXT: [[PHI:%.*]] = phi i32 [ [[LS]], [[THEN]] ], [ [[LD]], [[ELSE]] ], !dbg !394 +; DEBUG-NEXT: call void @llvm.dbg.value(metadata i32 [[PHI]], metadata !386, metadata !DIExpression()), !dbg !394 +; DEBUG-NEXT: [[B:%.*]] = bitcast i32 [[PHI]] to float, !dbg !395 +; DEBUG-NEXT: call void @llvm.dbg.value(metadata float [[B]], metadata !387, metadata !DIExpression()), !dbg !395 +; DEBUG-NEXT: ret float [[B]], !dbg 
!396 +; +entry: + %cmp15 = icmp sgt i32 %n, 0 + br i1 %cmp15, label %then, label %else +then: + %ls = load atomic i32, i32* %s acquire, align 4 + br label %end +else: + %ld = load i32, i32* %d, align 4 + br label %end + +end: + %phi = phi i32 [ %ls, %then ], [ %ld, %else ] + %b = bitcast i32 %phi to float + ret float %b +} +define void @convphi_atomic2(i32 *%s, i32 *%d, i32 %n, float %f) { +; CHECK-LABEL: @convphi_atomic2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: [[FB:%.*]] = bitcast float [[F:%.*]] to i32 +; CHECK-NEXT: br i1 [[CMP15]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: [[LS:%.*]] = load i32, i32* [[S:%.*]], align 4 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[LS]], [[THEN]] ], [ [[FB]], [[ENTRY:%.*]] ] +; CHECK-NEXT: store atomic i32 [[PHI]], i32* [[D:%.*]] release, align 4 +; CHECK-NEXT: ret void +; +; DEBUG-LABEL: @convphi_atomic2( +; DEBUG-NEXT: entry: +; DEBUG-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg !403 +; DEBUG-NEXT: call void @llvm.dbg.value(metadata i1 [[CMP15]], metadata !399, metadata !DIExpression()), !dbg !403 +; DEBUG-NEXT: [[FB:%.*]] = bitcast float [[F:%.*]] to i32, !dbg !404 +; DEBUG-NEXT: call void @llvm.dbg.value(metadata i32 [[FB]], metadata !400, metadata !DIExpression()), !dbg !404 +; DEBUG-NEXT: br i1 [[CMP15]], label [[THEN:%.*]], label [[END:%.*]], !dbg !405 +; DEBUG: then: +; DEBUG-NEXT: [[LS:%.*]] = load i32, i32* [[S:%.*]], align 4, !dbg !406 +; DEBUG-NEXT: call void @llvm.dbg.value(metadata i32 [[LS]], metadata !401, metadata !DIExpression()), !dbg !406 +; DEBUG-NEXT: br label [[END]], !dbg !407 +; DEBUG: end: +; DEBUG-NEXT: [[PHI:%.*]] = phi i32 [ [[LS]], [[THEN]] ], [ [[FB]], [[ENTRY:%.*]] ], !dbg !408 +; DEBUG-NEXT: call void @llvm.dbg.value(metadata i32 [[PHI]], metadata !402, metadata !DIExpression()), !dbg !408 +; DEBUG-NEXT: store atomic i32 [[PHI]], i32* [[D:%.*]] release, align 4, !dbg !409 +; DEBUG-NEXT: ret void, !dbg !410 +; +entry: + %cmp15 = icmp sgt i32 %n, 0 + %fb = bitcast float %f to i32 + br i1 %cmp15, label %then, label %end + +then: + %ls = load i32, i32* %s, align 4 + br label %end + +end: + %phi = phi i32 [ %ls, %then ], [ %fb, %entry ] + store atomic i32 %phi, i32 *%d release, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AArch64/faddp-half.ll b/llvm/test/CodeGen/AArch64/faddp-half.ll new file mode 100644 index 0000000000000..449b9a5b8c922 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/faddp-half.ll @@ -0,0 +1,141 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=aarch64 -mattr=+fullfp16 < %s | FileCheck %s +; RUN: llc --mtriple=aarch64 < %s | FileCheck %s --check-prefix=CHECKNOFP16 + +define half @faddp_2xhalf(<2 x half> %a) { +; CHECK-LABEL: faddp_2xhalf: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: faddp h0, v0.2h +; CHECK-NEXT: ret +; +; CHECKNOFP16-LABEL: faddp_2xhalf: +; CHECKNOFP16: // %bb.0: // %entry +; CHECKNOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECKNOFP16-NEXT: dup v1.4h, v0.h[1] +; CHECKNOFP16-NEXT: fcvtl v1.4s, v1.4h +; CHECKNOFP16-NEXT: fcvtl v0.4s, v0.4h +; CHECKNOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECKNOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECKNOFP16-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECKNOFP16-NEXT: ret +entry: + %shift = shufflevector <2 x half> %a, <2 x half> undef, <2 x i32> + %0 = fadd <2 x half> %a, %shift + %1 = extractelement <2 x 
half> %0, i32 0 + ret half %1 +} + +define half @faddp_2xhalf_commute(<2 x half> %a) { +; CHECK-LABEL: faddp_2xhalf_commute: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: faddp h0, v0.2h +; CHECK-NEXT: ret +; +; CHECKNOFP16-LABEL: faddp_2xhalf_commute: +; CHECKNOFP16: // %bb.0: // %entry +; CHECKNOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECKNOFP16-NEXT: dup v1.4h, v0.h[1] +; CHECKNOFP16-NEXT: fcvtl v1.4s, v1.4h +; CHECKNOFP16-NEXT: fcvtl v0.4s, v0.4h +; CHECKNOFP16-NEXT: fadd v0.4s, v1.4s, v0.4s +; CHECKNOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECKNOFP16-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECKNOFP16-NEXT: ret +entry: + %shift = shufflevector <2 x half> %a, <2 x half> undef, <2 x i32> + %0 = fadd <2 x half> %shift, %a + %1 = extractelement <2 x half> %0, i32 0 + ret half %1 +} + +define half @faddp_4xhalf(<4 x half> %a) { +; CHECK-LABEL: faddp_4xhalf: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: faddp h0, v0.2h +; CHECK-NEXT: ret +; +; CHECKNOFP16-LABEL: faddp_4xhalf: +; CHECKNOFP16: // %bb.0: // %entry +; CHECKNOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECKNOFP16-NEXT: dup v1.4h, v0.h[1] +; CHECKNOFP16-NEXT: fcvtl v1.4s, v1.4h +; CHECKNOFP16-NEXT: fcvtl v0.4s, v0.4h +; CHECKNOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECKNOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECKNOFP16-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECKNOFP16-NEXT: ret +entry: + %shift = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> + %0 = fadd <4 x half> %a, %shift + %1 = extractelement <4 x half> %0, i32 0 + ret half %1 +} + +define half @faddp_4xhalf_commute(<4 x half> %a) { +; CHECK-LABEL: faddp_4xhalf_commute: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: faddp h0, v0.2h +; CHECK-NEXT: ret +; +; CHECKNOFP16-LABEL: faddp_4xhalf_commute: +; CHECKNOFP16: // %bb.0: // %entry +; CHECKNOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECKNOFP16-NEXT: dup v1.4h, v0.h[1] +; CHECKNOFP16-NEXT: fcvtl v1.4s, v1.4h +; CHECKNOFP16-NEXT: fcvtl v0.4s, v0.4h +; CHECKNOFP16-NEXT: fadd v0.4s, v1.4s, v0.4s +; CHECKNOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECKNOFP16-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECKNOFP16-NEXT: ret +entry: + %shift = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> + %0 = fadd <4 x half> %shift, %a + %1 = extractelement <4 x half> %0, i32 0 + ret half %1 +} + +define half @faddp_8xhalf(<8 x half> %a) { +; CHECK-LABEL: faddp_8xhalf: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: faddp h0, v0.2h +; CHECK-NEXT: ret +; +; CHECKNOFP16-LABEL: faddp_8xhalf: +; CHECKNOFP16: // %bb.0: // %entry +; CHECKNOFP16-NEXT: dup v1.8h, v0.h[1] +; CHECKNOFP16-NEXT: fcvt s0, h0 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fadd s0, s0, s1 +; CHECKNOFP16-NEXT: fcvt h0, s0 +; CHECKNOFP16-NEXT: ret +entry: + %shift = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> + %0 = fadd <8 x half> %a, %shift + %1 = extractelement <8 x half> %0, i32 0 + ret half %1 +} + +define half @faddp_8xhalf_commute(<8 x half> %a) { +; CHECK-LABEL: faddp_8xhalf_commute: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: faddp h0, v0.2h +; CHECK-NEXT: ret +; +; CHECKNOFP16-LABEL: faddp_8xhalf_commute: +; CHECKNOFP16: // %bb.0: // %entry +; CHECKNOFP16-NEXT: dup v1.8h, v0.h[1] +; CHECKNOFP16-NEXT: fcvt s0, h0 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fadd s0, s1, s0 +; CHECKNOFP16-NEXT: fcvt h0, s0 +; CHECKNOFP16-NEXT: ret +entry: + 
%shift = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> + %0 = fadd <8 x half> %shift, %a + %1 = extractelement <8 x half> %0, i32 0 + ret half %1 +} diff --git a/llvm/test/CodeGen/AArch64/faddp.ll b/llvm/test/CodeGen/AArch64/faddp.ll new file mode 100644 index 0000000000000..06e976136c375 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/faddp.ll @@ -0,0 +1,102 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple aarch64 < %s | FileCheck %s + +define float @faddp_2xfloat(<2 x float> %a) { +; CHECK-LABEL: faddp_2xfloat: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: ret +entry: + %shift = shufflevector <2 x float> %a, <2 x float> undef, <2 x i32> + %0 = fadd <2 x float> %a, %shift + %1 = extractelement <2 x float> %0, i32 0 + ret float %1 +} + +define float @faddp_4xfloat(<4 x float> %a) { +; CHECK-LABEL: faddp_4xfloat: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: ret +entry: + %shift = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %0 = fadd <4 x float> %a, %shift + %1 = extractelement <4 x float> %0, i32 0 + ret float %1 +} + +define float @faddp_4xfloat_commute(<4 x float> %a) { +; CHECK-LABEL: faddp_4xfloat_commute: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: ret +entry: + %shift = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %0 = fadd <4 x float> %shift, %a + %1 = extractelement <4 x float> %0, i32 0 + ret float %1 +} + +define float @faddp_2xfloat_commute(<2 x float> %a) { +; CHECK-LABEL: faddp_2xfloat_commute: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: ret +entry: + %shift = shufflevector <2 x float> %a, <2 x float> undef, <2 x i32> + %0 = fadd <2 x float> %shift, %a + %1 = extractelement <2 x float> %0, i32 0 + ret float %1 +} + +define double @faddp_2xdouble(<2 x double> %a) { +; CHECK-LABEL: faddp_2xdouble: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: faddp d0, v0.2d +; CHECK-NEXT: ret +entry: + %shift = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> + %0 = fadd <2 x double> %a, %shift + %1 = extractelement <2 x double> %0, i32 0 + ret double %1 +} + +define double @faddp_2xdouble_commute(<2 x double> %a) { +; CHECK-LABEL: faddp_2xdouble_commute: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: faddp d0, v0.2d +; CHECK-NEXT: ret +entry: + %shift = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> + %0 = fadd <2 x double> %shift, %a + %1 = extractelement <2 x double> %0, i32 0 + ret double %1 +} + +define i64 @addp_2xi64(<2 x i64> %a) { +; CHECK-LABEL: addp_2xi64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %shift = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> + %0 = add <2 x i64> %a, %shift + %1 = extractelement <2 x i64> %0, i32 0 + ret i64 %1 +} + +define i64 @addp_2xi64_commute(<2 x i64> %a) { +; CHECK-LABEL: addp_2xi64_commute: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %shift = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> + %0 = add <2 x i64> %shift, %a + %1 = extractelement <2 x i64> %0, i32 0 + ret i64 %1 +} diff --git a/llvm/test/CodeGen/AArch64/fast-isel-sp-adjust.ll b/llvm/test/CodeGen/AArch64/fast-isel-sp-adjust.ll index 22e3ccf2b1209..8d62fb3556661 100644 --- 
a/llvm/test/CodeGen/AArch64/fast-isel-sp-adjust.ll +++ b/llvm/test/CodeGen/AArch64/fast-isel-sp-adjust.ll @@ -15,7 +15,8 @@ ; CHECK-LABEL: foo: ; CHECK: sub ; CHECK-DAG: mov x[[SP:[0-9]+]], sp -; CHECK-DAG: mov w[[OFFSET:[0-9]+]], #4104 +; CHECK-DAG: mov [[TMP:w[0-9]+]], #4104 +; CHECK: mov w[[OFFSET:[0-9]+]], [[TMP]] ; CHECK: strb w0, [x[[SP]], x[[OFFSET]]] define void @foo(i8 %in) { diff --git a/llvm/test/CodeGen/AArch64/fmov-imm-licm.ll b/llvm/test/CodeGen/AArch64/fmov-imm-licm.ll new file mode 100644 index 0000000000000..29061840c96bf --- /dev/null +++ b/llvm/test/CodeGen/AArch64/fmov-imm-licm.ll @@ -0,0 +1,33 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s + +; The purpose of this test is to check that an FMOV instruction that +; only materializes an immediate is not MachineLICM'd out of a loop. +; We check this in two ways: by looking for the FMOV inside the loop, +; and also by checking that we're not spilling any FP callee-saved +; registers. + +%struct.Node = type { %struct.Node*, i8* } + +define void @process_nodes(%struct.Node* %0) { +; CHECK-LABEL: process_nodes: +; CHECK-NOT: stp {{d[0-9]+}} +; CHECK-LABEL: .LBB0_2: +; CHECK: fmov s0, #1.00000000 +; CHECK: bl do_it +entry: + %1 = icmp eq %struct.Node* %0, null + br i1 %1, label %exit, label %loop + +loop: + %2 = phi %struct.Node* [ %4, %loop ], [ %0, %entry ] + tail call void @do_it(float 1.000000e+00, %struct.Node* nonnull %2) + %3 = getelementptr inbounds %struct.Node, %struct.Node* %2, i64 0, i32 0 + %4 = load %struct.Node*, %struct.Node** %3, align 8 + %5 = icmp eq %struct.Node* %4, null + br i1 %5, label %exit, label %loop + +exit: + ret void +} + +declare void @do_it(float, %struct.Node*) diff --git a/llvm/test/CodeGen/AArch64/fp-cond-sel.ll b/llvm/test/CodeGen/AArch64/fp-cond-sel.ll index f74e9c3509429..570088385d0d8 100644 --- a/llvm/test/CodeGen/AArch64/fp-cond-sel.ll +++ b/llvm/test/CodeGen/AArch64/fp-cond-sel.ll @@ -20,8 +20,8 @@ define void @test_csel(i32 %lhs32, i32 %rhs32, i64 %lhs64) { %tst2 = icmp sle i64 %lhs64, %rhs64 %val2 = select i1 %tst2, double 1.0, double 0.0 store double %val2, double* @vardouble -; FLT0 is reused from above on ARM64. 
-; CHECK: fmov d[[FLT1:[0-9]+]], #1.0 +; CHECK-DAG: fmov d[[FLT0:[0-9]+]], xzr +; CHECK-DAG: fmov d[[FLT1:[0-9]+]], #1.0 ; CHECK: fcsel {{d[0-9]+}}, d[[FLT1]], d[[FLT0]], le call void @use_float(float 0.0) diff --git a/llvm/test/CodeGen/AArch64/fp-const-fold.ll b/llvm/test/CodeGen/AArch64/fp-const-fold.ll index b282c8719ff63..dc3f71001d610 100644 --- a/llvm/test/CodeGen/AArch64/fp-const-fold.ll +++ b/llvm/test/CodeGen/AArch64/fp-const-fold.ll @@ -161,49 +161,33 @@ define double @fmul_nnan_inf_op1(double %x) { ret double %r } -; TODO: Should simplify to undef - define double @fdiv_nnan_undef_op0(double %x) { ; CHECK-LABEL: fdiv_nnan_undef_op0: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #9221120237041090560 -; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: ret %r = fdiv nnan double undef, %x ret double %r } -; TODO: Should simplify to undef - define double @fdiv_nnan_undef_op1(double %x) { ; CHECK-LABEL: fdiv_nnan_undef_op1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #9221120237041090560 -; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: ret %r = fdiv nnan double %x, undef ret double %r } -; TODO: Should simplify to undef - define double @fdiv_ninf_undef_op0(double %x) { ; CHECK-LABEL: fdiv_ninf_undef_op0: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #9221120237041090560 -; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: ret %r = fdiv ninf double undef, %x ret double %r } -; TODO: Should simplify to undef - define double @fdiv_ninf_undef_op1(double %x) { ; CHECK-LABEL: fdiv_ninf_undef_op1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #9221120237041090560 -; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: ret %r = fdiv ninf double %x, undef ret double %r diff --git a/llvm/test/CodeGen/AArch64/func-calls.ll b/llvm/test/CodeGen/AArch64/func-calls.ll index 54d38a91c3873..fe48fd308265a 100644 --- a/llvm/test/CodeGen/AArch64/func-calls.ll +++ b/llvm/test/CodeGen/AArch64/func-calls.ll @@ -90,12 +90,10 @@ define void @check_stack_args() { ; memcpy gets created, but the following works for now. ; CHECK-DAG: str {{q[0-9]+}}, [sp] -; CHECK-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0 -; CHECK: mov v0.16b, v[[FINAL_DOUBLE]].16b +; CHECK-DAG: fmov d0, #1.0 ; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp] -; CHECK-NONEON-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0 -; CHECK-NONEON: fmov d0, d[[FINAL_DOUBLE]] +; CHECK-NONEON-DAG: fmov d0, #1.0 ; CHECK: bl struct_on_stack ; CHECK-NOFP-NOT: fmov diff --git a/llvm/test/CodeGen/AArch64/implicit-null-check.ll b/llvm/test/CodeGen/AArch64/implicit-null-check.ll new file mode 100644 index 0000000000000..ab9b8dd348861 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/implicit-null-check.ll @@ -0,0 +1,439 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -O3 -mtriple=aarch64-unknown-unknown -enable-implicit-null-checks | FileCheck %s + +; Basic test for implicit null check conversion - this is analogous to the +; file with the same name in the X86 tree, but adjusted to remove patterns +; related to memory folding of arithmetic (since aarch64 doesn't), and add +; a couple of aarch64 specific tests. 
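; Editorial sketch, not part of this patch: the IR shape that implicit null
; check conversion consumes is an explicit null test whose branch carries
; !make.implicit metadata and guards memory accesses in the not-null
; successor. With -enable-implicit-null-checks the branch can be removed and
; the guarded load becomes a faulting load whose fault handler branches to
; the null block. Names below are illustrative; !0 is the empty metadata
; node defined at the end of this file.
define i32 @sketch_make_implicit(i32* %p) {
entry:
  %c = icmp eq i32* %p, null
  br i1 %c, label %is_null, label %not_null, !make.implicit !0

not_null:                            ; this load becomes the faulting load
  %v = load i32, i32* %p
  ret i32 %v

is_null:                             ; reached through the fault handler
  ret i32 -1
}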
+ +define i32 @imp_null_check_load_fallthrough(i32* %x) { +; CHECK-LABEL: imp_null_check_load_fallthrough: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: ldr w0, [x0] // on-fault: .LBB0_2 +; CHECK-NEXT: b .LBB0_1 +; CHECK-NEXT: .LBB0_1: // %not_null +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_2: // %is_null +; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: ret + entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + not_null: + %t = load i32, i32* %x + ret i32 %t + +is_null: + ret i32 42 +} + + +define i32 @imp_null_check_load_reorder(i32* %x) { +; CHECK-LABEL: imp_null_check_load_reorder: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: ldr w0, [x0] // on-fault: .LBB1_2 +; CHECK-NEXT: b .LBB1_1 +; CHECK-NEXT: .LBB1_1: // %not_null +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_2: // %is_null +; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: ret + entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret i32 42 + + not_null: + %t = load i32, i32* %x + ret i32 %t +} + +define i32 @imp_null_check_unordered_load(i32* %x) { +; CHECK-LABEL: imp_null_check_unordered_load: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: .Ltmp2: +; CHECK-NEXT: ldr w0, [x0] // on-fault: .LBB2_2 +; CHECK-NEXT: b .LBB2_1 +; CHECK-NEXT: .LBB2_1: // %not_null +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB2_2: // %is_null +; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: ret + entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret i32 42 + + not_null: + %t = load atomic i32, i32* %x unordered, align 4 + ret i32 %t +} + + +; TODO: Can be converted into implicit check. +;; Probably could be implicit, but we're conservative for now +define i32 @imp_null_check_seq_cst_load(i32* %x) { +; CHECK-LABEL: imp_null_check_seq_cst_load: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cbz x0, .LBB3_2 +; CHECK-NEXT: // %bb.1: // %not_null +; CHECK-NEXT: ldar w0, [x0] +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB3_2: // %is_null +; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: ret + entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret i32 42 + + not_null: + %t = load atomic i32, i32* %x seq_cst, align 4 + ret i32 %t +} + +;; Might be memory mapped IO, so can't rely on fault behavior +define i32 @imp_null_check_volatile_load(i32* %x) { +; CHECK-LABEL: imp_null_check_volatile_load: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cbz x0, .LBB4_2 +; CHECK-NEXT: // %bb.1: // %not_null +; CHECK-NEXT: ldr w0, [x0] +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB4_2: // %is_null +; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: ret + entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret i32 42 + + not_null: + %t = load volatile i32, i32* %x, align 4 + ret i32 %t +} + + +define i8 @imp_null_check_load_i8(i8* %x) { +; CHECK-LABEL: imp_null_check_load_i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: .Ltmp3: +; CHECK-NEXT: ldrb w0, [x0] // on-fault: .LBB5_2 +; CHECK-NEXT: b .LBB5_1 +; CHECK-NEXT: .LBB5_1: // %not_null +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB5_2: // %is_null +; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: ret + entry: + %c = icmp eq i8* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret i8 42 + + not_null: + %t = load i8, i8* %x + ret i8 %t +} + +define i256 @imp_null_check_load_i256(i256* %x) { +; CHECK-LABEL: imp_null_check_load_i256: +; CHECK: // 
%bb.0: // %entry +; CHECK-NEXT: cbz x0, .LBB6_2 +; CHECK-NEXT: // %bb.1: // %not_null +; CHECK-NEXT: ldp x8, x1, [x0] +; CHECK-NEXT: ldp x2, x3, [x0, #16] +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB6_2: // %is_null +; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: mov x1, xzr +; CHECK-NEXT: mov x2, xzr +; CHECK-NEXT: mov x3, xzr +; CHECK-NEXT: ret + entry: + %c = icmp eq i256* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret i256 42 + + not_null: + %t = load i256, i256* %x + ret i256 %t +} + + + +define i32 @imp_null_check_gep_load(i32* %x) { +; CHECK-LABEL: imp_null_check_gep_load: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: .Ltmp4: +; CHECK-NEXT: ldr w0, [x0, #128] // on-fault: .LBB7_2 +; CHECK-NEXT: b .LBB7_1 +; CHECK-NEXT: .LBB7_1: // %not_null +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB7_2: // %is_null +; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: ret + entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret i32 42 + + not_null: + %x.gep = getelementptr i32, i32* %x, i32 32 + %t = load i32, i32* %x.gep + ret i32 %t +} + +define i32 @imp_null_check_add_result(i32* %x, i32 %p) { +; CHECK-LABEL: imp_null_check_add_result: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: .Ltmp5: +; CHECK-NEXT: ldr w8, [x0] // on-fault: .LBB8_2 +; CHECK-NEXT: b .LBB8_1 +; CHECK-NEXT: .LBB8_1: // %not_null +; CHECK-NEXT: add w0, w8, w1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB8_2: // %is_null +; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: ret + entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret i32 42 + + not_null: + %t = load i32, i32* %x + %p1 = add i32 %t, %p + ret i32 %p1 +} + +; Can hoist over a potential faulting instruction as long as we don't +; change the conditions under which the instruction faults. +define i32 @imp_null_check_hoist_over_udiv(i32* %x, i32 %a, i32 %b) { +; CHECK-LABEL: imp_null_check_hoist_over_udiv: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: .Ltmp6: +; CHECK-NEXT: ldr w8, [x0] // on-fault: .LBB9_2 +; CHECK-NEXT: b .LBB9_1 +; CHECK-NEXT: .LBB9_1: // %not_null +; CHECK-NEXT: udiv w9, w1, w2 +; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB9_2: // %is_null +; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: ret + entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret i32 42 + + not_null: + %p1 = udiv i32 %a, %b + %t = load i32, i32* %x + %res = add i32 %t, %p1 + ret i32 %res +} + + +; TODO: We should be able to hoist this - we can on x86, why isn't this +; working for aarch64? Aliasing? 
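; Editorial sketch, not part of this patch, of the hoisted form the TODO
; above is asking for: to use a faulting load for %x, that load must be
; scheduled first in not_null. Swapping two non-volatile, non-atomic loads
; is sound at the IR level, and the store stays below both, so aliasing
; should not block the transform here. Names are illustrative.
define i32 @sketch_hoisted_over_unrelated_load(i32* %x, i32* %y, i32* %z) {
entry:
  %c = icmp eq i32* %x, null
  br i1 %c, label %is_null, label %not_null, !make.implicit !0

is_null:
  ret i32 42

not_null:
  %t1 = load i32, i32* %x          ; null-checked load moved to block head
  %t0 = load i32, i32* %y
  store i32 %t0, i32* %z
  ret i32 %t1
}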
+define i32 @imp_null_check_hoist_over_unrelated_load(i32* %x, i32* %y, i32* %z) { +; CHECK-LABEL: imp_null_check_hoist_over_unrelated_load: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cbz x0, .LBB10_2 +; CHECK-NEXT: // %bb.1: // %not_null +; CHECK-NEXT: ldr w8, [x1] +; CHECK-NEXT: ldr w0, [x0] +; CHECK-NEXT: str w8, [x2] +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB10_2: // %is_null +; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: ret + entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret i32 42 + + not_null: + %t0 = load i32, i32* %y + %t1 = load i32, i32* %x + store i32 %t0, i32* %z + ret i32 %t1 +} + +define i32 @imp_null_check_gep_load_with_use_dep(i32* %x, i32 %a) { +; CHECK-LABEL: imp_null_check_gep_load_with_use_dep: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: .Ltmp7: +; CHECK-NEXT: ldr w8, [x0] // on-fault: .LBB11_2 +; CHECK-NEXT: b .LBB11_1 +; CHECK-NEXT: .LBB11_1: // %not_null +; CHECK-NEXT: add w9, w0, w1 +; CHECK-NEXT: add w8, w9, w8 +; CHECK-NEXT: add w0, w8, #4 // =4 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB11_2: // %is_null +; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: ret + entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret i32 42 + + not_null: + %x.loc = getelementptr i32, i32* %x, i32 1 + %y = ptrtoint i32* %x.loc to i32 + %b = add i32 %a, %y + %t = load i32, i32* %x + %z = add i32 %t, %b + ret i32 %z +} + +;; TODO: We could handle this case as we can lift the fence into the +;; previous block before the conditional without changing behavior. +define i32 @imp_null_check_load_fence1(i32* %x) { +; CHECK-LABEL: imp_null_check_load_fence1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cbz x0, .LBB12_2 +; CHECK-NEXT: // %bb.1: // %not_null +; CHECK-NEXT: dmb ishld +; CHECK-NEXT: ldr w0, [x0] +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB12_2: // %is_null +; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: ret +entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + +is_null: + ret i32 42 + +not_null: + fence acquire + %t = load i32, i32* %x + ret i32 %t +} + +;; TODO: We could handle this case as we can lift the fence into the +;; previous block before the conditional without changing behavior. 
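; Editorial sketch, not part of this patch, of the fence-hoisted form the
; TODO above describes: with the fence lifted in front of the null test,
; the not_null block again starts with a plain load, which is the pattern
; the faulting-load transform accepts. Names are illustrative.
define i32 @sketch_fence_hoisted(i32* %x) {
entry:
  fence seq_cst
  %c = icmp eq i32* %x, null
  br i1 %c, label %is_null, label %not_null, !make.implicit !0

is_null:
  ret i32 42

not_null:
  %t = load i32, i32* %x
  ret i32 %t
}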
+define i32 @imp_null_check_load_fence2(i32* %x) { +; CHECK-LABEL: imp_null_check_load_fence2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cbz x0, .LBB13_2 +; CHECK-NEXT: // %bb.1: // %not_null +; CHECK-NEXT: dmb ish +; CHECK-NEXT: ldr w0, [x0] +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB13_2: // %is_null +; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: ret +entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + +is_null: + ret i32 42 + +not_null: + fence seq_cst + %t = load i32, i32* %x + ret i32 %t +} + +; TODO: We can fold to implicit null here, not sure why this isn't working +define void @imp_null_check_store(i32* %x) { +; CHECK-LABEL: imp_null_check_store: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cbz x0, .LBB14_2 +; CHECK-NEXT: // %bb.1: // %not_null +; CHECK-NEXT: mov w8, #1 +; CHECK-NEXT: str w8, [x0] +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB14_2: // %is_null +; CHECK-NEXT: ret + entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret void + + not_null: + store i32 1, i32* %x + ret void +} + +;; TODO: can be implicit +define void @imp_null_check_unordered_store(i32* %x) { +; CHECK-LABEL: imp_null_check_unordered_store: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cbz x0, .LBB15_2 +; CHECK-NEXT: // %bb.1: // %not_null +; CHECK-NEXT: mov w8, #1 +; CHECK-NEXT: str w8, [x0] +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB15_2: // %is_null +; CHECK-NEXT: ret + entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret void + + not_null: + store atomic i32 1, i32* %x unordered, align 4 + ret void +} + +define i32 @imp_null_check_neg_gep_load(i32* %x) { +; CHECK-LABEL: imp_null_check_neg_gep_load: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: .Ltmp8: +; CHECK-NEXT: ldur w0, [x0, #-128] // on-fault: .LBB16_2 +; CHECK-NEXT: b .LBB16_1 +; CHECK-NEXT: .LBB16_1: // %not_null +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB16_2: // %is_null +; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: ret + entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret i32 42 + + not_null: + %x.gep = getelementptr i32, i32* %x, i32 -32 + %t = load i32, i32* %x.gep + ret i32 %t +} + +!0 = !{} diff --git a/llvm/test/CodeGen/AArch64/llvm-masked-gather-legal-for-sve.ll b/llvm/test/CodeGen/AArch64/llvm-masked-gather-legal-for-sve.ll new file mode 100644 index 0000000000000..1dffd76a11927 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/llvm-masked-gather-legal-for-sve.ll @@ -0,0 +1,63 @@ +; RUN: opt -mtriple=aarch64-linux-gnu -mattr=+sve -scalarize-masked-mem-intrin -S < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +; Testing that masked gathers operating on scalable vectors that are +; packed in SVE registers are not scalarized. + +; CHECK-LABEL: @masked_gather_nxv4i32( +; CHECK: call @llvm.masked.gather.nxv4i32 +define @masked_gather_nxv4i32( %ld, %masks, %passthru) { + %res = call @llvm.masked.gather.nxv4i32( %ld, i32 0, %masks, %passthru) + ret %res +} + +; Testing that masked gathers operating on scalable vectors of FP data +; that is packed in SVE registers are not scalarized. 
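; For contrast, an editorial sketch (not produced by this patch) of the
; per-lane expansion scalarize-masked-mem-intrin performs when a masked
; gather is not legal for the target, as in the fixed-width NEON tests
; further down: each lane tests its mask bit, conditionally loads through
; its own pointer, and merges with the passthru value. Block and value
; names are illustrative.
define <2 x float> @sketch_scalarized_gather(<2 x float*> %ptrs, <2 x i1> %masks, <2 x float> %passthru) {
entry:
  %m0 = extractelement <2 x i1> %masks, i32 0
  br i1 %m0, label %cond.load, label %else
cond.load:
  %p0 = extractelement <2 x float*> %ptrs, i32 0
  %v0 = load float, float* %p0, align 4
  %ins0 = insertelement <2 x float> %passthru, float %v0, i32 0
  br label %else
else:
  %res0 = phi <2 x float> [ %ins0, %cond.load ], [ %passthru, %entry ]
  %m1 = extractelement <2 x i1> %masks, i32 1
  br i1 %m1, label %cond.load1, label %else1
cond.load1:
  %p1 = extractelement <2 x float*> %ptrs, i32 1
  %v1 = load float, float* %p1, align 4
  %ins1 = insertelement <2 x float> %res0, float %v1, i32 1
  br label %else1
else1:
  %res1 = phi <2 x float> [ %ins1, %cond.load1 ], [ %res0, %else ]
  ret <2 x float> %res1
}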
+ +; CHECK-LABEL: @masked_gather_nxv2f64( +; CHECK: call @llvm.masked.gather.nxv2f64 +define @masked_gather_nxv2f64( %ld, %masks, %passthru) { + %res = call @llvm.masked.gather.nxv2f64( %ld, i32 0, %masks, %passthru) + ret %res +} + +; Testing that masked gathers operating on scalable vectors of FP data +; that is unpacked in SVE registers are not scalarized. + +; CHECK-LABEL: @masked_gather_nxv2f16( +; CHECK: call @llvm.masked.gather.nxv2f16 +define @masked_gather_nxv2f16( %ld, %masks, %passthru) { + %res = call @llvm.masked.gather.nxv2f16( %ld, i32 0, %masks, %passthru) + ret %res +} + +; Testing that masked gathers operating on 64-bit fixed vectors are +; scalarized because NEON doesn't have support for masked gather +; instructions. + +; CHECK-LABEL: @masked_gather_v2f32( +; CHECK-NOT: @llvm.masked.gather.v2f32( +define <2 x float> @masked_gather_v2f32(<2 x float*> %ld, <2 x i1> %masks, <2 x float> %passthru) { + %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %ld, i32 0, <2 x i1> %masks, <2 x float> %passthru) + ret <2 x float> %res +} + +; Testing that masked gathers operating on 128-bit fixed vectors are +; scalarized because NEON doesn't have support for masked gather +; instructions and because we are not targeting fixed width SVE. + +; CHECK-LABEL: @masked_gather_v4i32( +; CHECK-NOT: @llvm.masked.gather.v4i32( +define <4 x i32> @masked_gather_v4i32(<4 x i32*> %ld, <4 x i1> %masks, <4 x i32> %passthru) { + %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru) + ret <4 x i32> %res +} + +declare @llvm.masked.gather.nxv4i32( %ptrs, i32 %align, %masks, %passthru) +declare @llvm.masked.gather.nxv2f64( %ptrs, i32 %align, %masks, %passthru) +declare @llvm.masked.gather.nxv2f16( %ptrs, i32 %align, %masks, %passthru) +declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %ptrs, i32 %align, <2 x i1> %masks, <2 x float> %passthru) +declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 %align, <4 x i1> %masks, <4 x i32> %passthru) diff --git a/llvm/test/CodeGen/AArch64/llvm-masked-scatter-legal-for-sve.ll b/llvm/test/CodeGen/AArch64/llvm-masked-scatter-legal-for-sve.ll new file mode 100644 index 0000000000000..caaa146aa9595 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/llvm-masked-scatter-legal-for-sve.ll @@ -0,0 +1,63 @@ +; RUN: opt -mtriple=aarch64-linux-gnu -mattr=+sve -scalarize-masked-mem-intrin -S < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +; Testing that masked scatters operating on scalable vectors that are +; packed in SVE registers are not scalarized. + +; CHECK-LABEL: @masked_scatter_nxv4i32( +; CHECK: call void @llvm.masked.scatter.nxv4i32 +define void @masked_scatter_nxv4i32( %data, %ptrs, %masks) { + call void @llvm.masked.scatter.nxv4i32( %data, %ptrs, i32 0, %masks) + ret void +} + +; Testing that masked scatters operating on scalable vectors of FP +; data that is packed in SVE registers are not scalarized. + +; CHECK-LABEL: @masked_scatter_nxv2f64( +; CHECK: call void @llvm.masked.scatter.nxv2f64 +define void @masked_scatter_nxv2f64( %data, %ptrs, %masks) { + call void @llvm.masked.scatter.nxv2f64( %data, %ptrs, i32 0, %masks) + ret void +} + +; Testing that masked scatters operating on scalable vectors of FP +; data that is unpacked in SVE registers are not scalarized. 
+ +; CHECK-LABEL: @masked_scatter_nxv2f16( +; CHECK: call void @llvm.masked.scatter.nxv2f16 +define void @masked_scatter_nxv2f16( %data, %ptrs, %masks) { + call void @llvm.masked.scatter.nxv2f16( %data, %ptrs, i32 0, %masks) + ret void +} + +; Testing that masked scatters operating on 64-bit fixed vectors are +; scalarized because NEON doesn't have support for masked scatter +; instructions. + +; CHECK-LABEL: @masked_scatter_v2f32( +; CHECK-NOT: @llvm.masked.scatter.v2f32( +define void @masked_scatter_v2f32(<2 x float> %data, <2 x float*> %ptrs, <2 x i1> %masks) { + call void @llvm.masked.scatter.v2f32(<2 x float> %data, <2 x float*> %ptrs, i32 0, <2 x i1> %masks) + ret void +} + +; Testing that masked scatters operating on 128-bit fixed vectors are +; scalarized because NEON doesn't have support for masked scatter +; instructions and because we are not targeting fixed width SVE. + +; CHECK-LABEL: @masked_scatter_v4i32( +; CHECK-NOT: @llvm.masked.scatter.v4i32( +define void @masked_scatter_v4i32(<4 x i32> %data, <4 x i32*> %ptrs, <4 x i1> %masks) { + call void @llvm.masked.scatter.v4i32(<4 x i32> %data, <4 x i32*> %ptrs, i32 0, <4 x i1> %masks) + ret void +} + +declare void @llvm.masked.scatter.nxv4i32( %data, %ptrs, i32 %align, %masks) +declare void @llvm.masked.scatter.nxv2f64( %data, %ptrs, i32 %align, %masks) +declare void @llvm.masked.scatter.nxv2f16( %data, %ptrs, i32 %align, %masks) +declare void @llvm.masked.scatter.v2f32(<2 x float> %data, <2 x float*> %ptrs, i32 %align, <2 x i1> %masks) +declare void @llvm.masked.scatter.v4i32(<4 x i32> %data, <4 x i32*> %ptrs, i32 %align, <4 x i1> %masks) diff --git a/llvm/test/CodeGen/AArch64/parity.ll b/llvm/test/CodeGen/AArch64/parity.ll new file mode 100644 index 0000000000000..bdddb6f1069ce --- /dev/null +++ b/llvm/test/CodeGen/AArch64/parity.ll @@ -0,0 +1,161 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s + +define i4 @parity_4(i4 %x) { +; CHECK-LABEL: parity_4: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0xf +; CHECK-NEXT: eor w8, w8, w8, lsr #2 +; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %1 = tail call i4 @llvm.ctpop.i4(i4 %x) + %2 = and i4 %1, 1 + ret i4 %2 +} + +define i8 @parity_8(i8 %x) { +; CHECK-LABEL: parity_8: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: eor w8, w8, w8, lsr #4 +; CHECK-NEXT: eor w8, w8, w8, lsr #2 +; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %1 = tail call i8 @llvm.ctpop.i8(i8 %x) + %2 = and i8 %1, 1 + ret i8 %2 +} + +define i16 @parity_16(i16 %x) { +; CHECK-LABEL: parity_16: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0xffff +; CHECK-NEXT: eor w8, w8, w8, lsr #8 +; CHECK-NEXT: eor w8, w8, w8, lsr #4 +; CHECK-NEXT: eor w8, w8, w8, lsr #2 +; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %1 = tail call i16 @llvm.ctpop.i16(i16 %x) + %2 = and i16 %1, 1 + ret i16 %2 +} + +define i17 @parity_17(i17 %x) { +; CHECK-LABEL: parity_17: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0x1ffff +; CHECK-NEXT: eor w8, w8, w8, lsr #16 +; CHECK-NEXT: eor w8, w8, w8, lsr #8 +; CHECK-NEXT: eor w8, w8, w8, lsr #4 +; CHECK-NEXT: eor w8, w8, w8, lsr #2 +; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %1 = tail call i17 @llvm.ctpop.i17(i17 %x) + %2 = and i17 %1, 1 + ret i17 %2 +} + +define i32 @parity_32(i32 %x) { 
+; CHECK-LABEL: parity_32: +; CHECK: // %bb.0: +; CHECK-NEXT: eor w8, w0, w0, lsr #16 +; CHECK-NEXT: eor w8, w8, w8, lsr #8 +; CHECK-NEXT: eor w8, w8, w8, lsr #4 +; CHECK-NEXT: eor w8, w8, w8, lsr #2 +; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %1 = tail call i32 @llvm.ctpop.i32(i32 %x) + %2 = and i32 %1, 1 + ret i32 %2 +} + +define i64 @parity_64(i64 %x) { +; CHECK-LABEL: parity_64: +; CHECK: // %bb.0: +; CHECK-NEXT: eor x8, x0, x0, lsr #32 +; CHECK-NEXT: eor x8, x8, x8, lsr #16 +; CHECK-NEXT: eor x8, x8, x8, lsr #8 +; CHECK-NEXT: eor x8, x8, x8, lsr #4 +; CHECK-NEXT: eor x8, x8, x8, lsr #2 +; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and x0, x8, #0x1 +; CHECK-NEXT: ret + %1 = tail call i64 @llvm.ctpop.i64(i64 %x) + %2 = and i64 %1, 1 + ret i64 %2 +} + +define i32 @parity_64_trunc(i64 %x) { +; CHECK-LABEL: parity_64_trunc: +; CHECK: // %bb.0: +; CHECK-NEXT: eor x8, x0, x0, lsr #32 +; CHECK-NEXT: eor x8, x8, x8, lsr #16 +; CHECK-NEXT: eor x8, x8, x8, lsr #8 +; CHECK-NEXT: eor x8, x8, x8, lsr #4 +; CHECK-NEXT: eor x8, x8, x8, lsr #2 +; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %1 = tail call i64 @llvm.ctpop.i64(i64 %x) + %2 = trunc i64 %1 to i32 + %3 = and i32 %2, 1 + ret i32 %3 +} + +define i8 @parity_32_trunc(i32 %x) { +; CHECK-LABEL: parity_32_trunc: +; CHECK: // %bb.0: +; CHECK-NEXT: eor w8, w0, w0, lsr #16 +; CHECK-NEXT: eor w8, w8, w8, lsr #8 +; CHECK-NEXT: eor w8, w8, w8, lsr #4 +; CHECK-NEXT: eor w8, w8, w8, lsr #2 +; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %1 = tail call i32 @llvm.ctpop.i32(i32 %x) + %2 = trunc i32 %1 to i8 + %3 = and i8 %2, 1 + ret i8 %3 +} + +define i32 @parity_8_zext(i8 %x) { +; CHECK-LABEL: parity_8_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: eor w8, w8, w8, lsr #4 +; CHECK-NEXT: eor w8, w8, w8, lsr #2 +; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %a = zext i8 %x to i32 + %b = tail call i32 @llvm.ctpop.i32(i32 %a) + %c = and i32 %b, 1 + ret i32 %c +} + +define i32 @parity_8_mask(i32 %x) { +; CHECK-LABEL: parity_8_mask: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: eor w8, w8, w8, lsr #4 +; CHECK-NEXT: eor w8, w8, w8, lsr #2 +; CHECK-NEXT: eor w8, w8, w8, lsr #1 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %a = and i32 %x, 255 + %b = tail call i32 @llvm.ctpop.i32(i32 %a) + %c = and i32 %b, 1 + ret i32 %c +} + +declare i4 @llvm.ctpop.i4(i4 %x) +declare i8 @llvm.ctpop.i8(i8 %x) +declare i16 @llvm.ctpop.i16(i16 %x) +declare i17 @llvm.ctpop.i17(i17 %x) +declare i32 @llvm.ctpop.i32(i32 %x) +declare i64 @llvm.ctpop.i64(i64 %x) diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll index 105969717e46b..1e796fff710c0 100644 --- a/llvm/test/CodeGen/AArch64/popcount.ll +++ b/llvm/test/CodeGen/AArch64/popcount.ll @@ -10,11 +10,12 @@ define i8 @popcount128(i128* nocapture nonnull readonly %0) { ; CHECK-NEXT: // implicit-def: $q1 ; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v1.d[1], x8 -; CHECK-NEXT: cnt v0.16b, v1.16b -; CHECK-NEXT: uaddlv h0, v0.16b +; CHECK-NEXT: cnt v1.16b, v1.16b +; CHECK-NEXT: uaddlv h2, v1.16b ; CHECK-NEXT: // implicit-def: $q1 -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: fmov w0, s1 +; CHECK-NEXT: mov v1.16b, v2.16b +; CHECK-NEXT: fmov w1, s1 +; CHECK-NEXT: mov w0, w1 ; CHECK-NEXT: ret Entry: %1 = load i128, i128* %0, align 16 @@ -36,21 +37,21 @@ define i16 
@popcount256(i256* nocapture nonnull readonly %0) { ; CHECK-NEXT: // implicit-def: $q1 ; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v1.d[1], x9 -; CHECK-NEXT: cnt v0.16b, v1.16b -; CHECK-NEXT: uaddlv h0, v0.16b +; CHECK-NEXT: cnt v1.16b, v1.16b +; CHECK-NEXT: uaddlv h2, v1.16b ; CHECK-NEXT: // implicit-def: $q1 -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov v1.16b, v2.16b +; CHECK-NEXT: fmov w10, s1 ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: // implicit-def: $q1 ; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v1.d[1], x8 -; CHECK-NEXT: cnt v0.16b, v1.16b -; CHECK-NEXT: uaddlv h0, v0.16b +; CHECK-NEXT: cnt v1.16b, v1.16b +; CHECK-NEXT: uaddlv h2, v1.16b ; CHECK-NEXT: // implicit-def: $q1 -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: mov v1.16b, v2.16b +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: add w0, w11, w10 ; CHECK-NEXT: ret Entry: %1 = load i256, i256* %0, align 16 @@ -69,11 +70,11 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) { ; CHECK-NEXT: fmov d0, x0 ; CHECK-NEXT: mov v0.d[1], x1 ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlv h0, v0.16b -; CHECK-NEXT: // implicit-def: $q1 -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: fmov w0, s1 -; CHECK-NEXT: // kill: def $x0 killed $w0 +; CHECK-NEXT: uaddlv h1, v0.16b +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmov w2, s0 +; CHECK-NEXT: mov w0, w2 ; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: mov x1, v0.d[1] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/pow.ll b/llvm/test/CodeGen/AArch64/pow.ll index 0f0e2597d25a8..c8e8ab9fc9f7d 100644 --- a/llvm/test/CodeGen/AArch64/pow.ll +++ b/llvm/test/CodeGen/AArch64/pow.ll @@ -69,16 +69,14 @@ define <4 x float> @pow_v4f32_one_fourth_not_enough_fmf(<4 x float> %x) nounwind ; CHECK-LABEL: pow_v4f32_one_fourth_not_enough_fmf: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #48 // =48 -; CHECK-NEXT: str d8, [sp, #32] // 8-byte Folded Spill -; CHECK-NEXT: fmov s8, #0.25000000 ; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov s0, v0.s[1] -; CHECK-NEXT: mov v1.16b, v8.16b -; CHECK-NEXT: str x30, [sp, #40] // 8-byte Folded Spill +; CHECK-NEXT: fmov s1, #0.25000000 +; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill ; CHECK-NEXT: bl powf ; CHECK-NEXT: str d0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: fmov s1, #0.25000000 ; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov v1.16b, v8.16b ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload @@ -86,7 +84,7 @@ define <4 x float> @pow_v4f32_one_fourth_not_enough_fmf(<4 x float> %x) nounwind ; CHECK-NEXT: mov v0.s[1], v1.s[0] ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov v1.16b, v8.16b +; CHECK-NEXT: fmov s1, #0.25000000 ; CHECK-NEXT: mov s0, v0.s[2] ; CHECK-NEXT: bl powf ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload @@ -94,12 +92,11 @@ define <4 x float> @pow_v4f32_one_fourth_not_enough_fmf(<4 x float> %x) nounwind ; CHECK-NEXT: mov v1.s[2], v0.s[0] ; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: mov v1.16b, v8.16b +; CHECK-NEXT: fmov s1, #0.25000000 ; CHECK-NEXT: mov s0, v0.s[3] ; CHECK-NEXT: bl powf ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload -; 
CHECK-NEXT: ldr d8, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: mov v1.s[3], v0.s[0] ; CHECK-NEXT: mov v0.16b, v1.16b @@ -113,21 +110,18 @@ define <2 x double> @pow_v2f64_one_fourth_not_enough_fmf(<2 x double> %x) nounwi ; CHECK-LABEL: pow_v2f64_one_fourth_not_enough_fmf: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #48 // =48 -; CHECK-NEXT: str d8, [sp, #32] // 8-byte Folded Spill -; CHECK-NEXT: fmov d8, #0.25000000 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: mov d0, v0.d[1] -; CHECK-NEXT: mov v1.16b, v8.16b -; CHECK-NEXT: str x30, [sp, #40] // 8-byte Folded Spill +; CHECK-NEXT: fmov d1, #0.25000000 +; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill ; CHECK-NEXT: bl pow ; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: fmov d1, #0.25000000 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: mov v1.16b, v8.16b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: bl pow ; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload -; CHECK-NEXT: ldr d8, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: add sp, sp, #48 // =48 diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll index 893ed6445462f..1ae1ee43beeef 100644 --- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -159,8 +159,8 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind { ; CHECK-NEXT: ushr v0.2s, v0.2s, #24 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strb w8, [x2, #1] ; CHECK-NEXT: strb w9, [x2] +; CHECK-NEXT: strb w8, [x2, #1] ; CHECK-NEXT: ret %x = load <2 x i8>, <2 x i8>* %px %y = load <2 x i8>, <2 x i8>* %py @@ -201,8 +201,8 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind { ; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strh w8, [x2, #2] ; CHECK-NEXT: strh w9, [x2] +; CHECK-NEXT: strh w8, [x2, #2] ; CHECK-NEXT: ret %x = load <2 x i16>, <2 x i16>* %px %y = load <2 x i16>, <2 x i16>* %py diff --git a/llvm/test/CodeGen/AArch64/shift_minsize.ll b/llvm/test/CodeGen/AArch64/shift_minsize.ll index ac48975f18f8d..8205e7debcd69 100644 --- a/llvm/test/CodeGen/AArch64/shift_minsize.ll +++ b/llvm/test/CodeGen/AArch64/shift_minsize.ll @@ -59,7 +59,7 @@ define dso_local { i64, i64 } @shl128(i64 %x.coerce0, i64 %x.coerce1, i8 signext ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-NEXT: mov w2, w2 ; CHECK-NEXT: bl __ashlti3 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -86,7 +86,7 @@ define dso_local { i64, i64 } @ashr128(i64 %x.coerce0, i64 %x.coerce1, i8 signex ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-NEXT: mov w2, w2 ; CHECK-NEXT: bl __ashrti3 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -112,7 +112,7 @@ define dso_local { i64, i64 } @lshr128(i64 %x.coerce0, i64 %x.coerce1, i8 signex ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-NEXT: mov w2, w2 ; CHECK-NEXT: bl __lshrti3 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll index 2cf6e896bed0a..c5a55f23913ae 100644 --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -160,8 +160,8 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind { ; CHECK-NEXT: ushr v0.2s, v0.2s, #24 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strb w8, [x2, #1] ; CHECK-NEXT: strb w9, [x2] +; CHECK-NEXT: strb w8, [x2, #1] ; CHECK-NEXT: ret %x = load <2 x i8>, <2 x i8>* %px %y = load <2 x i8>, <2 x i8>* %py @@ -202,8 +202,8 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind { ; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strh w8, [x2, #2] ; CHECK-NEXT: strh w9, [x2] +; CHECK-NEXT: strh w8, [x2, #2] ; CHECK-NEXT: ret %x = load <2 x i16>, <2 x i16>* %px %y = load <2 x i16>, <2 x i16>* %py diff --git a/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll b/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll new file mode 100644 index 0000000000000..9819f64a9546a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll @@ -0,0 +1,218 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; A collection of basic functionality tests for statepoint lowering - most +; interesting cornercases are exercised through the x86 tests. + +target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +%struct = type { i64, i64 } + +declare zeroext i1 @return_i1() +declare zeroext i32 @return_i32() +declare i32* @return_i32ptr() +declare float @return_float() +declare %struct @return_struct() +declare void @varargf(i32, ...) + +define i1 @test_i1_return() gc "statepoint-example" { +; CHECK-LABEL: test_i1_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: bl return_i1 +; CHECK-NEXT: .Ltmp2: +; CHECK-NEXT: and w0, w0, #0x1 +; CHECK-NEXT: ret +; This is just checking that a i1 gets lowered normally when there's no extra +; state arguments to the statepoint +entry: + %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0) + %call1 = call zeroext i1 @llvm.experimental.gc.result.i1(token %safepoint_token) + ret i1 %call1 +} + +define i32 @test_i32_return() gc "statepoint-example" { +; CHECK-LABEL: test_i32_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: bl return_i32 +; CHECK-NEXT: .Ltmp3: +; CHECK-NEXT: ret +entry: + %safepoint_token = tail call token (i64, i32, i32 ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_i32f(i64 0, i32 0, i32 ()* @return_i32, i32 0, i32 0, i32 0, i32 0) + %call1 = call zeroext i32 @llvm.experimental.gc.result.i32(token %safepoint_token) + ret i32 %call1 +} + +define i32* @test_i32ptr_return() gc "statepoint-example" { +; CHECK-LABEL: test_i32ptr_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: bl return_i32ptr +; CHECK-NEXT: .Ltmp4: +; CHECK-NEXT: ret +entry: + %safepoint_token = tail call token (i64, i32, i32* ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p0i32f(i64 0, i32 0, i32* ()* @return_i32ptr, i32 0, i32 0, i32 0, i32 0) + %call1 = call i32* @llvm.experimental.gc.result.p0i32(token %safepoint_token) + ret i32* %call1 +} + +define float @test_float_return() gc "statepoint-example" { +; CHECK-LABEL: test_float_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: bl return_float +; CHECK-NEXT: .Ltmp5: +; CHECK-NEXT: ret +entry: + %safepoint_token = tail call token (i64, i32, float ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_f32f(i64 0, i32 0, float ()* @return_float, i32 0, i32 0, i32 0, i32 0) + %call1 = call float @llvm.experimental.gc.result.f32(token %safepoint_token) + ret float %call1 +} + +define %struct @test_struct_return() gc "statepoint-example" { +; CHECK-LABEL: test_struct_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: bl return_struct +; CHECK-NEXT: .Ltmp6: +; CHECK-NEXT: ret +entry: + %safepoint_token = tail call token (i64, i32, %struct ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_structf(i64 0, i32 0, %struct ()* @return_struct, i32 0, i32 0, i32 0, i32 0) + %call1 = call %struct @llvm.experimental.gc.result.struct(token %safepoint_token) + ret %struct %call1 +} + +define i1 @test_relocate(i32 addrspace(1)* %a) gc "statepoint-example" { +; CHECK-LABEL: test_relocate: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 // =16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: str x0, [sp, #8] +; CHECK-NEXT: bl return_i1 +; CHECK-NEXT: .Ltmp7: +; CHECK-NEXT: and w0, w0, #0x1 +; CHECK-NEXT: add sp, sp, #16 // =16 +; CHECK-NEXT: ret +; Check that an unused relocate has no code-generation impact +entry: + %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* %a)] + %call1 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 0) + %call2 = call zeroext i1 @llvm.experimental.gc.result.i1(token %safepoint_token) + ret i1 %call2 +} + +define void @test_void_vararg() gc "statepoint-example" { +; CHECK-LABEL: test_void_vararg: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: mov w1, #43 +; CHECK-NEXT: bl varargf +; CHECK-NEXT: .Ltmp8: +; CHECK-NEXT: ret +; Check that a statepoint wrapping a *void*-returning vararg function works +entry: + %safepoint_token = tail call token (i64, i32, void (i32, ...)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidi32varargf(i64 0, i32 0, void (i32, ...)* @varargf, i32 2, i32 0, i32 42, i32 43, i32 0, i32 0) + ;; if we try to use the result from a statepoint wrapping a + ;; non-void-returning varargf, we will experience a crash.
+ ret void +} + +define i1 @test_i1_return_patchable() gc "statepoint-example" { +; CHECK-LABEL: test_i1_return_patchable: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: nop +; CHECK-NEXT: .Ltmp9: +; CHECK-NEXT: and w0, w0, #0x1 +; CHECK-NEXT: ret +; A patchable variant of test_i1_return +entry: + %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 4, i1 ()*null, i32 0, i32 0, i32 0, i32 0) + %call1 = call zeroext i1 @llvm.experimental.gc.result.i1(token %safepoint_token) + ret i1 %call1 +} + +declare void @consume(i32 addrspace(1)* %obj) + +define i1 @test_cross_bb(i32 addrspace(1)* %a, i1 %external_cond) gc "statepoint-example" { +; CHECK-LABEL: test_cross_bb: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: mov w20, w1 +; CHECK-NEXT: str x0, [sp, #8] +; CHECK-NEXT: bl return_i1 +; CHECK-NEXT: .Ltmp10: +; CHECK-NEXT: tbz w20, #0, .LBB8_2 +; CHECK-NEXT: // %bb.1: // %left +; CHECK-NEXT: mov w19, w0 +; CHECK-NEXT: ldr x0, [sp, #8] +; CHECK-NEXT: bl consume +; CHECK-NEXT: and w0, w19, #0x1 +; CHECK-NEXT: b .LBB8_3 +; CHECK-NEXT: .LBB8_2: // %right +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: .LBB8_3: // %right +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* %a)] + br i1 %external_cond, label %left, label %right + +left: + %call1 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 0) + %call2 = call zeroext i1 @llvm.experimental.gc.result.i1(token %safepoint_token) + call void @consume(i32 addrspace(1)* %call1) + ret i1 %call2 + +right: + ret i1 true +} + +%struct2 = type { i64, i64, i64 } + +declare void @consume_attributes(i32, i8* nest, i32, %struct2* byval) + +define void @test_attributes(%struct2* byval %s) gc "statepoint-example" { +; CHECK-LABEL: test_attributes: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #32 // =32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: ldr x8, [sp, #48] +; CHECK-NEXT: ldr q0, [sp, #32] +; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: mov w1, #17 +; CHECK-NEXT: mov x18, xzr +; CHECK-NEXT: str x8, [sp, #16] +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: bl consume_attributes +; CHECK-NEXT: .Ltmp11: +; CHECK-NEXT: add sp, sp, #32 // =32 +; CHECK-NEXT: ret +entry: +; Check that arguments with attributes are lowered correctly. +; We call a function that has a nest argument and a byval argument. + %statepoint_token = call token (i64, i32, void (i32, i8*, i32, %struct2*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidi32p0i8i32p0s_struct2sf(i64 0, i32 0, void (i32, i8*, i32, %struct2*)* @consume_attributes, i32 4, i32 0, i32 42, i8* nest null, i32 17, %struct2* byval %s, i32 0, i32 0) + ret void +} + +declare token @llvm.experimental.gc.statepoint.p0f_i1f(i64, i32, i1 ()*, i32, i32, ...) +declare i1 @llvm.experimental.gc.result.i1(token) + +declare token @llvm.experimental.gc.statepoint.p0f_i32f(i64, i32, i32 ()*, i32, i32, ...) 
+declare i32 @llvm.experimental.gc.result.i32(token) + +declare token @llvm.experimental.gc.statepoint.p0f_p0i32f(i64, i32, i32* ()*, i32, i32, ...) +declare i32* @llvm.experimental.gc.result.p0i32(token) + +declare token @llvm.experimental.gc.statepoint.p0f_f32f(i64, i32, float ()*, i32, i32, ...) +declare float @llvm.experimental.gc.result.f32(token) + +declare token @llvm.experimental.gc.statepoint.p0f_structf(i64, i32, %struct ()*, i32, i32, ...) +declare %struct @llvm.experimental.gc.result.struct(token) + +declare token @llvm.experimental.gc.statepoint.p0f_isVoidi32varargf(i64, i32, void (i32, ...)*, i32, i32, ...) + +declare token @llvm.experimental.gc.statepoint.p0f_isVoidi32p0i8i32p0s_struct2sf(i64, i32, void (i32, i8*, i32, %struct2*)*, i32, i32, ...) + +declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32) diff --git a/llvm/test/CodeGen/AArch64/sve-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-fcvt.ll new file mode 100644 index 0000000000000..28eaab21a9fe2 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-fcvt.ll @@ -0,0 +1,296 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +; +; FP_TO_SINT +; + +define @fcvtzs_h_nxv2f16( %a) { +; CHECK-LABEL: fcvtzs_h_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h +; CHECK-NEXT: ret + %res = fptosi %a to + ret %res +} + +define @fcvtzs_h_nxv2f32( %a) { +; CHECK-LABEL: fcvtzs_h_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s +; CHECK-NEXT: ret + %res = fptosi %a to + ret %res +} + +define @fcvtzs_h_nxv2f64( %a) { +; CHECK-LABEL: fcvtzs_h_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: ret + %res = fptosi %a to + ret %res +} + +define @fcvtzs_h_nxv4f16( %a) { +; CHECK-LABEL: fcvtzs_h_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h +; CHECK-NEXT: ret + %res = fptosi %a to + ret %res +} + +define @fcvtzs_h_nxv4f32( %a) { +; CHECK-LABEL: fcvtzs_h_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %res = fptosi %a to + ret %res +} + +define @fcvtzs_h_nxv8f16( %a) { +; CHECK-LABEL: fcvtzs_h_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h +; CHECK-NEXT: ret + %res = fptosi %a to + ret %res +} + +define @fcvtzs_s_nxv2f16( %a) { +; CHECK-LABEL: fcvtzs_s_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h +; CHECK-NEXT: ret + %res = fptosi %a to + ret %res +} + +define @fcvtzs_s_nxv2f32( %a) { +; CHECK-LABEL: fcvtzs_s_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s +; CHECK-NEXT: ret + %res = fptosi %a to + ret %res +} + +define @fcvtzs_s_nxv2f64( %a) { +; CHECK-LABEL: fcvtzs_s_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: ret + %res = fptosi %a to + ret %res +} + +define @fcvtzs_s_nxv4f16( %a) { +; CHECK-LABEL: fcvtzs_s_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h +; CHECK-NEXT: ret + %res = fptosi %a to + ret %res +} + +define 
@fcvtzs_s_nxv4f32( %a) { +; CHECK-LABEL: fcvtzs_s_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %res = fptosi %a to + ret %res +} + +define @fcvtzs_d_nxv2f16( %a) { +; CHECK-LABEL: fcvtzs_d_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h +; CHECK-NEXT: ret + %res = fptosi %a to + ret %res +} + +define @fcvtzs_d_nxv2f32( %a) { +; CHECK-LABEL: fcvtzs_d_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s +; CHECK-NEXT: ret + %res = fptosi %a to + ret %res +} + +define @fcvtzs_d_nxv2f64( %a) { +; CHECK-LABEL: fcvtzs_d_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: ret + %res = fptosi %a to + ret %res +} + +; +; FP_TO_UINT +; + +; NOTE: Using fcvtzs is safe as fptoui overflow is considered poison and a +; 64bit signed value encompasses the entire range of a 16bit unsigned value +define @fcvtzu_h_nxv2f16( %a) { +; CHECK-LABEL: fcvtzu_h_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h +; CHECK-NEXT: ret + %res = fptoui %a to + ret %res +} + +define @fcvtzu_h_nxv2f32( %a) { +; CHECK-LABEL: fcvtzu_h_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s +; CHECK-NEXT: ret + %res = fptoui %a to + ret %res +} + +define @fcvtzu_h_nxv2f64( %a) { +; CHECK-LABEL: fcvtzu_h_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: ret + %res = fptoui %a to + ret %res +} + +define @fcvtzu_h_nxv4f16( %a) { +; CHECK-LABEL: fcvtzu_h_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h +; CHECK-NEXT: ret + %res = fptoui %a to + ret %res +} + +define @fcvtzu_h_nxv4f32( %a) { +; CHECK-LABEL: fcvtzu_h_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %res = fptosi %a to + ret %res +} + +define @fcvtzu_h_nxv8f16( %a) { +; CHECK-LABEL: fcvtzu_h_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h +; CHECK-NEXT: ret + %res = fptoui %a to + ret %res +} + +define @fcvtzu_s_nxv2f16( %a) { +; CHECK-LABEL: fcvtzu_s_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h +; CHECK-NEXT: ret + %res = fptoui %a to + ret %res +} + +define @fcvtzu_s_nxv2f32( %a) { +; CHECK-LABEL: fcvtzu_s_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s +; CHECK-NEXT: ret + %res = fptoui %a to + ret %res +} + +define @fcvtzu_s_nxv2f64( %a) { +; CHECK-LABEL: fcvtzu_s_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: ret + %res = fptoui %a to + ret %res +} + +define @fcvtzu_s_nxv4f16( %a) { +; CHECK-LABEL: fcvtzu_s_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h +; CHECK-NEXT: ret + %res = fptoui %a to + ret %res +} + +define @fcvtzu_s_nxv4f32( %a) { +; CHECK-LABEL: fcvtzu_s_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %res = fptoui %a to + ret %res +} + +define @fcvtzu_d_nxv2f16( %a) { +; CHECK-LABEL: fcvtzu_d_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h +; CHECK-NEXT: ret + %res = fptoui %a to + ret %res +} + +define @fcvtzu_d_nxv2f32( %a) { +; 
CHECK-LABEL: fcvtzu_d_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s +; CHECK-NEXT: ret + %res = fptoui %a to + ret %res +} + +define @fcvtzu_d_nxv2f64( %a) { +; CHECK-LABEL: fcvtzu_d_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d +; CHECK-NEXT: ret + %res = fptoui %a to + ret %res +} diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll new file mode 100644 index 0000000000000..1570ea2db7718 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll @@ -0,0 +1,317 @@ +; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE +; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK +; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048 + +target triple = "aarch64-unknown-linux-gnu" + +; Don't use SVE when its registers are no bigger than NEON. +; NO_SVE-NOT: ptrue + +; Don't use SVE for 64-bit vectors. +define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask) #0 { +; CHECK-LABEL: select_v4f16: +; CHECK: bif v0.8b, v1.8b, v2.8b +; CHECK: ret + %sel = select <4 x i1> %mask, <4 x half> %op1, <4 x half> %op2 + ret <4 x half> %sel +} + +; Don't use SVE for 128-bit vectors. 
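; Editorial note, not part of this patch: a worked example of the vl
; arithmetic used by the CHECK lines in this file. FileCheck is invoked
; with -D#VBYTES set to the SVE register width in bytes, so for the
; <16 x half> test below the directive
;   ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
; expands at -aarch64-sve-vector-bits-min=256 (VBYTES=32) to
;   ptrue p0.h, vl16        // min(32/2, 16) = 16 halfword lanes
; and at VBYTES=64 (512-bit registers) also to vl16, because the element
; count rather than the register width is the limit there.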
+define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask) #0 { +; CHECK-LABEL: select_v8f16: +; CHECK: bif v0.16b, v1.16b, v2.16b +; CHECK: ret + %sel = select <8 x i1> %mask, <8 x half> %op1, <8 x half> %op2 + ret <8 x half> %sel +} + +define void @select_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i1>* %c) #0 { +; CHECK-LABEL: select_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]] +; CHECK: ptrue [[PG1:p[0-9]+]].h +; VBITS_GE_256: ld1h { [[MASK:z[0-9]+]].h }, [[PG]]/z, [x9] +; VBITS_GE_256-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_256-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_256-NEXT: and [[AND:z[0-9]+]].h, [[MASK]].h, #0x1 +; VBITS_GE_256-NEXT: cmpne [[COND:p[0-9]+]].h, [[PG1]]/z, [[AND]].h, #0 +; VBITS_GE_256-NEXT: sel [[RES:z[0-9]+]].h, [[COND]], [[OP1]].h, [[OP2]].h +; VBITS_GE_256-NEXT: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_256: ret + %mask = load <16 x i1>, <16 x i1>* %c + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %sel = select <16 x i1> %mask, <16 x half> %op1, <16 x half> %op2 + store <16 x half> %sel, <16 x half>* %a + ret void +} + +define void @select_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x i1>* %c) #0 { +; CHECK-LABEL: select_v32f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]] +; CHECK: ptrue [[PG1:p[0-9]+]].h +; VBITS_GE_512: ld1h { [[MASK:z[0-9]+]].h }, [[PG]]/z, [x9] +; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_512-NEXT: and [[AND:z[0-9]+]].h, [[MASK]].h, #0x1 +; VBITS_GE_512-NEXT: cmpne [[COND:p[0-9]+]].h, [[PG1]]/z, [[AND]].h, #0 +; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].h, [[COND]], [[OP1]].h, [[OP2]].h +; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_512: ret + %mask = load <32 x i1>, <32 x i1>* %c + %op1 = load <32 x half>, <32 x half>* %a + %op2 = load <32 x half>, <32 x half>* %b + %sel = select <32 x i1> %mask, <32 x half> %op1, <32 x half> %op2 + store <32 x half> %sel, <32 x half>* %a + ret void +} + +define void @select_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i1>* %c) #0 { +; CHECK-LABEL: select_v64f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]] +; CHECK: ptrue [[PG1:p[0-9]+]].h +; VBITS_GE_1024: ld1h { [[MASK:z[0-9]+]].h }, [[PG]]/z, [x9] +; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_1024-NEXT: and [[AND:z[0-9]+]].h, [[MASK]].h, #0x1 +; VBITS_GE_1024-NEXT: cmpne [[COND:p[0-9]+]].h, [[PG1]]/z, [[AND]].h, #0 +; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].h, [[COND]], [[OP1]].h, [[OP2]].h +; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_1024: ret + %mask = load <64 x i1>, <64 x i1>* %c + %op1 = load <64 x half>, <64 x half>* %a + %op2 = load <64 x half>, <64 x half>* %b + %sel = select <64 x i1> %mask, <64 x half> %op1, <64 x half> %op2 + store <64 x half> %sel, <64 x half>* %a + ret void +} + +define void @select_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i1>* %c) #0 { +; CHECK-LABEL: select_v128f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]] +; CHECK: ptrue [[PG1:p[0-9]+]].h +; VBITS_GE_2048: ld1h { [[MASK:z[0-9]+]].h }, [[PG]]/z, [x9] +; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_2048-NEXT: and [[AND:z[0-9]+]].h, 
[[MASK]].h, #0x1 +; VBITS_GE_2048-NEXT: cmpne [[COND:p[0-9]+]].h, [[PG1]]/z, [[AND]].h, #0 +; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].h, [[COND]], [[OP1]].h, [[OP2]].h +; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_2048: ret + %mask = load <128 x i1>, <128 x i1>* %c + %op1 = load <128 x half>, <128 x half>* %a + %op2 = load <128 x half>, <128 x half>* %b + %sel = select <128 x i1> %mask, <128 x half> %op1, <128 x half> %op2 + store <128 x half> %sel, <128 x half>* %a + ret void +} + +; Don't use SVE for 64-bit vectors. +define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %mask) #0 { +; CHECK-LABEL: select_v2f32: +; CHECK: bif v0.8b, v1.8b, v2.8b +; CHECK: ret + %sel = select <2 x i1> %mask, <2 x float> %op1, <2 x float> %op2 + ret <2 x float> %sel +} + +; Don't use SVE for 128-bit vectors. +define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %mask) #0 { +; CHECK-LABEL: select_v4f32: +; CHECK: bif v0.16b, v1.16b, v2.16b +; CHECK: ret + %sel = select <4 x i1> %mask, <4 x float> %op1, <4 x float> %op2 + ret <4 x float> %sel +} + +define void @select_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x i1>* %c) #0 { +; CHECK-LABEL: select_v8f32: +; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]] +; CHECK: ptrue [[PG1:p[0-9]+]].s +; VBITS_GE_256: ld1w { [[MASK:z[0-9]+]].s }, [[PG]]/z, [x9] +; VBITS_GE_256-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_256-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_256-NEXT: and [[AND:z[0-9]+]].s, [[MASK]].s, #0x1 +; VBITS_GE_256-NEXT: cmpne [[COND:p[0-9]+]].s, [[PG1]]/z, [[AND]].s, #0 +; VBITS_GE_256-NEXT: sel [[RES:z[0-9]+]].s, [[COND]], [[OP1]].s, [[OP2]].s +; VBITS_GE_256-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_256: ret + %mask = load <8 x i1>, <8 x i1>* %c + %op1 = load <8 x float>, <8 x float>* %a + %op2 = load <8 x float>, <8 x float>* %b + %sel = select <8 x i1> %mask, <8 x float> %op1, <8 x float> %op2 + store <8 x float> %sel, <8 x float>* %a + ret void +} + +define void @select_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x i1>* %c) #0 { +; CHECK-LABEL: select_v16f32: +; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]] +; CHECK: ptrue [[PG1:p[0-9]+]].s +; VBITS_GE_512: ld1w { [[MASK:z[0-9]+]].s }, [[PG]]/z, [x9] +; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_512-NEXT: and [[AND:z[0-9]+]].s, [[MASK]].s, #0x1 +; VBITS_GE_512-NEXT: cmpne [[COND:p[0-9]+]].s, [[PG1]]/z, [[AND]].s, #0 +; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].s, [[COND]], [[OP1]].s, [[OP2]].s +; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_512: ret + %mask = load <16 x i1>, <16 x i1>* %c + %op1 = load <16 x float>, <16 x float>* %a + %op2 = load <16 x float>, <16 x float>* %b + %sel = select <16 x i1> %mask, <16 x float> %op1, <16 x float> %op2 + store <16 x float> %sel, <16 x float>* %a + ret void +} + +define void @select_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i1>* %c) #0 { +; CHECK-LABEL: select_v32f32: +; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]] +; CHECK: ptrue [[PG1:p[0-9]+]].s +; VBITS_GE_1024: ld1w { [[MASK:z[0-9]+]].s }, [[PG]]/z, [x9] +; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_1024-NEXT: and [[AND:z[0-9]+]].s, [[MASK]].s, #0x1 +; VBITS_GE_1024-NEXT: cmpne [[COND:p[0-9]+]].s, [[PG1]]/z, [[AND]].s, #0 +; 
VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].s, [[COND]], [[OP1]].s, [[OP2]].s +; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_1024: ret + %mask = load <32 x i1>, <32 x i1>* %c + %op1 = load <32 x float>, <32 x float>* %a + %op2 = load <32 x float>, <32 x float>* %b + %sel = select <32 x i1> %mask, <32 x float> %op1, <32 x float> %op2 + store <32 x float> %sel, <32 x float>* %a + ret void +} + +define void @select_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i1>* %c) #0 { +; CHECK-LABEL: select_v64f32: +; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]] +; CHECK: ptrue [[PG1:p[0-9]+]].s +; VBITS_GE_2048: ld1w { [[MASK:z[0-9]+]].s }, [[PG]]/z, [x9] +; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_2048-NEXT: and [[AND:z[0-9]+]].s, [[MASK]].s, #0x1 +; VBITS_GE_2048-NEXT: cmpne [[COND:p[0-9]+]].s, [[PG1]]/z, [[AND]].s, #0 +; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].s, [[COND]], [[OP1]].s, [[OP2]].s +; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_2048: ret + %mask = load <64 x i1>, <64 x i1>* %c + %op1 = load <64 x float>, <64 x float>* %a + %op2 = load <64 x float>, <64 x float>* %b + %sel = select <64 x i1> %mask, <64 x float> %op1, <64 x float> %op2 + store <64 x float> %sel, <64 x float>* %a + ret void +} + +; Don't use SVE for 64-bit vectors. +define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1> %mask) #0 { +; CHECK-LABEL: select_v1f64: +; CHECK: bif v0.8b, v1.8b, v2.8b +; CHECK: ret + %sel = select <1 x i1> %mask, <1 x double> %op1, <1 x double> %op2 + ret <1 x double> %sel +} + +; Don't use SVE for 128-bit vectors. +define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> %mask) #0 { +; CHECK-LABEL: select_v2f64: +; CHECK: bif v0.16b, v1.16b, v2.16b +; CHECK: ret + %sel = select <2 x i1> %mask, <2 x double> %op1, <2 x double> %op2 + ret <2 x double> %sel +} + +define void @select_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x i1>* %c) #0 { +; CHECK-LABEL: select_v4f64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]] +; CHECK: ptrue [[PG1:p[0-9]+]].d +; VBITS_GE_256: ld1d { [[MASK:z[0-9]+]].d }, [[PG]]/z, [x9] +; VBITS_GE_256-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_256-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_256-NEXT: and [[AND:z[0-9]+]].d, [[MASK]].d, #0x1 +; VBITS_GE_256-NEXT: cmpne [[COND:p[0-9]+]].d, [[PG1]]/z, [[AND]].d, #0 +; VBITS_GE_256-NEXT: sel [[RES:z[0-9]+]].d, [[COND]], [[OP1]].d, [[OP2]].d +; VBITS_GE_256-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_256: ret + %mask = load <4 x i1>, <4 x i1>* %c + %op1 = load <4 x double>, <4 x double>* %a + %op2 = load <4 x double>, <4 x double>* %b + %sel = select <4 x i1> %mask, <4 x double> %op1, <4 x double> %op2 + store <4 x double> %sel, <4 x double>* %a + ret void +} + +define void @select_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x i1>* %c) #0 { +; CHECK-LABEL: select_v8f64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]] +; CHECK: ptrue [[PG1:p[0-9]+]].d +; VBITS_GE_512: ld1d { [[MASK:z[0-9]+]].d }, [[PG]]/z, [x9] +; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_512-NEXT: and [[AND:z[0-9]+]].d, [[MASK]].d, #0x1 +; VBITS_GE_512-NEXT: cmpne [[COND:p[0-9]+]].d, [[PG1]]/z, [[AND]].d, #0 +; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].d, [[COND]], [[OP1]].d, [[OP2]].d +; 
VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_512: ret + %mask = load <8 x i1>, <8 x i1>* %c + %op1 = load <8 x double>, <8 x double>* %a + %op2 = load <8 x double>, <8 x double>* %b + %sel = select <8 x i1> %mask, <8 x double> %op1, <8 x double> %op2 + store <8 x double> %sel, <8 x double>* %a + ret void +} + +define void @select_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i1>* %c) #0 { +; CHECK-LABEL: select_v16f64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]] +; CHECK: ptrue [[PG1:p[0-9]+]].d +; VBITS_GE_1024: ld1d { [[MASK:z[0-9]+]].d }, [[PG]]/z, [x9] +; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_1024-NEXT: and [[AND:z[0-9]+]].d, [[MASK]].d, #0x1 +; VBITS_GE_1024-NEXT: cmpne [[COND:p[0-9]+]].d, [[PG1]]/z, [[AND]].d, #0 +; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].d, [[COND]], [[OP1]].d, [[OP2]].d +; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_1024: ret + %mask = load <16 x i1>, <16 x i1>* %c + %op1 = load <16 x double>, <16 x double>* %a + %op2 = load <16 x double>, <16 x double>* %b + %sel = select <16 x i1> %mask, <16 x double> %op1, <16 x double> %op2 + store <16 x double> %sel, <16 x double>* %a + ret void +} + +define void @select_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i1>* %c) #0 { +; CHECK-LABEL: select_v32f64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]] +; CHECK: ptrue [[PG1:p[0-9]+]].d +; VBITS_GE_2048: ld1d { [[MASK:z[0-9]+]].d }, [[PG]]/z, [x9] +; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_2048-NEXT: and [[AND:z[0-9]+]].d, [[MASK]].d, #0x1 +; VBITS_GE_2048-NEXT: cmpne [[COND:p[0-9]+]].d, [[PG1]]/z, [[AND]].d, #0 +; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].d, [[COND]], [[OP1]].d, [[OP2]].d +; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_2048: ret + %mask = load <32 x i1>, <32 x i1>* %c + %op1 = load <32 x double>, <32 x double>* %a + %op2 = load <32 x double>, <32 x double>* %b + %sel = select <32 x i1> %mask, <32 x double> %op1, <32 x double> %op2 + store <32 x double> %sel, <32 x double>* %a + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll new file mode 100644 index 0000000000000..904e56fb8c096 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll @@ -0,0 +1,415 @@ +; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE +; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK +; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s 
-D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048 + +target triple = "aarch64-unknown-linux-gnu" + +; Don't use SVE when its registers are no bigger than NEON. +; NO_SVE-NOT: ptrue + +; Don't use SVE for 64-bit vectors. +define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) #0 { +; CHECK: select_v8i8: +; CHECK: bif v0.8b, v1.8b, v2.8b +; CHECK: ret + %sel = select <8 x i1> %mask, <8 x i8> %op1, <8 x i8> %op2 + ret <8 x i8> %sel +} + +; Don't use SVE for 128-bit vectors. +define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) #0 { +; CHECK: select_v16i8: +; CHECK: bif v0.16b, v1.16b, v2.16b +; CHECK: ret + %sel = select <16 x i1> %mask, <16 x i8> %op1, <16 x i8> %op2 + ret <16 x i8> %sel +} + +define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, <32 x i1>* %c) #0 { +; CHECK: select_v32i8: +; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]] +; CHECK: ptrue [[PG1:p[0-9]+]].b +; VBITS_GE_256: ld1b { [[MASK:z[0-9]+]].b }, [[PG]]/z, [x9] +; VBITS_GE_256-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_GE_256-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_GE_256-NEXT: and [[AND:z[0-9]+]].b, [[MASK]].b, #0x1 +; VBITS_GE_256-NEXT: cmpne [[COND:p[0-9]+]].b, [[PG1]]/z, [[AND]].b, #0 +; VBITS_GE_256-NEXT: sel [[RES:z[0-9]+]].b, [[COND]], [[OP1]].b, [[OP2]].b +; VBITS_GE_256-NEXT: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_GE_256: ret + %mask = load <32 x i1>, <32 x i1>* %c + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %sel = select <32 x i1> %mask, <32 x i8> %op1, <32 x i8> %op2 + store <32 x i8> %sel, <32 x i8>* %a + ret void +} + +define void @select_v64i8(<64 x i8>* %a, <64 x i8>* %b, <64 x i1>* %c) #0 { +; CHECK: select_v64i8: +; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]] +; CHECK: ptrue [[PG1:p[0-9]+]].b +; VBITS_GE_512: ld1b { [[MASK:z[0-9]+]].b }, [[PG]]/z, [x9] +; VBITS_GE_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_GE_512-NEXT: and [[AND:z[0-9]+]].b, [[MASK]].b, #0x1 +; VBITS_GE_512-NEXT: cmpne [[COND:p[0-9]+]].b, [[PG1]]/z, [[AND]].b, #0 +; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].b, [[COND]], [[OP1]].b, [[OP2]].b +; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_GE_512: ret + %mask = load <64 x i1>, <64 x i1>* %c + %op1 = load 
<64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %sel = select <64 x i1> %mask, <64 x i8> %op1, <64 x i8> %op2 + store <64 x i8> %sel, <64 x i8>* %a + ret void +} + +define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, <128 x i1>* %c) #0 { +; CHECK: select_v128i8: +; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]] +; CHECK: ptrue [[PG1:p[0-9]+]].b +; VBITS_GE_1024: ld1b { [[MASK:z[0-9]+]].b }, [[PG]]/z, [x9] +; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_GE_1024-NEXT: and [[AND:z[0-9]+]].b, [[MASK]].b, #0x1 +; VBITS_GE_1024-NEXT: cmpne [[COND:p[0-9]+]].b, [[PG1]]/z, [[AND]].b, #0 +; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].b, [[COND]], [[OP1]].b, [[OP2]].b +; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_GE_1024: ret + %mask = load <128 x i1>, <128 x i1>* %c + %op1 = load <128 x i8>, <128 x i8>* %a + %op2 = load <128 x i8>, <128 x i8>* %b + %sel = select <128 x i1> %mask, <128 x i8> %op1, <128 x i8> %op2 + store <128 x i8> %sel, <128 x i8>* %a + ret void +} + +define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, <256 x i1>* %c) #0 { +; CHECK: select_v256i8: +; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]] +; CHECK: ptrue [[PG1:p[0-9]+]].b +; VBITS_GE_2048: ld1b { [[MASK:z[0-9]+]].b }, [[PG]]/z, [x9] +; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_GE_2048-NEXT: and [[AND:z[0-9]+]].b, [[MASK]].b, #0x1 +; VBITS_GE_2048-NEXT: cmpne [[COND:p[0-9]+]].b, [[PG1]]/z, [[AND]].b, #0 +; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].b, [[COND]], [[OP1]].b, [[OP2]].b +; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_GE_2048: ret + %mask = load <256 x i1>, <256 x i1>* %c + %op1 = load <256 x i8>, <256 x i8>* %a + %op2 = load <256 x i8>, <256 x i8>* %b + %sel = select <256 x i1> %mask, <256 x i8> %op1, <256 x i8> %op2 + store <256 x i8> %sel, <256 x i8>* %a + ret void +} + +; Don't use SVE for 64-bit vectors. +define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, <4 x i1> %mask) #0 { +; CHECK: select_v4i16: +; CHECK: bif v0.8b, v1.8b, v2.8b +; CHECK: ret + %sel = select <4 x i1> %mask, <4 x i16> %op1, <4 x i16> %op2 + ret <4 x i16> %sel +} + +; Don't use SVE for 128-bit vectors. 
+define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) #0 { +; CHECK: select_v8i16: +; CHECK: bif v0.16b, v1.16b, v2.16b +; CHECK: ret + %sel = select <8 x i1> %mask, <8 x i16> %op1, <8 x i16> %op2 + ret <8 x i16> %sel +} + +define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, <16 x i1>* %c) #0 { +; CHECK: select_v16i16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]] +; CHECK: ptrue [[PG1:p[0-9]+]].h +; VBITS_GE_256: ld1h { [[MASK:z[0-9]+]].h }, [[PG]]/z, [x9] +; VBITS_GE_256-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_256-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_256-NEXT: and [[AND:z[0-9]+]].h, [[MASK]].h, #0x1 +; VBITS_GE_256-NEXT: cmpne [[COND:p[0-9]+]].h, [[PG1]]/z, [[AND]].h, #0 +; VBITS_GE_256-NEXT: sel [[RES:z[0-9]+]].h, [[COND]], [[OP1]].h, [[OP2]].h +; VBITS_GE_256-NEXT: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_256: ret + %mask = load <16 x i1>, <16 x i1>* %c + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %sel = select <16 x i1> %mask, <16 x i16> %op1, <16 x i16> %op2 + store <16 x i16> %sel, <16 x i16>* %a + ret void +} + +define void @select_v32i16(<32 x i16>* %a, <32 x i16>* %b, <32 x i1>* %c) #0 { +; CHECK: select_v32i16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]] +; CHECK: ptrue [[PG1:p[0-9]+]].h +; VBITS_GE_512: ld1h { [[MASK:z[0-9]+]].h }, [[PG]]/z, [x9] +; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_512-NEXT: and [[AND:z[0-9]+]].h, [[MASK]].h, #0x1 +; VBITS_GE_512-NEXT: cmpne [[COND:p[0-9]+]].h, [[PG1]]/z, [[AND]].h, #0 +; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].h, [[COND]], [[OP1]].h, [[OP2]].h +; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_512: ret + %mask = load <32 x i1>, <32 x i1>* %c + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %sel = select <32 x i1> %mask, <32 x i16> %op1, <32 x i16> %op2 + store <32 x i16> %sel, <32 x i16>* %a + ret void +} + +define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, <64 x i1>* %c) #0 { +; CHECK: select_v64i16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]] +; CHECK: ptrue [[PG1:p[0-9]+]].h +; VBITS_GE_1024: ld1h { [[MASK:z[0-9]+]].h }, [[PG]]/z, [x9] +; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_1024-NEXT: and [[AND:z[0-9]+]].h, [[MASK]].h, #0x1 +; VBITS_GE_1024-NEXT: cmpne [[COND:p[0-9]+]].h, [[PG1]]/z, [[AND]].h, #0 +; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].h, [[COND]], [[OP1]].h, [[OP2]].h +; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_1024: ret + %mask = load <64 x i1>, <64 x i1>* %c + %op1 = load <64 x i16>, <64 x i16>* %a + %op2 = load <64 x i16>, <64 x i16>* %b + %sel = select <64 x i1> %mask, <64 x i16> %op1, <64 x i16> %op2 + store <64 x i16> %sel, <64 x i16>* %a + ret void +} + +define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, <128 x i1>* %c) #0 { +; CHECK: select_v128i16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]] +; CHECK: ptrue [[PG1:p[0-9]+]].h +; VBITS_GE_2048: ld1h { [[MASK:z[0-9]+]].h }, [[PG]]/z, [x9] +; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_2048-NEXT: and [[AND:z[0-9]+]].h, [[MASK]].h, #0x1 +; VBITS_GE_2048-NEXT: cmpne [[COND:p[0-9]+]].h, [[PG1]]/z, 
[[AND]].h, #0 +; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].h, [[COND]], [[OP1]].h, [[OP2]].h +; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_2048: ret + %mask = load <128 x i1>, <128 x i1>* %c + %op1 = load <128 x i16>, <128 x i16>* %a + %op2 = load <128 x i16>, <128 x i16>* %b + %sel = select <128 x i1> %mask, <128 x i16> %op1, <128 x i16> %op2 + store <128 x i16> %sel, <128 x i16>* %a + ret void +} + +; Don't use SVE for 64-bit vectors. +define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, <2 x i1> %mask) #0 { +; CHECK: select_v2i32: +; CHECK: bif v0.8b, v1.8b, v2.8b +; CHECK: ret + %sel = select <2 x i1> %mask, <2 x i32> %op1, <2 x i32> %op2 + ret <2 x i32> %sel +} + +; Don't use SVE for 128-bit vectors. +define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) #0 { +; CHECK: select_v4i32: +; CHECK: bif v0.16b, v1.16b, v2.16b +; CHECK: ret + %sel = select <4 x i1> %mask, <4 x i32> %op1, <4 x i32> %op2 + ret <4 x i32> %sel +} + +define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, <8 x i1>* %c) #0 { +; CHECK: select_v8i32: +; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]] +; CHECK: ptrue [[PG1:p[0-9]+]].s +; VBITS_GE_256: ld1w { [[MASK:z[0-9]+]].s }, [[PG]]/z, [x9] +; VBITS_GE_256-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_256-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_256-NEXT: and [[AND:z[0-9]+]].s, [[MASK]].s, #0x1 +; VBITS_GE_256-NEXT: cmpne [[COND:p[0-9]+]].s, [[PG1]]/z, [[AND]].s, #0 +; VBITS_GE_256-NEXT: sel [[RES:z[0-9]+]].s, [[COND]], [[OP1]].s, [[OP2]].s +; VBITS_GE_256-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_256: ret + %mask = load <8 x i1>, <8 x i1>* %c + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %sel = select <8 x i1> %mask, <8 x i32> %op1, <8 x i32> %op2 + store <8 x i32> %sel, <8 x i32>* %a + ret void +} + +define void @select_v16i32(<16 x i32>* %a, <16 x i32>* %b, <16 x i1>* %c) #0 { +; CHECK: select_v16i32: +; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]] +; CHECK: ptrue [[PG1:p[0-9]+]].s +; VBITS_GE_512: ld1w { [[MASK:z[0-9]+]].s }, [[PG]]/z, [x9] +; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_512-NEXT: and [[AND:z[0-9]+]].s, [[MASK]].s, #0x1 +; VBITS_GE_512-NEXT: cmpne [[COND:p[0-9]+]].s, [[PG1]]/z, [[AND]].s, #0 +; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].s, [[COND]], [[OP1]].s, [[OP2]].s +; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_512: ret + %mask = load <16 x i1>, <16 x i1>* %c + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %sel = select <16 x i1> %mask, <16 x i32> %op1, <16 x i32> %op2 + store <16 x i32> %sel, <16 x i32>* %a + ret void +} + +define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, <32 x i1>* %c) #0 { +; CHECK: select_v32i32: +; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]] +; CHECK: ptrue [[PG1:p[0-9]+]].s +; VBITS_GE_1024: ld1w { [[MASK:z[0-9]+]].s }, [[PG]]/z, [x9] +; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_1024-NEXT: and [[AND:z[0-9]+]].s, [[MASK]].s, #0x1 +; VBITS_GE_1024-NEXT: cmpne [[COND:p[0-9]+]].s, [[PG1]]/z, [[AND]].s, #0 +; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].s, [[COND]], [[OP1]].s, [[OP2]].s +; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_1024: ret + %mask = load <32 x i1>, <32 x i1>* %c + 
%op1 = load <32 x i32>, <32 x i32>* %a + %op2 = load <32 x i32>, <32 x i32>* %b + %sel = select <32 x i1> %mask, <32 x i32> %op1, <32 x i32> %op2 + store <32 x i32> %sel, <32 x i32>* %a + ret void +} + +define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, <64 x i1>* %c) #0 { +; CHECK: select_v64i32: +; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]] +; CHECK: ptrue [[PG1:p[0-9]+]].s +; VBITS_GE_2048: ld1w { [[MASK:z[0-9]+]].s }, [[PG]]/z, [x9] +; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_2048-NEXT: and [[AND:z[0-9]+]].s, [[MASK]].s, #0x1 +; VBITS_GE_2048-NEXT: cmpne [[COND:p[0-9]+]].s, [[PG1]]/z, [[AND]].s, #0 +; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].s, [[COND]], [[OP1]].s, [[OP2]].s +; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_2048: ret + %mask = load <64 x i1>, <64 x i1>* %c + %op1 = load <64 x i32>, <64 x i32>* %a + %op2 = load <64 x i32>, <64 x i32>* %b + %sel = select <64 x i1> %mask, <64 x i32> %op1, <64 x i32> %op2 + store <64 x i32> %sel, <64 x i32>* %a + ret void +} + +; Don't use SVE for 64-bit vectors. +define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) #0 { +; CHECK: select_v1i64: +; CHECK: bif v0.8b, v1.8b, v2.8b +; CHECK: ret + %sel = select <1 x i1> %mask, <1 x i64> %op1, <1 x i64> %op2 + ret <1 x i64> %sel +} + +; Don't use SVE for 128-bit vectors. +define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) #0 { +; CHECK: select_v2i64: +; CHECK: bif v0.16b, v1.16b, v2.16b +; CHECK: ret + %sel = select <2 x i1> %mask, <2 x i64> %op1, <2 x i64> %op2 + ret <2 x i64> %sel +} + +define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b, <4 x i1>* %c) #0 { +; CHECK: select_v4i64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]] +; CHECK: ptrue [[PG1:p[0-9]+]].d +; VBITS_GE_256: ld1d { [[MASK:z[0-9]+]].d }, [[PG]]/z, [x9] +; VBITS_GE_256-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_256-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_256-NEXT: and [[AND:z[0-9]+]].d, [[MASK]].d, #0x1 +; VBITS_GE_256-NEXT: cmpne [[COND:p[0-9]+]].d, [[PG1]]/z, [[AND]].d, #0 +; VBITS_GE_256-NEXT: sel [[RES:z[0-9]+]].d, [[COND]], [[OP1]].d, [[OP2]].d +; VBITS_GE_256-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_256: ret + %mask = load <4 x i1>, <4 x i1>* %c + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %sel = select <4 x i1> %mask, <4 x i64> %op1, <4 x i64> %op2 + store <4 x i64> %sel, <4 x i64>* %a + ret void +} + +define void @select_v8i64(<8 x i64>* %a, <8 x i64>* %b, <8 x i1>* %c) #0 { +; CHECK: select_v8i64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]] +; CHECK: ptrue [[PG1:p[0-9]+]].d +; VBITS_GE_512: ld1d { [[MASK:z[0-9]+]].d }, [[PG]]/z, [x9] +; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_512-NEXT: and [[AND:z[0-9]+]].d, [[MASK]].d, #0x1 +; VBITS_GE_512-NEXT: cmpne [[COND:p[0-9]+]].d, [[PG1]]/z, [[AND]].d, #0 +; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].d, [[COND]], [[OP1]].d, [[OP2]].d +; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_512: ret + %mask = load <8 x i1>, <8 x i1>* %c + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %sel = select <8 x i1> %mask, <8 x i64> %op1, <8 x i64> %op2 + store <8 x i64> %sel, <8 x i64>* %a + ret void +} + +define void @select_v16i64(<16 x 
i64>* %a, <16 x i64>* %b, <16 x i1>* %c) #0 { +; CHECK: select_v16i64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]] +; CHECK: ptrue [[PG1:p[0-9]+]].d +; VBITS_GE_1024: ld1d { [[MASK:z[0-9]+]].d }, [[PG]]/z, [x9] +; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_1024-NEXT: and [[AND:z[0-9]+]].d, [[MASK]].d, #0x1 +; VBITS_GE_1024-NEXT: cmpne [[COND:p[0-9]+]].d, [[PG1]]/z, [[AND]].d, #0 +; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].d, [[COND]], [[OP1]].d, [[OP2]].d +; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_1024: ret + %mask = load <16 x i1>, <16 x i1>* %c + %op1 = load <16 x i64>, <16 x i64>* %a + %op2 = load <16 x i64>, <16 x i64>* %b + %sel = select <16 x i1> %mask, <16 x i64> %op1, <16 x i64> %op2 + store <16 x i64> %sel, <16 x i64>* %a + ret void +} + +define void @select_v32i64(<32 x i64>* %a, <32 x i64>* %b, <32 x i1>* %c) #0 { +; CHECK: select_v32i64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]] +; CHECK: ptrue [[PG1:p[0-9]+]].d +; VBITS_GE_2048: ld1d { [[MASK:z[0-9]+]].d }, [[PG]]/z, [x9] +; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_2048-NEXT: and [[AND:z[0-9]+]].d, [[MASK]].d, #0x1 +; VBITS_GE_2048-NEXT: cmpne [[COND:p[0-9]+]].d, [[PG1]]/z, [[AND]].d, #0 +; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].d, [[COND]], [[OP1]].d, [[OP2]].d +; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_2048: ret + %mask = load <32 x i1>, <32 x i1>* %c + %op1 = load <32 x i64>, <32 x i64>* %a + %op2 = load <32 x i64>, <32 x i64>* %b + %sel = select <32 x i1> %mask, <32 x i64> %op1, <32 x i64> %op2 + store <32 x i64> %sel, <32 x i64>* %a + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-fp.ll b/llvm/test/CodeGen/AArch64/sve-fp.ll index e4aea2847bc4c..5334e66b22f7e 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp.ll @@ -480,6 +480,68 @@ define void @float_copy(<vscale x 4 x float>* %P1, <vscale x 4 x float>* %P2) { ret void } +; FSQRT + +define <vscale x 8 x half> @fsqrt_nxv8f16(<vscale x 8 x half> %a) { +; CHECK-LABEL: fsqrt_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h +; CHECK-NEXT: ret + %res = call <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> %a) + ret <vscale x 8 x half> %res +} + +define <vscale x 4 x half> @fsqrt_nxv4f16(<vscale x 4 x half> %a) { +; CHECK-LABEL: fsqrt_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h +; CHECK-NEXT: ret + %res = call <vscale x 4 x half> @llvm.sqrt.nxv4f16(<vscale x 4 x half> %a) + ret <vscale x 4 x half> %res +} + +define <vscale x 2 x half> @fsqrt_nxv2f16(<vscale x 2 x half> %a) { +; CHECK-LABEL: fsqrt_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h +; CHECK-NEXT: ret + %res = call <vscale x 2 x half> @llvm.sqrt.nxv2f16(<vscale x 2 x half> %a) + ret <vscale x 2 x half> %res +} + +define <vscale x 4 x float> @fsqrt_nxv4f32(<vscale x 4 x float> %a) { +; CHECK-LABEL: fsqrt_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %res = call <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> %a) + ret <vscale x 4 x float> %res +} + +define <vscale x 2 x float> @fsqrt_nxv2f32(<vscale x 2 x float> %a) { +; CHECK-LABEL: fsqrt_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %res = call <vscale x 2 x float> @llvm.sqrt.nxv2f32(<vscale x 2 x float> %a) + ret <vscale x 2 x float> %res +} + +define <vscale x 2 x double> @fsqrt_nxv2f64(<vscale x 2 x double> %a) { +; CHECK-LABEL: fsqrt_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d +; CHECK-NEXT: ret + %res = call <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> %a) + ret <vscale x 2 x double> %res +} + declare <vscale x 8 x half> @llvm.aarch64.sve.frecps.x.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>) declare <vscale x 4 x float> @llvm.aarch64.sve.frecps.x.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>) declare <vscale x 2 x double> @llvm.aarch64.sve.frecps.x.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>) @@ -495,5 +557,12 @@ declare <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>) declare <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>) declare <vscale x 2 x half> @llvm.fma.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>) +declare <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half>) +declare <vscale x 4 x half> @llvm.sqrt.nxv4f16(<vscale x 4 x half>) +declare <vscale x 2 x half> @llvm.sqrt.nxv2f16(<vscale x 2 x half>) +declare <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float>) +declare <vscale x 2 x float> @llvm.sqrt.nxv2f32(<vscale x 2 x float>) +declare <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double>) + ; Function Attrs: nounwind readnone declare double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>) #2
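The FSQRT checks above pin down a simple lowering rule: a scalable llvm.sqrt.* call selects to a single predicated fsqrt governed by an all-active ptrue, and for the unpacked types (nxv4f16, nxv2f16, nxv2f32) the predicate is built at the container lane width (.s or .d) while fsqrt still operates at the element width. A minimal standalone module exercising the same path, illustrative only and not part of the patch (the function name is invented); it can be fed to llc -mtriple=aarch64-linux-gnu -mattr=+sve:

define <vscale x 2 x double> @sqrt_example(<vscale x 2 x double> %a) {
; Expected lowering, per the fsqrt_nxv2f64 checks above:
;   ptrue p0.d
;   fsqrt z0.d, p0/m, z0.d
  %res = call <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> %a)
  ret <vscale x 2 x double> %res
}
declare <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double>)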
diff --git a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll new file mode 100644 index 0000000000000..fbd9beceaa1f0 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll @@ -0,0 +1,97 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +; FP_TO_SINT + +; Split operand +define <vscale x 4 x i32> @fcvtzs_s_nxv4f64(<vscale x 4 x double> %a) { +; CHECK-LABEL: fcvtzs_s_nxv4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = fptosi <vscale x 4 x double> %a to <vscale x 4 x i32> + ret <vscale x 4 x i32> %res +} + +define <vscale x 8 x i16> @fcvtzs_h_nxv8f64(<vscale x 8 x double> %a) { +; CHECK-LABEL: fcvtzs_h_nxv8f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d +; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d +; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: ret + %res = fptosi <vscale x 8 x double> %a to <vscale x 8 x i16> + ret <vscale x 8 x i16> %res +} + +; Split result +define <vscale x 4 x i64> @fcvtzs_d_nxv4f32(<vscale x 4 x float> %a) { +; CHECK-LABEL: fcvtzs_d_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.s +; CHECK-NEXT: fcvtzs z1.d, p0/m, z2.s +; CHECK-NEXT: ret + %res = fptosi <vscale x 4 x float> %a to <vscale x 4 x i64> + ret <vscale x 4 x i64> %res +} + +define <vscale x 16 x i32> @fcvtzs_s_nxv16f16(<vscale x 16 x half> %a) { +; CHECK-LABEL: fcvtzs_s_nxv16f16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z2.s, z0.h +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: uunpklo z4.s, z1.h +; CHECK-NEXT: uunpkhi z5.s, z1.h +; CHECK-NEXT: fcvtzs z0.s, p0/m, z2.h +; CHECK-NEXT: fcvtzs z1.s, p0/m, z3.h +; CHECK-NEXT: fcvtzs z2.s, p0/m, z4.h +; CHECK-NEXT: fcvtzs z3.s, p0/m, z5.h +; CHECK-NEXT: ret + %res = fptosi <vscale x 16 x half> %a to <vscale x 16 x i32> + ret <vscale x 16 x i32> %res +} + +; FP_TO_UINT + +; Split operand +define <vscale x 4 x i32> @fcvtzu_s_nxv4f64(<vscale x 4 x double> %a) { +; CHECK-LABEL: fcvtzu_s_nxv4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = fptoui <vscale x 4 x double> %a to <vscale x 4 x i32> + ret <vscale x 4 x i32> %res +} + +; Split result +define <vscale x 4 x i64> @fcvtzu_d_nxv4f32(<vscale x 4 x float> %a) { +; CHECK-LABEL: fcvtzu_d_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: fcvtzu z0.d, p0/m, z1.s +; CHECK-NEXT: fcvtzu z1.d, p0/m, z2.s +; CHECK-NEXT: ret + %res = fptoui <vscale x 4 x float> %a to <vscale x 4 x i64> + ret <vscale x 4 x i64> %res +}
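The sve-split-fcvt.ll checks above show the two shapes of splitting. In the "split operand" cases the source type needs more Z registers than the result (<vscale x 4 x double> occupies two registers, <vscale x 4 x i32> one), so each source register is converted in place and the partial results are merged with uzp1, which keeps the even-numbered narrow elements, i.e. the low half of every wide lane. In the "split result" cases the result is the wider type, so the source is widened with uunpklo/uunpkhi first and each half is converted separately. A minimal sketch of the first shape, illustrative only (it isolates the fcvtzs_s_nxv4f64 pattern from the new test):

define <vscale x 4 x i32> @split_operand_sketch(<vscale x 4 x double> %a) {
; Expected code: both double halves are converted independently,
;   ptrue  p0.d
;   fcvtzs z1.d, p0/m, z1.d
;   fcvtzs z0.d, p0/m, z0.d
; then uzp1 collects the low .s half of every .d lane of the pair:
;   uzp1   z0.s, z0.s, z1.s
  %res = fptosi <vscale x 4 x double> %a to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}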
diff --git a/llvm/test/CodeGen/AArch64/sve-split-trunc.ll b/llvm/test/CodeGen/AArch64/sve-split-trunc.ll new file mode 100644 index 0000000000000..6c81c49070fb0 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-split-trunc.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s + +define <vscale x 16 x i8> @trunc_i16toi8(<vscale x 16 x i16> %in) { +; CHECK-LABEL: trunc_i16toi8: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %out = trunc <vscale x 16 x i16> %in to <vscale x 16 x i8> + ret <vscale x 16 x i8> %out +} + +define <vscale x 16 x i8> @trunc_i32toi8(<vscale x 16 x i32> %in) { +; CHECK-LABEL: trunc_i32toi8: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z2.h, z2.h, z3.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z2.b +; CHECK-NEXT: ret + %out = trunc <vscale x 16 x i32> %in to <vscale x 16 x i8> + ret <vscale x 16 x i8> %out +} + +define <vscale x 8 x i16> @trunc_i32toi16(<vscale x 8 x i32> %in) { +; CHECK-LABEL: trunc_i32toi16: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %out = trunc <vscale x 8 x i32> %in to <vscale x 8 x i16> + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @trunc_i64toi32(<vscale x 4 x i64> %in) { +; CHECK-LABEL: trunc_i64toi32: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %out = trunc <vscale x 4 x i64> %in to <vscale x 4 x i32> + ret <vscale x 4 x i32> %out +} + +define <vscale x 8 x i16> @trunc_i64toi16(<vscale x 8 x i64> %in) { +; CHECK-LABEL: trunc_i64toi16: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: ret + %out = trunc <vscale x 8 x i64> %in to <vscale x 8 x i16> + ret <vscale x 8 x i16> %out +} + +define <vscale x 16 x i8> @trunc_i64toi8(<vscale x 16 x i64> %in) { +; CHECK-LABEL: trunc_i64toi8: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z6.s, z6.s, z7.s +; CHECK-NEXT: uzp1 z4.s, z4.s, z5.s +; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z4.h, z6.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %out = trunc <vscale x 16 x i64> %in to <vscale x 16 x i8> + ret <vscale x 16 x i8> %out +}
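The sve-split-trunc.ll tests above follow the same recipe: an N-register scalable integer vector is narrowed by a balanced tree of uzp1 instructions, each level halving the register count by keeping the low half of every lane, so trunc_i64toi8 collapses eight registers (z0-z7) to four at .s, two at .h and one at .b, seven uzp1s in total. A two-register sketch, illustrative only (it isolates the trunc_i64toi32 pattern):

define <vscale x 4 x i32> @trunc_sketch(<vscale x 4 x i64> %in) {
; Expected code, a single uzp1 keeping the low 32 bits of each 64-bit lane:
;   uzp1 z0.s, z0.s, z1.s
  %out = trunc <vscale x 4 x i64> %in to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %out
}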
diff --git a/llvm/test/CodeGen/AArch64/swifterror.ll b/llvm/test/CodeGen/AArch64/swifterror.ll index 1eedb76204317..a8635f682ff10 100644 --- a/llvm/test/CodeGen/AArch64/swifterror.ll +++ b/llvm/test/CodeGen/AArch64/swifterror.ll @@ -339,14 +339,14 @@ define float @foo_vararg(%swift_error** swifterror %error_ptr_ref, ...) { ; CHECK-APPLE: malloc ; First vararg -; CHECK-APPLE-AARCH64: ldr {{w[0-9]+}}, [{{.*}}[[TMP:x[0-9]+]], #16] ; CHECK-APPLE-AARCH64: mov [[ID:w[0-9]+]], #1 +; CHECK-APPLE-AARCH64: ldr {{w[0-9]+}}, [{{.*}}[[TMP:x[0-9]+]], #16] ; CHECK-APPLE-AARCH64: add [[ARGS:x[0-9]+]], [[TMP]], #16 +; Third vararg +; CHECK-APPLE-AARCH64: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #32] ; CHECK-APPLE-AARCH64: strb [[ID]], [x0, #8] ; Second vararg ; CHECK-APPLE-AARCH64: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #24] -; Third vararg -; CHECK-APPLE-AARCH64: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #32] ; CHECK-APPLE-ARM64_32: mov [[ID:w[0-9]+]], #1 ; CHECK-APPLE-ARM64_32: add [[ARGS:x[0-9]+]], [[TMP:x[0-9]+]], #16 diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll index 40bbac2c05579..5f92f713573d1 100644 --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -159,8 +159,8 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind { ; CHECK-NEXT: ushr v0.2s, v0.2s, #24 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strb w8, [x2, #1] ; CHECK-NEXT: strb w9, [x2] +; CHECK-NEXT: strb w8, [x2, #1] ; CHECK-NEXT: ret %x = load <2 x i8>, <2 x i8>* %px %y = load <2 x i8>, <2 x i8>* %py @@ -201,8 +201,8 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind { ; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strh w8, [x2, #2] ; CHECK-NEXT: strh w9, [x2] +; CHECK-NEXT: strh w8, [x2, #2] ; CHECK-NEXT: ret %x = load <2 x i16>, <2 x i16>* %px %y = load <2 x i16>, <2 x i16>* %py diff --git a/llvm/test/CodeGen/AArch64/unwind-preserved.ll b/llvm/test/CodeGen/AArch64/unwind-preserved.ll index cf2a8e9b4a36a..68fec08255428 100644 --- a/llvm/test/CodeGen/AArch64/unwind-preserved.ll +++ b/llvm/test/CodeGen/AArch64/unwind-preserved.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -O0 -global-isel=0 -global-isel-abort=0 < %s | FileCheck %s -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -O0 -global-isel=1 -global-isel-abort=0 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -O0 -global-isel=1 -global-isel-abort=0 < %s | FileCheck %s --check-prefix=GISEL ; Test that z0 is saved/restored, as the unwinder may only retain the low 64bits (d0). define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) personality i8 0 { @@ -125,6 +125,128 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) pe ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret +; +; GISEL-LABEL: invoke_callee_may_throw_sve: +; GISEL: .Lfunc_begin0: +; GISEL-NEXT: .cfi_startproc +; GISEL-NEXT: // %bb.0: +; GISEL-NEXT: stp x29, x30, [sp, #-16]!
// 16-byte Folded Spill +; GISEL-NEXT: addvl sp, sp, #-18 +; GISEL-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; GISEL-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; GISEL-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; GISEL-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; GISEL-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; GISEL-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; GISEL-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; GISEL-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; GISEL-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; GISEL-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; GISEL-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; GISEL-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; GISEL-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; GISEL-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; GISEL-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; GISEL-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; GISEL-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; GISEL-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; GISEL-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; GISEL-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; GISEL-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; GISEL-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; GISEL-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; GISEL-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; GISEL-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; GISEL-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; GISEL-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; GISEL-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; GISEL-NEXT: addvl sp, sp, #-2 +; GISEL-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG +; GISEL-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; GISEL-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; GISEL-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; GISEL-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; GISEL-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +; GISEL-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +; GISEL-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +; GISEL-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +; GISEL-NEXT: .cfi_offset w30, -8 +; GISEL-NEXT: .cfi_offset w29, -16 +; GISEL-NEXT: .Ltmp0: +; GISEL-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill +; GISEL-NEXT: bl may_throw_sve +; GISEL-NEXT: .Ltmp1: +; GISEL-NEXT: str z0, [sp] // 16-byte Folded Spill +; GISEL-NEXT: b .LBB0_1 +; GISEL-NEXT: .LBB0_1: // %.Lcontinue +; GISEL-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; GISEL-NEXT: addvl sp, sp, #2 +; GISEL-NEXT: ldr p15, [sp, #4, mul 
vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: addvl sp, sp, #18 +; GISEL-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; GISEL-NEXT: ret +; GISEL-NEXT: .LBB0_2: // %.Lunwind +; GISEL-NEXT: .Ltmp2: +; GISEL-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: addvl sp, sp, #2 +; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z13, [sp, #12, 
mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: addvl sp, sp, #18 +; GISEL-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; GISEL-NEXT: ret %result = invoke <vscale x 4 x i32> @may_throw_sve(<vscale x 4 x i32> %v) to label %.Lcontinue unwind label %.Lunwind .Lcontinue: ret <vscale x 4 x i32> %result @@ -204,6 +326,72 @@ define aarch64_vector_pcs <4 x i32> @invoke_callee_may_throw_neon(<4 x i32> %v) ; CHECK-NEXT: ldp q23, q22, [sp, #32] // 32-byte Folded Reload ; CHECK-NEXT: add sp, sp, #304 // =304 ; CHECK-NEXT: ret +; +; GISEL-LABEL: invoke_callee_may_throw_neon: +; GISEL: .Lfunc_begin1: +; GISEL-NEXT: .cfi_startproc +; GISEL-NEXT: // %bb.0: +; GISEL-NEXT: sub sp, sp, #304 // =304 +; GISEL-NEXT: stp q23, q22, [sp, #32] // 32-byte Folded Spill +; GISEL-NEXT: stp q21, q20, [sp, #64] // 32-byte Folded Spill +; GISEL-NEXT: stp q19, q18, [sp, #96] // 32-byte Folded Spill +; GISEL-NEXT: stp q17, q16, [sp, #128] // 32-byte Folded Spill +; GISEL-NEXT: stp q15, q14, [sp, #160] // 32-byte Folded Spill +; GISEL-NEXT: stp q13, q12, [sp, #192] // 32-byte Folded Spill +; GISEL-NEXT: stp q11, q10, [sp, #224] // 32-byte Folded Spill +; GISEL-NEXT: stp q9, q8, [sp, #256] // 32-byte Folded Spill +; GISEL-NEXT: stp x29, x30, [sp, #288] // 16-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 304 +; GISEL-NEXT: .cfi_offset w30, -8 +; GISEL-NEXT: .cfi_offset w29, -16 +; GISEL-NEXT: .cfi_offset b8, -32 +; GISEL-NEXT: .cfi_offset b9, -48 +; GISEL-NEXT: .cfi_offset b10, -64 +; GISEL-NEXT: .cfi_offset b11, -80 +; GISEL-NEXT: .cfi_offset b12, -96 +; GISEL-NEXT: .cfi_offset b13, -112 +; GISEL-NEXT: .cfi_offset b14, -128 +; GISEL-NEXT: .cfi_offset b15, -144 +; GISEL-NEXT: .cfi_offset b16, -160 +; GISEL-NEXT: .cfi_offset b17, -176 +; GISEL-NEXT: .cfi_offset b18, -192 +; GISEL-NEXT: .cfi_offset b19, -208 +; GISEL-NEXT: .cfi_offset b20, -224 +; GISEL-NEXT: .cfi_offset b21, -240 +; GISEL-NEXT: .cfi_offset b22, -256 +; GISEL-NEXT: .cfi_offset b23, -272 +; GISEL-NEXT: .Ltmp3: +; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; GISEL-NEXT: bl may_throw_neon +; GISEL-NEXT: .Ltmp4: +; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill +; GISEL-NEXT: // %bb.1: // %.Lcontinue +; GISEL-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; GISEL-NEXT: ldp x29, x30, [sp, #288] // 16-byte Folded Reload +; GISEL-NEXT: ldp q9, q8, [sp, #256] // 32-byte Folded Reload +; GISEL-NEXT: ldp q11, q10, [sp, #224] // 32-byte Folded Reload +; GISEL-NEXT: ldp q13, q12, [sp, #192] // 32-byte Folded Reload +; GISEL-NEXT: ldp q15, q14, [sp, #160] // 32-byte Folded Reload +; GISEL-NEXT: ldp q17, q16, [sp, #128] // 32-byte Folded Reload +; GISEL-NEXT: ldp q19, q18, [sp, #96] // 32-byte Folded Reload +; GISEL-NEXT: ldp q21, q20, [sp, #64] // 32-byte Folded Reload +; GISEL-NEXT: ldp q23, q22, [sp, #32] // 32-byte Folded Reload +; GISEL-NEXT: add sp, sp, #304 // =304 +; GISEL-NEXT: ret +; GISEL-NEXT: .LBB1_2: // %.Lunwind +; GISEL-NEXT: .Ltmp5: +; GISEL-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; GISEL-NEXT: ldp x29, x30, [sp, #288] // 16-byte Folded Reload +; GISEL-NEXT: ldp q9, q8, [sp, #256] // 32-byte Folded Reload +; GISEL-NEXT: ldp q11, q10, [sp, #224] // 32-byte Folded Reload +; GISEL-NEXT: ldp q13, q12, [sp, #192] // 32-byte Folded Reload +; GISEL-NEXT:
ldp q15, q14, [sp, #160] // 32-byte Folded Reload +; GISEL-NEXT: ldp q17, q16, [sp, #128] // 32-byte Folded Reload +; GISEL-NEXT: ldp q19, q18, [sp, #96] // 32-byte Folded Reload +; GISEL-NEXT: ldp q21, q20, [sp, #64] // 32-byte Folded Reload +; GISEL-NEXT: ldp q23, q22, [sp, #32] // 32-byte Folded Reload +; GISEL-NEXT: add sp, sp, #304 // =304 +; GISEL-NEXT: ret %result = invoke aarch64_vector_pcs <4 x i32> @may_throw_neon(<4 x i32> %v) to label %.Lcontinue unwind label %.Lunwind .Lcontinue: ret <4 x i32> %result diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll index 3eacf03dc6a87..08114f49bdeb7 100644 --- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -160,8 +160,8 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind { ; CHECK-NEXT: ushr v0.2s, v0.2s, #24 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strb w8, [x2, #1] ; CHECK-NEXT: strb w9, [x2] +; CHECK-NEXT: strb w8, [x2, #1] ; CHECK-NEXT: ret %x = load <2 x i8>, <2 x i8>* %px %y = load <2 x i8>, <2 x i8>* %py @@ -202,8 +202,8 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind { ; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strh w8, [x2, #2] ; CHECK-NEXT: strh w9, [x2] +; CHECK-NEXT: strh w8, [x2, #2] ; CHECK-NEXT: ret %x = load <2 x i16>, <2 x i16>* %px %y = load <2 x i16>, <2 x i16>* %py diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll index 3df3d2a6f4f6a..90367377fb4a0 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll @@ -1,28 +1,53 @@ -; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=generic -asm-verbose=0 -mattr=+fullfp16 | FileCheck %s -; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=generic -asm-verbose=0 | FileCheck %s --check-prefix=CHECKNOFP16 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=aarch64-eabi -aarch64-neon-syntax=generic -mattr=+fullfp16 < %s | FileCheck %s +; RUN: llc --mtriple=aarch64-eabi -aarch64-neon-syntax=generic < %s | FileCheck %s --check-prefix=CHECKNOFP16 define float @add_HalfS(<2 x float> %bin.rdx) { ; CHECK-LABEL: add_HalfS: -; CHECK: faddp s0, v0.2s -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: ret +; +; CHECKNOFP16-LABEL: add_HalfS: +; CHECKNOFP16: // %bb.0: +; CHECKNOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECKNOFP16-NEXT: faddp s0, v0.2s +; CHECKNOFP16-NEXT: ret %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %bin.rdx) ret float %r } define half @add_HalfH(<4 x half> %bin.rdx) { ; CHECK-LABEL: add_HalfH: -; CHECK: mov h3, v0.h[1] -; CHECK-NEXT: mov h1, v0.h[3] -; CHECK-NEXT: mov h2, v0.h[2] -; CHECK-NEXT: fadd h0, h0, h3 -; CHECK-NEXT: fadd h0, h0, h2 -; CHECK-NEXT: fadd h0, h0, h1 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov h1, v0.h[3] +; CHECK-NEXT: mov h2, v0.h[2] +; CHECK-NEXT: faddp h0, v0.2h +; CHECK-NEXT: fadd h0, h0, h2 +; CHECK-NEXT: fadd h0, h0, h1 +; CHECK-NEXT: ret +; ; CHECKNOFP16-LABEL: add_HalfH: -; CHECKNOFP16-NOT: faddp -; CHECKNOFP16-NOT: fadd h{{[0-9]+}} -; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h -; CHECKNOFP16: ret +; CHECKNOFP16: // %bb.0: +; 
CHECKNOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECKNOFP16-NEXT: mov h3, v0.h[1] +; CHECKNOFP16-NEXT: mov h1, v0.h[3] +; CHECKNOFP16-NEXT: mov h2, v0.h[2] +; CHECKNOFP16-NEXT: fcvt s0, h0 +; CHECKNOFP16-NEXT: fcvt s3, h3 +; CHECKNOFP16-NEXT: fadd s0, s0, s3 +; CHECKNOFP16-NEXT: fcvt h0, s0 +; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s0, h0 +; CHECKNOFP16-NEXT: fadd s0, s0, s2 +; CHECKNOFP16-NEXT: fcvt h0, s0 +; CHECKNOFP16-NEXT: fcvt s0, h0 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fadd s0, s0, s1 +; CHECKNOFP16-NEXT: fcvt h0, s0 +; CHECKNOFP16-NEXT: ret %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half 0.0, <4 x half> %bin.rdx) ret half %r } @@ -30,80 +55,216 @@ define half @add_HalfH(<4 x half> %bin.rdx) { define half @add_H(<8 x half> %bin.rdx) { ; CHECK-LABEL: add_H: -; CHECK: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fadd v0.4h, v0.4h, v1.4h -; CHECK-NEXT: mov h1, v0.h[1] -; CHECK-NEXT: mov h2, v0.h[2] -; CHECK-NEXT: fadd h1, h0, h1 -; CHECK-NEXT: fadd h1, h1, h2 -; CHECK-NEXT: mov h0, v0.h[3] -; CHECK-NEXT: fadd h0, h1, h0 -; CHECK-NEXT: ret - +; CHECK: // %bb.0: +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: fadd v0.4h, v0.4h, v1.4h +; CHECK-NEXT: mov h1, v0.h[2] +; CHECK-NEXT: faddp h2, v0.2h +; CHECK-NEXT: fadd h1, h2, h1 +; CHECK-NEXT: mov h0, v0.h[3] +; CHECK-NEXT: fadd h0, h1, h0 +; CHECK-NEXT: ret +; ; CHECKNOFP16-LABEL: add_H: -; CHECKNOFP16-NOT: faddp -; CHECKNOFP16-NOT: fadd h{{[0-9]+}} -; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h -; CHECKNOFP16: ret +; CHECKNOFP16: // %bb.0: +; CHECKNOFP16-NEXT: mov h7, v0.h[1] +; CHECKNOFP16-NEXT: mov h1, v0.h[7] +; CHECKNOFP16-NEXT: mov h2, v0.h[6] +; CHECKNOFP16-NEXT: mov h3, v0.h[5] +; CHECKNOFP16-NEXT: mov h4, v0.h[4] +; CHECKNOFP16-NEXT: mov h5, v0.h[3] +; CHECKNOFP16-NEXT: mov h6, v0.h[2] +; CHECKNOFP16-NEXT: fcvt s0, h0 +; CHECKNOFP16-NEXT: fcvt s7, h7 +; CHECKNOFP16-NEXT: fadd s0, s0, s7 +; CHECKNOFP16-NEXT: fcvt h0, s0 +; CHECKNOFP16-NEXT: fcvt s6, h6 +; CHECKNOFP16-NEXT: fcvt s0, h0 +; CHECKNOFP16-NEXT: fadd s0, s0, s6 +; CHECKNOFP16-NEXT: fcvt h0, s0 +; CHECKNOFP16-NEXT: fcvt s5, h5 +; CHECKNOFP16-NEXT: fcvt s0, h0 +; CHECKNOFP16-NEXT: fadd s0, s0, s5 +; CHECKNOFP16-NEXT: fcvt h0, s0 +; CHECKNOFP16-NEXT: fcvt s4, h4 +; CHECKNOFP16-NEXT: fcvt s0, h0 +; CHECKNOFP16-NEXT: fadd s0, s0, s4 +; CHECKNOFP16-NEXT: fcvt h0, s0 +; CHECKNOFP16-NEXT: fcvt s3, h3 +; CHECKNOFP16-NEXT: fcvt s0, h0 +; CHECKNOFP16-NEXT: fadd s0, s0, s3 +; CHECKNOFP16-NEXT: fcvt h0, s0 +; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s0, h0 +; CHECKNOFP16-NEXT: fadd s0, s0, s2 +; CHECKNOFP16-NEXT: fcvt h0, s0 +; CHECKNOFP16-NEXT: fcvt s0, h0 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fadd s0, s0, s1 +; CHECKNOFP16-NEXT: fcvt h0, s0 +; CHECKNOFP16-NEXT: ret %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half 0.0, <8 x half> %bin.rdx) ret half %r } define float @add_S(<4 x float> %bin.rdx) { ; CHECK-LABEL: add_S: -; CHECK: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s -; CHECK-NEXT: faddp s0, v0.2s -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: ret +; +; CHECKNOFP16-LABEL: add_S: +; CHECKNOFP16: // %bb.0: +; CHECKNOFP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECKNOFP16-NEXT: fadd v0.2s, v0.2s, v1.2s +; CHECKNOFP16-NEXT: faddp s0, v0.2s +; CHECKNOFP16-NEXT: ret %r = call fast float 
@llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %bin.rdx) ret float %r } define double @add_D(<2 x double> %bin.rdx) { ; CHECK-LABEL: add_D: -; CHECK: faddp d0, v0.2d -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: faddp d0, v0.2d +; CHECK-NEXT: ret +; +; CHECKNOFP16-LABEL: add_D: +; CHECKNOFP16: // %bb.0: +; CHECKNOFP16-NEXT: faddp d0, v0.2d +; CHECKNOFP16-NEXT: ret %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %bin.rdx) ret double %r } define half @add_2H(<16 x half> %bin.rdx) { ; CHECK-LABEL: add_2H: -; CHECK: fadd v0.8h, v0.8h, v1.8h -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fadd v0.4h, v0.4h, v1.4h -; CHECK-NEXT: mov h1, v0.h[1] -; CHECK-NEXT: mov h2, v0.h[2] -; CHECK-NEXT: fadd h1, h0, h1 -; CHECK-NEXT: fadd h1, h1, h2 -; CHECK-NEXT: mov h0, v0.h[3] -; CHECK-NEXT: fadd h0, h1, h0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: fadd v0.4h, v0.4h, v1.4h +; CHECK-NEXT: mov h1, v0.h[2] +; CHECK-NEXT: faddp h2, v0.2h +; CHECK-NEXT: fadd h1, h2, h1 +; CHECK-NEXT: mov h0, v0.h[3] +; CHECK-NEXT: fadd h0, h1, h0 +; CHECK-NEXT: ret +; ; CHECKNOFP16-LABEL: add_2H: -; CHECKNOFP16-NOT: faddp -; CHECKNOFP16-NOT: fadd h{{[0-9]+}} -; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h -; CHECKNOFP16: ret +; CHECKNOFP16: // %bb.0: +; CHECKNOFP16-NEXT: mov h2, v1.h[1] +; CHECKNOFP16-NEXT: mov h3, v0.h[1] +; CHECKNOFP16-NEXT: mov h6, v1.h[2] +; CHECKNOFP16-NEXT: mov h7, v0.h[2] +; CHECKNOFP16-NEXT: mov h16, v1.h[3] +; CHECKNOFP16-NEXT: mov h17, v0.h[3] +; CHECKNOFP16-NEXT: fcvt s4, h1 +; CHECKNOFP16-NEXT: fcvt s5, h0 +; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s3, h3 +; CHECKNOFP16-NEXT: fcvt s6, h6 +; CHECKNOFP16-NEXT: fcvt s7, h7 +; CHECKNOFP16-NEXT: fcvt s16, h16 +; CHECKNOFP16-NEXT: fcvt s17, h17 +; CHECKNOFP16-NEXT: fadd s4, s5, s4 +; CHECKNOFP16-NEXT: mov h5, v1.h[4] +; CHECKNOFP16-NEXT: fadd s2, s3, s2 +; CHECKNOFP16-NEXT: mov h3, v0.h[4] +; CHECKNOFP16-NEXT: fadd s6, s7, s6 +; CHECKNOFP16-NEXT: mov h7, v1.h[5] +; CHECKNOFP16-NEXT: fadd s16, s17, s16 +; CHECKNOFP16-NEXT: mov h17, v0.h[5] +; CHECKNOFP16-NEXT: fcvt s5, h5 +; CHECKNOFP16-NEXT: fcvt s3, h3 +; CHECKNOFP16-NEXT: fcvt s7, h7 +; CHECKNOFP16-NEXT: fcvt s17, h17 +; CHECKNOFP16-NEXT: fadd s3, s3, s5 +; CHECKNOFP16-NEXT: mov h5, v1.h[6] +; CHECKNOFP16-NEXT: fadd s7, s17, s7 +; CHECKNOFP16-NEXT: mov h17, v0.h[6] +; CHECKNOFP16-NEXT: mov h1, v1.h[7] +; CHECKNOFP16-NEXT: mov h0, v0.h[7] +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fcvt s0, h0 +; CHECKNOFP16-NEXT: fadd s0, s0, s1 +; CHECKNOFP16-NEXT: fcvt h1, s4 +; CHECKNOFP16-NEXT: fcvt h2, s2 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fadd s1, s1, s2 +; CHECKNOFP16-NEXT: fcvt h2, s6 +; CHECKNOFP16-NEXT: fcvt h1, s1 +; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fadd s1, s1, s2 +; CHECKNOFP16-NEXT: fcvt h2, s16 +; CHECKNOFP16-NEXT: fcvt h1, s1 +; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fadd s1, s1, s2 +; CHECKNOFP16-NEXT: fcvt h2, s3 +; CHECKNOFP16-NEXT: fcvt h1, s1 +; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fadd s1, s1, s2 +; CHECKNOFP16-NEXT: fcvt h3, s7 +; CHECKNOFP16-NEXT: fcvt h1, s1 +; CHECKNOFP16-NEXT: fcvt s5, h5 +; CHECKNOFP16-NEXT: fcvt s17, h17 +; CHECKNOFP16-NEXT: fcvt s3, h3 +; CHECKNOFP16-NEXT: fcvt s1, 
h1 +; CHECKNOFP16-NEXT: fadd s5, s17, s5 +; CHECKNOFP16-NEXT: fadd s1, s1, s3 +; CHECKNOFP16-NEXT: fcvt h4, s5 +; CHECKNOFP16-NEXT: fcvt h1, s1 +; CHECKNOFP16-NEXT: fcvt s4, h4 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fadd s1, s1, s4 +; CHECKNOFP16-NEXT: fcvt h0, s0 +; CHECKNOFP16-NEXT: fcvt h1, s1 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fcvt s0, h0 +; CHECKNOFP16-NEXT: fadd s0, s1, s0 +; CHECKNOFP16-NEXT: fcvt h0, s0 +; CHECKNOFP16-NEXT: ret %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half 0.0, <16 x half> %bin.rdx) ret half %r } define float @add_2S(<8 x float> %bin.rdx) { ; CHECK-LABEL: add_2S: -; CHECK: fadd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s -; CHECK-NEXT: faddp s0, v0.2s -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: ret +; +; CHECKNOFP16-LABEL: add_2S: +; CHECKNOFP16: // %bb.0: +; CHECKNOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECKNOFP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECKNOFP16-NEXT: fadd v0.2s, v0.2s, v1.2s +; CHECKNOFP16-NEXT: faddp s0, v0.2s +; CHECKNOFP16-NEXT: ret %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %bin.rdx) ret float %r } define double @add_2D(<4 x double> %bin.rdx) { ; CHECK-LABEL: add_2D: -; CHECK: fadd v0.2d, v0.2d, v1.2d -; CHECK-NEXT: faddp d0, v0.2d -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d +; CHECK-NEXT: faddp d0, v0.2d +; CHECK-NEXT: ret +; +; CHECKNOFP16-LABEL: add_2D: +; CHECKNOFP16: // %bb.0: +; CHECKNOFP16-NEXT: fadd v0.2d, v0.2d, v1.2d +; CHECKNOFP16-NEXT: faddp d0, v0.2d +; CHECKNOFP16-NEXT: ret %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %bin.rdx) ret double %r } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll index 4d888317b343e..514a43a5e171f 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll @@ -54,19 +54,7 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind { define fp128 @test_v2f128(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #48 // =48 -; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill -; CHECK-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill -; CHECK-NEXT: bl __gttf2 -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: cmp w0, #0 // =0 -; CHECK-NEXT: b.le .LBB4_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: .LBB4_2: -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #48 // =48 -; CHECK-NEXT: ret +; CHECK-NEXT: b fmaxl %b = call fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a) ret fp128 %b } @@ -77,11 +65,7 @@ define float @test_v16f32(<16 x float> %a) nounwind { ; CHECK-NEXT: fmaxnm v1.4s, v1.4s, v3.4s ; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v2.4s ; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s -; CHECK-NEXT: dup v1.2d, v0.d[1] -; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s -; CHECK-NEXT: dup v1.4s, v0.s[1] -; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: fmaxnmv s0, v0.4s ; CHECK-NEXT: ret %b = call float 
@llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a) ret float %b diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll index 975ba2687792f..5fd7116e9068b 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll @@ -47,7 +47,7 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind { define float @test_v3f32(<3 x float> %a) nounwind { ; CHECK-LABEL: test_v3f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-8388608 +; CHECK-NEXT: mov w8, #2143289344 ; CHECK-NEXT: fmov s1, w8 ; CHECK-NEXT: mov v0.s[3], v1.s[0] ; CHECK-NEXT: fmaxnmv s0, v0.4s @@ -56,6 +56,18 @@ define float @test_v3f32(<3 x float> %a) nounwind { ret float %b } +define float @test_v3f32_ninf(<3 x float> %a) nounwind { +; CHECK-LABEL: test_v3f32_ninf: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2143289344 +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NEXT: fmaxnmv s0, v0.4s +; CHECK-NEXT: ret + %b = call nnan ninf float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a) + ret float %b +} + define fp128 @test_v2f128(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128: ; CHECK: // %bb.0: diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll new file mode 100644 index 0000000000000..7a37c0d047a13 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll @@ -0,0 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK + +declare half @llvm.experimental.vector.reduce.fmin.v1f16(<1 x half> %a) +declare float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> %a) +declare double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %a) +declare fp128 @llvm.experimental.vector.reduce.fmin.v1f128(<1 x fp128> %a) + +declare float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float> %a) +declare fp128 @llvm.experimental.vector.reduce.fmin.v2f128(<2 x fp128> %a) +declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a) + +define half @test_v1f16(<1 x half> %a) nounwind { +; CHECK-LABEL: test_v1f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %b = call nnan half @llvm.experimental.vector.reduce.fmin.v1f16(<1 x half> %a) + ret half %b +} + +define float @test_v1f32(<1 x float> %a) nounwind { +; CHECK-LABEL: test_v1f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: ret + %b = call nnan float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> %a) + ret float %b +} + +define double @test_v1f64(<1 x double> %a) nounwind { +; CHECK-LABEL: test_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %b = call nnan double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %a) + ret double %b +} + +define fp128 @test_v1f128(<1 x fp128> %a) nounwind { +; CHECK-LABEL: test_v1f128: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %b = call nnan fp128 @llvm.experimental.vector.reduce.fmin.v1f128(<1 x fp128> %a) + ret fp128 %b +} + +define float @test_v3f32(<3 x float> %a) nounwind { +; CHECK-LABEL: test_v3f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2143289344 +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NEXT: fminnmv s0, v0.4s +; CHECK-NEXT: ret + %b = call nnan float 
@llvm.experimental.vector.reduce.fmin.v3f32(<3 x float> %a) + ret float %b +} + +define float @test_v3f32_ninf(<3 x float> %a) nounwind { +; CHECK-LABEL: test_v3f32_ninf: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2143289344 +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NEXT: fminnmv s0, v0.4s +; CHECK-NEXT: ret + %b = call nnan ninf float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float> %a) + ret float %b +} + +define fp128 @test_v2f128(<2 x fp128> %a) nounwind { +; CHECK-LABEL: test_v2f128: +; CHECK: // %bb.0: +; CHECK-NEXT: b fminl + %b = call nnan fp128 @llvm.experimental.vector.reduce.fmin.v2f128(<2 x fp128> %a) + ret fp128 %b +} + +define float @test_v16f32(<16 x float> %a) nounwind { +; CHECK-LABEL: test_v16f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fminnm v1.4s, v1.4s, v3.4s +; CHECK-NEXT: fminnm v0.4s, v0.4s, v2.4s +; CHECK-NEXT: fminnm v0.4s, v0.4s, v1.4s +; CHECK-NEXT: fminnmv s0, v0.4s +; CHECK-NEXT: ret + %b = call nnan float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a) + ret float %b +} diff --git a/llvm/test/CodeGen/AArch64/win64-jumptable.ll b/llvm/test/CodeGen/AArch64/win64-jumptable.ll index 0c61bcd52366a..1983b2568cdee 100644 --- a/llvm/test/CodeGen/AArch64/win64-jumptable.ll +++ b/llvm/test/CodeGen/AArch64/win64-jumptable.ll @@ -44,7 +44,6 @@ declare void @g(i32, i32) ; CHECK: .word .LBB0_3-.LJTI0_0 ; CHECK: .word .LBB0_4-.LJTI0_0 ; CHECK: .word .LBB0_5-.LJTI0_0 -; CHECK: .section .xdata,"dr" ; CHECK: .seh_handlerdata ; CHECK: .text ; CHECK: .seh_endproc diff --git a/llvm/test/CodeGen/AArch64/wineh-mingw.ll b/llvm/test/CodeGen/AArch64/wineh-mingw.ll index ff1a55711b9ea..d22c61fca7575 100644 --- a/llvm/test/CodeGen/AArch64/wineh-mingw.ll +++ b/llvm/test/CodeGen/AArch64/wineh-mingw.ll @@ -36,8 +36,7 @@ endtryfinally: ; WINEH: .seh_proc foo4 ; WINEH: .seh_handler _d_eh_personality, @unwind, @except ; WINEH: ret -; WINEH: .section .xdata,"dr" -; WINEH-NEXT: .seh_handlerdata +; WINEH: .seh_handlerdata ; WINEH-NEXT: .text ; WINEH-NEXT: .seh_endproc ; WINEH: .section .xdata,"dr" diff --git a/llvm/test/CodeGen/AArch64/wineh1.mir b/llvm/test/CodeGen/AArch64/wineh1.mir index aed1550c54f73..2f73a5291ddd0 100644 --- a/llvm/test/CodeGen/AArch64/wineh1.mir +++ b/llvm/test/CodeGen/AArch64/wineh1.mir @@ -73,7 +73,6 @@ # ASM: .seh_endepilogue # ASM: .seh_endfunclet -# ASM: .section .xdata,"dr" # ASM: .seh_handlerdata # ASM: .text # ASM: .seh_endproc diff --git a/llvm/test/CodeGen/AArch64/wineh3.mir b/llvm/test/CodeGen/AArch64/wineh3.mir index 6cbe7f42dc5ec..d1ffa4aedc085 100644 --- a/llvm/test/CodeGen/AArch64/wineh3.mir +++ b/llvm/test/CodeGen/AArch64/wineh3.mir @@ -8,9 +8,9 @@ # CHECK-NEXT: FunctionLength: 124 # CHECK-NEXT: Version: 0 # CHECK-NEXT: ExceptionData: No -# CHECK-NEXT: EpiloguePacked: No -# CHECK-NEXT: EpilogueScopes: 1 -# CHECK-NEXT: ByteCodeLength: 32 +# CHECK-NEXT: EpiloguePacked: Yes +# CHECK-NEXT: EpilogueOffset: 0 +# CHECK-NEXT: ByteCodeLength: 16 # CHECK-NEXT: Prologue [ # CHECK-NEXT: 0xc80c ; stp x19, x20, [sp, #96] # CHECK-NEXT: 0xc88a ; stp x21, x22, [sp, #80] @@ -21,22 +21,6 @@ # CHECK-NEXT: 0xda8d ; stp d10, d11, [sp, #-112]! 
# CHECK-NEXT: 0xe4 ; end # CHECK-NEXT: ] -# CHECK-NEXT: EpilogueScopes [ -# CHECK-NEXT: EpilogueScope { -# CHECK-NEXT: StartOffset: 23 -# CHECK-NEXT: EpilogueStartIndex: 15 -# CHECK-NEXT: Opcodes [ -# CHECK-NEXT: 0xc80c ; ldp x19, x20, [sp, #96] -# CHECK-NEXT: 0xc88a ; ldp x21, x22, [sp, #80] -# CHECK-NEXT: 0xc908 ; ldp x23, x24, [sp, #64] -# CHECK-NEXT: 0xc986 ; ldp x25, x26, [sp, #48] -# CHECK-NEXT: 0xca04 ; ldp x27, x28, [sp, #32] -# CHECK-NEXT: 0xd802 ; ldp d8, d9, [sp, #16] -# CHECK-NEXT: 0xda8d ; ldp d10, d11, [sp], #112 -# CHECK-NEXT: 0xe4 ; end -# CHECK-NEXT: ] -# CHECK-NEXT: } -# CHECK-NEXT: ] # CHECK-NEXT: } ... --- diff --git a/llvm/test/CodeGen/AArch64/wineh6.mir b/llvm/test/CodeGen/AArch64/wineh6.mir index 3ea7c0f20d45c..e7592bd711460 100644 --- a/llvm/test/CodeGen/AArch64/wineh6.mir +++ b/llvm/test/CodeGen/AArch64/wineh6.mir @@ -6,25 +6,19 @@ # CHECK-NEXT: FunctionLength: 92 # CHECK-NEXT: Version: 0 # CHECK-NEXT: ExceptionData: No -# CHECK-NEXT: EpiloguePacked: No -# CHECK-NEXT: EpilogueScopes: 1 -# CHECK-NEXT: ByteCodeLength: 8 +# CHECK-NEXT: EpiloguePacked: Yes +# CHECK-NEXT: EpilogueOffset: 1 +# CHECK-NEXT: ByteCodeLength: 4 # CHECK-NEXT: Prologue [ # CHECK-NEXT: 0x02 ; sub sp, #32 # CHECK-NEXT: 0xe1 ; mov fp, sp # CHECK-NEXT: 0x81 ; stp x29, x30, [sp, #-16]! # CHECK-NEXT: 0xe4 ; end # CHECK-NEXT: ] -# CHECK-NEXT: EpilogueScopes [ -# CHECK-NEXT: EpilogueScope { -# CHECK-NEXT: StartOffset: 20 -# CHECK-NEXT: EpilogueStartIndex: 4 -# CHECK-NEXT: Opcodes [ -# CHECK-NEXT: 0xe1 ; mov fp, sp -# CHECK-NEXT: 0x81 ; ldp x29, x30, [sp], #16 -# CHECK-NEXT: 0xe4 ; end -# CHECK-NEXT: ] -# CHECK-NEXT: } +# CHECK-NEXT: Epilogue [ +# CHECK-NEXT: 0xe1 ; mov sp, fp +# CHECK-NEXT: 0x81 ; ldp x29, x30, [sp], #16 +# CHECK-NEXT: 0xe4 ; end # CHECK-NEXT: ] # CHECK-NEXT: } ... diff --git a/llvm/test/CodeGen/AArch64/wineh7.mir b/llvm/test/CodeGen/AArch64/wineh7.mir index c445cbfd6b005..6bf06d80861a4 100644 --- a/llvm/test/CodeGen/AArch64/wineh7.mir +++ b/llvm/test/CodeGen/AArch64/wineh7.mir @@ -6,9 +6,9 @@ # CHECK-NEXT: FunctionLength: 72 # CHECK-NEXT: Version: 0 # CHECK-NEXT: ExceptionData: No -# CHECK-NEXT: EpiloguePacked: No -# CHECK-NEXT: EpilogueScopes: 1 -# CHECK-NEXT: ByteCodeLength: 16 +# CHECK-NEXT: EpiloguePacked: Yes +# CHECK-NEXT: EpilogueOffset: 0 +# CHECK-NEXT: ByteCodeLength: 8 # CHECK-NEXT: Prologue [ # CHECK-NEXT: 0xe204 ; add fp, sp, #32 # CHECK-NEXT: 0x44 ; stp x29, x30, [sp, #32] @@ -16,19 +16,6 @@ # CHECK-NEXT: 0xcc85 ; stp x21, x22, [sp, #-48]! # CHECK-NEXT: 0xe4 ; end # CHECK-NEXT: ] -# CHECK-NEXT: EpilogueScopes [ -# CHECK-NEXT: EpilogueScope { -# CHECK-NEXT: StartOffset: 13 -# CHECK-NEXT: EpilogueStartIndex: 8 -# CHECK-NEXT: Opcodes [ -# CHECK-NEXT: 0xe204 ; add fp, sp, #32 -# CHECK-NEXT: 0x44 ; ldp x29, x30, [sp, #32] -# CHECK-NEXT: 0xc802 ; ldp x19, x20, [sp, #16] -# CHECK-NEXT: 0xcc85 ; ldp x21, x22, [sp], #48 -# CHECK-NEXT: 0xe4 ; end -# CHECK-NEXT: ] -# CHECK-NEXT: } -# CHECK-NEXT: ] # CHECK-NEXT: } # CHECK-NEXT: } diff --git a/llvm/test/CodeGen/AArch64/zext-reg-coalesce.mir b/llvm/test/CodeGen/AArch64/zext-reg-coalesce.mir new file mode 100644 index 0000000000000..b31144b409fca --- /dev/null +++ b/llvm/test/CodeGen/AArch64/zext-reg-coalesce.mir @@ -0,0 +1,33 @@ +# RUN: llc -mtriple=aarch64-arm-none-eabi -o - %s \ +# RUN: -run-pass simple-register-coalescing | FileCheck %s + +# In this test case, the 32-bit copy implements a 32 to 64 bit zero extension +# and relies on the upper 32 bits being zeroed. 
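(Editorial aside, not part of the patch: the test comment above pins down a register-coalescing miscompile. A minimal hand-written LLVM IR sketch of the same value flow — the function and value names here are illustrative, not taken from the patch — would look roughly like this:)

@c = global i8 -1, align 4

define i64 @sketch() {
  %byte = load i8, i8* @c, align 4   ; the loaded byte is negative (-1)
  %wide = sext i8 %byte to i64       ; models the sign-extending LDRSBXui load
  %low  = trunc i64 %wide to i32     ; models the 32-bit COPY of sub_32
  %res  = zext i32 %low to i64       ; models SUBREG_TO_REG: upper 32 bits must be zero
  ret i64 %res                       ; correct result is 0x00000000ffffffff
}

(If the coalescer folds %low back into the 64-bit load result, the zext observes the sign bits and the function returns -1 instead of 0xffffffff — exactly the overwrite the comment goes on to describe below.)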
+# Coalescing to the result of the 64-bit load meant overwriting +# the upper 32 bits incorrectly when the loaded byte was negative. + +--- | + @c = local_unnamed_addr global i8 -1, align 4 + + define i64 @bug_e(i32 %i32) local_unnamed_addr { + ret i64 0 + } +... +--- +name: bug_e +tracksRegLiveness: true +body: | + bb.0: + liveins: $w0 + + %1:gpr32 = COPY $w0 + %2:gpr64common = ADRP target-flags(aarch64-page) @c + %3:gpr64 = LDRSBXui %2, target-flags(aarch64-pageoff, aarch64-nc) @c :: (dereferenceable load 1 from @c, align 4) + %0:gpr32 = COPY %3.sub_32 + ; CHECK: {{.*}}.sub_32:gpr64 = COPY {{.*}}.sub_32 + STRBBui %1, %2, target-flags(aarch64-pageoff, aarch64-nc) @c :: (store 1 into @c, align 4) + %8:gpr64all = SUBREG_TO_REG 0, %0, %subreg.sub_32 + $x0 = COPY %8 + ; CHECK: $x0 = COPY + RET_ReallyLR implicit $x0 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll new file mode 100644 index 0000000000000..0e232bf5945d8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll @@ -0,0 +1,149 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s + +; =================================================================================== +; V_ADD_LSHL_U32 +; =================================================================================== + +define amdgpu_ps float @add_shl(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: add_shl: +; VI: ; %bb.0: +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add_shl: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add_lshl_u32 v0, v0, v1, v2 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: add_shl: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_lshl_u32 v0, v0, v1, v2 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog + %x = add i32 %a, %b + %result = shl i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @add_shl_vgpr_c(i32 inreg %a, i32 inreg %b, i32 %c) { +; VI-LABEL: add_shl_vgpr_c: +; VI: ; %bb.0: +; VI-NEXT: s_add_i32 s2, s2, s3 +; VI-NEXT: v_lshlrev_b32_e64 v0, v0, s2 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add_shl_vgpr_c: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s2 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: add_shl_vgpr_c: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_i32 s2, s2, s3 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s2 +; GFX10-NEXT: ; return to shader part epilog + %x = add i32 %a, %b + %result = shl i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @add_shl_vgpr_ac(i32 %a, i32 inreg %b, i32 %c) { +; VI-LABEL: add_shl_vgpr_ac: +; VI: ; %bb.0: +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add_shl_vgpr_ac: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add_lshl_u32 v0, v0, s2, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: add_shl_vgpr_ac: +; GFX10: ; %bb.0: 
+; GFX10-NEXT: v_add_lshl_u32 v0, v0, s2, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog + %x = add i32 %a, %b + %result = shl i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @add_shl_vgpr_const(i32 %a, i32 %b) { +; VI-LABEL: add_shl_vgpr_const: +; VI: ; %bb.0: +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add_shl_vgpr_const: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add_lshl_u32 v0, v0, v1, 9 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: add_shl_vgpr_const: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_lshl_u32 v0, v0, v1, 9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog + %x = add i32 %a, %b + %result = shl i32 %x, 9 + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @add_shl_vgpr_const_inline_const(i32 %a) { +; VI-LABEL: add_shl_vgpr_const_inline_const: +; VI: ; %bb.0: +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x3f4, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add_shl_vgpr_const_inline_const: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3f4 +; GFX9-NEXT: v_add_lshl_u32 v0, v0, v1, 9 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: add_shl_vgpr_const_inline_const: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_lshl_u32 v0, v0, 0x3f4, 9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog + %x = add i32 %a, 1012 + %result = shl i32 %x, 9 + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @add_shl_vgpr_inline_const_x2(i32 %a) { +; VI-LABEL: add_shl_vgpr_inline_const_x2: +; VI: ; %bb.0: +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add_shl_vgpr_inline_const_x2: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add_lshl_u32 v0, v0, 3, 9 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: add_shl_vgpr_inline_const_x2: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_lshl_u32 v0, v0, 3, 9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog + %x = add i32 %a, 3 + %result = shl i32 %x, 9 + %bc = bitcast i32 %result to float + ret float %bc +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll index eebfbee8a12e8..cb6822bcf1ba5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll @@ -52,9 +52,10 @@ define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_xor_b32 s0, s0, -1 ; GCN-NEXT: s_and_b32 s0, s0, 1 ; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_cbranch_scc0 BB3_2 +; GCN-NEXT: s_cbranch_scc1 BB3_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: flat_store_dword v[0:1], v0 @@ -80,9 +81,10 @@ define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) { ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s0, s0, s1 +; GCN-NEXT: s_xor_b32 s0, s0, -1 ; GCN-NEXT: s_and_b32 s0, s0, 1 ; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_cbranch_scc0 BB4_2 +; GCN-NEXT: s_cbranch_scc1 BB4_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: 
flat_store_dword v[0:1], v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir index 41d0260c81f20..1cc5c9ce659d8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir @@ -12,9 +12,9 @@ body: | ; CHECK-LABEL: name: narrow_shl_s64_32_s64amt ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV]](s32) + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[TRUNC]](s32) ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_CONSTANT i64 32 @@ -32,9 +32,9 @@ body: | ; CHECK-LABEL: name: narrow_shl_s64_32 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV]](s32) + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[TRUNC]](s32) ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s32) = G_CONSTANT i32 32 @@ -52,9 +52,9 @@ body: | ; CHECK-LABEL: name: narrow_shl_s64_33 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[C]](s32) + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s32) ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C1]](s32), [[SHL]](s32) ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64) @@ -93,9 +93,9 @@ body: | ; CHECK-LABEL: name: narrow_shl_s64_63 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 - ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[C]](s32) + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s32) ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C1]](s32), [[SHL]](s32) ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll index 4b8554b781fd9..bf1f0ccbc2e24 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -205,24 +205,26 @@ define amdgpu_kernel void @break_loop(i32 %arg) { ; CHECK-NEXT: ; implicit-def: $vgpr1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_subrev_u32_e32 v0, s2, v0 -; CHECK-NEXT: BB5_1: ; %bb1 +; CHECK-NEXT: s_branch BB5_2 +; CHECK-NEXT: BB5_1: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB5_2 Depth=1 +; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3] +; CHECK-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; 
CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1] +; CHECK-NEXT: s_cbranch_execz BB5_4 +; CHECK-NEXT: BB5_2: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_u32_e32 v1, 1, v1 ; CHECK-NEXT: v_cmp_le_i32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 1 -; CHECK-NEXT: s_cbranch_vccnz BB5_3 -; CHECK-NEXT: ; %bb.2: ; %bb4 -; CHECK-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; CHECK-NEXT: s_cbranch_vccnz BB5_1 +; CHECK-NEXT: ; %bb.3: ; %bb4 +; CHECK-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; CHECK-NEXT: global_load_dword v2, v[0:1], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ge_i32_e64 s[2:3], v0, v2 -; CHECK-NEXT: BB5_3: ; %Flow -; CHECK-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3] -; CHECK-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1] -; CHECK-NEXT: s_cbranch_execnz BB5_1 -; CHECK-NEXT: ; %bb.4: ; %bb9 +; CHECK-NEXT: s_branch BB5_1 +; CHECK-NEXT: BB5_4: ; %bb9 ; CHECK-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll index 909c05925e7fe..4f9668f8d3697 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -40,7 +40,6 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_add_co_u32_e32 v59, vcc, v15, v11 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[3:6], v[15:16], off ; GCN-NEXT: global_load_dwordx4 v[7:10], v[15:16], off offset:16 ; GCN-NEXT: v_addc_co_u32_e32 v60, vcc, v16, v12, vcc @@ -56,214 +55,212 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GCN-NEXT: v_add_u32_e32 v0, 0x100, v0 ; GCN-NEXT: v_add_u32_e32 v1, 16, v0 -; GCN-NEXT: v_add_u32_e32 v2, 20, v0 ; GCN-NEXT: s_add_u32 s32, s32, 0x10000 ; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v49, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[47:50], v[48:49], off offset:48 ; GCN-NEXT: global_load_dwordx4 v[43:46], v[59:60], off ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: 
buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[51:54], v[59:60], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[55:58], v[59:60], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[59:62], v[59:60], off offset:48 ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 20, v0 +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 24, v0 -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 28, v0 ; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 28, v0 +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 32, v0 -; GCN-NEXT: v_add_u32_e32 v2, 36, v0 ; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 36, v0 +; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 40, v0 -; GCN-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 44, v0 ; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 44, v0 +; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 48, v0 -; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 52, v0 ; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 52, v0 +; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 56, v0 -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 60, v0 ; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 60, v0 +; GCN-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 64, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x44, v0 ; GCN-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x44, v0 +; GCN-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x48, v0 -; GCN-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x4c, v0 ; GCN-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x4c, v0 +; GCN-NEXT: buffer_store_dword v22, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x50, v0 -; GCN-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x54, v0 ; GCN-NEXT: buffer_store_dword v23, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x54, v0 +; GCN-NEXT: buffer_store_dword v24, v1, 
s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x58, v0 -; GCN-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x5c, v0 ; GCN-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x5c, v0 +; GCN-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x60, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x64, v0 ; GCN-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x64, v0 +; GCN-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x68, v0 -; GCN-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x6c, v0 ; GCN-NEXT: buffer_store_dword v29, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x6c, v0 +; GCN-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x70, v0 -; GCN-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x74, v0 ; GCN-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x74, v0 +; GCN-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x78, v0 -; GCN-NEXT: buffer_store_dword v32, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x7c, v0 ; GCN-NEXT: buffer_store_dword v33, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v34, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x80, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x84, v0 ; GCN-NEXT: buffer_store_dword v35, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x84, v0 +; GCN-NEXT: buffer_store_dword v36, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x88, v0 -; GCN-NEXT: buffer_store_dword v36, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x8c, v0 ; GCN-NEXT: buffer_store_dword v37, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x8c, v0 +; GCN-NEXT: buffer_store_dword v38, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x90, v0 -; GCN-NEXT: buffer_store_dword v38, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x94, v0 ; GCN-NEXT: buffer_store_dword v39, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x94, v0 +; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x98, v0 -; GCN-NEXT: buffer_store_dword v40, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x9c, v0 ; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v18, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v1, 0x9c, v0 +; GCN-NEXT: buffer_store_dword v42, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload ; GCN-NEXT: v_add_u32_e32 v1, 0xa0, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0xa4, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v8, v15 -; GCN-NEXT: v_mov_b32_e32 v9, v16 ; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v9, v16 +; GCN-NEXT: v_add_u32_e32 v1, 0xa4, v0 +; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v10, v17 ; GCN-NEXT: v_add_u32_e32 v1, 0xa8, v0 -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb0, v0 ; GCN-NEXT: v_mov_b32_e32 v11, v18 -; GCN-NEXT: v_add_u32_e32 v2, 0xac, v0 -; GCN-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xb4, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0xac, v0 +; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xb0, v0 ; GCN-NEXT: buffer_store_dword v47, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xb4, v0 +; GCN-NEXT: buffer_store_dword v48, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xb8, v0 -; GCN-NEXT: buffer_store_dword v48, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xbc, v0 ; GCN-NEXT: buffer_store_dword v49, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload 
-; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v2, 0xc4, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0xbc, v0 +; GCN-NEXT: buffer_store_dword v50, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: v_add_u32_e32 v1, 0xc0, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xcc, v0 ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xc4, v0 +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xc8, v0 -; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 4, v0 -; GCN-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xd0, v0 -; GCN-NEXT: v_add_u32_e32 v7, 8, v0 -; GCN-NEXT: v_add_u32_e32 v2, 12, v0 -; GCN-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xcc, v0 +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 4, v0 +; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; 
GCN-NEXT: v_add_u32_e32 v1, 8, v0 +; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 12, v0 +; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 -; GCN-NEXT: v_add_u32_e32 v2, 0xd4, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0xd8, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xdc, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0xd0, v0 ; GCN-NEXT: buffer_store_dword v51, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v4, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xd4, v0 +; GCN-NEXT: buffer_store_dword v52, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xd8, v0 +; GCN-NEXT: buffer_store_dword v53, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xdc, v0 +; GCN-NEXT: buffer_store_dword v54, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xe0, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0xe4, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0xe8, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xec, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0xf0, v0 -; GCN-NEXT: v_add_u32_e32 v6, 0xf4, v0 -; GCN-NEXT: v_add_u32_e32 v7, 0xf8, v0 -; GCN-NEXT: v_add_u32_e32 v8, 0xfc, v0 ; GCN-NEXT: buffer_store_dword v55, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v59, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v60, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v62, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 63, v1 +; GCN-NEXT: v_add_u32_e32 v1, 0xe4, v0 +; GCN-NEXT: buffer_store_dword v56, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xe8, v0 +; GCN-NEXT: buffer_store_dword v57, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xec, v0 +; GCN-NEXT: buffer_store_dword v58, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xf0, v0 +; GCN-NEXT: buffer_store_dword v59, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xf4, v0 +; GCN-NEXT: buffer_store_dword v60, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xf8, v0 +; GCN-NEXT: buffer_store_dword v61, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xfc, v0 +; GCN-NEXT: buffer_store_dword v62, v1, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v1, 63, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GCN-NEXT: v_add_u32_e32 v0, v0, v1 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen @@ -326,7 +323,6 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_add_co_u32_e32 v59, vcc, v15, v11 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[3:6], v[15:16], off ; GCN-NEXT: global_load_dwordx4 v[7:10], v[15:16], off offset:16 ; GCN-NEXT: v_addc_co_u32_e32 v60, vcc, v16, v12, vcc @@ -342,217 +338,215 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GCN-NEXT: v_add_u32_e32 v0, 0x100, v0 ; GCN-NEXT: v_add_u32_e32 v1, 16, v0 -; GCN-NEXT: v_add_u32_e32 v2, 20, v0 ; GCN-NEXT: s_add_u32 
s32, s32, 0x10000 ; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[47:50], v[48:49], off offset:48 ; GCN-NEXT: global_load_dwordx4 v[43:46], v[59:60], off ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[51:54], v[59:60], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[55:58], v[59:60], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[59:62], v[59:60], off offset:48 ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 20, v0 +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 24, v0 -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 28, v0 ; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 28, v0 +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 32, v0 -; GCN-NEXT: v_add_u32_e32 v2, 36, v0 ; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 36, v0 +; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 40, v0 -; GCN-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 44, v0 ; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 44, v0 +; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 48, v0 -; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; 
GCN-NEXT: v_add_u32_e32 v2, 52, v0 ; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 52, v0 +; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 56, v0 -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 60, v0 ; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 60, v0 +; GCN-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 64, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x44, v0 ; GCN-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x44, v0 +; GCN-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x48, v0 -; GCN-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x4c, v0 ; GCN-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x4c, v0 +; GCN-NEXT: buffer_store_dword v22, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x50, v0 -; GCN-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x54, v0 ; GCN-NEXT: buffer_store_dword v23, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x54, v0 +; GCN-NEXT: buffer_store_dword v24, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x58, v0 -; GCN-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x5c, v0 ; GCN-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x5c, v0 +; GCN-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x60, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x64, v0 ; GCN-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x64, v0 +; GCN-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x68, v0 -; GCN-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x6c, v0 ; GCN-NEXT: buffer_store_dword v29, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x6c, v0 +; GCN-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x70, v0 -; GCN-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x74, v0 ; GCN-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x74, v0 +; GCN-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x78, v0 -; GCN-NEXT: buffer_store_dword v32, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x7c, v0 ; GCN-NEXT: buffer_store_dword v33, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v34, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x80, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x84, v0 ; GCN-NEXT: buffer_store_dword v35, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x84, v0 +; GCN-NEXT: buffer_store_dword v36, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x88, v0 -; GCN-NEXT: buffer_store_dword v36, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x8c, v0 ; GCN-NEXT: buffer_store_dword v37, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x8c, v0 +; GCN-NEXT: buffer_store_dword v38, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x90, v0 -; GCN-NEXT: buffer_store_dword v38, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x94, v0 ; GCN-NEXT: buffer_store_dword v39, v1, s[0:3], 0 offen +; 
GCN-NEXT: v_add_u32_e32 v1, 0x94, v0 +; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x98, v0 -; GCN-NEXT: buffer_store_dword v40, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x9c, v0 ; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v1, 0x9c, v0 +; GCN-NEXT: buffer_store_dword v42, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload ; GCN-NEXT: v_add_u32_e32 v1, 0xa0, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0xa4, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v8, v15 -; GCN-NEXT: v_mov_b32_e32 v9, v16 ; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v9, v16 +; GCN-NEXT: v_add_u32_e32 v1, 0xa4, v0 +; GCN-NEXT: 
buffer_store_dword v9, v1, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v10, v17 ; GCN-NEXT: v_add_u32_e32 v1, 0xa8, v0 -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb0, v0 ; GCN-NEXT: v_mov_b32_e32 v11, v18 -; GCN-NEXT: v_add_u32_e32 v2, 0xac, v0 -; GCN-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xb4, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0xac, v0 +; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xb0, v0 ; GCN-NEXT: buffer_store_dword v47, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xb4, v0 +; GCN-NEXT: buffer_store_dword v48, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xb8, v0 -; GCN-NEXT: buffer_store_dword v48, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xbc, v0 ; GCN-NEXT: buffer_store_dword v49, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v2, 0xc4, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0xbc, v0 +; GCN-NEXT: buffer_store_dword v50, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload +; 
GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: v_add_u32_e32 v1, 0xc0, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xcc, v0 ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xc4, v0 +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xc8, v0 -; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 4, v0 -; GCN-NEXT: v_add_u32_e32 v7, 8, v0 ; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 12, v0 -; GCN-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xcc, v0 +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 4, v0 +; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 8, v0 +; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 12, v0 +; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v1, 0xd0, v0 +; GCN-NEXT: v_add_u32_e32 v3, 0xd0, v0 +; GCN-NEXT: buffer_store_dword v51, v3, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v3, 0xd4, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xd8, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0xdc, v0 -; GCN-NEXT: buffer_store_dword v51, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v52, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v5, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xe0, v0 +; GCN-NEXT: v_add_u32_e32 v3, 0xd8, v0 +; GCN-NEXT: buffer_store_dword v53, v3, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v3, 0xdc, v0 +; GCN-NEXT: buffer_store_dword v54, v3, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v3, 0xe0, v0 +; GCN-NEXT: buffer_store_dword v55, v3, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v3, 0xe4, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xe8, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0xec, v0 -; GCN-NEXT: v_add_u32_e32 v6, 0xf0, v0 -; GCN-NEXT: v_add_u32_e32 v7, 0xf4, v0 -; GCN-NEXT: v_add_u32_e32 v8, 0xf8, v0 -; GCN-NEXT: v_add_u32_e32 v9, 0xfc, v0 -; GCN-NEXT: buffer_store_dword v55, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v56, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v59, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v60, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v62, v9, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 1, v10 -; GCN-NEXT: v_and_b32_e32 v1, 63, v2 +; GCN-NEXT: v_add_u32_e32 v3, 0xe8, v0 +; GCN-NEXT: buffer_store_dword v57, v3, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v3, 0xec, v0 +; GCN-NEXT: buffer_store_dword v58, v3, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v3, 0xf0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, 
v2 +; GCN-NEXT: buffer_store_dword v59, v3, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v3, 0xf4, v0 +; GCN-NEXT: v_and_b32_e32 v1, 63, v1 +; GCN-NEXT: buffer_store_dword v60, v3, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v3, 0xf8, v0 +; GCN-NEXT: buffer_store_dword v61, v3, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v3, 0xfc, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GCN-NEXT: v_add_u32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v62, v3, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload @@ -569,7 +563,7 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: v_and_b32_e32 v1, 1, v10 +; GCN-NEXT: v_and_b32_e32 v1, 1, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_waitcnt vmcnt(15) @@ -585,9 +579,22 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) { ; GCN-LABEL: v_extract_v32i64_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v15, v0 ; GCN-NEXT: s_add_u32 s4, s32, 0x3fc0 +; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_mov_b32 s6, s33 ; GCN-NEXT: s_and_b32 s33, s4, 0xffffc000 +; GCN-NEXT: s_movk_i32 s4, 0x80 +; GCN-NEXT: v_mov_b32_e32 v12, s5 +; GCN-NEXT: v_mov_b32_e32 v16, v1 +; GCN-NEXT: v_add_co_u32_e32 v31, vcc, 64, v15 +; GCN-NEXT: v_mov_b32_e32 v11, s4 +; GCN-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v16, vcc +; GCN-NEXT: v_add_co_u32_e32 v48, vcc, v15, v11 +; GCN-NEXT: v_addc_co_u32_e32 v49, vcc, v16, v12, vcc +; GCN-NEXT: s_movk_i32 s4, 0xc0 +; GCN-NEXT: v_mov_b32_e32 v12, s5 +; GCN-NEXT: v_mov_b32_e32 v11, s4 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill @@ -603,41 +610,8 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v15, v0 -; GCN-NEXT: v_mov_b32_e32 v16, v1 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[0:3], v[15:16], off -; GCN-NEXT: s_mov_b32 s5, 0 -; GCN-NEXT: v_add_co_u32_e32 v31, vcc, 64, v15 -; GCN-NEXT: s_movk_i32 s4, 0x80 -; GCN-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v16, vcc -; GCN-NEXT: s_add_u32 s32, s32, 0x10000 -; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill 
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v12, s5 -; GCN-NEXT: v_mov_b32_e32 v11, s4 -; GCN-NEXT: v_add_co_u32_e32 v48, vcc, v15, v11 -; GCN-NEXT: v_addc_co_u32_e32 v49, vcc, v16, v12, vcc -; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v12, s5 -; GCN-NEXT: v_mov_b32_e32 v11, s4 ; GCN-NEXT: v_add_co_u32_e32 v59, vcc, v15, v11 +; GCN-NEXT: global_load_dwordx4 v[3:6], v[15:16], off ; GCN-NEXT: global_load_dwordx4 v[7:10], v[15:16], off offset:16 ; GCN-NEXT: v_addc_co_u32_e32 v60, vcc, v16, v12, vcc ; GCN-NEXT: global_load_dwordx4 v[11:14], v[15:16], off offset:32 @@ -649,198 +623,215 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: global_load_dwordx4 v[35:38], v[48:49], off ; GCN-NEXT: global_load_dwordx4 v[39:42], v[48:49], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[43:46], v[48:49], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[47:50], v[48:49], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[3:6], v[59:60], off ; GCN-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GCN-NEXT: v_add_u32_e32 v0, 0x100, v0 ; GCN-NEXT: v_add_u32_e32 v1, 16, v0 -; GCN-NEXT: v_add_u32_e32 v2, 24, v0 +; GCN-NEXT: s_add_u32 s32, s32, 0x10000 +; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 
offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[47:50], v[48:49], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[43:46], v[59:60], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[51:54], v[59:60], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[55:58], v[59:60], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[59:62], v[59:60], off offset:48 ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 20, v0 -; GCN-NEXT: v_add_u32_e32 v1, 44, v0 -; GCN-NEXT: v_add_u32_e32 v7, 28, v0 -; GCN-NEXT: v_add_u32_e32 v9, 36, v0 -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v7, s[0:3], 0 offen -; 
GCN-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 20, v0 +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 24, v0 +; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 28, v0 +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 32, v0 +; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 36, v0 +; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 40, v0 -; GCN-NEXT: v_add_u32_e32 v3, 32, v0 ; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 48, v0 +; GCN-NEXT: v_add_u32_e32 v1, 44, v0 +; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 48, v0 +; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 52, v0 +; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 56, v0 -; GCN-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v4, 52, v0 -; GCN-NEXT: v_add_u32_e32 v5, 60, v0 -; GCN-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x44, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x4c, v0 -; GCN-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 60, v0 +; GCN-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 64, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x48, v0 ; GCN-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x44, v0 +; GCN-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x48, v0 +; GCN-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x4c, v0 +; GCN-NEXT: buffer_store_dword v22, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x50, v0 -; GCN-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v23, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x58, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0x54, v0 +; GCN-NEXT: buffer_store_dword v24, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x58, v0 +; GCN-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x5c, v0 +; GCN-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x60, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0x54, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0x5c, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0x64, v0 -; GCN-NEXT: v_add_u32_e32 v6, 0x6c, v0 -; GCN-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v6, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x68, v0 ; GCN-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x64, v0 +; GCN-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x68, v0 +; GCN-NEXT: buffer_store_dword v29, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x6c, v0 +; GCN-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x70, v0 -; GCN-NEXT: 
buffer_store_dword v29, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v7, 0x74, v0 -; GCN-NEXT: v_add_u32_e32 v8, 0x7c, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x78, v0 ; GCN-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v8, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x84, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x8c, v0 -; GCN-NEXT: buffer_store_dword v36, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x74, v0 +; GCN-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x78, v0 +; GCN-NEXT: buffer_store_dword v33, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v34, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x80, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x88, v0 ; GCN-NEXT: buffer_store_dword v35, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x84, v0 +; GCN-NEXT: buffer_store_dword v36, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x88, v0 +; GCN-NEXT: buffer_store_dword v37, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x8c, v0 +; GCN-NEXT: buffer_store_dword v38, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x90, v0 -; GCN-NEXT: buffer_store_dword v37, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v39, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x98, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0x94, v0 +; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x98, v0 +; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x9c, v0 +; GCN-NEXT: buffer_store_dword v42, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload ; GCN-NEXT: v_add_u32_e32 v1, 0xa0, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0x94, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0x9c, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0xa4, v0 -; GCN-NEXT: v_add_u32_e32 v6, 0xac, v0 -; GCN-NEXT: buffer_store_dword v41, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v4, s[0:3], 0 offen -; 
GCN-NEXT: buffer_store_dword v44, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v6, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xa8, v0 -; GCN-NEXT: buffer_store_dword v43, v1, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v8, v15 +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v9, v16 +; GCN-NEXT: v_add_u32_e32 v1, 0xa4, v0 +; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v10, v17 +; GCN-NEXT: v_add_u32_e32 v1, 0xa8, v0 +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v11, v18 +; GCN-NEXT: v_add_u32_e32 v1, 0xac, v0 +; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xb0, v0 -; GCN-NEXT: buffer_store_dword v45, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v7, 0xb4, v0 -; GCN-NEXT: v_add_u32_e32 v8, 0xbc, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0xb8, v0 ; GCN-NEXT: buffer_store_dword v47, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v2, 0xc8, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0xb4, v0 +; GCN-NEXT: buffer_store_dword v48, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xb8, v0 +; GCN-NEXT: buffer_store_dword v49, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xbc, v0 +; GCN-NEXT: buffer_store_dword v50, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 
offset:540 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: v_add_u32_e32 v1, 0xc0, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_u32_e32 v7, 0xec, v0 -; GCN-NEXT: v_add_u32_e32 v8, 0xf4, v0 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v10, v4 -; GCN-NEXT: v_add_u32_e32 v2, 0xc4, v0 -; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v9, v3 -; GCN-NEXT: v_mov_b32_e32 v11, v5 -; GCN-NEXT: v_add_u32_e32 v3, 0xcc, v0 -; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v2, 8, v0 +; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xc4, v0 +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xc8, v0 +; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xcc, v0 +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 4, v0 +; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 8, v0 +; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 12, v0 +; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 ; GCN-NEXT: v_add_u32_e32 v1, 0xd0, v0 -; GCN-NEXT: v_add_u32_e32 v3, 12, v0 -; GCN-NEXT: v_add_u32_e32 v4, 
0xd4, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0xdc, v0 -; GCN-NEXT: v_add_u32_e32 v6, 0xe4, v0 -; GCN-NEXT: v_add_u32_e32 v9, 0xfc, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 4, v0 -; GCN-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:256 -; GCN-NEXT: v_add_u32_e32 v2, 0xd8, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0xe0, v0 ; GCN-NEXT: buffer_store_dword v51, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xd4, v0 +; GCN-NEXT: buffer_store_dword v52, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xd8, v0 +; GCN-NEXT: buffer_store_dword v53, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xdc, v0 +; GCN-NEXT: buffer_store_dword v54, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xe0, v0 +; GCN-NEXT: buffer_store_dword v55, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xe4, v0 +; GCN-NEXT: buffer_store_dword v56, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xe8, v0 -; GCN-NEXT: buffer_store_dword v53, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v3, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xf0, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0xf8, v0 ; GCN-NEXT: buffer_store_dword v57, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v59, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v60, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v62, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 31, v1 +; GCN-NEXT: v_add_u32_e32 v1, 0xec, v0 +; GCN-NEXT: buffer_store_dword v58, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xf0, v0 +; GCN-NEXT: buffer_store_dword v59, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xf4, v0 +; GCN-NEXT: buffer_store_dword v60, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xf8, v0 +; GCN-NEXT: buffer_store_dword v61, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xfc, v0 +; GCN-NEXT: buffer_store_dword v62, v1, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v1, 31, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GCN-NEXT: v_add_u32_e32 v0, v0, v1 ; GCN-NEXT: v_add_u32_e32 v1, 4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll new file mode 100644 index 0000000000000..4e7c2959e6aed --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll @@ -0,0 +1,658 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s + +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float 
addrspace(1)* %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v8, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v8 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v8 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: flat_load_dword v2, v[4:5] +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_med3_f32 v0, v0, v1, v2 +; VI-NEXT: flat_store_dword v[6:7], v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %a.fneg = fsub float -0.0, %a + %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b) + %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, 
float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { +; SI-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v3, v[4:5] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: v_sub_f32_e32 v4, 0x80000000, v7 +; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_max_f32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; 
GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_max_f32_e32 v2, v4, v4 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %a.fneg = fsub float -0.0, %a + %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b) + %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s2, 0x80000000 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_sub_f32_e32 v2, s2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_f32_e64 v4, s2, |v4| +; SI-NEXT: v_med3_f32 v2, v2, |v3|, v4 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v3, v[4:5] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s2, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: v_sub_f32_e32 v4, s2, v7 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_sub_f32_e64 v3, s2, |v3| +; VI-NEXT: v_med3_f32 v2, v4, |v2|, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-NEXT: s_mov_b32 s2, 0x80000000 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_sub_f32_e32 v1, s2, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_f32_e64 v3, s2, |v3| +; GFX9-NEXT: v_med3_f32 v1, v1, |v2|, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + + %a.fneg = fsub float -0.0, %a + %b.fabs = call float @llvm.fabs.f32(float %b) + %c.fabs = call float @llvm.fabs.f32(float %c) + %c.fabs.fneg = fsub float -0.0, %c.fabs + + %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b.fabs) + %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b.fabs) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + + store float %med3, float addrspace(1)* %outgep + ret void +} + +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s2, 0x80000000 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_sub_f32_e64 v2, s2, |v2| +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_sub_f32_e64 v3, s2, |v3| +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_f32_e64 v4, s2, |v4| +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v3, v[4:5] +; VI-NEXT: s_mov_b32 s2, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, 
v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: v_sub_f32_e64 v4, s2, |v7| +; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: v_sub_f32_e64 v2, s2, |v2| +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_sub_f32_e64 v3, s2, |v3| +; VI-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-NEXT: s_mov_b32 s2, 0x80000000 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_sub_f32_e64 v1, s2, |v1| +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_sub_f32_e64 v2, s2, |v2| +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_f32_e64 v3, s2, |v3| +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + + %a.fabs = call float @llvm.fabs.f32(float %a) + %a.fabs.fneg = fsub float -0.0, %a.fabs + %b.fabs = call float @llvm.fabs.f32(float %b) + %b.fabs.fneg = fsub float -0.0, %b.fabs + %c.fabs = call float @llvm.fabs.f32(float %c) + %c.fabs.fneg = fsub float -0.0, %c.fabs + + %tmp0 = call float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg) + %tmp1 = call float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + + store float %med3, float addrspace(1)* %outgep + ret void +} + +define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { +; SI-LABEL: v_nnan_inputs_med3_f32_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; 
VI-LABEL: v_nnan_inputs_med3_f32_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v3, v[4:5] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: v_add_f32_e32 v4, 1.0, v7 +; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: v_add_f32_e32 v2, 2.0, v2 +; VI-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_f32_e32 v3, 4.0, v3 +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_max_f32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 +; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_max_f32_e32 v2, v4, v4 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + + %a.nnan = fadd nnan float %a, 1.0 + %b.nnan = fadd nnan float %b, 2.0 + %c.nnan = fadd nnan float %c, 4.0 + + %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan) + %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + + +; --------------------------------------------------------------------- +; Negative patterns +; --------------------------------------------------------------------- + +define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, 
float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { +; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v3, v[4:5] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_max_f32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v5 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: global_store_dword v[0:1], v4, off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: 
v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_max_f32_e32 v2, v4, v4 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %a, float %b) + store volatile float %tmp0, float addrspace(1)* undef + %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare float @llvm.fabs.f32(float) #0 +declare float @llvm.minnum.f32(float, float) #0 +declare float @llvm.maxnum.f32(float, float) #0 +declare double @llvm.minnum.f64(double, double) #0 +declare double @llvm.maxnum.f64(double, double) #0 +declare half @llvm.fabs.f16(half) #0 +declare half @llvm.minnum.f16(half, half) #0 +declare half @llvm.maxnum.f16(half, half) #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" } +attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll index 878b93218fd58..71cca1df9157a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -139,29 +139,17 @@ define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspa ; CI-NEXT: s_mov_b64 s[2:3], s[10:11] ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[8:9], s[6:7] -; CI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 +; CI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; CI-NEXT: s_mov_b64 s[6:7], s[10:11] ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0 -; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; CI-NEXT: v_rcp_f32_e32 v4, v2 -; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; CI-NEXT: v_fma_f32 v4, v5, v4, v4 -; CI-NEXT: v_mul_f32_e32 v5, v3, v4 -; CI-NEXT: v_fma_f32 v6, -v2, v5, v3 -; CI-NEXT: v_fma_f32 v5, v6, v4, v5 -; CI-NEXT: v_fma_f32 v2, -v2, v5, v3 -; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] -; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_rcp_f32_e32 v2, v1 +; CI-NEXT: v_mul_f32_e32 v2, v0, v2 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; CI-NEXT: s_endpgm @@ -179,14 +167,9 @@ define amdgpu_kernel 
void @unsafe_frem_f16(half addrspace(1)* %out, half addrspa ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_cvt_f32_f16_e32 v1, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; VI-NEXT: v_rcp_f32_e32 v3, v3 -; VI-NEXT: v_mul_f32_e32 v1, v1, v3 -; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; VI-NEXT: v_div_fixup_f16 v1, v1, v0, v2 +; VI-NEXT: v_rcp_f16_e32 v1, v0 +; VI-NEXT: v_mul_f16_e32 v1, v2, v1 ; VI-NEXT: v_trunc_f16_e32 v1, v1 ; VI-NEXT: v_fma_f16 v2, -v1, v0, v2 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -317,27 +300,16 @@ define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrs ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[6:7], 0x0 -; CI-NEXT: s_load_dword s0, s[8:9], 0x4 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2 -; CI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 -; CI-NEXT: v_rcp_f32_e32 v3, v1 -; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; CI-NEXT: v_fma_f32 v3, v4, v3, v3 -; CI-NEXT: v_mul_f32_e32 v4, v2, v3 -; CI-NEXT: v_fma_f32 v5, -v1, v4, v2 -; CI-NEXT: v_fma_f32 v4, v5, v3, v4 -; CI-NEXT: v_fma_f32 v1, -v1, v4, v2 -; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s1, s[8:9], 0x4 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s2 -; CI-NEXT: v_trunc_f32_e32 v1, v1 -; CI-NEXT: v_fma_f32 v0, -v1, v0, s2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: v_rcp_f32_e32 v0, s1 +; CI-NEXT: v_mul_f32_e32 v0, s0, v0 +; CI-NEXT: v_trunc_f32_e32 v0, v0 +; CI-NEXT: v_fma_f32 v0, -v0, s1, v1 ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-NEXT: s_endpgm ; @@ -346,25 +318,14 @@ define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrs ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[6:7], 0x0 -; VI-NEXT: s_load_dword s0, s[8:9], 0x10 +; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s1, s[8:9], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2 -; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 -; VI-NEXT: v_rcp_f32_e32 v3, v1 -; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; VI-NEXT: v_fma_f32 v3, v4, v3, v3 -; VI-NEXT: v_mul_f32_e32 v4, v2, v3 -; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 -; VI-NEXT: v_fma_f32 v4, v5, v3, v4 -; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 -; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s2 -; VI-NEXT: v_trunc_f32_e32 v1, v1 -; VI-NEXT: v_fma_f32 v2, -v1, v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_rcp_f32_e32 v0, s1 +; VI-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-NEXT: v_trunc_f32_e32 v0, v0 +; VI-NEXT: v_fma_f32 v2, -v0, s1, v1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -512,21 +473,12 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add ; 
CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[0:1] -; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1] -; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] -; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] -; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1] -; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] -; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3] +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1] +; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] +; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3] ; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -540,21 +492,12 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[0:1] -; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1] -; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] -; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] -; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1] -; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] -; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3] +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1] +; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] +; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3] ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll index acd71947aeeed..fa569b941c935 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll @@ -196,6 +196,89 @@ define half @f16_func_void() #0 { ret half %val } +define i24 @i24_func_void() #0 { + ; CHECK-LABEL: name: i24_func_void + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[LOAD:%[0-9]+]]:_(s24) = G_LOAD [[DEF]](p1) :: (load 3 from `i24 addrspace(1)* undef`, align 4, addrspace 1) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s24) + ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] + ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0 + %val = load i24, i24 
addrspace(1)* undef + ret i24 %val +} + +define zeroext i24 @i24_zeroext_func_void() #0 { + ; CHECK-LABEL: name: i24_zeroext_func_void + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[LOAD:%[0-9]+]]:_(s24) = G_LOAD [[DEF]](p1) :: (load 3 from `i24 addrspace(1)* undef`, align 4, addrspace 1) + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s24) + ; CHECK: $vgpr0 = COPY [[ZEXT]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] + ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0 + %val = load i24, i24 addrspace(1)* undef + ret i24 %val +} + +define signext i24 @i24_signext_func_void() #0 { + ; CHECK-LABEL: name: i24_signext_func_void + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[LOAD:%[0-9]+]]:_(s24) = G_LOAD [[DEF]](p1) :: (load 3 from `i24 addrspace(1)* undef`, align 4, addrspace 1) + ; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s24) + ; CHECK: $vgpr0 = COPY [[SEXT]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] + ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0 + %val = load i24, i24 addrspace(1)* undef + ret i24 %val +} + +define <2 x i24> @v2i24_func_void() #0 { + ; CHECK-LABEL: name: v2i24_func_void + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s24>) = G_LOAD [[DEF]](p1) :: (load 6 from `<2 x i24> addrspace(1)* undef`, align 8, addrspace 1) + ; CHECK: [[UV:%[0-9]+]]:_(s24), [[UV1:%[0-9]+]]:_(s24) = G_UNMERGE_VALUES [[LOAD]](<2 x s24>) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s24) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s24) + ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: $vgpr1 = COPY [[ANYEXT1]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] + ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1 + %val = load <2 x i24>, <2 x i24> addrspace(1)* undef + ret <2 x i24> %val +} + +define <3 x i24> @v3i24_func_void() #0 { + ; CHECK-LABEL: name: v3i24_func_void + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[LOAD:%[0-9]+]]:_(<3 x s24>) = G_LOAD [[DEF]](p1) :: (load 9 from `<3 x i24> addrspace(1)* undef`, align 16, addrspace 1) + ; CHECK: [[UV:%[0-9]+]]:_(s24), [[UV1:%[0-9]+]]:_(s24), [[UV2:%[0-9]+]]:_(s24) = G_UNMERGE_VALUES [[LOAD]](<3 x s24>) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s24) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s24) + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s24) + ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: $vgpr1 = COPY [[ANYEXT1]](s32) + ; CHECK: $vgpr2 = COPY [[ANYEXT2]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] + ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 + %val = load <3 x i24>, <3 x i24> addrspace(1)* undef + ret <3 x i24> %val +} + define i32 @i32_func_void() #0 { ; CHECK-LABEL: name: i32_func_void ; CHECK: bb.1 (%ir-block.0): @@ -977,6 +1060,44 @@ define <16 x i8> @v16i8_func_void() #0 { ret <16 x i8> %val } +define <2 x i8> @v2i8_func_void() #0 { + ; CHECK-LABEL: 
name: v2i8_func_void + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[DEF]](p1) :: (load 2 from `<2 x i8> addrspace(1)* undef`, addrspace 1) + ; CHECK: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD]](<2 x s8>) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s8) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s8) + ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: $vgpr1 = COPY [[ANYEXT1]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] + ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1 + %val = load <2 x i8>, <2 x i8> addrspace(1)* undef + ret <2 x i8> %val +} + +define <3 x i8> @v3i8_func_void() #0 { + ; CHECK-LABEL: name: v3i8_func_void + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[LOAD:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[DEF]](p1) :: (load 3 from `<3 x i8> addrspace(1)* undef`, align 4, addrspace 1) + ; CHECK: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD]](<3 x s8>) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s8) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s8) + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s8) + ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: $vgpr1 = COPY [[ANYEXT1]](s32) + ; CHECK: $vgpr2 = COPY [[ANYEXT2]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] + ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 + %val = load <3 x i8>, <3 x i8> addrspace(1)* undef + ret <3 x i8> %val +} + define <4 x i8> @v4i8_func_void() #0 { ; CHECK-LABEL: name: v4i8_func_void ; CHECK: bb.1 (%ir-block.0): diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index abb422ae7363f..7901f2286b2a6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -10,362 +10,364 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out. 
; GCN-NEXT: s_add_u32 s0, s0, s7 ; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GCN-NEXT: v_mov_b32_e32 v0, 0x100 +; GCN-NEXT: v_mov_b32_e32 v16, 0x100 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: v_add_u32_e32 v1, 4, v0 +; GCN-NEXT: v_add_u32_e32 v31, 64, v16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0x0 -; GCN-NEXT: s_load_dwordx16 s[68:83], s[10:11], 0x40 -; GCN-NEXT: s_load_dwordx16 s[52:67], s[10:11], 0x80 -; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0xc0 -; GCN-NEXT: s_movk_i32 s4, 0x50 +; GCN-NEXT: s_load_dwordx16 s[52:67], s[10:11], 0x40 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0x80 +; GCN-NEXT: v_add_u32_e32 v32, 0x44, v16 +; GCN-NEXT: v_add_u32_e32 v33, 0x48, v16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, s13 -; GCN-NEXT: v_mov_b32_e32 v5, s14 -; GCN-NEXT: v_mov_b32_e32 v6, s15 -; GCN-NEXT: v_mov_b32_e32 v8, s16 -; GCN-NEXT: v_mov_b32_e32 v10, s17 -; GCN-NEXT: v_mov_b32_e32 v12, s18 -; GCN-NEXT: v_mov_b32_e32 v14, s19 +; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NEXT: v_mov_b32_e32 v4, s16 +; GCN-NEXT: v_mov_b32_e32 v5, s17 +; GCN-NEXT: v_mov_b32_e32 v6, s18 +; GCN-NEXT: v_mov_b32_e32 v7, s19 +; GCN-NEXT: v_mov_b32_e32 v8, s20 +; GCN-NEXT: v_mov_b32_e32 v9, s21 +; GCN-NEXT: v_mov_b32_e32 v10, s22 +; GCN-NEXT: v_mov_b32_e32 v11, s23 +; GCN-NEXT: v_mov_b32_e32 v12, s24 +; GCN-NEXT: v_mov_b32_e32 v13, s25 +; GCN-NEXT: v_mov_b32_e32 v14, s26 +; GCN-NEXT: v_mov_b32_e32 v15, s27 +; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0xc0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:256 +; GCN-NEXT: v_add_u32_e32 v0, 4, v16 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, s52 +; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, s53 +; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, s54 +; GCN-NEXT: buffer_store_dword v1, v33, s[0:3], 0 offen +; GCN-NEXT: s_movk_i32 s4, 0x50 +; GCN-NEXT: v_add_u32_e32 v34, 0x4c, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s55 +; GCN-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v35, s4, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s56 +; GCN-NEXT: buffer_store_dword v1, v35, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v36, 0x54, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s57 +; GCN-NEXT: buffer_store_dword v1, v36, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v37, 0x58, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s58 +; GCN-NEXT: buffer_store_dword v1, v37, s[0:3], 0 offen ; GCN-NEXT: s_movk_i32 s5, 0x60 -; GCN-NEXT: v_add_u32_e32 v2, 8, v0 -; GCN-NEXT: v_add_u32_e32 v3, 12, v0 -; GCN-NEXT: v_add_u32_e32 v7, 16, v0 -; GCN-NEXT: v_add_u32_e32 v9, 20, v0 -; GCN-NEXT: v_add_u32_e32 v11, 24, v0 -; GCN-NEXT: v_add_u32_e32 v13, 28, v0 -; GCN-NEXT: v_add_u32_e32 v15, 32, v0 -; GCN-NEXT: v_mov_b32_e32 v16, s20 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v17, 36, v0 -; GCN-NEXT: v_mov_b32_e32 v18, s21 -; 
GCN-NEXT: v_mov_b32_e32 v26, s25 -; GCN-NEXT: v_add_u32_e32 v33, 0x44, v0 -; GCN-NEXT: v_mov_b32_e32 v34, s69 -; GCN-NEXT: v_mov_b32_e32 v4, s71 -; GCN-NEXT: v_add_u32_e32 v19, 40, v0 -; GCN-NEXT: v_mov_b32_e32 v20, s22 -; GCN-NEXT: v_add_u32_e32 v21, 44, v0 -; GCN-NEXT: v_mov_b32_e32 v22, s23 -; GCN-NEXT: v_add_u32_e32 v23, 48, v0 -; GCN-NEXT: v_mov_b32_e32 v24, s24 -; GCN-NEXT: v_add_u32_e32 v25, 52, v0 -; GCN-NEXT: v_add_u32_e32 v27, 56, v0 -; GCN-NEXT: v_mov_b32_e32 v28, s26 -; GCN-NEXT: v_add_u32_e32 v29, 60, v0 -; GCN-NEXT: v_mov_b32_e32 v30, s27 -; GCN-NEXT: v_add_u32_e32 v31, 64, v0 -; GCN-NEXT: v_mov_b32_e32 v32, s68 -; GCN-NEXT: buffer_store_dword v18, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v31, s[0:3], 0 offen -; GCN-NEXT: s_movk_i32 s13, 0x70 -; GCN-NEXT: v_add_u32_e32 v35, 0x48, v0 -; GCN-NEXT: v_mov_b32_e32 v36, s70 -; GCN-NEXT: v_add_u32_e32 v37, 0x4c, v0 -; GCN-NEXT: v_add_u32_e32 v38, s4, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s72 -; GCN-NEXT: v_add_u32_e32 v39, 0x54, v0 -; GCN-NEXT: v_mov_b32_e32 v6, s73 -; GCN-NEXT: v_add_u32_e32 v40, 0x58, v0 -; GCN-NEXT: v_mov_b32_e32 v8, s74 -; GCN-NEXT: v_add_u32_e32 v41, 0x5c, v0 -; GCN-NEXT: v_mov_b32_e32 v10, s75 -; GCN-NEXT: v_add_u32_e32 v42, s5, v0 -; GCN-NEXT: v_mov_b32_e32 v12, s76 -; GCN-NEXT: buffer_store_dword v34, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v42, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v26, 0x64, v0 -; GCN-NEXT: v_mov_b32_e32 v14, s77 -; GCN-NEXT: v_mov_b32_e32 v4, s81 -; GCN-NEXT: s_movk_i32 s14, 0x90 -; GCN-NEXT: s_movk_i32 s15, 0xa0 -; GCN-NEXT: v_add_u32_e32 v28, 0x68, v0 -; GCN-NEXT: v_mov_b32_e32 v16, s78 -; GCN-NEXT: v_add_u32_e32 v30, 0x6c, v0 -; GCN-NEXT: v_mov_b32_e32 v18, s79 -; GCN-NEXT: v_add_u32_e32 v32, s13, v0 -; GCN-NEXT: v_mov_b32_e32 v20, s80 -; GCN-NEXT: v_add_u32_e32 v34, 0x74, v0 -; GCN-NEXT: v_add_u32_e32 v36, 0x78, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s82 -; GCN-NEXT: v_add_u32_e32 v43, 0x7c, v0 -; GCN-NEXT: v_mov_b32_e32 v6, s83 -; GCN-NEXT: v_add_u32_e32 v44, 0x80, v0 -; GCN-NEXT: v_mov_b32_e32 v8, s52 -; GCN-NEXT: buffer_store_dword v14, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v44, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v45, 0x84, v0 -; GCN-NEXT: v_mov_b32_e32 v4, s53 -; GCN-NEXT: s_movk_i32 s16, 0xb0 -; GCN-NEXT: v_add_u32_e32 v46, 0x88, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s54 -; GCN-NEXT: v_add_u32_e32 v47, 0x8c, v0 -; GCN-NEXT: v_mov_b32_e32 v6, s55 -; GCN-NEXT: v_add_u32_e32 v48, s14, v0 -; GCN-NEXT: 
v_mov_b32_e32 v8, s56 -; GCN-NEXT: v_add_u32_e32 v49, 0x94, v0 -; GCN-NEXT: v_mov_b32_e32 v10, s57 -; GCN-NEXT: v_add_u32_e32 v50, 0x98, v0 -; GCN-NEXT: v_mov_b32_e32 v12, s58 -; GCN-NEXT: v_add_u32_e32 v51, 0x9c, v0 -; GCN-NEXT: v_mov_b32_e32 v14, s59 -; GCN-NEXT: v_add_u32_e32 v52, s15, v0 -; GCN-NEXT: v_mov_b32_e32 v16, s60 -; GCN-NEXT: buffer_store_dword v4, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v52, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v53, 0xa4, v0 -; GCN-NEXT: v_mov_b32_e32 v4, s61 -; GCN-NEXT: s_movk_i32 s17, 0xd0 -; GCN-NEXT: s_movk_i32 s18, 0xe0 -; GCN-NEXT: v_add_u32_e32 v54, 0xa8, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s62 -; GCN-NEXT: v_add_u32_e32 v55, 0xac, v0 -; GCN-NEXT: v_mov_b32_e32 v6, s63 -; GCN-NEXT: v_add_u32_e32 v56, s16, v0 -; GCN-NEXT: v_mov_b32_e32 v8, s64 -; GCN-NEXT: v_add_u32_e32 v57, 0xb4, v0 -; GCN-NEXT: v_mov_b32_e32 v10, s65 -; GCN-NEXT: v_add_u32_e32 v58, 0xb8, v0 -; GCN-NEXT: v_mov_b32_e32 v12, s66 -; GCN-NEXT: v_add_u32_e32 v59, 0xbc, v0 -; GCN-NEXT: v_mov_b32_e32 v14, s67 -; GCN-NEXT: v_add_u32_e32 v60, 0xc0, v0 -; GCN-NEXT: v_mov_b32_e32 v16, s36 -; GCN-NEXT: buffer_store_dword v4, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v60, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v61, 0xc4, v0 -; GCN-NEXT: v_mov_b32_e32 v4, s37 +; GCN-NEXT: v_add_u32_e32 v38, 0x5c, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s59 +; GCN-NEXT: buffer_store_dword v1, v38, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v39, s5, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s60 +; GCN-NEXT: buffer_store_dword v1, v39, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v40, 0x64, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s61 +; GCN-NEXT: buffer_store_dword v1, v40, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v41, 0x68, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s62 +; GCN-NEXT: buffer_store_dword v1, v41, s[0:3], 0 offen +; GCN-NEXT: s_movk_i32 s10, 0x70 +; GCN-NEXT: v_add_u32_e32 v42, 0x6c, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s63 +; GCN-NEXT: buffer_store_dword v1, v42, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v43, s10, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s64 +; GCN-NEXT: buffer_store_dword v1, v43, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v44, 0x74, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s65 +; GCN-NEXT: buffer_store_dword v1, v44, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v45, 0x78, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s66 +; GCN-NEXT: buffer_store_dword v1, v45, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v46, 0x7c, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s67 +; GCN-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v47, 0x80, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s36 +; GCN-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v48, 0x84, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NEXT: buffer_store_dword v1, v48, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v49, 0x88, v16 +; 
GCN-NEXT: v_mov_b32_e32 v1, s38 +; GCN-NEXT: buffer_store_dword v1, v49, s[0:3], 0 offen +; GCN-NEXT: s_movk_i32 s11, 0x90 +; GCN-NEXT: v_add_u32_e32 v50, 0x8c, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s39 +; GCN-NEXT: buffer_store_dword v1, v50, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v51, s11, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s40 +; GCN-NEXT: buffer_store_dword v1, v51, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v52, 0x94, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s41 +; GCN-NEXT: buffer_store_dword v1, v52, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v53, 0x98, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s42 +; GCN-NEXT: buffer_store_dword v1, v53, s[0:3], 0 offen +; GCN-NEXT: s_movk_i32 s28, 0xa0 +; GCN-NEXT: v_add_u32_e32 v54, 0x9c, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s43 +; GCN-NEXT: buffer_store_dword v1, v54, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v55, s28, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s44 +; GCN-NEXT: buffer_store_dword v1, v55, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v56, 0xa4, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NEXT: buffer_store_dword v1, v56, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v57, 0xa8, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s46 +; GCN-NEXT: buffer_store_dword v1, v57, s[0:3], 0 offen +; GCN-NEXT: s_movk_i32 s29, 0xb0 +; GCN-NEXT: v_add_u32_e32 v58, 0xac, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s47 +; GCN-NEXT: buffer_store_dword v1, v58, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v59, s29, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s48 +; GCN-NEXT: buffer_store_dword v1, v59, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v60, 0xb4, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s49 +; GCN-NEXT: buffer_store_dword v1, v60, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v61, 0xb8, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s50 +; GCN-NEXT: buffer_store_dword v1, v61, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v62, 0xbc, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s51 +; GCN-NEXT: buffer_store_dword v1, v62, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s12 +; GCN-NEXT: v_add_u32_e32 v63, 0xc0, v16 +; GCN-NEXT: buffer_store_dword v1, v63, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NEXT: v_add_u32_e32 v64, 0xc4, v16 +; GCN-NEXT: buffer_store_dword v1, v64, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, s14 +; GCN-NEXT: v_add_u32_e32 v65, 0xc8, v16 +; GCN-NEXT: buffer_store_dword v1, v65, s[0:3], 0 offen +; GCN-NEXT: s_movk_i32 s12, 0xd0 +; GCN-NEXT: v_add_u32_e32 v66, 0xcc, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NEXT: buffer_store_dword v1, v66, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v67, s12, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: buffer_store_dword v1, v67, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v68, 0xd4, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: buffer_store_dword v1, v68, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v69, 0xd8, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s18 +; GCN-NEXT: buffer_store_dword v1, v69, s[0:3], 0 offen +; GCN-NEXT: s_movk_i32 s13, 0xe0 +; GCN-NEXT: v_add_u32_e32 v70, 0xdc, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NEXT: buffer_store_dword v1, v70, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v71, s13, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s20 +; GCN-NEXT: buffer_store_dword v1, v71, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v72, 0xe4, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NEXT: buffer_store_dword v1, v72, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v73, 0xe8, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s22 +; GCN-NEXT: buffer_store_dword v1, v73, s[0:3], 0 offen +; GCN-NEXT: s_movk_i32 s14, 0xf0 +; GCN-NEXT: 
v_add_u32_e32 v74, 0xec, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NEXT: buffer_store_dword v1, v74, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v75, s14, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s24 +; GCN-NEXT: buffer_store_dword v1, v75, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v76, 0xf4, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NEXT: s_and_b32 s7, s7, 63 -; GCN-NEXT: s_movk_i32 s19, 0xf0 -; GCN-NEXT: v_add_u32_e32 v62, 0xc8, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s38 -; GCN-NEXT: v_add_u32_e32 v63, 0xcc, v0 -; GCN-NEXT: v_mov_b32_e32 v6, s39 -; GCN-NEXT: v_add_u32_e32 v64, s17, v0 -; GCN-NEXT: v_mov_b32_e32 v8, s40 -; GCN-NEXT: v_add_u32_e32 v65, 0xd4, v0 -; GCN-NEXT: v_mov_b32_e32 v10, s41 -; GCN-NEXT: v_add_u32_e32 v66, 0xd8, v0 -; GCN-NEXT: v_mov_b32_e32 v12, s42 -; GCN-NEXT: v_add_u32_e32 v67, 0xdc, v0 -; GCN-NEXT: v_mov_b32_e32 v14, s43 -; GCN-NEXT: v_add_u32_e32 v68, s18, v0 -; GCN-NEXT: v_mov_b32_e32 v16, s44 -; GCN-NEXT: buffer_store_dword v4, v61, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v63, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v64, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v65, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v66, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v67, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v68, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v69, 0xe4, v0 -; GCN-NEXT: v_mov_b32_e32 v4, s45 -; GCN-NEXT: v_add_u32_e32 v70, 0xe8, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s46 -; GCN-NEXT: v_add_u32_e32 v71, 0xec, v0 -; GCN-NEXT: v_mov_b32_e32 v6, s47 -; GCN-NEXT: v_add_u32_e32 v72, s19, v0 -; GCN-NEXT: v_mov_b32_e32 v8, s48 -; GCN-NEXT: v_add_u32_e32 v73, 0xf4, v0 -; GCN-NEXT: v_mov_b32_e32 v10, s49 -; GCN-NEXT: v_add_u32_e32 v74, 0xf8, v0 -; GCN-NEXT: v_mov_b32_e32 v12, s50 -; GCN-NEXT: buffer_store_dword v4, v69, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v70, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v71, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v72, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v73, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v74, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NEXT: buffer_store_dword v1, v76, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v77, 0xf8, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s26 +; GCN-NEXT: v_add_u32_e32 v17, 8, v16 +; GCN-NEXT: buffer_store_dword v1, v77, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v78, 0xfc, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NEXT: s_lshl_b32 s7, s7, 2 -; GCN-NEXT: v_add_u32_e32 v75, 0xfc, v0 -; GCN-NEXT: v_mov_b32_e32 v14, s51 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:256 -; GCN-NEXT: buffer_store_dword v14, v75, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v4, s6 -; GCN-NEXT: v_add_u32_e32 v0, s7, v0 -; GCN-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v5, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v6, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v8, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v9, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v10, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v11, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v12, v23, s[0:3], 
0 offen -; GCN-NEXT: buffer_load_dword v13, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v14, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v15, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v18, 12, v16 +; GCN-NEXT: v_add_u32_e32 v19, 16, v16 +; GCN-NEXT: v_add_u32_e32 v20, 20, v16 +; GCN-NEXT: v_add_u32_e32 v21, 24, v16 +; GCN-NEXT: v_add_u32_e32 v22, 28, v16 +; GCN-NEXT: v_add_u32_e32 v23, 32, v16 +; GCN-NEXT: v_add_u32_e32 v24, 36, v16 +; GCN-NEXT: v_add_u32_e32 v25, 40, v16 +; GCN-NEXT: v_add_u32_e32 v26, 44, v16 +; GCN-NEXT: v_add_u32_e32 v27, 48, v16 +; GCN-NEXT: v_add_u32_e32 v28, 52, v16 +; GCN-NEXT: v_add_u32_e32 v29, 56, v16 +; GCN-NEXT: v_add_u32_e32 v30, 60, v16 +; GCN-NEXT: buffer_store_dword v1, v78, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_add_u32_e32 v1, s7, v16 +; GCN-NEXT: buffer_store_dword v3, v18, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v30, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v2, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v3, v18, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v4, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v5, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v6, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v8, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v9, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v10, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v11, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v12, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v13, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v14, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v15, v30, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v16, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v17, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v18, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v19, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v20, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v21, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v22, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v23, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v24, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v25, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v26, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v27, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v28, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v29, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v30, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v31, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v32, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v33, v45, s[0:3], 0 
offen -; GCN-NEXT: buffer_load_dword v34, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v35, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v36, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v37, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v38, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v39, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v40, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v41, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v42, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v43, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v44, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v45, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v46, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v47, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v48, v60, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v49, v61, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v50, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v51, v63, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v52, v64, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v53, v65, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v54, v66, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v55, v67, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v56, v68, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v57, v69, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v58, v70, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, v71, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v60, v72, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v61, v73, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v62, v74, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, v75, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v17, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v18, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v19, v34, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v20, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v21, v36, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v22, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v23, v38, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v24, v39, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v25, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v26, v41, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v27, v42, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v28, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v29, v44, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v30, v45, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v31, v46, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v32, v47, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v33, v48, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v34, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v35, v50, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v36, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v37, v52, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v38, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v39, v54, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v40, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v41, v56, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v42, v57, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v43, v58, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v44, v59, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v45, v60, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v46, v61, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v47, v62, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v48, v63, s[0:3], 0 offen +; GCN-NEXT: 
buffer_load_dword v49, v64, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v50, v65, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v51, v66, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v52, v67, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v53, v68, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v54, v69, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v55, v70, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v56, v71, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v57, v72, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v58, v73, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v59, v74, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v60, v75, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v61, v76, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v62, v77, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, v78, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:256 -; GCN-NEXT: s_add_u32 s6, s8, 16 -; GCN-NEXT: s_addc_u32 s7, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v67, s7 -; GCN-NEXT: v_mov_b32_e32 v66, s6 -; GCN-NEXT: s_add_u32 s6, s8, 32 -; GCN-NEXT: s_addc_u32 s7, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v65, s9 -; GCN-NEXT: s_add_u32 s10, s8, 48 +; GCN-NEXT: s_add_u32 s6, s8, 16 ; GCN-NEXT: v_mov_b32_e32 v64, s8 -; GCN-NEXT: s_addc_u32 s11, s9, 0 +; GCN-NEXT: s_addc_u32 s7, s9, 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_dwordx4 v[64:65], v[0:3], off -; GCN-NEXT: global_store_dwordx4 v[66:67], v[4:7], off +; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: s_add_u32 s6, s8, 64 -; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: s_add_u32 s6, s8, 32 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GCN-NEXT: s_addc_u32 s7, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: s_add_u32 s10, s8, s4 -; GCN-NEXT: s_addc_u32 s11, s9, 0 -; GCN-NEXT: s_add_u32 s4, s8, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_add_u32 s6, s8, 48 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off -; GCN-NEXT: global_store_dwordx4 v[2:3], v[12:15], off +; GCN-NEXT: s_addc_u32 s7, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: s_add_u32 s6, s8, s13 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[16:19], off -; GCN-NEXT: global_store_dwordx4 v[2:3], v[20:23], off +; GCN-NEXT: s_add_u32 s6, s8, 64 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[12:15], off ; GCN-NEXT: s_addc_u32 s7, s9, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_add_u32 s6, s8, s4 +; GCN-NEXT: s_addc_u32 s7, s9, 0 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[16:19], off +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_add_u32 s4, s8, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[20:23], off +; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, 0x80 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: s_add_u32 s6, s8, s14 +; GCN-NEXT: s_add_u32 s4, s8, s10 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[24:27], off -; GCN-NEXT: global_store_dwordx4 v[2:3], v[28:31], off -; GCN-NEXT: s_addc_u32 s7, s9, 0 +; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s15 -; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: s_add_u32 
s4, s8, 0x80 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[28:31], off ; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: s_add_u32 s6, s8, s16 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_add_u32 s4, s8, s11 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[32:35], off -; GCN-NEXT: global_store_dwordx4 v[2:3], v[36:39], off -; GCN-NEXT: s_addc_u32 s7, s9, 0 +; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: s_add_u32 s4, s8, s28 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[36:39], off +; GCN-NEXT: s_addc_u32 s5, s9, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_add_u32 s4, s8, s29 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[40:43], off -; GCN-NEXT: global_store_dwordx4 v[2:3], v[44:47], off ; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s17 +; GCN-NEXT: s_add_u32 s4, s8, 0xc0 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[44:47], off ; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: s_add_u32 s4, s8, s18 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_add_u32 s4, s8, s12 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[48:51], off -; GCN-NEXT: global_store_dwordx4 v[2:3], v[52:55], off ; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s19 +; GCN-NEXT: s_add_u32 s4, s8, s13 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[52:55], off ; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_add_u32 s4, s8, s14 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[56:59], off -; GCN-NEXT: global_store_dwordx4 v[2:3], v[60:63], off +; GCN-NEXT: s_addc_u32 s5, s9, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[60:63], off ; GCN-NEXT: s_endpgm %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr %insert = insertelement <64 x i32> %vec, i32 %val, i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll index 008b09d968870..ffdb1155a9343 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -1954,7 +1954,7 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: s_lshr_b32 s7, s5, 1 ; GFX9-NEXT: s_cmp_eq_u32 s7, 1 ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s0, s9, s8 ; GFX9-NEXT: s_cmp_eq_u32 s7, 2 @@ -1997,16 +1997,16 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_add_u32 s0, 0, 16 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_addc_u32 s1, 0, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v11, s1 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: v_mov_b32_e32 
v10, s0 -; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v16i16_s_s: @@ -2015,7 +2015,7 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg ; GFX8-NEXT: s_lshr_b32 s7, s5, 1 ; GFX8-NEXT: s_cmp_eq_u32 s7, 1 ; GFX8-NEXT: s_mov_b32 s2, 0xffff -; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cselect_b32 s0, s9, s8 ; GFX8-NEXT: s_cmp_eq_u32 s7, 2 @@ -2058,16 +2058,16 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s0, 0, 16 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_addc_u32 s1, 0, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v11, s1 -; GFX8-NEXT: v_mov_b32_e32 v9, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v7, s7 -; GFX8-NEXT: v_mov_b32_e32 v10, s0 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v16i16_s_s: @@ -2108,24 +2108,25 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg ; GFX7-NEXT: s_cmp_eq_u32 s7, 4 ; GFX7-NEXT: s_cselect_b32 s4, s16, s12 ; GFX7-NEXT: s_cmp_eq_u32 s7, 5 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_cselect_b32 s5, s16, s13 ; GFX7-NEXT: s_cmp_eq_u32 s7, 6 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_cselect_b32 s6, s16, s14 ; GFX7-NEXT: s_cmp_eq_u32 s7, 7 -; GFX7-NEXT: s_cselect_b32 s7, s16, s15 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v7, s7 +; GFX7-NEXT: s_cselect_b32 s7, s16, s15 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 -; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GFX7-NEXT: s_nop 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GFX7-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx @@ -2329,23 +2330,23 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: v_mov_b32_e32 v5, s13 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 ; GFX9-NEXT: v_mov_b32_e32 v6, s14 
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] -; GFX9-NEXT: s_add_u32 s0, 0, 16 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 ; GFX9-NEXT: v_mov_b32_e32 v7, s15 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7 -; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v11, s1 +; GFX9-NEXT: s_add_u32 s0, 0, 16 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, s0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v16i16_v_s: @@ -2390,23 +2391,23 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg ; GFX8-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, s12 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 ; GFX8-NEXT: v_mov_b32_e32 v6, s14 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] -; GFX8-NEXT: s_add_u32 s0, 0, 16 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 ; GFX8-NEXT: v_mov_b32_e32 v7, s15 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7 -; GFX8-NEXT: s_addc_u32 s1, 0, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: s_add_u32 s0, 0, 16 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, s0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX8-NEXT: s_addc_u32 s1, 0, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v16i16_v_s: @@ -2509,8 +2510,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: v_and_or_b32 v9, v1, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -2518,8 +2519,6 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX9-NEXT: s_add_u32 s0, 0, 16 -; GFX9-NEXT: s_addc_u32 s1, 0, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] @@ -2528,11 +2527,13 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v11, s1 +; GFX9-NEXT: s_add_u32 s0, 0, 16 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, s0 ; GFX9-NEXT: 
global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v16i16_s_v: @@ -2572,8 +2573,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg ; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_or_b32_e32 v9, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v3, s19 ; GFX8-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NEXT: v_mov_b32_e32 v5, s21 @@ -2581,8 +2582,6 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg ; GFX8-NEXT: v_mov_b32_e32 v7, s23 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX8-NEXT: s_add_u32 s0, 0, 16 -; GFX8-NEXT: s_addc_u32 s1, 0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] @@ -2591,11 +2590,13 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: s_add_u32 s0, 0, 16 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, s0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX8-NEXT: s_addc_u32 s1, 0, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v16i16_s_v: @@ -2699,8 +2700,8 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_and_or_b32 v9, v2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: v_mov_b32_e32 v4, s16 ; GFX9-NEXT: v_mov_b32_e32 v5, s17 @@ -2708,8 +2709,6 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: v_mov_b32_e32 v7, s19 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX9-NEXT: s_add_u32 s0, 0, 16 -; GFX9-NEXT: s_addc_u32 s1, 0, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] @@ -2718,11 +2717,13 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v11, s1 +; GFX9-NEXT: s_add_u32 s0, 0, 16 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, s0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v16i16_v_v: @@ -2761,8 +2762,8 @@ define amdgpu_ps void 
@insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg ; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v9, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NEXT: v_mov_b32_e32 v3, s15 ; GFX8-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NEXT: v_mov_b32_e32 v5, s17 @@ -2770,8 +2771,6 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg ; GFX8-NEXT: v_mov_b32_e32 v7, s19 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX8-NEXT: s_add_u32 s0, 0, 16 -; GFX8-NEXT: s_addc_u32 s1, 0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] @@ -2780,11 +2779,13 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: s_add_u32 s0, 0, 16 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, s0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX8-NEXT: s_addc_u32 s1, 0, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v16i16_v_v: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll index 43692dc81535e..7cad269df704b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -8,39 +8,39 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in, ; GCN-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GCN-NEXT: s_movk_i32 s4, 0x80 ; GCN-NEXT: s_mov_b32 s5, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_co_u32_e32 v6, vcc, v0, v64 -; GCN-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 64, v6 -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc -; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 -; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NEXT: v_add_co_u32_e32 v16, vcc, v6, v4 -; GCN-NEXT: v_addc_co_u32_e32 v17, vcc, v7, v5, vcc -; GCN-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:32 +; GCN-NEXT: v_add_co_u32_e32 v4, vcc, v0, v64 +; GCN-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 64, v4 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc ; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:48 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GCN-NEXT: s_movk_i32 s4, 0xc0 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_addc_co_u32_e32 v3, 
vcc, v5, v3, vcc ; GCN-NEXT: global_load_dwordx4 v[44:47], v64, s[0:1] ; GCN-NEXT: global_load_dwordx4 v[48:51], v64, s[0:1] offset:16 ; GCN-NEXT: global_load_dwordx4 v[52:55], v64, s[0:1] offset:32 ; GCN-NEXT: global_load_dwordx4 v[56:59], v64, s[0:1] offset:48 ; GCN-NEXT: global_load_dwordx4 v[60:63], v64, s[0:1] offset:64 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[20:23], v[16:17], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[16:17], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[16:17], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:48 ; GCN-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] offset:128 ; GCN-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:192 -; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: s_waitcnt vmcnt(7) ; GCN-NEXT: v_mov_b32_e32 v5, 0x3e7 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[2:3] offset:128 @@ -55,8 +55,8 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in, ; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[2:3] offset:16 ; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[2:3] offset:32 ; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[2:3] offset:48 -; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240 ; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[2:3] offset:64 +; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240 ; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[2:3] offset:80 ; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[2:3] offset:96 ; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[2:3] offset:112 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir new file mode 100644 index 0000000000000..b450aa8b81962 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir @@ -0,0 +1,103 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: fract_f64_neg +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.1: + liveins: $sgpr0_sgpr1 + + ; CHECK-LABEL: name: fract_f64_neg + ; CHECK: liveins: $sgpr0_sgpr1 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; CHECK: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0, 0 :: (dereferenceable invariant load 16, align 4, addrspace 4) + ; CHECK: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3 + ; CHECK: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0, 0 :: (load 8, addrspace 1) + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; CHECK: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; CHECK: 
[[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; CHECK: %12:vreg_64 = nofpexcept V_ADD_F64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec + ; CHECK: %15:vreg_64 = nofpexcept V_FRACT_F64_e64 0, %12, 0, 0, implicit $mode, implicit $exec + ; CHECK: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[COPY1]] + ; CHECK: GLOBAL_STORE_DWORDX2 [[COPY5]], %15, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) + ; CHECK: S_ENDPGM 0 + %2:sgpr(p4) = COPY $sgpr0_sgpr1 + %7:sgpr(s64) = G_CONSTANT i64 36 + %8:sgpr(p4) = G_PTR_ADD %2, %7(s64) + %9:sgpr(<2 x s64>) = G_LOAD %8(p4) :: (dereferenceable invariant load 16, align 4, addrspace 4) + %10:sgpr(s64) = G_EXTRACT %9(<2 x s64>), 0 + %13:sgpr(s64) = G_EXTRACT %9(<2 x s64>), 64 + %15:sgpr(p1) = G_INTTOPTR %13(s64) + %18:sgpr(s64) = G_LOAD %15(p1) :: (load 8, addrspace 1) + %19:sgpr(s64) = G_FCONSTANT double -0.000000e+00 + %24:sgpr(s64) = G_FNEG %18 + %25:vgpr(s64) = COPY %19(s64) + %26:vgpr(s64) = COPY %24(s64) + %20:vgpr(s64) = G_FADD %25, %26 + %21:vgpr(s64) = G_FFLOOR %20 + %23:vgpr(s64) = G_FNEG %21 + %22:vgpr(s64) = G_FADD %20, %23 + %12:sgpr(p1) = G_INTTOPTR %10(s64) + %27:vgpr(p1) = COPY %12(p1) + G_STORE %22(s64), %27(p1) :: (store 8, addrspace 1) + S_ENDPGM 0 +... + +--- +name: fract_f64_neg_abs +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.1: + liveins: $sgpr0_sgpr1 + + ; CHECK-LABEL: name: fract_f64_neg_abs + ; CHECK: liveins: $sgpr0_sgpr1 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; CHECK: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0, 0 :: (dereferenceable invariant load 16, align 4, addrspace 4) + ; CHECK: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3 + ; CHECK: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0, 0 :: (load 8, addrspace 1) + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; CHECK: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; CHECK: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; CHECK: %13:vreg_64 = nofpexcept V_ADD_F64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec + ; CHECK: %16:vreg_64 = nofpexcept V_FRACT_F64_e64 0, %13, 0, 0, implicit $mode, implicit $exec + ; CHECK: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[COPY1]] + ; CHECK: GLOBAL_STORE_DWORDX2 [[COPY5]], %16, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) + ; CHECK: S_ENDPGM 0 + %2:sgpr(p4) = COPY $sgpr0_sgpr1 + %7:sgpr(s64) = G_CONSTANT i64 36 + %8:sgpr(p4) = G_PTR_ADD %2, %7(s64) + %9:sgpr(<2 x s64>) = G_LOAD %8(p4) :: (dereferenceable invariant load 16, align 4, addrspace 4) + %10:sgpr(s64) = G_EXTRACT %9(<2 x s64>), 0 + %13:sgpr(s64) = G_EXTRACT %9(<2 x s64>), 64 + %15:sgpr(p1) = G_INTTOPTR %13(s64) + %18:sgpr(s64) = G_LOAD %15(p1) :: (load 8, addrspace 1) + %19:sgpr(s64) = G_FABS %18 + %20:sgpr(s64) = G_FCONSTANT double -0.000000e+00 + %25:sgpr(s64) = G_FNEG %19 + %26:vgpr(s64) = COPY %20(s64) + %27:vgpr(s64) = COPY %25(s64) + %21:vgpr(s64) = G_FADD %26, %27 + %22:vgpr(s64) = G_FFLOOR %21 + %24:vgpr(s64) = G_FNEG %22 + %23:vgpr(s64) = G_FADD %21, %24 + %12:sgpr(p1) = G_INTTOPTR %10(s64) + %28:vgpr(p1) = COPY %12(p1) + G_STORE %23(s64), %28(p1) :: (store 8, addrspace 1) + S_ENDPGM 
0 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll index 28f60ca7528db..96d0c9d1d4a80 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll @@ -553,6 +553,104 @@ define void @void_func_v2i32(<2 x i32> %arg0) #0 { ret void } +define void @void_func_v2i24(<2 x i24> %arg0) #0 { + ; CHECK-LABEL: name: void_func_v2i24 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[TRUNC:%[0-9]+]]:_(<2 x s24>) = G_TRUNC [[BUILD_VECTOR]](<2 x s32>) + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: G_STORE [[TRUNC]](<2 x s24>), [[DEF]](p1) :: (store 6 into `<2 x i24> addrspace(1)* undef`, align 8, addrspace 1) + ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] + ; CHECK: S_SETPC_B64_return [[COPY3]] + store <2 x i24> %arg0, <2 x i24> addrspace(1)* undef + ret void +} + +define void @void_func_v3i24(<3 x i24> %arg0) #0 { + ; CHECK-LABEL: name: void_func_v3i24 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[TRUNC:%[0-9]+]]:_(<3 x s24>) = G_TRUNC [[BUILD_VECTOR]](<3 x s32>) + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: G_STORE [[TRUNC]](<3 x s24>), [[DEF]](p1) :: (store 9 into `<3 x i24> addrspace(1)* undef`, align 16, addrspace 1) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] + ; CHECK: S_SETPC_B64_return [[COPY4]] + store <3 x i24> %arg0, <3 x i24> addrspace(1)* undef + ret void +} + +define void @void_func_v2i8(<2 x i8> %arg0) #0 { + ; CHECK-LABEL: name: void_func_v2i8 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; CHECK: [[TRUNC2:%[0-9]+]]:_(<2 x s8>) = G_TRUNC [[BUILD_VECTOR]](<2 x s16>) + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: G_STORE [[TRUNC2]](<2 x s8>), [[DEF]](p1) :: (store 2 into `<2 x i8> addrspace(1)* undef`, addrspace 1) + ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] + ; CHECK: S_SETPC_B64_return [[COPY3]] + store <2 x i8> %arg0, <2 x i8> addrspace(1)* undef + ret void +} + +define void @void_func_v3i8(<3 x i8> %arg0) #0 { + ; CHECK-LABEL: name: void_func_v3i8 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC 
[[COPY1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16) + ; CHECK: [[TRUNC3:%[0-9]+]]:_(<3 x s8>) = G_TRUNC [[BUILD_VECTOR]](<3 x s16>) + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: G_STORE [[TRUNC3]](<3 x s8>), [[DEF]](p1) :: (store 3 into `<3 x i8> addrspace(1)* undef`, align 4, addrspace 1) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] + ; CHECK: S_SETPC_B64_return [[COPY4]] + store <3 x i8> %arg0, <3 x i8> addrspace(1)* undef + ret void +} + +define void @void_func_v4i8(<4 x i8> %arg0) #0 { + ; CHECK-LABEL: name: void_func_v4i8 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; CHECK: [[TRUNC4:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[BUILD_VECTOR]](<4 x s16>) + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: G_STORE [[TRUNC4]](<4 x s8>), [[DEF]](p1) :: (store 4 into `<4 x i8> addrspace(1)* undef`, addrspace 1) + ; CHECK: [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]] + ; CHECK: S_SETPC_B64_return [[COPY5]] + store <4 x i8> %arg0, <4 x i8> addrspace(1)* undef + ret void +} + define void @void_func_v2p3i8(<2 x i8 addrspace(3)*> %arg0) #0 { ; CHECK-LABEL: name: void_func_v2p3i8 ; CHECK: bb.1 (%ir-block.0): diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll new file mode 100644 index 0000000000000..7d5a49cfd38dd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll @@ -0,0 +1,128 @@ +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s + +; GCN-LABEL: test_local_misaligned_v2: +; GCN-DAG: ds_read2_b32 +; GCN-DAG: ds_write2_b32 +define amdgpu_kernel void @test_local_misaligned_v2(i32 addrspace(3)* %arg) { +bb: + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid + %ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)* + %load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 4 + %v1 = extractelement <2 x i32> %load, i32 0 + %v2 = extractelement <2 x i32> %load, i32 1 + %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0 + %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1 + store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, 
align 4 + ret void +} + +; GCN-LABEL: test_local_misaligned_v4: +; VECT-DAG: ds_read_b128 +; VECT-DAG: ds_write_b128 +; SPLIT-DAG: ds_read2_b32 +; SPLIT-DAG: ds_read2_b32 +; SPLIT-DAG: ds_write2_b32 +; SPLIT-DAG: ds_write2_b32 +define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) { +bb: + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid + %ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)* + %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4 + %v1 = extractelement <4 x i32> %load, i32 0 + %v2 = extractelement <4 x i32> %load, i32 1 + %v3 = extractelement <4 x i32> %load, i32 2 + %v4 = extractelement <4 x i32> %load, i32 3 + %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0 + %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1 + %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2 + %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3 + store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 4 + ret void +} + +; GCN-LABEL: test_local_misaligned_v3: +; VECT-DAG: ds_read_b96 +; VECT-DAG: ds_write_b96 +; SPLIT-DAG: ds_read2_b32 +; SPLIT-DAG: ds_read_b32 +; SPLIT-DAG: ds_write2_b32 +; SPLIT-DAG: ds_write_b32 +define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) { +bb: + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid + %ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)* + %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4 + %v1 = extractelement <3 x i32> %load, i32 0 + %v2 = extractelement <3 x i32> %load, i32 1 + %v3 = extractelement <3 x i32> %load, i32 2 + %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0 + %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1 + %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2 + store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 4 + ret void +} + +; GCN-LABEL: test_local_aligned_v2: +; GCN-DAG: ds_read_b64 +; GCN-DAG: ds_write_b64 +define amdgpu_kernel void @test_local_aligned_v2(i32 addrspace(3)* %arg) { +bb: + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid + %ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)* + %load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 8 + %v1 = extractelement <2 x i32> %load, i32 0 + %v2 = extractelement <2 x i32> %load, i32 1 + %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0 + %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1 + store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 8 + ret void +} + +; GCN-LABEL: test_local_aligned_v3: +; GCN-DAG: ds_read_b96 +; GCN-DAG: ds_write_b96 +define amdgpu_kernel void @test_local_aligned_v3(i32 addrspace(3)* %arg) { +bb: + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid + %ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)* + %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16 + %v1 = extractelement <3 x i32> %load, i32 0 + %v2 = extractelement <3 x i32> %load, i32 1 + %v3 = extractelement <3 x i32> %load, i32 2 + %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0 + %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1 + %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2 + store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 16 + ret void +} + +; GCN-LABEL: test_local_v4_aligned8: +; GCN-DAG: ds_read_b128 +; GCN-DAG: ds_write_b128 +define amdgpu_kernel void 
@test_local_v4_aligned8(i32 addrspace(3)* %arg) { +bb: + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid + %ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)* + %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8 + %v1 = extractelement <4 x i32> %load, i32 0 + %v2 = extractelement <4 x i32> %load, i32 1 + %v3 = extractelement <4 x i32> %load, i32 2 + %v4 = extractelement <4 x i32> %load, i32 3 + %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0 + %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1 + %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2 + %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3 + store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 8 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll index 387630adabcee..390b91ea80c11 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll @@ -110,15 +110,16 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; UNPACKED: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>) + ; UNPACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>) + ; UNPACKED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>) + ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96) ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNPACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY11]](<2 x s16>) - ; UNPACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; UNPACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; UNPACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32) ; UNPACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; UNPACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; UNPACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32) ; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8) ; UNPACKED: S_ENDPGM 0 @@ -140,9 +141,29 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), 
[[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>) - ; PACKED: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; PACKED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>) + ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96) + ; PACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; PACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; PACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; PACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; PACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; PACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] + ; PACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; PACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] + ; PACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; PACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; PACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; PACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; PACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C1]] + ; PACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) + ; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; PACKED: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; PACKED: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[DEF]](<2 x s16>) + ; PACKED: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[UV]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8) + ; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[EXTRACT]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir index 8b607244eb8e7..80bd3e1f6ec8a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir @@ -44,6 +44,38 @@ body: | G_STORE %2, %0 :: (store 1, align 1, addrspace 1) ... 
+--- +name: test_store_global_s7_align1 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; SI-LABEL: name: test_store_global_s7_align1 + ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; SI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 1, addrspace 1) + ; CI-LABEL: name: test_store_global_s7_align1 + ; CI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 1, addrspace 1) + ; VI-LABEL: name: test_store_global_s7_align1 + ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; VI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 1, addrspace 1) + ; GFX9-LABEL: name: test_store_global_s7_align1 + ; GFX9: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX9: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 1, addrspace 1) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s7) = G_TRUNC %1 + G_STORE %2, %0 :: (store 1, align 1, addrspace 1) +... + --- name: test_store_global_s8_align1 body: | @@ -192,6 +224,262 @@ body: | G_STORE %2, %0 :: (store 2, align 4, addrspace 1) ... +--- +name: test_store_global_s24_align4 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; SI-LABEL: name: test_store_global_s24_align4 + ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C]](s32) + ; SI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; SI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; SI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 2, align 4, addrspace 1) + ; SI: G_STORE [[LSHR]](s32), [[PTR_ADD]](p1) :: (store 1 + 2, align 2, addrspace 1) + ; CI-LABEL: name: test_store_global_s24_align4 + ; CI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C]](s32) + ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; CI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 2, align 4, addrspace 1) + ; CI: G_STORE [[LSHR]](s32), [[PTR_ADD]](p1) :: (store 1 + 2, align 2, addrspace 1) + ; VI-LABEL: name: test_store_global_s24_align4 + ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C]](s32) + ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; VI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; VI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 2, align 4, addrspace 1) + ; VI: G_STORE [[LSHR]](s32), [[PTR_ADD]](p1) :: (store 1 + 2, align 2, addrspace 1) + ; GFX9-LABEL: name: test_store_global_s24_align4 + ; GFX9: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + 
; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX9: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 2, align 4, addrspace 1) + ; GFX9: G_STORE [[LSHR]](s32), [[PTR_ADD]](p1) :: (store 1 + 2, align 2, addrspace 1) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s24) = G_TRUNC %1 + G_STORE %2, %0 :: (store 3, align 4, addrspace 1) +... + +--- +name: test_store_global_s24_align2 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; SI-LABEL: name: test_store_global_s24_align2 + ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215 + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[AND]](s32) + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; SI: G_STORE [[COPY5]](s32), [[COPY]](p1) :: (store 2, addrspace 1) + ; SI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; SI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; SI: G_STORE [[COPY6]](s32), [[PTR_ADD]](p1) :: (store 1 + 2, align 2, addrspace 1) + ; CI-LABEL: name: test_store_global_s24_align2 + ; CI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C]](s32) + ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; CI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 2, addrspace 1) + ; CI: G_STORE [[LSHR]](s32), [[PTR_ADD]](p1) :: (store 1 + 2, align 2, addrspace 1) + ; VI-LABEL: name: test_store_global_s24_align2 + ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215 + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[AND]](s32) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; VI: G_STORE [[COPY5]](s32), [[COPY]](p1) :: (store 2, addrspace 1) + ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; VI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; VI: G_STORE [[COPY6]](s32), [[PTR_ADD]](p1) :: (store 1 + 2, align 2, addrspace 1) + ; GFX9-LABEL: name: test_store_global_s24_align2 + ; GFX9: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 
+ ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX9: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 2, addrspace 1) + ; GFX9: G_STORE [[LSHR]](s32), [[PTR_ADD]](p1) :: (store 1 + 2, align 2, addrspace 1) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s24) = G_TRUNC %1 + G_STORE %2, %0 :: (store 3, align 2, addrspace 1) +... + +--- +name: test_store_global_s24_align1 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; SI-LABEL: name: test_store_global_s24_align1 + ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215 + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[AND]](s32) + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C2]](s32) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; SI: G_STORE [[COPY6]](s32), [[COPY]](p1) :: (store 1, addrspace 1) + ; SI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; SI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; SI: G_STORE [[COPY7]](s32), [[PTR_ADD]](p1) :: (store 1 + 1, addrspace 1) + ; SI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; SI: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C5]](s64) + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; SI: G_STORE [[COPY8]](s32), [[PTR_ADD1]](p1) :: (store 1 + 2, addrspace 1) + ; CI-LABEL: name: test_store_global_s24_align1 + ; CI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C]](s32) + ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; CI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 2, align 1, addrspace 1) + ; CI: G_STORE [[LSHR]](s32), [[PTR_ADD]](p1) :: (store 1 + 2, addrspace 1) + ; VI-LABEL: name: test_store_global_s24_align1 + ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215 + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[AND]](s32) + ; VI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; VI: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C2]](s16) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; VI: G_STORE [[COPY5]](s32), [[COPY]](p1) :: (store 1, addrspace 1) + ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; VI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], 
[[C3]](s64) + ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16) + ; VI: G_STORE [[ANYEXT]](s32), [[PTR_ADD]](p1) :: (store 1 + 1, addrspace 1) + ; VI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; VI: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; VI: G_STORE [[COPY6]](s32), [[PTR_ADD1]](p1) :: (store 1 + 2, addrspace 1) + ; GFX9-LABEL: name: test_store_global_s24_align1 + ; GFX9: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX9: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 2, align 1, addrspace 1) + ; GFX9: G_STORE [[LSHR]](s32), [[PTR_ADD]](p1) :: (store 1 + 2, addrspace 1) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s24) = G_TRUNC %1 + G_STORE %2, %0 :: (store 3, align 1, addrspace 1) +... + +--- +name: test_store_global_s25_align4 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; SI-LABEL: name: test_store_global_s25_align4 + ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; SI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 4, addrspace 1) + ; CI-LABEL: name: test_store_global_s25_align4 + ; CI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 4, addrspace 1) + ; VI-LABEL: name: test_store_global_s25_align4 + ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; VI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 4, addrspace 1) + ; GFX9-LABEL: name: test_store_global_s25_align4 + ; GFX9: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX9: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 4, addrspace 1) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s25) = G_TRUNC %1 + G_STORE %2, %0 :: (store 4, align 4, addrspace 1) +... + +# --- +# name: test_store_global_s25_align2 +# body: | +# bb.0: +# liveins: $vgpr0_vgpr1, $vgpr2 + +# %0:_(p1) = COPY $vgpr0_vgpr1 +# %1:_(s32) = COPY $vgpr2 +# %2:_(s25) = G_TRUNC %1 +# G_STORE %2, %0 :: (store 4, align 2, addrspace 1) +# ... + +# --- +# name: test_store_global_s25_align1 +# body: | +# bb.0: +# liveins: $vgpr0_vgpr1, $vgpr2 + +# %0:_(p1) = COPY $vgpr0_vgpr1 +# %1:_(s32) = COPY $vgpr2 +# %2:_(s25) = G_TRUNC %1 +# G_STORE %2, %0 :: (store 4, align 1, addrspace 1) +# ... 
+ --- name: test_store_global_s32_align1 body: | diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store.mir index 758d5b01c9786..bba490ee57dad 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store.mir @@ -929,15 +929,59 @@ body: | ; SI-LABEL: name: test_truncstore_global_v3s8_to_1_align1 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr2_vgpr3_vgpr4 - ; SI: [[TRUNC:%[0-9]+]]:_(<3 x s8>) = G_TRUNC [[COPY1]](<3 x s32>) - ; SI: [[BITCAST:%[0-9]+]]:_(s24) = G_BITCAST [[TRUNC]](<3 x s8>) - ; SI: G_STORE [[BITCAST]](s24), [[COPY]](p1) :: (store 1, addrspace 1) + ; SI: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) + ; SI: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; SI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; SI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]] + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[COPY2]](s32) + ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; SI: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[TRUNC1]] + ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[UV2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C1]](s32) + ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[OR2]](s32) + ; SI: G_STORE [[COPY5]](s32), [[COPY]](p1) :: (store 1, addrspace 1) ; VI-LABEL: name: test_truncstore_global_v3s8_to_1_align1 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr2_vgpr3_vgpr4 - ; VI: [[TRUNC:%[0-9]+]]:_(<3 x s8>) = G_TRUNC [[COPY1]](<3 x s32>) - ; VI: [[BITCAST:%[0-9]+]]:_(s24) = G_BITCAST [[TRUNC]](<3 x s8>) - ; VI: G_STORE [[BITCAST]](s24), [[COPY]](p1) :: (store 1, addrspace 1) + ; VI: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) + ; VI: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; VI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]] + ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C]] + ; VI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C1]](s16) + ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] + ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[UV2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C]] + ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C]] + ; VI: 
[[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C1]](s16) + ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C2]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[OR2]](s32) + ; VI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 1, addrspace 1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<3 x s32>) = COPY $vgpr2_vgpr3_vgpr4 %2:_(<3 x s8>) = G_TRUNC %1 @@ -954,15 +998,59 @@ body: | ; SI-LABEL: name: test_truncstore_global_v3s8_to_2_align2 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr2_vgpr3_vgpr4 - ; SI: [[TRUNC:%[0-9]+]]:_(<3 x s8>) = G_TRUNC [[COPY1]](<3 x s32>) - ; SI: [[BITCAST:%[0-9]+]]:_(s24) = G_BITCAST [[TRUNC]](<3 x s8>) - ; SI: G_STORE [[BITCAST]](s24), [[COPY]](p1) :: (store 2, addrspace 1) + ; SI: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) + ; SI: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; SI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; SI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]] + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[COPY2]](s32) + ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; SI: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[TRUNC1]] + ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[UV2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C1]](s32) + ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[OR2]](s32) + ; SI: G_STORE [[COPY5]](s32), [[COPY]](p1) :: (store 2, addrspace 1) ; VI-LABEL: name: test_truncstore_global_v3s8_to_2_align2 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr2_vgpr3_vgpr4 - ; VI: [[TRUNC:%[0-9]+]]:_(<3 x s8>) = G_TRUNC [[COPY1]](<3 x s32>) - ; VI: [[BITCAST:%[0-9]+]]:_(s24) = G_BITCAST [[TRUNC]](<3 x s8>) - ; VI: G_STORE [[BITCAST]](s24), [[COPY]](p1) :: (store 2, addrspace 1) + ; VI: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) + ; VI: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; VI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]] + ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C]] + ; VI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C1]](s16) + ; VI: 
[[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] + ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[UV2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C]] + ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C]] + ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C1]](s16) + ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C2]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[OR2]](s32) + ; VI: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store 2, addrspace 1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<3 x s32>) = COPY $vgpr2_vgpr3_vgpr4 %2:_(<3 x s8>) = G_TRUNC %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll new file mode 100644 index 0000000000000..22e944fc3a116 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll @@ -0,0 +1,10 @@ +; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 + +declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #0 + +; GFX908: error: {{.*}} return versions of fp atomics not supported + +define float @global_atomic_fadd_f32_rtn(float addrspace(1)* %ptr, float %data) { + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) + ret float %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll index 60ba088404a2d..70651280003e5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll @@ -8,7 +8,7 @@ define void @global_atomic_fadd_f32(float addrspace(1)* %ptr, float %data) { ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %ptr, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) ret void } @@ -26,7 +26,7 @@ define void @global_atomic_fadd_f32_off_2048(float addrspace(1)* %ptr, float %da ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, float addrspace(1)* %ptr, i64 512 - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data) ret void } @@ -44,7 +44,7 @@ define void @global_atomic_fadd_f32_off_neg2047(float addrspace(1)* %ptr, float ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, float addrspace(1)* %ptr, i64 -511 - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data) ret void } @@ -62,7 +62,7 @@ define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(float addrspace(1)* %pt ; GFX908-NEXT: global_atomic_add_f32 
v[0:1], v2, off ; GFX908-NEXT: s_endpgm %gep = getelementptr float, float addrspace(1)* %ptr, i64 512 - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data) ret void } @@ -73,7 +73,7 @@ define void @global_atomic_fadd_v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> ; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] - call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) + %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) ret void } @@ -91,11 +91,11 @@ define void @global_atomic_fadd_v2f16_off_neg2047(<2 x half> addrspace(1)* %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -511 - call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %gep, <2 x half> %data) + %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %gep, <2 x half> %data) ret void } -declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #0 -declare void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0 +declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #0 +declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0 attributes #0 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll index 88c82b1c3f7cf..e25fd7fc43fc5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -51,11 +51,11 @@ define amdgpu_kernel void @is_private_sgpr(i8* %ptr) { ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s0, s[4:5], 0x11 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_cmp_eq_u32 s1, s0 +; CI-NEXT: s_cmp_lg_u32 s1, s0 ; CI-NEXT: s_cselect_b32 s0, 1, 0 ; CI-NEXT: s_and_b32 s0, s0, 1 ; CI-NEXT: s_cmp_lg_u32 s0, 0 -; CI-NEXT: s_cbranch_scc0 BB1_2 +; CI-NEXT: s_cbranch_scc1 BB1_2 ; CI-NEXT: ; %bb.1: ; %bb0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: flat_store_dword v[0:1], v0 @@ -68,11 +68,11 @@ define amdgpu_kernel void @is_private_sgpr(i8* %ptr) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9-NEXT: s_cmp_lg_u32 s1, s0 ; GFX9-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9-NEXT: s_and_b32 s0, s0, 1 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_cbranch_scc0 BB1_2 +; GFX9-NEXT: s_cbranch_scc1 BB1_2 ; GFX9-NEXT: ; %bb.1: ; %bb0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll index ec477c9925c9a..356f219ba0c28 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -51,11 +51,11 @@ define amdgpu_kernel void @is_local_sgpr(i8* %ptr) { ; CI-NEXT: s_waitcnt lgkmcnt(0) ; 
CI-NEXT: s_load_dword s0, s[4:5], 0x10 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_cmp_eq_u32 s1, s0 +; CI-NEXT: s_cmp_lg_u32 s1, s0 ; CI-NEXT: s_cselect_b32 s0, 1, 0 ; CI-NEXT: s_and_b32 s0, s0, 1 ; CI-NEXT: s_cmp_lg_u32 s0, 0 -; CI-NEXT: s_cbranch_scc0 BB1_2 +; CI-NEXT: s_cbranch_scc1 BB1_2 ; CI-NEXT: ; %bb.1: ; %bb0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: flat_store_dword v[0:1], v0 @@ -68,11 +68,11 @@ define amdgpu_kernel void @is_local_sgpr(i8* %ptr) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9-NEXT: s_cmp_lg_u32 s1, s0 ; GFX9-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9-NEXT: s_and_b32 s0, s0, 1 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_cbranch_scc0 BB1_2 +; GFX9-NEXT: s_cbranch_scc1 BB1_2 ; GFX9-NEXT: ; %bb.1: ; %bb0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll index e9cd9f6ff797c..1cb79ff7fcacf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll @@ -16,7 +16,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -35,7 +35,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } @@ -52,7 +52,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void } @@ -70,7 +70,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_v ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], 
%subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) ret void } @@ -117,7 +117,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] ; CHECK: bb.4: ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -162,7 +162,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] ; CHECK: bb.4: ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) ret void } @@ -181,7 +181,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset = add i32 %voffset.base, 4095 - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -200,7 +200,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void } @@ -218,7 +218,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__v ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -235,11 +235,11 @@ define amdgpu_ps void 
@raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) ret void } -declare void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0 -declare void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0 +declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll index 7ff60e57d9646..43d7968832335 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll @@ -174,22 +174,20 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 12, align 4) ; GFX6: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512_with_sub0_sub1_sub2 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11 - ; GFX6: [[COPY5:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2 - ; GFX6: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub3_sub4_sub5 - ; GFX6: [[COPY7:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub6_sub7_sub8 - ; GFX6: [[COPY8:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub9_sub10_sub11 - ; GFX6: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub0 - ; GFX6: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub1 - ; GFX6: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub2 - ; GFX6: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11 + ; GFX6: [[COPY5:%[0-9]+]]:sgpr_512_with_sub0_sub1_sub2 = COPY [[REG_SEQUENCE1]] + ; GFX6: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[COPY5]].sub0_sub1_sub2 + ; GFX6: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub0 + ; GFX6: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub1 + ; GFX6: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub2 + ; GFX6: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX6: [[COPY13:%[0-9]+]]:vgpr_32 = COPY 
[[COPY10]] - ; GFX6: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX6: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX6: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GFX6: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX6: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; GFX6: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec + ; GFX6: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; GFX6: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; GFX6: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 ; GFX7-LABEL: name: s_buffer_load_v3i32 @@ -203,22 +201,20 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 12, align 4) ; GFX7: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512_with_sub0_sub1_sub2 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11 - ; GFX7: [[COPY5:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2 - ; GFX7: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub3_sub4_sub5 - ; GFX7: [[COPY7:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub6_sub7_sub8 - ; GFX7: [[COPY8:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub9_sub10_sub11 - ; GFX7: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub0 - ; GFX7: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub1 - ; GFX7: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub2 - ; GFX7: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11 + ; GFX7: [[COPY5:%[0-9]+]]:sgpr_512_with_sub0_sub1_sub2 = COPY [[REG_SEQUENCE1]] + ; GFX7: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[COPY5]].sub0_sub1_sub2 + ; GFX7: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub0 + ; GFX7: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub1 + ; GFX7: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub2 + ; GFX7: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX7: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] - ; GFX7: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX7: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX7: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GFX7: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX7: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; GFX7: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec + ; GFX7: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; GFX7: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; GFX7: $sgpr2 = COPY 
[[V_READFIRSTLANE_B32_2]] ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 ; GFX8-LABEL: name: s_buffer_load_v3i32 @@ -232,22 +228,20 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 12, align 4) ; GFX8: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512_with_sub0_sub1_sub2 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11 - ; GFX8: [[COPY5:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2 - ; GFX8: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub3_sub4_sub5 - ; GFX8: [[COPY7:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub6_sub7_sub8 - ; GFX8: [[COPY8:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub9_sub10_sub11 - ; GFX8: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub0 - ; GFX8: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub1 - ; GFX8: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub2 - ; GFX8: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11 + ; GFX8: [[COPY5:%[0-9]+]]:sgpr_512_with_sub0_sub1_sub2 = COPY [[REG_SEQUENCE1]] + ; GFX8: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[COPY5]].sub0_sub1_sub2 + ; GFX8: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub0 + ; GFX8: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub1 + ; GFX8: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub2 + ; GFX8: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX8: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] - ; GFX8: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX8: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX8: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GFX8: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX8: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; GFX8: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec + ; GFX8: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; GFX8: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; GFX8: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0) @@ -1600,15 +1594,12 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r ; GFX6: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[DEF]] ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[COPY5]], %subreg.sub4_sub5_sub6_sub7, [[COPY6]], %subreg.sub8_sub9_sub10_sub11 ; GFX6: [[COPY7:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2 - ; GFX6: [[COPY8:%[0-9]+]]:vreg_96 = COPY 
[[REG_SEQUENCE1]].sub3_sub4_sub5 - ; GFX6: [[COPY9:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub6_sub7_sub8 - ; GFX6: [[COPY10:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub9_sub10_sub11 - ; GFX6: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub0 - ; GFX6: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub1 - ; GFX6: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub2 - ; GFX6: $vgpr0 = COPY [[COPY11]] - ; GFX6: $vgpr1 = COPY [[COPY12]] - ; GFX6: $vgpr2 = COPY [[COPY13]] + ; GFX6: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub0 + ; GFX6: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub1 + ; GFX6: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub2 + ; GFX6: $vgpr0 = COPY [[COPY8]] + ; GFX6: $vgpr1 = COPY [[COPY9]] + ; GFX6: $vgpr2 = COPY [[COPY10]] ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 ; GFX7-LABEL: name: s_buffer_load_v3f32_vgpr_offset ; GFX7: bb.1 (%ir-block.0): @@ -1626,15 +1617,12 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r ; GFX7: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[DEF]] ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[COPY5]], %subreg.sub4_sub5_sub6_sub7, [[COPY6]], %subreg.sub8_sub9_sub10_sub11 ; GFX7: [[COPY7:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2 - ; GFX7: [[COPY8:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub3_sub4_sub5 - ; GFX7: [[COPY9:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub6_sub7_sub8 - ; GFX7: [[COPY10:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub9_sub10_sub11 - ; GFX7: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub0 - ; GFX7: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub1 - ; GFX7: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub2 - ; GFX7: $vgpr0 = COPY [[COPY11]] - ; GFX7: $vgpr1 = COPY [[COPY12]] - ; GFX7: $vgpr2 = COPY [[COPY13]] + ; GFX7: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub0 + ; GFX7: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub1 + ; GFX7: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub2 + ; GFX7: $vgpr0 = COPY [[COPY8]] + ; GFX7: $vgpr1 = COPY [[COPY9]] + ; GFX7: $vgpr2 = COPY [[COPY10]] ; GFX7: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 ; GFX8-LABEL: name: s_buffer_load_v3f32_vgpr_offset ; GFX8: bb.1 (%ir-block.0): @@ -1652,15 +1640,12 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r ; GFX8: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[DEF]] ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[COPY5]], %subreg.sub4_sub5_sub6_sub7, [[COPY6]], %subreg.sub8_sub9_sub10_sub11 ; GFX8: [[COPY7:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2 - ; GFX8: [[COPY8:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub3_sub4_sub5 - ; GFX8: [[COPY9:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub6_sub7_sub8 - ; GFX8: [[COPY10:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub9_sub10_sub11 - ; GFX8: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub0 - ; GFX8: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub1 - ; GFX8: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub2 - ; GFX8: $vgpr0 = COPY [[COPY11]] - ; GFX8: $vgpr1 = COPY [[COPY12]] - ; GFX8: $vgpr2 = COPY [[COPY13]] + ; GFX8: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub0 + ; GFX8: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub1 + ; GFX8: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub2 + ; GFX8: $vgpr0 = COPY [[COPY8]] + ; GFX8: $vgpr1 = COPY [[COPY9]] + ; GFX8: $vgpr2 = COPY [[COPY10]] ; GFX8: SI_RETURN_TO_EPILOG implicit $vgpr0, 
implicit $vgpr1, implicit $vgpr2 %val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <3 x float> %val diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll index da0455f3ed8f2..d84282eb3ede3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=verde -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX6 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX789 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX789 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s ; FIXME: This test has a DAG duplicate @@ -13,20 +13,27 @@ ; Set FP32 fp_round to round to zero define amdgpu_kernel void @test_setreg_f32_round_mode_rtz() { -; GFX6789-LABEL: test_setreg_f32_round_mode_rtz: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_f32_round_mode_rtz: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 ; encoding: [0x01,0x08,0x80,0xba,0x03,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_f32_round_mode_rtz: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 ; encoding: [0x01,0x08,0x00,0xba,0x03,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_f32_round_mode_rtz: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 ; encoding: [0x01,0x08,0x80,0xba,0x03,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 2049, i32 3) call void asm sideeffect "", ""() ret void @@ -34,20 +41,27 @@ define amdgpu_kernel void @test_setreg_f32_round_mode_rtz() { ; Set FP64/FP16 fp_round to round to zero define amdgpu_kernel void @test_setreg_f64_round_mode_rtz() { -; GFX6789-LABEL: test_setreg_f64_round_mode_rtz: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_f64_round_mode_rtz: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 
3 ; encoding: [0x81,0x08,0x80,0xba,0x03,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_f64_round_mode_rtz: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3 ; encoding: [0x81,0x08,0x00,0xba,0x03,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_f64_round_mode_rtz: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3 +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3 ; encoding: [0x81,0x08,0x80,0xba,0x03,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 2177, i32 3) call void asm sideeffect "", ""() ret void @@ -55,20 +69,27 @@ define amdgpu_kernel void @test_setreg_f64_round_mode_rtz() { ; Set all fp_round to round to zero define amdgpu_kernel void @test_setreg_all_round_mode_rtz() { -; GFX6789-LABEL: test_setreg_all_round_mode_rtz: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 7 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_all_round_mode_rtz: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 7 ; encoding: [0x81,0x18,0x80,0xba,0x07,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_all_round_mode_rtz: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 7 ; encoding: [0x81,0x18,0x00,0xba,0x07,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_all_round_mode_rtz: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 7 +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 7 ; encoding: [0x81,0x18,0x80,0xba,0x07,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6273, i32 7) call void asm sideeffect "", ""() ret void @@ -76,100 +97,135 @@ define amdgpu_kernel void @test_setreg_all_round_mode_rtz() { ; Set FP32 fp_round to dynamic mode define amdgpu_cs void @test_setreg_roundingmode_var(i32 inreg %var.mode) { -; GFX6789-LABEL: test_setreg_roundingmode_var: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 2), s0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_roundingmode_var: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 2), s0 ; encoding: [0x01,0x08,0x80,0xb9] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_roundingmode_var: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 2), s0 ; encoding: [0x01,0x08,0x00,0xb9] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_roundingmode_var: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 
0, 2), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 2), s0 ; encoding: [0x01,0x08,0x80,0xb9] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 2049, i32 %var.mode) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_ieee_mode_off() { -; GFX6789-LABEL: test_setreg_ieee_mode_off: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_ieee_mode_off: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 0 ; encoding: [0x41,0x02,0x80,0xba,0x00,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_ieee_mode_off: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 0 ; encoding: [0x41,0x02,0x00,0xba,0x00,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_ieee_mode_off: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 0 +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 0 ; encoding: [0x41,0x02,0x80,0xba,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 577, i32 0) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_ieee_mode_on() { -; GFX6789-LABEL: test_setreg_ieee_mode_on: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 1 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_ieee_mode_on: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 1 ; encoding: [0x41,0x02,0x80,0xba,0x01,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_ieee_mode_on: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 1 ; encoding: [0x41,0x02,0x00,0xba,0x01,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_ieee_mode_on: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 1 +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 1 ; encoding: [0x41,0x02,0x80,0xba,0x01,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 577, i32 1) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_dx10_clamp_off() { -; GFX6789-LABEL: test_setreg_dx10_clamp_off: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_dx10_clamp_off: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 0 ; encoding: [0x01,0x02,0x80,0xba,0x00,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; 
encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_dx10_clamp_off: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 0 ; encoding: [0x01,0x02,0x00,0xba,0x00,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_dx10_clamp_off: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 0 +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 0 ; encoding: [0x01,0x02,0x80,0xba,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 513, i32 0) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_dx10_clamp_on() { -; GFX6789-LABEL: test_setreg_dx10_clamp_on: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 1 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_dx10_clamp_on: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 1 ; encoding: [0x01,0x02,0x80,0xba,0x01,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_dx10_clamp_on: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 1 ; encoding: [0x01,0x02,0x00,0xba,0x01,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_dx10_clamp_on: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 1 +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 1 ; encoding: [0x01,0x02,0x80,0xba,0x01,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 513, i32 1) call void asm sideeffect "", ""() ret void @@ -177,20 +233,27 @@ define amdgpu_kernel void @test_setreg_dx10_clamp_on() { ; Sets full width of fp round and fp denorm fields, to a variable define amdgpu_cs void @test_setreg_full_both_round_mode_and_denorm_mode(i32 inreg %mode) { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), s0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), s0 ; encoding: [0x01,0x38,0x80,0xb9] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), s0 ; encoding: [0x01,0x38,0x00,0xb9] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), s0 ; encoding: [0x01,0x38,0x80,0xb9] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; 
GFX10-NEXT: s_endpgm
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
call void @llvm.amdgcn.s.setreg(i32 14337, i32 %mode)
call void asm sideeffect "", ""()
ret void
@@ -198,20 +261,27 @@ define amdgpu_cs void @test_setreg_full_both_round_mode_and_denorm_mode(i32 inre
; Does not cover last bit of denorm field
define amdgpu_cs void @test_setreg_most_both_round_mode_and_denorm_mode() {
-; GFX6789-LABEL: test_setreg_most_both_round_mode_and_denorm_mode:
-; GFX6789: ; %bb.0:
-; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 7), 6
-; GFX6789-NEXT: ;;#ASMSTART
-; GFX6789-NEXT: ;;#ASMEND
-; GFX6789-NEXT: s_endpgm
+; GFX6-LABEL: test_setreg_most_both_round_mode_and_denorm_mode:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 7), 6 ; encoding: [0x01,0x30,0x80,0xba,0x06,0x00,0x00,0x00]
+; GFX6-NEXT: ;;#ASMSTART
+; GFX6-NEXT: ;;#ASMEND
+; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX789-LABEL: test_setreg_most_both_round_mode_and_denorm_mode:
+; GFX789: ; %bb.0:
+; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 7), 6 ; encoding: [0x01,0x30,0x00,0xba,0x06,0x00,0x00,0x00]
+; GFX789-NEXT: ;;#ASMSTART
+; GFX789-NEXT: ;;#ASMEND
+; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX10-LABEL: test_setreg_most_both_round_mode_and_denorm_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 7), 6
+; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 7), 6 ; encoding: [0x01,0x30,0x80,0xba,0x06,0x00,0x00,0x00]
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
call void @llvm.amdgcn.s.setreg(i32 12289, i32 6)
call void asm sideeffect "", ""()
ret void
@@ -219,200 +289,270 @@ define amdgpu_cs void @test_setreg_most_both_round_mode_and_denorm_mode() {
; Does not cover first bit of denorm field
define amdgpu_cs void @test_setreg_most_both_round_mode_and_denorm_mode_6() {
-; GFX6789-LABEL: test_setreg_most_both_round_mode_and_denorm_mode_6:
-; GFX6789: ; %bb.0:
-; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 1, 3), 6
-; GFX6789-NEXT: ;;#ASMSTART
-; GFX6789-NEXT: ;;#ASMEND
-; GFX6789-NEXT: s_endpgm
+; GFX6-LABEL: test_setreg_most_both_round_mode_and_denorm_mode_6:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 1, 3), 6 ; encoding: [0x41,0x10,0x80,0xba,0x06,0x00,0x00,0x00]
+; GFX6-NEXT: ;;#ASMSTART
+; GFX6-NEXT: ;;#ASMEND
+; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX789-LABEL: test_setreg_most_both_round_mode_and_denorm_mode_6:
+; GFX789: ; %bb.0:
+; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 1, 3), 6 ; encoding: [0x41,0x10,0x00,0xba,0x06,0x00,0x00,0x00]
+; GFX789-NEXT: ;;#ASMSTART
+; GFX789-NEXT: ;;#ASMEND
+; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX10-LABEL: test_setreg_most_both_round_mode_and_denorm_mode_6:
; GFX10: ; %bb.0:
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 1, 3), 6
+; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 1, 3), 6 ; encoding: [0x41,0x10,0x80,0xba,0x06,0x00,0x00,0x00]
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
call void @llvm.amdgcn.s.setreg(i32 4161, i32 6)
call void asm sideeffect "", ""()
ret void
}
define amdgpu_cs void @test_setreg_f32_denorm_mode(i32 inreg %val) {
-; GFX6789-LABEL: test_setreg_f32_denorm_mode:
-; GFX6789: ; %bb.0:
-;
GFX6789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_f32_denorm_mode: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 ; encoding: [0x01,0x09,0x80,0xb9] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_f32_denorm_mode: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 ; encoding: [0x01,0x09,0x00,0xb9] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_f32_denorm_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 ; encoding: [0x01,0x09,0x80,0xb9] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 2305, i32 %val) call void asm sideeffect "", ""() ret void } define amdgpu_cs void @test_setreg_f64_denorm_mode(i32 inreg %val) { -; GFX6789-LABEL: test_setreg_f64_denorm_mode: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 6, 2), s0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_f64_denorm_mode: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 6, 2), s0 ; encoding: [0x81,0x09,0x80,0xb9] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_f64_denorm_mode: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 6, 2), s0 ; encoding: [0x81,0x09,0x00,0xb9] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_f64_denorm_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 6, 2), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 6, 2), s0 ; encoding: [0x81,0x09,0x80,0xb9] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 2433, i32 %val) call void asm sideeffect "", ""() ret void } define amdgpu_cs void @test_setreg_full_denorm_mode(i32 inreg %val) { -; GFX6789-LABEL: test_setreg_full_denorm_mode: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_denorm_mode: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 ; encoding: [0x01,0x18,0x80,0xb9] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_denorm_mode: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 ; encoding: [0x01,0x18,0x00,0xb9] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_denorm_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 ; encoding: [0x01,0x18,0x80,0xb9] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: 
;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 %val) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_round_mode_0() { -; GFX6789-LABEL: test_setreg_full_round_mode_0: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_round_mode_0: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0 ; encoding: [0x01,0x18,0x80,0xba,0x00,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_round_mode_0: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0 ; encoding: [0x01,0x18,0x00,0xba,0x00,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_round_mode_0: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_round_mode 0x0 +; GFX10-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0xa4,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 0) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_round_mode_1() { -; GFX6789-LABEL: test_setreg_full_round_mode_1: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_round_mode_1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1 ; encoding: [0x01,0x18,0x80,0xba,0x01,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_round_mode_1: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1 ; encoding: [0x01,0x18,0x00,0xba,0x01,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_round_mode_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_round_mode 0x1 +; GFX10-NEXT: s_round_mode 0x1 ; encoding: [0x01,0x00,0xa4,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 1) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_round_mode_2() { -; GFX6789-LABEL: test_setreg_full_round_mode_2: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_round_mode_2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2 ; encoding: [0x01,0x18,0x80,0xba,0x02,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_round_mode_2: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2 ; encoding: [0x01,0x18,0x00,0xba,0x02,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; 
GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_round_mode_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_round_mode 0x2 +; GFX10-NEXT: s_round_mode 0x2 ; encoding: [0x02,0x00,0xa4,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 2) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_round_mode_4() { -; GFX6789-LABEL: test_setreg_full_round_mode_4: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_round_mode_4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4 ; encoding: [0x01,0x18,0x80,0xba,0x04,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_round_mode_4: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4 ; encoding: [0x01,0x18,0x00,0xba,0x04,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_round_mode_4: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_round_mode 0x4 +; GFX10-NEXT: s_round_mode 0x4 ; encoding: [0x04,0x00,0xa4,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 4) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_round_mode_8() { -; GFX6789-LABEL: test_setreg_full_round_mode_8: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_round_mode_8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8 ; encoding: [0x01,0x18,0x80,0xba,0x08,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_round_mode_8: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8 ; encoding: [0x01,0x18,0x00,0xba,0x08,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_round_mode_8: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_round_mode 0x8 +; GFX10-NEXT: s_round_mode 0x8 ; encoding: [0x08,0x00,0xa4,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 8) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_round_mode_15() { -; GFX6789-LABEL: test_setreg_full_round_mode_15: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_round_mode_15: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15 ; encoding: [0x01,0x18,0x80,0xba,0x0f,0x00,0x00,0x00] 
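; Aside for readers of these tests: the magic i32 constants passed to
; llvm.amdgcn.s.setreg pack the hwreg(id, offset, size) selector as
;   simm16 = id | (offset << 6) | ((size - 1) << 11)
; so 6145 = 1 | (0 << 6) | (3 << 11) selects hwreg(HW_REG_MODE, 0, 4), the full
; 4-bit fp_round field, and that packed value reappears verbatim as the low two
; bytes of the s_setreg_imm32_b32 encoding checked just above
; ([0x01,0x18] = 0x1801 = 6145). The third byte differs across targets (0x80 on
; GFX6 and GFX10 vs 0x00 on GFX7-9) because the SOPK opcode value for s_setreg
; differs between those subtargets, which is why the old shared GFX6789 check
; prefix is split into GFX6 and GFX789 in this patch. A minimal standalone .ll
; sketch of such a call (the kernel name is hypothetical, not part of the
; patch):
define amdgpu_kernel void @setreg_hwreg_packing_sketch() {
  ; 6145 -> hwreg(HW_REG_MODE, 0, 4); immediate 15 sets all four fp_round bits.
  call void @llvm.amdgcn.s.setreg(i32 6145, i32 15)
  ret void
}
declare void @llvm.amdgcn.s.setreg(i32 immarg, i32)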
+; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_round_mode_15: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15 ; encoding: [0x01,0x18,0x00,0xba,0x0f,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_round_mode_15: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_round_mode 0xf +; GFX10-NEXT: s_round_mode 0xf ; encoding: [0x0f,0x00,0xa4,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 15) call void asm sideeffect "", ""() ret void @@ -420,60 +560,81 @@ define amdgpu_kernel void @test_setreg_full_round_mode_15() { ; Should truncate set immediate value define amdgpu_kernel void @test_setreg_full_round_mode_42() { -; GFX6789-LABEL: test_setreg_full_round_mode_42: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 42 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_round_mode_42: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 42 ; encoding: [0x01,0x18,0x80,0xba,0x2a,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_round_mode_42: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 42 ; encoding: [0x01,0x18,0x00,0xba,0x2a,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_round_mode_42: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_round_mode 0xa +; GFX10-NEXT: s_round_mode 0xa ; encoding: [0x0a,0x00,0xa4,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 42) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_denorm_mode_0() { -; GFX6789-LABEL: test_setreg_full_denorm_mode_0: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_denorm_mode_0: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 0 ; encoding: [0x01,0x19,0x80,0xba,0x00,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_denorm_mode_0: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 0 ; encoding: [0x01,0x19,0x00,0xba,0x00,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_denorm_mode_0: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 0 +; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 0) call void asm sideeffect "", ""() ret void 
} define amdgpu_kernel void @test_setreg_full_denorm_mode_1() { -; GFX6789-LABEL: test_setreg_full_denorm_mode_1: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 1 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_denorm_mode_1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 1 ; encoding: [0x01,0x19,0x80,0xba,0x01,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_denorm_mode_1: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 1 ; encoding: [0x01,0x19,0x00,0xba,0x01,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_denorm_mode_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 1 +; GFX10-NEXT: s_denorm_mode 1 ; encoding: [0x01,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 1) call void asm sideeffect "", ""() ret void @@ -481,100 +642,135 @@ define amdgpu_kernel void @test_setreg_full_denorm_mode_1() { define amdgpu_kernel void @test_setreg_full_denorm_mode_2() { -; GFX6789-LABEL: test_setreg_full_denorm_mode_2: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 2 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_denorm_mode_2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 2 ; encoding: [0x01,0x19,0x80,0xba,0x02,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_denorm_mode_2: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 2 ; encoding: [0x01,0x19,0x00,0xba,0x02,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_denorm_mode_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 2 +; GFX10-NEXT: s_denorm_mode 2 ; encoding: [0x02,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 2) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_denorm_mode_4() { -; GFX6789-LABEL: test_setreg_full_denorm_mode_4: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 4 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_denorm_mode_4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 4 ; encoding: [0x01,0x19,0x80,0xba,0x04,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_denorm_mode_4: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 4 ; encoding: [0x01,0x19,0x00,0xba,0x04,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; 
GFX10-LABEL: test_setreg_full_denorm_mode_4: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 4 +; GFX10-NEXT: s_denorm_mode 4 ; encoding: [0x04,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 4) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_denorm_mode_8() { -; GFX6789-LABEL: test_setreg_full_denorm_mode_8: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 8 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_denorm_mode_8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 8 ; encoding: [0x01,0x19,0x80,0xba,0x08,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_denorm_mode_8: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 8 ; encoding: [0x01,0x19,0x00,0xba,0x08,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_denorm_mode_8: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 8 +; GFX10-NEXT: s_denorm_mode 8 ; encoding: [0x08,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 8) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_denorm_mode_15() { -; GFX6789-LABEL: test_setreg_full_denorm_mode_15: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 15 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_denorm_mode_15: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 15 ; encoding: [0x01,0x19,0x80,0xba,0x0f,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_denorm_mode_15: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 15 ; encoding: [0x01,0x19,0x00,0xba,0x0f,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_denorm_mode_15: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: s_denorm_mode 15 ; encoding: [0x0f,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 15) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_denorm_mode_42() { -; GFX6789-LABEL: test_setreg_full_denorm_mode_42: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 42 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_denorm_mode_42: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 42 ; encoding: [0x01,0x19,0x80,0xba,0x2a,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: 
s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_denorm_mode_42: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 42 ; encoding: [0x01,0x19,0x00,0xba,0x2a,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_denorm_mode_42: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 10 +; GFX10-NEXT: s_denorm_mode 10 ; encoding: [0x0a,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 42) call void asm sideeffect "", ""() ret void @@ -582,231 +778,308 @@ define amdgpu_kernel void @test_setreg_full_denorm_mode_42() { ; Sets all fp round and fp denorm bits. define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_0() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_0: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_0: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0 ; encoding: [0x01,0x38,0x80,0xba,0x00,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_0: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0 ; encoding: [0x01,0x38,0x00,0xba,0x00,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0x0 +; GFX10-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 0 +; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 0) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_1() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_1: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 1 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 1 ; encoding: [0x01,0x38,0x80,0xba,0x01,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_1: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 1 ; encoding: [0x01,0x38,0x00,0xba,0x01,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0x1 +; GFX10-NEXT: s_round_mode 0x1 ; encoding: [0x01,0x00,0xa4,0xbf] ; GFX10-NEXT: ; 
implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 0 +; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 1) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_2() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_2: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 2 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 2 ; encoding: [0x01,0x38,0x80,0xba,0x02,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_2: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 2 ; encoding: [0x01,0x38,0x00,0xba,0x02,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0x2 +; GFX10-NEXT: s_round_mode 0x2 ; encoding: [0x02,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 0 +; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 2) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_4() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_4: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 4 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 4 ; encoding: [0x01,0x38,0x80,0xba,0x04,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_4: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 4 ; encoding: [0x01,0x38,0x00,0xba,0x04,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0x4 +; GFX10-NEXT: s_round_mode 0x4 ; encoding: [0x04,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 0 +; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 4) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_8() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_8: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 0, 8), 8 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 8 ; encoding: [0x01,0x38,0x80,0xba,0x08,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_8: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 8 ; encoding: [0x01,0x38,0x00,0xba,0x08,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0x8 +; GFX10-NEXT: s_round_mode 0x8 ; encoding: [0x08,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 0 +; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 8) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_16() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_16: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 16 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 16 ; encoding: [0x01,0x38,0x80,0xba,0x10,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_16: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 16 ; encoding: [0x01,0x38,0x00,0xba,0x10,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0x0 +; GFX10-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 1 +; GFX10-NEXT: s_denorm_mode 1 ; encoding: [0x01,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 16) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_32() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_32: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 32 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 32 ; encoding: [0x01,0x38,0x80,0xba,0x20,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_32: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 0, 8), 32 ; encoding: [0x01,0x38,0x00,0xba,0x20,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0x0 +; GFX10-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 2 +; GFX10-NEXT: s_denorm_mode 2 ; encoding: [0x02,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 32) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_64() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_64: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 64 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 64 ; encoding: [0x01,0x38,0x80,0xba,0x40,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_64: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 64 ; encoding: [0x01,0x38,0x00,0xba,0x40,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0x0 +; GFX10-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 4 +; GFX10-NEXT: s_denorm_mode 4 ; encoding: [0x04,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 64) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_128() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_128: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x80 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_128: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x80 ; encoding: [0x01,0x38,0x80,0xba,0x80,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_128: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x80 ; encoding: [0x01,0x38,0x00,0xba,0x80,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0x0 +; GFX10-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 8 +; GFX10-NEXT: s_denorm_mode 8 ; encoding: [0x08,0x00,0xa5,0xbf] ; GFX10-NEXT: 
;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 128) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_15() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_15: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 15 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_15: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 15 ; encoding: [0x01,0x38,0x80,0xba,0x0f,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_15: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 15 ; encoding: [0x01,0x38,0x00,0xba,0x0f,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_15: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0xf +; GFX10-NEXT: s_round_mode 0xf ; encoding: [0x0f,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 0 +; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 15) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_255() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_255: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0xff -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_255: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0xff ; encoding: [0x01,0x38,0x80,0xba,0xff,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_255: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0xff ; encoding: [0x01,0x38,0x00,0xba,0xff,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_255: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0xf +; GFX10-NEXT: s_round_mode 0xf ; encoding: [0x0f,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: s_denorm_mode 15 ; encoding: [0x0f,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 255) call void asm sideeffect "", ""() ret void @@ -814,61 +1087,82 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_255( ; Truncate extra high bit define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_597() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_597: -; GFX6789: ; %bb.0: -; 
GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x255 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_597: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x255 ; encoding: [0x01,0x38,0x80,0xba,0x55,0x02,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_597: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x255 ; encoding: [0x01,0x38,0x00,0xba,0x55,0x02,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_597: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0x5 +; GFX10-NEXT: s_round_mode 0x5 ; encoding: [0x05,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 5 +; GFX10-NEXT: s_denorm_mode 5 ; encoding: [0x05,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 597) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_set_8_bits_straddles_round_and_denorm() { -; GFX6789-LABEL: test_setreg_set_8_bits_straddles_round_and_denorm: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 8), 0xff -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_set_8_bits_straddles_round_and_denorm: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 8), 0xff ; encoding: [0x81,0x38,0x80,0xba,0xff,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_set_8_bits_straddles_round_and_denorm: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 8), 0xff ; encoding: [0x81,0x38,0x00,0xba,0xff,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_set_8_bits_straddles_round_and_denorm: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 8), 0xff +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 8), 0xff ; encoding: [0x81,0x38,0x80,0xba,0xff,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14465, i32 255) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_set_4_bits_straddles_round_and_denorm() { -; GFX6789-LABEL: test_setreg_set_4_bits_straddles_round_and_denorm: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 15 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_set_4_bits_straddles_round_and_denorm: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 15 ; encoding: [0x81,0x18,0x80,0xba,0x0f,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_set_4_bits_straddles_round_and_denorm: +; GFX789: ; %bb.0: +; 
GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 15 ; encoding: [0x81,0x18,0x00,0xba,0x0f,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_set_4_bits_straddles_round_and_denorm: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 15 +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 15 ; encoding: [0x81,0x18,0x80,0xba,0x0f,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6273, i32 15) call void asm sideeffect "", ""() ret void @@ -876,25 +1170,34 @@ define amdgpu_kernel void @test_setreg_set_4_bits_straddles_round_and_denorm() { ; FIXME: Broken for DAG define void @test_setreg_roundingmode_var_vgpr(i32 %var.mode) { -; GFX6789-LABEL: test_setreg_roundingmode_var_vgpr: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6789-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s4 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: test_setreg_roundingmode_var_vgpr: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 ; encoding: [0x00,0x05,0x08,0x7e] +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s4 ; encoding: [0x01,0x10,0x84,0xb9] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] +; +; GFX789-LABEL: test_setreg_roundingmode_var_vgpr: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +; GFX789-NEXT: v_readfirstlane_b32 s4, v0 ; encoding: [0x00,0x05,0x08,0x7e] +; GFX789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s4 ; encoding: [0x01,0x10,0x04,0xb9] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] ; ; GFX10-LABEL: test_setreg_roundingmode_var_vgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 ; encoding: [0x00,0x05,0x08,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s4 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s4 ; encoding: [0x01,0x10,0x84,0xb9] +; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode) call void asm sideeffect "", ""() ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll new file mode 100644 index 0000000000000..99dde6c4d5833 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll @@ -0,0 +1,11 @@ +; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s 
-check-prefix=GFX908 + +declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0 + +; GFX908: error: {{.*}} return versions of fp atomics not supported + +define amdgpu_ps float @buffer_atomic_add_f32_rtn(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +main_body: + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + ret float %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll index 4a5e4be7cb819..be0c233577d0b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll @@ -18,7 +18,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -39,7 +39,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__ ; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) ret void } @@ -57,7 +57,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0) ret void } @@ -76,7 +76,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void 
@llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } @@ -126,7 +126,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] ; CHECK: bb.4: ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -173,7 +173,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] ; CHECK: bb.4: ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } @@ -194,7 +194,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } @@ -212,7 +212,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2) ret void } @@ -232,7 +232,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; CHECK: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -250,11 +250,11 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE 
[[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } -declare void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0 -declare void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0 +declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll index ef28a300590a0..50de683890186 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll @@ -177,35 +177,35 @@ define void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) ; GFX7-LABEL: store_lds_v4i32_align1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v2 ; GFX7-NEXT: ds_write_b8 v0, v1 ; GFX7-NEXT: ds_write_b8 v0, v5 offset:1 ; GFX7-NEXT: ds_write_b8 v0, v6 offset:2 ; GFX7-NEXT: ds_write_b8 v0, v7 offset:3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:4 -; GFX7-NEXT: ds_write_b8 v0, v8 offset:5 -; GFX7-NEXT: ds_write_b8 v0, v9 offset:6 -; GFX7-NEXT: ds_write_b8 v0, v10 offset:7 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX7-NEXT: ds_write_b8 v0, v5 offset:6 +; GFX7-NEXT: ds_write_b8 v0, v6 offset:7 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 8, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v4 ; GFX7-NEXT: ds_write_b8 v0, v3 offset:8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:10 ; GFX7-NEXT: ds_write_b8 v0, v5 offset:11 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v4 ; GFX7-NEXT: ds_write_b8 v0, v4 offset:12 -; GFX7-NEXT: ds_write_b8 v0, v6 offset:13 -; GFX7-NEXT: ds_write_b8 v0, v7 offset:14 -; GFX7-NEXT: ds_write_b8 v0, v8 offset:15 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:13 +; GFX7-NEXT: ds_write_b8 v0, v2 offset:14 +; GFX7-NEXT: ds_write_b8 v0, v3 offset:15 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1 @@ -227,17 +227,17 @@ define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) ; GFX7-NEXT: 
s_mov_b32 m0, -1 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX7-NEXT: ds_write_b8 v0, v1 ; GFX7-NEXT: ds_write_b8 v0, v4 offset:1 ; GFX7-NEXT: ds_write_b8 v0, v5 offset:2 ; GFX7-NEXT: ds_write_b8 v0, v6 offset:3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:4 -; GFX7-NEXT: ds_write_b8 v0, v7 offset:5 -; GFX7-NEXT: ds_write_b8 v0, v8 offset:6 -; GFX7-NEXT: ds_write_b8 v0, v9 offset:7 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX7-NEXT: ds_write_b8 v0, v4 offset:6 +; GFX7-NEXT: ds_write_b8 v0, v5 offset:7 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll index 3c550a1a08e1f..5f4d4097b23a2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -29,9 +29,10 @@ define amdgpu_kernel void @localize_constants(i1 %cond) { ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: BB0_2: ; %Flow +; GFX9-NEXT: s_xor_b32 s0, s0, -1 ; GFX9-NEXT: s_and_b32 s0, s0, 1 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_cbranch_scc0 BB0_4 +; GFX9-NEXT: s_cbranch_scc1 BB0_4 ; GFX9-NEXT: ; %bb.3: ; %bb0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-NEXT: global_store_dword v[0:1], v0, off @@ -109,9 +110,10 @@ define amdgpu_kernel void @localize_globals(i1 %cond) { ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: BB1_2: ; %Flow +; GFX9-NEXT: s_xor_b32 s0, s0, -1 ; GFX9-NEXT: s_and_b32 s0, s0, 1 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_cbranch_scc0 BB1_4 +; GFX9-NEXT: s_cbranch_scc1 BB1_4 ; GFX9-NEXT: ; %bb.3: ; %bb0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, gv0@gotpcrel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-select.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-select.mir index b8109fe6c87cf..1941ad593f96d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-select.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-select.mir @@ -37,8 +37,9 @@ body: | ; GCN-LABEL: name: select_from_same_results_of_unmerge_values ; GCN: liveins: $vgpr0 ; GCN: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF - ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<2 x s32>) - ; GCN: $vgpr0 = COPY [[UV]](s32) + ; GCN: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[DEF]](<2 x s32>) + ; GCN: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64) + ; GCN: $vgpr0 = COPY [[TRUNC]](s32) ; GCN: SI_RETURN_TO_EPILOG $vgpr0 %0:_(<2 x s32>) = G_IMPLICIT_DEF %1:_(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir index f0e2698e52f20..7257357eab8ec 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir @@ -58,14 +58,12 @@ body: | ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK: 
[[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: .1: ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %11, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %2(<4 x s32>), %bb.1 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %9, %bb.1 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -105,14 +103,12 @@ body: | ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: .1: ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %10, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %2(<4 x s32>), %bb.1 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %8, %bb.1 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll index 670c9898c2798..9e051458ccd19 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck %s -; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s +; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s --check-prefix=GREEDY ; Natural mapping define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) { @@ -18,6 +18,20 @@ define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffse ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = 
G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; GREEDY-LABEL: name: s_buffer_load_i32 + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 4) + ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_S_BUFFER_LOAD]](s32) + ; GREEDY: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; GREEDY: $sgpr0 = COPY [[INT]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret i32 %val } @@ -41,6 +55,24 @@ define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) ; CHECK: $sgpr1 = COPY [[INT1]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GREEDY-LABEL: name: s_buffer_load_v2i32 + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 8, align 4) + ; GREEDY: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>) + ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) + ; GREEDY: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; GREEDY: $sgpr0 = COPY [[INT]](s32) + ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32) + ; GREEDY: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) + ; GREEDY: $sgpr1 = COPY [[INT1]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <2 x i32> %val } @@ -58,18 +90,46 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 12, align 4) ; CHECK: [[DEF:%[0-9]+]]:sgpr(<4 x s32>) = G_IMPLICIT_DEF ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:sgpr(<12 x s32>) = G_CONCAT_VECTORS [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), [[DEF]](<4 x s32>), [[DEF]](<4 x s32>) - ; CHECK: [[UV:%[0-9]+]]:sgpr(<3 x s32>), [[UV1:%[0-9]+]]:sgpr(<3 x s32>), [[UV2:%[0-9]+]]:sgpr(<3 x s32>), 
[[UV3:%[0-9]+]]:sgpr(<3 x s32>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s32>) - ; CHECK: [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[UV]](<3 x s32>) - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32) + ; CHECK: [[BITCAST:%[0-9]+]]:sgpr(s384) = G_BITCAST [[CONCAT_VECTORS]](<12 x s32>) + ; CHECK: [[TRUNC:%[0-9]+]]:sgpr(s96) = G_TRUNC [[BITCAST]](s384) + ; CHECK: [[BITCAST1:%[0-9]+]]:sgpr(<3 x s32>) = G_BITCAST [[TRUNC]](s96) + ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[BITCAST1]](<3 x s32>) + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32) ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) ; CHECK: $sgpr1 = COPY [[INT1]](s32) - ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32) + ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32) ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32) ; CHECK: $sgpr2 = COPY [[INT2]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 + ; GREEDY-LABEL: name: s_buffer_load_v3i32 + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 12, align 4) + ; GREEDY: [[DEF:%[0-9]+]]:sgpr(<4 x s32>) = G_IMPLICIT_DEF + ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:sgpr(<12 x s32>) = G_CONCAT_VECTORS [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), [[DEF]](<4 x s32>), [[DEF]](<4 x s32>) + ; GREEDY: [[BITCAST:%[0-9]+]]:sgpr(s384) = G_BITCAST [[CONCAT_VECTORS]](<12 x s32>) + ; GREEDY: [[TRUNC:%[0-9]+]]:sgpr(s96) = G_TRUNC [[BITCAST]](s384) + ; GREEDY: [[BITCAST1:%[0-9]+]]:sgpr(<3 x s32>) = G_BITCAST [[TRUNC]](s96) + ; GREEDY: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[BITCAST1]](<3 x s32>) + ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) + ; GREEDY: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; GREEDY: $sgpr0 = COPY [[INT]](s32) + ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32) + ; GREEDY: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) + ; GREEDY: $sgpr1 = COPY [[INT1]](s32) + ; GREEDY: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32) + ; GREEDY: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32) + ; GREEDY: $sgpr2 = COPY [[INT2]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <3 x i32> %val } @@ -111,6 +171,42 @@ 
define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg ; CHECK: [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32) ; CHECK: $sgpr7 = COPY [[INT7]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 + ; GREEDY-LABEL: name: s_buffer_load_v8i32 + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 32, align 4) + ; GREEDY: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>) + ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) + ; GREEDY: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; GREEDY: $sgpr0 = COPY [[INT]](s32) + ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32) + ; GREEDY: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) + ; GREEDY: $sgpr1 = COPY [[INT1]](s32) + ; GREEDY: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32) + ; GREEDY: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32) + ; GREEDY: $sgpr2 = COPY [[INT2]](s32) + ; GREEDY: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32) + ; GREEDY: [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32) + ; GREEDY: $sgpr3 = COPY [[INT3]](s32) + ; GREEDY: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32) + ; GREEDY: [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32) + ; GREEDY: $sgpr4 = COPY [[INT4]](s32) + ; GREEDY: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32) + ; GREEDY: [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32) + ; GREEDY: $sgpr5 = COPY [[INT5]](s32) + ; GREEDY: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32) + ; GREEDY: [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32) + ; GREEDY: $sgpr6 = COPY [[INT6]](s32) + ; GREEDY: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32) + ; GREEDY: [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32) + ; GREEDY: $sgpr7 = COPY [[INT7]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x i32> %val } @@ -176,6 +272,66 @@ define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inr ; CHECK: [[INT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY20]](s32) ; CHECK: $sgpr15 = 
COPY [[INT15]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 + ; GREEDY-LABEL: name: s_buffer_load_v16i32 + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 64, align 4) + ; GREEDY: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>) + ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) + ; GREEDY: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; GREEDY: $sgpr0 = COPY [[INT]](s32) + ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32) + ; GREEDY: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) + ; GREEDY: $sgpr1 = COPY [[INT1]](s32) + ; GREEDY: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32) + ; GREEDY: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32) + ; GREEDY: $sgpr2 = COPY [[INT2]](s32) + ; GREEDY: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32) + ; GREEDY: [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32) + ; GREEDY: $sgpr3 = COPY [[INT3]](s32) + ; GREEDY: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32) + ; GREEDY: [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32) + ; GREEDY: $sgpr4 = COPY [[INT4]](s32) + ; GREEDY: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32) + ; GREEDY: [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32) + ; GREEDY: $sgpr5 = COPY [[INT5]](s32) + ; GREEDY: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32) + ; GREEDY: [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32) + ; GREEDY: $sgpr6 = COPY [[INT6]](s32) + ; GREEDY: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32) + ; GREEDY: [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32) + ; GREEDY: $sgpr7 = COPY [[INT7]](s32) + ; GREEDY: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[UV8]](s32) + ; GREEDY: [[INT8:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY13]](s32) + ; GREEDY: $sgpr8 = COPY [[INT8]](s32) + ; GREEDY: [[COPY14:%[0-9]+]]:vgpr(s32) = COPY [[UV9]](s32) + ; GREEDY: [[INT9:%[0-9]+]]:sgpr(s32) = G_INTRINSIC 
intrinsic(@llvm.amdgcn.readfirstlane), [[COPY14]](s32) + ; GREEDY: $sgpr9 = COPY [[INT9]](s32) + ; GREEDY: [[COPY15:%[0-9]+]]:vgpr(s32) = COPY [[UV10]](s32) + ; GREEDY: [[INT10:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY15]](s32) + ; GREEDY: $sgpr10 = COPY [[INT10]](s32) + ; GREEDY: [[COPY16:%[0-9]+]]:vgpr(s32) = COPY [[UV11]](s32) + ; GREEDY: [[INT11:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY16]](s32) + ; GREEDY: $sgpr11 = COPY [[INT11]](s32) + ; GREEDY: [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[UV12]](s32) + ; GREEDY: [[INT12:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY17]](s32) + ; GREEDY: $sgpr12 = COPY [[INT12]](s32) + ; GREEDY: [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[UV13]](s32) + ; GREEDY: [[INT13:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY18]](s32) + ; GREEDY: $sgpr13 = COPY [[INT13]](s32) + ; GREEDY: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[UV14]](s32) + ; GREEDY: [[INT14:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY19]](s32) + ; GREEDY: $sgpr14 = COPY [[INT14]](s32) + ; GREEDY: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[UV15]](s32) + ; GREEDY: [[INT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY20]](s32) + ; GREEDY: $sgpr15 = COPY [[INT15]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 %val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <16 x i32> %val } @@ -196,6 +352,20 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_offset + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 4) + ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret float %val } @@ -217,6 +387,22 @@ define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %r ; CHECK: $vgpr0 = COPY [[UV]](s32) ; CHECK: $vgpr1 = COPY [[UV1]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; GREEDY-LABEL: name: s_buffer_load_v2f32_vgpr_offset + ; GREEDY: bb.1 (%ir-block.0): + ; 
GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 8, align 4)
+ ; GREEDY: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<2 x s32>)
+ ; GREEDY: $vgpr0 = COPY [[UV]](s32)
+ ; GREEDY: $vgpr1 = COPY [[UV1]](s32)
+ ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
 %val = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
 ret <2 x float> %val
 }
@@ -238,12 +424,38 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r
 ; CHECK: [[COPY5:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[DEF]](<4 x s32>)
 ; CHECK: [[COPY6:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[DEF]](<4 x s32>)
 ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<12 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[COPY5]](<4 x s32>), [[COPY6]](<4 x s32>)
- ; CHECK: [[UV:%[0-9]+]]:vgpr(<3 x s32>), [[UV1:%[0-9]+]]:vgpr(<3 x s32>), [[UV2:%[0-9]+]]:vgpr(<3 x s32>), [[UV3:%[0-9]+]]:vgpr(<3 x s32>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s32>)
- ; CHECK: [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV]](<3 x s32>)
- ; CHECK: $vgpr0 = COPY [[UV4]](s32)
- ; CHECK: $vgpr1 = COPY [[UV5]](s32)
- ; CHECK: $vgpr2 = COPY [[UV6]](s32)
+ ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(s384) = G_BITCAST [[CONCAT_VECTORS]](<12 x s32>)
+ ; CHECK: [[TRUNC:%[0-9]+]]:vgpr(s96) = G_TRUNC [[BITCAST]](s384)
+ ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<3 x s32>) = G_BITCAST [[TRUNC]](s96)
+ ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BITCAST1]](<3 x s32>)
+ ; CHECK: $vgpr0 = COPY [[UV]](s32)
+ ; CHECK: $vgpr1 = COPY [[UV1]](s32)
+ ; CHECK: $vgpr2 = COPY [[UV2]](s32)
 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ; GREEDY-LABEL: name: s_buffer_load_v3f32_vgpr_offset
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[DEF:%[0-9]+]]:sgpr(<4 x s32>) = G_IMPLICIT_DEF
+ ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[DEF]](<4 x s32>)
+ ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[DEF]](<4 x s32>)
+ ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<12 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[COPY5]](<4 x s32>), [[COPY6]](<4 x s32>)
+ ; GREEDY: [[BITCAST:%[0-9]+]]:vgpr(s384) = G_BITCAST [[CONCAT_VECTORS]](<12 x s32>)
+ ; GREEDY: [[TRUNC:%[0-9]+]]:vgpr(s96) = G_TRUNC [[BITCAST]](s384)
+ ; GREEDY: [[BITCAST1:%[0-9]+]]:vgpr(<3 x s32>) = G_BITCAST [[TRUNC]](s96)
+ ; GREEDY: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BITCAST1]](<3 x s32>)
+ ; GREEDY: $vgpr0 = COPY [[UV]](s32)
+ ; GREEDY: $vgpr1 = COPY [[UV1]](s32)
+ ; GREEDY: $vgpr2 = COPY [[UV2]](s32)
+ ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
 %val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
 ret <3 x float> %val
 }
@@ -267,6 +479,24 @@ define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %r
 ; CHECK: $vgpr2 = COPY [[UV2]](s32)
 ; CHECK: $vgpr3 = COPY [[UV3]](s32)
 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ; GREEDY-LABEL: name: s_buffer_load_v4f32_vgpr_offset
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>)
+ ; GREEDY: $vgpr0 = COPY [[UV]](s32)
+ ; GREEDY: $vgpr1 = COPY [[UV1]](s32)
+ ; GREEDY: $vgpr2 = COPY [[UV2]](s32)
+ ; GREEDY: $vgpr3 = COPY [[UV3]](s32)
+ ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 %val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
 ret <4 x float> %val
 }
@@ -296,6 +526,30 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %r
 ; CHECK: $vgpr6 = COPY [[UV6]](s32)
 ; CHECK: $vgpr7 = COPY [[UV7]](s32)
 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ; GREEDY-LABEL: name: s_buffer_load_v8f32_vgpr_offset
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
+ ; GREEDY: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
+ ; GREEDY: $vgpr0 = COPY [[UV]](s32)
+ ; GREEDY: $vgpr1 = COPY [[UV1]](s32)
+ ; GREEDY: $vgpr2 = COPY [[UV2]](s32)
+ ; GREEDY: $vgpr3 = COPY [[UV3]](s32)
+ ; GREEDY: $vgpr4 = COPY [[UV4]](s32)
+ ; GREEDY: $vgpr5 = COPY [[UV5]](s32)
+ ; GREEDY: $vgpr6 = COPY [[UV6]](s32)
+ ; GREEDY: $vgpr7 = COPY [[UV7]](s32)
+ ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
 ret <8 x float> %val
 }
@@ -335,6 +589,40 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg
 ; CHECK: $vgpr14 = COPY [[UV14]](s32)
 ; CHECK: $vgpr15 = COPY [[UV15]](s32)
 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
+ ; GREEDY-LABEL: name: s_buffer_load_v16f32_vgpr_offset
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
+ ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
+ ; GREEDY: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
+ ; GREEDY: $vgpr0 = COPY [[UV]](s32)
+ ; GREEDY: $vgpr1 = COPY [[UV1]](s32)
+ ; GREEDY: $vgpr2 = COPY [[UV2]](s32)
+ ; GREEDY: $vgpr3 = COPY [[UV3]](s32)
+ ; GREEDY: $vgpr4 = COPY [[UV4]](s32)
+ ; GREEDY: $vgpr5 = COPY [[UV5]](s32)
+ ; GREEDY: $vgpr6 = COPY [[UV6]](s32)
+ ; GREEDY: $vgpr7 = COPY [[UV7]](s32)
+ ; GREEDY: $vgpr8 = COPY [[UV8]](s32)
+ ; GREEDY: $vgpr9 = COPY [[UV9]](s32)
+ ; GREEDY: $vgpr10 = COPY [[UV10]](s32)
+ ; GREEDY: $vgpr11 = COPY [[UV11]](s32)
+ ; GREEDY: $vgpr12 = COPY [[UV12]](s32)
+ ; GREEDY: $vgpr13 = COPY [[UV13]](s32)
+ ; GREEDY: $vgpr14 = COPY [[UV14]](s32)
+ ; GREEDY: $vgpr15 = COPY [[UV15]](s32)
+ ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
 %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
 ret <16 x float> %val
 }
@@ -356,6 +644,22 @@ define amdgpu_ps void @s_buffer_load_i96_vgpr_offset(<4 x i32> inreg %rsrc, i32
 ; CHECK: [[TRUNC:%[0-9]+]]:vgpr(s96) = G_TRUNC [[AMDGPU_BUFFER_LOAD]](s128)
 ; CHECK: G_STORE [[TRUNC]](s96), [[DEF]](p1) :: (store 12 into `i96 addrspace(1)* undef`, align 8, addrspace 1)
 ; CHECK: S_ENDPGM 0
+ ; GREEDY-LABEL: name: s_buffer_load_i96_vgpr_offset
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[TRUNC:%[0-9]+]]:vgpr(s96) = G_TRUNC [[AMDGPU_BUFFER_LOAD]](s128)
+ ; GREEDY: G_STORE [[TRUNC]](s96), [[DEF]](p1) :: (store 12 into `i96 addrspace(1)* undef`, align 8, addrspace 1)
+ ; GREEDY: S_ENDPGM 0
 %val = call i96 @llvm.amdgcn.s.buffer.load.i96(<4 x i32> %rsrc, i32 %soffset, i32 0)
 store i96 %val, i96 addrspace(1)* undef
 ret void
@@ -384,6 +688,27 @@ define amdgpu_ps void @s_buffer_load_i256_vgpr_offset(<4 x i32> inreg %rsrc, i32
 ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
 ; CHECK: G_STORE [[UV1]](s128), [[PTR_ADD]](p1) :: (store 16 into `i256 addrspace(1)* undef` + 16, align 8, addrspace 1)
 ; CHECK: S_ENDPGM 0
+ ; GREEDY-LABEL: name: s_buffer_load_i256_vgpr_offset
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128)
+ ; GREEDY: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s256)
+ ; GREEDY: G_STORE [[UV]](s128), [[DEF]](p1) :: (store 16 into `i256 addrspace(1)* undef`, align 8, addrspace 1)
+ ; GREEDY: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GREEDY: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
+ ; GREEDY: G_STORE [[UV1]](s128), [[PTR_ADD]](p1) :: (store 16 into `i256 addrspace(1)* undef` + 16, align 8, addrspace 1)
+ ; GREEDY: S_ENDPGM 0
 %val = call i256 @llvm.amdgcn.s.buffer.load.i256(<4 x i32> %rsrc, i32 %soffset, i32 0)
 store i256 %val, i256 addrspace(1)* undef
 ret void
@@ -420,6 +745,35 @@ define amdgpu_ps void @s_buffer_load_i512_vgpr_offset(<4 x i32> inreg %rsrc, i32
 ; CHECK: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
 ; CHECK: G_STORE [[UV3]](s128), [[PTR_ADD2]](p1) :: (store 16 into `i512 addrspace(1)* undef` + 48, align 8, addrspace 1)
 ; CHECK: S_ENDPGM 0
+ ; GREEDY-LABEL: name: s_buffer_load_i512_vgpr_offset
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
+ ; GREEDY: [[MV:%[0-9]+]]:vgpr(s512) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128), [[AMDGPU_BUFFER_LOAD2]](s128), [[AMDGPU_BUFFER_LOAD3]](s128)
+ ; GREEDY: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128), [[UV2:%[0-9]+]]:vgpr(s128), [[UV3:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s512)
+ ; GREEDY: G_STORE [[UV]](s128), [[DEF]](p1) :: (store 16 into `i512 addrspace(1)* undef`, align 8, addrspace 1)
+ ; GREEDY: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GREEDY: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
+ ; GREEDY: G_STORE [[UV1]](s128), [[PTR_ADD]](p1) :: (store 16 into `i512 addrspace(1)* undef` + 16, align 8, addrspace 1)
+ ; GREEDY: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
+ ; GREEDY: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
+ ; GREEDY: G_STORE [[UV2]](s128), [[PTR_ADD1]](p1) :: (store 16 into `i512 addrspace(1)* undef` + 32, align 8, addrspace 1)
+ ; GREEDY: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
+ ; GREEDY: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
+ ; GREEDY: G_STORE [[UV3]](s128), [[PTR_ADD2]](p1) :: (store 16 into `i512 addrspace(1)* undef` + 48, align 8, addrspace 1)
+ ; GREEDY: S_ENDPGM 0
 %val = call i512 @llvm.amdgcn.s.buffer.load.i512(<4 x i32> %rsrc, i32 %soffset, i32 0)
 store i512 %val, i512 addrspace(1)* undef
 ret void
@@ -448,6 +802,27 @@ define amdgpu_ps void @s_buffer_load_v16i16_vgpr_offset(<4 x i32> inreg %rsrc, i
 ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
 ; CHECK: G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store 16 into `<16 x i16> addrspace(1)* undef` + 16, align 32, addrspace 1)
 ; CHECK: S_ENDPGM 0
+ ; GREEDY-LABEL: name: s_buffer_load_v16i16_vgpr_offset
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>)
+ ; GREEDY: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>)
+ ; GREEDY: G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store 16 into `<16 x i16> addrspace(1)* undef`, align 32, addrspace 1)
+ ; GREEDY: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GREEDY: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
+ ; GREEDY: G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store 16 into `<16 x i16> addrspace(1)* undef` + 16, align 32, addrspace 1)
+ ; GREEDY: S_ENDPGM 0
 %val = call <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(<4 x i32> %rsrc, i32 %soffset, i32 0)
 store <16 x i16> %val, <16 x i16> addrspace(1)* undef
 ret void
@@ -484,6 +859,35 @@ define amdgpu_ps void @s_buffer_load_v32i16_vgpr_offset(<4 x i32> inreg %rsrc, i
 ; CHECK: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
 ; CHECK: G_STORE [[UV3]](<8 x s16>), [[PTR_ADD2]](p1) :: (store 16 into `<32 x i16> addrspace(1)* undef` + 48, align 64, addrspace 1)
 ; CHECK: S_ENDPGM 0
+ ; GREEDY-LABEL: name: s_buffer_load_v32i16_vgpr_offset
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
+ ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<32 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>), [[AMDGPU_BUFFER_LOAD2]](<8 x s16>), [[AMDGPU_BUFFER_LOAD3]](<8 x s16>)
+ ; GREEDY: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>), [[UV2:%[0-9]+]]:vgpr(<8 x s16>), [[UV3:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<32 x s16>)
+ ; GREEDY: G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store 16 into `<32 x i16> addrspace(1)* undef`, align 64, addrspace 1)
+ ; GREEDY: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GREEDY: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
+ ; GREEDY: G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store 16 into `<32 x i16> addrspace(1)* undef` + 16, align 64, addrspace 1)
+ ; GREEDY: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
+ ; GREEDY: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
+ ; GREEDY: G_STORE [[UV2]](<8 x s16>), [[PTR_ADD1]](p1) :: (store 16 into `<32 x i16> addrspace(1)* undef` + 32, align 64, addrspace 1)
+ ; GREEDY: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
+ ; GREEDY: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
+ ; GREEDY: G_STORE [[UV3]](<8 x s16>), [[PTR_ADD2]](p1) :: (store 16 into `<32 x i16> addrspace(1)* undef` + 48, align 64, addrspace 1)
+ ; GREEDY: S_ENDPGM 0
 %val = call <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(<4 x i32> %rsrc, i32 %soffset, i32 0)
 store <32 x i16> %val, <32 x i16> addrspace(1)* undef
 ret void
@@ -512,6 +916,27 @@ define amdgpu_ps void @s_buffer_load_v4i64_vgpr_offset(<4 x i32> inreg %rsrc, i3
 ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
 ; CHECK: G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store 16 into `<4 x i64> addrspace(1)* undef` + 16, align 32, addrspace 1)
 ; CHECK: S_ENDPGM 0
+ ; GREEDY-LABEL: name: s_buffer_load_v4i64_vgpr_offset
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>)
+ ; GREEDY: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>)
+ ; GREEDY: G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store 16 into `<4 x i64> addrspace(1)* undef`, align 32, addrspace 1)
+ ; GREEDY: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GREEDY: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
+ ; GREEDY: G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store 16 into `<4 x i64> addrspace(1)* undef` + 16, align 32, addrspace 1)
+ ; GREEDY: S_ENDPGM 0
 %val = call <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(<4 x i32> %rsrc, i32 %soffset, i32 0)
 store <4 x i64> %val, <4 x i64> addrspace(1)* undef
 ret void
@@ -548,6 +973,35 @@ define amdgpu_ps void @s_buffer_load_v8i64_vgpr_offset(<4 x i32> inreg %rsrc, i3
 ; CHECK: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
 ; CHECK: G_STORE [[UV3]](<2 x s64>), [[PTR_ADD2]](p1) :: (store 16 into `<8 x i64> addrspace(1)* undef` + 48, align 64, addrspace 1)
 ; CHECK: S_ENDPGM 0
+ ; GREEDY-LABEL: name: s_buffer_load_v8i64_vgpr_offset
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
+ ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>), [[AMDGPU_BUFFER_LOAD2]](<2 x s64>), [[AMDGPU_BUFFER_LOAD3]](<2 x s64>)
+ ; GREEDY: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>), [[UV2:%[0-9]+]]:vgpr(<2 x s64>), [[UV3:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>)
+ ; GREEDY: G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store 16 into `<8 x i64> addrspace(1)* undef`, align 64, addrspace 1)
+ ; GREEDY: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GREEDY: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
+ ; GREEDY: G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store 16 into `<8 x i64> addrspace(1)* undef` + 16, align 64, addrspace 1)
+ ; GREEDY: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
+ ; GREEDY: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
+ ; GREEDY: G_STORE [[UV2]](<2 x s64>), [[PTR_ADD1]](p1) :: (store 16 into `<8 x i64> addrspace(1)* undef` + 32, align 64, addrspace 1)
+ ; GREEDY: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
+ ; GREEDY: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
+ ; GREEDY: G_STORE [[UV3]](<2 x s64>), [[PTR_ADD2]](p1) :: (store 16 into `<8 x i64> addrspace(1)* undef` + 48, align 64, addrspace 1)
+ ; GREEDY: S_ENDPGM 0
 %val = call <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(<4 x i32> %rsrc, i32 %soffset, i32 0)
 store <8 x i64> %val, <8 x i64> addrspace(1)* undef
 ret void
@@ -576,6 +1030,27 @@ define amdgpu_ps void @s_buffer_load_v4p1_vgpr_offset(<4 x i32> inreg %rsrc, i32
 ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
 ; CHECK: G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store 16 into `<4 x i8 addrspace(1)*> addrspace(1)* undef` + 16, align 32, addrspace 1)
 ; CHECK: S_ENDPGM 0
+ ; GREEDY-LABEL: name: s_buffer_load_v4p1_vgpr_offset
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>)
+ ; GREEDY: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x p1>)
+ ; GREEDY: G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store 16 into `<4 x i8 addrspace(1)*> addrspace(1)* undef`, align 32, addrspace 1)
+ ; GREEDY: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GREEDY: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
+ ; GREEDY: G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store 16 into `<4 x i8 addrspace(1)*> addrspace(1)* undef` + 16, align 32, addrspace 1)
+ ; GREEDY: S_ENDPGM 0
 %val = call <4 x i8 addrspace(1)*> @llvm.amdgcn.s.buffer.load.v4p1i8(<4 x i32> %rsrc, i32 %soffset, i32 0)
 store <4 x i8 addrspace(1)*> %val, <4 x i8 addrspace(1)*> addrspace(1)* undef
 ret void
@@ -612,6 +1087,35 @@ define amdgpu_ps void @s_buffer_load_v8p1_vgpr_offset(<4 x i32> inreg %rsrc, i32
 ; CHECK: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
 ; CHECK: G_STORE [[UV3]](<2 x p1>), [[PTR_ADD2]](p1) :: (store 16 into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 48, align 64, addrspace 1)
 ; CHECK: S_ENDPGM 0
+ ; GREEDY-LABEL: name: s_buffer_load_v8p1_vgpr_offset
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
+ ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>), [[AMDGPU_BUFFER_LOAD2]](<2 x p1>), [[AMDGPU_BUFFER_LOAD3]](<2 x p1>)
+ ; GREEDY: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>), [[UV2:%[0-9]+]]:vgpr(<2 x p1>), [[UV3:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x p1>)
+ ; GREEDY: G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store 16 into `<8 x i8 addrspace(1)*> addrspace(1)* undef`, align 64, addrspace 1)
+ ; GREEDY: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GREEDY: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
+ ; GREEDY: G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store 16 into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 16, align 64, addrspace 1)
+ ; GREEDY: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
+ ; GREEDY: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
+ ; GREEDY: G_STORE [[UV2]](<2 x p1>), [[PTR_ADD1]](p1) :: (store 16 into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 32, align 64, addrspace 1)
+ ; GREEDY: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
+ ; GREEDY: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
+ ; GREEDY: G_STORE [[UV3]](<2 x p1>), [[PTR_ADD2]](p1) :: (store 16 into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 48, align 64, addrspace 1)
+ ; GREEDY: S_ENDPGM 0
 %val = call <8 x i8 addrspace(1)*> @llvm.amdgcn.s.buffer.load.v8p1i8(<4 x i32> %rsrc, i32 %soffset, i32 0)
 store <8 x i8 addrspace(1)*> %val, <8 x i8 addrspace(1)*> addrspace(1)* undef
 ret void
@@ -635,6 +1139,23 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg %
 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load 4)
 ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092
+ ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
+ ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load 4)
+ ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0
 %soffset = add i32 %soffset.base, 4092
 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
 ret float %val
@@ -658,6 +1179,23 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg %
 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load 4)
 ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
+ ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
+ ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load 4)
+ ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0
 %soffset = add i32 %soffset.base, 4095
 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
 ret float %val
@@ -680,6 +1218,22 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg %
 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 4)
 ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
+ ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
+ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 4)
+ ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0
 %soffset = add i32 %soffset.base, 4096
 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
 ret float %val
@@ -714,6 +1268,33 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32>
 ; CHECK: $vgpr6 = COPY [[UV6]](s32)
 ; CHECK: $vgpr7 = COPY [[UV7]](s32)
 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ; GREEDY-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4064
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
+ ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
+ ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
+ ; GREEDY: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
+ ; GREEDY: $vgpr0 = COPY [[UV]](s32)
+ ; GREEDY: $vgpr1 = COPY [[UV1]](s32)
+ ; GREEDY: $vgpr2 = COPY [[UV2]](s32)
+ ; GREEDY: $vgpr3 = COPY [[UV3]](s32)
+ ; GREEDY: $vgpr4 = COPY [[UV4]](s32)
+ ; GREEDY: $vgpr5 = COPY [[UV5]](s32)
+ ; GREEDY: $vgpr6 = COPY [[UV6]](s32)
+ ; GREEDY: $vgpr7 = COPY [[UV7]](s32)
+ ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
 %soffset = add i32 %soffset.base, 4064
 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
 ret <8 x float> %val
@@ -747,6 +1328,32 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32>
 ; CHECK: $vgpr6 = COPY [[UV6]](s32)
 ; CHECK: $vgpr7 = COPY [[UV7]](s32)
 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ; GREEDY-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4068
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068
+ ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
+ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
+ ; GREEDY: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
+ ; GREEDY: $vgpr0 = COPY [[UV]](s32)
+ ; GREEDY: $vgpr1 = COPY [[UV1]](s32)
+ ; GREEDY: $vgpr2 = COPY [[UV2]](s32)
+ ; GREEDY: $vgpr3 = COPY [[UV3]](s32)
+ ; GREEDY: $vgpr4 = COPY [[UV4]](s32)
+ ; GREEDY: $vgpr5 = COPY [[UV5]](s32)
+ ; GREEDY: $vgpr6 = COPY [[UV6]](s32)
+ ; GREEDY: $vgpr7 = COPY [[UV7]](s32)
+ ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
 %soffset = add i32 %soffset.base, 4068
 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
 ret <8 x float> %val
@@ -790,6 +1397,43 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i3
 ; CHECK: $vgpr14 = COPY [[UV14]](s32)
 ; CHECK: $vgpr15 = COPY [[UV15]](s32)
 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
+ ; GREEDY-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4032
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4032
+ ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
+ ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4048, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
+ ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
+ ; GREEDY: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
+ ; GREEDY: $vgpr0 = COPY [[UV]](s32)
+ ; GREEDY: $vgpr1 = COPY [[UV1]](s32)
+ ; GREEDY: $vgpr2 = COPY [[UV2]](s32)
+ ; GREEDY: $vgpr3 = COPY [[UV3]](s32)
+ ; GREEDY: $vgpr4 = COPY [[UV4]](s32)
+ ; GREEDY: $vgpr5 = COPY [[UV5]](s32)
+ ; GREEDY: $vgpr6 = COPY [[UV6]](s32)
+ ; GREEDY: $vgpr7 = COPY [[UV7]](s32)
+ ; GREEDY: $vgpr8 = COPY [[UV8]](s32)
+ ; GREEDY: $vgpr9 = COPY [[UV9]](s32)
+ ; GREEDY: $vgpr10 = COPY [[UV10]](s32)
+ ; GREEDY: $vgpr11 = COPY [[UV11]](s32)
+ ; GREEDY: $vgpr12 = COPY [[UV12]](s32)
+ ; GREEDY: $vgpr13 = COPY [[UV13]](s32)
+ ; GREEDY: $vgpr14 = COPY [[UV14]](s32)
+ ; GREEDY: $vgpr15 = COPY [[UV15]](s32)
+ ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
 %soffset = add i32 %soffset.base, 4032
 %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
 ret <16 x float> %val
@@ -832,6 +1476,42 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i3
 ; CHECK: $vgpr14 = COPY [[UV14]](s32)
 ; CHECK: $vgpr15 = COPY [[UV15]](s32)
 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
+ ; GREEDY-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4036
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4036
+ ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
+ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
+ ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
+ ; GREEDY: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
+ ; GREEDY: $vgpr0 = COPY [[UV]](s32)
+ ; GREEDY: $vgpr1 = COPY [[UV1]](s32)
+ ; GREEDY: $vgpr2 = COPY [[UV2]](s32)
+ ; GREEDY: $vgpr3 = COPY [[UV3]](s32)
+ ; GREEDY: $vgpr4 = COPY [[UV4]](s32)
+ ; GREEDY: $vgpr5 = COPY [[UV5]](s32)
+ ; GREEDY: $vgpr6 = COPY [[UV6]](s32)
+ ; GREEDY: $vgpr7 = COPY [[UV7]](s32)
+ ; GREEDY: $vgpr8 = COPY [[UV8]](s32)
+ ; GREEDY: $vgpr9 = COPY [[UV9]](s32)
+ ; GREEDY: $vgpr10 = COPY [[UV10]](s32)
+ ; GREEDY: $vgpr11 = COPY [[UV11]](s32)
+ ; GREEDY: $vgpr12 = COPY [[UV12]](s32)
+ ; GREEDY: $vgpr13 = COPY [[UV13]](s32)
+ ; GREEDY: $vgpr14 = COPY [[UV14]](s32)
+ ; GREEDY: $vgpr15 = COPY [[UV15]](s32)
+ ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
 %soffset = add i32 %soffset.base, 4036
 %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
 ret <16 x float> %val
@@ -878,6 +1558,45 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %
 ; CHECK: bb.4:
 ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_rsrc
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+ ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+ ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32)
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
+ ; GREEDY: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
+ ; GREEDY: bb.2:
+ ; GREEDY: successors: %bb.3, %bb.2
+ ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.2
+ ; GREEDY: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %8(s32), %bb.2
+ ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
+ ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
+ ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
+ ; GREEDY: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
+ ; GREEDY: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec
+ ; GREEDY: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec
+ ; GREEDY: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; GREEDY: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
+ ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
+ ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY5]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 4)
+ ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+ ; GREEDY: S_CBRANCH_EXECNZ %bb.2, implicit $exec
+ ; GREEDY: bb.3:
+ ; GREEDY: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
+ ; GREEDY: bb.4:
+ ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0
 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
 ret float %val
 }
@@ -924,6 +1643,46 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %
 ; CHECK: bb.4:
 ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4092
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+ ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+ ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092
+ ; GREEDY: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
+ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
+ ; GREEDY: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
+ ; GREEDY: bb.2:
+ ; GREEDY: successors: %bb.3, %bb.2
+ ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %18, %bb.2
+ ; GREEDY: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %10(s32), %bb.2
+ ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
+ ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
+ ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
+ ; GREEDY: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
+ ; GREEDY: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec
+ ; GREEDY: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec
+ ; GREEDY: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; GREEDY: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
+ ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
+ ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4092, 0, 0 :: (dereferenceable invariant load 4)
+ ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+ ; GREEDY: S_CBRANCH_EXECNZ %bb.2, implicit $exec
+ ; GREEDY: bb.3:
+ ; GREEDY: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
+ ; GREEDY: bb.4:
+ ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0
 %soffset = add i32 %soffset.base, 4092
 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
 ret float %val
@@ -972,6 +1731,47 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %
 ; CHECK: bb.4:
 ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4096
+ ; GREEDY: bb.1 (%ir-block.0):
+ ; GREEDY: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+ ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+ ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
+ ; GREEDY: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
+ ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
+ ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
+ ; GREEDY: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
+ ; GREEDY: bb.2:
+ ; GREEDY: successors: %bb.3, %bb.2
+ ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %19, %bb.2
+ ; GREEDY: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %10(s32), %bb.2
+ ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
+ ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
+ ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
+ ; GREEDY: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
+ ; GREEDY: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec
+ ; GREEDY: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec
+ ; GREEDY: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; GREEDY: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load 4) + ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GREEDY: bb.3: + ; GREEDY: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GREEDY: bb.4: + ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret float %val @@ -1018,6 +1818,45 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; CHECK: bb.4: ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4095 + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095 + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; GREEDY: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GREEDY: bb.2: + ; GREEDY: successors: %bb.3, %bb.2 + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.2 + ; GREEDY: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %7(s32), %bb.2 + ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; GREEDY: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; 
GREEDY: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load 4 + 4095, align 1) + ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GREEDY: bb.3: + ; GREEDY: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GREEDY: bb.4: + ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4095, i32 0) ret float %val } @@ -1063,6 +1902,45 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; CHECK: bb.4: ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4096 + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 + ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; GREEDY: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GREEDY: bb.2: + ; GREEDY: successors: %bb.3, %bb.2 + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.2 + ; GREEDY: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %7(s32), %bb.2 + ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; GREEDY: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load 4) + ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GREEDY: bb.3: + ; GREEDY: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GREEDY: bb.4: + ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 0) ret float %val } @@ -1083,16 +1961,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3, %bb.2 - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 - ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -1122,6 +1996,54 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; CHECK: $vgpr6 = COPY [[UV8]](s32) ; CHECK: $vgpr7 = COPY [[UV9]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; GREEDY-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4064 + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: 
[[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 + ; GREEDY: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GREEDY: bb.2: + ; GREEDY: successors: %bb.3, %bb.2 + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; GREEDY: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4064, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4080, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GREEDY: bb.3: + ; GREEDY: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GREEDY: bb.4: + ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; GREEDY: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GREEDY: $vgpr0 = COPY [[UV2]](s32) + ; GREEDY: $vgpr1 = COPY [[UV3]](s32) + ; GREEDY: $vgpr2 = COPY [[UV4]](s32) + ; GREEDY: $vgpr3 = COPY [[UV5]](s32) + ; GREEDY: $vgpr4 = COPY [[UV6]](s32) + ; GREEDY: $vgpr5 = COPY [[UV7]](s32) + ; GREEDY: $vgpr6 = COPY [[UV8]](s32) + ; GREEDY: $vgpr7 = COPY [[UV9]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit 
$vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4064 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -1144,16 +2066,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3, %bb.2 - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %31, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 - ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -1183,6 +2101,55 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; CHECK: $vgpr6 = COPY [[UV8]](s32) ; CHECK: $vgpr7 = COPY [[UV9]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; GREEDY-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4068 + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068 + ; GREEDY: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GREEDY: bb.2: + ; GREEDY: successors: %bb.3, %bb.2 + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2 + ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; 
GREEDY: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; GREEDY: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GREEDY: bb.3: + ; GREEDY: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GREEDY: bb.4: + ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; GREEDY: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GREEDY: $vgpr0 = COPY [[UV2]](s32) + ; GREEDY: $vgpr1 = COPY [[UV3]](s32) + ; GREEDY: $vgpr2 = COPY [[UV4]](s32) + ; GREEDY: $vgpr3 = COPY [[UV5]](s32) + ; GREEDY: $vgpr4 = COPY [[UV6]](s32) + ; GREEDY: $vgpr5 = COPY [[UV7]](s32) + ; GREEDY: $vgpr6 = COPY [[UV8]](s32) + ; GREEDY: $vgpr7 = COPY [[UV9]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4068 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -1203,16 +2170,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3, %bb.2 - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], 
%bb.1, %31, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 - ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -1242,6 +2205,55 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; CHECK: $vgpr6 = COPY [[UV8]](s32) ; CHECK: $vgpr7 = COPY [[UV9]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; GREEDY-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4096 + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 + ; GREEDY: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GREEDY: bb.2: + ; GREEDY: successors: %bb.3, %bb.2 + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2 + ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; GREEDY: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x 
s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GREEDY: bb.3: + ; GREEDY: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GREEDY: bb.4: + ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; GREEDY: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GREEDY: $vgpr0 = COPY [[UV2]](s32) + ; GREEDY: $vgpr1 = COPY [[UV3]](s32) + ; GREEDY: $vgpr2 = COPY [[UV4]](s32) + ; GREEDY: $vgpr3 = COPY [[UV5]](s32) + ; GREEDY: $vgpr4 = COPY [[UV6]](s32) + ; GREEDY: $vgpr5 = COPY [[UV7]](s32) + ; GREEDY: $vgpr6 = COPY [[UV8]](s32) + ; GREEDY: $vgpr7 = COPY [[UV9]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4096 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -1261,16 +2273,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3, %bb.2 - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 - ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -1300,6 +2308,54 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; CHECK: $vgpr6 = COPY [[UV8]](s32) ; CHECK: $vgpr7 = COPY [[UV9]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit 
$vgpr6, implicit $vgpr7 + ; GREEDY-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5000 + ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GREEDY: bb.2: + ; GREEDY: successors: %bb.3, %bb.2 + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; GREEDY: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GREEDY: bb.3: + ; GREEDY: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GREEDY: bb.4: + ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; GREEDY: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), 
[[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GREEDY: $vgpr0 = COPY [[UV2]](s32) + ; GREEDY: $vgpr1 = COPY [[UV3]](s32) + ; GREEDY: $vgpr2 = COPY [[UV4]](s32) + ; GREEDY: $vgpr3 = COPY [[UV5]](s32) + ; GREEDY: $vgpr4 = COPY [[UV6]](s32) + ; GREEDY: $vgpr5 = COPY [[UV7]](s32) + ; GREEDY: $vgpr6 = COPY [[UV8]](s32) + ; GREEDY: $vgpr7 = COPY [[UV9]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %offset.base, 5000 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -1319,16 +2375,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3, %bb.2 - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 - ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -1358,6 +2410,54 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; CHECK: $vgpr6 = COPY [[UV8]](s32) ; CHECK: $vgpr7 = COPY [[UV9]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; GREEDY-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4076 + ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x 
s32>) + ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GREEDY: bb.2: + ; GREEDY: successors: %bb.3, %bb.2 + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; GREEDY: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GREEDY: bb.3: + ; GREEDY: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GREEDY: bb.4: + ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; GREEDY: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GREEDY: $vgpr0 = COPY [[UV2]](s32) + ; GREEDY: $vgpr1 = COPY [[UV3]](s32) + ; GREEDY: $vgpr2 = COPY [[UV4]](s32) + ; GREEDY: $vgpr3 = COPY [[UV5]](s32) + ; GREEDY: $vgpr4 = COPY [[UV6]](s32) + ; GREEDY: $vgpr5 = COPY [[UV7]](s32) + ; GREEDY: $vgpr6 = COPY [[UV8]](s32) + ; GREEDY: $vgpr7 = COPY [[UV9]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %offset.base, 4076 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -1377,16 +2477,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] 
; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3, %bb.2 - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 - ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -1416,6 +2512,54 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; CHECK: $vgpr6 = COPY [[UV8]](s32) ; CHECK: $vgpr7 = COPY [[UV9]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; GREEDY-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4080 + ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GREEDY: bb.2: + ; GREEDY: successors: %bb.3, %bb.2 + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; GREEDY: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES 
[[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GREEDY: bb.3: + ; GREEDY: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GREEDY: bb.4: + ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; GREEDY: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GREEDY: $vgpr0 = COPY [[UV2]](s32) + ; GREEDY: $vgpr1 = COPY [[UV3]](s32) + ; GREEDY: $vgpr2 = COPY [[UV4]](s32) + ; GREEDY: $vgpr3 = COPY [[UV5]](s32) + ; GREEDY: $vgpr4 = COPY [[UV6]](s32) + ; GREEDY: $vgpr5 = COPY [[UV7]](s32) + ; GREEDY: $vgpr6 = COPY [[UV8]](s32) + ; GREEDY: $vgpr7 = COPY [[UV9]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %offset.base, 4080 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -1434,16 +2578,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3, %bb.2 - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 - ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -1473,6 +2613,53 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; CHECK: $vgpr6 = COPY [[UV8]](s32) ; CHECK: $vgpr7 = COPY [[UV9]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; GREEDY-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064 + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GREEDY: bb.2: + ; GREEDY: successors: %bb.3, %bb.2 + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; GREEDY: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load 16 + 4064, align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load 16 + 4064, align 4) + ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY: $exec 
= S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GREEDY: bb.3: + ; GREEDY: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GREEDY: bb.4: + ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; GREEDY: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GREEDY: $vgpr0 = COPY [[UV2]](s32) + ; GREEDY: $vgpr1 = COPY [[UV3]](s32) + ; GREEDY: $vgpr2 = COPY [[UV4]](s32) + ; GREEDY: $vgpr3 = COPY [[UV5]](s32) + ; GREEDY: $vgpr4 = COPY [[UV6]](s32) + ; GREEDY: $vgpr5 = COPY [[UV7]](s32) + ; GREEDY: $vgpr6 = COPY [[UV8]](s32) + ; GREEDY: $vgpr7 = COPY [[UV9]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 4064, i32 0) ret <8 x float> %val } @@ -1494,6 +2681,22 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg % ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GREEDY-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] + ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load 4) + ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset = add i32 %offset.v, %offset.s %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) ret float %val @@ -1516,6 +2719,22 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg % ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GREEDY-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GREEDY: 
[[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] + ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load 4) + ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset = add i32 %offset.s, %offset.v %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) ret float %val @@ -1542,6 +2761,26 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inr ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GREEDY-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] + ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 + ; GREEDY: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GREEDY: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load 4) + ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.v, %offset.s %offset = add i32 %offset.base, 1024 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) @@ -1569,6 +2808,26 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inr ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GREEDY-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GREEDY: 
[[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] + ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 + ; GREEDY: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GREEDY: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load 4) + ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.s, %offset.v %offset = add i32 %offset.base, 1024 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) @@ -1595,6 +2854,24 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[ADD]], 0, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GREEDY-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 + ; GREEDY: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY5]], [[C]] + ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; GREEDY: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[ADD]], 0, 0, 0 :: (dereferenceable invariant load 4) + ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.s, 1024 %offset = add i32 %offset.base, %offset.v %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) @@ -1621,6 +2898,25 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[ADD]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GREEDY-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GREEDY: 
[[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 + ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] + ; GREEDY: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; GREEDY: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[ADD]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load 4) + ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.v, 1024 %offset = add i32 %offset.base, %offset.s %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index dad8a5ac58e8d..26a8d81120548 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -4999,24 +4999,22 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX6-NEXT: s_brev_b32 s8, 1 -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: v_add_i32_e64 v4, s[6:7], 0, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v10, 1 +; GFX6-NEXT: v_add_i32_e64 v1, s[6:7], 0, v0 ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX6-NEXT: v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GFX6-NEXT: v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v2, v6 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v3, v7, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX6-NEXT: v_mov_b32_e32 v3, s8 -; GFX6-NEXT: v_add_i32_e64 v6, s[6:7], 0, v2 -; GFX6-NEXT: v_addc_u32_e64 v3, s[6:7], v2, v3, s[6:7] +; GFX6-NEXT: v_add_i32_e64 v3, s[6:7], 0, v2 +; GFX6-NEXT: v_addc_u32_e64 v6, s[6:7], v2, v10, s[6:7] ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v2i64: @@ -5027,24 +5025,22 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX8-NEXT: s_brev_b32 s8, 1 -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: v_add_u32_e64 v4, s[6:7], 0, v0 +; GFX8-NEXT: v_bfrev_b32_e32 v10, 1 +; GFX8-NEXT: v_add_u32_e64 v1, s[6:7], 0, v0 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc -; GFX8-NEXT: 
v_cndmask_b32_e32 v1, v9, v1, vcc +; GFX8-NEXT: v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v2, v6 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v3, v7, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX8-NEXT: v_mov_b32_e32 v3, s8 -; GFX8-NEXT: v_add_u32_e64 v6, s[6:7], 0, v2 -; GFX8-NEXT: v_addc_u32_e64 v3, s[6:7], v2, v3, s[6:7] +; GFX8-NEXT: v_add_u32_e64 v3, s[6:7], 0, v2 +; GFX8-NEXT: v_addc_u32_e64 v6, s[6:7], v2, v10, s[6:7] ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_v2i64: @@ -5055,56 +5051,53 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX9-NEXT: s_brev_b32 s8, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], 0, v0 +; GFX9-NEXT: v_bfrev_b32_e32 v10, 1 +; GFX9-NEXT: v_add_co_u32_e64 v1, s[6:7], 0, v0 ; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[6:7], v0, v1, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[6:7], v0, v10, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v7, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX9-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-NEXT: v_add_co_u32_e64 v6, s[6:7], 0, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[6:7], v2, v3, s[6:7] +; GFX9-NEXT: v_add_co_u32_e64 v3, s[6:7], 0, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[6:7], v2, v10, s[6:7] ; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v13, v2 -; GFX10-NEXT: v_mov_b32_e32 v14, v3 +; GFX10-NEXT: v_mov_b32_e32 v14, v0 +; GFX10-NEXT: v_mov_b32_e32 v15, v1 +; GFX10-NEXT: v_mov_b32_e32 v17, v2 +; GFX10-NEXT: v_mov_b32_e32 v18, v3 ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5] -; GFX10-NEXT: v_add_co_u32_e64 v19, vcc_lo, v9, v4 -; GFX10-NEXT: s_brev_b32 s8, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, v10, v5, vcc_lo -; GFX10-NEXT: v_add_co_u32_e64 v23, vcc_lo, v13, v6 +; GFX10-NEXT: v_add_co_u32_e64 v8, vcc_lo, v14, v4 ; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7] -; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, v14, v7, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v15, v5, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v19, vcc_lo, v17, v6 ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, v18, 
v7, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9 +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[14:15] ; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v20 -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[19:20], v[9:10] -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v24 -; GFX10-NEXT: v_add_co_u32_e64 v4, s5, v0, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s5, s8, v0, s5 -; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[23:24], v[13:14] -; GFX10-NEXT: v_add_co_u32_e64 v2, s7, v1, 0 +; GFX10-NEXT: v_add_co_u32_e64 v1, s5, v12, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5 +; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[19:20], v[17:18] +; GFX10-NEXT: v_add_co_u32_e64 v2, s7, v0, 0 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, s8, v1, s7 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v20, v5, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo ; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v23, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v24, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v20, v3, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) ret <2 x i64> %result @@ -6225,15 +6218,14 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v19 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5] ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0, v0 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: s_brev_b32 s4, 1 -; GFX6-NEXT: v_mov_b32_e32 v8, s4 +; GFX6-NEXT: v_bfrev_b32_e32 v20, 1 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v20, vcc ; GFX6-NEXT: v_and_b32_e32 v8, 1, v10 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc @@ -6248,43 +6240,42 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX6-NEXT: s_cmp_lt_u32 s6, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] -; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] ; GFX6-NEXT: s_cmp_eq_u32 s6, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, 0, v[12:13] -; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] ; GFX6-NEXT: v_ashr_i64 v[12:13], v[10:11], s6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v11 +; GFX6-NEXT: s_and_b32 s5, 1, s5 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX6-NEXT: v_xor_b32_e32 v14, v5, v4 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], s6 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[10:11], s8 -; GFX6-NEXT: s_and_b32 s6, 1, s5 +; GFX6-NEXT: s_and_b32 s6, 1, s4 ; GFX6-NEXT: v_or_b32_e32 v6, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v7, v5, 
v7 ; GFX6-NEXT: v_ashr_i64 v[4:5], v[10:11], s7 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 -; GFX6-NEXT: s_and_b32 s6, 1, s9 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 -; GFX6-NEXT: s_and_b32 s5, 1, s5 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX6-NEXT: s_and_b32 s4, 1, s4 +; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v11 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v15, v12, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v15, v13, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0, v4 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GFX6-NEXT: v_mov_b32_e32 v12, s4 ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v12, vcc +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v20, vcc ; GFX6-NEXT: v_and_b32_e32 v12, 1, v14 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc @@ -6334,15 +6325,14 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v19 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: s_brev_b32 s4, 1 -; GFX8-NEXT: v_mov_b32_e32 v8, s4 +; GFX8-NEXT: v_bfrev_b32_e32 v20, 1 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v20, vcc ; GFX8-NEXT: v_and_b32_e32 v8, 1, v10 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc @@ -6357,43 +6347,42 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX8-NEXT: s_cmp_lt_u32 s6, 64 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] -; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] ; GFX8-NEXT: s_cmp_eq_u32 s6, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, 0, v[12:13] -; GFX8-NEXT: s_cselect_b32 s9, 1, 0 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] ; GFX8-NEXT: v_ashrrev_i64 v[12:13], s6, v[10:11] ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v11 +; GFX8-NEXT: s_and_b32 s5, 1, s5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX8-NEXT: v_xor_b32_e32 v14, v5, v4 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], s6, v[8:9] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], s8, v[10:11] -; GFX8-NEXT: s_and_b32 s6, 1, s5 +; GFX8-NEXT: s_and_b32 s6, 1, s4 ; GFX8-NEXT: v_or_b32_e32 v6, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v7, v5, v7 ; GFX8-NEXT: v_ashrrev_i64 v[4:5], s7, v[10:11] ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 -; GFX8-NEXT: s_and_b32 s6, 1, s9 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 -; GFX8-NEXT: s_and_b32 s5, 1, s5 +; GFX8-NEXT: 
v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX8-NEXT: s_and_b32 s4, 1, s4 +; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v13, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0, v4 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GFX8-NEXT: v_mov_b32_e32 v12, s4 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v12, vcc +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v20, vcc ; GFX8-NEXT: v_and_b32_e32 v12, 1, v14 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc @@ -6443,15 +6432,14 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v19 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5] ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: s_brev_b32 s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v8, s4 +; GFX9-NEXT: v_bfrev_b32_e32 v20, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v20, vcc ; GFX9-NEXT: v_and_b32_e32 v8, 1, v10 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc @@ -6466,43 +6454,42 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: s_cmp_lt_u32 s6, 64 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] -; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] ; GFX9-NEXT: s_cmp_eq_u32 s6, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, 0, v[12:13] -; GFX9-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] ; GFX9-NEXT: v_ashrrev_i64 v[12:13], s6, v[10:11] ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v11 +; GFX9-NEXT: s_and_b32 s5, 1, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX9-NEXT: v_xor_b32_e32 v14, v5, v4 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], s6, v[8:9] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], s8, v[10:11] -; GFX9-NEXT: s_and_b32 s6, 1, s5 +; GFX9-NEXT: s_and_b32 s6, 1, s4 ; GFX9-NEXT: v_or_b32_e32 v6, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v7, v5, v7 ; GFX9-NEXT: v_ashrrev_i64 v[4:5], s7, v[10:11] ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 -; GFX9-NEXT: s_and_b32 s6, 1, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 -; GFX9-NEXT: s_and_b32 s5, 1, s5 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX9-NEXT: s_and_b32 s4, 1, s4 +; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX9-NEXT: 
v_cndmask_b32_e32 v6, v15, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v13, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v12, s4 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v12, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v20, vcc ; GFX9-NEXT: v_and_b32_e32 v12, 1, v14 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc @@ -6561,7 +6548,6 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s8 ; GFX10-NEXT: v_xor_b32_e32 v9, v10, v20 -; GFX10-NEXT: s_brev_b32 s8, 1 ; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc_lo @@ -6571,7 +6557,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, 0, v0, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, s8, v1, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 ; GFX10-NEXT: v_add_co_u32_e64 v8, s4, v26, v12 ; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v27, v13, s4 @@ -6619,7 +6605,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v7 ; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, 0, v3, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v21, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v12, s4, s8, v4, s4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v12, s4, 0x80000000, v4, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v5, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v6, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v7, s5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll index 57737aeb886fa..3aee949b5bde6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -50,20 +50,16 @@ define i32 @v_sdiv_i32(i32 %num, i32 %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v1, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; CGP-NEXT: v_mul_lo_u32 v5, v0, 0 ; CGP-NEXT: v_rcp_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_lo_u32 v6, v2, 0 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v3 +; CGP-NEXT: v_mul_lo_u32 v5, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_lo_u32 v3, 0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v2, v1 ; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 @@ -127,34 +123,29 @@ define amdgpu_ps i32 @s_sdiv_i32(i32 inreg %num, i32 inreg %den) { ; CGP-NEXT: s_add_i32 s0, s0, s2 ; CGP-NEXT: s_add_i32 s1, s1, s3 ; CGP-NEXT: s_xor_b32 s0, s0, s2 -; CGP-NEXT: s_xor_b32 s5, s1, s3 -; CGP-NEXT: v_cvt_f32_u32_e32 v0, s5 -; CGP-NEXT: s_sub_i32 s1, 0, s5 -; CGP-NEXT: s_bfe_u64 s[2:3], s[0:1], 0x200000 +; CGP-NEXT: s_xor_b32 s2, s1, s3 +; CGP-NEXT: v_cvt_f32_u32_e32 v0, s2 +; CGP-NEXT: s_sub_i32 
s1, 0, s2 ; CGP-NEXT: v_rcp_f32_e32 v0, v0 -; CGP-NEXT: v_mul_lo_u32 v1, s2, 0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CGP-NEXT: v_mul_lo_u32 v2, s1, v0 -; CGP-NEXT: v_mul_lo_u32 v3, v0, 0 -; CGP-NEXT: v_mul_lo_u32 v4, 0, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_mul_lo_u32 v2, s3, v0 -; CGP-NEXT: v_mul_hi_u32 v0, s2, v0 +; CGP-NEXT: v_mul_lo_u32 v1, s1, v0 +; CGP-NEXT: v_mul_lo_u32 v2, 0, v1 +; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_mul_lo_u32 v1, 0, v0 +; CGP-NEXT: v_mul_hi_u32 v0, s0, v0 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v0, s5 +; CGP-NEXT: v_mul_lo_u32 v1, v0, s2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CGP-NEXT: v_subrev_i32_e64 v2, s[0:1], s5, v1 +; CGP-NEXT: v_subrev_i32_e64 v2, s[0:1], s2, v1 ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v0, s4, v0 ; CGP-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 @@ -246,36 +237,28 @@ define <2 x i32> @v_sdiv_v2i32(<2 x i32> %num, <2 x i32> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v3, v7 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 -; CGP-NEXT: v_mul_lo_u32 v6, v0, 0 -; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3 -; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v3 -; CGP-NEXT: v_mul_lo_u32 v11, v1, 0 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 +; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 ; CGP-NEXT: v_rcp_f32_e32 v4, v4 -; CGP-NEXT: v_rcp_f32_e32 v7, v7 +; CGP-NEXT: v_rcp_f32_e32 v6, v6 ; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 -; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 +; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v4 -; CGP-NEXT: v_mul_lo_u32 v12, v4, 0 -; CGP-NEXT: v_mul_lo_u32 v10, v10, v7 -; CGP-NEXT: v_mul_lo_u32 v13, v7, 0 -; CGP-NEXT: v_mul_lo_u32 v14, 0, v5 +; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 +; CGP-NEXT: v_mul_lo_u32 v10, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 -; CGP-NEXT: v_mul_lo_u32 v15, 0, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; CGP-NEXT: v_mul_lo_u32 v11, 0, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v10 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 +; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v10, 0, v5 +; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v6, 
v4, v2 @@ -715,42 +698,34 @@ define <2 x i32> @v_sdiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 ; CGP-NEXT: v_xor_b32_e32 v4, v4, v6 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_mul_lo_u32 v8, v0, 0 ; CGP-NEXT: v_xor_b32_e32 v5, v5, v7 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CGP-NEXT: v_mul_lo_u32 v9, v1, 0 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v7 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2 ; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v10, v3 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 ; CGP-NEXT: v_rcp_f32_e32 v6, v6 -; CGP-NEXT: v_rcp_f32_e32 v10, v10 +; CGP-NEXT: v_rcp_f32_e32 v8, v8 ; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 -; CGP-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 +; CGP-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 -; CGP-NEXT: v_mul_lo_u32 v12, v6, 0 -; CGP-NEXT: v_mul_lo_u32 v11, v11, v10 -; CGP-NEXT: v_mul_lo_u32 v13, v10, 0 -; CGP-NEXT: v_mul_lo_u32 v14, 0, v7 +; CGP-NEXT: v_mul_lo_u32 v9, v9, v8 +; CGP-NEXT: v_mul_lo_u32 v10, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_mul_lo_u32 v15, 0, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v10, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v11, 0, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v11 -; CGP-NEXT: v_mul_lo_u32 v10, 0, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9 +; CGP-NEXT: v_mul_lo_u32 v8, 0, v6 ; CGP-NEXT: v_mul_hi_u32 v6, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v11, 0, v7 +; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_mul_lo_u32 v8, v6, v2 @@ -828,20 +803,16 @@ define i32 @v_sdiv_i32_24bit(i32 %num, i32 %den) { ; CGP-NEXT: v_and_b32_e32 v1, s4, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; CGP-NEXT: v_mul_lo_u32 v4, v0, 0 ; CGP-NEXT: v_rcp_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v2, 0 -; CGP-NEXT: v_mul_lo_u32 v6, 0, v3 +; CGP-NEXT: v_mul_lo_u32 v4, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_lo_u32 v3, 0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v2, v1 ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 @@ -937,36 +908,28 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) { ; CGP-NEXT: v_and_b32_e32 v3, s4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 -; CGP-NEXT: v_mul_lo_u32 v6, v0, 0 -; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3 -; CGP-NEXT: v_sub_i32_e32 v8, 
vcc, 0, v3 -; CGP-NEXT: v_mul_lo_u32 v9, v1, 0 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 +; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 ; CGP-NEXT: v_rcp_f32_e32 v4, v4 -; CGP-NEXT: v_rcp_f32_e32 v7, v7 +; CGP-NEXT: v_rcp_f32_e32 v6, v6 ; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 -; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 +; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v4, 0 -; CGP-NEXT: v_mul_lo_u32 v8, v8, v7 -; CGP-NEXT: v_mul_lo_u32 v11, v7, 0 -; CGP-NEXT: v_mul_lo_u32 v12, 0, v5 +; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 +; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 -; CGP-NEXT: v_mul_lo_u32 v13, 0, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v8 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 +; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 +; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v6, v4, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index d2e7328a384fe..f188fc05f3637 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -357,9 +357,10 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: BB1_2: ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: BB1_3: ; %Flow -; CHECK-NEXT: s_and_b32 s0, s1, 1 +; CHECK-NEXT: s_xor_b32 s0, s1, -1 +; CHECK-NEXT: s_and_b32 s0, s0, 1 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc0 BB1_5 +; CHECK-NEXT: s_cbranch_scc1 BB1_5 ; CHECK-NEXT: ; %bb.4: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s4 ; CHECK-NEXT: s_sub_i32 s0, 0, s4 @@ -1056,10 +1057,9 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) { ; CHECK-LABEL: v_sdiv_i64_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s6, 0x1000 -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s6 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; CHECK-NEXT: s_mov_b32 s7, 0xfffff000 +; CHECK-NEXT: s_movk_i32 s6, 0xf000 ; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v4 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -1074,9 +1074,9 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 ; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, s7, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s7, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s7, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: 
v_mul_lo_u32 v6, v4, v7 @@ -1103,9 +1103,9 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc ; CHECK-NEXT: v_mul_lo_u32 v7, -1, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, s7, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, s7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, s7, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, s6, v6 +; CHECK-NEXT: v_mul_hi_u32 v10, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v9, s6, v2 ; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10 @@ -1113,6 +1113,7 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_mul_lo_u32 v10, v2, v7 ; CHECK-NEXT: v_mul_hi_u32 v5, v2, v9 ; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 +; CHECK-NEXT: s_movk_i32 s6, 0x1000 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 @@ -1501,10 +1502,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-LABEL: v_sdiv_v2i64_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s6, 0x1000 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, s6 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; CGP-NEXT: s_mov_b32 s7, 0xfffff000 +; CGP-NEXT: s_movk_i32 s6, 0xf000 ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; CGP-NEXT: v_mov_b32_e32 v7, v4 ; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 @@ -1519,19 +1519,19 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; CGP-NEXT: s_movk_i32 s7, 0x1000 ; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s7, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s7, v7 -; CGP-NEXT: v_mul_lo_u32 v11, s7, v7 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 +; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 ; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 @@ -1552,9 +1552,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v9, vcc ; CGP-NEXT: v_mul_lo_u32 v11, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v12, s7, v10 -; CGP-NEXT: v_mul_hi_u32 v14, s7, v7 -; CGP-NEXT: v_mul_lo_u32 v13, s7, v7 +; CGP-NEXT: v_mul_lo_u32 v12, s6, v10 +; CGP-NEXT: v_mul_hi_u32 v14, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v13, s6, v7 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 @@ -1562,7 +1562,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 ; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3 +; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, 
s[4:5] ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 @@ -1587,6 +1587,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_mul_lo_u32 v10, v0, v8 ; CGP-NEXT: v_mul_hi_u32 v11, v0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 +; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 @@ -1605,9 +1606,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v10, s7, v8 +; CGP-NEXT: v_mul_hi_u32 v12, s7, v7 +; CGP-NEXT: v_mul_lo_u32 v11, s7, v7 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 @@ -1616,8 +1617,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v10 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 -; CGP-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 +; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0 +; CGP-NEXT: v_subrev_i32_e32 v0, vcc, s7, v0 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 @@ -1626,7 +1627,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc ; CGP-NEXT: v_cmp_le_u32_e32 vcc, 0, v1 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v0 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CGP-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc @@ -1645,9 +1646,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_mul_lo_u32 v8, -1, v4 -; CGP-NEXT: v_mul_lo_u32 v9, s7, v7 -; CGP-NEXT: v_mul_hi_u32 v11, s7, v4 -; CGP-NEXT: v_mul_lo_u32 v10, s7, v4 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 +; CGP-NEXT: v_mul_hi_u32 v11, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v4 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -1676,9 +1677,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v7, v8, vcc ; CGP-NEXT: v_mul_lo_u32 v10, -1, v4 -; CGP-NEXT: v_mul_lo_u32 v11, s7, v9 -; CGP-NEXT: v_mul_hi_u32 v13, s7, v4 -; CGP-NEXT: v_mul_lo_u32 v12, s7, v4 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v9 +; CGP-NEXT: v_mul_hi_u32 v13, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v12, s6, v4 ; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 @@ -1733,9 +1734,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v7, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v8, s6, v5 -; CGP-NEXT: v_mul_hi_u32 v10, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v9, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v8, s7, v5 +; CGP-NEXT: v_mul_hi_u32 v10, s7, v4 +; CGP-NEXT: v_mul_lo_u32 v9, s7, v4 ; CGP-NEXT: 
v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 @@ -1744,8 +1745,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v8 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v2 -; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s6, v2 +; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v2 +; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s7, v2 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 @@ -1754,7 +1755,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc ; CGP-NEXT: v_cmp_le_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s6, v2 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc @@ -1779,10 +1780,9 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-LABEL: v_sdiv_i64_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s6 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; CHECK-NEXT: s_mov_b32 s7, 0xffed2705 +; CHECK-NEXT: s_mov_b32 s6, 0xffed2705 ; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v4 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -1797,9 +1797,9 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 ; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, s7, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s7, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s7, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 @@ -1826,9 +1826,9 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc ; CHECK-NEXT: v_mul_lo_u32 v7, -1, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, s7, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, s7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, s7, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, s6, v6 +; CHECK-NEXT: v_mul_hi_u32 v10, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v9, s6, v2 ; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10 @@ -1836,6 +1836,7 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_mul_lo_u32 v10, v2, v7 ; CHECK-NEXT: v_mul_hi_u32 v5, v2, v9 ; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 +; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 @@ -2224,10 +2225,9 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s6, 0x12d8fb -; CGP-NEXT: v_cvt_f32_u32_e32 v4, s6 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; CGP-NEXT: s_mov_b32 s7, 0xffed2705 +; CGP-NEXT: s_mov_b32 s6, 
0xffed2705 ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; CGP-NEXT: v_mov_b32_e32 v7, v4 ; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 @@ -2242,19 +2242,19 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; CGP-NEXT: s_mov_b32 s7, 0x12d8fb ; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s7, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s7, v7 -; CGP-NEXT: v_mul_lo_u32 v11, s7, v7 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 +; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 ; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 @@ -2275,9 +2275,9 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v9, vcc ; CGP-NEXT: v_mul_lo_u32 v11, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v12, s7, v10 -; CGP-NEXT: v_mul_hi_u32 v14, s7, v7 -; CGP-NEXT: v_mul_lo_u32 v13, s7, v7 +; CGP-NEXT: v_mul_lo_u32 v12, s6, v10 +; CGP-NEXT: v_mul_hi_u32 v14, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v13, s6, v7 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 @@ -2285,7 +2285,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 ; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3 +; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 @@ -2310,6 +2310,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_mul_lo_u32 v10, v0, v8 ; CGP-NEXT: v_mul_hi_u32 v11, v0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 +; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 @@ -2328,9 +2329,9 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v10, s7, v8 +; CGP-NEXT: v_mul_hi_u32 v12, s7, v7 +; CGP-NEXT: v_mul_lo_u32 v11, s7, v7 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 @@ -2339,8 +2340,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v10 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 -; CGP-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 +; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0 +; CGP-NEXT: v_subrev_i32_e32 v0, vcc, s7, 
v0 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 @@ -2349,7 +2350,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc ; CGP-NEXT: v_cmp_le_u32_e32 vcc, 0, v1 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v0 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CGP-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc @@ -2368,9 +2369,9 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_mul_lo_u32 v8, -1, v4 -; CGP-NEXT: v_mul_lo_u32 v9, s7, v7 -; CGP-NEXT: v_mul_hi_u32 v11, s7, v4 -; CGP-NEXT: v_mul_lo_u32 v10, s7, v4 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 +; CGP-NEXT: v_mul_hi_u32 v11, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v4 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -2399,9 +2400,9 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v7, v8, vcc ; CGP-NEXT: v_mul_lo_u32 v10, -1, v4 -; CGP-NEXT: v_mul_lo_u32 v11, s7, v9 -; CGP-NEXT: v_mul_hi_u32 v13, s7, v4 -; CGP-NEXT: v_mul_lo_u32 v12, s7, v4 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v9 +; CGP-NEXT: v_mul_hi_u32 v13, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v12, s6, v4 ; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 @@ -2456,9 +2457,9 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v7, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v8, s6, v5 -; CGP-NEXT: v_mul_hi_u32 v10, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v9, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v8, s7, v5 +; CGP-NEXT: v_mul_hi_u32 v10, s7, v4 +; CGP-NEXT: v_mul_lo_u32 v9, s7, v4 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 @@ -2467,8 +2468,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v8 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v2 -; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s6, v2 +; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v2 +; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s7, v2 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 @@ -2477,7 +2478,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc ; CGP-NEXT: v_cmp_le_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s6, v2 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll index b2f3dd8b2bf41..74832a1cfb257 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -10,7 +10,7 
@@ define amdgpu_ps i64 @s_shl_i64_zext_i32(i32 inreg %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_andn2_b32 s0, s0, -2.0 ; GCN-NEXT: s_lshl_b32 s0, s0, 2 -; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 +; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: ; return to shader part epilog %and = and i32 %x, 1073741823 %ext = zext i32 %and to i64 @@ -37,7 +37,7 @@ define amdgpu_ps i64 @s_shl_i64_sext_i32(i32 inreg %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_and_b32 s0, s0, 0x1fffffff ; GCN-NEXT: s_lshl_b32 s0, s0, 2 -; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 +; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: ; return to shader part epilog %and = and i32 %x, 536870911 %ext = sext i32 %and to i64 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index f58e26604529e..ff16d8a6fffaa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -82,14 +82,14 @@ define amdgpu_ps i8 @s_shl_i8_7(i8 inreg %value) { ; ; GFX8-LABEL: s_shl_i8_7: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, 7 +; GFX8-NEXT: s_bfe_u32 s1, 7, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_i8_7: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s0, s0, 7 +; GFX9-NEXT: s_bfe_u32 s1, 7, 0x100000 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog %result = shl i8 %value, 7 ret i8 %result @@ -426,14 +426,14 @@ define amdgpu_ps i16 @s_shl_i16_15(i16 inreg %value) { ; ; GFX8-LABEL: s_shl_i16_15: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_lshl_b32 s0, s0, 15 +; GFX8-NEXT: s_bfe_u32 s1, 15, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_i16_15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: s_lshl_b32 s0, s0, 15 +; GFX9-NEXT: s_bfe_u32 s1, 15, 0x100000 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog %result = shl i16 %value, 15 ret i16 %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll index 320d814be8a94..ec1b610fdd819 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -46,20 +46,16 @@ define i32 @v_srem_i32(i32 %num, i32 %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v1, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 ; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 -; CGP-NEXT: v_mul_lo_u32 v5, v0, 0 ; CGP-NEXT: v_rcp_f32_e32 v3, v3 ; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v3, 0 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v4 +; CGP-NEXT: v_mul_lo_u32 v5, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CGP-NEXT: v_mul_lo_u32 v4, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -112,29 +108,24 @@ define amdgpu_ps i32 @s_srem_i32(i32 inreg %num, i32 inreg %den) { ; ; CGP-LABEL: s_srem_i32: ; CGP: ; %bb.0: -; CGP-NEXT: s_ashr_i32 s4, s0, 31 -; CGP-NEXT: s_ashr_i32 s2, s1, 31 -; CGP-NEXT: s_add_i32 s0, s0, s4 -; CGP-NEXT: s_add_i32 s1, s1, s2 -; 
CGP-NEXT: s_xor_b32 s0, s0, s4
-; CGP-NEXT: s_xor_b32 s1, s1, s2
+; CGP-NEXT: s_ashr_i32 s2, s0, 31
+; CGP-NEXT: s_ashr_i32 s3, s1, 31
+; CGP-NEXT: s_add_i32 s0, s0, s2
+; CGP-NEXT: s_add_i32 s1, s1, s3
+; CGP-NEXT: s_xor_b32 s0, s0, s2
+; CGP-NEXT: s_xor_b32 s1, s1, s3
 ; CGP-NEXT: v_cvt_f32_u32_e32 v0, s1
-; CGP-NEXT: s_sub_i32 s5, 0, s1
-; CGP-NEXT: s_bfe_u64 s[2:3], s[0:1], 0x200000
+; CGP-NEXT: s_sub_i32 s3, 0, s1
 ; CGP-NEXT: v_rcp_f32_e32 v0, v0
-; CGP-NEXT: v_mul_lo_u32 v1, s2, 0
 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT: v_mul_lo_u32 v2, s5, v0
-; CGP-NEXT: v_mul_lo_u32 v3, v0, 0
-; CGP-NEXT: v_mul_lo_u32 v4, 0, v2
-; CGP-NEXT: v_mul_hi_u32 v2, v0, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CGP-NEXT: v_mul_lo_u32 v2, s3, v0
-; CGP-NEXT: v_mul_hi_u32 v0, s2, v0
+; CGP-NEXT: v_mul_lo_u32 v1, s3, v0
+; CGP-NEXT: v_mul_lo_u32 v2, 0, v1
+; CGP-NEXT: v_mul_hi_u32 v1, v0, v1
 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CGP-NEXT: v_mul_lo_u32 v1, 0, v0
+; CGP-NEXT: v_mul_hi_u32 v0, s0, v0
 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0
 ; CGP-NEXT: v_mul_lo_u32 v0, v0, s1
 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
@@ -144,8 +135,8 @@ define amdgpu_ps i32 @s_srem_i32(i32 inreg %num, i32 inreg %den) {
 ; CGP-NEXT: v_subrev_i32_e32 v1, vcc, s1, v0
 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s1, v0
 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, s4, v0
-; CGP-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0
+; CGP-NEXT: v_xor_b32_e32 v0, s2, v0
+; CGP-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0
 ; CGP-NEXT: v_readfirstlane_b32 s0, v0
 ; CGP-NEXT: ; return to shader part epilog
 %result = srem i32 %num, %den
@@ -226,36 +217,28 @@ define <2 x i32> @v_srem_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v7
 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2
 ; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v2
-; CGP-NEXT: v_mul_lo_u32 v8, v0, 0
-; CGP-NEXT: v_cvt_f32_u32_e32 v9, v3
-; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v3
-; CGP-NEXT: v_mul_lo_u32 v11, v1, 0
+; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3
+; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
 ; CGP-NEXT: v_rcp_f32_e32 v5, v5
-; CGP-NEXT: v_rcp_f32_e32 v9, v9
+; CGP-NEXT: v_rcp_f32_e32 v8, v8
 ; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
-; CGP-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9
+; CGP-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8
 ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9
+; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v5
-; CGP-NEXT: v_mul_lo_u32 v12, v5, 0
-; CGP-NEXT: v_mul_lo_u32 v10, v10, v9
-; CGP-NEXT: v_mul_lo_u32 v13, v9, 0
-; CGP-NEXT: v_mul_lo_u32 v14, 0, v7
+; CGP-NEXT: v_mul_lo_u32 v9, v9, v8
+; CGP-NEXT: v_mul_lo_u32 v10, 0, v7
 ; CGP-NEXT: v_mul_hi_u32 v7, v5, v7
-; CGP-NEXT: v_mul_lo_u32 v15, 0, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v9, v10
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v10
+; CGP-NEXT: v_mul_lo_u32 v11, 0, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v10
-; CGP-NEXT: v_mul_lo_u32 v9, 0, v5
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9
+; CGP-NEXT: v_mul_lo_u32 v8, 0, v5
 ; CGP-NEXT: v_mul_hi_u32 v5, v0, v5
-; CGP-NEXT: v_mul_lo_u32 v10, 0, v7
+; CGP-NEXT: v_mul_lo_u32 v9, 0, v7
 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v11
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5
 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7
 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v2
@@ -661,41 +644,33 @@ define <2 x i32> @v_srem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v3
 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v5
 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CGP-NEXT: v_mul_lo_u32 v8, v0, 0
 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; CGP-NEXT: v_mul_lo_u32 v9, v1, 0
 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6
 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v7
 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2
 ; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v2
-; CGP-NEXT: v_cvt_f32_u32_e32 v10, v3
-; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v3
+; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3
+; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
 ; CGP-NEXT: v_rcp_f32_e32 v6, v6
-; CGP-NEXT: v_rcp_f32_e32 v10, v10
+; CGP-NEXT: v_rcp_f32_e32 v8, v8
 ; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; CGP-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10
+; CGP-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8
 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10
+; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT: v_mul_lo_u32 v12, v6, 0
-; CGP-NEXT: v_mul_lo_u32 v11, v11, v10
-; CGP-NEXT: v_mul_lo_u32 v13, v10, 0
-; CGP-NEXT: v_mul_lo_u32 v14, 0, v7
+; CGP-NEXT: v_mul_lo_u32 v9, v9, v8
+; CGP-NEXT: v_mul_lo_u32 v10, 0, v7
 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT: v_mul_lo_u32 v15, 0, v11
-; CGP-NEXT: v_mul_hi_u32 v11, v10, v11
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT: v_mul_lo_u32 v11, 0, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9
 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v11
-; CGP-NEXT: v_mul_lo_u32 v10, 0, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9
+; CGP-NEXT: v_mul_lo_u32 v8, 0, v6
 ; CGP-NEXT: v_mul_hi_u32 v6, v0, v6
-; CGP-NEXT: v_mul_lo_u32 v11, 0, v7
+; CGP-NEXT: v_mul_lo_u32 v9, 0, v7
 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9
 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7
 ; CGP-NEXT: v_mul_lo_u32 v6, v6, v2
@@ -766,20 +741,16 @@ define i32 @v_srem_i32_24bit(i32 %num, i32 %den) {
 ; CGP-NEXT: v_and_b32_e32 v1, s4, v1
 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1
 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; CGP-NEXT: v_mul_lo_u32 v4, v0, 0
 ; CGP-NEXT: v_rcp_f32_e32 v2, v2
 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2
-; CGP-NEXT: v_mul_lo_u32 v5, v2, 0
-; CGP-NEXT: v_mul_lo_u32 v6, 0, v3
+; CGP-NEXT: v_mul_lo_u32 v4, 0, v3
 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3
 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
 ; CGP-NEXT: v_mul_lo_u32 v3, 0, v2
 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v1
 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
@@ -867,36 +838,28 @@ define <2 x i32> @v_srem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT: v_and_b32_e32 v3, s4, v3
 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; CGP-NEXT: v_mul_lo_u32 v6, v0, 0
-; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3
-; CGP-NEXT: v_sub_i32_e32 v8, vcc, 0, v3
-; CGP-NEXT: v_mul_lo_u32 v9, v1, 0
+; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
 ; CGP-NEXT: v_rcp_f32_e32 v4, v4
-; CGP-NEXT: v_rcp_f32_e32 v7, v7
+; CGP-NEXT: v_rcp_f32_e32 v6, v6
 ; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7
+; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7
+; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
-; CGP-NEXT: v_mul_lo_u32 v10, v4, 0
-; CGP-NEXT: v_mul_lo_u32 v8, v8, v7
-; CGP-NEXT: v_mul_lo_u32 v11, v7, 0
-; CGP-NEXT: v_mul_lo_u32 v12, 0, v5
+; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
+; CGP-NEXT: v_mul_lo_u32 v8, 0, v5
 ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT: v_mul_lo_u32 v13, 0, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v7, v8
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8
+; CGP-NEXT: v_mul_lo_u32 v9, 0, v7
+; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7
 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v8
-; CGP-NEXT: v_mul_lo_u32 v7, 0, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; CGP-NEXT: v_mul_lo_u32 v6, 0, v4
 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
-; CGP-NEXT: v_mul_lo_u32 v8, 0, v5
+; CGP-NEXT: v_mul_lo_u32 v7, 0, v5
 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9
 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
 ; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index cbb77b54aba55..f769b826b1ea8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -351,9 +351,10 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT: BB1_2:
 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT: BB1_3: ; %Flow
-; CHECK-NEXT: s_and_b32 s0, s1, 1
+; CHECK-NEXT: s_xor_b32 s0, s1, -1
+; CHECK-NEXT: s_and_b32 s0, s0, 1
 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 BB1_5
+; CHECK-NEXT: s_cbranch_scc1 BB1_5
 ; CHECK-NEXT: ; %bb.4:
 ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s4
 ; CHECK-NEXT: s_sub_i32 s0, 0, s4
@@ -1036,10 +1037,9 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-LABEL: v_srem_i64_pow2k_denom:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_movk_i32 s6, 0x1000
-; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s6
+; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000
 ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0
-; CHECK-NEXT: s_mov_b32 s7, 0xfffff000
+; CHECK-NEXT: s_movk_i32 s6, 0xf000
 ; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1
 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v4
 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
@@ -1054,9 +1054,9 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4
 ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3
 ; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2
-; CHECK-NEXT: v_mul_lo_u32 v6, s7, v4
-; CHECK-NEXT: v_mul_hi_u32 v8, s7, v2
-; CHECK-NEXT: v_mul_lo_u32 v7, s7, v2
+; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4
+; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2
+; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2
 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7
@@ -1083,9 +1083,9 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
 ; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc
 ; CHECK-NEXT: v_mul_lo_u32 v7, -1, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, s7, v6
-; CHECK-NEXT: v_mul_hi_u32 v10, s7, v2
-; CHECK-NEXT: v_mul_lo_u32 v9, s7, v2
+; CHECK-NEXT: v_mul_lo_u32 v8, s6, v6
+; CHECK-NEXT: v_mul_hi_u32 v10, s6, v2
+; CHECK-NEXT: v_mul_lo_u32 v9, s6, v2
 ; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5
 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8
 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10
@@ -1093,6 +1093,7 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT: v_mul_lo_u32 v10, v2, v7
 ; CHECK-NEXT: v_mul_hi_u32 v5, v2, v9
 ; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9
+; CHECK-NEXT: s_movk_i32 s6, 0x1000
 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10
 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5
@@ -1477,10 +1478,9 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-LABEL: v_srem_v2i64_pow2k_denom:
 ; CGP: ; %bb.0:
 ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: s_movk_i32 s6, 0x1000
-; CGP-NEXT: v_cvt_f32_u32_e32 v4, s6
+; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000
 ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0
-; CGP-NEXT: s_mov_b32 s7, 0xfffff000
+; CGP-NEXT: s_movk_i32 s6, 0xf000
 ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1
 ; CGP-NEXT: v_mov_b32_e32 v7, v4
 ; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6
@@ -1495,19 +1495,19 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6
+; CGP-NEXT: s_movk_i32 s7, 0x1000
 ; CGP-NEXT: v_mul_lo_u32 v9, -1, v7
-; CGP-NEXT: v_mul_lo_u32 v10, s7, v8
-; CGP-NEXT: v_mul_hi_u32 v12, s7, v7
-; CGP-NEXT: v_mul_lo_u32 v11, s7, v7
-; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; CGP-NEXT: v_mul_lo_u32 v10, s6, v8
+; CGP-NEXT: v_mul_hi_u32 v12, s6, v7
+; CGP-NEXT: v_mul_lo_u32 v11, s6, v7
+; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6
 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT: v_mul_lo_u32 v10, v8, v11
 ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9
 ; CGP-NEXT: v_mul_hi_u32 v13, v7, v11
 ; CGP-NEXT: v_mul_hi_u32 v11, v8, v11
-; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13
@@ -1528,9 +1528,9 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10
 ; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v9, vcc
 ; CGP-NEXT: v_mul_lo_u32 v11, -1, v7
-; CGP-NEXT: v_mul_lo_u32 v12, s7, v10
-; CGP-NEXT: v_mul_hi_u32 v14, s7, v7
-; CGP-NEXT: v_mul_lo_u32 v13, s7, v7
+; CGP-NEXT: v_mul_lo_u32 v12, s6, v10
+; CGP-NEXT: v_mul_hi_u32 v14, s6, v7
+; CGP-NEXT: v_mul_lo_u32 v13, s6, v7
 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9
 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14
@@ -1538,7 +1538,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_mul_lo_u32 v14, v7, v11
 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v13
 ; CGP-NEXT: v_mul_hi_u32 v13, v10, v13
-; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3
+; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14
 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9
@@ -1563,6 +1563,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_mul_lo_u32 v10, v0, v8
 ; CGP-NEXT: v_mul_hi_u32 v11, v0, v7
 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7
+; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3
 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11
@@ -1581,9 +1582,9 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7
-; CGP-NEXT: v_mul_lo_u32 v8, s6, v8
-; CGP-NEXT: v_mul_lo_u32 v10, s6, v7
-; CGP-NEXT: v_mul_hi_u32 v7, s6, v7
+; CGP-NEXT: v_mul_lo_u32 v8, s7, v8
+; CGP-NEXT: v_mul_lo_u32 v10, s7, v7
+; CGP-NEXT: v_mul_hi_u32 v7, s7, v7
 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
@@ -1591,20 +1592,20 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v7
 ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v8
 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0
+; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0
 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
 ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8
 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5]
-; CGP-NEXT: v_subrev_i32_e32 v9, vcc, s6, v0
+; CGP-NEXT: v_subrev_i32_e32 v9, vcc, s7, v0
 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, 0, v1
 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, s6, v9
+; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v9
 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
 ; CGP-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc
-; CGP-NEXT: v_subrev_i32_e32 v11, vcc, s6, v9
+; CGP-NEXT: v_subrev_i32_e32 v11, vcc, s7, v9
 ; CGP-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v1, vcc
 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
 ; CGP-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
@@ -1618,9 +1619,9 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
 ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
 ; CGP-NEXT: v_mul_lo_u32 v8, -1, v4
-; CGP-NEXT: v_mul_lo_u32 v9, s7, v7
-; CGP-NEXT: v_mul_hi_u32 v11, s7, v4
-; CGP-NEXT: v_mul_lo_u32 v10, s7, v4
+; CGP-NEXT: v_mul_lo_u32 v9, s6, v7
+; CGP-NEXT: v_mul_hi_u32 v11, s6, v4
+; CGP-NEXT: v_mul_lo_u32 v10, s6, v4
 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc
 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
@@ -1650,9 +1651,9 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
 ; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v7, v8, vcc
 ; CGP-NEXT: v_mul_lo_u32 v10, -1, v4
-; CGP-NEXT: v_mul_lo_u32 v11, s7, v9
-; CGP-NEXT: v_mul_hi_u32 v13, s7, v4
-; CGP-NEXT: v_mul_lo_u32 v12, s7, v4
+; CGP-NEXT: v_mul_lo_u32 v11, s6, v9
+; CGP-NEXT: v_mul_hi_u32 v13, s6, v4
+; CGP-NEXT: v_mul_lo_u32 v12, s6, v4
 ; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8
 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11
 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13
@@ -1707,9 +1708,9 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
 ; CGP-NEXT: v_mul_lo_u32 v7, 0, v4
-; CGP-NEXT: v_mul_lo_u32 v5, s6, v5
-; CGP-NEXT: v_mul_lo_u32 v8, s6, v4
-; CGP-NEXT: v_mul_hi_u32 v4, s6, v4
+; CGP-NEXT: v_mul_lo_u32 v5, s7, v5
+; CGP-NEXT: v_mul_lo_u32 v8, s7, v4
+; CGP-NEXT: v_mul_hi_u32 v4, s7, v4
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
@@ -1717,20 +1718,20 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4
 ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v5
 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v2
+; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v2
 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
 ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5
 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5]
-; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s6, v2
+; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s7, v2
 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, 0, v3
 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, s6, v7
+; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v7
 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
 ; CGP-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
-; CGP-NEXT: v_subrev_i32_e32 v9, vcc, s6, v7
+; CGP-NEXT: v_subrev_i32_e32 v9, vcc, s7, v7
 ; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v3, vcc
 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
 ; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
@@ -1751,10 +1752,9 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-LABEL: v_srem_i64_oddk_denom:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb
-; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s6
+; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb
 ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0
-; CHECK-NEXT: s_mov_b32 s7, 0xffed2705
+; CHECK-NEXT: s_mov_b32 s6, 0xffed2705
 ; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1
 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v4
 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
@@ -1769,9 +1769,9 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4
 ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3
 ; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2
-; CHECK-NEXT: v_mul_lo_u32 v6, s7, v4
-; CHECK-NEXT: v_mul_hi_u32 v8, s7, v2
-; CHECK-NEXT: v_mul_lo_u32 v7, s7, v2
+; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4
+; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2
+; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2
 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7
@@ -1798,9 +1798,9 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
 ; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc
 ; CHECK-NEXT: v_mul_lo_u32 v7, -1, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, s7, v6
-; CHECK-NEXT: v_mul_hi_u32 v10, s7, v2
-; CHECK-NEXT: v_mul_lo_u32 v9, s7, v2
+; CHECK-NEXT: v_mul_lo_u32 v8, s6, v6
+; CHECK-NEXT: v_mul_hi_u32 v10, s6, v2
+; CHECK-NEXT: v_mul_lo_u32 v9, s6, v2
 ; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5
 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8
 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10
@@ -1808,6 +1808,7 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT: v_mul_lo_u32 v10, v2, v7
 ; CHECK-NEXT: v_mul_hi_u32 v5, v2, v9
 ; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9
+; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb
 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10
 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5
@@ -2192,10 +2193,9 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-LABEL: v_srem_v2i64_oddk_denom:
 ; CGP: ; %bb.0:
 ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: s_mov_b32 s6, 0x12d8fb
-; CGP-NEXT: v_cvt_f32_u32_e32 v4, s6
+; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb
 ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0
-; CGP-NEXT: s_mov_b32 s7, 0xffed2705
+; CGP-NEXT: s_mov_b32 s6, 0xffed2705
 ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1
 ; CGP-NEXT: v_mov_b32_e32 v7, v4
 ; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6
@@ -2210,19 +2210,19 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6
+; CGP-NEXT: s_mov_b32 s7, 0x12d8fb
 ; CGP-NEXT: v_mul_lo_u32 v9, -1, v7
-; CGP-NEXT: v_mul_lo_u32 v10, s7, v8
-; CGP-NEXT: v_mul_hi_u32 v12, s7, v7
-; CGP-NEXT: v_mul_lo_u32 v11, s7, v7
-; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; CGP-NEXT: v_mul_lo_u32 v10, s6, v8
+; CGP-NEXT: v_mul_hi_u32 v12, s6, v7
+; CGP-NEXT: v_mul_lo_u32 v11, s6, v7
+; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6
 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT: v_mul_lo_u32 v10, v8, v11
 ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9
 ; CGP-NEXT: v_mul_hi_u32 v13, v7, v11
 ; CGP-NEXT: v_mul_hi_u32 v11, v8, v11
-; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13
@@ -2243,9 +2243,9 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10
 ; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v9, vcc
 ; CGP-NEXT: v_mul_lo_u32 v11, -1, v7
-; CGP-NEXT: v_mul_lo_u32 v12, s7, v10
-; CGP-NEXT: v_mul_hi_u32 v14, s7, v7
-; CGP-NEXT: v_mul_lo_u32 v13, s7, v7
+; CGP-NEXT: v_mul_lo_u32 v12, s6, v10
+; CGP-NEXT: v_mul_hi_u32 v14, s6, v7
+; CGP-NEXT: v_mul_lo_u32 v13, s6, v7
 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9
 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14
@@ -2253,7 +2253,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_mul_lo_u32 v14, v7, v11
 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v13
 ; CGP-NEXT: v_mul_hi_u32 v13, v10, v13
-; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3
+; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14
 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9
@@ -2278,6 +2278,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_mul_lo_u32 v10, v0, v8
 ; CGP-NEXT: v_mul_hi_u32 v11, v0, v7
 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7
+; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3
 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11
@@ -2296,9 +2297,9 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7
-; CGP-NEXT: v_mul_lo_u32 v8, s6, v8
-; CGP-NEXT: v_mul_lo_u32 v10, s6, v7
-; CGP-NEXT: v_mul_hi_u32 v7, s6, v7
+; CGP-NEXT: v_mul_lo_u32 v8, s7, v8
+; CGP-NEXT: v_mul_lo_u32 v10, s7, v7
+; CGP-NEXT: v_mul_hi_u32 v7, s7, v7
 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
@@ -2306,20 +2307,20 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v7
 ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v8
 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0
+; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0
 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
 ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8
 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5]
-; CGP-NEXT: v_subrev_i32_e32 v9, vcc, s6, v0
+; CGP-NEXT: v_subrev_i32_e32 v9, vcc, s7, v0
 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, 0, v1
 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, s6, v9
+; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v9
 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
 ; CGP-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc
-; CGP-NEXT: v_subrev_i32_e32 v11, vcc, s6, v9
+; CGP-NEXT: v_subrev_i32_e32 v11, vcc, s7, v9
 ; CGP-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v1, vcc
 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
 ; CGP-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
@@ -2333,9 +2334,9 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
 ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
 ; CGP-NEXT: v_mul_lo_u32 v8, -1, v4
-; CGP-NEXT: v_mul_lo_u32 v9, s7, v7
-; CGP-NEXT: v_mul_hi_u32 v11, s7, v4
-; CGP-NEXT: v_mul_lo_u32 v10, s7, v4
+; CGP-NEXT: v_mul_lo_u32 v9, s6, v7
+; CGP-NEXT: v_mul_hi_u32 v11, s6, v4
+; CGP-NEXT: v_mul_lo_u32 v10, s6, v4
 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc
 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
@@ -2365,9 +2366,9 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
 ; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v7, v8, vcc
 ; CGP-NEXT: v_mul_lo_u32 v10, -1, v4
-; CGP-NEXT: v_mul_lo_u32 v11, s7, v9
-; CGP-NEXT: v_mul_hi_u32 v13, s7, v4
-; CGP-NEXT: v_mul_lo_u32 v12, s7, v4
+; CGP-NEXT: v_mul_lo_u32 v11, s6, v9
+; CGP-NEXT: v_mul_hi_u32 v13, s6, v4
+; CGP-NEXT: v_mul_lo_u32 v12, s6, v4
 ; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8
 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11
 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13
@@ -2422,9 +2423,9 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
 ; CGP-NEXT: v_mul_lo_u32 v7, 0, v4
-; CGP-NEXT: v_mul_lo_u32 v5, s6, v5
-; CGP-NEXT: v_mul_lo_u32 v8, s6, v4
-; CGP-NEXT: v_mul_hi_u32 v4, s6, v4
+; CGP-NEXT: v_mul_lo_u32 v5, s7, v5
+; CGP-NEXT: v_mul_lo_u32 v8, s7, v4
+; CGP-NEXT: v_mul_hi_u32 v4, s7, v4
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
@@ -2432,20 +2433,20 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4
 ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v5
 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v2
+; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v2
 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
 ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5
 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5]
-; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s6, v2
+; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s7, v2
 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, 0, v3
 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, s6, v7
+; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v7
 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
 ; CGP-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
-; CGP-NEXT: v_subrev_i32_e32 v9, vcc, s6, v7
+; CGP-NEXT: v_subrev_i32_e32 v9, vcc, s7, v7
 ; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v3, vcc
 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
 ; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index d2c65aa5a1784..76aa2f511b141 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -4984,24 +4984,22 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1]
 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[4:5]
 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v9
-; GFX6-NEXT: s_brev_b32 s8, 1
-; GFX6-NEXT: v_mov_b32_e32 v1, s8
-; GFX6-NEXT: v_add_i32_e64 v4, s[6:7], 0, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v10, 1
+; GFX6-NEXT: v_add_i32_e64 v1, s[6:7], 0, v0
 ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; GFX6-NEXT: v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7]
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX6-NEXT: v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7]
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc
 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v2, v6
 ; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc
 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[6:7]
 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v5
-; GFX6-NEXT: v_mov_b32_e32 v3, s8
-; GFX6-NEXT: v_add_i32_e64 v6, s[6:7], 0, v2
-; GFX6-NEXT: v_addc_u32_e64 v3, s[6:7], v2, v3, s[6:7]
+; GFX6-NEXT: v_add_i32_e64 v3, s[6:7], 0, v2
+; GFX6-NEXT: v_addc_u32_e64 v6, s[6:7], v2, v10, s[6:7]
 ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v2i64:
@@ -5012,24 +5010,22 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1]
 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[4:5]
 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v9
-; GFX8-NEXT: s_brev_b32 s8, 1
-; GFX8-NEXT: v_mov_b32_e32 v1, s8
-; GFX8-NEXT: v_add_u32_e64 v4, s[6:7], 0, v0
+; GFX8-NEXT: v_bfrev_b32_e32 v10, 1
+; GFX8-NEXT: v_add_u32_e64 v1, s[6:7], 0, v0
 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT: v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX8-NEXT: v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7]
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc
 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v2, v6
 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc
 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[6:7]
 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v5
-; GFX8-NEXT: v_mov_b32_e32 v3, s8
-; GFX8-NEXT: v_add_u32_e64 v6, s[6:7], 0, v2
-; GFX8-NEXT: v_addc_u32_e64 v3, s[6:7], v2, v3, s[6:7]
+; GFX8-NEXT: v_add_u32_e64 v3, s[6:7], 0, v2
+; GFX8-NEXT: v_addc_u32_e64 v6, s[6:7], v2, v10, s[6:7]
 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v2i64:
@@ -5040,56 +5036,53 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1]
 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[4:5]
 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9
-; GFX9-NEXT: s_brev_b32 s8, 1
-; GFX9-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], 0, v0
+; GFX9-NEXT: v_bfrev_b32_e32 v10, 1
+; GFX9-NEXT: v_add_co_u32_e64 v1, s[6:7], 0, v0
 ; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; GFX9-NEXT: v_addc_co_u32_e64 v1, s[6:7], v0, v1, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v4, s[6:7], v0, v10, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc
 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v2, v6
 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v7, vcc
 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[6:7]
 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5
-; GFX9-NEXT: v_mov_b32_e32 v3, s8
-; GFX9-NEXT: v_add_co_u32_e64 v6, s[6:7], 0, v2
-; GFX9-NEXT: v_addc_co_u32_e64 v3, s[6:7], v2, v3, s[6:7]
+; GFX9-NEXT: v_add_co_u32_e64 v3, s[6:7], 0, v2
+; GFX9-NEXT: v_addc_co_u32_e64 v6, s[6:7], v2, v10, s[6:7]
 ; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_v2i64:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v9, v0
-; GFX10-NEXT: v_mov_b32_e32 v10, v1
-; GFX10-NEXT: v_mov_b32_e32 v13, v2
-; GFX10-NEXT: v_mov_b32_e32 v14, v3
+; GFX10-NEXT: v_mov_b32_e32 v14, v0
+; GFX10-NEXT: v_mov_b32_e32 v15, v1
+; GFX10-NEXT: v_mov_b32_e32 v17, v2
+; GFX10-NEXT: v_mov_b32_e32 v18, v3
 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5]
-; GFX10-NEXT: v_sub_co_u32_e64 v19, vcc_lo, v9, v4
-; GFX10-NEXT: s_brev_b32 s8, 1
-; GFX10-NEXT: v_sub_co_ci_u32_e32 v20, vcc_lo, v10, v5, vcc_lo
-; GFX10-NEXT: v_sub_co_u32_e64 v23, vcc_lo, v13, v6
+; GFX10-NEXT: v_sub_co_u32_e64 v8, vcc_lo, v14, v4
 ; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7]
-; GFX10-NEXT: v_sub_co_ci_u32_e32 v24, vcc_lo, v14, v7, vcc_lo
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v15, v5, vcc_lo
+; GFX10-NEXT: v_sub_co_u32_e64 v19, vcc_lo, v17, v6
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v20, vcc_lo, v18, v7, vcc_lo
+; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9
+; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[14:15]
 ; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v20
-; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[19:20], v[9:10]
-; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v24
-; GFX10-NEXT: v_add_co_u32_e64 v4, s5, v0, 0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s5, s8, v0, s5
-; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[23:24], v[13:14]
-; GFX10-NEXT: v_add_co_u32_e64 v2, s7, v1, 0
+; GFX10-NEXT: v_add_co_u32_e64 v1, s5, v12, 0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5
+; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[19:20], v[17:18]
+; GFX10-NEXT: v_add_co_u32_e64 v2, s7, v0, 0
 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
-; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, s8, v1, s7
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v20, v5, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo
 ; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v23, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v24, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v20, v3, vcc_lo
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
 ret <2 x i64> %result
@@ -6210,15 +6203,14 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v19
 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5]
 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc
 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5]
 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0, v0
 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT: s_brev_b32 s4, 1
-; GFX6-NEXT: v_mov_b32_e32 v8, s4
+; GFX6-NEXT: v_bfrev_b32_e32 v20, 1
 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc
+; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v20, vcc
 ; GFX6-NEXT: v_and_b32_e32 v8, 1, v10
 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc
@@ -6233,43 +6225,42 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-NEXT: s_cmp_lt_u32 s6, 64
 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
-; GFX6-NEXT: s_cselect_b32 s5, 1, 0
+; GFX6-NEXT: s_cselect_b32 s4, 1, 0
 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
 ; GFX6-NEXT: s_cmp_eq_u32 s6, 0
 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13]
-; GFX6-NEXT: s_cselect_b32 s9, 1, 0
+; GFX6-NEXT: s_cselect_b32 s5, 1, 0
 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15]
 ; GFX6-NEXT: v_ashr_i64 v[12:13], v[10:11], s6
 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v11
+; GFX6-NEXT: s_and_b32 s5, 1, s5
 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX6-NEXT: v_xor_b32_e32 v14, v5, v4
 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], s6
 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[10:11], s8
-; GFX6-NEXT: s_and_b32 s6, 1, s5
+; GFX6-NEXT: s_and_b32 s6, 1, s4
 ; GFX6-NEXT: v_or_b32_e32 v6, v4, v6
 ; GFX6-NEXT: v_or_b32_e32 v7, v5, v7
 ; GFX6-NEXT: v_ashr_i64 v[4:5], v[10:11], s7
 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6
-; GFX6-NEXT: s_and_b32 s6, 1, s9
 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6
-; GFX6-NEXT: s_and_b32 s5, 1, s5
+; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX6-NEXT: s_and_b32 s4, 1, s4
+; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v11
 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
-; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v15, v12, vcc
 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v15, v13, vcc
 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0, v4
 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GFX6-NEXT: v_mov_b32_e32 v12, s4
 ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v12, vcc
+; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v20, vcc
 ; GFX6-NEXT: v_and_b32_e32 v12, 1, v14
 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
@@ -6319,15 +6310,14 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v19
 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5]
 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc
 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5]
 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0, v0
 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: s_brev_b32 s4, 1
-; GFX8-NEXT: v_mov_b32_e32 v8, s4
+; GFX8-NEXT: v_bfrev_b32_e32 v20, 1
 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v20, vcc
 ; GFX8-NEXT: v_and_b32_e32 v8, 1, v10
 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc
@@ -6342,43 +6332,42 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX8-NEXT: s_cmp_lt_u32 s6, 64
 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
-; GFX8-NEXT: s_cselect_b32 s5, 1, 0
+; GFX8-NEXT: s_cselect_b32 s4, 1, 0
 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
 ; GFX8-NEXT: s_cmp_eq_u32 s6, 0
 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13]
-; GFX8-NEXT: s_cselect_b32 s9, 1, 0
+; GFX8-NEXT: s_cselect_b32 s5, 1, 0
 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15]
 ; GFX8-NEXT: v_ashrrev_i64 v[12:13], s6, v[10:11]
 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v11
+; GFX8-NEXT: s_and_b32 s5, 1, s5
 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX8-NEXT: v_xor_b32_e32 v14, v5, v4
 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], s6, v[8:9]
 ; GFX8-NEXT: v_lshlrev_b64 v[6:7], s8, v[10:11]
-; GFX8-NEXT: s_and_b32 s6, 1, s5
+; GFX8-NEXT: s_and_b32 s6, 1, s4
 ; GFX8-NEXT: v_or_b32_e32 v6, v4, v6
 ; GFX8-NEXT: v_or_b32_e32 v7, v5, v7
 ; GFX8-NEXT: v_ashrrev_i64 v[4:5], s7, v[10:11]
 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6
-; GFX8-NEXT: s_and_b32 s6, 1, s9
 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6
-; GFX8-NEXT: s_and_b32 s5, 1, s5
+; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX8-NEXT: s_and_b32 s4, 1, s4
+; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v11
 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v12, vcc
 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v13, vcc
 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0, v4
 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GFX8-NEXT: v_mov_b32_e32 v12, s4
 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v12, vcc
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v20, vcc
 ; GFX8-NEXT: v_and_b32_e32 v12, 1, v14
 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
@@ -6428,15 +6417,14 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v19
 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5]
 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v8, vcc
 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5]
 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: s_brev_b32 s4, 1
-; GFX9-NEXT: v_mov_b32_e32 v8, s4
+; GFX9-NEXT: v_bfrev_b32_e32 v20, 1
 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v20, vcc
 ; GFX9-NEXT: v_and_b32_e32 v8, 1, v10
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc
@@ -6451,43 +6439,42 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT: s_cmp_lt_u32 s6, 64
 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
-; GFX9-NEXT: s_cselect_b32 s5, 1, 0
+; GFX9-NEXT: s_cselect_b32 s4, 1, 0
 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
 ; GFX9-NEXT: s_cmp_eq_u32 s6, 0
 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13]
-; GFX9-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-NEXT: s_cselect_b32 s5, 1, 0
 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15]
 ; GFX9-NEXT: v_ashrrev_i64 v[12:13], s6, v[10:11]
 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v11
+; GFX9-NEXT: s_and_b32 s5, 1, s5
 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX9-NEXT: v_xor_b32_e32 v14, v5, v4
 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], s6, v[8:9]
 ; GFX9-NEXT: v_lshlrev_b64 v[6:7], s8, v[10:11]
-; GFX9-NEXT: s_and_b32 s6, 1, s5
+; GFX9-NEXT: s_and_b32 s6, 1, s4
 ; GFX9-NEXT: v_or_b32_e32 v6, v4, v6
 ; GFX9-NEXT: v_or_b32_e32 v7, v5, v7
 ; GFX9-NEXT: v_ashrrev_i64 v[4:5], s7, v[10:11]
 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6
-; GFX9-NEXT: s_and_b32 s6, 1, s9
 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6
-; GFX9-NEXT: s_and_b32 s5, 1, s5
+; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX9-NEXT: s_and_b32 s4, 1, s4
+; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v11
 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v12, vcc
 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v13, vcc
 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0, v4
 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX9-NEXT: v_mov_b32_e32 v12, s4
 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v12, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v20, vcc
 ; GFX9-NEXT: v_and_b32_e32 v12, 1, v14
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
@@ -6546,7 +6533,6 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s8
 ; GFX10-NEXT: v_xor_b32_e32 v9, v10, v20
-; GFX10-NEXT: s_brev_b32 s8, 1
 ; GFX10-NEXT: s_cmp_lt_u32 s5, 64
 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc_lo
@@ -6556,7 +6542,7 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, 0
 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, 0, v0, vcc_lo
-; GFX10-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, s8, v1, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
 ; GFX10-NEXT: v_sub_co_u32_e64 v8, s4, v26, v12
 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s4, v27, v13, s4
@@ -6604,7 +6590,7 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v7
 ; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, 0, v3, s4
 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v21, vcc_lo
-; GFX10-NEXT: v_add_co_ci_u32_e64 v12, s4, s8, v4, s4
+; GFX10-NEXT: v_add_co_ci_u32_e64 v12, s4, 0x80000000, v4, s4
 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v5, s5
 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v6, s5
 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v7, s5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
index 5f71277bb50e7..5b078d41e8d89 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
@@ -43,50 +43,50 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: s_lshr_b32 s5, s0, 8
+; GFX9-NEXT: ds_write_b8 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
 ; GFX9-NEXT: s_lshr_b32 s6, s0, 16
 ; GFX9-NEXT: s_lshr_b32 s7, s0, 24
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:1
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:2
+; GFX9-NEXT: v_mov_b32_e32 v0, s7
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:3
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
 ; GFX9-NEXT: s_lshr_b32 s0, s1, 8
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: s_lshr_b32 s4, s1, 16
 ; GFX9-NEXT: s_lshr_b32 s5, s1, 24
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v6, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:5
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:6
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:7
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
 ; GFX9-NEXT: s_lshr_b32 s0, s2, 8
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:8
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: s_lshr_b32 s1, s2, 16
-; GFX9-NEXT: v_mov_b32_e32 v7, s4
-; GFX9-NEXT: v_mov_b32_e32 v4, s7
-; GFX9-NEXT: v_mov_b32_e32 v8, s5
-; GFX9-NEXT: ds_write_b8 v1, v0
-; GFX9-NEXT: ds_write_b8 v1, v2 offset:1
-; GFX9-NEXT: ds_write_b8 v1, v3 offset:2
-; GFX9-NEXT: ds_write_b8 v1, v4 offset:3
-; GFX9-NEXT: ds_write_b8 v1, v5 offset:4
-; GFX9-NEXT: ds_write_b8 v1, v6 offset:5
-; GFX9-NEXT: ds_write_b8 v1, v7 offset:6
-; GFX9-NEXT: ds_write_b8 v1, v8 offset:7
 ; GFX9-NEXT: s_lshr_b32 s4, s2, 24
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:9
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:10
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:11
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
 ; GFX9-NEXT: s_lshr_b32 s0, s3, 8
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:12
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: s_lshr_b32 s1, s3, 16
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:13
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
 ; GFX9-NEXT: s_lshr_b32 s2, s3, 24
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s3
-; GFX9-NEXT: v_mov_b32_e32 v6, s0
-; GFX9-NEXT: v_mov_b32_e32 v7, s1
-; GFX9-NEXT: v_mov_b32_e32 v8, s2
-; GFX9-NEXT: ds_write_b8 v1, v0 offset:8
-; GFX9-NEXT: ds_write_b8 v1, v2 offset:9
-; GFX9-NEXT: ds_write_b8 v1, v3 offset:10
-; GFX9-NEXT: ds_write_b8 v1, v4 offset:11
-; GFX9-NEXT: ds_write_b8 v1, v5 offset:12
-; GFX9-NEXT: ds_write_b8 v1, v6 offset:13
-; GFX9-NEXT: ds_write_b8 v1, v7 offset:14
-; GFX9-NEXT: ds_write_b8 v1, v8 offset:15
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:14
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:15
 ; GFX9-NEXT: s_endpgm
 ;
 ; GFX7-LABEL: store_lds_v4i32_align1:
@@ -96,50 +96,50 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX7-NEXT: s_mov_b32 m0, -1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
 ; GFX7-NEXT: s_lshr_b32 s5, s0, 8
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
 ; GFX7-NEXT: s_lshr_b32 s6, s0, 16
 ; GFX7-NEXT: s_lshr_b32 s7, s0, 24
-; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:1
+; GFX7-NEXT: v_mov_b32_e32 v0, s6
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:2
+; GFX7-NEXT: v_mov_b32_e32 v0, s7
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:3
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
 ; GFX7-NEXT: s_lshr_b32 s0, s1, 8
-; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:4
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
 ; GFX7-NEXT: s_lshr_b32 s4, s1, 16
 ; GFX7-NEXT: s_lshr_b32 s5, s1, 24
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-NEXT: v_mov_b32_e32 v6, s0
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:5
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:6
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:7
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
 ; GFX7-NEXT: s_lshr_b32 s0, s2, 8
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:8
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
 ; GFX7-NEXT: s_lshr_b32 s1, s2, 16
-; GFX7-NEXT: v_mov_b32_e32 v7, s4
-; GFX7-NEXT: v_mov_b32_e32 v4, s7
-; GFX7-NEXT: v_mov_b32_e32 v8, s5
-; GFX7-NEXT: ds_write_b8 v1, v0
-; GFX7-NEXT: ds_write_b8 v1, v2 offset:1
-; GFX7-NEXT: ds_write_b8 v1, v3 offset:2
-; GFX7-NEXT: ds_write_b8 v1, v4 offset:3
-; GFX7-NEXT: ds_write_b8 v1, v5 offset:4
-; GFX7-NEXT: ds_write_b8 v1, v6 offset:5
-; GFX7-NEXT: ds_write_b8 v1, v7 offset:6
-; GFX7-NEXT: ds_write_b8 v1, v8 offset:7
 ; GFX7-NEXT: s_lshr_b32 s4, s2, 24
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:9
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:10
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:11
+; GFX7-NEXT: v_mov_b32_e32 v0, s3
 ; GFX7-NEXT: s_lshr_b32 s0, s3, 8
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:12
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
 ; GFX7-NEXT: s_lshr_b32 s1, s3, 16
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:13
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
 ; GFX7-NEXT: s_lshr_b32 s2, s3, 24
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s3
-; GFX7-NEXT: v_mov_b32_e32 v6, s0
-; GFX7-NEXT: v_mov_b32_e32 v7, s1
-; GFX7-NEXT: v_mov_b32_e32 v8, s2
-; GFX7-NEXT: ds_write_b8 v1, v0 offset:8
-; GFX7-NEXT: ds_write_b8 v1, v2 offset:9
-; GFX7-NEXT: ds_write_b8 v1, v3 offset:10
-; GFX7-NEXT: ds_write_b8 v1, v4 offset:11
-; GFX7-NEXT: ds_write_b8 v1, v5 offset:12
-; GFX7-NEXT: ds_write_b8 v1, v6 offset:13
-; GFX7-NEXT: ds_write_b8 v1, v7 offset:14
-; GFX7-NEXT: ds_write_b8 v1, v8 offset:15
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:14
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:15
 ; GFX7-NEXT: s_endpgm
 store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
 ret void
@@ -152,26 +152,26 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: s_lshr_b32 s5, s0, 16
 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_lshr_b32 s5, s0, 16
+; GFX9-NEXT: ds_write_b16 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: ds_write_b16 v1, v0 offset:2
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
 ; GFX9-NEXT: s_lshr_b32 s0, s1, 16
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NEXT: ds_write_b16 v1, v0 offset:4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: ds_write_b16 v1, v0 offset:6
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
 ; GFX9-NEXT: s_lshr_b32 s0, s2, 16
-; GFX9-NEXT: v_mov_b32_e32 v6, s0
+; GFX9-NEXT: ds_write_b16 v1, v0 offset:8
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: ds_write_b16 v1, v0 offset:10
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
 ; GFX9-NEXT: s_lshr_b32 s0, s3, 16
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v5, s2
-; GFX9-NEXT: v_mov_b32_e32 v7, s3
-; GFX9-NEXT: v_mov_b32_e32 v8, s0
-; GFX9-NEXT: ds_write_b16 v1, v0
-; GFX9-NEXT: ds_write_b16 v1, v2 offset:2
-; GFX9-NEXT: ds_write_b16 v1, v3 offset:4
-; GFX9-NEXT: ds_write_b16 v1, v4 offset:6
-; GFX9-NEXT: ds_write_b16 v1, v5 offset:8
-; GFX9-NEXT: ds_write_b16 v1, v6 offset:10
-; GFX9-NEXT: ds_write_b16 v1, v7 offset:12
-; GFX9-NEXT: ds_write_b16 v1, v8 offset:14
+; GFX9-NEXT: ds_write_b16 v1, v0 offset:12
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: ds_write_b16 v1, v0 offset:14
 ; GFX9-NEXT: s_endpgm
 ;
 ; GFX7-LABEL: store_lds_v4i32_align2:
@@ -181,26 +181,26 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
 ; GFX7-NEXT: s_mov_b32 m0, -1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
-; GFX7-NEXT: s_lshr_b32 s5, s0, 16
 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: s_lshr_b32 s5, s0, 16
+; GFX7-NEXT: ds_write_b16 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: ds_write_b16 v1, v0 offset:2
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
 ; GFX7-NEXT: s_lshr_b32 s0, s1, 16
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: ds_write_b16 v1, v0 offset:4
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: ds_write_b16 v1, v0 offset:6
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
 ; GFX7-NEXT: s_lshr_b32 s0, s2, 16
-; GFX7-NEXT: v_mov_b32_e32 v6, s0
+; GFX7-NEXT: ds_write_b16 v1, v0 offset:8
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: ds_write_b16 v1, v0 offset:10
+; GFX7-NEXT: v_mov_b32_e32 v0, s3
 ; GFX7-NEXT: s_lshr_b32 s0, s3, 16
-; GFX7-NEXT: v_mov_b32_e32 v2, s5
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v5, s2
-; GFX7-NEXT: v_mov_b32_e32 v7, s3
-; GFX7-NEXT: v_mov_b32_e32 v8, s0
-; GFX7-NEXT: ds_write_b16 v1, v0
-; GFX7-NEXT: ds_write_b16 v1, v2 offset:2
-; GFX7-NEXT: ds_write_b16 v1, v3 offset:4
-; GFX7-NEXT: ds_write_b16 v1, v4 offset:6
-; GFX7-NEXT: ds_write_b16 v1, v5 offset:8
-; GFX7-NEXT: ds_write_b16 v1, v6 offset:10
-; GFX7-NEXT: ds_write_b16 v1, v7 offset:12
-; GFX7-NEXT: ds_write_b16 v1, v8 offset:14
+; GFX7-NEXT: ds_write_b16 v1, v0 offset:12
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: ds_write_b16 v1, v0 offset:14
 ; GFX7-NEXT: s_endpgm
 store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2
 ret void
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
index e96a5163e92f3..538c146601bda 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
@@ -41,39 +41,39 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8
+; GFX9-NEXT: ds_write_b8 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
 ; GFX9-NEXT: s_lshr_b32 s5, s0, 16
 ; GFX9-NEXT: s_lshr_b32 s6, s0, 24
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:1
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:2
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:3
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
 ; GFX9-NEXT: s_lshr_b32 s0, s1, 8
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16
 ; GFX9-NEXT: s_lshr_b32 s4, s1, 24
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v6, s0
-; GFX9-NEXT: v_mov_b32_e32 v7, s3
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: v_mov_b32_e32 v4, s6
-; GFX9-NEXT: v_mov_b32_e32 v8, s4
-; GFX9-NEXT: ds_write_b8 v1, v0
-; GFX9-NEXT: ds_write_b8 v1, v2 offset:1
-; GFX9-NEXT: ds_write_b8 v1, v3 offset:2
-; GFX9-NEXT: ds_write_b8 v1, v4 offset:3
-; GFX9-NEXT: ds_write_b8 v1, v5 offset:4
-; GFX9-NEXT: ds_write_b8 v1, v6 offset:5
-; GFX9-NEXT: ds_write_b8 v1, v7 offset:6
-; GFX9-NEXT: ds_write_b8 v1, v8 offset:7
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:5
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:6
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:7
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
 ; GFX9-NEXT: s_lshr_b32 s0, s2, 8
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:8
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: s_lshr_b32 s1, s2, 16
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:9
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
 ; GFX9-NEXT: s_lshr_b32 s3, s2, 24
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-NEXT: ds_write_b8 v1, v0 offset:8
-; GFX9-NEXT: ds_write_b8 v1, v2 offset:9
-; GFX9-NEXT: ds_write_b8 v1, v3 offset:10
-; GFX9-NEXT: ds_write_b8 v1, v4 offset:11
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:10
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: ds_write_b8 v1, v0 offset:11
 ; GFX9-NEXT: s_endpgm
 ;
 ; GFX7-LABEL: store_lds_v3i32_align1:
@@ -83,39 +83,39 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX7-NEXT: s_mov_b32 m0, -1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
 ; GFX7-NEXT: s_lshr_b32 s3, s0, 8
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s3
 ; GFX7-NEXT: s_lshr_b32 s5, s0, 16
 ; GFX7-NEXT: s_lshr_b32 s6, s0, 24
-; GFX7-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-NEXT: v_mov_b32_e32 v2, s3
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:1
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:2
+; GFX7-NEXT: v_mov_b32_e32 v0, s6
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:3
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
 ; GFX7-NEXT: s_lshr_b32 s0, s1, 8
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:4
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
 ; GFX7-NEXT: s_lshr_b32 s3, s1, 16
 ; GFX7-NEXT: s_lshr_b32 s4, s1, 24
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-NEXT: v_mov_b32_e32 v6, s0
-; GFX7-NEXT: v_mov_b32_e32 v7, s3
-; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: v_mov_b32_e32 v4, s6
-; GFX7-NEXT: v_mov_b32_e32 v8, s4
-; GFX7-NEXT: ds_write_b8 v1, v0
-; GFX7-NEXT: ds_write_b8 v1, v2 offset:1
-; GFX7-NEXT: ds_write_b8 v1, v3 offset:2
-; GFX7-NEXT: ds_write_b8 v1, v4 offset:3
-; GFX7-NEXT: ds_write_b8 v1, v5 offset:4
-; GFX7-NEXT: ds_write_b8 v1, v6 offset:5
-; GFX7-NEXT: ds_write_b8 v1, v7 offset:6
-; GFX7-NEXT: ds_write_b8 v1, v8 offset:7
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:5
+; GFX7-NEXT: v_mov_b32_e32 v0, s3
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:6
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:7
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
 ; GFX7-NEXT: s_lshr_b32 s0, s2, 8
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:8
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
 ; GFX7-NEXT: s_lshr_b32 s1, s2, 16
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:9
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
 ; GFX7-NEXT: s_lshr_b32 s3, s2, 24
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v4, s3
-; GFX7-NEXT: ds_write_b8 v1, v0 offset:8
-; GFX7-NEXT: ds_write_b8 v1, v2 offset:9
-; GFX7-NEXT: ds_write_b8 v1, v3 offset:10
-; GFX7-NEXT: ds_write_b8 v1, v4 offset:11
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:10
+; GFX7-NEXT: v_mov_b32_e32 v0, s3
+; GFX7-NEXT: ds_write_b8 v1, v0 offset:11
 ; GFX7-NEXT: s_endpgm
 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
 ret void
@@ -128,21 +128,21 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: s_lshr_b32 s3, s0, 16
 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_lshr_b32 s3, s0, 16
+; GFX9-NEXT: ds_write_b16 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: ds_write_b16 v1, v0 offset:2
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
 ; GFX9-NEXT: s_lshr_b32 s0, s1, 16
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NEXT: ds_write_b16 v1, v0 offset:4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: ds_write_b16 v1, v0 offset:6
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
 ; GFX9-NEXT: s_lshr_b32 s0, s2, 16
-; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v5, s2
-; GFX9-NEXT: v_mov_b32_e32 v6, s0
-; GFX9-NEXT: ds_write_b16 v1, v0
-; GFX9-NEXT: ds_write_b16 v1, v2 offset:2
-; GFX9-NEXT: ds_write_b16 v1, v3 offset:4
-; GFX9-NEXT: ds_write_b16 v1, v4 offset:6
-; GFX9-NEXT: ds_write_b16 v1, v5 offset:8
-; GFX9-NEXT: ds_write_b16 v1, v6 offset:10
+; GFX9-NEXT: ds_write_b16 v1, v0 offset:8
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: ds_write_b16 v1, v0 offset:10
 ; GFX9-NEXT: s_endpgm
 ;
 ; GFX7-LABEL: store_lds_v3i32_align2:
@@ -152,21 +152,21 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
 ; GFX7-NEXT: s_mov_b32 m0, -1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
-; GFX7-NEXT: s_lshr_b32 s3, s0, 16
 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: s_lshr_b32 s3, s0, 16
+; GFX7-NEXT: ds_write_b16 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s3
+; GFX7-NEXT: ds_write_b16 v1, v0 offset:2
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
 ; GFX7-NEXT: s_lshr_b32 s0, s1, 16
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: ds_write_b16 v1, v0 offset:4
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: ds_write_b16 v1, v0 offset:6
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
 ; GFX7-NEXT: s_lshr_b32 s0, s2, 16
-; GFX7-NEXT: v_mov_b32_e32 v2, s3
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v5, s2
-; GFX7-NEXT: v_mov_b32_e32 v6, s0
-; GFX7-NEXT: ds_write_b16 v1, v0
-; GFX7-NEXT: ds_write_b16 v1, v2 offset:2
-; GFX7-NEXT: ds_write_b16 v1, v3 offset:4
-; GFX7-NEXT: ds_write_b16 v1, v4 offset:6
-; GFX7-NEXT: ds_write_b16 v1, v5 offset:8
-; GFX7-NEXT: ds_write_b16 v1, v6 offset:10
+; GFX7-NEXT: ds_write_b16 v1, v0 offset:8
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: ds_write_b16 v1, v0 offset:10
 ; GFX7-NEXT: s_endpgm
 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2
 ret void
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
index 54eebc9205796..6e0ffe656dfa2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
@@ -34,20 +34,16 @@ define i32 @v_udiv_i32(i32 %num, i32 %den) {
 ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1
 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; CGP-NEXT: v_mul_lo_u32 v4, v0, 0
 ; CGP-NEXT: v_rcp_f32_e32 v2, v2
 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2
-; CGP-NEXT: v_mul_lo_u32 v5, v2, 0
-; CGP-NEXT: v_mul_lo_u32 v6, 0, v3
+; CGP-NEXT: v_mul_lo_u32 v4, 0, v3
 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3
 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
 ; CGP-NEXT: v_mul_lo_u32 v3, 0, v2
 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
 ; CGP-NEXT: v_mul_lo_u32 v3, v2, v1
 ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2
@@ -95,22 +91,17 @@ define amdgpu_ps i32 @s_udiv_i32(i32 inreg %num, i32 inreg %den) {
 ; CGP-LABEL: s_udiv_i32:
 ; CGP: ; %bb.0:
 ; CGP-NEXT: v_cvt_f32_u32_e32 v0, s1
-; CGP-NEXT: s_sub_i32 s4, 0, s1
-; CGP-NEXT: s_bfe_u64 s[2:3], s[0:1], 0x200000
+; CGP-NEXT: s_sub_i32 s2, 0, s1
 ; CGP-NEXT: v_rcp_f32_e32 v0, v0
-; CGP-NEXT: v_mul_lo_u32 v1, s2, 0
 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT: v_mul_lo_u32 v2, s4, v0
-; CGP-NEXT: v_mul_lo_u32 v3, v0, 0
-; CGP-NEXT: v_mul_lo_u32 v4, 0, v2
-; CGP-NEXT: v_mul_hi_u32 v2, v0, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CGP-NEXT: v_mul_lo_u32 v2, s3, v0
-; CGP-NEXT: v_mul_hi_u32 v0, s2, v0
+; CGP-NEXT: v_mul_lo_u32 v1, s2, v0
+; CGP-NEXT: v_mul_lo_u32 v2, 0, v1
+; CGP-NEXT: v_mul_hi_u32 v1, v0, v1
 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CGP-NEXT: v_mul_lo_u32 v1, 0, v0
+; CGP-NEXT: v_mul_hi_u32 v0, s0, v0
 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0
 ; CGP-NEXT: v_mul_lo_u32 v1, v0, s1
 ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0
@@ -178,36 +169,28 @@ define <2 x i32> @v_udiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; CGP-NEXT: v_mul_lo_u32 v6, v0, 0
-; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3
-; CGP-NEXT: v_sub_i32_e32 v8, vcc, 0, v3
-; CGP-NEXT: v_mul_lo_u32 v9, v1, 0
+; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
 ; CGP-NEXT: v_rcp_f32_e32 v4, v4
-; CGP-NEXT: v_rcp_f32_e32 v7, v7
+; CGP-NEXT: v_rcp_f32_e32 v6, v6
 ; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7
+; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7
+; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
-; CGP-NEXT: v_mul_lo_u32 v10, v4, 0
-; CGP-NEXT: v_mul_lo_u32 v8, v8, v7
-; CGP-NEXT: v_mul_lo_u32 v11, v7, 0
-; CGP-NEXT: v_mul_lo_u32 v12, 0, v5
+; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
+; CGP-NEXT: v_mul_lo_u32 v8, 0, v5
 ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT: v_mul_lo_u32 v13, 0, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v7, v8
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8
+; CGP-NEXT: v_mul_lo_u32 v9, 0, v7
+; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7
 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v8
-; CGP-NEXT: v_mul_lo_u32 v7, 0, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; CGP-NEXT: v_mul_lo_u32 v6, 0, v4
 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
-; CGP-NEXT: v_mul_lo_u32 v8, 0, v5
+; CGP-NEXT: v_mul_lo_u32 v7, 0, v5
 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9
 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
 ; CGP-NEXT: v_mul_lo_u32 v6, v4, v2
@@ -553,42 +536,34 @@ define <2 x i32> @v_udiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; CGP: ; %bb.0:
 ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT: s_movk_i32 s4, 0x1000
-; CGP-NEXT: v_mul_lo_u32 v4, v0, 0
-; CGP-NEXT: v_mul_lo_u32 v5, v1, 0
 ; CGP-NEXT: v_lshl_b32_e32 v2, s4, v2
 ; CGP-NEXT: v_lshl_b32_e32 v3, s4, v3
-; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v2
-; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3
-; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
+; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
+; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; CGP-NEXT: v_rcp_f32_e32 v4, v4
 ; CGP-NEXT: v_rcp_f32_e32 v6, v6
-; CGP-NEXT: v_rcp_f32_e32 v8, v8
+; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
 ; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; CGP-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8
+; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT: v_mul_lo_u32 v10, v6, 0
-; CGP-NEXT: v_mul_lo_u32 v9, v9, v8
-; CGP-NEXT: v_mul_lo_u32 v11, v8, 0
-; CGP-NEXT: v_mul_lo_u32 v12,
0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_mul_lo_u32 v13, 0, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 +; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 +; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v6, v4, v2 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4 ; CGP-NEXT: v_mul_lo_u32 v8, v5, v3 @@ -651,20 +626,16 @@ define i32 @v_udiv_i32_24bit(i32 %num, i32 %den) { ; CGP-NEXT: v_and_b32_e32 v1, s4, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; CGP-NEXT: v_mul_lo_u32 v4, v0, 0 ; CGP-NEXT: v_rcp_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v2, 0 -; CGP-NEXT: v_mul_lo_u32 v6, 0, v3 +; CGP-NEXT: v_mul_lo_u32 v4, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_lo_u32 v3, 0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v2, v1 ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 @@ -742,36 +713,28 @@ define <2 x i32> @v_udiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) { ; CGP-NEXT: v_and_b32_e32 v3, s4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 -; CGP-NEXT: v_mul_lo_u32 v6, v0, 0 -; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3 -; CGP-NEXT: v_sub_i32_e32 v8, vcc, 0, v3 -; CGP-NEXT: v_mul_lo_u32 v9, v1, 0 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 +; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 ; CGP-NEXT: v_rcp_f32_e32 v4, v4 -; CGP-NEXT: v_rcp_f32_e32 v7, v7 +; CGP-NEXT: v_rcp_f32_e32 v6, v6 ; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 -; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 +; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v4, 0 -; CGP-NEXT: v_mul_lo_u32 v8, v8, v7 -; CGP-NEXT: v_mul_lo_u32 v11, v7, 0 -; CGP-NEXT: v_mul_lo_u32 v12, 0, v5 +; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 +; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 -; CGP-NEXT: v_mul_lo_u32 v13, 0, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; 
CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v8 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 +; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 +; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v6, v4, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index 559d116602e50..f0984a2397368 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -323,9 +323,10 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: BB1_2: ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: BB1_3: ; %Flow -; CHECK-NEXT: s_and_b32 s1, s5, 1 +; CHECK-NEXT: s_xor_b32 s1, s5, -1 +; CHECK-NEXT: s_and_b32 s1, s1, 1 ; CHECK-NEXT: s_cmp_lg_u32 s1, 0 -; CHECK-NEXT: s_cbranch_scc0 BB1_5 +; CHECK-NEXT: s_cbranch_scc1 BB1_5 ; CHECK-NEXT: ; %bb.4: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2 ; CHECK-NEXT: s_sub_i32 s1, 0, s2 @@ -962,22 +963,22 @@ define i64 @v_udiv_i64_pow2k_denom(i64 %num) { ; CHECK-LABEL: v_udiv_i64_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s6, 0x1000 -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v2, 0 -; CHECK-NEXT: s_mov_b32 s7, 0xfffff000 -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s6 -; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v2 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v3 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 +; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 +; CHECK-NEXT: s_movk_i32 s6, 0xf000 +; CHECK-NEXT: s_movk_i32 s7, 0x1000 +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v3, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v4, s7, v3 -; CHECK-NEXT: v_mul_lo_u32 v5, s7, v2 +; CHECK-NEXT: v_mul_lo_u32 v4, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v5, s6, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, s7, v2 +; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; CHECK-NEXT: v_mul_lo_u32 v6, v3, v5 ; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 @@ -1004,10 +1005,10 @@ define i64 @v_udiv_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_addc_u32_e64 v5, s[4:5], v3, v4, vcc ; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v3, v4 -; CHECK-NEXT: v_mul_lo_u32 v4, s7, v2 +; CHECK-NEXT: v_mul_lo_u32 v4, s6, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, s7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, s7, v5 +; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, s6, v5 ; CHECK-NEXT: v_mul_lo_u32 v9, v5, v4 ; CHECK-NEXT: v_mul_hi_u32 v10, v2, v4 ; CHECK-NEXT: v_mul_hi_u32 v4, v5, v4 @@ -1054,11 +1055,11 @@ define i64 @v_udiv_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; 
CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_lo_u32 v5, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, s7, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, 0, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 +; CHECK-NEXT: v_mul_hi_u32 v7, s7, v2 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_mul_lo_u32 v4, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v4, s7, v3 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v2 ; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 @@ -1068,16 +1069,16 @@ define i64 @v_udiv_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v1, v4, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v4 -; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 +; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 +; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s7, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 +; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s7, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc @@ -1363,14 +1364,14 @@ define <2 x i64> @v_udiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-LABEL: v_udiv_v2i64_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 +; CGP-NEXT: s_movk_i32 s8, 0xf000 ; CGP-NEXT: s_movk_i32 s10, 0x1000 -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; CGP-NEXT: v_cvt_f32_u32_e32 v5, s10 -; CGP-NEXT: s_mov_b32 s8, 0xfffff000 -; CGP-NEXT: v_mov_b32_e32 v6, v5 -; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4 -; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v4 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v5 +; CGP-NEXT: v_mov_b32_e32 v6, v4 +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v6 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 @@ -1623,22 +1624,22 @@ define i64 @v_udiv_i64_oddk_denom(i64 %num) { ; CHECK-LABEL: v_udiv_i64_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v2, 0 -; CHECK-NEXT: s_mov_b32 s7, 0xffed2705 -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s6 -; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v2 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v3 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb +; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 +; CHECK-NEXT: s_mov_b32 s6, 0xffed2705 +; CHECK-NEXT: s_mov_b32 s7, 0x12d8fb +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v3, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v4, s7, v3 -; CHECK-NEXT: v_mul_lo_u32 v5, s7, v2 +; CHECK-NEXT: v_mul_lo_u32 v4, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v5, s6, v2 ; CHECK-NEXT: v_mul_lo_u32 
v6, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, s7, v2 +; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; CHECK-NEXT: v_mul_lo_u32 v6, v3, v5 ; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 @@ -1665,10 +1666,10 @@ define i64 @v_udiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_addc_u32_e64 v5, s[4:5], v3, v4, vcc ; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v3, v4 -; CHECK-NEXT: v_mul_lo_u32 v4, s7, v2 +; CHECK-NEXT: v_mul_lo_u32 v4, s6, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, s7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, s7, v5 +; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, s6, v5 ; CHECK-NEXT: v_mul_lo_u32 v9, v5, v4 ; CHECK-NEXT: v_mul_hi_u32 v10, v2, v4 ; CHECK-NEXT: v_mul_hi_u32 v4, v5, v4 @@ -1715,11 +1716,11 @@ define i64 @v_udiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_lo_u32 v5, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, s7, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, 0, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 +; CHECK-NEXT: v_mul_hi_u32 v7, s7, v2 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_mul_lo_u32 v4, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v4, s7, v3 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v2 ; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 @@ -1729,16 +1730,16 @@ define i64 @v_udiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v1, v4, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v4 -; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 +; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 +; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s7, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 +; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s7, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc @@ -2024,14 +2025,14 @@ define <2 x i64> @v_udiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-LABEL: v_udiv_v2i64_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s10, 0x12d8fb -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; CGP-NEXT: v_cvt_f32_u32_e32 v5, s10 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 ; CGP-NEXT: s_mov_b32 s8, 0xffed2705 -; CGP-NEXT: v_mov_b32_e32 v6, v5 -; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4 -; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v4 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v5 +; CGP-NEXT: s_mov_b32 s10, 0x12d8fb +; CGP-NEXT: v_mov_b32_e32 v6, v4 +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v6 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll index f331deea89e54..500e967c86d64 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll @@ -32,20 +32,16 @@ define i32 @v_urem_i32(i32 %num, i32 %den) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; CGP-NEXT: v_mul_lo_u32 v4, v0, 0 ; CGP-NEXT: v_rcp_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v2, 0 -; CGP-NEXT: v_mul_lo_u32 v6, 0, v3 +; CGP-NEXT: v_mul_lo_u32 v4, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_lo_u32 v3, 0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 @@ -89,22 +85,17 @@ define amdgpu_ps i32 @s_urem_i32(i32 inreg %num, i32 inreg %den) { ; CGP-LABEL: s_urem_i32: ; CGP: ; %bb.0: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, s1 -; CGP-NEXT: s_sub_i32 s4, 0, s1 -; CGP-NEXT: s_bfe_u64 s[2:3], s[0:1], 0x200000 +; CGP-NEXT: s_sub_i32 s2, 0, s1 ; CGP-NEXT: v_rcp_f32_e32 v0, v0 -; CGP-NEXT: v_mul_lo_u32 v1, s2, 0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CGP-NEXT: v_mul_lo_u32 v2, s4, v0 -; CGP-NEXT: v_mul_lo_u32 v3, v0, 0 -; CGP-NEXT: v_mul_lo_u32 v4, 0, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_mul_lo_u32 v2, s3, v0 -; CGP-NEXT: v_mul_hi_u32 v0, s2, v0 +; CGP-NEXT: v_mul_lo_u32 v1, s2, v0 +; CGP-NEXT: v_mul_lo_u32 v2, 0, v1 +; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_mul_lo_u32 v1, 0, v0 +; CGP-NEXT: v_mul_hi_u32 v0, s0, v0 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CGP-NEXT: v_mul_lo_u32 v0, v0, s1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 @@ -167,36 +158,28 @@ define <2 x i32> @v_urem_v2i32(<2 x i32> %num, <2 x i32> %den) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 -; CGP-NEXT: v_mul_lo_u32 v6, v0, 0 -; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3 -; CGP-NEXT: v_sub_i32_e32 v8, vcc, 0, v3 -; CGP-NEXT: v_mul_lo_u32 v9, v1, 0 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 +; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 ; CGP-NEXT: v_rcp_f32_e32 v4, v4 -; CGP-NEXT: v_rcp_f32_e32 v7, v7 +; CGP-NEXT: v_rcp_f32_e32 v6, v6 ; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 -; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 +; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v4, 0 -; CGP-NEXT: v_mul_lo_u32 v8, v8, v7 -; CGP-NEXT: v_mul_lo_u32 v11, v7, 0 -; CGP-NEXT: v_mul_lo_u32 v12, 0, v5 +; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 +; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 -; CGP-NEXT: v_mul_lo_u32 v13, 0, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; 
CGP-NEXT: v_mul_lo_u32 v9, 0, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v8 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 +; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 +; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 @@ -496,42 +479,34 @@ define <2 x i32> @v_urem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) { ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: s_movk_i32 s4, 0x1000 -; CGP-NEXT: v_mul_lo_u32 v4, v0, 0 -; CGP-NEXT: v_mul_lo_u32 v5, v1, 0 ; CGP-NEXT: v_lshl_b32_e32 v2, s4, v2 ; CGP-NEXT: v_lshl_b32_e32 v3, s4, v3 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2 -; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 -; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 +; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 +; CGP-NEXT: v_rcp_f32_e32 v4, v4 ; CGP-NEXT: v_rcp_f32_e32 v6, v6 -; CGP-NEXT: v_rcp_f32_e32 v8, v8 +; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 -; CGP-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_mul_lo_u32 v5, v5, v4 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 -; CGP-NEXT: v_mul_lo_u32 v10, v6, 0 -; CGP-NEXT: v_mul_lo_u32 v9, v9, v8 -; CGP-NEXT: v_mul_lo_u32 v11, v8, 0 -; CGP-NEXT: v_mul_lo_u32 v12, 0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_mul_lo_u32 v13, 0, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 +; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 +; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 @@ -588,20 +563,16 @@ define i32 @v_urem_i32_24bit(i32 %num, i32 %den) { ; CGP-NEXT: v_and_b32_e32 v1, s4, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; CGP-NEXT: v_mul_lo_u32 v4, v0, 0 ; CGP-NEXT: v_rcp_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 
0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v2, 0 -; CGP-NEXT: v_mul_lo_u32 v6, 0, v3 +; CGP-NEXT: v_mul_lo_u32 v4, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_lo_u32 v3, 0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 @@ -674,36 +645,28 @@ define <2 x i32> @v_urem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) { ; CGP-NEXT: v_and_b32_e32 v3, s4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 -; CGP-NEXT: v_mul_lo_u32 v6, v0, 0 -; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3 -; CGP-NEXT: v_sub_i32_e32 v8, vcc, 0, v3 -; CGP-NEXT: v_mul_lo_u32 v9, v1, 0 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 +; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 ; CGP-NEXT: v_rcp_f32_e32 v4, v4 -; CGP-NEXT: v_rcp_f32_e32 v7, v7 +; CGP-NEXT: v_rcp_f32_e32 v6, v6 ; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 -; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 +; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v4, 0 -; CGP-NEXT: v_mul_lo_u32 v8, v8, v7 -; CGP-NEXT: v_mul_lo_u32 v11, v7, 0 -; CGP-NEXT: v_mul_lo_u32 v12, 0, v5 +; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 +; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 -; CGP-NEXT: v_mul_lo_u32 v13, 0, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v8 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 +; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 +; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index 92f93185530f2..e79c300a56b84 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -319,9 +319,10 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: BB1_2: ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: BB1_3: ; %Flow -; CHECK-NEXT: s_and_b32 s1, s5, 1 +; CHECK-NEXT: s_xor_b32 s1, s5, -1 +; CHECK-NEXT: s_and_b32 s1, s1, 1 ; CHECK-NEXT: s_cmp_lg_u32 s1, 0 -; CHECK-NEXT: s_cbranch_scc0 BB1_5 +; CHECK-NEXT: s_cbranch_scc1 BB1_5 ; CHECK-NEXT: ; %bb.4: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2 ; CHECK-NEXT: s_sub_i32 s1, 0, s2 @@ -948,22 +949,22 @@ define i64 @v_urem_i64_pow2k_denom(i64 %num) { ; CHECK-LABEL: v_urem_i64_pow2k_denom: ; CHECK: ; 
%bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s6, 0x1000 -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v2, 0 -; CHECK-NEXT: s_mov_b32 s7, 0xfffff000 -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s6 -; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v2 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v3 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 +; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 +; CHECK-NEXT: s_movk_i32 s6, 0xf000 +; CHECK-NEXT: s_movk_i32 s7, 0x1000 +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v3, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v4, s7, v3 -; CHECK-NEXT: v_mul_lo_u32 v5, s7, v2 +; CHECK-NEXT: v_mul_lo_u32 v4, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v5, s6, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, s7, v2 +; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; CHECK-NEXT: v_mul_lo_u32 v6, v3, v5 ; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 @@ -990,10 +991,10 @@ define i64 @v_urem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_addc_u32_e64 v5, s[4:5], v3, v4, vcc ; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v3, v4 -; CHECK-NEXT: v_mul_lo_u32 v4, s7, v2 +; CHECK-NEXT: v_mul_lo_u32 v4, s6, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, s7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, s7, v5 +; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, s6, v5 ; CHECK-NEXT: v_mul_lo_u32 v9, v5, v4 ; CHECK-NEXT: v_mul_hi_u32 v10, v2, v4 ; CHECK-NEXT: v_mul_hi_u32 v4, v5, v4 @@ -1040,30 +1041,30 @@ define i64 @v_urem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_lo_u32 v5, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, s7, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, 0, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, s6, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, s7, v2 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_mul_lo_u32 v3, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v3, s7, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v1, v2, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 -; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 +; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CHECK-NEXT: v_subrev_i32_e32 v4, vcc, s6, v0 +; CHECK-NEXT: v_subrev_i32_e32 v4, vcc, s7, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v4 +; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s7, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; CHECK-NEXT: v_subrev_i32_e32 v7, vcc, s6, v4 +; CHECK-NEXT: v_subrev_i32_e32 v7, vcc, s7, v4 ; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc @@ 
-1343,14 +1344,14 @@ define <2 x i64> @v_urem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-LABEL: v_urem_v2i64_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 +; CGP-NEXT: s_movk_i32 s8, 0xf000 ; CGP-NEXT: s_movk_i32 s10, 0x1000 -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; CGP-NEXT: v_cvt_f32_u32_e32 v5, s10 -; CGP-NEXT: s_mov_b32 s8, 0xfffff000 -; CGP-NEXT: v_mov_b32_e32 v6, v5 -; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4 -; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v4 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v5 +; CGP-NEXT: v_mov_b32_e32 v6, v4 +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v6 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 @@ -1599,22 +1600,22 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) { ; CHECK-LABEL: v_urem_i64_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v2, 0 -; CHECK-NEXT: s_mov_b32 s7, 0xffed2705 -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s6 -; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v2 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v3 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb +; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 +; CHECK-NEXT: s_mov_b32 s6, 0xffed2705 +; CHECK-NEXT: s_mov_b32 s7, 0x12d8fb +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v3, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v4, s7, v3 -; CHECK-NEXT: v_mul_lo_u32 v5, s7, v2 +; CHECK-NEXT: v_mul_lo_u32 v4, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v5, s6, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, s7, v2 +; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; CHECK-NEXT: v_mul_lo_u32 v6, v3, v5 ; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 @@ -1641,10 +1642,10 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_addc_u32_e64 v5, s[4:5], v3, v4, vcc ; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v3, v4 -; CHECK-NEXT: v_mul_lo_u32 v4, s7, v2 +; CHECK-NEXT: v_mul_lo_u32 v4, s6, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, s7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, s7, v5 +; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, s6, v5 ; CHECK-NEXT: v_mul_lo_u32 v9, v5, v4 ; CHECK-NEXT: v_mul_hi_u32 v10, v2, v4 ; CHECK-NEXT: v_mul_hi_u32 v4, v5, v4 @@ -1691,30 +1692,30 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_lo_u32 v5, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, s7, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, 0, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, s6, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, s7, v2 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_mul_lo_u32 v3, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v3, s7, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v1, v2, 
vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 -; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 +; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CHECK-NEXT: v_subrev_i32_e32 v4, vcc, s6, v0 +; CHECK-NEXT: v_subrev_i32_e32 v4, vcc, s7, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v4 +; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s7, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; CHECK-NEXT: v_subrev_i32_e32 v7, vcc, s6, v4 +; CHECK-NEXT: v_subrev_i32_e32 v7, vcc, s7, v4 ; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc @@ -1994,14 +1995,14 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-LABEL: v_urem_v2i64_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s10, 0x12d8fb -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; CGP-NEXT: v_cvt_f32_u32_e32 v5, s10 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 ; CGP-NEXT: s_mov_b32 s8, 0xffed2705 -; CGP-NEXT: v_mov_b32_e32 v6, v5 -; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4 -; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v4 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v5 +; CGP-NEXT: s_mov_b32 s10, 0x12d8fb +; CGP-NEXT: v_mov_b32_e32 v6, v4 +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v6 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll index 4edc231fc1410..9139cd029adda 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -37,7 +37,6 @@ define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> in ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_mov_b32 s3, s2 ; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s2 @@ -121,10 +120,8 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in ; GFX8-NEXT: s_mov_b32 s5, s4 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_and_b32 s6, s1, s4 -; GFX8-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5] -; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_and_b64 s[2:3], s[6:7], s[4:5] -; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] +; GFX8-NEXT: s_xor_b64 s[2:3], s[6:7], s[4:5] ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s4 ; GFX8-NEXT: s_or_b32 s0, s1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index c44f5dd6bd594..7eec033fa2717 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -3316,13 +3316,14 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x ; 
GCN-NEXT: v_and_b32_e32 v2, s3, v3 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc ; GCN-NEXT: v_and_b32_e32 v3, s3, v4 -; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 +; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GCN-NEXT: v_or_b32_e32 v2, v2, v3 ; GCN-NEXT: v_or_b32_e32 v0, v2, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0x1fff, v1 +; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_endpgm %r = udiv <3 x i15> %x, %y store <3 x i15> %r, <3 x i15> addrspace(1)* %out @@ -3460,9 +3461,10 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x ; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GCN-NEXT: v_or_b32_e32 v2, v2, v3 ; GCN-NEXT: v_or_b32_e32 v0, v2, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0x1fff, v1 +; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_endpgm %r = urem <3 x i15> %x, %y store <3 x i15> %r, <3 x i15> addrspace(1)* %out @@ -3612,9 +3614,10 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x ; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GCN-NEXT: v_or_b32_e32 v2, v2, v3 ; GCN-NEXT: v_or_b32_e32 v0, v2, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0x1fff, v1 +; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_endpgm %r = sdiv <3 x i15> %x, %y store <3 x i15> %r, <3 x i15> addrspace(1)* %out @@ -3780,13 +3783,14 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x ; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 ; GCN-NEXT: v_and_b32_e32 v3, s3, v3 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 +; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GCN-NEXT: v_or_b32_e32 v2, v2, v3 ; GCN-NEXT: v_or_b32_e32 v0, v2, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0x1fff, v1 +; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_endpgm %r = srem <3 x i15> %x, %y store <3 x i15> %r, <3 x i15> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll index e4f0083a4685c..2c5a3f3d9ba96 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll @@ -15,27 +15,27 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 32, align 1, addrspace 4) ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 48, align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 64, align 1, addrspace 4) ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, 
addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 80, align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 96, align 1, addrspace 4) @@ -49,13 +49,13 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe ; GCN: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF3]].sub0 - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec - ; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7" + 112, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 112, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable 
load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64 ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) @@ -64,7 +64,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72 ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) @@ -73,7 +73,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) ; GCN: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80 ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) @@ -82,7 +82,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], 
[[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) ; GCN: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[DEF4]].sub0 @@ -101,7 +101,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe ; GCN: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[DEF8]].sub0 - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) ; GCN: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 96 ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) @@ -110,7 +110,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) ; GCN: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104 ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) @@ -119,7 +119,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; 
GCN: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) ; GCN: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 112 @@ -135,7 +135,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) ; GCN: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120 @@ -150,7 +150,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) ; GCN: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] @@ -164,7 +164,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], 
[[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF9:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) @@ -193,7 +193,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe ; GCN: [[DEF15:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[DEF15]].sub0 - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4) ; GCN: [[COPY45:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] @@ -207,7 +207,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) ; GCN: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152 @@ -268,10 +268,10 @@ bb.0: call void asm sideeffect "", "" () - call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 112, i1 false) #2 - call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2 - call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 1, i32 112, i1 false) #2 - call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 %arg1, i32 112, i1 false) #2 + %fadd1 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 112, i1 false) #2 + %fadd2 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2 + %fadd3 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 1, i32 112, i1 false) #2 + %fadd4 = call 
float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 %arg1, i32 112, i1 false) #2 call void asm sideeffect "", "" () @@ -392,7 +392,7 @@ declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #2 declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #2 -declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) #2 +declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) #2 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0 declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #0 declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32) #2 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 52ac3705a490e..fb1cd3bbbaf10 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -744,13 +744,13 @@ entry: ; GCN-LABEL: {{^}}tail_call_byval_align16: ; GCN-NOT: s32 -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GCN: s_getpc_b64 -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4 -; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}} +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}} ; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { @@ -777,12 +777,12 @@ entry: ; GCN-LABEL: {{^}}stack_12xv3i32: ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 ; GCN: buffer_store_dword [[REG12]], {{.*$}} +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 ; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 ; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 ; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12 ; GCN: v_mov_b32_e32 v31, 11 ; GCN: s_getpc @@ -806,12 +806,12 @@ entry: ; GCN-LABEL: {{^}}stack_12xv3f32: ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 ; GCN: buffer_store_dword [[REG12]], {{.*$}} +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 ; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 ; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 ; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12 ; GCN: v_mov_b32_e32 v31, 0x41300000 ; GCN: s_getpc @@ -836,20 +836,20 @@ entry: ; GCN-LABEL: {{^}}stack_8xv5i32: ; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 -; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 -; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: v_mov_b32_e32 
[[REG15:v[0-9]+]], 15 ; GCN: buffer_store_dword [[REG8]], {{.*$}} +; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 ; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 ; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 ; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 ; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 ; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 ; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 ; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 ; GCN: v_mov_b32_e32 v31, 7 @@ -870,20 +870,20 @@ entry: ; GCN-LABEL: {{^}}stack_8xv5f32: ; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000 -; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 -; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 ; GCN: buffer_store_dword [[REG8]], {{.*$}} +; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 ; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 ; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 ; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 ; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 ; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 ; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 ; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 ; GCN: v_mov_b32_e32 v31, 0x40e00000 diff --git a/llvm/test/CodeGen/AMDGPU/call-return-types.ll b/llvm/test/CodeGen/AMDGPU/call-return-types.ll index 8751c61dcd400..33b201bbe6d8e 100644 --- a/llvm/test/CodeGen/AMDGPU/call-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-return-types.ll @@ -30,6 +30,8 @@ declare <3 x float> @external_v3f32_func_void() #0 declare <5 x float> @external_v5f32_func_void() #0 declare <2 x double> @external_v2f64_func_void() #0 +declare <2 x i24> @external_v2i24_func_void() #0 + declare <2 x i32> @external_v2i32_func_void() #0 declare <3 x i32> @external_v3i32_func_void() #0 declare <4 x i32> @external_v4i32_func_void() #0 @@ -250,6 +252,18 @@ define amdgpu_kernel void @test_call_external_v4f16_func_void() #0 { ret void } +; GCN-LABEL: {{^}}test_call_external_v2i24_func_void: +; GCN: s_swappc_b64 +; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}v0, v1 +define amdgpu_kernel void @test_call_external_v2i24_func_void() #0 { + %val = call <2 x i24> @external_v2i24_func_void() + %elt0 = extractelement <2 x i24> %val, i32 0 + %elt1 = extractelement <2 x i24> %val, i32 1 + %add = add i24 %elt0, %elt1 + store volatile i24 %add, i24 addrspace(1)* undef + ret void +} + ; GCN-LABEL: {{^}}test_call_external_v3f32_func_void: ; GCN: s_swappc ; GFX7-DAG: flat_store_dwordx3 {{.*}}, v[0:2] diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll index 0f655dadfa11d..7d3839d213b89 100644 --- 
a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll @@ -68,7 +68,6 @@ done: declare i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* nocapture, i32) #0 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 -declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #2 attributes #0 = { argmemonly nounwind } attributes #1 = { nounwind readnone willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll index 840a4ec3dac8f..e14a35e150824 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -S -codegenprepare -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=OPT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN %s @@ -9,14 +8,14 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(float a ; OPT-LABEL: @test_sink_small_offset_global_atomic_fadd_f32( ; OPT-NEXT: entry: ; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 999999 -; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #3 +; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) [[ATTR3:#.*]] ; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0 ; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT: if: ; OPT-NEXT: [[TMP0:%.*]] = bitcast float addrspace(1)* [[IN:%.*]] to i8 addrspace(1)* ; OPT-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 28 ; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SUNKADDR]] to float addrspace(1)* -; OPT-NEXT: call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* [[TMP1]], float 2.000000e+00) +; OPT-NEXT: [[FADD2:%.*]] = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* [[TMP1]], float 2.000000e+00) ; OPT-NEXT: [[VAL:%.*]] = load volatile float, float addrspace(1)* undef, align 4 ; OPT-NEXT: br label [[ENDIF]] ; OPT: endif: @@ -57,7 +56,7 @@ entry: br i1 %cmp, label %endif, label %if if: - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %in.gep, float 2.0) + %fadd2 = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %in.gep, float 2.0) %val = load volatile float, float addrspace(1)* undef br label %endif @@ -71,7 +70,7 @@ done: } declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 -declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #2 +declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #2 attributes #0 = { argmemonly nounwind } attributes #1 = { nounwind readnone willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll index bc3bcfe6089af..566899486d954 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -31,9 +31,7 @@ bb: %la3 = getelementptr inbounds i32, i32* %lb, i32 6 %ld3 = load i32, i32* %la3 -; DBG: Cluster ld/st SU([[S1:[0-9]+]]) - SU([[S2:[0-9]+]]) -; DBG: Cluster ld/st SU([[S2]]) - SU([[S3:[0-9]+]]) -; DBG: Cluster ld/st SU([[S3]]) - SU([[S4:[0-9]+]]) +; DBG-NOT: Cluster ld/st ; 
GCN: flat_store_dword v[{{[0-9:]+}}], [[LD1]] ; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD2]] offset:8 ; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD3]] offset:16 @@ -78,13 +76,11 @@ bb: %la3 = getelementptr inbounds i32, i32* %lb, i32 6 %ld3 = load i32, i32* %la3 -; DBG: Cluster ld/st SU([[S1:[0-9]+]]) - SU([[S2:[0-9]+]]) -; DBG: Cluster ld/st SU([[S2]]) - SU([[S3:[0-9]+]]) -; DBG: Cluster ld/st SU([[S3]]) - SU([[S4:[0-9]+]]) -; GCN: v_add_u32_e32 [[ST2:v[0-9]+]], 1, [[LD2]] +; DBG-NOT: Cluster ld/st ; GCN: flat_store_dword v[{{[0-9:]+}}], [[LD1]] -; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[ST2]] offset:8 +; GCN: v_add_u32_e32 [[ST2:v[0-9]+]], 1, [[LD2]] ; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD3]] offset:16 +; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[ST2]] offset:8 ; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD4]] offset:24 %sa0 = getelementptr inbounds i32, i32* %sb, i32 0 store i32 %ld0, i32* %sa0 @@ -125,7 +121,6 @@ entry: ; CHECK-LABEL: {{^}}no_cluster_image_load: ; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16 ; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16 -; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16 ; DBG-NOT: {{^}}Cluster ld/st define amdgpu_ps void @no_cluster_image_load(<8 x i32> inreg %src1, <8 x i32> inreg %src2, <8 x i32> inreg %dst, i32 %x, i32 %y) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/elf-notes.ll b/llvm/test/CodeGen/AMDGPU/elf-notes.ll index 3a73b91249d51..0c76f00590264 100644 --- a/llvm/test/CodeGen/AMDGPU/elf-notes.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-notes.ll @@ -31,8 +31,8 @@ ; OSABI-HSA: .amd_amdgpu_hsa_metadata ; OSABI-HSA-NOT: .amd_amdgpu_pal_metadata -; OSABI-HSA-ELF: Unknown note type (0x00000001) -; OSABI-HSA-ELF: Unknown note type (0x00000003) +; OSABI-HSA-ELF: Unknown note type: (0x00000001) +; OSABI-HSA-ELF: Unknown note type: (0x00000003) ; OSABI-HSA-ELF: NT_AMD_AMDGPU_ISA (ISA Version) ; OSABI-HSA-ELF: ISA Version: ; OSABI-HSA-ELF: amdgcn-amd-amdhsa--gfx802 @@ -59,7 +59,7 @@ ; OSABI-PAL-NOT: .amd_amdgpu_hsa_metadata ; OSABI-PAL: .amd_amdgpu_pal_metadata -; OSABI-PAL-ELF: Unknown note type (0x00000003) +; OSABI-PAL-ELF: Unknown note type: (0x00000003) ; OSABI-PAL-ELF: NT_AMD_AMDGPU_ISA (ISA Version) ; OSABI-PAL-ELF: ISA Version: ; OSABI-PAL-ELF: amdgcn-amd-amdpal--gfx802 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index badaa16bbfcc5..05f0bafb47c74 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -11,7 +11,7 @@ ; R600-NOT: AND ; R600: |PV.{{[XYZW]}}| -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff +; SI: s_bitset0_b32 s{{[0-9]+}}, 31 ; VI: s_bitset0_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) { %bc= bitcast i32 %in to float @@ -24,7 +24,7 @@ define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) { ; R600-NOT: AND ; R600: |PV.{{[XYZW]}}| -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff +; SI: s_bitset0_b32 s{{[0-9]+}}, 31 ; VI: s_bitset0_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) { %bc= bitcast i32 %in to float @@ -36,7 +36,7 @@ define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) { ; FUNC-LABEL: {{^}}s_fabs_f32: ; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff +; SI: s_bitset0_b32 s{{[0-9]+}}, 31 ; VI: s_bitset0_b32 
s{{[0-9]+}}, 31 define amdgpu_kernel void @s_fabs_f32(float addrspace(1)* %out, float %in) { %fabs = call float @llvm.fabs.f32(float %in) diff --git a/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll b/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll index e52fcc747a710..710bfa9744ad9 100644 --- a/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll @@ -8,12 +8,12 @@ ; have the instruction available. ; FIXME: Should also really make sure the v2f16 version fails. -; FAIL: LLVM ERROR: Cannot select: {{.+}}: ch = BUFFER_ATOMIC_FADD +; FAIL: LLVM ERROR: Cannot select: {{.+}}: f32,ch = BUFFER_ATOMIC_FADD define amdgpu_cs void @atomic_fadd(<4 x i32> inreg %arg0) { - call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %arg0, i32 0, i32 112, i1 false) + %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %arg0, i32 0, i32 112, i1 false) ret void } -declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1 immarg) #0 +declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index 76490407c7447..3b6396f8b63fc 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -156,28 +156,28 @@ define amdgpu_kernel void @global_store_2xi16_align1(i16 addrspace(1)* %p, i16 a ; GFX7-ALIGNED-LABEL: global_store_2xi16_align1: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 1 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v5, 0 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2 ; GFX7-ALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-ALIGNED-NEXT: s_add_u32 s4, s0, 1 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-ALIGNED-NEXT: s_add_u32 s4, s0, 1 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v4 -; GFX7-ALIGNED-NEXT: flat_store_byte v[2:3], v5 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3 ; GFX7-ALIGNED-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 2 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v5 -; GFX7-ALIGNED-NEXT: flat_store_byte v[2:3], v4 +; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 2 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2 ; GFX7-ALIGNED-NEXT: s_endpgm ; ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1: diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll index 85f9ea173eb5e..3a4778333001d 100644 --- 
a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -73,9 +73,9 @@ define void @private_store_2xi16_align2(i16 addrspace(5)* %p, i16 addrspace(5)* ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, 2 ; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen -; GFX9-NEXT: buffer_store_short v2, v1, s[0:3], 0 offen offset:2 +; GFX9-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 @@ -140,14 +140,14 @@ define void @private_store_2xi16_align1(i16 addrspace(5)* %p, i16 addrspace(5)* ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 1 -; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1 -; GFX7-ALIGNED-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v5, 0 ; GFX7-ALIGNED-NEXT: buffer_store_byte v3, v1, s[0:3], 0 offen -; GFX7-ALIGNED-NEXT: buffer_store_byte v5, v4, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1 +; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 1, v1 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 0 ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, 2 -; GFX7-ALIGNED-NEXT: buffer_store_byte v5, v1, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v3, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v1, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: buffer_store_byte v0, v2, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir b/llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir new file mode 100644 index 0000000000000..32de262837816 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir @@ -0,0 +1,185 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=regallocfast -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: self_loop_single_def_use +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' +body: | + ; GCN-LABEL: name: self_loop_single_def_use + ; GCN: bb.0: + ; GCN: successors: %bb.1(0x80000000) + ; GCN: liveins: $vgpr0_vgpr1 + ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5) + ; GCN: bb.1: + ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5) + ; GCN: renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec + ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec + ; GCN: bb.2: + ; GCN: S_ENDPGM 0 + bb.0: + liveins: $vgpr0_vgpr1 + %0:vreg_64 = COPY $vgpr0_vgpr1 + + bb.1: + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec + S_CBRANCH_EXECZ %bb.1, implicit $exec + + bb.2: + 
S_ENDPGM 0 + +... + +--- +name: self_loop_multi_def +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' +body: | + ; GCN-LABEL: name: self_loop_multi_def + ; GCN: bb.0: + ; GCN: successors: %bb.1(0x80000000) + ; GCN: liveins: $vgpr0_vgpr1 + ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5) + ; GCN: bb.1: + ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5) + ; GCN: renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec + ; GCN: renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec + ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec + ; GCN: bb.2: + ; GCN: S_ENDPGM 0 + bb.0: + liveins: $vgpr0_vgpr1 + %0:vreg_64 = COPY $vgpr0_vgpr1 + + bb.1: + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec + S_CBRANCH_EXECZ %bb.1, implicit $exec + + bb.2: + S_ENDPGM 0 + +... + +# There's a single def inside the self loop, but it's also a use. + +--- +name: self_loop_def_use_same_inst +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' +body: | + ; GCN-LABEL: name: self_loop_def_use_same_inst + ; GCN: bb.0: + ; GCN: successors: %bb.1(0x80000000) + ; GCN: liveins: $vgpr0_vgpr1 + ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5) + ; GCN: bb.1: + ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN: renamable $vgpr0 = V_ADD_U32_e32 1, undef $vgpr0, implicit $exec + ; GCN: $vgpr1_vgpr2 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5) + ; GCN: GLOBAL_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 0, 0, 0, implicit $exec + ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec + ; GCN: bb.2: + ; GCN: S_ENDPGM 0 + bb.0: + liveins: $vgpr0_vgpr1 + %0:vreg_64 = COPY $vgpr0_vgpr1 + + bb.1: + %1:vgpr_32 = V_ADD_U32_e32 1, undef %1, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec + S_CBRANCH_EXECZ %bb.1, implicit $exec + + bb.2: + S_ENDPGM 0 + +... 
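+
+# Note on the next case (a descriptive comment, not from the original
+# patch): the only def of %1 comes after its undef use, so no reload of
+# %1 is needed inside the loop; the freshly defined value is spilled at
+# the bottom of the block instead (the SI_SPILL_V32_SAVE in the checks).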
+ +--- +name: self_loop_def_after_use +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' +body: | + ; GCN-LABEL: name: self_loop_def_after_use + ; GCN: bb.0: + ; GCN: successors: %bb.1(0x80000000) + ; GCN: liveins: $vgpr0_vgpr1 + ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5) + ; GCN: bb.1: + ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5) + ; GCN: GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, undef renamable $vgpr0, 0, 0, 0, 0, implicit $exec + ; GCN: renamable $vgpr2 = V_ADD_U32_e64 1, 1, 0, implicit $exec + ; GCN: SI_SPILL_V32_SAVE killed $vgpr2, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5) + ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec + ; GCN: bb.2: + ; GCN: S_ENDPGM 0 + bb.0: + liveins: $vgpr0_vgpr1 + %0:vreg_64 = COPY $vgpr0_vgpr1 + + bb.1: + GLOBAL_STORE_DWORD %0, undef %1, 0, 0, 0, 0, implicit $exec + %1:vgpr_32 = V_ADD_U32_e64 1, 1, 0, implicit $exec + S_CBRANCH_EXECZ %bb.1, implicit $exec + + bb.2: + S_ENDPGM 0 + +... + +--- +name: self_loop_single_subreg_def_use +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' +body: | + ; GCN-LABEL: name: self_loop_single_subreg_def_use + ; GCN: bb.0: + ; GCN: successors: %bb.1(0x80000000) + ; GCN: liveins: $vgpr0_vgpr1 + ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5) + ; GCN: bb.1: + ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5) + ; GCN: undef renamable $vgpr3 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr2_vgpr3 + ; GCN: GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, undef renamable $vgpr3, 0, 0, 0, 0, implicit $exec + ; GCN: SI_SPILL_V64_SAVE killed $vgpr2_vgpr3, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.1, align 4, addrspace 5) + ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec + ; GCN: bb.2: + ; GCN: S_ENDPGM 0 + bb.0: + liveins: $vgpr0_vgpr1 + %0:vreg_64 = COPY $vgpr0_vgpr1 + + bb.1: + undef %1.sub1:vreg_64 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, undef %1.sub1, 0, 0, 0, 0, implicit $exec + S_CBRANCH_EXECZ %bb.1, implicit $exec + + bb.2: + S_ENDPGM 0 + +... 
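(A reading of the five cases above, inferred from the checks rather than stated anywhere in the patch: the loop-invariant input %0 is spilled once in bb.0 and reloaded inside bb.1 on every iteration, while values created within the self loop are either rewritten to scratch VGPRs or spilled after their last use in the block.)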
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll b/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll index 9286e91e09b2c..216ab53cb24e1 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll @@ -17,14 +17,14 @@ define float @fdiv_f32(float %a, float %b) #0 { ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN: S_SETREG_B32 killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode + ; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode ; GCN: %14:vgpr_32 = nofpexcept V_FMA_F32 1, %8, 0, %10, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec ; GCN: %15:vgpr_32 = nofpexcept V_FMA_F32 0, killed %14, 0, %10, 0, %10, 0, 0, implicit $mode, implicit $exec ; GCN: %16:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %6, 0, %15, 0, 0, implicit $mode, implicit $exec ; GCN: %17:vgpr_32 = nofpexcept V_FMA_F32 1, %8, 0, %16, 0, %6, 0, 0, implicit $mode, implicit $exec ; GCN: %18:vgpr_32 = nofpexcept V_FMA_F32 0, killed %17, 0, %15, 0, %16, 0, 0, implicit $mode, implicit $exec ; GCN: %19:vgpr_32 = nofpexcept V_FMA_F32 1, %8, 0, %18, 0, %6, 0, 0, implicit $mode, implicit $exec - ; GCN: S_SETREG_B32 killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode + ; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode ; GCN: $vcc = COPY %7 ; GCN: %20:vgpr_32 = nofpexcept V_DIV_FMAS_F32 0, killed %19, 0, %15, 0, %18, 0, 0, implicit $mode, implicit $vcc, implicit $exec ; GCN: %21:vgpr_32 = nofpexcept V_DIV_FIXUP_F32 0, killed %20, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec @@ -50,14 +50,14 @@ define float @fdiv_nnan_f32(float %a, float %b) #0 { ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN: S_SETREG_B32 killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode + ; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode ; GCN: %14:vgpr_32 = nnan nofpexcept V_FMA_F32 1, %8, 0, %10, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec ; GCN: %15:vgpr_32 = nnan nofpexcept V_FMA_F32 0, killed %14, 0, %10, 0, %10, 0, 0, implicit $mode, implicit $exec ; GCN: %16:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %6, 0, %15, 0, 0, implicit $mode, implicit $exec ; GCN: %17:vgpr_32 = nnan nofpexcept V_FMA_F32 1, %8, 0, %16, 0, %6, 0, 0, implicit $mode, implicit $exec ; GCN: %18:vgpr_32 = nnan nofpexcept V_FMA_F32 0, killed %17, 0, %15, 0, %16, 0, 0, implicit $mode, implicit $exec ; GCN: %19:vgpr_32 = nnan nofpexcept V_FMA_F32 1, %8, 0, %18, 0, %6, 0, 0, implicit $mode, implicit $exec - ; GCN: S_SETREG_B32 killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode + ; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode ; GCN: $vcc = COPY %7 ; GCN: %20:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32 0, killed %19, 0, %15, 0, %18, 0, 0, implicit $mode, implicit $vcc, implicit $exec ; GCN: %21:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32 0, killed %20, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll index a621b04a346c0..afae6b43ee587 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll 
@@ -34,7 +34,7 @@ define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x ; R600: |PV.{{[XYZW]}}| ; R600: -PV -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 ; VI: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) { %bc = bitcast i32 %in to float @@ -49,7 +49,7 @@ define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) ; R600: |PV.{{[XYZW]}}| ; R600: -PV -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) { %bc = bitcast i32 %in to float %fabs = call float @fabs(float %bc) @@ -59,7 +59,7 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 % } ; FUNC-LABEL: {{^}}fneg_fabs_f32: -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabs_f32(float addrspace(1)* %out, float %in) { %fabs = call float @llvm.fabs.f32(float %in) %fsub = fsub float -0.000000e+00, %fabs diff --git a/llvm/test/CodeGen/AMDGPU/fold_16bit_imm.mir b/llvm/test/CodeGen/AMDGPU/fold_16bit_imm.mir index 458bdcef1a584..eae7e4807f765 100644 --- a/llvm/test/CodeGen/AMDGPU/fold_16bit_imm.mir +++ b/llvm/test/CodeGen/AMDGPU/fold_16bit_imm.mir @@ -16,21 +16,6 @@ body: | ... ---- -name: fold_simm_16_sub_to_sub -body: | - bb.0: - - ; GCN-LABEL: name: fold_simm_16_sub_to_sub - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 - ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 - ; GCN: SI_RETURN_TO_EPILOG [[S_MOV_B32_1]] - %0:sreg_32 = S_MOV_B32 2048 - %1.lo16:sreg_32 = COPY killed %0.lo16 - SI_RETURN_TO_EPILOG %1 - -... - --- name: fold_simm_16_sub_to_phys body: | @@ -46,36 +31,6 @@ body: | ... ---- -name: fold_aimm_16_sub_to_sub_2048 -body: | - bb.0: - - ; GCN-LABEL: name: fold_aimm_16_sub_to_sub_2048 - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 - ; GCN: %1.lo16:agpr_32 = COPY killed [[S_MOV_B32_]].lo16 - ; GCN: SI_RETURN_TO_EPILOG %1 - %0:sreg_32 = S_MOV_B32 2048 - %1.lo16:agpr_32 = COPY killed %0.lo16 - SI_RETURN_TO_EPILOG %1 - -... - ---- -name: fold_aimm_16_sub_to_sub_0 -body: | - bb.0: - - ; GCN-LABEL: name: fold_aimm_16_sub_to_sub_0 - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN: [[V_ACCVGPR_WRITE_B32_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32 0, implicit $exec - ; GCN: SI_RETURN_TO_EPILOG [[V_ACCVGPR_WRITE_B32_]] - %0:sreg_32 = S_MOV_B32 0 - %1.lo16:agpr_32 = COPY killed %0.lo16 - SI_RETURN_TO_EPILOG %1 - -... - --- name: fold_aimm_16_sub_to_phys body: | @@ -106,21 +61,6 @@ body: | ... ---- -name: fold_vimm_16_sub_to_sub -body: | - bb.0: - - ; GCN-LABEL: name: fold_vimm_16_sub_to_sub - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 - ; GCN: %1.lo16:vgpr_32 = COPY killed [[S_MOV_B32_]].lo16 - ; GCN: SI_RETURN_TO_EPILOG %1 - %0:sreg_32 = S_MOV_B32 2048 - %1.lo16:vgpr_32 = COPY killed %0.lo16 - SI_RETURN_TO_EPILOG %1 - -... - --- name: fold_vimm_16_sub_to_phys body: | @@ -135,123 +75,3 @@ body: | SI_RETURN_TO_EPILOG $vgpr0_lo16 ... - ---- -name: fold_vimm_16_lo_to_hi -body: | - bb.0: - - ; GCN-LABEL: name: fold_vimm_16_lo_to_hi - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 - ; GCN: %1.hi16:vgpr_32 = COPY killed [[S_MOV_B32_]].lo16 - ; GCN: SI_RETURN_TO_EPILOG %1 - %0:sreg_32 = S_MOV_B32 2048 - %1.hi16:vgpr_32 = COPY killed %0.lo16 - SI_RETURN_TO_EPILOG %1 - -... 
- ---- -name: fold_vimm_16_hi_to_lo -body: | - bb.0: - - ; GCN-LABEL: name: fold_vimm_16_hi_to_lo - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 - ; GCN: %1.lo16:vgpr_32 = COPY killed [[S_MOV_B32_]].hi16 - ; GCN: SI_RETURN_TO_EPILOG %1 - %0:sreg_32 = S_MOV_B32 2048 - %1.lo16:vgpr_32 = COPY killed %0.hi16 - SI_RETURN_TO_EPILOG %1 - -... - ---- -name: fold_simm_16_sub_to_sub_lo_to_hi -body: | - bb.0: - - ; GCN-LABEL: name: fold_simm_16_sub_to_sub_lo_to_hi - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 - ; GCN: %1.hi16:sreg_32 = COPY killed [[S_MOV_B32_]].lo16 - ; GCN: SI_RETURN_TO_EPILOG %1 - %0:sreg_32 = S_MOV_B32 2048 - %1.hi16:sreg_32 = COPY killed %0.lo16 - SI_RETURN_TO_EPILOG %1 - -... - ---- -name: fold_simm_16_sub_to_sub_hi_to_lo_2048 -body: | - bb.0: - - ; GCN-LABEL: name: fold_simm_16_sub_to_sub_hi_to_lo_2048 - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 - ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN: SI_RETURN_TO_EPILOG [[S_MOV_B32_1]] - %0:sreg_32 = S_MOV_B32 2048 - %1.lo16:sreg_32 = COPY killed %0.hi16 - SI_RETURN_TO_EPILOG %1 - -... - ---- -name: fold_simm_16_sub_to_sub_hi_to_lo_shifted_2048 -body: | - bb.0: - - ; GCN-LABEL: name: fold_simm_16_sub_to_sub_hi_to_lo_shifted_2048 - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 134217728 - ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 - ; GCN: SI_RETURN_TO_EPILOG [[S_MOV_B32_1]] - %0:sreg_32 = S_MOV_B32 134217728 - %1.lo16:sreg_32 = COPY killed %0.hi16 - SI_RETURN_TO_EPILOG %1 - -... - ---- -name: fold_aimm_16_sub_to_sub_hi_to_lo_2048 -body: | - bb.0: - - ; GCN-LABEL: name: fold_aimm_16_sub_to_sub_hi_to_lo_2048 - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 - ; GCN: [[V_ACCVGPR_WRITE_B32_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32 0, implicit $exec - ; GCN: SI_RETURN_TO_EPILOG [[V_ACCVGPR_WRITE_B32_]] - %0:sreg_32 = S_MOV_B32 2048 - %1.lo16:agpr_32 = COPY killed %0.hi16 - SI_RETURN_TO_EPILOG %1 - -... - ---- -name: fold_aimm_16_sub_to_sub_hi_to_lo_shifted_1 -body: | - bb.0: - - ; GCN-LABEL: name: fold_aimm_16_sub_to_sub_hi_to_lo_shifted_1 - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65536 - ; GCN: [[V_ACCVGPR_WRITE_B32_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32 1, implicit $exec - ; GCN: SI_RETURN_TO_EPILOG [[V_ACCVGPR_WRITE_B32_]] - %0:sreg_32 = S_MOV_B32 65536 - %1.lo16:agpr_32 = COPY killed %0.hi16 - SI_RETURN_TO_EPILOG %1 - -... - ---- -name: fold_aimm_16_sub_to_sub_hi_to_lo_shifted_2048 -body: | - bb.0: - - ; GCN-LABEL: name: fold_aimm_16_sub_to_sub_hi_to_lo_shifted_2048 - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 134217728 - ; GCN: %1.lo16:agpr_32 = COPY killed [[S_MOV_B32_]].hi16 - ; GCN: SI_RETURN_TO_EPILOG %1 - %0:sreg_32 = S_MOV_B32 134217728 - %1.lo16:agpr_32 = COPY killed %0.hi16 - SI_RETURN_TO_EPILOG %1 - -... 
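(On the fold_16bit_imm.mir deletions above: every removed case folded an immediate through a copy into a virtual 16-bit subregister (%1.lo16/%1.hi16), while the kept *_sub_to_phys cases still cover folds into physical 16-bit subregister operands.)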
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 720e45b3c30f5..d5ee24a8bd1a7 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -1040,9 +1040,9 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half> ; CI-NEXT: v_trunc_f32_e32 v4, v4 ; CI-NEXT: v_fma_f32 v0, -v4, v2, v0 ; CI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1 +; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_rcp_f32_e32 v5, v4 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 @@ -1265,9 +1265,9 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> ; CI-NEXT: v_trunc_f32_e32 v8, v8 ; CI-NEXT: v_fma_f32 v1, -v8, v1, v5 ; CI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4 +; CI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_rcp_f32_e32 v9, v8 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 @@ -1300,8 +1300,8 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> ; CI-NEXT: v_trunc_f32_e32 v4, v4 ; CI-NEXT: v_fma_f32 v0, -v4, v0, v3 ; CI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_rcp_f32_e32 v5, v4 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index 157330b8bd47d..96b609436da78 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -981,127 +981,61 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; SI-LABEL: v_fshr_v2i24: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; SI-NEXT: s_mov_b32 s4, 0xaaaaaaab -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 4, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 5, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 2, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_hi_u32 v11, v2, s4 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_hi_u32 v12, v3, s4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v11, 4, v11 -; SI-NEXT: v_lshrrev_b32_e32 v12, 4, v12 -; SI-NEXT: v_mul_lo_u32 v11, v11, 24 -; SI-NEXT: v_mul_lo_u32 v12, v12, 24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v11 -; SI-NEXT: v_sub_i32_e32 v3, vcc, v3, v12 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v2 +; SI-NEXT: v_mul_hi_u32 v6, v4, s4 +; SI-NEXT: v_mul_hi_u32 v7, v5, s4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 4, v6 +; SI-NEXT: v_mul_lo_u32 v6, v6, 24 +; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 4, v7 +; SI-NEXT: 
v_mul_lo_u32 v6, v6, 24 +; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 +; SI-NEXT: v_sub_i32_e32 v3, vcc, v5, v6 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v3 -; SI-NEXT: v_alignbit_b32 v1, v1, v6, v2 -; SI-NEXT: v_alignbit_b32 v2, v5, v4, v3 -; SI-NEXT: buffer_store_byte v2, v7, s[0:3], 0 offen -; SI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen -; SI-NEXT: v_lshrrev_b32_e32 v0, 8, v2 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_byte v0, v8, s[0:3], 0 offen -; SI-NEXT: buffer_store_byte v2, v9, s[0:3], 0 offen -; SI-NEXT: buffer_store_byte v1, v10, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v1, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fshr_v2i24: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v0 -; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 5, v0 -; VI-NEXT: v_add_u32_e32 v10, vcc, 2, v0 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_mul_hi_u32 v11, v2, s4 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_mul_hi_u32 v12, v3, s4 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v11, 4, v11 -; VI-NEXT: v_lshrrev_b32_e32 v12, 4, v12 -; VI-NEXT: v_mul_lo_u32 v11, v11, 24 -; VI-NEXT: v_mul_lo_u32 v12, v12, 24 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v11 -; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v12 -; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v2 +; VI-NEXT: v_mul_hi_u32 v6, v4, s4 +; VI-NEXT: v_mul_hi_u32 v7, v5, s4 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v6, 4, v6 +; VI-NEXT: v_mul_lo_u32 v6, v6, 24 +; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v6 +; VI-NEXT: v_lshrrev_b32_e32 v6, 4, v7 +; VI-NEXT: v_mul_lo_u32 v6, v6, 24 +; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v4 +; VI-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v5, v6 ; VI-NEXT: v_add_u32_e32 v3, vcc, 8, v3 -; VI-NEXT: v_alignbit_b32 v1, v1, v6, v2 -; VI-NEXT: v_alignbit_b32 v2, v5, v4, v3 -; VI-NEXT: buffer_store_byte v2, v7, s[0:3], 0 offen -; VI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: buffer_store_byte v0, v8, s[0:3], 0 offen -; VI-NEXT: buffer_store_byte v2, v9, s[0:3], 0 offen -; VI-NEXT: buffer_store_byte v1, v10, s[0:3], 0 offen -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_alignbit_b32 v1, v1, v2, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_v2i24: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 -; 
GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaaab -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_mul_hi_u32 v6, v1, s4 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_mul_hi_u32 v7, v2, s4 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_mul_hi_u32 v6, v4, s4 +; GFX9-NEXT: v_mul_hi_u32 v7, v5, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 4, v7 -; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v6 -; GFX9-NEXT: v_add_u32_e32 v2, 8, v2 -; GFX9-NEXT: v_add_u32_e32 v1, 8, v1 -; GFX9-NEXT: v_alignbit_b32 v2, v4, v3, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_alignbit_b32 v1, v8, v5, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX9-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:3 -; GFX9-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2 -; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5 -; GFX9-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v7 +; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX9-NEXT: v_add_u32_e32 v4, 8, v4 +; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, v5, v6 +; GFX9-NEXT: v_add_u32_e32 v3, 8, v3 +; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_fshr_v2i24: diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index ded8d7ad55113..1f2657fe94d29 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -344,6 +344,16 @@ define void @void_func_v16i16(<16 x i16> %arg0) #0 { ret void } +; GCN-LABEL: {{^}}void_func_v2i24: +; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}v0, v1 +define void @void_func_v2i24(<2 x i24> %arg0) #0 { + %elt0 = extractelement <2 x i24> %arg0, i32 0 + %elt1 = extractelement <2 x i24> %arg0, i32 1 + %add = add i24 %elt0, %elt1 + store i24 %add, i24 addrspace(1)* undef + ret void +} + ; GCN-LABEL: {{^}}void_func_v2f32: ; GCN-NOT: v[0:1] ; GCN-NOT: v0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll index 315180dff5fac..af54135d1ceba 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -1,12 +1,12 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s -; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900,CAS %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,CAS %s ; GCN-LABEL: {{^}}global_atomic_fadd_ret_f32: -; GCN: [[LOOP:BB[0-9]+_[0-9]+]] -; GCN: v_add_f32_e32 -; GCN: global_atomic_cmpswap -; GCN: s_andn2_b64 exec, exec, -; GCN-NEXT: s_cbranch_execnz [[LOOP]] +; CAS: 
[[LOOP:BB[0-9]+_[0-9]+]] +; CAS: v_add_f32_e32 +; CAS: global_atomic_cmpswap +; CAS: s_andn2_b64 exec, exec, +; CAS-NEXT: s_cbranch_execnz [[LOOP]] define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) { %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst store float %result, float addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll index fb5a454421550..e8f4504bbccaa 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll @@ -15,7 +15,7 @@ define amdgpu_ps void @global_fadd_saddr_f32_nortn(i8 addrspace(1)* inreg %sbase %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)* - call void @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep0, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep0, float %data) ret void } @@ -28,7 +28,7 @@ define amdgpu_ps void @global_fadd_saddr_f32_nortn_neg128(i8 addrspace(1)* inreg %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)* - call void @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep1, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep1, float %data) ret void } @@ -40,7 +40,7 @@ define amdgpu_ps void @global_fadd_saddr_v2f16_nortn(i8 addrspace(1)* inreg %sba %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to <2 x half> addrspace(1)* - call void @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep0, <2 x half> %data) + %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep0, <2 x half> %data) ret void } @@ -53,11 +53,11 @@ define amdgpu_ps void @global_fadd_saddr_v2f16_nortn_neg128(i8 addrspace(1)* inr %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to <2 x half> addrspace(1)* - call void @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep1, <2 x half> %data) + %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep1, <2 x half> %data) ret void } -declare void @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* nocapture, float) #0 -declare void @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0 +declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* nocapture, float) #0 +declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0 attributes #0 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 1908015f47707..d54058eec30c9 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -312,7 +312,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1 ; SI: 
v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cvt_f32_f16_e32 ; GCN: flat_store_dwordx4 @@ -326,6 +325,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 ; VI: v_cvt_f32_f16_e32 ; VI: v_cvt_f32_f16_sdwa diff --git a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-meta-insts.mir b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-meta-insts.mir new file mode 100644 index 0000000000000..e59db4fead3d7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-meta-insts.mir @@ -0,0 +1,41 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx906 -run-pass=post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GFX9 %s + +# Make sure the kill is skipped for hazard purposes, so the nop is +# correctly inserted. + +--- + +name: global_store_dwordx4_data_hazard_kill + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX9-LABEL: name: global_store_dwordx4_data_hazard_kill + ; GFX9: GLOBAL_STORE_DWORDX4 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr2 = KILL + ; GFX9: S_NOP 0 + ; GFX9: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + GLOBAL_STORE_DWORDX4 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec + $vgpr2 = KILL + $vgpr2 = V_MOV_B32_e32 0, implicit $exec + +... + +--- + +name: global_store_dwordx3_data_hazard_kill + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 + ; GFX9-LABEL: name: global_store_dwordx3_data_hazard_kill + ; GFX9: GLOBAL_STORE_DWORDX3 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr2 = KILL + ; GFX9: S_NOP 0 + ; GFX9: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + GLOBAL_STORE_DWORDX3 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + $vgpr2 = KILL + $vgpr2 = V_MOV_B32_e32 0, implicit $exec + +... 
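A complementary sketch for the two hazard cases above (my construction, not part of the patch; the no-nop expectation is an assumption based on the single wait state the tests imply): a real, non-meta instruction between the store and the VGPR write should fill the wait state itself, leaving the recognizer nothing to pad.

---

name: global_store_dwordx4_data_hazard_filled

body: |
  bb.0:
    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
    ; $vgpr6 is neither an address nor a data register of the store, so
    ; this V_MOV is hazard-free and occupies the one required wait state.
    GLOBAL_STORE_DWORDX4 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec
    $vgpr6 = V_MOV_B32_e32 1, implicit $exec
    ; Unlike the KILL cases above, no S_NOP should be needed before this
    ; write to $vgpr2.
    $vgpr2 = V_MOV_B32_e32 0, implicit $exec

...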
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-globals.ll b/llvm/test/CodeGen/AMDGPU/hsa-globals.ll index 09c4b5f68a0b5..bbb96072dfaf5 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-globals.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-globals.ll @@ -13,6 +13,8 @@ define amdgpu_kernel void @test() { ret void } +@weak_global = extern_weak addrspace(1) global i32 + ; ASM: .type linkonce_odr_global_program,@object ; ASM: .section .bss,#alloc,#write ; ASM: .weak linkonce_odr_global_program @@ -48,3 +50,5 @@ define amdgpu_kernel void @test() { ; ASM: external_readonly: ; ASM: .long 0 ; ASM: .size external_readonly, 4 + +; ASM: .weak weak_global diff --git a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll index 9e7cca3ded721..f52aa1e4dee1e 100644 --- a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll +++ b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll @@ -321,14 +321,77 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) { ret void } -; define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) { -; %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) -; %v.data = extractvalue { <3 x half>, i32 } %v, 0 -; %v.err = extractvalue { <3 x half>, i32 } %v, 1 -; store volatile <3 x half> %v.data, <3 x half> addrspace(1)* undef -; store volatile i32 %v.err, i32 addrspace(1)* undef -; ret void -; } +define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) { +; GFX9-LABEL: load_1d_v3f16_tfe_dmask7: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s7, s5 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: s_mov_b32 s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x7 unorm tfe d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: global_store_dword v[0:1], v3, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: load_1d_v3f16_tfe_dmask7: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s11, s9 +; GFX10-NEXT: s_mov_b32 s10, s8 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s7, s5 +; GFX10-NEXT: s_mov_b32 s6, s4 +; GFX10-NEXT: s_mov_b32 s5, s3 +; GFX10-NEXT: s_mov_b32 s4, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: global_store_dword v[0:1], v3, off +; GFX10-NEXT: s_endpgm +; +; GFX8-UNPACKED-LABEL: load_1d_v3f16_tfe_dmask7: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9 +; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 
v4, v1 +; GFX8-UNPACKED-NEXT: image_load v[1:4], v0, s[4:11] dmask:0x7 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v3 +; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0 +; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v4 +; GFX8-UNPACKED-NEXT: s_endpgm + %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.data = extractvalue { <3 x half>, i32 } %v, 0 + %v.err = extractvalue { <3 x half>, i32 } %v, 1 + store volatile <3 x half> %v.data, <3 x half> addrspace(1)* undef + store volatile i32 %v.err, i32 addrspace(1)* undef + ret void +} define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s) { ; GFX9-LABEL: load_1d_v4f16_tfe_dmask15: diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll index 3d3b511ab34b7..8999cd91169ac 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll @@ -69,15 +69,15 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) { ; GCN: renamable $vgpr30 = COPY killed renamable $vgpr14 ; GCN: renamable $vgpr31 = COPY killed renamable $vgpr15 ; GCN: renamable $vgpr32 = COPY killed renamable $vgpr16 - ; GCN: renamable $sgpr0_sgpr1 = S_MOV_B64 $exec + ; GCN: renamable $sgpr20_sgpr21 = S_MOV_B64 $exec ; GCN: renamable $vgpr1 = IMPLICIT_DEF - ; GCN: renamable $sgpr2_sgpr3 = IMPLICIT_DEF + ; GCN: renamable $sgpr22_sgpr23 = IMPLICIT_DEF ; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) ; GCN: SI_SPILL_S128_SAVE killed $sgpr4_sgpr5_sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 16 into %stack.1, align 4, addrspace 5) ; GCN: SI_SPILL_V512_SAVE killed $vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32, %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 64 into %stack.2, align 4, addrspace 5) - ; GCN: SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.3, align 4, addrspace 5) + ; GCN: SI_SPILL_S64_SAVE killed $sgpr20_sgpr21, %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.3, align 4, addrspace 5) ; GCN: SI_SPILL_V32_SAVE killed $vgpr1, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5) - ; GCN: SI_SPILL_S64_SAVE killed $sgpr2_sgpr3, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.5, align 4, addrspace 5) + ; GCN: SI_SPILL_S64_SAVE killed $sgpr22_sgpr23, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.5, align 4, addrspace 5) ; GCN: bb.1: ; GCN: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (load 8 from %stack.5, align 4, addrspace 5) @@ -91,8 +91,8 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) { ; GCN: 
renamable $vgpr18 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, implicit $m0 ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ; GCN: renamable $vgpr19 = COPY renamable $vgpr18 - ; GCN: renamable $sgpr2_sgpr3 = COPY renamable $sgpr4_sgpr5 - ; GCN: SI_SPILL_S64_SAVE killed $sgpr2_sgpr3, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.5, align 4, addrspace 5) + ; GCN: renamable $sgpr6_sgpr7 = COPY renamable $sgpr4_sgpr5 + ; GCN: SI_SPILL_S64_SAVE killed $sgpr6_sgpr7, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.5, align 4, addrspace 5) ; GCN: SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.6, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.6, align 4, addrspace 5) ; GCN: SI_SPILL_V32_SAVE killed $vgpr19, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5) ; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.7, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 9b525585d876d..5d8ed0f540427 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -773,12 +773,13 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* % ; VI-NEXT: v_mov_b32_e32 v1, s11 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <8 x i32> %a, i32 5, i32 %b store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32 @@ -910,9 +911,9 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* % ; SI-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] ; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v3i16: diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll index 975e2306cc325..1e5dcffdedd77 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s -; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VECT %s +; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s ; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s ; RUN: llc -march=amdgcn 
-mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s @@ -21,8 +21,12 @@ bb: } ; GCN-LABEL: test_local_misaligned_v4: -; GCN-DAG: ds_read_b128 -; GCN-DAG: ds_write_b128 +; VECT-DAG: ds_read_b128 +; VECT-DAG: ds_write_b128 +; SPLIT-DAG: ds_read2_b32 +; SPLIT-DAG: ds_read2_b32 +; SPLIT-DAG: ds_write2_b32 +; SPLIT-DAG: ds_write2_b32 define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -42,8 +46,12 @@ bb: } ; GCN-LABEL: test_local_misaligned_v3: -; GCN-DAG: ds_read_b96 -; GCN-DAG: ds_write_b96 +; VECT-DAG: ds_read_b96 +; VECT-DAG: ds_write_b96 +; SPLIT-DAG: ds_read2_b32 +; SPLIT-DAG: ds_read_b32 +; SPLIT-DAG: ds_write2_b32 +; SPLIT-DAG: ds_write_b32 define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll index b46e01373aad0..aee44794ac89b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll @@ -1,15 +1,15 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs | FileCheck %s -check-prefix=GCN -declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) -declare void @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1) -declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)*, float) -declare void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)*, <2 x half>) +declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) +declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1) +declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)*, float) +declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)*, <2 x half>) ; GCN-LABEL: {{^}}buffer_atomic_add_f32: ; GCN: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen define amdgpu_ps void @buffer_atomic_add_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { main_body: - call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) ret void } @@ -17,7 +17,7 @@ main_body: ; GCN: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen offset:4 slc define amdgpu_ps void @buffer_atomic_add_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { main_body: - call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) + %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) ret void } @@ -25,7 +25,7 @@ main_body: ; GCN: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen define amdgpu_ps void @buffer_atomic_pk_add_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) { main_body: - call void @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) ret void } @@ -33,7 +33,7 @@ main_body: ; GCN: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen offset:4 slc define amdgpu_ps void @buffer_atomic_pk_add_v2f16_off4_slc(<4 x i32> inreg %rsrc, <2 x half> %data, i32 
%vindex) { main_body: - call void @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) + %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) ret void } @@ -41,7 +41,7 @@ main_body: ; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off define amdgpu_kernel void @global_atomic_add_f32(float addrspace(1)* %ptr, float %data) { main_body: - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %ptr, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) ret void } @@ -50,7 +50,7 @@ main_body: define amdgpu_kernel void @global_atomic_add_f32_off4(float addrspace(1)* %ptr, float %data) { main_body: %p = getelementptr float, float addrspace(1)* %ptr, i64 1 - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %p, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %p, float %data) ret void } @@ -59,7 +59,7 @@ main_body: define amdgpu_kernel void @global_atomic_add_f32_offneg4(float addrspace(1)* %ptr, float %data) { main_body: %p = getelementptr float, float addrspace(1)* %ptr, i64 -1 - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %p, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %p, float %data) ret void } @@ -67,7 +67,7 @@ main_body: ; GCN: global_atomic_pk_add_f16 v[{{[0-9:]+}}], v{{[0-9]+}}, off define amdgpu_kernel void @global_atomic_pk_add_v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) { main_body: - call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) + %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) ret void } @@ -76,7 +76,7 @@ main_body: define amdgpu_kernel void @global_atomic_pk_add_v2f16_off4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) { main_body: %p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 1 - call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data) + %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data) ret void } @@ -85,7 +85,7 @@ main_body: define amdgpu_kernel void @global_atomic_pk_add_v2f16_offneg4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) { main_body: %p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -1 - call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data) + %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data) ret void } @@ -94,7 +94,7 @@ main_body: ; GCN-LABEL: {{^}}global_atomic_fadd_f32_wrong_subtarget: ; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off define amdgpu_kernel void @global_atomic_fadd_f32_wrong_subtarget(float addrspace(1)* %ptr, float %data) #0 { - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %ptr, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll index 274a5b2f0a78b..b1c2a030ea9f5 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s @@ -23,6 +23,19 @@ main_body: ret half %elt } +; GCN-LABEL: {{^}}buffer_load_format_d16_xyz: +; UNPACKED: buffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: buffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] +define amdgpu_ps half @buffer_load_format_d16_xyz(<4 x i32> inreg %rsrc) { +main_body: + %data = call <3 x half> @llvm.amdgcn.buffer.load.format.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) + %elt = extractelement <3 x half> %data, i32 2 + ret half %elt +} + ; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw: ; UNPACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] @@ -38,4 +51,5 @@ main_body: declare half @llvm.amdgcn.buffer.load.format.f16(<4 x i32>, i32, i32, i1, i1) declare <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32>, i32, i32, i1, i1) +declare <3 x half> @llvm.amdgcn.buffer.load.format.v3f16(<4 x i32>, i32, i32, i1, i1) declare <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32>, i32, i32, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll index 5ece33f0195cd..aadd9a448a1b3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll @@ -28,6 +28,12 @@ main_body: ret void } +define amdgpu_kernel void @buffer_store_format_d16_xyz(<4 x i32> %rsrc, <3 x half> %data, i32 %index) { +main_body: + call void @llvm.amdgcn.buffer.store.format.v3f16(<3 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + ret void +} + ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 @@ -54,4 +60,5 @@ main_body: declare void @llvm.amdgcn.buffer.store.format.f16(half, <4 x i32>, i32, i32, i1, i1) declare void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i1, i1) +declare void @llvm.amdgcn.buffer.store.format.v3f16(<3 x half>, <4 x i32>, i32, i32, i1, i1) declare void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll index 9e6be563c383e..da1174d7eb860 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll @@ -23,6 +23,18 @@ main_body: ret float %r } +; GCN-LABEL: {{^}}image_load_v3f16: +; UNPACKED: image_load v[0:2], v[0:1], s[0:7] dmask:0x7 unorm 
d16{{$}} +; PACKED: image_load v[0:1], v[0:1], s[0:7] dmask:0x7 unorm d16{{$}} +; GFX10: image_load v[0:1], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm d16{{$}} +define amdgpu_ps <2 x float> @image_load_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { +main_body: + %tex = call <3 x half> @llvm.amdgcn.image.load.2d.v3f16.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + %ext = shufflevector <3 x half> %tex, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef> + %r = bitcast <4 x half> %ext to <2 x float> + ret <2 x float> %r +} + ; GCN-LABEL: {{^}}image_load_v4f16: ; UNPACKED: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} ; PACKED: image_load v[0:1], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} @@ -56,6 +68,14 @@ main_body: ret float %x } +define amdgpu_ps <2 x float> @image_load_3d_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) { +main_body: + %tex = call <3 x half> @llvm.amdgcn.image.load.3d.v3f16.i32(i32 7, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) + %ext = shufflevector <3 x half> %tex, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef> + %res = bitcast <4 x half> %ext to <2 x float> + ret <2 x float> %res +} + ; GCN-LABEL: {{^}}image_store_f16 ; GFX89: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16{{$}} ; GFX10: image_store v2, v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm d16{{$}} @@ -78,6 +98,14 @@ main_body: ret void } +define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x float> %in) { +main_body: + %r = bitcast <2 x float> %in to <4 x half> + %data = shufflevector <4 x half> %r, <4 x half> undef, <3 x i32> <i32 0, i32 1, i32 2> + call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %data, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + ; GCN-LABEL: {{^}}image_store_v4f16 ; UNPACKED: v_lshrrev_b32_e32 ; UNPACKED: v_and_b32_e32 @@ -110,15 +138,19 @@ main_body: declare half @llvm.amdgcn.image.load.2d.f16.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 declare <2 x half> @llvm.amdgcn.image.load.2d.v2f16.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <3 x half> @llvm.amdgcn.image.load.2d.v3f16.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 declare <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 declare <4 x half> @llvm.amdgcn.image.load.mip.2d.v4f16.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 declare <2 x half> @llvm.amdgcn.image.load.3d.v2f16.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <3 x half> @llvm.amdgcn.image.load.3d.v3f16.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 declare void @llvm.amdgcn.image.store.2d.f16.i32(half, i32, i32, i32, <8 x i32>, i32, i32) #0 declare void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half>, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half>, i32, i32, i32, <8 x i32>, i32, i32) #0 declare void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half>, i32, i32, i32, <8 x i32>, i32, i32) #0 declare void @llvm.amdgcn.image.store.mip.1d.v4f16.i32(<4 x half>, i32, i32, i32, <8 x i32>, i32, i32) #0 declare void @llvm.amdgcn.image.store.3d.v2f16.i32(<2 x half>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.3d.v3f16.i32(<3 x half>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll index 8a358ee59c963..6843134f83932 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -206,6 +206,131 @@ main_body: ret <2 x float> %r } +define amdgpu_ps <2 x float> @image_sample_b_2d_v3f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) { +; TONGA-LABEL: image_sample_b_2d_v3f16: +; TONGA: ; %bb.0: ; %main_body +; TONGA-NEXT: s_mov_b64 s[12:13], exec +; TONGA-NEXT: s_wqm_b64 exec, exec +; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] +; TONGA-NEXT: image_sample_b v[0:2], v[0:2], s[0:7], s[8:11] dmask:0x7 d16 +; TONGA-NEXT: s_waitcnt vmcnt(0) +; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; TONGA-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; TONGA-NEXT: v_mov_b32_e32 v1, v2 +; TONGA-NEXT: ; return to shader part epilog +; +; GFX81-LABEL: image_sample_b_2d_v3f16: +; GFX81: ; %bb.0: ; %main_body +; GFX81-NEXT: s_mov_b64 s[12:13], exec +; GFX81-NEXT: s_wqm_b64 exec, exec +; GFX81-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX81-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 d16 +; GFX81-NEXT: s_waitcnt vmcnt(0) +; GFX81-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: image_sample_b_2d_v3f16: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: image_sample_b_2d_v3f16: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s12, exec_lo +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %tex = call <3 x half> @llvm.amdgcn.image.sample.b.2d.v3f16.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) + %tex_wide = shufflevector <3 x half> %tex, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef> + %r = bitcast <4 x half> %tex_wide to <2 x float> + ret <2 x float> %r +} + +define amdgpu_ps <4 x float> @image_sample_b_2d_v3f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) { +; TONGA-LABEL: image_sample_b_2d_v3f16_tfe: +; TONGA: ; %bb.0: ; %main_body +; TONGA-NEXT: s_mov_b64 s[12:13], exec +; TONGA-NEXT: s_wqm_b64 exec, exec +; TONGA-NEXT: v_mov_b32_e32 v3, 0 +; TONGA-NEXT: v_mov_b32_e32 v4, v3 +; TONGA-NEXT: v_mov_b32_e32 v5, v3 +; TONGA-NEXT: v_mov_b32_e32 v6, v3 +; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] +; TONGA-NEXT: image_sample_b v[3:6], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16 +; TONGA-NEXT: s_waitcnt vmcnt(0) +; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; TONGA-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; TONGA-NEXT: v_mov_b32_e32 v1, v5 +; TONGA-NEXT: v_mov_b32_e32 v2, v6 +; TONGA-NEXT: ; return to shader part epilog +; +; GFX81-LABEL: image_sample_b_2d_v3f16_tfe: +; GFX81: ; %bb.0: ; %main_body +; GFX81-NEXT: s_mov_b64 s[12:13], exec +; GFX81-NEXT: s_wqm_b64 exec, exec +; GFX81-NEXT: v_mov_b32_e32 v3, 0 +; GFX81-NEXT: v_mov_b32_e32 v4, v3 +; GFX81-NEXT: v_mov_b32_e32 v5, v3 +; GFX81-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX81-NEXT: image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16 +; GFX81-NEXT: s_waitcnt vmcnt(0) +; GFX81-NEXT: v_mov_b32_e32
v0, v3 +; GFX81-NEXT: v_mov_b32_e32 v1, v4 +; GFX81-NEXT: v_mov_b32_e32 v2, v5 +; GFX81-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: image_sample_b_2d_v3f16_tfe: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: image_sample_b_2d_v3f16_tfe: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s12, exec_lo +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %tex = call {<3 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v3f16i32.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) + %tex.vec = extractvalue {<3 x half>, i32} %tex, 0 + %tex.vec_wide = shufflevector <3 x half> %tex.vec, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef> + %tex.err = extractvalue {<3 x half>, i32} %tex, 1 + %tex.vecf = bitcast <4 x half> %tex.vec_wide to <2 x float> + %tex.vecf.0 = extractelement <2 x float> %tex.vecf, i32 0 + %tex.vecf.1 = extractelement <2 x float> %tex.vecf, i32 1 + %r.0 = insertelement <4 x float> undef, float %tex.vecf.0, i32 0 + %r.1 = insertelement <4 x float> %r.0, float %tex.vecf.1, i32 1 + %tex.errf = bitcast i32 %tex.err to float + %r = insertelement <4 x float> %r.1, float %tex.errf, i32 2 + ret <4 x float> %r +} + define amdgpu_ps <2 x float> @image_sample_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) { ; TONGA-LABEL: image_sample_b_2d_v4f16: ; TONGA: ; %bb.0: ; %main_body @@ -334,10 +459,13 @@ main_body: declare half @llvm.amdgcn.image.sample.2d.f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <3 x half> @llvm.amdgcn.image.sample.2d.v3f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x half> @llvm.amdgcn.image.sample.2d.v4f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare {<2 x half>,i32} @llvm.amdgcn.image.sample.2d.v2f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <3 x half> @llvm.amdgcn.image.sample.b.2d.v3f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare {<3 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v3f16i32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x half>
@llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll new file mode 100644 index 0000000000000..d726b9c306be2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -0,0 +1,162 @@ +; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr) +; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr) +; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr) +; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr) + +declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>) +declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>) +declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>) +declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>) + +; GCN-LABEL: {{^}}image_bvh_intersect_ray: +; GCN: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]{{$}} +; Arguments are flattened to represent the actual VGPR_A layout, so we have no +; extra moves in the generated kernel. 
+define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) { +main_body: + %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0 + %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1 + %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2 + %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0 + %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1 + %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2 + %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0 + %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1 + %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2 + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + %r = bitcast <4 x i32> %v to <4 x float> + ret <4 x float> %r +} + +; GCN-LABEL: {{^}}image_bvh_intersect_ray_a16: +; GCN: image_bvh_intersect_ray v[0:3], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}} +define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, float inreg %ray_extent, <4 x float> inreg %ray_origin, <4 x half> inreg %ray_dir, <4 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) { +main_body: + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + %r = bitcast <4 x i32> %v to <4 x float> + ret <4 x float> %r +} + +; GCN-LABEL: {{^}}image_bvh64_intersect_ray: +; GCN: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]{{$}} +; Arguments are flattened to represent the actual VGPR_A layout, so we have no +; extra moves in the generated kernel. 
+define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) { +main_body: + %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64 + %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0 + %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1 + %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2 + %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0 + %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1 + %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2 + %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0 + %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1 + %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2 + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + %r = bitcast <4 x i32> %v to <4 x float> + ret <4 x float> %r +} + +; GCN-LABEL: {{^}}image_bvh64_intersect_ray_a16: +; GCN: image_bvh64_intersect_ray v[0:3], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}} +define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, float inreg %ray_extent, <4 x float> inreg %ray_origin, <4 x half> inreg %ray_dir, <4 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) { +main_body: + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + %r = bitcast <4 x i32> %v to <4 x float> + ret <4 x float> %r +} + +; TODO: NSA reassign is very limited and cannot work with VGPR tuples and subregs. 
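(Aside on the TODO above, before the *_nsa_reassign tests: rewriting an NSA-encoded image instruction into a sequential VGPR range can only rename registers that are free to move independently, and a member of a VGPR tuple, or a subregister of one, is not. A rough self-contained C++ sketch of that bail-out condition follows; the names tryMakeSequential and TuplePinned are invented for this illustration, not the in-tree AMDGPU NSA reassignment pass.)
// Toy sketch: attempt to renumber the scattered address registers of an
// image instruction into one contiguous run starting at Base. Any operand
// pinned by membership in a wider tuple defeats the rewrite, which is the
// limitation the TODO describes. Illustrative only.
#include <cstddef>
#include <iostream>
#include <set>
#include <vector>

// Returns the remapped contiguous registers, or an empty vector if some
// operand cannot be moved independently.
static std::vector<unsigned>
tryMakeSequential(const std::vector<unsigned> &Ops,
                  const std::set<unsigned> &TuplePinned, unsigned Base) {
  for (unsigned R : Ops)
    if (TuplePinned.count(R))
      return {}; // bail out: a tuple member cannot be renamed on its own
  std::vector<unsigned> Seq(Ops.size());
  for (std::size_t I = 0; I < Ops.size(); ++I)
    Seq[I] = Base + static_cast<unsigned>(I);
  return Seq;
}

int main() {
  // Independent scalar operands (like the flattened ray components above)
  // can be packed into a sequential range...
  bool Packed = !tryMakeSequential({7, 3, 12, 5}, {}, 0).empty();
  std::cout << "free operands: " << (Packed ? "made sequential" : "kept NSA")
            << '\n';
  // ...but operands living inside a tuple such as v[3:5] are pinned.
  bool Pinned = tryMakeSequential({7, 3, 12, 5}, {3, 4, 5}, 0).empty();
  std::cout << "tuple operands: " << (Pinned ? "kept NSA" : "made sequential")
            << '\n';
  return 0;
}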
+ +; GCN-LABEL: {{^}}image_bvh_intersect_ray_nsa_reassign: +; GCN: image_bvh_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} +define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) { +main_body: + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid + %node_ptr = load i32, i32* %gep_node_ptr, align 4 + %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid + %ray_extent = load float, float* %gep_ray, align 4 + %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0 + %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1 + %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2 + %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0 + %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1 + %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2 + %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0 + %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1 + %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2 + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + store <4 x i32> %v, <4 x i32>* undef + ret void +} + +; GCN-LABEL: {{^}}image_bvh_intersect_ray_a16_nsa_reassign: +; GCN: image_bvh_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}} +define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) { +main_body: + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid + %node_ptr = load i32, i32* %gep_node_ptr, align 4 + %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid + %ray_extent = load float, float* %gep_ray, align 4 + %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0 + %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1 + %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2 + %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0 + %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1 + %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2 + %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0 + %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1 + %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2 + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + store <4 x i32> %v, <4 x i32>* undef + ret void +} + +; GCN-LABEL: {{^}}image_bvh64_intersect_ray_nsa_reassign: +; GCN: image_bvh64_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} +define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) { +main_body: + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid + %ray_extent = load float, float* %gep_ray, align 4 + %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0 + %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1 + %ray_origin = insertelement <4 x float> %ray_origin1, float 
2.0, i32 2 + %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0 + %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1 + %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2 + %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0 + %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1 + %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2 + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + store <4 x i32> %v, <4 x i32>* undef + ret void +} + +; GCN-LABEL: {{^}}image_bvh64_intersect_ray_a16_nsa_reassign: +; GCN: image_bvh64_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}} +define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) { +main_body: + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid + %ray_extent = load float, float* %gep_ray, align 4 + %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0 + %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1 + %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2 + %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0 + %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1 + %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2 + %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0 + %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1 + %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2 + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + store <4 x i32> %v, <4 x i32>* undef + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll index a48528caba1ba..90f805f2fc85f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll @@ -10,7 +10,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 offen ; CHECK-NEXT: s_endpgm - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 24) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 24) ret void } @@ -23,7 +23,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_v ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s6 ; CHECK-NEXT: s_endpgm - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) ret void } @@ -36,7 +36,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__v ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s6 offen ; CHECK-NEXT: s_endpgm - call void 
@llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -49,7 +49,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0 ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s6 offset:92 ; CHECK-NEXT: s_endpgm - call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0) ret void } @@ -62,11 +62,11 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 offen slc ; CHECK-NEXT: s_endpgm - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void } -declare void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0 -declare void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0 +declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll index fb28bc0748b08..2ebf3f6633a97 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll @@ -23,6 +23,18 @@ main_body: ret half %elt } +; GCN-LABEL: {{^}}buffer_load_format_d16_xyz: +; UNPACKED: buffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: buffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +define amdgpu_ps half @buffer_load_format_d16_xyz(<4 x i32> inreg %rsrc) { +main_body: + %data = call <3 x half> @llvm.amdgcn.raw.buffer.load.format.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %elt = extractelement <3 x half> %data, i32 2 + ret half %elt +} + ; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw: ; UNPACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] @@ -38,4 +50,5 @@ main_body: declare half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32>, i32, i32, i32) declare <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(<4 x i32>, i32, i32, i32) +declare <3 x half> @llvm.amdgcn.raw.buffer.load.format.v3f16(<4 x i32>, i32, i32, i32) declare <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(<4 x i32>, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll index 139496282addf..68e77aff667c9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll @@ -28,6 +28,31 @@ main_body: ret 
void } +; GCN-LABEL: {{^}}buffer_store_format_d16_xyz: +; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 + +; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] + +; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] + +; UNPACKED: buffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen + +; PACKED: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} +; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] +; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED0]] + +; PACKED: buffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +define amdgpu_kernel void @buffer_store_format_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %voffset) { +main_body: + %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> <i32 0, i32 1, i32 2> + call void @llvm.amdgcn.raw.buffer.store.format.v3f16(<3 x half> %data_subvec, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + ret void +} + ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 @@ -54,4 +79,5 @@ main_body: declare void @llvm.amdgcn.raw.buffer.store.format.f16(half, <4 x i32>, i32, i32, i32) declare void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.format.v3f16(<3 x half>, <4 x i32>, i32, i32, i32) declare void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll index db7949f540964..0ebc4e67b4fbe 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll @@ -26,6 +26,21 @@ main_body: ret half %elt } +; GCN-LABEL: {{^}}tbuffer_load_d16_xyz: +; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; GFX10-UNPACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PREGFX10-PACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; GFX10-PACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] +; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] +define amdgpu_ps half @tbuffer_load_d16_xyz(<4 x i32> inreg %rsrc) { +main_body: + %data = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0) + %elt = extractelement <3 x half> %data, i32 2 + ret half %elt +} + ; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw: ; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; GFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0
format:[BUF_FMT_32_FLOAT] @@ -43,5 +58,5 @@ main_body: declare half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32) declare <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) +declare <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32) declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) - diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll index 5041cf3197342..281c48513b6ae 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll @@ -32,6 +32,31 @@ main_body: ret void } +; GCN-LABEL: {{^}}tbuffer_store_d16_xyz: +; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, + +; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] + +; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] +; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] + + +; PACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} +; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] +; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED0]] +; PREGFX10-PACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] +; GFX10-PACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] +define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data) { +main_body: + %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> <i32 0, i32 1, i32 2> + call void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half> %data_subvec, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) + ret void +} + ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, @@ -58,4 +83,5 @@ main_body: declare void @llvm.amdgcn.raw.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32) declare void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32) declare void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll index 88bfa8a0b687d..758069023579a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=verde
-verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX6 %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX789 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX789 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s ; FIXME: This copy of the test is a subset of the -global-isel version, since the VGPR case doesn't work. @@ -13,20 +13,27 @@ ; Set FP32 fp_round to round to zero define amdgpu_kernel void @test_setreg_f32_round_mode_rtz() { -; GFX6789-LABEL: test_setreg_f32_round_mode_rtz: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_f32_round_mode_rtz: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 ; encoding: [0x01,0x08,0x80,0xba,0x03,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_f32_round_mode_rtz: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 ; encoding: [0x01,0x08,0x00,0xba,0x03,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_f32_round_mode_rtz: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 ; encoding: [0x01,0x08,0x80,0xba,0x03,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 2049, i32 3) call void asm sideeffect "", ""() ret void @@ -34,20 +41,27 @@ define amdgpu_kernel void @test_setreg_f32_round_mode_rtz() { ; Set FP64/FP16 fp_round to round to zero define amdgpu_kernel void @test_setreg_f64_round_mode_rtz() { -; GFX6789-LABEL: test_setreg_f64_round_mode_rtz: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_f64_round_mode_rtz: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3 ; encoding: [0x81,0x08,0x80,0xba,0x03,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_f64_round_mode_rtz: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3 ; encoding: [0x81,0x08,0x00,0xba,0x03,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_f64_round_mode_rtz: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3 +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3 ; encoding: [0x81,0x08,0x80,0xba,0x03,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 2177, i32 3) call void asm sideeffect "", ""() ret void @@ -55,20 +69,27 @@ define amdgpu_kernel void 
@test_setreg_f64_round_mode_rtz() { ; Set all fp_round to round to zero define amdgpu_kernel void @test_setreg_all_round_mode_rtz() { -; GFX6789-LABEL: test_setreg_all_round_mode_rtz: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 7 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_all_round_mode_rtz: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 7 ; encoding: [0x81,0x18,0x80,0xba,0x07,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_all_round_mode_rtz: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 7 ; encoding: [0x81,0x18,0x00,0xba,0x07,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_all_round_mode_rtz: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 7 +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 7 ; encoding: [0x81,0x18,0x80,0xba,0x07,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6273, i32 7) call void asm sideeffect "", ""() ret void @@ -76,100 +97,135 @@ define amdgpu_kernel void @test_setreg_all_round_mode_rtz() { ; Set FP32 fp_round to dynamic mode define amdgpu_cs void @test_setreg_roundingmode_var(i32 inreg %var.mode) { -; GFX6789-LABEL: test_setreg_roundingmode_var: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 2), s0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_roundingmode_var: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 2), s0 ; encoding: [0x01,0x08,0x80,0xb9] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_roundingmode_var: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 2), s0 ; encoding: [0x01,0x08,0x00,0xb9] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_roundingmode_var: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 2), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 2), s0 ; encoding: [0x01,0x08,0x80,0xb9] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 2049, i32 %var.mode) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_ieee_mode_off() { -; GFX6789-LABEL: test_setreg_ieee_mode_off: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_ieee_mode_off: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 0 ; encoding: [0x41,0x02,0x80,0xba,0x00,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_ieee_mode_off: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 0 ; 
encoding: [0x41,0x02,0x00,0xba,0x00,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_ieee_mode_off: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 0 +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 0 ; encoding: [0x41,0x02,0x80,0xba,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 577, i32 0) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_ieee_mode_on() { -; GFX6789-LABEL: test_setreg_ieee_mode_on: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 1 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_ieee_mode_on: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 1 ; encoding: [0x41,0x02,0x80,0xba,0x01,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_ieee_mode_on: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 1 ; encoding: [0x41,0x02,0x00,0xba,0x01,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_ieee_mode_on: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 1 +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 1 ; encoding: [0x41,0x02,0x80,0xba,0x01,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 577, i32 1) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_dx10_clamp_off() { -; GFX6789-LABEL: test_setreg_dx10_clamp_off: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_dx10_clamp_off: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 0 ; encoding: [0x01,0x02,0x80,0xba,0x00,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_dx10_clamp_off: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 0 ; encoding: [0x01,0x02,0x00,0xba,0x00,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_dx10_clamp_off: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 0 +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 0 ; encoding: [0x01,0x02,0x80,0xba,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 513, i32 0) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_dx10_clamp_on() { -; GFX6789-LABEL: test_setreg_dx10_clamp_on: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 
1 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_dx10_clamp_on: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 1 ; encoding: [0x01,0x02,0x80,0xba,0x01,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_dx10_clamp_on: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 1 ; encoding: [0x01,0x02,0x00,0xba,0x01,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_dx10_clamp_on: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 1 +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 1 ; encoding: [0x01,0x02,0x80,0xba,0x01,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 513, i32 1) call void asm sideeffect "", ""() ret void @@ -177,20 +233,27 @@ define amdgpu_kernel void @test_setreg_dx10_clamp_on() { ; Sets full width of fp round and fp denorm fields, to a variable define amdgpu_cs void @test_setreg_full_both_round_mode_and_denorm_mode(i32 inreg %mode) { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), s0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), s0 ; encoding: [0x01,0x38,0x80,0xb9] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), s0 ; encoding: [0x01,0x38,0x00,0xb9] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), s0 ; encoding: [0x01,0x38,0x80,0xb9] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 %mode) call void asm sideeffect "", ""() ret void @@ -198,20 +261,27 @@ define amdgpu_cs void @test_setreg_full_both_round_mode_and_denorm_mode(i32 inre ; Does not cover last bit of denorm field define amdgpu_cs void @test_setreg_most_both_round_mode_and_denorm_mode() { -; GFX6789-LABEL: test_setreg_most_both_round_mode_and_denorm_mode: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 7), 6 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_most_both_round_mode_and_denorm_mode: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 7), 6 ; encoding: [0x01,0x30,0x80,0xba,0x06,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_most_both_round_mode_and_denorm_mode: +; GFX789: ; 
%bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 7), 6 ; encoding: [0x01,0x30,0x00,0xba,0x06,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_most_both_round_mode_and_denorm_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 7), 6 +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 7), 6 ; encoding: [0x01,0x30,0x80,0xba,0x06,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 12289, i32 6) call void asm sideeffect "", ""() ret void @@ -219,200 +289,270 @@ define amdgpu_cs void @test_setreg_most_both_round_mode_and_denorm_mode() { ; Does not cover first bit of denorm field define amdgpu_cs void @test_setreg_most_both_round_mode_and_denorm_mode_6() { -; GFX6789-LABEL: test_setreg_most_both_round_mode_and_denorm_mode_6: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 1, 3), 6 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_most_both_round_mode_and_denorm_mode_6: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 1, 3), 6 ; encoding: [0x41,0x10,0x80,0xba,0x06,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_most_both_round_mode_and_denorm_mode_6: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 1, 3), 6 ; encoding: [0x41,0x10,0x00,0xba,0x06,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_most_both_round_mode_and_denorm_mode_6: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 1, 3), 6 +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 1, 3), 6 ; encoding: [0x41,0x10,0x80,0xba,0x06,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 4161, i32 6) call void asm sideeffect "", ""() ret void } define amdgpu_cs void @test_setreg_f32_denorm_mode(i32 inreg %val) { -; GFX6789-LABEL: test_setreg_f32_denorm_mode: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_f32_denorm_mode: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 ; encoding: [0x01,0x09,0x80,0xb9] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_f32_denorm_mode: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 ; encoding: [0x01,0x09,0x00,0xb9] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_f32_denorm_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 ; encoding: [0x01,0x09,0x80,0xb9] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: 
[0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 2305, i32 %val) call void asm sideeffect "", ""() ret void } define amdgpu_cs void @test_setreg_f64_denorm_mode(i32 inreg %val) { -; GFX6789-LABEL: test_setreg_f64_denorm_mode: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 6, 2), s0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_f64_denorm_mode: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 6, 2), s0 ; encoding: [0x81,0x09,0x80,0xb9] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_f64_denorm_mode: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 6, 2), s0 ; encoding: [0x81,0x09,0x00,0xb9] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_f64_denorm_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 6, 2), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 6, 2), s0 ; encoding: [0x81,0x09,0x80,0xb9] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 2433, i32 %val) call void asm sideeffect "", ""() ret void } define amdgpu_cs void @test_setreg_full_denorm_mode(i32 inreg %val) { -; GFX6789-LABEL: test_setreg_full_denorm_mode: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_denorm_mode: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 ; encoding: [0x01,0x18,0x80,0xb9] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_denorm_mode: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 ; encoding: [0x01,0x18,0x00,0xb9] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_denorm_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 ; encoding: [0x01,0x18,0x80,0xb9] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 %val) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_round_mode_0() { -; GFX6789-LABEL: test_setreg_full_round_mode_0: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_round_mode_0: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0 ; encoding: [0x01,0x18,0x80,0xba,0x00,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_round_mode_0: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0 ; encoding: [0x01,0x18,0x00,0xba,0x00,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: 
[0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_round_mode_0: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_round_mode 0x0 +; GFX10-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0xa4,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 0) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_round_mode_1() { -; GFX6789-LABEL: test_setreg_full_round_mode_1: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_round_mode_1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1 ; encoding: [0x01,0x18,0x80,0xba,0x01,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_round_mode_1: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1 ; encoding: [0x01,0x18,0x00,0xba,0x01,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_round_mode_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_round_mode 0x1 +; GFX10-NEXT: s_round_mode 0x1 ; encoding: [0x01,0x00,0xa4,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 1) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_round_mode_2() { -; GFX6789-LABEL: test_setreg_full_round_mode_2: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_round_mode_2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2 ; encoding: [0x01,0x18,0x80,0xba,0x02,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_round_mode_2: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2 ; encoding: [0x01,0x18,0x00,0xba,0x02,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_round_mode_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_round_mode 0x2 +; GFX10-NEXT: s_round_mode 0x2 ; encoding: [0x02,0x00,0xa4,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 2) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_round_mode_4() { -; GFX6789-LABEL: test_setreg_full_round_mode_4: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_round_mode_4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4 ; encoding: [0x01,0x18,0x80,0xba,0x04,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: 
s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_round_mode_4: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4 ; encoding: [0x01,0x18,0x00,0xba,0x04,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_round_mode_4: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_round_mode 0x4 +; GFX10-NEXT: s_round_mode 0x4 ; encoding: [0x04,0x00,0xa4,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 4) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_round_mode_8() { -; GFX6789-LABEL: test_setreg_full_round_mode_8: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_round_mode_8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8 ; encoding: [0x01,0x18,0x80,0xba,0x08,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_round_mode_8: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8 ; encoding: [0x01,0x18,0x00,0xba,0x08,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_round_mode_8: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_round_mode 0x8 +; GFX10-NEXT: s_round_mode 0x8 ; encoding: [0x08,0x00,0xa4,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 8) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_round_mode_15() { -; GFX6789-LABEL: test_setreg_full_round_mode_15: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_round_mode_15: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15 ; encoding: [0x01,0x18,0x80,0xba,0x0f,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_round_mode_15: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15 ; encoding: [0x01,0x18,0x00,0xba,0x0f,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_round_mode_15: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_round_mode 0xf +; GFX10-NEXT: s_round_mode 0xf ; encoding: [0x0f,0x00,0xa4,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 15) call void asm sideeffect "", ""() ret void @@ -420,60 +560,81 @@ define amdgpu_kernel void @test_setreg_full_round_mode_15() { ; Should truncate set immediate value define amdgpu_kernel void @test_setreg_full_round_mode_42() { -; 
GFX6789-LABEL: test_setreg_full_round_mode_42: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 42 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_round_mode_42: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 42 ; encoding: [0x01,0x18,0x80,0xba,0x2a,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_round_mode_42: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 42 ; encoding: [0x01,0x18,0x00,0xba,0x2a,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_round_mode_42: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_round_mode 0xa +; GFX10-NEXT: s_round_mode 0xa ; encoding: [0x0a,0x00,0xa4,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 42) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_denorm_mode_0() { -; GFX6789-LABEL: test_setreg_full_denorm_mode_0: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_denorm_mode_0: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 0 ; encoding: [0x01,0x19,0x80,0xba,0x00,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_denorm_mode_0: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 0 ; encoding: [0x01,0x19,0x00,0xba,0x00,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_denorm_mode_0: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 0 +; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 0) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_denorm_mode_1() { -; GFX6789-LABEL: test_setreg_full_denorm_mode_1: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 1 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_denorm_mode_1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 1 ; encoding: [0x01,0x19,0x80,0xba,0x01,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_denorm_mode_1: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 1 ; encoding: [0x01,0x19,0x00,0xba,0x01,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_denorm_mode_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 1 +; GFX10-NEXT: 
s_denorm_mode 1 ; encoding: [0x01,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 1) call void asm sideeffect "", ""() ret void @@ -481,100 +642,135 @@ define amdgpu_kernel void @test_setreg_full_denorm_mode_1() { define amdgpu_kernel void @test_setreg_full_denorm_mode_2() { -; GFX6789-LABEL: test_setreg_full_denorm_mode_2: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 2 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_denorm_mode_2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 2 ; encoding: [0x01,0x19,0x80,0xba,0x02,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_denorm_mode_2: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 2 ; encoding: [0x01,0x19,0x00,0xba,0x02,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_denorm_mode_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 2 +; GFX10-NEXT: s_denorm_mode 2 ; encoding: [0x02,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 2) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_denorm_mode_4() { -; GFX6789-LABEL: test_setreg_full_denorm_mode_4: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 4 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_denorm_mode_4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 4 ; encoding: [0x01,0x19,0x80,0xba,0x04,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_denorm_mode_4: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 4 ; encoding: [0x01,0x19,0x00,0xba,0x04,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_denorm_mode_4: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 4 +; GFX10-NEXT: s_denorm_mode 4 ; encoding: [0x04,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 4) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_denorm_mode_8() { -; GFX6789-LABEL: test_setreg_full_denorm_mode_8: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 8 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_denorm_mode_8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 8 ; encoding: [0x01,0x19,0x80,0xba,0x08,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: 
test_setreg_full_denorm_mode_8: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 8 ; encoding: [0x01,0x19,0x00,0xba,0x08,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_denorm_mode_8: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 8 +; GFX10-NEXT: s_denorm_mode 8 ; encoding: [0x08,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 8) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_denorm_mode_15() { -; GFX6789-LABEL: test_setreg_full_denorm_mode_15: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 15 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_denorm_mode_15: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 15 ; encoding: [0x01,0x19,0x80,0xba,0x0f,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_denorm_mode_15: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 15 ; encoding: [0x01,0x19,0x00,0xba,0x0f,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_denorm_mode_15: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: s_denorm_mode 15 ; encoding: [0x0f,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 15) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_denorm_mode_42() { -; GFX6789-LABEL: test_setreg_full_denorm_mode_42: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 42 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_denorm_mode_42: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 42 ; encoding: [0x01,0x19,0x80,0xba,0x2a,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_denorm_mode_42: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 42 ; encoding: [0x01,0x19,0x00,0xba,0x2a,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_denorm_mode_42: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 10 +; GFX10-NEXT: s_denorm_mode 10 ; encoding: [0x0a,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 42) call void asm sideeffect "", ""() ret void @@ -582,231 +778,308 @@ define amdgpu_kernel void @test_setreg_full_denorm_mode_42() { ; Sets all fp round and fp denorm bits. 
define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_0() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_0: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_0: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0 ; encoding: [0x01,0x38,0x80,0xba,0x00,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_0: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0 ; encoding: [0x01,0x38,0x00,0xba,0x00,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0x0 +; GFX10-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 0 +; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 0) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_1() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_1: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 1 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 1 ; encoding: [0x01,0x38,0x80,0xba,0x01,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_1: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 1 ; encoding: [0x01,0x38,0x00,0xba,0x01,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0x1 +; GFX10-NEXT: s_round_mode 0x1 ; encoding: [0x01,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 0 +; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 1) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_2() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_2: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 2 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 2 ; encoding: [0x01,0x38,0x80,0xba,0x02,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: 
;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_2: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 2 ; encoding: [0x01,0x38,0x00,0xba,0x02,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0x2 +; GFX10-NEXT: s_round_mode 0x2 ; encoding: [0x02,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 0 +; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 2) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_4() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_4: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 4 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 4 ; encoding: [0x01,0x38,0x80,0xba,0x04,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_4: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 4 ; encoding: [0x01,0x38,0x00,0xba,0x04,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0x4 +; GFX10-NEXT: s_round_mode 0x4 ; encoding: [0x04,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 0 +; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 4) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_8() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_8: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 8 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 8 ; encoding: [0x01,0x38,0x80,0xba,0x08,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_8: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 8 ; encoding: [0x01,0x38,0x00,0xba,0x08,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0x8 +; GFX10-NEXT: s_round_mode 0x8 ; encoding: 
[0x08,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 0 +; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 8) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_16() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_16: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 16 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 16 ; encoding: [0x01,0x38,0x80,0xba,0x10,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_16: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 16 ; encoding: [0x01,0x38,0x00,0xba,0x10,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0x0 +; GFX10-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 1 +; GFX10-NEXT: s_denorm_mode 1 ; encoding: [0x01,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 16) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_32() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_32: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 32 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 32 ; encoding: [0x01,0x38,0x80,0xba,0x20,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_32: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 32 ; encoding: [0x01,0x38,0x00,0xba,0x20,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0x0 +; GFX10-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 2 +; GFX10-NEXT: s_denorm_mode 2 ; encoding: [0x02,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 32) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_64() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_64: -; GFX6789: 
; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 64 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 64 ; encoding: [0x01,0x38,0x80,0xba,0x40,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_64: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 64 ; encoding: [0x01,0x38,0x00,0xba,0x40,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0x0 +; GFX10-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 4 +; GFX10-NEXT: s_denorm_mode 4 ; encoding: [0x04,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 64) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_128() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_128: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x80 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_128: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x80 ; encoding: [0x01,0x38,0x80,0xba,0x80,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_128: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x80 ; encoding: [0x01,0x38,0x00,0xba,0x80,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0x0 +; GFX10-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 8 +; GFX10-NEXT: s_denorm_mode 8 ; encoding: [0x08,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 128) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_15() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_15: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 15 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_15: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 15 ; encoding: [0x01,0x38,0x80,0xba,0x0f,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: 
test_setreg_full_both_round_mode_and_denorm_mode_15: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 15 ; encoding: [0x01,0x38,0x00,0xba,0x0f,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_15: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0xf +; GFX10-NEXT: s_round_mode 0xf ; encoding: [0x0f,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 0 +; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 15) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_255() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_255: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0xff -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_255: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0xff ; encoding: [0x01,0x38,0x80,0xba,0xff,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_255: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0xff ; encoding: [0x01,0x38,0x00,0xba,0xff,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_255: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_round_mode 0xf +; GFX10-NEXT: s_round_mode 0xf ; encoding: [0x0f,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: s_denorm_mode 15 ; encoding: [0x0f,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 255) call void asm sideeffect "", ""() ret void @@ -814,61 +1087,82 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_255( ; Truncate extra high bit define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_597() { -; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_597: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x255 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_597: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x255 ; encoding: [0x01,0x38,0x80,0xba,0x55,0x02,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_597: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x255 ; encoding: [0x01,0x38,0x00,0xba,0x55,0x02,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_597: ; GFX10: ; %bb.0: 
-; GFX10-NEXT: s_round_mode 0x5 +; GFX10-NEXT: s_round_mode 0x5 ; encoding: [0x05,0x00,0xa4,0xbf] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_denorm_mode 5 +; GFX10-NEXT: s_denorm_mode 5 ; encoding: [0x05,0x00,0xa5,0xbf] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 597) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_set_8_bits_straddles_round_and_denorm() { -; GFX6789-LABEL: test_setreg_set_8_bits_straddles_round_and_denorm: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 8), 0xff -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_set_8_bits_straddles_round_and_denorm: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 8), 0xff ; encoding: [0x81,0x38,0x80,0xba,0xff,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_set_8_bits_straddles_round_and_denorm: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 8), 0xff ; encoding: [0x81,0x38,0x00,0xba,0xff,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_set_8_bits_straddles_round_and_denorm: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 8), 0xff +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 8), 0xff ; encoding: [0x81,0x38,0x80,0xba,0xff,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 14465, i32 255) call void asm sideeffect "", ""() ret void } define amdgpu_kernel void @test_setreg_set_4_bits_straddles_round_and_denorm() { -; GFX6789-LABEL: test_setreg_set_4_bits_straddles_round_and_denorm: -; GFX6789: ; %bb.0: -; GFX6789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 15 -; GFX6789-NEXT: ;;#ASMSTART -; GFX6789-NEXT: ;;#ASMEND -; GFX6789-NEXT: s_endpgm +; GFX6-LABEL: test_setreg_set_4_bits_straddles_round_and_denorm: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 15 ; encoding: [0x81,0x18,0x80,0xba,0x0f,0x00,0x00,0x00] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX789-LABEL: test_setreg_set_4_bits_straddles_round_and_denorm: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 15 ; encoding: [0x81,0x18,0x00,0xba,0x0f,0x00,0x00,0x00] +; GFX789-NEXT: ;;#ASMSTART +; GFX789-NEXT: ;;#ASMEND +; GFX789-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX10-LABEL: test_setreg_set_4_bits_straddles_round_and_denorm: ; GFX10: ; %bb.0: ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 15 +; GFX10-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 15 ; encoding: [0x81,0x18,0x80,0xba,0x0f,0x00,0x00,0x00] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] call void @llvm.amdgcn.s.setreg(i32 6273, i32 15) call void asm sideeffect "", ""() ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll index ccd6dc912b66c..3df101ea6fdda 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll @@ -11,7 +11,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s6 idxen offen ; CHECK-NEXT: s_endpgm - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -25,7 +25,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 idxen ; CHECK-NEXT: s_endpgm - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } @@ -38,7 +38,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s6 idxen offen slc ; CHECK-NEXT: s_endpgm - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } @@ -51,11 +51,11 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[8:11], s6 idxen offen ; CHECK-NEXT: s_endpgm - call void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -declare void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0 -declare void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0 +declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll index 3e0d87bb6ef93..e6c90336724b5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll @@ -23,6 +23,19 @@ main_body: ret half %elt } +; GCN-LABEL: {{^}}buffer_load_format_d16_xyz: +; UNPACKED: buffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: buffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] +define 
amdgpu_ps half @buffer_load_format_d16_xyz(<4 x i32> inreg %rsrc) { +main_body: + %data = call <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %elt = extractelement <3 x half> %data, i32 2 + ret half %elt +} + ; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw: ; UNPACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] @@ -47,5 +60,6 @@ main_body: declare half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32>, i32, i32, i32, i32) declare <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(<4 x i32>, i32, i32, i32, i32) +declare <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32>, i32, i32, i32, i32) declare <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(<4 x i32>, i32, i32, i32, i32) declare i16 @llvm.amdgcn.struct.buffer.load.format.i16(<4 x i32>, i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll index 8ae753b59ab54..69c9a633db864 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll @@ -28,6 +28,31 @@ main_body: ret void } +; GCN-LABEL: {{^}}buffer_store_format_d16_xyz: +; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 + +; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] + +; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] + +; UNPACKED: buffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen + +; PACKED: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} +; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] +; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED0]] + +; PACKED: buffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +define amdgpu_kernel void @buffer_store_format_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %index) { +main_body: + %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> <i32 0, i32 1, i32 2> + call void @llvm.amdgcn.struct.buffer.store.format.v3f16(<3 x half> %data_subvec, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + ret void +} + ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 @@ -64,5 +89,6 @@ main_body: declare void @llvm.amdgcn.struct.buffer.store.format.f16(half, <4 x i32>, i32, i32, i32, i32) declare void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.struct.buffer.store.format.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32) declare void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32) declare void @llvm.amdgcn.struct.buffer.store.format.i16(i16, <4 x i32>, i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll index 2fd21a10564d4..ebf8940e034a4 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll @@ -28,6 +28,21 @@ main_body: ret half %elt } +; GCN-LABEL: {{^}}tbuffer_load_d16_xyz: +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen +; PREGFX10-UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PREGFX10-PACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen +; GFX10-PACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] idxen +; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] +define amdgpu_ps half @tbuffer_load_d16_xyz(<4 x i32> inreg %rsrc) { +main_body: + %data = call <3 x half> @llvm.amdgcn.struct.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0) + %elt = extractelement <3 x half> %data, i32 2 + ret half %elt +} + ; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw: ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 ; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen @@ -45,5 +60,5 @@ main_body: declare half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32, i32) declare <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32, i32) +declare <3 x half> @llvm.amdgcn.struct.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32, i32) declare <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32) - diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll index ca78b29cc8f53..93634fbffb935 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -32,6 +32,30 @@ main_body: ret void } +; GCN-LABEL: {{^}}tbuffer_store_d16_xyz: +; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 + +; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] + +; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] +; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen + +; PACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} +; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] +; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED0]] +; PREGFX10-PACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen +; GFX10-PACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen +define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { +main_body: + %data_subvec = shufflevector <4 x half> %data, <4 x 
half> undef, <3 x i32> <i32 0, i32 1, i32 2> + call void @llvm.amdgcn.struct.tbuffer.store.v3f16(<3 x half> %data_subvec, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) + ret void +} + ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 @@ -57,4 +81,5 @@ main_body: declare void @llvm.amdgcn.struct.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32, i32) declare void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32, i32) +declare void @llvm.amdgcn.struct.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32, i32) declare void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll index 205cc5f78d335..2839f92d2aae1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll @@ -23,6 +23,19 @@ main_body: ret half %elt } +; GCN-LABEL: {{^}}tbuffer_load_d16_xyz: +; UNPACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] +define amdgpu_ps half @tbuffer_load_d16_xyz(<4 x i32> inreg %rsrc) { +main_body: + %data = call <3 x half> @llvm.amdgcn.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) + %elt = extractelement <3 x half> %data, i32 2 + ret half %elt +} + ; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw: ; UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] @@ -38,4 +51,5 @@ main_body: declare half @llvm.amdgcn.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) declare <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare <3 x half> @llvm.amdgcn.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) declare <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll index 4dd76a3a632dc..a940df3540cfe 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll @@ -28,6 +28,28 @@ main_body: ret void } +; GCN-LABEL: {{^}}tbuffer_store_d16_xyz: +; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 + +; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], [[K]] + +; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] +; UNPACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen + +; PACKED-DAG: s_and_b32
[[SHR0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} +; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] +; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR0]] +; PACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen +define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <3 x half> %data, i32 %vindex) { +main_body: + call void @llvm.amdgcn.tbuffer.store.v3f16(<3 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) + ret void +} + ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 @@ -52,4 +74,5 @@ main_body: declare void @llvm.amdgcn.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) declare void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare void @llvm.amdgcn.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) declare void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll index ef646d6be267f..d8a82859629c7 100644 --- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -45,7 +45,7 @@ entry: ; GCN: s_barrier -; SI: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]] +; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]] ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]] ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]] ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7 diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll new file mode 100644 index 0000000000000..f390fadba1503 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -0,0 +1,125 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck -check-prefixes=GCN,GFX9 %s + +; Make sure the correct frame offset is used with the local +; frame area. +; +; %pin.low is allocated to offset 0. +; +; %local.area is assigned a local frame offset of 4096 by the +; LocalStackSlotAllocation pass. +; +; The %load0 access to %gep.large.offset initially used the stack +; pointer register and directly referenced the frame index. After +; LocalStackSlotAllocation, it would no longer refer to a frame index +; so eliminateFrameIndex would not adjust the access to use the +; correct FP offset.
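+; +; As a worked example (offsets derived from the IR below; the exact frame +; placement is specific to this test, not a general rule): element 8 of the +; [1060 x i64] array lies 8 * 8 = 64 bytes past the array base, and element +; 1050 lies 1050 * 8 = 8400 (0x20d0) bytes past it. A correct lowering folds +; these constants onto the materialized base register, visible as the 64 and +; 0x20d0 adds in the CHECK lines, instead of re-deriving the address from the +; stale frame index.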
+ +define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspace(1)* %in) { +; GCN-LABEL: local_stack_offset_uses_sp: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: v_mov_b32_e32 v1, 0x3000 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: v_add_u32_e32 v0, 64, v1 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0x2000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN-NEXT: BB0_1: ; %loadstoreloop +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_add_u32_e32 v3, s6, v1 +; GCN-NEXT: s_add_i32 s6, s6, 1 +; GCN-NEXT: s_cmpk_lt_u32 s6, 0x2120 +; GCN-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen +; GCN-NEXT: s_cbranch_scc1 BB0_1 +; GCN-NEXT: ; %bb.2: ; %split +; GCN-NEXT: v_mov_b32_e32 v1, 0x3000 +; GCN-NEXT: v_add_u32_e32 v1, 0x20d0, v1 +; GCN-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 +; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GCN-NEXT: s_endpgm +entry: + %pin.low = alloca i32, align 8192, addrspace(5) + %local.area = alloca [1060 x i64], align 4096, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %pin.low + %local.area.cast = bitcast [1060 x i64] addrspace(5)* %local.area to i8 addrspace(5)* + call void @llvm.memset.p5i8.i32(i8 addrspace(5)* align 4 %local.area.cast, i8 0, i32 8480, i1 true) + %gep.large.offset = getelementptr inbounds [1060 x i64], [1060 x i64] addrspace(5)* %local.area, i64 0, i64 1050 + %gep.small.offset = getelementptr inbounds [1060 x i64], [1060 x i64] addrspace(5)* %local.area, i64 0, i64 8 + %load0 = load volatile i64, i64 addrspace(5)* %gep.large.offset + %load1 = load volatile i64, i64 addrspace(5)* %gep.small.offset + %add0 = add i64 %load0, %load1 + store volatile i64 %add0, i64 addrspace(1)* %out + ret void +} + +define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspace(1)* %in) { +; GCN-LABEL: func_local_stack_offset_uses_sp: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_add_u32 s4, s32, 0x7ffc0 +; GCN-NEXT: s_mov_b32 s5, s33 +; GCN-NEXT: s_and_b32 s33, s4, 0xfff80000 +; GCN-NEXT: v_lshrrev_b32_e64 v3, 6, s33 +; GCN-NEXT: v_add_u32_e32 v3, 0x1000, v3 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_add_u32_e32 v2, 64, v3 +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: s_add_u32 s32, s32, 0x180000 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 +; GCN-NEXT: BB1_1: ; %loadstoreloop +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_add_u32_e32 v5, s4, v3 +; GCN-NEXT: s_add_i32 s4, s4, 1 +; GCN-NEXT: s_cmpk_lt_u32 s4, 0x2120 +; GCN-NEXT: buffer_store_byte v4, v5, s[0:3], 0 offen +; GCN-NEXT: s_cbranch_scc1 BB1_1 +; GCN-NEXT: ; %bb.2: ; %split +; GCN-NEXT: v_lshrrev_b32_e64 v3, 6, s33 +; GCN-NEXT: v_add_u32_e32 v3, 0x1000, v3 +; GCN-NEXT: v_add_u32_e32 v3, 0x20d0, v3 +; GCN-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v3, 
v3, s[0:3], 0 offen offset:4 +; GCN-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:4 +; GCN-NEXT: s_sub_u32 s32, s32, 0x180000 +; GCN-NEXT: s_mov_b32 s33, s5 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v4, v5 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +entry: + %pin.low = alloca i32, align 8192, addrspace(5) + %local.area = alloca [1060 x i64], align 4096, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %pin.low + %local.area.cast = bitcast [1060 x i64] addrspace(5)* %local.area to i8 addrspace(5)* + call void @llvm.memset.p5i8.i32(i8 addrspace(5)* align 4 %local.area.cast, i8 0, i32 8480, i1 true) + %gep.large.offset = getelementptr inbounds [1060 x i64], [1060 x i64] addrspace(5)* %local.area, i64 0, i64 1050 + %gep.small.offset = getelementptr inbounds [1060 x i64], [1060 x i64] addrspace(5)* %local.area, i64 0, i64 8 + %load0 = load volatile i64, i64 addrspace(5)* %gep.large.offset + %load1 = load volatile i64, i64 addrspace(5)* %gep.small.offset + %add0 = add i64 %load0, %load1 + store volatile i64 %add0, i64 addrspace(1)* %out + ret void +} + +declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture writeonly, i8, i32, i1 immarg) #0 + +attributes #0 = { argmemonly nounwind willreturn writeonly } diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index e1386d3e07d7f..e17c322a37728 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -70,16 +70,16 @@ define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocaptu ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: v_mov_b32_e32 v6, s6 ; GCN-NEXT: v_mov_b32_e32 v7, s7 -; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off -; GCN-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16 -; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mov_b32_e32 v9, s9 ; GCN-NEXT: v_mov_b32_e32 v10, s10 ; GCN-NEXT: v_mov_b32_e32 v11, s11 +; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off +; GCN-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16 +; GCN-NEXT: global_store_dwordx4 v[12:13], v[8:11], off offset:32 +; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v[12:13], v[8:11], off offset:32 ; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off offset:48 ; GCN-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll index 925a2daa93da7..8d3b401c57884 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll +++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll @@ -529,8 +529,8 @@ define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* % ; GCN-LABEL: {{^}}merge_global_store_5_constants_i32: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}} -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}} ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}} +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}} ; GCN: buffer_store_dword v[[HI]] define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { store i32 9, i32 addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/movreld-bug.ll 
b/llvm/test/CodeGen/AMDGPU/movreld-bug.ll index 3071f18c449fc..4bf15054aee00 100644 --- a/llvm/test/CodeGen/AMDGPU/movreld-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/movreld-bug.ll @@ -8,14 +8,14 @@ ; MOVREL-NEXT: v_movreld_b32_e32 v0, ; GPRIDX: s_set_gpr_idx_on s0, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v0, 0 +; GPRIDX-NEXT: v_mov_b32_e32 v0, 1.0 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return define amdgpu_ps float @main(i32 inreg %arg) #0 { main_body: - %tmp24 = insertelement <16 x float> undef, float 0.000000e+00, i32 %arg + %tmp24 = insertelement <16 x float> zeroinitializer, float 1.000000e+00, i32 %arg %tmp25 = extractelement <16 x float> %tmp24, i32 1 ret float %tmp25 } diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index 58085f89e04a8..ebd7ca184bd35 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -28,14 +28,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; GCN-NEXT: s_cbranch_scc1 BB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 ; GCN-NEXT: s_add_i32 s6, s32, 0x1000 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_lshl_b32 s7, s10, 2 ; GCN-NEXT: s_mov_b32 s32, s6 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_add_i32 s6, s6, s7 -; GCN-NEXT: v_mov_b32_e32 v3, 1 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: s_add_i32 s6, s6, s7 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 @@ -98,14 +98,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: s_add_i32 s6, s32, 0x1000 ; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_lshl_b32 s7, s7, 2 ; GCN-NEXT: s_mov_b32 s32, s6 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_add_i32 s6, s6, s7 -; GCN-NEXT: v_mov_b32_e32 v3, 1 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: s_add_i32 s6, s6, s7 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 @@ -166,9 +166,9 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i ; GCN-NEXT: s_add_i32 s6, s32, 0x1000 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NEXT: v_mov_b32_e32 v6, 1 ; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen offset:4 +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4 ; GCN-NEXT: v_lshl_add_u32 v2, v4, 2, s6 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v5 @@ -228,9 +228,9 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out, ; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v5, s6 -; GCN-NEXT: v_mov_b32_e32 v6, 1 ; GCN-NEXT: buffer_store_dword v2, v5, 
s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen offset:4 +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4 ; GCN-NEXT: v_lshl_add_u32 v2, v3, 2, s6 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v4 diff --git a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll index 31531a43fc3f2..50bc175bc24f2 100644 --- a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll @@ -139,6 +139,8 @@ ; GCN-O1-NEXT: Loop Pass Manager ; GCN-O1-NEXT: Rotate Loops ; GCN-O1-NEXT: Memory SSA +; GCN-O1-NEXT: Lazy Branch Probability Analysis +; GCN-O1-NEXT: Lazy Block Frequency Analysis ; GCN-O1-NEXT: Loop Pass Manager ; GCN-O1-NEXT: Loop Invariant Code Motion ; GCN-O1-NEXT: Post-Dominator Tree Construction @@ -270,10 +272,10 @@ ; GCN-O1-NEXT: LCSSA Verifier ; GCN-O1-NEXT: Loop-Closed SSA Form Pass ; GCN-O1-NEXT: Scalar Evolution Analysis -; GCN-O1-NEXT: Loop Pass Manager -; GCN-O1-NEXT: Loop Invariant Code Motion ; GCN-O1-NEXT: Lazy Branch Probability Analysis ; GCN-O1-NEXT: Lazy Block Frequency Analysis +; GCN-O1-NEXT: Loop Pass Manager +; GCN-O1-NEXT: Loop Invariant Code Motion ; GCN-O1-NEXT: Optimization Remark Emitter ; GCN-O1-NEXT: Warn about non-applied transformations ; GCN-O1-NEXT: Alignment from assumptions @@ -459,6 +461,8 @@ ; GCN-O2-NEXT: Loop Pass Manager ; GCN-O2-NEXT: Rotate Loops ; GCN-O2-NEXT: Memory SSA +; GCN-O2-NEXT: Lazy Branch Probability Analysis +; GCN-O2-NEXT: Lazy Block Frequency Analysis ; GCN-O2-NEXT: Loop Pass Manager ; GCN-O2-NEXT: Loop Invariant Code Motion ; GCN-O2-NEXT: Post-Dominator Tree Construction @@ -521,6 +525,8 @@ ; GCN-O2-NEXT: LCSSA Verifier ; GCN-O2-NEXT: Loop-Closed SSA Form Pass ; GCN-O2-NEXT: Scalar Evolution Analysis +; GCN-O2-NEXT: Lazy Branch Probability Analysis +; GCN-O2-NEXT: Lazy Block Frequency Analysis ; GCN-O2-NEXT: Loop Pass Manager ; GCN-O2-NEXT: Loop Invariant Code Motion ; GCN-O2-NEXT: Post-Dominator Tree Construction @@ -623,10 +629,10 @@ ; GCN-O2-NEXT: LCSSA Verifier ; GCN-O2-NEXT: Loop-Closed SSA Form Pass ; GCN-O2-NEXT: Scalar Evolution Analysis -; GCN-O2-NEXT: Loop Pass Manager -; GCN-O2-NEXT: Loop Invariant Code Motion ; GCN-O2-NEXT: Lazy Branch Probability Analysis ; GCN-O2-NEXT: Lazy Block Frequency Analysis +; GCN-O2-NEXT: Loop Pass Manager +; GCN-O2-NEXT: Loop Invariant Code Motion ; GCN-O2-NEXT: Optimization Remark Emitter ; GCN-O2-NEXT: Warn about non-applied transformations ; GCN-O2-NEXT: Alignment from assumptions @@ -819,6 +825,8 @@ ; GCN-O3-NEXT: Loop Pass Manager ; GCN-O3-NEXT: Rotate Loops ; GCN-O3-NEXT: Memory SSA +; GCN-O3-NEXT: Lazy Branch Probability Analysis +; GCN-O3-NEXT: Lazy Block Frequency Analysis ; GCN-O3-NEXT: Loop Pass Manager ; GCN-O3-NEXT: Loop Invariant Code Motion ; GCN-O3-NEXT: Post-Dominator Tree Construction @@ -881,6 +889,8 @@ ; GCN-O3-NEXT: LCSSA Verifier ; GCN-O3-NEXT: Loop-Closed SSA Form Pass ; GCN-O3-NEXT: Scalar Evolution Analysis +; GCN-O3-NEXT: Lazy Branch Probability Analysis +; GCN-O3-NEXT: Lazy Block Frequency Analysis ; GCN-O3-NEXT: Loop Pass Manager ; GCN-O3-NEXT: Loop Invariant Code Motion ; GCN-O3-NEXT: Post-Dominator Tree Construction @@ -983,10 +993,10 @@ ; GCN-O3-NEXT: LCSSA Verifier ; GCN-O3-NEXT: Loop-Closed SSA Form Pass ; GCN-O3-NEXT: Scalar Evolution Analysis -; GCN-O3-NEXT: Loop Pass Manager -; GCN-O3-NEXT: Loop Invariant Code Motion ; GCN-O3-NEXT: Lazy Branch Probability Analysis ; GCN-O3-NEXT: Lazy Block 
Frequency Analysis +; GCN-O3-NEXT: Loop Pass Manager +; GCN-O3-NEXT: Loop Invariant Code Motion ; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Warn about non-applied transformations ; GCN-O3-NEXT: Alignment from assumptions diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll index b119ffd303e08..e991c550c6be0 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(i32 addrspace(1)* %out, i32 %in) #0 { ; GCN-LABEL: spill_sgprs_to_multiple_vgprs: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND @@ -42,354 +42,352 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(i32 addrspace(1)* %out, ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[84:91] ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:11] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v0, s4, 8 +; GCN-NEXT: v_writelane_b32 v0, s5, 9 +; GCN-NEXT: v_writelane_b32 v0, s6, 10 +; GCN-NEXT: v_writelane_b32 v0, s7, 11 +; GCN-NEXT: v_writelane_b32 v0, s8, 12 +; GCN-NEXT: v_writelane_b32 v0, s9, 13 +; GCN-NEXT: v_writelane_b32 v0, s10, 14 +; GCN-NEXT: v_writelane_b32 v0, s11, 15 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:11] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v0, s4, 16 +; GCN-NEXT: v_writelane_b32 v0, s5, 17 +; GCN-NEXT: v_writelane_b32 v0, s6, 18 +; GCN-NEXT: v_writelane_b32 v0, s7, 19 +; GCN-NEXT: v_writelane_b32 v0, s8, 20 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v0, s10, 22 +; GCN-NEXT: v_writelane_b32 v0, s11, 23 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:11] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v0, s4, 24 +; GCN-NEXT: v_writelane_b32 v0, s5, 25 +; GCN-NEXT: v_writelane_b32 v0, s6, 26 +; GCN-NEXT: v_writelane_b32 v0, s7, 27 +; GCN-NEXT: v_writelane_b32 v0, s8, 28 +; GCN-NEXT: v_writelane_b32 v0, s9, 29 +; GCN-NEXT: v_writelane_b32 v0, s10, 30 +; GCN-NEXT: v_writelane_b32 v0, s11, 31 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:11] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v0, s4, 32 +; GCN-NEXT: v_writelane_b32 v0, s5, 33 +; GCN-NEXT: v_writelane_b32 v0, s6, 34 +; GCN-NEXT: v_writelane_b32 v0, s7, 35 +; GCN-NEXT: v_writelane_b32 v0, s8, 36 +; GCN-NEXT: v_writelane_b32 v0, s9, 37 +; GCN-NEXT: v_writelane_b32 v0, s10, 38 +; GCN-NEXT: v_writelane_b32 v0, s11, 39 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:11] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v0, s4, 40 +; GCN-NEXT: v_writelane_b32 v0, s5, 41 +; GCN-NEXT: v_writelane_b32 v0, s6, 42 +; GCN-NEXT: v_writelane_b32 v0, s7, 43 +; GCN-NEXT: v_writelane_b32 v0, s8, 44 +; GCN-NEXT: v_writelane_b32 v0, s9, 45 +; GCN-NEXT: v_writelane_b32 v0, s10, 46 +; GCN-NEXT: v_writelane_b32 v0, s11, 47 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:11] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v0, s4, 48 +; GCN-NEXT: v_writelane_b32 v0, s5, 49 +; GCN-NEXT: v_writelane_b32 v0, s6, 50 +; GCN-NEXT: v_writelane_b32 
v0, s7, 51 +; GCN-NEXT: v_writelane_b32 v0, s8, 52 +; GCN-NEXT: v_writelane_b32 v0, s9, 53 +; GCN-NEXT: v_writelane_b32 v0, s10, 54 +; GCN-NEXT: v_writelane_b32 v0, s11, 55 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:11] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_mov_b32 s3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_writelane_b32 v0, s0, 0 -; GCN-NEXT: v_writelane_b32 v0, s4, 1 -; GCN-NEXT: v_writelane_b32 v0, s5, 2 -; GCN-NEXT: v_writelane_b32 v0, s6, 3 -; GCN-NEXT: v_writelane_b32 v0, s7, 4 -; GCN-NEXT: v_writelane_b32 v0, s8, 5 -; GCN-NEXT: v_writelane_b32 v0, s9, 6 -; GCN-NEXT: v_writelane_b32 v0, s10, 7 -; GCN-NEXT: v_writelane_b32 v0, s11, 8 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:7] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s0, 9 -; GCN-NEXT: v_writelane_b32 v0, s1, 10 -; GCN-NEXT: v_writelane_b32 v0, s2, 11 -; GCN-NEXT: v_writelane_b32 v0, s3, 12 -; GCN-NEXT: v_writelane_b32 v0, s4, 13 -; GCN-NEXT: v_writelane_b32 v0, s5, 14 -; GCN-NEXT: v_writelane_b32 v0, s6, 15 -; GCN-NEXT: v_writelane_b32 v0, s7, 16 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:7] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s0, 17 -; GCN-NEXT: v_writelane_b32 v0, s1, 18 -; GCN-NEXT: v_writelane_b32 v0, s2, 19 -; GCN-NEXT: v_writelane_b32 v0, s3, 20 -; GCN-NEXT: v_writelane_b32 v0, s4, 21 -; GCN-NEXT: v_writelane_b32 v0, s5, 22 -; GCN-NEXT: v_writelane_b32 v0, s6, 23 -; GCN-NEXT: v_writelane_b32 v0, s7, 24 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:7] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s0, 25 -; GCN-NEXT: v_writelane_b32 v0, s1, 26 -; GCN-NEXT: v_writelane_b32 v0, s2, 27 -; GCN-NEXT: v_writelane_b32 v0, s3, 28 -; GCN-NEXT: v_writelane_b32 v0, s4, 29 -; GCN-NEXT: v_writelane_b32 v0, s5, 30 -; GCN-NEXT: v_writelane_b32 v0, s6, 31 -; GCN-NEXT: v_writelane_b32 v0, s7, 32 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:7] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s0, 33 -; GCN-NEXT: v_writelane_b32 v0, s1, 34 -; GCN-NEXT: v_writelane_b32 v0, s2, 35 -; GCN-NEXT: v_writelane_b32 v0, s3, 36 -; GCN-NEXT: v_writelane_b32 v0, s4, 37 -; GCN-NEXT: v_writelane_b32 v0, s5, 38 -; GCN-NEXT: v_writelane_b32 v0, s6, 39 -; GCN-NEXT: v_writelane_b32 v0, s7, 40 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:7] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s0, 41 -; GCN-NEXT: v_writelane_b32 v0, s1, 42 -; GCN-NEXT: v_writelane_b32 v0, s2, 43 -; GCN-NEXT: v_writelane_b32 v0, s3, 44 -; GCN-NEXT: v_writelane_b32 v0, s4, 45 -; GCN-NEXT: v_writelane_b32 v0, s5, 46 -; GCN-NEXT: v_writelane_b32 v0, s6, 47 -; GCN-NEXT: v_writelane_b32 v0, s7, 48 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:7] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s0, 49 -; GCN-NEXT: v_writelane_b32 v0, s1, 50 -; GCN-NEXT: v_writelane_b32 v0, s2, 51 -; GCN-NEXT: v_writelane_b32 v0, s3, 52 -; GCN-NEXT: v_writelane_b32 v0, s4, 53 -; GCN-NEXT: v_writelane_b32 v0, s5, 54 -; GCN-NEXT: v_writelane_b32 v0, s6, 55 -; GCN-NEXT: v_writelane_b32 v0, s7, 56 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:7] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_mov_b32 s8, 0 -; GCN-NEXT: v_readlane_b32 s9, v0, 0 -; GCN-NEXT: s_cmp_lg_u32 s9, s8 -; GCN-NEXT: v_writelane_b32 v0, s12, 57 -; GCN-NEXT: v_writelane_b32 v0, s13, 58 -; GCN-NEXT: v_writelane_b32 v0, s14, 59 -; GCN-NEXT: v_writelane_b32 v0, s15, 60 -; GCN-NEXT: v_writelane_b32 v0, s16, 61 -; GCN-NEXT: v_writelane_b32 v0, s17, 62 -; GCN-NEXT: v_writelane_b32 v0, s18, 63 -; GCN-NEXT: v_writelane_b32 v1, s19, 0 -; GCN-NEXT: 
v_writelane_b32 v1, s20, 1 -; GCN-NEXT: v_writelane_b32 v1, s21, 2 -; GCN-NEXT: v_writelane_b32 v1, s22, 3 -; GCN-NEXT: v_writelane_b32 v1, s23, 4 -; GCN-NEXT: v_writelane_b32 v1, s24, 5 -; GCN-NEXT: v_writelane_b32 v1, s25, 6 -; GCN-NEXT: v_writelane_b32 v1, s26, 7 -; GCN-NEXT: v_writelane_b32 v1, s27, 8 -; GCN-NEXT: v_writelane_b32 v1, s36, 9 -; GCN-NEXT: v_writelane_b32 v1, s37, 10 -; GCN-NEXT: v_writelane_b32 v1, s38, 11 -; GCN-NEXT: v_writelane_b32 v1, s39, 12 -; GCN-NEXT: v_writelane_b32 v1, s40, 13 -; GCN-NEXT: v_writelane_b32 v1, s41, 14 -; GCN-NEXT: v_writelane_b32 v1, s42, 15 -; GCN-NEXT: v_writelane_b32 v1, s43, 16 -; GCN-NEXT: v_writelane_b32 v1, s44, 17 -; GCN-NEXT: v_writelane_b32 v1, s45, 18 -; GCN-NEXT: v_writelane_b32 v1, s46, 19 -; GCN-NEXT: v_writelane_b32 v1, s47, 20 -; GCN-NEXT: v_writelane_b32 v1, s48, 21 -; GCN-NEXT: v_writelane_b32 v1, s49, 22 -; GCN-NEXT: v_writelane_b32 v1, s50, 23 -; GCN-NEXT: v_writelane_b32 v1, s51, 24 -; GCN-NEXT: v_writelane_b32 v1, s52, 25 -; GCN-NEXT: v_writelane_b32 v1, s53, 26 -; GCN-NEXT: v_writelane_b32 v1, s54, 27 -; GCN-NEXT: v_writelane_b32 v1, s55, 28 -; GCN-NEXT: v_writelane_b32 v1, s56, 29 -; GCN-NEXT: v_writelane_b32 v1, s57, 30 -; GCN-NEXT: v_writelane_b32 v1, s58, 31 -; GCN-NEXT: v_writelane_b32 v1, s59, 32 -; GCN-NEXT: v_writelane_b32 v1, s60, 33 -; GCN-NEXT: v_writelane_b32 v1, s61, 34 -; GCN-NEXT: v_writelane_b32 v1, s62, 35 -; GCN-NEXT: v_writelane_b32 v1, s63, 36 -; GCN-NEXT: v_writelane_b32 v1, s64, 37 -; GCN-NEXT: v_writelane_b32 v1, s65, 38 -; GCN-NEXT: v_writelane_b32 v1, s66, 39 -; GCN-NEXT: v_writelane_b32 v1, s67, 40 -; GCN-NEXT: v_writelane_b32 v1, s68, 41 -; GCN-NEXT: v_writelane_b32 v1, s69, 42 -; GCN-NEXT: v_writelane_b32 v1, s70, 43 -; GCN-NEXT: v_writelane_b32 v1, s71, 44 -; GCN-NEXT: v_writelane_b32 v1, s72, 45 -; GCN-NEXT: v_writelane_b32 v1, s73, 46 -; GCN-NEXT: v_writelane_b32 v1, s74, 47 -; GCN-NEXT: v_writelane_b32 v1, s75, 48 -; GCN-NEXT: v_writelane_b32 v1, s76, 49 -; GCN-NEXT: v_writelane_b32 v1, s77, 50 -; GCN-NEXT: v_writelane_b32 v1, s78, 51 -; GCN-NEXT: v_writelane_b32 v1, s79, 52 -; GCN-NEXT: v_writelane_b32 v1, s80, 53 -; GCN-NEXT: v_writelane_b32 v1, s81, 54 -; GCN-NEXT: v_writelane_b32 v1, s82, 55 -; GCN-NEXT: v_writelane_b32 v1, s83, 56 -; GCN-NEXT: v_writelane_b32 v1, s84, 57 -; GCN-NEXT: v_writelane_b32 v1, s85, 58 -; GCN-NEXT: v_writelane_b32 v1, s86, 59 -; GCN-NEXT: v_writelane_b32 v1, s87, 60 -; GCN-NEXT: v_writelane_b32 v1, s88, 61 -; GCN-NEXT: v_writelane_b32 v1, s89, 62 -; GCN-NEXT: v_writelane_b32 v1, s90, 63 -; GCN-NEXT: v_writelane_b32 v2, s91, 0 -; GCN-NEXT: v_writelane_b32 v2, s0, 1 -; GCN-NEXT: v_writelane_b32 v2, s1, 2 -; GCN-NEXT: v_writelane_b32 v2, s2, 3 -; GCN-NEXT: v_writelane_b32 v2, s3, 4 -; GCN-NEXT: v_writelane_b32 v2, s4, 5 -; GCN-NEXT: v_writelane_b32 v2, s5, 6 -; GCN-NEXT: v_writelane_b32 v2, s6, 7 -; GCN-NEXT: v_writelane_b32 v2, s7, 8 +; GCN-NEXT: s_cmp_lg_u32 s2, s3 +; GCN-NEXT: v_writelane_b32 v0, s12, 56 +; GCN-NEXT: v_writelane_b32 v0, s13, 57 +; GCN-NEXT: v_writelane_b32 v0, s14, 58 +; GCN-NEXT: v_writelane_b32 v0, s15, 59 +; GCN-NEXT: v_writelane_b32 v0, s16, 60 +; GCN-NEXT: v_writelane_b32 v0, s17, 61 +; GCN-NEXT: v_writelane_b32 v0, s18, 62 +; GCN-NEXT: v_writelane_b32 v0, s19, 63 +; GCN-NEXT: v_writelane_b32 v1, s20, 0 +; GCN-NEXT: v_writelane_b32 v1, s21, 1 +; GCN-NEXT: v_writelane_b32 v1, s22, 2 +; GCN-NEXT: v_writelane_b32 v1, s23, 3 +; GCN-NEXT: v_writelane_b32 v1, s24, 4 +; GCN-NEXT: v_writelane_b32 v1, s25, 5 +; GCN-NEXT: v_writelane_b32 v1, s26, 6 
+; GCN-NEXT: v_writelane_b32 v1, s27, 7 +; GCN-NEXT: v_writelane_b32 v1, s36, 8 +; GCN-NEXT: v_writelane_b32 v1, s37, 9 +; GCN-NEXT: v_writelane_b32 v1, s38, 10 +; GCN-NEXT: v_writelane_b32 v1, s39, 11 +; GCN-NEXT: v_writelane_b32 v1, s40, 12 +; GCN-NEXT: v_writelane_b32 v1, s41, 13 +; GCN-NEXT: v_writelane_b32 v1, s42, 14 +; GCN-NEXT: v_writelane_b32 v1, s43, 15 +; GCN-NEXT: v_writelane_b32 v1, s44, 16 +; GCN-NEXT: v_writelane_b32 v1, s45, 17 +; GCN-NEXT: v_writelane_b32 v1, s46, 18 +; GCN-NEXT: v_writelane_b32 v1, s47, 19 +; GCN-NEXT: v_writelane_b32 v1, s48, 20 +; GCN-NEXT: v_writelane_b32 v1, s49, 21 +; GCN-NEXT: v_writelane_b32 v1, s50, 22 +; GCN-NEXT: v_writelane_b32 v1, s51, 23 +; GCN-NEXT: v_writelane_b32 v1, s52, 24 +; GCN-NEXT: v_writelane_b32 v1, s53, 25 +; GCN-NEXT: v_writelane_b32 v1, s54, 26 +; GCN-NEXT: v_writelane_b32 v1, s55, 27 +; GCN-NEXT: v_writelane_b32 v1, s56, 28 +; GCN-NEXT: v_writelane_b32 v1, s57, 29 +; GCN-NEXT: v_writelane_b32 v1, s58, 30 +; GCN-NEXT: v_writelane_b32 v1, s59, 31 +; GCN-NEXT: v_writelane_b32 v1, s60, 32 +; GCN-NEXT: v_writelane_b32 v1, s61, 33 +; GCN-NEXT: v_writelane_b32 v1, s62, 34 +; GCN-NEXT: v_writelane_b32 v1, s63, 35 +; GCN-NEXT: v_writelane_b32 v1, s64, 36 +; GCN-NEXT: v_writelane_b32 v1, s65, 37 +; GCN-NEXT: v_writelane_b32 v1, s66, 38 +; GCN-NEXT: v_writelane_b32 v1, s67, 39 +; GCN-NEXT: v_writelane_b32 v1, s68, 40 +; GCN-NEXT: v_writelane_b32 v1, s69, 41 +; GCN-NEXT: v_writelane_b32 v1, s70, 42 +; GCN-NEXT: v_writelane_b32 v1, s71, 43 +; GCN-NEXT: v_writelane_b32 v1, s72, 44 +; GCN-NEXT: v_writelane_b32 v1, s73, 45 +; GCN-NEXT: v_writelane_b32 v1, s74, 46 +; GCN-NEXT: v_writelane_b32 v1, s75, 47 +; GCN-NEXT: v_writelane_b32 v1, s76, 48 +; GCN-NEXT: v_writelane_b32 v1, s77, 49 +; GCN-NEXT: v_writelane_b32 v1, s78, 50 +; GCN-NEXT: v_writelane_b32 v1, s79, 51 +; GCN-NEXT: v_writelane_b32 v1, s80, 52 +; GCN-NEXT: v_writelane_b32 v1, s81, 53 +; GCN-NEXT: v_writelane_b32 v1, s82, 54 +; GCN-NEXT: v_writelane_b32 v1, s83, 55 +; GCN-NEXT: v_writelane_b32 v1, s84, 56 +; GCN-NEXT: v_writelane_b32 v1, s85, 57 +; GCN-NEXT: v_writelane_b32 v1, s86, 58 +; GCN-NEXT: v_writelane_b32 v1, s87, 59 +; GCN-NEXT: v_writelane_b32 v1, s88, 60 +; GCN-NEXT: v_writelane_b32 v1, s89, 61 +; GCN-NEXT: v_writelane_b32 v1, s90, 62 +; GCN-NEXT: v_writelane_b32 v1, s91, 63 +; GCN-NEXT: v_writelane_b32 v2, s4, 0 +; GCN-NEXT: v_writelane_b32 v2, s5, 1 +; GCN-NEXT: v_writelane_b32 v2, s6, 2 +; GCN-NEXT: v_writelane_b32 v2, s7, 3 +; GCN-NEXT: v_writelane_b32 v2, s8, 4 +; GCN-NEXT: v_writelane_b32 v2, s9, 5 +; GCN-NEXT: v_writelane_b32 v2, s10, 6 +; GCN-NEXT: v_writelane_b32 v2, s11, 7 ; GCN-NEXT: s_cbranch_scc1 BB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: v_readlane_b32 s0, v0, 1 -; GCN-NEXT: v_readlane_b32 s1, v0, 2 -; GCN-NEXT: v_readlane_b32 s2, v0, 3 -; GCN-NEXT: v_readlane_b32 s3, v0, 4 -; GCN-NEXT: v_readlane_b32 s4, v0, 5 -; GCN-NEXT: v_readlane_b32 s5, v0, 6 -; GCN-NEXT: v_readlane_b32 s6, v0, 7 -; GCN-NEXT: v_readlane_b32 s7, v0, 8 +; GCN-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-NEXT: v_readlane_b32 s2, v0, 2 +; GCN-NEXT: v_readlane_b32 s3, v0, 3 +; GCN-NEXT: v_readlane_b32 s4, v0, 4 +; GCN-NEXT: v_readlane_b32 s5, v0, 5 +; GCN-NEXT: v_readlane_b32 s6, v0, 6 +; GCN-NEXT: v_readlane_b32 s7, v0, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 57 -; GCN-NEXT: v_readlane_b32 s1, v0, 58 -; GCN-NEXT: v_readlane_b32 s2, v0, 59 -; GCN-NEXT: v_readlane_b32 s3, v0, 60 -; 
GCN-NEXT: v_readlane_b32 s4, v0, 61 -; GCN-NEXT: v_readlane_b32 s5, v0, 62 -; GCN-NEXT: v_readlane_b32 s6, v0, 63 -; GCN-NEXT: v_readlane_b32 s7, v1, 0 +; GCN-NEXT: v_readlane_b32 s0, v0, 56 +; GCN-NEXT: v_readlane_b32 s1, v0, 57 +; GCN-NEXT: v_readlane_b32 s2, v0, 58 +; GCN-NEXT: v_readlane_b32 s3, v0, 59 +; GCN-NEXT: v_readlane_b32 s4, v0, 60 +; GCN-NEXT: v_readlane_b32 s5, v0, 61 +; GCN-NEXT: v_readlane_b32 s6, v0, 62 +; GCN-NEXT: v_readlane_b32 s7, v0, 63 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 1 -; GCN-NEXT: v_readlane_b32 s1, v1, 2 -; GCN-NEXT: v_readlane_b32 s2, v1, 3 -; GCN-NEXT: v_readlane_b32 s3, v1, 4 -; GCN-NEXT: v_readlane_b32 s4, v1, 5 -; GCN-NEXT: v_readlane_b32 s5, v1, 6 -; GCN-NEXT: v_readlane_b32 s6, v1, 7 -; GCN-NEXT: v_readlane_b32 s7, v1, 8 +; GCN-NEXT: v_readlane_b32 s0, v1, 0 +; GCN-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-NEXT: v_readlane_b32 s2, v1, 2 +; GCN-NEXT: v_readlane_b32 s3, v1, 3 +; GCN-NEXT: v_readlane_b32 s4, v1, 4 +; GCN-NEXT: v_readlane_b32 s5, v1, 5 +; GCN-NEXT: v_readlane_b32 s6, v1, 6 +; GCN-NEXT: v_readlane_b32 s7, v1, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 9 -; GCN-NEXT: v_readlane_b32 s1, v1, 10 -; GCN-NEXT: v_readlane_b32 s2, v1, 11 -; GCN-NEXT: v_readlane_b32 s3, v1, 12 -; GCN-NEXT: v_readlane_b32 s4, v1, 13 -; GCN-NEXT: v_readlane_b32 s5, v1, 14 -; GCN-NEXT: v_readlane_b32 s6, v1, 15 -; GCN-NEXT: v_readlane_b32 s7, v1, 16 +; GCN-NEXT: v_readlane_b32 s0, v1, 8 +; GCN-NEXT: v_readlane_b32 s1, v1, 9 +; GCN-NEXT: v_readlane_b32 s2, v1, 10 +; GCN-NEXT: v_readlane_b32 s3, v1, 11 +; GCN-NEXT: v_readlane_b32 s4, v1, 12 +; GCN-NEXT: v_readlane_b32 s5, v1, 13 +; GCN-NEXT: v_readlane_b32 s6, v1, 14 +; GCN-NEXT: v_readlane_b32 s7, v1, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 17 -; GCN-NEXT: v_readlane_b32 s1, v1, 18 -; GCN-NEXT: v_readlane_b32 s2, v1, 19 -; GCN-NEXT: v_readlane_b32 s3, v1, 20 -; GCN-NEXT: v_readlane_b32 s4, v1, 21 -; GCN-NEXT: v_readlane_b32 s5, v1, 22 -; GCN-NEXT: v_readlane_b32 s6, v1, 23 -; GCN-NEXT: v_readlane_b32 s7, v1, 24 +; GCN-NEXT: v_readlane_b32 s0, v1, 16 +; GCN-NEXT: v_readlane_b32 s1, v1, 17 +; GCN-NEXT: v_readlane_b32 s2, v1, 18 +; GCN-NEXT: v_readlane_b32 s3, v1, 19 +; GCN-NEXT: v_readlane_b32 s4, v1, 20 +; GCN-NEXT: v_readlane_b32 s5, v1, 21 +; GCN-NEXT: v_readlane_b32 s6, v1, 22 +; GCN-NEXT: v_readlane_b32 s7, v1, 23 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 25 -; GCN-NEXT: v_readlane_b32 s1, v1, 26 -; GCN-NEXT: v_readlane_b32 s2, v1, 27 -; GCN-NEXT: v_readlane_b32 s3, v1, 28 -; GCN-NEXT: v_readlane_b32 s4, v1, 29 -; GCN-NEXT: v_readlane_b32 s5, v1, 30 -; GCN-NEXT: v_readlane_b32 s6, v1, 31 -; GCN-NEXT: v_readlane_b32 s7, v1, 32 +; GCN-NEXT: v_readlane_b32 s0, v1, 24 +; GCN-NEXT: v_readlane_b32 s1, v1, 25 +; GCN-NEXT: v_readlane_b32 s2, v1, 26 +; GCN-NEXT: v_readlane_b32 s3, v1, 27 +; GCN-NEXT: v_readlane_b32 s4, v1, 28 +; GCN-NEXT: v_readlane_b32 s5, v1, 29 +; GCN-NEXT: v_readlane_b32 s6, v1, 30 +; GCN-NEXT: v_readlane_b32 s7, v1, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 33 -; GCN-NEXT: v_readlane_b32 s1, v1, 34 -; GCN-NEXT: v_readlane_b32 s2, v1, 35 -; GCN-NEXT: v_readlane_b32 s3, v1, 36 -; GCN-NEXT: v_readlane_b32 s4, v1, 37 -; GCN-NEXT: v_readlane_b32 s5, v1, 38 -; GCN-NEXT: 
v_readlane_b32 s6, v1, 39 -; GCN-NEXT: v_readlane_b32 s7, v1, 40 +; GCN-NEXT: v_readlane_b32 s0, v1, 32 +; GCN-NEXT: v_readlane_b32 s1, v1, 33 +; GCN-NEXT: v_readlane_b32 s2, v1, 34 +; GCN-NEXT: v_readlane_b32 s3, v1, 35 +; GCN-NEXT: v_readlane_b32 s4, v1, 36 +; GCN-NEXT: v_readlane_b32 s5, v1, 37 +; GCN-NEXT: v_readlane_b32 s6, v1, 38 +; GCN-NEXT: v_readlane_b32 s7, v1, 39 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 41 -; GCN-NEXT: v_readlane_b32 s1, v1, 42 -; GCN-NEXT: v_readlane_b32 s2, v1, 43 -; GCN-NEXT: v_readlane_b32 s3, v1, 44 -; GCN-NEXT: v_readlane_b32 s4, v1, 45 -; GCN-NEXT: v_readlane_b32 s5, v1, 46 -; GCN-NEXT: v_readlane_b32 s6, v1, 47 -; GCN-NEXT: v_readlane_b32 s7, v1, 48 +; GCN-NEXT: v_readlane_b32 s0, v1, 40 +; GCN-NEXT: v_readlane_b32 s1, v1, 41 +; GCN-NEXT: v_readlane_b32 s2, v1, 42 +; GCN-NEXT: v_readlane_b32 s3, v1, 43 +; GCN-NEXT: v_readlane_b32 s4, v1, 44 +; GCN-NEXT: v_readlane_b32 s5, v1, 45 +; GCN-NEXT: v_readlane_b32 s6, v1, 46 +; GCN-NEXT: v_readlane_b32 s7, v1, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 49 -; GCN-NEXT: v_readlane_b32 s1, v1, 50 -; GCN-NEXT: v_readlane_b32 s2, v1, 51 -; GCN-NEXT: v_readlane_b32 s3, v1, 52 -; GCN-NEXT: v_readlane_b32 s4, v1, 53 -; GCN-NEXT: v_readlane_b32 s5, v1, 54 -; GCN-NEXT: v_readlane_b32 s6, v1, 55 -; GCN-NEXT: v_readlane_b32 s7, v1, 56 +; GCN-NEXT: v_readlane_b32 s0, v1, 48 +; GCN-NEXT: v_readlane_b32 s1, v1, 49 +; GCN-NEXT: v_readlane_b32 s2, v1, 50 +; GCN-NEXT: v_readlane_b32 s3, v1, 51 +; GCN-NEXT: v_readlane_b32 s4, v1, 52 +; GCN-NEXT: v_readlane_b32 s5, v1, 53 +; GCN-NEXT: v_readlane_b32 s6, v1, 54 +; GCN-NEXT: v_readlane_b32 s7, v1, 55 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 57 -; GCN-NEXT: v_readlane_b32 s1, v1, 58 -; GCN-NEXT: v_readlane_b32 s2, v1, 59 -; GCN-NEXT: v_readlane_b32 s3, v1, 60 -; GCN-NEXT: v_readlane_b32 s4, v1, 61 -; GCN-NEXT: v_readlane_b32 s5, v1, 62 -; GCN-NEXT: v_readlane_b32 s6, v1, 63 -; GCN-NEXT: v_readlane_b32 s7, v2, 0 +; GCN-NEXT: v_readlane_b32 s0, v1, 56 +; GCN-NEXT: v_readlane_b32 s1, v1, 57 +; GCN-NEXT: v_readlane_b32 s2, v1, 58 +; GCN-NEXT: v_readlane_b32 s3, v1, 59 +; GCN-NEXT: v_readlane_b32 s4, v1, 60 +; GCN-NEXT: v_readlane_b32 s5, v1, 61 +; GCN-NEXT: v_readlane_b32 s6, v1, 62 +; GCN-NEXT: v_readlane_b32 s7, v1, 63 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 9 -; GCN-NEXT: v_readlane_b32 s1, v0, 10 -; GCN-NEXT: v_readlane_b32 s2, v0, 11 -; GCN-NEXT: v_readlane_b32 s3, v0, 12 -; GCN-NEXT: v_readlane_b32 s4, v0, 13 -; GCN-NEXT: v_readlane_b32 s5, v0, 14 -; GCN-NEXT: v_readlane_b32 s6, v0, 15 -; GCN-NEXT: v_readlane_b32 s7, v0, 16 +; GCN-NEXT: v_readlane_b32 s0, v0, 8 +; GCN-NEXT: v_readlane_b32 s1, v0, 9 +; GCN-NEXT: v_readlane_b32 s2, v0, 10 +; GCN-NEXT: v_readlane_b32 s3, v0, 11 +; GCN-NEXT: v_readlane_b32 s4, v0, 12 +; GCN-NEXT: v_readlane_b32 s5, v0, 13 +; GCN-NEXT: v_readlane_b32 s6, v0, 14 +; GCN-NEXT: v_readlane_b32 s7, v0, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 17 -; GCN-NEXT: v_readlane_b32 s1, v0, 18 -; GCN-NEXT: v_readlane_b32 s2, v0, 19 -; GCN-NEXT: v_readlane_b32 s3, v0, 20 -; GCN-NEXT: v_readlane_b32 s4, v0, 21 -; GCN-NEXT: v_readlane_b32 s5, v0, 22 -; GCN-NEXT: v_readlane_b32 s6, v0, 23 -; GCN-NEXT: v_readlane_b32 s7, v0, 24 +; GCN-NEXT: 
v_readlane_b32 s0, v0, 16 +; GCN-NEXT: v_readlane_b32 s1, v0, 17 +; GCN-NEXT: v_readlane_b32 s2, v0, 18 +; GCN-NEXT: v_readlane_b32 s3, v0, 19 +; GCN-NEXT: v_readlane_b32 s4, v0, 20 +; GCN-NEXT: v_readlane_b32 s5, v0, 21 +; GCN-NEXT: v_readlane_b32 s6, v0, 22 +; GCN-NEXT: v_readlane_b32 s7, v0, 23 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 25 -; GCN-NEXT: v_readlane_b32 s1, v0, 26 -; GCN-NEXT: v_readlane_b32 s2, v0, 27 -; GCN-NEXT: v_readlane_b32 s3, v0, 28 -; GCN-NEXT: v_readlane_b32 s4, v0, 29 -; GCN-NEXT: v_readlane_b32 s5, v0, 30 -; GCN-NEXT: v_readlane_b32 s6, v0, 31 -; GCN-NEXT: v_readlane_b32 s7, v0, 32 +; GCN-NEXT: v_readlane_b32 s0, v0, 24 +; GCN-NEXT: v_readlane_b32 s1, v0, 25 +; GCN-NEXT: v_readlane_b32 s2, v0, 26 +; GCN-NEXT: v_readlane_b32 s3, v0, 27 +; GCN-NEXT: v_readlane_b32 s4, v0, 28 +; GCN-NEXT: v_readlane_b32 s5, v0, 29 +; GCN-NEXT: v_readlane_b32 s6, v0, 30 +; GCN-NEXT: v_readlane_b32 s7, v0, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 33 -; GCN-NEXT: v_readlane_b32 s1, v0, 34 -; GCN-NEXT: v_readlane_b32 s2, v0, 35 -; GCN-NEXT: v_readlane_b32 s3, v0, 36 -; GCN-NEXT: v_readlane_b32 s4, v0, 37 -; GCN-NEXT: v_readlane_b32 s5, v0, 38 -; GCN-NEXT: v_readlane_b32 s6, v0, 39 -; GCN-NEXT: v_readlane_b32 s7, v0, 40 +; GCN-NEXT: v_readlane_b32 s0, v0, 32 +; GCN-NEXT: v_readlane_b32 s1, v0, 33 +; GCN-NEXT: v_readlane_b32 s2, v0, 34 +; GCN-NEXT: v_readlane_b32 s3, v0, 35 +; GCN-NEXT: v_readlane_b32 s4, v0, 36 +; GCN-NEXT: v_readlane_b32 s5, v0, 37 +; GCN-NEXT: v_readlane_b32 s6, v0, 38 +; GCN-NEXT: v_readlane_b32 s7, v0, 39 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 41 -; GCN-NEXT: v_readlane_b32 s1, v0, 42 -; GCN-NEXT: v_readlane_b32 s2, v0, 43 -; GCN-NEXT: v_readlane_b32 s3, v0, 44 -; GCN-NEXT: v_readlane_b32 s4, v0, 45 -; GCN-NEXT: v_readlane_b32 s5, v0, 46 -; GCN-NEXT: v_readlane_b32 s6, v0, 47 -; GCN-NEXT: v_readlane_b32 s7, v0, 48 +; GCN-NEXT: v_readlane_b32 s0, v0, 40 +; GCN-NEXT: v_readlane_b32 s1, v0, 41 +; GCN-NEXT: v_readlane_b32 s2, v0, 42 +; GCN-NEXT: v_readlane_b32 s3, v0, 43 +; GCN-NEXT: v_readlane_b32 s4, v0, 44 +; GCN-NEXT: v_readlane_b32 s5, v0, 45 +; GCN-NEXT: v_readlane_b32 s6, v0, 46 +; GCN-NEXT: v_readlane_b32 s7, v0, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 49 -; GCN-NEXT: v_readlane_b32 s1, v0, 50 -; GCN-NEXT: v_readlane_b32 s2, v0, 51 -; GCN-NEXT: v_readlane_b32 s3, v0, 52 -; GCN-NEXT: v_readlane_b32 s4, v0, 53 -; GCN-NEXT: v_readlane_b32 s5, v0, 54 -; GCN-NEXT: v_readlane_b32 s6, v0, 55 -; GCN-NEXT: v_readlane_b32 s7, v0, 56 +; GCN-NEXT: v_readlane_b32 s0, v0, 48 +; GCN-NEXT: v_readlane_b32 s1, v0, 49 +; GCN-NEXT: v_readlane_b32 s2, v0, 50 +; GCN-NEXT: v_readlane_b32 s3, v0, 51 +; GCN-NEXT: v_readlane_b32 s4, v0, 52 +; GCN-NEXT: v_readlane_b32 s5, v0, 53 +; GCN-NEXT: v_readlane_b32 s6, v0, 54 +; GCN-NEXT: v_readlane_b32 s7, v0, 55 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v2, 1 -; GCN-NEXT: v_readlane_b32 s1, v2, 2 -; GCN-NEXT: v_readlane_b32 s2, v2, 3 -; GCN-NEXT: v_readlane_b32 s3, v2, 4 -; GCN-NEXT: v_readlane_b32 s4, v2, 5 -; GCN-NEXT: v_readlane_b32 s5, v2, 6 -; GCN-NEXT: v_readlane_b32 s6, v2, 7 -; GCN-NEXT: v_readlane_b32 s7, v2, 8 +; GCN-NEXT: v_readlane_b32 s0, v2, 0 +; GCN-NEXT: v_readlane_b32 s1, v2, 1 +; GCN-NEXT: 
v_readlane_b32 s2, v2, 2 +; GCN-NEXT: v_readlane_b32 s3, v2, 3 +; GCN-NEXT: v_readlane_b32 s4, v2, 4 +; GCN-NEXT: v_readlane_b32 s5, v2, 5 +; GCN-NEXT: v_readlane_b32 s6, v2, 6 +; GCN-NEXT: v_readlane_b32 s7, v2, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND @@ -444,195 +442,193 @@ ret: define amdgpu_kernel void @split_sgpr_spill_2_vgprs(i32 addrspace(1)* %out, i32 %in) #1 { ; GCN-LABEL: split_sgpr_spill_2_vgprs: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[36:51] ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 +; GCN-NEXT: v_writelane_b32 v0, s12, 8 +; GCN-NEXT: v_writelane_b32 v0, s13, 9 +; GCN-NEXT: v_writelane_b32 v0, s14, 10 +; GCN-NEXT: v_writelane_b32 v0, s15, 11 +; GCN-NEXT: v_writelane_b32 v0, s16, 12 +; GCN-NEXT: v_writelane_b32 v0, s17, 13 +; GCN-NEXT: v_writelane_b32 v0, s18, 14 +; GCN-NEXT: v_writelane_b32 v0, s19, 15 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v0, s4, 16 +; GCN-NEXT: v_writelane_b32 v0, s5, 17 +; GCN-NEXT: v_writelane_b32 v0, s6, 18 +; GCN-NEXT: v_writelane_b32 v0, s7, 19 +; GCN-NEXT: v_writelane_b32 v0, s8, 20 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v0, s10, 22 +; GCN-NEXT: v_writelane_b32 v0, s11, 23 +; GCN-NEXT: v_writelane_b32 v0, s12, 24 +; GCN-NEXT: v_writelane_b32 v0, s13, 25 +; GCN-NEXT: v_writelane_b32 v0, s14, 26 +; GCN-NEXT: v_writelane_b32 v0, s15, 27 +; GCN-NEXT: v_writelane_b32 v0, s16, 28 +; GCN-NEXT: v_writelane_b32 v0, s17, 29 +; GCN-NEXT: v_writelane_b32 v0, s18, 30 +; GCN-NEXT: v_writelane_b32 v0, s19, 31 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[20:27] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[0:1] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_mov_b32 s3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_writelane_b32 v0, s0, 0 -; GCN-NEXT: v_writelane_b32 v0, s4, 1 -; GCN-NEXT: v_writelane_b32 v0, s5, 2 -; GCN-NEXT: v_writelane_b32 v0, s6, 3 -; GCN-NEXT: v_writelane_b32 v0, s7, 4 -; GCN-NEXT: v_writelane_b32 v0, s8, 5 -; GCN-NEXT: v_writelane_b32 v0, s9, 6 -; GCN-NEXT: v_writelane_b32 v0, s10, 7 -; GCN-NEXT: v_writelane_b32 v0, s11, 8 -; GCN-NEXT: v_writelane_b32 v0, s12, 9 -; GCN-NEXT: v_writelane_b32 v0, s13, 10 -; GCN-NEXT: v_writelane_b32 v0, s14, 11 -; GCN-NEXT: v_writelane_b32 v0, s15, 12 -; GCN-NEXT: v_writelane_b32 v0, s16, 13 -; GCN-NEXT: v_writelane_b32 v0, s17, 14 -; GCN-NEXT: v_writelane_b32 v0, s18, 15 -; GCN-NEXT: v_writelane_b32 v0, s19, 16 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:15] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[16:31] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s0, 17 -; GCN-NEXT: v_writelane_b32 v0, s1, 18 -; GCN-NEXT: v_writelane_b32 v0, s2, 19 -; GCN-NEXT: v_writelane_b32 v0, s3, 20 -; GCN-NEXT: v_writelane_b32 v0, s4, 21 -; GCN-NEXT: v_writelane_b32 v0, s5, 22 -; GCN-NEXT: v_writelane_b32 v0, s6, 23 -; GCN-NEXT: v_writelane_b32 v0, s7, 24 -; GCN-NEXT: v_writelane_b32 v0, s8, 25 -; GCN-NEXT: 
v_writelane_b32 v0, s9, 26 -; GCN-NEXT: v_writelane_b32 v0, s10, 27 -; GCN-NEXT: v_writelane_b32 v0, s11, 28 -; GCN-NEXT: v_writelane_b32 v0, s12, 29 -; GCN-NEXT: v_writelane_b32 v0, s13, 30 -; GCN-NEXT: v_writelane_b32 v0, s14, 31 -; GCN-NEXT: v_writelane_b32 v0, s15, 32 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:7] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[8:9] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_mov_b32 s10, 0 -; GCN-NEXT: v_readlane_b32 s11, v0, 0 -; GCN-NEXT: s_cmp_lg_u32 s11, s10 -; GCN-NEXT: v_writelane_b32 v0, s36, 33 -; GCN-NEXT: v_writelane_b32 v0, s37, 34 -; GCN-NEXT: v_writelane_b32 v0, s38, 35 -; GCN-NEXT: v_writelane_b32 v0, s39, 36 -; GCN-NEXT: v_writelane_b32 v0, s40, 37 -; GCN-NEXT: v_writelane_b32 v0, s41, 38 -; GCN-NEXT: v_writelane_b32 v0, s42, 39 -; GCN-NEXT: v_writelane_b32 v0, s43, 40 -; GCN-NEXT: v_writelane_b32 v0, s44, 41 -; GCN-NEXT: v_writelane_b32 v0, s45, 42 -; GCN-NEXT: v_writelane_b32 v0, s46, 43 -; GCN-NEXT: v_writelane_b32 v0, s47, 44 -; GCN-NEXT: v_writelane_b32 v0, s48, 45 -; GCN-NEXT: v_writelane_b32 v0, s49, 46 -; GCN-NEXT: v_writelane_b32 v0, s50, 47 -; GCN-NEXT: v_writelane_b32 v0, s51, 48 -; GCN-NEXT: v_writelane_b32 v0, s16, 49 -; GCN-NEXT: v_writelane_b32 v0, s17, 50 -; GCN-NEXT: v_writelane_b32 v0, s18, 51 -; GCN-NEXT: v_writelane_b32 v0, s19, 52 -; GCN-NEXT: v_writelane_b32 v0, s20, 53 -; GCN-NEXT: v_writelane_b32 v0, s21, 54 -; GCN-NEXT: v_writelane_b32 v0, s22, 55 -; GCN-NEXT: v_writelane_b32 v0, s23, 56 -; GCN-NEXT: v_writelane_b32 v0, s24, 57 -; GCN-NEXT: v_writelane_b32 v0, s25, 58 -; GCN-NEXT: v_writelane_b32 v0, s26, 59 -; GCN-NEXT: v_writelane_b32 v0, s27, 60 -; GCN-NEXT: v_writelane_b32 v0, s28, 61 -; GCN-NEXT: v_writelane_b32 v0, s29, 62 -; GCN-NEXT: v_writelane_b32 v0, s30, 63 -; GCN-NEXT: v_writelane_b32 v1, s31, 0 -; GCN-NEXT: v_writelane_b32 v1, s0, 1 -; GCN-NEXT: v_writelane_b32 v1, s1, 2 -; GCN-NEXT: v_writelane_b32 v1, s2, 3 -; GCN-NEXT: v_writelane_b32 v1, s3, 4 -; GCN-NEXT: v_writelane_b32 v1, s4, 5 -; GCN-NEXT: v_writelane_b32 v1, s5, 6 -; GCN-NEXT: v_writelane_b32 v1, s6, 7 -; GCN-NEXT: v_writelane_b32 v1, s7, 8 -; GCN-NEXT: v_writelane_b32 v1, s8, 9 -; GCN-NEXT: v_writelane_b32 v1, s9, 10 +; GCN-NEXT: s_cmp_lg_u32 s2, s3 +; GCN-NEXT: v_writelane_b32 v0, s36, 32 +; GCN-NEXT: v_writelane_b32 v0, s37, 33 +; GCN-NEXT: v_writelane_b32 v0, s38, 34 +; GCN-NEXT: v_writelane_b32 v0, s39, 35 +; GCN-NEXT: v_writelane_b32 v0, s40, 36 +; GCN-NEXT: v_writelane_b32 v0, s41, 37 +; GCN-NEXT: v_writelane_b32 v0, s42, 38 +; GCN-NEXT: v_writelane_b32 v0, s43, 39 +; GCN-NEXT: v_writelane_b32 v0, s44, 40 +; GCN-NEXT: v_writelane_b32 v0, s45, 41 +; GCN-NEXT: v_writelane_b32 v0, s46, 42 +; GCN-NEXT: v_writelane_b32 v0, s47, 43 +; GCN-NEXT: v_writelane_b32 v0, s48, 44 +; GCN-NEXT: v_writelane_b32 v0, s49, 45 +; GCN-NEXT: v_writelane_b32 v0, s50, 46 +; GCN-NEXT: v_writelane_b32 v0, s51, 47 +; GCN-NEXT: v_writelane_b32 v0, s4, 48 +; GCN-NEXT: v_writelane_b32 v0, s5, 49 +; GCN-NEXT: v_writelane_b32 v0, s6, 50 +; GCN-NEXT: v_writelane_b32 v0, s7, 51 +; GCN-NEXT: v_writelane_b32 v0, s8, 52 +; GCN-NEXT: v_writelane_b32 v0, s9, 53 +; GCN-NEXT: v_writelane_b32 v0, s10, 54 +; GCN-NEXT: v_writelane_b32 v0, s11, 55 +; GCN-NEXT: v_writelane_b32 v0, s12, 56 +; GCN-NEXT: v_writelane_b32 v0, s13, 57 +; GCN-NEXT: v_writelane_b32 v0, s14, 58 +; GCN-NEXT: v_writelane_b32 v0, s15, 59 +; GCN-NEXT: v_writelane_b32 v0, s16, 60 +; GCN-NEXT: v_writelane_b32 v0, s17, 61 +; GCN-NEXT: v_writelane_b32 v0, s18, 62 +; GCN-NEXT: v_writelane_b32 
v0, s19, 63 +; GCN-NEXT: v_writelane_b32 v1, s20, 0 +; GCN-NEXT: v_writelane_b32 v1, s21, 1 +; GCN-NEXT: v_writelane_b32 v1, s22, 2 +; GCN-NEXT: v_writelane_b32 v1, s23, 3 +; GCN-NEXT: v_writelane_b32 v1, s24, 4 +; GCN-NEXT: v_writelane_b32 v1, s25, 5 +; GCN-NEXT: v_writelane_b32 v1, s26, 6 +; GCN-NEXT: v_writelane_b32 v1, s27, 7 +; GCN-NEXT: v_writelane_b32 v1, s0, 8 +; GCN-NEXT: v_writelane_b32 v1, s1, 9 ; GCN-NEXT: s_cbranch_scc1 BB1_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: v_readlane_b32 s0, v0, 1 -; GCN-NEXT: v_readlane_b32 s1, v0, 2 -; GCN-NEXT: v_readlane_b32 s2, v0, 3 -; GCN-NEXT: v_readlane_b32 s3, v0, 4 -; GCN-NEXT: v_readlane_b32 s4, v0, 5 -; GCN-NEXT: v_readlane_b32 s5, v0, 6 -; GCN-NEXT: v_readlane_b32 s6, v0, 7 -; GCN-NEXT: v_readlane_b32 s7, v0, 8 -; GCN-NEXT: v_readlane_b32 s8, v0, 9 -; GCN-NEXT: v_readlane_b32 s9, v0, 10 -; GCN-NEXT: v_readlane_b32 s10, v0, 11 -; GCN-NEXT: v_readlane_b32 s11, v0, 12 -; GCN-NEXT: v_readlane_b32 s12, v0, 13 -; GCN-NEXT: v_readlane_b32 s13, v0, 14 -; GCN-NEXT: v_readlane_b32 s14, v0, 15 -; GCN-NEXT: v_readlane_b32 s15, v0, 16 +; GCN-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-NEXT: v_readlane_b32 s2, v0, 2 +; GCN-NEXT: v_readlane_b32 s3, v0, 3 +; GCN-NEXT: v_readlane_b32 s4, v0, 4 +; GCN-NEXT: v_readlane_b32 s5, v0, 5 +; GCN-NEXT: v_readlane_b32 s6, v0, 6 +; GCN-NEXT: v_readlane_b32 s7, v0, 7 +; GCN-NEXT: v_readlane_b32 s8, v0, 8 +; GCN-NEXT: v_readlane_b32 s9, v0, 9 +; GCN-NEXT: v_readlane_b32 s10, v0, 10 +; GCN-NEXT: v_readlane_b32 s11, v0, 11 +; GCN-NEXT: v_readlane_b32 s12, v0, 12 +; GCN-NEXT: v_readlane_b32 s13, v0, 13 +; GCN-NEXT: v_readlane_b32 s14, v0, 14 +; GCN-NEXT: v_readlane_b32 s15, v0, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 33 -; GCN-NEXT: v_readlane_b32 s1, v0, 34 -; GCN-NEXT: v_readlane_b32 s2, v0, 35 -; GCN-NEXT: v_readlane_b32 s3, v0, 36 -; GCN-NEXT: v_readlane_b32 s4, v0, 37 -; GCN-NEXT: v_readlane_b32 s5, v0, 38 -; GCN-NEXT: v_readlane_b32 s6, v0, 39 -; GCN-NEXT: v_readlane_b32 s7, v0, 40 -; GCN-NEXT: v_readlane_b32 s8, v0, 41 -; GCN-NEXT: v_readlane_b32 s9, v0, 42 -; GCN-NEXT: v_readlane_b32 s10, v0, 43 -; GCN-NEXT: v_readlane_b32 s11, v0, 44 -; GCN-NEXT: v_readlane_b32 s12, v0, 45 -; GCN-NEXT: v_readlane_b32 s13, v0, 46 -; GCN-NEXT: v_readlane_b32 s14, v0, 47 -; GCN-NEXT: v_readlane_b32 s15, v0, 48 +; GCN-NEXT: v_readlane_b32 s0, v0, 32 +; GCN-NEXT: v_readlane_b32 s1, v0, 33 +; GCN-NEXT: v_readlane_b32 s2, v0, 34 +; GCN-NEXT: v_readlane_b32 s3, v0, 35 +; GCN-NEXT: v_readlane_b32 s4, v0, 36 +; GCN-NEXT: v_readlane_b32 s5, v0, 37 +; GCN-NEXT: v_readlane_b32 s6, v0, 38 +; GCN-NEXT: v_readlane_b32 s7, v0, 39 +; GCN-NEXT: v_readlane_b32 s8, v0, 40 +; GCN-NEXT: v_readlane_b32 s9, v0, 41 +; GCN-NEXT: v_readlane_b32 s10, v0, 42 +; GCN-NEXT: v_readlane_b32 s11, v0, 43 +; GCN-NEXT: v_readlane_b32 s12, v0, 44 +; GCN-NEXT: v_readlane_b32 s13, v0, 45 +; GCN-NEXT: v_readlane_b32 s14, v0, 46 +; GCN-NEXT: v_readlane_b32 s15, v0, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 17 -; GCN-NEXT: v_readlane_b32 s1, v0, 18 -; GCN-NEXT: v_readlane_b32 s2, v0, 19 -; GCN-NEXT: v_readlane_b32 s3, v0, 20 -; GCN-NEXT: v_readlane_b32 s4, v0, 21 -; GCN-NEXT: v_readlane_b32 s5, v0, 22 -; GCN-NEXT: v_readlane_b32 s6, v0, 23 -; GCN-NEXT: v_readlane_b32 s7, v0, 24 -; GCN-NEXT: v_readlane_b32 s8, v0, 25 -; GCN-NEXT: v_readlane_b32 s9, v0, 26 -; GCN-NEXT: v_readlane_b32 s10, v0, 27 -; 
GCN-NEXT: v_readlane_b32 s11, v0, 28 -; GCN-NEXT: v_readlane_b32 s12, v0, 29 -; GCN-NEXT: v_readlane_b32 s13, v0, 30 -; GCN-NEXT: v_readlane_b32 s14, v0, 31 -; GCN-NEXT: v_readlane_b32 s15, v0, 32 +; GCN-NEXT: v_readlane_b32 s0, v0, 16 +; GCN-NEXT: v_readlane_b32 s1, v0, 17 +; GCN-NEXT: v_readlane_b32 s2, v0, 18 +; GCN-NEXT: v_readlane_b32 s3, v0, 19 +; GCN-NEXT: v_readlane_b32 s4, v0, 20 +; GCN-NEXT: v_readlane_b32 s5, v0, 21 +; GCN-NEXT: v_readlane_b32 s6, v0, 22 +; GCN-NEXT: v_readlane_b32 s7, v0, 23 +; GCN-NEXT: v_readlane_b32 s8, v0, 24 +; GCN-NEXT: v_readlane_b32 s9, v0, 25 +; GCN-NEXT: v_readlane_b32 s10, v0, 26 +; GCN-NEXT: v_readlane_b32 s11, v0, 27 +; GCN-NEXT: v_readlane_b32 s12, v0, 28 +; GCN-NEXT: v_readlane_b32 s13, v0, 29 +; GCN-NEXT: v_readlane_b32 s14, v0, 30 +; GCN-NEXT: v_readlane_b32 s15, v0, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 1 -; GCN-NEXT: v_readlane_b32 s1, v1, 2 -; GCN-NEXT: v_readlane_b32 s2, v1, 3 -; GCN-NEXT: v_readlane_b32 s3, v1, 4 -; GCN-NEXT: v_readlane_b32 s4, v1, 5 -; GCN-NEXT: v_readlane_b32 s5, v1, 6 -; GCN-NEXT: v_readlane_b32 s6, v1, 7 -; GCN-NEXT: v_readlane_b32 s7, v1, 8 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; use s[0:7] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 9 -; GCN-NEXT: v_readlane_b32 s1, v1, 10 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; use s[0:1] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 49 -; GCN-NEXT: v_readlane_b32 s1, v0, 50 -; GCN-NEXT: v_readlane_b32 s2, v0, 51 -; GCN-NEXT: v_readlane_b32 s3, v0, 52 -; GCN-NEXT: v_readlane_b32 s4, v0, 53 -; GCN-NEXT: v_readlane_b32 s5, v0, 54 -; GCN-NEXT: v_readlane_b32 s6, v0, 55 -; GCN-NEXT: v_readlane_b32 s7, v0, 56 -; GCN-NEXT: v_readlane_b32 s8, v0, 57 -; GCN-NEXT: v_readlane_b32 s9, v0, 58 -; GCN-NEXT: v_readlane_b32 s10, v0, 59 -; GCN-NEXT: v_readlane_b32 s11, v0, 60 -; GCN-NEXT: v_readlane_b32 s12, v0, 61 -; GCN-NEXT: v_readlane_b32 s13, v0, 62 -; GCN-NEXT: v_readlane_b32 s14, v0, 63 -; GCN-NEXT: v_readlane_b32 s15, v1, 0 +; GCN-NEXT: v_readlane_b32 s16, v1, 0 +; GCN-NEXT: v_readlane_b32 s17, v1, 1 +; GCN-NEXT: v_readlane_b32 s18, v1, 2 +; GCN-NEXT: v_readlane_b32 s19, v1, 3 +; GCN-NEXT: v_readlane_b32 s20, v1, 4 +; GCN-NEXT: v_readlane_b32 s21, v1, 5 +; GCN-NEXT: v_readlane_b32 s22, v1, 6 +; GCN-NEXT: v_readlane_b32 s23, v1, 7 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[16:23] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_readlane_b32 s24, v1, 8 +; GCN-NEXT: v_readlane_b32 s25, v1, 9 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[24:25] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_readlane_b32 s0, v0, 48 +; GCN-NEXT: v_readlane_b32 s1, v0, 49 +; GCN-NEXT: v_readlane_b32 s2, v0, 50 +; GCN-NEXT: v_readlane_b32 s3, v0, 51 +; GCN-NEXT: v_readlane_b32 s4, v0, 52 +; GCN-NEXT: v_readlane_b32 s5, v0, 53 +; GCN-NEXT: v_readlane_b32 s6, v0, 54 +; GCN-NEXT: v_readlane_b32 s7, v0, 55 +; GCN-NEXT: v_readlane_b32 s8, v0, 56 +; GCN-NEXT: v_readlane_b32 s9, v0, 57 +; GCN-NEXT: v_readlane_b32 s10, v0, 58 +; GCN-NEXT: v_readlane_b32 s11, v0, 59 +; GCN-NEXT: v_readlane_b32 s12, v0, 60 +; GCN-NEXT: v_readlane_b32 s13, v0, 61 +; GCN-NEXT: v_readlane_b32 s14, v0, 62 +; GCN-NEXT: v_readlane_b32 s15, v0, 63 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND @@ -667,13 +663,13 @@ ret: define amdgpu_kernel void @no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 { ; GCN-LABEL: no_vgprs_last_sgpr_spill: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s56, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s57, 
SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s58, -1 -; GCN-NEXT: s_mov_b32 s59, 0xe8f000 -; GCN-NEXT: s_add_u32 s56, s56, s3 -; GCN-NEXT: s_addc_u32 s57, s57, 0 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s22, -1 +; GCN-NEXT: s_mov_b32 s23, 0xe8f000 +; GCN-NEXT: s_add_u32 s20, s20, s3 +; GCN-NEXT: s_addc_u32 s21, s21, 0 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART @@ -692,179 +688,177 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[36:51] ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v31, s4, 0 +; GCN-NEXT: v_writelane_b32 v31, s5, 1 +; GCN-NEXT: v_writelane_b32 v31, s6, 2 +; GCN-NEXT: v_writelane_b32 v31, s7, 3 +; GCN-NEXT: v_writelane_b32 v31, s8, 4 +; GCN-NEXT: v_writelane_b32 v31, s9, 5 +; GCN-NEXT: v_writelane_b32 v31, s10, 6 +; GCN-NEXT: v_writelane_b32 v31, s11, 7 +; GCN-NEXT: v_writelane_b32 v31, s12, 8 +; GCN-NEXT: v_writelane_b32 v31, s13, 9 +; GCN-NEXT: v_writelane_b32 v31, s14, 10 +; GCN-NEXT: v_writelane_b32 v31, s15, 11 +; GCN-NEXT: v_writelane_b32 v31, s16, 12 +; GCN-NEXT: v_writelane_b32 v31, s17, 13 +; GCN-NEXT: v_writelane_b32 v31, s18, 14 +; GCN-NEXT: v_writelane_b32 v31, s19, 15 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v31, s4, 16 +; GCN-NEXT: v_writelane_b32 v31, s5, 17 +; GCN-NEXT: v_writelane_b32 v31, s6, 18 +; GCN-NEXT: v_writelane_b32 v31, s7, 19 +; GCN-NEXT: v_writelane_b32 v31, s8, 20 +; GCN-NEXT: v_writelane_b32 v31, s9, 21 +; GCN-NEXT: v_writelane_b32 v31, s10, 22 +; GCN-NEXT: v_writelane_b32 v31, s11, 23 +; GCN-NEXT: v_writelane_b32 v31, s12, 24 +; GCN-NEXT: v_writelane_b32 v31, s13, 25 +; GCN-NEXT: v_writelane_b32 v31, s14, 26 +; GCN-NEXT: v_writelane_b32 v31, s15, 27 +; GCN-NEXT: v_writelane_b32 v31, s16, 28 +; GCN-NEXT: v_writelane_b32 v31, s17, 29 +; GCN-NEXT: v_writelane_b32 v31, s18, 30 +; GCN-NEXT: v_writelane_b32 v31, s19, 31 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[0:1] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_mov_b32 s3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_writelane_b32 v31, s0, 0 -; GCN-NEXT: v_writelane_b32 v31, s4, 1 -; GCN-NEXT: v_writelane_b32 v31, s5, 2 -; GCN-NEXT: v_writelane_b32 v31, s6, 3 -; GCN-NEXT: v_writelane_b32 v31, s7, 4 -; GCN-NEXT: v_writelane_b32 v31, s8, 5 -; GCN-NEXT: v_writelane_b32 v31, s9, 6 -; GCN-NEXT: v_writelane_b32 v31, s10, 7 -; GCN-NEXT: v_writelane_b32 v31, s11, 8 -; GCN-NEXT: v_writelane_b32 v31, s12, 9 -; GCN-NEXT: v_writelane_b32 v31, s13, 10 -; GCN-NEXT: v_writelane_b32 v31, s14, 11 -; GCN-NEXT: v_writelane_b32 v31, s15, 12 -; GCN-NEXT: v_writelane_b32 v31, s16, 13 -; GCN-NEXT: v_writelane_b32 v31, s17, 14 -; GCN-NEXT: v_writelane_b32 v31, s18, 15 -; GCN-NEXT: v_writelane_b32 v31, s19, 16 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:15] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[16:31] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[34:35] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_mov_b32 s33, 0 -; GCN-NEXT: v_readlane_b32 s52, v31, 0 -; GCN-NEXT: s_cmp_lg_u32 s52, s33 -; GCN-NEXT: v_writelane_b32 v31, s36, 17 -; GCN-NEXT: v_writelane_b32 v31, s37, 18 -; GCN-NEXT: v_writelane_b32 v31, s38, 19 -; GCN-NEXT: v_writelane_b32 v31, s39, 20 -; GCN-NEXT: 
v_writelane_b32 v31, s40, 21 -; GCN-NEXT: v_writelane_b32 v31, s41, 22 -; GCN-NEXT: v_writelane_b32 v31, s42, 23 -; GCN-NEXT: v_writelane_b32 v31, s43, 24 -; GCN-NEXT: v_writelane_b32 v31, s44, 25 -; GCN-NEXT: v_writelane_b32 v31, s45, 26 -; GCN-NEXT: v_writelane_b32 v31, s46, 27 -; GCN-NEXT: v_writelane_b32 v31, s47, 28 -; GCN-NEXT: v_writelane_b32 v31, s48, 29 -; GCN-NEXT: v_writelane_b32 v31, s49, 30 -; GCN-NEXT: v_writelane_b32 v31, s50, 31 -; GCN-NEXT: v_writelane_b32 v31, s51, 32 -; GCN-NEXT: v_writelane_b32 v31, s0, 33 -; GCN-NEXT: v_writelane_b32 v31, s1, 34 -; GCN-NEXT: v_writelane_b32 v31, s2, 35 -; GCN-NEXT: v_writelane_b32 v31, s3, 36 -; GCN-NEXT: v_writelane_b32 v31, s4, 37 -; GCN-NEXT: v_writelane_b32 v31, s5, 38 -; GCN-NEXT: v_writelane_b32 v31, s6, 39 -; GCN-NEXT: v_writelane_b32 v31, s7, 40 -; GCN-NEXT: v_writelane_b32 v31, s8, 41 -; GCN-NEXT: v_writelane_b32 v31, s9, 42 -; GCN-NEXT: v_writelane_b32 v31, s10, 43 -; GCN-NEXT: v_writelane_b32 v31, s11, 44 -; GCN-NEXT: v_writelane_b32 v31, s12, 45 -; GCN-NEXT: v_writelane_b32 v31, s13, 46 -; GCN-NEXT: v_writelane_b32 v31, s14, 47 -; GCN-NEXT: v_writelane_b32 v31, s15, 48 -; GCN-NEXT: buffer_store_dword v0, off, s[56:59], 0 -; GCN-NEXT: v_writelane_b32 v0, s16, 0 -; GCN-NEXT: v_writelane_b32 v0, s17, 1 -; GCN-NEXT: v_writelane_b32 v0, s18, 2 -; GCN-NEXT: v_writelane_b32 v0, s19, 3 -; GCN-NEXT: v_writelane_b32 v0, s20, 4 -; GCN-NEXT: v_writelane_b32 v0, s21, 5 -; GCN-NEXT: v_writelane_b32 v0, s22, 6 -; GCN-NEXT: v_writelane_b32 v0, s23, 7 -; GCN-NEXT: v_writelane_b32 v0, s24, 8 -; GCN-NEXT: v_writelane_b32 v0, s25, 9 -; GCN-NEXT: v_writelane_b32 v0, s26, 10 -; GCN-NEXT: v_writelane_b32 v0, s27, 11 -; GCN-NEXT: v_writelane_b32 v0, s28, 12 -; GCN-NEXT: v_writelane_b32 v0, s29, 13 -; GCN-NEXT: v_writelane_b32 v0, s30, 14 -; GCN-NEXT: v_writelane_b32 v0, s31, 15 -; GCN-NEXT: s_mov_b64 s[16:17], exec -; GCN-NEXT: s_mov_b64 exec, 0xffff -; GCN-NEXT: buffer_store_dword v0, off, s[56:59], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v31, s34, 49 -; GCN-NEXT: v_writelane_b32 v31, s35, 50 -; GCN-NEXT: buffer_load_dword v0, off, s[56:59], 0 +; GCN-NEXT: s_cmp_lg_u32 s2, s3 +; GCN-NEXT: v_writelane_b32 v31, s36, 32 +; GCN-NEXT: v_writelane_b32 v31, s37, 33 +; GCN-NEXT: v_writelane_b32 v31, s38, 34 +; GCN-NEXT: v_writelane_b32 v31, s39, 35 +; GCN-NEXT: v_writelane_b32 v31, s40, 36 +; GCN-NEXT: v_writelane_b32 v31, s41, 37 +; GCN-NEXT: v_writelane_b32 v31, s42, 38 +; GCN-NEXT: v_writelane_b32 v31, s43, 39 +; GCN-NEXT: v_writelane_b32 v31, s44, 40 +; GCN-NEXT: v_writelane_b32 v31, s45, 41 +; GCN-NEXT: v_writelane_b32 v31, s46, 42 +; GCN-NEXT: v_writelane_b32 v31, s47, 43 +; GCN-NEXT: v_writelane_b32 v31, s48, 44 +; GCN-NEXT: v_writelane_b32 v31, s49, 45 +; GCN-NEXT: v_writelane_b32 v31, s50, 46 +; GCN-NEXT: v_writelane_b32 v31, s51, 47 +; GCN-NEXT: v_writelane_b32 v31, s4, 48 +; GCN-NEXT: v_writelane_b32 v31, s5, 49 +; GCN-NEXT: v_writelane_b32 v31, s6, 50 +; GCN-NEXT: v_writelane_b32 v31, s7, 51 +; GCN-NEXT: v_writelane_b32 v31, s8, 52 +; GCN-NEXT: v_writelane_b32 v31, s9, 53 +; GCN-NEXT: v_writelane_b32 v31, s10, 54 +; GCN-NEXT: v_writelane_b32 v31, s11, 55 +; GCN-NEXT: v_writelane_b32 v31, s12, 56 +; GCN-NEXT: v_writelane_b32 v31, s13, 57 +; GCN-NEXT: v_writelane_b32 v31, s14, 58 +; GCN-NEXT: v_writelane_b32 v31, s15, 59 +; GCN-NEXT: v_writelane_b32 v31, s16, 60 +; GCN-NEXT: v_writelane_b32 v31, s17, 61 +; GCN-NEXT: v_writelane_b32 v31, s18, 62 +; GCN-NEXT: v_writelane_b32 v31, s19, 63 
+; GCN-NEXT: buffer_store_dword v0, off, s[20:23], 0 +; GCN-NEXT: v_writelane_b32 v0, s0, 0 +; GCN-NEXT: v_writelane_b32 v0, s1, 1 +; GCN-NEXT: s_mov_b64 s[0:1], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; GCN-NEXT: s_cbranch_scc1 BB2_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: v_readlane_b32 s0, v31, 1 -; GCN-NEXT: v_readlane_b32 s1, v31, 2 -; GCN-NEXT: v_readlane_b32 s2, v31, 3 -; GCN-NEXT: v_readlane_b32 s3, v31, 4 -; GCN-NEXT: v_readlane_b32 s4, v31, 5 -; GCN-NEXT: v_readlane_b32 s5, v31, 6 -; GCN-NEXT: v_readlane_b32 s6, v31, 7 -; GCN-NEXT: v_readlane_b32 s7, v31, 8 -; GCN-NEXT: v_readlane_b32 s8, v31, 9 -; GCN-NEXT: v_readlane_b32 s9, v31, 10 -; GCN-NEXT: v_readlane_b32 s10, v31, 11 -; GCN-NEXT: v_readlane_b32 s11, v31, 12 -; GCN-NEXT: v_readlane_b32 s12, v31, 13 -; GCN-NEXT: v_readlane_b32 s13, v31, 14 -; GCN-NEXT: v_readlane_b32 s14, v31, 15 -; GCN-NEXT: v_readlane_b32 s15, v31, 16 +; GCN-NEXT: v_readlane_b32 s0, v31, 0 +; GCN-NEXT: v_readlane_b32 s1, v31, 1 +; GCN-NEXT: v_readlane_b32 s2, v31, 2 +; GCN-NEXT: v_readlane_b32 s3, v31, 3 +; GCN-NEXT: v_readlane_b32 s4, v31, 4 +; GCN-NEXT: v_readlane_b32 s5, v31, 5 +; GCN-NEXT: v_readlane_b32 s6, v31, 6 +; GCN-NEXT: v_readlane_b32 s7, v31, 7 +; GCN-NEXT: v_readlane_b32 s8, v31, 8 +; GCN-NEXT: v_readlane_b32 s9, v31, 9 +; GCN-NEXT: v_readlane_b32 s10, v31, 10 +; GCN-NEXT: v_readlane_b32 s11, v31, 11 +; GCN-NEXT: v_readlane_b32 s12, v31, 12 +; GCN-NEXT: v_readlane_b32 s13, v31, 13 +; GCN-NEXT: v_readlane_b32 s14, v31, 14 +; GCN-NEXT: v_readlane_b32 s15, v31, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v31, 17 -; GCN-NEXT: v_readlane_b32 s1, v31, 18 -; GCN-NEXT: v_readlane_b32 s2, v31, 19 -; GCN-NEXT: v_readlane_b32 s3, v31, 20 -; GCN-NEXT: v_readlane_b32 s4, v31, 21 -; GCN-NEXT: v_readlane_b32 s5, v31, 22 -; GCN-NEXT: v_readlane_b32 s6, v31, 23 -; GCN-NEXT: v_readlane_b32 s7, v31, 24 -; GCN-NEXT: v_readlane_b32 s8, v31, 25 -; GCN-NEXT: v_readlane_b32 s9, v31, 26 -; GCN-NEXT: v_readlane_b32 s10, v31, 27 -; GCN-NEXT: v_readlane_b32 s11, v31, 28 -; GCN-NEXT: v_readlane_b32 s12, v31, 29 -; GCN-NEXT: v_readlane_b32 s13, v31, 30 -; GCN-NEXT: v_readlane_b32 s14, v31, 31 -; GCN-NEXT: v_readlane_b32 s15, v31, 32 +; GCN-NEXT: v_readlane_b32 s0, v31, 32 +; GCN-NEXT: v_readlane_b32 s1, v31, 33 +; GCN-NEXT: v_readlane_b32 s2, v31, 34 +; GCN-NEXT: v_readlane_b32 s3, v31, 35 +; GCN-NEXT: v_readlane_b32 s4, v31, 36 +; GCN-NEXT: v_readlane_b32 s5, v31, 37 +; GCN-NEXT: v_readlane_b32 s6, v31, 38 +; GCN-NEXT: v_readlane_b32 s7, v31, 39 +; GCN-NEXT: v_readlane_b32 s8, v31, 40 +; GCN-NEXT: v_readlane_b32 s9, v31, 41 +; GCN-NEXT: v_readlane_b32 s10, v31, 42 +; GCN-NEXT: v_readlane_b32 s11, v31, 43 +; GCN-NEXT: v_readlane_b32 s12, v31, 44 +; GCN-NEXT: v_readlane_b32 s13, v31, 45 +; GCN-NEXT: v_readlane_b32 s14, v31, 46 +; GCN-NEXT: v_readlane_b32 s15, v31, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v31, 33 -; GCN-NEXT: v_readlane_b32 s1, v31, 34 -; GCN-NEXT: v_readlane_b32 s2, v31, 35 -; GCN-NEXT: v_readlane_b32 s3, v31, 36 -; GCN-NEXT: v_readlane_b32 s4, v31, 37 -; GCN-NEXT: v_readlane_b32 s5, v31, 38 -; GCN-NEXT: v_readlane_b32 s6, v31, 39 -; GCN-NEXT: v_readlane_b32 s7, v31, 40 -; GCN-NEXT: v_readlane_b32 s8, v31, 41 -; GCN-NEXT: v_readlane_b32 s9, v31, 42 -; GCN-NEXT: 
v_readlane_b32 s10, v31, 43 -; GCN-NEXT: v_readlane_b32 s11, v31, 44 -; GCN-NEXT: v_readlane_b32 s12, v31, 45 -; GCN-NEXT: v_readlane_b32 s13, v31, 46 -; GCN-NEXT: v_readlane_b32 s14, v31, 47 -; GCN-NEXT: v_readlane_b32 s15, v31, 48 +; GCN-NEXT: v_readlane_b32 s0, v31, 16 +; GCN-NEXT: v_readlane_b32 s1, v31, 17 +; GCN-NEXT: v_readlane_b32 s2, v31, 18 +; GCN-NEXT: v_readlane_b32 s3, v31, 19 +; GCN-NEXT: v_readlane_b32 s4, v31, 20 +; GCN-NEXT: v_readlane_b32 s5, v31, 21 +; GCN-NEXT: v_readlane_b32 s6, v31, 22 +; GCN-NEXT: v_readlane_b32 s7, v31, 23 +; GCN-NEXT: v_readlane_b32 s8, v31, 24 +; GCN-NEXT: v_readlane_b32 s9, v31, 25 +; GCN-NEXT: v_readlane_b32 s10, v31, 26 +; GCN-NEXT: v_readlane_b32 s11, v31, 27 +; GCN-NEXT: v_readlane_b32 s12, v31, 28 +; GCN-NEXT: v_readlane_b32 s13, v31, 29 +; GCN-NEXT: v_readlane_b32 s14, v31, 30 +; GCN-NEXT: v_readlane_b32 s15, v31, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_mov_b64 s[0:1], exec -; GCN-NEXT: s_mov_b64 exec, 0xffff -; GCN-NEXT: buffer_load_dword v0, off, s[56:59], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[0:1] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-NEXT: v_readlane_b32 s2, v0, 2 -; GCN-NEXT: v_readlane_b32 s3, v0, 3 -; GCN-NEXT: v_readlane_b32 s4, v0, 4 -; GCN-NEXT: v_readlane_b32 s5, v0, 5 -; GCN-NEXT: v_readlane_b32 s6, v0, 6 -; GCN-NEXT: v_readlane_b32 s7, v0, 7 -; GCN-NEXT: v_readlane_b32 s8, v0, 8 -; GCN-NEXT: v_readlane_b32 s9, v0, 9 -; GCN-NEXT: v_readlane_b32 s10, v0, 10 -; GCN-NEXT: v_readlane_b32 s11, v0, 11 -; GCN-NEXT: v_readlane_b32 s12, v0, 12 -; GCN-NEXT: v_readlane_b32 s13, v0, 13 -; GCN-NEXT: v_readlane_b32 s14, v0, 14 -; GCN-NEXT: v_readlane_b32 s15, v0, 15 +; GCN-NEXT: v_readlane_b32 s0, v31, 48 +; GCN-NEXT: v_readlane_b32 s1, v31, 49 +; GCN-NEXT: v_readlane_b32 s2, v31, 50 +; GCN-NEXT: v_readlane_b32 s3, v31, 51 +; GCN-NEXT: v_readlane_b32 s4, v31, 52 +; GCN-NEXT: v_readlane_b32 s5, v31, 53 +; GCN-NEXT: v_readlane_b32 s6, v31, 54 +; GCN-NEXT: v_readlane_b32 s7, v31, 55 +; GCN-NEXT: v_readlane_b32 s8, v31, 56 +; GCN-NEXT: v_readlane_b32 s9, v31, 57 +; GCN-NEXT: v_readlane_b32 s10, v31, 58 +; GCN-NEXT: v_readlane_b32 s11, v31, 59 +; GCN-NEXT: v_readlane_b32 s12, v31, 60 +; GCN-NEXT: v_readlane_b32 s13, v31, 61 +; GCN-NEXT: v_readlane_b32 s14, v31, 62 +; GCN-NEXT: v_readlane_b32 s15, v31, 63 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v31, 49 -; GCN-NEXT: v_readlane_b32 s1, v31, 50 +; GCN-NEXT: s_mov_b64 s[16:17], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s16, v0, 0 +; GCN-NEXT: v_readlane_b32 s17, v0, 1 ; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; use s[0:1] +; GCN-NEXT: ; use s[16:17] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: BB2_2: ; %ret ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/private-element-size.ll b/llvm/test/CodeGen/AMDGPU/private-element-size.ll index 843f554b05134..94bebe7a31fcb 100644 --- a/llvm/test/CodeGen/AMDGPU/private-element-size.ll +++ b/llvm/test/CodeGen/AMDGPU/private-element-size.ll @@ -141,8 +141,8 @@ entry: ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 
0 offen{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} define amdgpu_kernel void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -177,8 +177,8 @@ entry: ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} define amdgpu_kernel void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -224,10 +224,10 @@ entry: ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:40{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:44{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:8{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:12{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:8{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} define amdgpu_kernel void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index c4bc8cdaabf5b..12c46d3605289 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -55,3 +55,28 @@ entry: store <4 x i32> %3, <4 x i32> addrspace(1)* %in ret void } + +; GCN-LABEL: @test_rotl_i16 +; GCN: global_load_ushort [[X:v[0-9]+]] +; GCN: global_load_ushort [[D:v[0-9]+]] +; GCN: v_sub_nc_u16_e64 [[NX:v[0-9]+]], 0, [[X]] +; GCN: v_and_b32_e32 [[XAND:v[0-9]+]], 15, [[X]] +; GCN: v_and_b32_e32 [[NXAND:v[0-9]+]], 15, [[NX]] +; GCN: v_lshlrev_b16_e64 [[LO:v[0-9]+]], [[XAND]], [[D]] +; GCN: v_lshrrev_b16_e64 [[HI:v[0-9]+]], [[NXAND]], [[D]] +; GCN: v_or_b32_e32 [[RES:v[0-9]+]], [[LO]], [[HI]] +; GCN: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RES]] + +declare i16 @llvm.fshl.i16(i16, i16, i16) + +define void @test_rotl_i16(i16 addrspace(1)* nocapture readonly %sourceA, i16 addrspace(1)* nocapture readonly %sourceB, i16 addrspace(1)* nocapture %destValues) { +entry: + %arrayidx = getelementptr inbounds i16, i16 addrspace(1)* %sourceA, i64 16 + %a = load i16, i16 addrspace(1)* %arrayidx + %arrayidx2 = getelementptr inbounds i16, i16 addrspace(1)* %sourceB, i64 24 + %b = load i16, i16 addrspace(1)* %arrayidx2 + %c = tail call i16 @llvm.fshl.i16(i16 %a, i16 %a, i16 %b) + %arrayidx5 = getelementptr inbounds i16, i16 addrspace(1)* %destValues, i64 4 + store i16 %c, i16 addrspace(1)* %arrayidx5 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index b4e2c2b67ce14..84f277bcc0870 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -51,3 +51,28 @@ entry: store <4 x 
i32> %tmp3, <4 x i32> addrspace(1)* %in ret void } + +; GCN-LABEL: @test_rotr_i16 +; GCN: global_load_ushort [[X:v[0-9]+]] +; GCN: global_load_ushort [[D:v[0-9]+]] +; GCN: v_sub_nc_u16_e64 [[NX:v[0-9]+]], 0, [[X]] +; GCN: v_and_b32_e32 [[XAND:v[0-9]+]], 15, [[X]] +; GCN: v_and_b32_e32 [[NXAND:v[0-9]+]], 15, [[NX]] +; GCN: v_lshrrev_b16_e64 [[LO:v[0-9]+]], [[XAND]], [[D]] +; GCN: v_lshlrev_b16_e64 [[HI:v[0-9]+]], [[NXAND]], [[D]] +; GCN: v_or_b32_e32 [[RES:v[0-9]+]], [[LO]], [[HI]] +; GCN: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RES]] + +declare i16 @llvm.fshr.i16(i16, i16, i16) + +define void @test_rotr_i16(i16 addrspace(1)* nocapture readonly %sourceA, i16 addrspace(1)* nocapture readonly %sourceB, i16 addrspace(1)* nocapture %destValues) { +entry: + %arrayidx = getelementptr inbounds i16, i16 addrspace(1)* %sourceA, i64 16 + %a = load i16, i16 addrspace(1)* %arrayidx + %arrayidx2 = getelementptr inbounds i16, i16 addrspace(1)* %sourceB, i64 24 + %b = load i16, i16 addrspace(1)* %arrayidx2 + %c = tail call i16 @llvm.fshr.i16(i16 %a, i16 %a, i16 %b) + %arrayidx5 = getelementptr inbounds i16, i16 addrspace(1)* %destValues, i64 4 + store i16 %c, i16 addrspace(1)* %arrayidx5 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index 3a4a2d07772c1..464b413e65588 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -earlycse-debug-hash -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}v_sad_u32_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll b/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll new file mode 100644 index 0000000000000..6beddf8fe947a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll @@ -0,0 +1,44 @@ +; RUN: llc -march=amdgcn -mcpu=gfx908 -debug-only=machine-scheduler -verify-machineinstrs < %s 2>&1 | FileCheck -enable-var-scope %s +; REQUIRES: asserts + +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x float>, i32, i32, i32) + +; CHECK: CritRes: {{[0-9]+}} HWXDL +; CHECK: Picking: Cand SU([[nid:[0-9]+]]) RES-DEMAND +; CHECK: Scheduling SU([[nid]]) {{.*}} V_MFMA_F32_32X32X4F16 +define amdgpu_kernel void @schedule-xdl-resource(<32 x float> addrspace(1)* %in, <32 x float> addrspace(1)* %out, <4 x half> addrspace(3)* %lds, i32 %stride) #0 { + %in_ptr.1 = getelementptr <32 x float>, <32 x float> addrspace(1)* %in, i32 %stride + %in_ptr.2 = getelementptr <32 x float>, <32 x float> addrspace(1)* %in_ptr.1, i32 %stride + %in_ptr.3 = getelementptr <32 x float>, <32 x float> addrspace(1)* %in_ptr.2, i32 %stride + %in.load.1 = load <32 x float>, <32 x float> addrspace (1)* %in_ptr.1 + %in.load.2 = load <32 x float>, <32 x float> addrspace (1)* %in_ptr.2 + %in.load.3 = load <32 x float>, <32 x float> addrspace (1)* %in_ptr.3 + %lds_ptr.1 = getelementptr <4 x half>, <4 x half> addrspace(3)* %lds, i32 %stride + %lds_ptr.2 = getelementptr <4 x half>, <4 x half> addrspace(3)* %lds_ptr.1, i32 %stride + %lds_ptr.3 = getelementptr <4 x half>, <4 x half> addrspace(3)* %lds_ptr.2, i32 %stride + %lds.load.1 = load <4 x half>, <4 x half> addrspace(3)* %lds_ptr.1 + %lds.load.2 = load <4 x half>, <4 x half> addrspace(3)* %lds_ptr.2 + %lds.load.3 = load <4 x half>, <4 x half> 
addrspace(3)* %lds_ptr.3 + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %lds.load.1, <4 x half> %lds.load.1, <32 x float> %in.load.1, i32 1, i32 1, i32 1) + %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %lds.load.2, <4 x half> %lds.load.2, <32 x float> %in.load.2, i32 1, i32 1, i32 1) + %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %lds.load.3, <4 x half> %lds.load.3, <32 x float> %in.load.3, i32 1, i32 1, i32 1) + %mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %lds.load.1, <4 x half> %lds.load.1, <32 x float> %in.load.1, i32 2, i32 2, i32 2) + %mai.5 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %lds.load.2, <4 x half> %lds.load.2, <32 x float> %in.load.2, i32 2, i32 2, i32 2) + %mai.6 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %lds.load.3, <4 x half> %lds.load.3, <32 x float> %in.load.3, i32 2, i32 2, i32 2) + %out_ptr.1 = getelementptr <32 x float>, <32 x float> addrspace(1)* %out, i32 %stride + %out_ptr.2 = getelementptr <32 x float>, <32 x float> addrspace(1)* %out_ptr.1, i32 %stride + %out_ptr.3 = getelementptr <32 x float>, <32 x float> addrspace(1)* %out_ptr.2, i32 %stride + %out_ptr.4 = getelementptr <32 x float>, <32 x float> addrspace(1)* %out_ptr.3, i32 %stride + %out_ptr.5 = getelementptr <32 x float>, <32 x float> addrspace(1)* %out_ptr.4, i32 %stride + %out_ptr.6 = getelementptr <32 x float>, <32 x float> addrspace(1)* %out_ptr.5, i32 %stride + store <32 x float> %mai.1, <32 x float> addrspace(1)* %out_ptr.1 + store <32 x float> %mai.2, <32 x float> addrspace(1)* %out_ptr.2 + store <32 x float> %mai.3, <32 x float> addrspace(1)* %out_ptr.3 + store <32 x float> %mai.4, <32 x float> addrspace(1)* %out_ptr.4 + store <32 x float> %mai.5, <32 x float> addrspace(1)* %out_ptr.5 + store <32 x float> %mai.6, <32 x float> addrspace(1)* %out_ptr.6 + + ret void +} + +attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" } diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy-cse.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy-cse.ll new file mode 100644 index 0000000000000..f032f170e3b4c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy-cse.ll @@ -0,0 +1,26 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -o - %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7" +target triple = "amdgcn-amd-amdhsa" + +; CHECK-LABEL: {{^}}t0: +; CHECK: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], s[4:5], 0x0 +; CHECK-COUNT-1: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]] +; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]] +define protected amdgpu_kernel void @t0(float addrspace(1)* %p, i32 %i0, i32 %j0, i32 %k0) { +entry: + %0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %i = add i32 %0, %i0 + %j = add i32 %0, %j0 + %k = add i32 %0, %k0 + %pi = getelementptr float, float addrspace(1)* %p, i32 %i + %vi = load float, float addrspace(1)* %pi + %pj = getelementptr float, float addrspace(1)* %p, i32 %j + %vj = load float, float addrspace(1)* %pj + %sum = fadd float %vi, %vj + %pk = getelementptr float, float addrspace(1)* %p, i32 %k + store float %sum, float addrspace(1)* %pk + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll index 
fb74c0829fcde..d7fa172f501e7 100644 --- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll @@ -29,12 +29,12 @@ define void @shl_base_global_ptr_global_atomic_fadd(i32 addrspace(1)* %out, i64 %cast = ptrtoint i32 addrspace(1)* %arrayidx0 to i64 %shl = shl i64 %cast, 2 %castback = inttoptr i64 %shl to float addrspace(1)* - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %castback, float 100.0) + call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %castback, float 100.0) store volatile i64 %cast, i64 addrspace(1)* %extra.use, align 4 ret void } -declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #1 +declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #1 attributes #0 = { nounwind } attributes #1 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index 1bb5b9dd4bce4..3a9fe209a0ca6 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -1026,7 +1026,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(<2 x i16> addrspace(1)* %out, ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v1, v1, 7 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x400007 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1100,7 +1100,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %ou ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x7b0040 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index ee61d6dd0b711..e089ac0afc163 100644 --- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -249,13 +249,13 @@ define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %o ; CI: v_mov_b32 ; CI: v_mov_b32 -; CI: v_add_i32 -; CI: v_add_i32 +; CI-DAG: v_add_i32 +; CI-DAG: v_add_i32 -; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}} -; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}} -; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}} +; CI-DAG: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CI-DAG: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}} +; CI-DAG: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}} +; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}} ; GFX9: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 ; GFX9: 
global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:28 diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll index 9b629a5f91110..a03318ead716c 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll @@ -77,101 +77,6 @@ endif: ; preds = %else, %if ret void } -; Force save and restore of m0 during SMEM spill -; GCN-LABEL: {{^}}m0_unavailable_spill: - -; GCN: ; def m0, 1 - -; GCN: s_mov_b32 m0, s0 -; GCN: v_interp_mov_f32 - -; GCN: ; clobber m0 - -; TOSMEM: s_mov_b32 s2, m0 -; TOSMEM: s_add_u32 m0, s3, 0x100 -; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill -; TOSMEM: s_mov_b32 m0, s2 - -; TOSMEM: s_mov_b64 exec, -; TOSMEM: s_cbranch_execz -; TOSMEM: s_branch - -; TOSMEM: BB{{[0-9]+_[0-9]+}}: -; TOSMEM: s_add_u32 m0, s3, 0x100 -; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload - -; GCN-NOT: v_readlane_b32 m0 -; GCN-NOT: s_buffer_store_dword m0 -; GCN-NOT: s_buffer_load_dword m0 -define amdgpu_kernel void @m0_unavailable_spill(i32 %m0.arg) #0 { -main_body: - %m0 = call i32 asm sideeffect "; def $0, 1", "={m0}"() #0 - %tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0.arg) - call void asm sideeffect "; clobber $0", "~{m0}"() #0 - %cmp = fcmp ueq float 0.000000e+00, %tmp - br i1 %cmp, label %if, label %else - -if: ; preds = %main_body - store volatile i32 8, i32 addrspace(1)* undef - br label %endif - -else: ; preds = %main_body - store volatile i32 11, i32 addrspace(1)* undef - br label %endif - -endif: - ret void -} - -; GCN-LABEL: {{^}}restore_m0_lds: -; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]] -; TOSMEM: s_cmp_eq_u32 -; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it -; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s3, 0x100 -; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill -; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s3, 0x200 -; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill -; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_cbranch_scc1 - -; TOSMEM: s_mov_b32 m0, -1 - -; TOSMEM: s_mov_b32 s2, m0 -; TOSMEM: s_add_u32 m0, s3, 0x200 -; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload -; TOSMEM: s_mov_b32 m0, s2 -; TOSMEM: s_waitcnt lgkmcnt(0) - -; TOSMEM: ds_write_b64 - -; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s3, 0x100 -; TOSMEM: s_buffer_load_dword s2, s[88:91], m0 ; 4-byte Folded Reload -; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_waitcnt lgkmcnt(0) -; TOSMEM-NOT: m0 -; TOSMEM: s_mov_b32 m0, s2 -; TOSMEM: ; use m0 - -; TOSMEM: s_dcache_wb -; TOSMEM: s_endpgm -define amdgpu_kernel void @restore_m0_lds(i32 %arg) { - %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={m0}"() #0 - %sval = load volatile i64, i64 addrspace(4)* undef - %cmp = icmp eq i32 %arg, 0 - br i1 %cmp, label %ret, label %bb - -bb: - store volatile i64 %sval, i64 addrspace(3)* undef - call void asm sideeffect "; use $0", "{m0}"(i32 %m0) #0 - br label %ret - -ret: - ret void -} - declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 
d2434682eebc9..5695487d58d88 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -39,7 +39,7 @@ entry: ; GFX6-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9:]+}}], s32 ; GFX6-NEXT: s_sub_u32 s32, s32, 0x[[OFFSET:[0-9]+]] ; GFX6: NumSgprs: 48 -; GFX6: ScratchSize: 8624 +; GFX6: ScratchSize: 8608 define amdgpu_kernel void @test_limited_sgpr(<64 x i32> addrspace(1)* %out, <64 x i32> addrspace(1)* %in) #0 { entry: %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir index dca3150b404cd..c9f3a82cf695f 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir @@ -1,42 +1,169 @@ -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=greedy -o - -verify-machineinstrs %s | FileCheck -check-prefixes=MIR,RA %s -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=greedy,virtregrewriter,post-RA-sched -o - -verify-machineinstrs %s | FileCheck -check-prefixes=MIR,VR %s -# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before=greedy -o - -verify-machineinstrs %s | FileCheck -check-prefix=ASM %s +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=greedy -o - -verify-machineinstrs %s | FileCheck -check-prefix=RA %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=greedy,virtregrewriter,post-RA-sched -o - -verify-machineinstrs %s | FileCheck -check-prefix=VR %s --- -# MIR-LABEL: name: splitkit_copy_bundle - -# RA: undef %4.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:sgpr_1024 = COPY %5.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15 { -# RA-NEXT: internal %4.sub12_sub13_sub14_sub15_sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27:sgpr_1024 = COPY %5.sub12_sub13_sub14_sub15_sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27 -# RA-NEXT: internal %4.sub28_sub29:sgpr_1024 = COPY %5.sub28_sub29 -# RA-NEXT: } - -# RA: undef %6.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:sgpr_1024 = COPY %4.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15 { -# RA-NEXT: internal %6.sub12_sub13_sub14_sub15_sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27:sgpr_1024 = COPY %4.sub12_sub13_sub14_sub15_sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27 -# RA-NEXT: internal %6.sub28_sub29:sgpr_1024 = COPY %4.sub28_sub29 -# RA-NEXT: } - - -# RA: undef %4.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:sgpr_1024 = COPY %6.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15 { -# RA-NEXT: internal %4.sub12_sub13_sub14_sub15_sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27:sgpr_1024 = COPY %6.sub12_sub13_sub14_sub15_sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27 -# RA-NEXT: internal %4.sub28_sub29:sgpr_1024 = COPY %6.sub28_sub29 -# RA-NEXT: } - - -# VR: renamable $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = KILL undef renamable $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 -# VR-NEXT: renamable $sgpr96_sgpr97 = KILL undef renamable 
$sgpr96_sgpr97 - -# ASM-LABEL: {{^}}splitkit_copy_bundle: -# ASM: ; implicit-def: $sgpr34_sgpr35 -# ASM-NEXT: ; implicit-def: $sgpr98_sgpr99 -# ASM-NEXT: ; kill: def $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 killed $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 -# ASM-NEXT: ; kill: def $sgpr96_sgpr97 killed $sgpr96_sgpr97 - name: splitkit_copy_bundle tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' stackPtrOffsetReg: '$sgpr32' body: | + ; RA-LABEL: name: splitkit_copy_bundle + ; RA: bb.0: + ; RA: successors: %bb.1(0x80000000) + ; RA: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; RA: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; RA: undef %5.sub1:sgpr_1024 = S_MOV_B32 -1 + ; RA: %5.sub0:sgpr_1024 = S_MOV_B32 -1 + ; RA: undef %4.sub0_sub1:sgpr_1024 = COPY %5.sub0_sub1 + ; RA: undef %3.sub0:sgpr_1024 = S_MOV_B32 0 + ; RA: bb.1: + ; RA: successors: %bb.2(0x80000000) + ; RA: undef %6.sub0_sub1:sgpr_1024 = COPY %4.sub0_sub1 + ; RA: %6.sub2:sgpr_1024 = COPY %6.sub0 + ; RA: %6.sub3:sgpr_1024 = COPY %6.sub1 + ; RA: %6.sub4:sgpr_1024 = COPY %6.sub0 + ; RA: %6.sub5:sgpr_1024 = COPY %6.sub1 + ; RA: %6.sub6:sgpr_1024 = COPY %6.sub0 + ; RA: %6.sub7:sgpr_1024 = COPY %6.sub1 + ; RA: %6.sub8:sgpr_1024 = COPY %6.sub0 + ; RA: %6.sub9:sgpr_1024 = COPY %6.sub1 + ; RA: %6.sub10:sgpr_1024 = COPY %6.sub0 + ; RA: %6.sub11:sgpr_1024 = COPY %6.sub1 + ; RA: %6.sub12:sgpr_1024 = COPY %6.sub0 + ; RA: %6.sub13:sgpr_1024 = COPY %6.sub1 + ; RA: %6.sub14:sgpr_1024 = COPY %6.sub0 + ; RA: %6.sub15:sgpr_1024 = COPY %6.sub1 + ; RA: %6.sub16:sgpr_1024 = COPY %6.sub0 + ; RA: %6.sub17:sgpr_1024 = COPY %6.sub1 + ; RA: %6.sub18:sgpr_1024 = COPY %6.sub0 + ; RA: %6.sub19:sgpr_1024 = COPY %6.sub1 + ; RA: %6.sub20:sgpr_1024 = COPY %6.sub0 + ; RA: %6.sub21:sgpr_1024 = COPY %6.sub1 + ; RA: %6.sub22:sgpr_1024 = COPY %6.sub0 + ; RA: %6.sub23:sgpr_1024 = COPY %6.sub1 + ; RA: %6.sub24:sgpr_1024 = COPY %6.sub0 + ; RA: %6.sub25:sgpr_1024 = COPY %6.sub1 + ; RA: %6.sub26:sgpr_1024 = COPY %6.sub0 + ; RA: %6.sub27:sgpr_1024 = COPY %6.sub1 + ; RA: %6.sub28:sgpr_1024 = COPY %6.sub0 + ; RA: %6.sub29:sgpr_1024 = COPY %6.sub1 + ; RA: undef %4.sub0_sub1:sgpr_1024 = COPY %6.sub0_sub1 + ; RA: %3.sub1:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub2:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub3:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub4:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub5:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub6:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub7:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub8:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub9:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub10:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub11:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub12:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub13:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub14:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub15:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub16:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub17:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub18:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub19:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub20:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub21:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub22:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub23:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub24:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub25:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub26:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub27:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub28:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub29:sgpr_1024 = COPY %3.sub0 + ; RA: 
%3.sub30:sgpr_1024 = COPY %3.sub0 + ; RA: %3.sub31:sgpr_1024 = COPY %3.sub0 + ; RA: bb.2: + ; RA: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; RA: S_NOP 0, csr_amdgpu_highregs, implicit [[DEF]], implicit [[DEF1]] + ; RA: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc + ; RA: S_BRANCH %bb.2 + ; VR-LABEL: name: splitkit_copy_bundle + ; VR: bb.0: + ; VR: successors: %bb.1(0x80000000) + ; VR: renamable $sgpr69 = S_MOV_B32 -1 + ; VR: renamable $sgpr68 = S_MOV_B32 -1 + ; VR: renamable $sgpr36 = S_MOV_B32 0 + ; VR: renamable $sgpr34_sgpr35 = IMPLICIT_DEF + ; VR: renamable $sgpr70_sgpr71 = IMPLICIT_DEF + ; VR: bb.1: + ; VR: successors: %bb.2(0x80000000) + ; VR: liveins: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x0000000000000003, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x000000000000000F, $sgpr34_sgpr35, $sgpr70_sgpr71 + ; VR: renamable $sgpr40_sgpr41 = COPY killed renamable $sgpr68_sgpr69 + ; VR: renamable $sgpr42 = COPY renamable $sgpr40 + ; VR: renamable $sgpr43 = COPY renamable $sgpr41 + ; VR: renamable $sgpr44 = COPY renamable $sgpr40 + ; VR: renamable $sgpr45 = COPY renamable $sgpr41 + ; VR: renamable $sgpr46 = COPY renamable $sgpr40 + ; VR: renamable $sgpr47 = COPY renamable $sgpr41 + ; VR: renamable $sgpr48 = COPY renamable $sgpr40 + ; VR: renamable $sgpr49 = COPY renamable $sgpr41 + ; VR: renamable $sgpr50 = COPY renamable $sgpr40 + ; VR: renamable $sgpr51 = COPY renamable $sgpr41 + ; VR: renamable $sgpr52 = COPY renamable $sgpr40 + ; VR: renamable $sgpr53 = COPY renamable $sgpr41 + ; VR: renamable $sgpr54 = COPY renamable $sgpr40 + ; VR: renamable $sgpr55 = COPY renamable $sgpr41 + ; VR: renamable $sgpr56 = COPY renamable $sgpr40 + ; VR: renamable $sgpr57 = COPY renamable $sgpr41 + ; VR: renamable $sgpr58 = COPY renamable $sgpr40 + ; VR: renamable $sgpr59 = COPY renamable $sgpr41 + ; VR: renamable $sgpr60 = COPY renamable $sgpr40 + ; VR: renamable $sgpr61 = COPY renamable $sgpr41 + ; VR: renamable $sgpr62 = COPY renamable $sgpr40 + ; VR: renamable $sgpr63 = COPY renamable $sgpr41 + ; VR: renamable $sgpr64 = COPY renamable $sgpr40 + ; VR: renamable $sgpr65 = COPY renamable $sgpr41 + ; VR: renamable $sgpr66 = COPY renamable $sgpr40 + ; VR: renamable $sgpr67 = COPY renamable $sgpr41 + ; VR: renamable $sgpr68 = COPY renamable $sgpr40 + ; VR: renamable $sgpr69 = COPY renamable $sgpr41 + ; VR: renamable $sgpr68_sgpr69 = COPY killed renamable $sgpr40_sgpr41 + ; VR: renamable $sgpr37 = COPY renamable $sgpr36 + ; VR: renamable $sgpr38 = COPY renamable $sgpr36 + ; VR: renamable $sgpr39 = COPY renamable $sgpr36 + ; VR: renamable $sgpr40 = COPY renamable $sgpr36 + ; VR: renamable $sgpr41 = COPY renamable $sgpr36 + ; VR: renamable $sgpr42 = COPY renamable $sgpr36 + ; VR: renamable $sgpr43 = COPY renamable $sgpr36 + ; VR: renamable $sgpr44 = COPY renamable $sgpr36 + ; VR: renamable $sgpr45 = COPY renamable $sgpr36 + ; VR: renamable $sgpr46 = COPY renamable $sgpr36 + ; VR: renamable $sgpr47 = COPY renamable $sgpr36 + ; VR: renamable $sgpr48 = COPY renamable $sgpr36 + ; VR: renamable $sgpr49 = COPY renamable $sgpr36 + ; VR: renamable $sgpr50 = COPY renamable $sgpr36 + ; VR: renamable $sgpr51 = COPY renamable $sgpr36 + ; VR: renamable 
$sgpr52 = COPY renamable $sgpr36 + ; VR: renamable $sgpr53 = COPY renamable $sgpr36 + ; VR: renamable $sgpr54 = COPY renamable $sgpr36 + ; VR: renamable $sgpr55 = COPY renamable $sgpr36 + ; VR: renamable $sgpr56 = COPY renamable $sgpr36 + ; VR: renamable $sgpr57 = COPY renamable $sgpr36 + ; VR: renamable $sgpr58 = COPY renamable $sgpr36 + ; VR: renamable $sgpr59 = COPY renamable $sgpr36 + ; VR: renamable $sgpr60 = COPY renamable $sgpr36 + ; VR: renamable $sgpr61 = COPY renamable $sgpr36 + ; VR: renamable $sgpr62 = COPY renamable $sgpr36 + ; VR: renamable $sgpr63 = COPY renamable $sgpr36 + ; VR: renamable $sgpr64 = COPY renamable $sgpr36 + ; VR: renamable $sgpr65 = COPY renamable $sgpr36 + ; VR: renamable $sgpr66 = COPY renamable $sgpr36 + ; VR: renamable $sgpr67 = COPY renamable $sgpr36 + ; VR: bb.2: + ; VR: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; VR: liveins: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x0000000000000003, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x000000000000000F, $sgpr34_sgpr35, $sgpr70_sgpr71 + ; VR: S_NOP 0, csr_amdgpu_highregs, implicit renamable $sgpr34_sgpr35, implicit renamable $sgpr70_sgpr71 + ; VR: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc + ; VR: S_BRANCH %bb.2 bb.0: %0:sreg_64 = IMPLICIT_DEF %1:sreg_64 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir new file mode 100644 index 0000000000000..56ebf9305dbd5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir @@ -0,0 +1,525 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -verify-regalloc -run-pass=greedy %s -o - | FileCheck %s + +--- +name: zextload_global_v64i16_to_v64i64 +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; CHECK-LABEL: name: zextload_global_v64i16_to_v64i64 + ; CHECK: liveins: $sgpr0_sgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0, 0 :: (dereferenceable invariant load 16, align 4, addrspace 4) + ; CHECK: undef %2.sub3:sgpr_128 = S_MOV_B32 61440 + ; CHECK: %2.sub2:sgpr_128 = S_MOV_B32 -1 + ; CHECK: %2.sub0:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub0 + ; CHECK: %2.sub1:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub1 + ; CHECK: undef %3.sub0:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub2 + ; CHECK: %3.sub1:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub3 + ; CHECK: %3.sub2:sgpr_128 = COPY %2.sub2 + ; CHECK: %3.sub3:sgpr_128 = COPY %2.sub3 + ; CHECK: early-clobber %4:vreg_128, early-clobber %5:vreg_128, early-clobber %6:vreg_128, early-clobber %7:vreg_128 = BUNDLE %3, implicit $exec { + ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 128, addrspace 1) + ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) + ; CHECK: 
[[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 32, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1) + ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 48, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) + ; CHECK: } + ; CHECK: undef %47.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE %47, %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5) + ; CHECK: undef %52.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE %52, %stack.1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.1, align 4, addrspace 5) + ; CHECK: undef %57.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE %57, %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.2, align 4, addrspace 5) + ; CHECK: undef %62.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE %62, %stack.3, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.3, align 4, addrspace 5) + ; CHECK: undef %67.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec + ; CHECK: undef %71.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE %71, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.4, align 4, addrspace 5) + ; CHECK: undef %76.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE %76, %stack.5, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.5, align 4, addrspace 5) + ; CHECK: undef %81.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE %81, %stack.6, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.6, align 4, addrspace 5) + ; CHECK: undef %86.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec + ; CHECK: undef %90.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE %90, %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.7, align 4, addrspace 5) + ; CHECK: undef %95.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE %95, %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.8, align 4, addrspace 5) + ; CHECK: undef %100.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE %100, %stack.9, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.9, align 4, addrspace 5) + ; CHECK: undef %105.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec + ; CHECK: undef %109.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec + ; CHECK: undef %113.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec + ; CHECK: undef 
%117.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE %117, %stack.10, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.10, align 4, addrspace 5)
+ ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 64, addrspace 1)
+ ; CHECK: undef %122.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
+ ; CHECK: undef %126.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
+ ; CHECK: undef %130.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE %130, %stack.11, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.11, align 4, addrspace 5)
+ ; CHECK: undef %135.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE %135, %stack.12, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.12, align 4, addrspace 5)
+ ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 80, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
+ ; CHECK: undef %140.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
+ ; CHECK: undef %144.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE %144, %stack.13, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.13, align 4, addrspace 5)
+ ; CHECK: undef %149.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE %149, %stack.14, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.14, align 4, addrspace 5)
+ ; CHECK: undef %154.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
+ ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 96, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1)
+ ; CHECK: undef %158.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
+ ; CHECK: undef %36.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec
+ ; CHECK: undef %37.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec
+ ; CHECK: undef %38.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec
+ ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 112, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
+ ; CHECK: undef %40.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub1, implicit $exec
+ ; CHECK: undef %41.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub0, implicit $exec
+ ; CHECK: undef %42.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub3, implicit $exec
+ ; CHECK: undef %43.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub2, implicit $exec
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+ ; CHECK: [[SI_SPILL_V128_RESTORE:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.0, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE]], %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE1:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.1, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE1]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE1]], %stack.1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.1, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE2:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.2, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE2]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE2]], %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.2, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE3:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.3, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE3]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE3]], %stack.3, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.3, align 4, addrspace 5)
+ ; CHECK: undef %68.sub2:vreg_128 = COPY %67.sub2
+ ; CHECK: %68.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec
+ ; CHECK: [[SI_SPILL_V128_RESTORE4:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.4, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE4]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE4]], %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.4, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE5:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.5, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE5]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE5]], %stack.5, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.5, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE6:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.6, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE6]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE6]], %stack.6, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.6, align 4, addrspace 5)
+ ; CHECK: undef %87.sub2:vreg_128 = COPY %86.sub2
+ ; CHECK: %87.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec
+ ; CHECK: [[SI_SPILL_V128_RESTORE7:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.7, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE7]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE7]], %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.7, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE8:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.8, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE8]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE8]], %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.8, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE9:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.9, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE9]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE9]], %stack.9, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.9, align 4, addrspace 5)
+ ; CHECK: undef %106.sub2:vreg_128 = COPY %105.sub2
+ ; CHECK: %106.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
+ ; CHECK: undef %110.sub2:vreg_128 = COPY %109.sub2
+ ; CHECK: %110.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
+ ; CHECK: undef %114.sub2:vreg_128 = COPY %113.sub2
+ ; CHECK: %114.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
+ ; CHECK: [[SI_SPILL_V128_RESTORE10:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.10, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE10]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE10]], %stack.10, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.10, align 4, addrspace 5)
+ ; CHECK: undef %123.sub2:vreg_128 = COPY %122.sub2
+ ; CHECK: %123.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
+ ; CHECK: undef %127.sub2:vreg_128 = COPY %126.sub2
+ ; CHECK: %127.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
+ ; CHECK: [[SI_SPILL_V128_RESTORE11:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.11, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE11]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE11]], %stack.11, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.11, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE12:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.12, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE12]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE12]], %stack.12, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.12, align 4, addrspace 5)
+ ; CHECK: undef %141.sub2:vreg_128 = COPY %140.sub2
+ ; CHECK: %141.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
+ ; CHECK: [[SI_SPILL_V128_RESTORE13:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.13, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE13]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE13]], %stack.13, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.13, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE14:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.14, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE14]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE14]], %stack.14, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.14, align 4, addrspace 5)
+ ; CHECK: undef %155.sub2:vreg_128 = COPY %154.sub2
+ ; CHECK: %155.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
+ ; CHECK: undef %159.sub2:vreg_128 = COPY %158.sub2
+ ; CHECK: %159.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
+ ; CHECK: %36.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec
+ ; CHECK: %37.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec
+ ; CHECK: %38.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec
+ ; CHECK: %40.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub1, implicit $exec
+ ; CHECK: %41.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub0, implicit $exec
+ ; CHECK: %42.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub3, implicit $exec
+ ; CHECK: %43.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub2, implicit $exec
+ ; CHECK: %43.sub1:vreg_128 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK: %43.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %43, %2, 0, 480, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+ ; CHECK: %42.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %42.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %42, %2, 0, 496, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ ; CHECK: %41.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %41.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %41, %2, 0, 448, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
+ ; CHECK: %40.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %40.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %40, %2, 0, 464, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ ; CHECK: %38.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %38.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %38, %2, 0, 416, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+ ; CHECK: %37.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %37.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %37, %2, 0, 432, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ ; CHECK: %36.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %36.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %36, %2, 0, 384, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1)
+ ; CHECK: undef %157.sub0:vreg_128 = COPY %159.sub0 {
+ ; CHECK: internal %157.sub2:vreg_128 = COPY %159.sub2
+ ; CHECK: }
+ ; CHECK: %157.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %157.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %157, %2, 0, 400, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ ; CHECK: undef %153.sub0:vreg_128 = COPY %155.sub0 {
+ ; CHECK: internal %153.sub2:vreg_128 = COPY %155.sub2
+ ; CHECK: }
+ ; CHECK: %153.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %153.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %153, %2, 0, 352, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+ ; CHECK: [[SI_SPILL_V128_RESTORE15:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.14, align 4, addrspace 5)
+ ; CHECK: undef %148.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub0 {
+ ; CHECK: internal %148.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub2
+ ; CHECK: }
+ ; CHECK: %148.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %148.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %148, %2, 0, 368, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ ; CHECK: [[SI_SPILL_V128_RESTORE16:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.13, align 4, addrspace 5)
+ ; CHECK: undef %143.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub0 {
+ ; CHECK: internal %143.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub2
+ ; CHECK: }
+ ; CHECK: %143.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %143.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %143, %2, 0, 320, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
+ ; CHECK: undef %139.sub0:vreg_128 = COPY %141.sub0 {
+ ; CHECK: internal %139.sub2:vreg_128 = COPY %141.sub2
+ ; CHECK: }
+ ; CHECK: %139.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %139.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %139, %2, 0, 336, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ ; CHECK: [[SI_SPILL_V128_RESTORE17:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.12, align 4, addrspace 5)
+ ; CHECK: undef %134.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub0 {
+ ; CHECK: internal %134.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub2
+ ; CHECK: }
+ ; CHECK: %134.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %134.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %134, %2, 0, 288, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+ ; CHECK: [[SI_SPILL_V128_RESTORE18:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.11, align 4, addrspace 5)
+ ; CHECK: undef %129.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub0 {
+ ; CHECK: internal %129.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub2
+ ; CHECK: }
+ ; CHECK: %129.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %129.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %129, %2, 0, 304, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ ; CHECK: undef %125.sub0:vreg_128 = COPY %127.sub0 {
+ ; CHECK: internal %125.sub2:vreg_128 = COPY %127.sub2
+ ; CHECK: }
+ ; CHECK: %125.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %125.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %125, %2, 0, 256, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 256, addrspace 1)
+ ; CHECK: undef %121.sub0:vreg_128 = COPY %123.sub0 {
+ ; CHECK: internal %121.sub2:vreg_128 = COPY %123.sub2
+ ; CHECK: }
+ ; CHECK: %121.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %121.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %121, %2, 0, 272, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ ; CHECK: [[SI_SPILL_V128_RESTORE19:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.10, align 4, addrspace 5)
+ ; CHECK: undef %116.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub0 {
+ ; CHECK: internal %116.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub2
+ ; CHECK: }
+ ; CHECK: %116.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %116.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %116, %2, 0, 224, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+ ; CHECK: undef %112.sub0:vreg_128 = COPY %114.sub0 {
+ ; CHECK: internal %112.sub2:vreg_128 = COPY %114.sub2
+ ; CHECK: }
+ ; CHECK: %112.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %112.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %112, %2, 0, 240, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ ; CHECK: undef %108.sub0:vreg_128 = COPY %110.sub0 {
+ ; CHECK: internal %108.sub2:vreg_128 = COPY %110.sub2
+ ; CHECK: }
+ ; CHECK: %108.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %108.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %108, %2, 0, 192, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
+ ; CHECK: undef %104.sub0:vreg_128 = COPY %106.sub0 {
+ ; CHECK: internal %104.sub2:vreg_128 = COPY %106.sub2
+ ; CHECK: }
+ ; CHECK: %104.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %104.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %104, %2, 0, 208, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ ; CHECK: [[SI_SPILL_V128_RESTORE20:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.9, align 4, addrspace 5)
+ ; CHECK: undef %99.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub0 {
+ ; CHECK: internal %99.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub2
+ ; CHECK: }
+ ; CHECK: %99.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %99.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %99, %2, 0, 160, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+ ; CHECK: [[SI_SPILL_V128_RESTORE21:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.8, align 4, addrspace 5)
+ ; CHECK: undef %94.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub0 {
+ ; CHECK: internal %94.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub2
+ ; CHECK: }
+ ; CHECK: %94.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %94.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %94, %2, 0, 176, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ ; CHECK: [[SI_SPILL_V128_RESTORE22:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.7, align 4, addrspace 5)
+ ; CHECK: undef %89.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub0 {
+ ; CHECK: internal %89.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub2
+ ; CHECK: }
+ ; CHECK: %89.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %89.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %89, %2, 0, 128, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1)
+ ; CHECK: undef %85.sub0:vreg_128 = COPY %87.sub0 {
+ ; CHECK: internal %85.sub2:vreg_128 = COPY %87.sub2
+ ; CHECK: }
+ ; CHECK: %85.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %85.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %85, %2, 0, 144, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ ; CHECK: [[SI_SPILL_V128_RESTORE23:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.6, align 4, addrspace 5)
+ ; CHECK: undef %80.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub0 {
+ ; CHECK: internal %80.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub2
+ ; CHECK: }
+ ; CHECK: %80.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %80.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %80, %2, 0, 96, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+ ; CHECK: [[SI_SPILL_V128_RESTORE24:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.5, align 4, addrspace 5)
+ ; CHECK: undef %75.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub0 {
+ ; CHECK: internal %75.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub2
+ ; CHECK: }
+ ; CHECK: %75.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %75.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %75, %2, 0, 112, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ ; CHECK: [[SI_SPILL_V128_RESTORE25:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.4, align 4, addrspace 5)
+ ; CHECK: undef %70.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub0 {
+ ; CHECK: internal %70.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub2
+ ; CHECK: }
+ ; CHECK: %70.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %70.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %70, %2, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
+ ; CHECK: undef %66.sub0:vreg_128 = COPY %68.sub0 {
+ ; CHECK: internal %66.sub2:vreg_128 = COPY %68.sub2
+ ; CHECK: }
+ ; CHECK: %66.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %66.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %66, %2, 0, 80, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ ; CHECK: [[SI_SPILL_V128_RESTORE26:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.3, align 4, addrspace 5)
+ ; CHECK: undef %61.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE26]].sub0 {
+ ; CHECK: internal %61.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE26]].sub2
+ ; CHECK: }
+ ; CHECK: %61.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %61.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %61, %2, 0, 32, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+ ; CHECK: [[SI_SPILL_V128_RESTORE27:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.2, align 4, addrspace 5)
+ ; CHECK: undef %56.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE27]].sub0 {
+ ; CHECK: internal %56.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE27]].sub2
+ ; CHECK: }
+ ; CHECK: %56.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %56.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %56, %2, 0, 48, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ ; CHECK: [[SI_SPILL_V128_RESTORE28:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.1, align 4, addrspace 5)
+ ; CHECK: undef %51.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE28]].sub0 {
+ ; CHECK: internal %51.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE28]].sub2
+ ; CHECK: }
+ ; CHECK: %51.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %51.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %51, %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 512, addrspace 1)
+ ; CHECK: [[SI_SPILL_V128_RESTORE29:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.0, align 4, addrspace 5)
+ ; CHECK: undef %46.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE29]].sub0 {
+ ; CHECK: internal %46.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE29]].sub2
+ ; CHECK: }
+ ; CHECK: %46.sub1:vreg_128 = COPY %43.sub1
+ ; CHECK: %46.sub3:vreg_128 = COPY %43.sub1
+ ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %46, %2, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ ; CHECK: S_ENDPGM 0
+ %0:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %1:sgpr_128 = S_LOAD_DWORDX4_IMM %0(p4), 9, 0, 0 :: (dereferenceable invariant load 16, align 4, addrspace 4)
+ undef %2.sub3:sgpr_128 = S_MOV_B32 61440
+ %2.sub2:sgpr_128 = S_MOV_B32 -1
+ %2.sub0:sgpr_128 = COPY %1.sub0
+ %2.sub1:sgpr_128 = COPY %1.sub1
+ undef %3.sub0:sgpr_128 = COPY %1.sub2
+ %3.sub1:sgpr_128 = COPY %1.sub3
+ %3.sub2:sgpr_128 = COPY %2.sub2
+ %3.sub3:sgpr_128 = COPY %2.sub3
+ early-clobber %4:vreg_128, early-clobber %5:vreg_128, early-clobber %6:vreg_128, early-clobber %7:vreg_128 = BUNDLE %3, implicit $exec {
+ %7:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 128, addrspace 1)
+ %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
+ %4:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 32, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1)
+ %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 48, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
+ }
+ undef %8.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %7.sub1, implicit $exec
+ undef %9.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %7.sub0, implicit $exec
+ undef %10.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %7.sub3, implicit $exec
+ undef %11.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %7.sub2, implicit $exec
+ undef %12.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %5.sub1, implicit $exec
+ undef %13.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %5.sub0, implicit $exec
+ undef %14.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %5.sub3, implicit $exec
+ undef %15.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %5.sub2, implicit $exec
+ undef %16.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %4.sub1, implicit $exec
+ undef %17.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %4.sub0, implicit $exec
+ undef %18.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %4.sub3, implicit $exec
+ undef %19.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %4.sub2, implicit $exec
+ undef %20.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %6.sub1, implicit $exec
+ undef %21.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %6.sub0, implicit $exec
+ undef %22.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %6.sub3, implicit $exec
+ undef %23.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %6.sub2, implicit $exec
+ %24:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 64, addrspace 1)
+ undef %25.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %24.sub1, implicit $exec
+ undef %26.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %24.sub0, implicit $exec
+ undef %27.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %24.sub3, implicit $exec
+ undef %28.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %24.sub2, implicit $exec
+ %29:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 80, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
+ undef %30.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %29.sub1, implicit $exec
+ undef %31.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %29.sub0, implicit $exec
+ undef %32.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %29.sub3, implicit $exec
+ undef %33.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %29.sub2, implicit $exec
+ %34:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 96, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1)
+ undef %35.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %34.sub1, implicit $exec
+ undef %36.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %34.sub0, implicit $exec
+ undef %37.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %34.sub3, implicit $exec
+ undef %38.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %34.sub2, implicit $exec
+ %39:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 112, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
+ undef %40.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %39.sub1, implicit $exec
+ undef %41.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %39.sub0, implicit $exec
+ undef %42.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %39.sub3, implicit $exec
+ undef %43.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %39.sub2, implicit $exec
+ %44:sreg_32 = S_MOV_B32 65535
+ %8.sub0:vreg_128 = V_AND_B32_e32 %44, %7.sub1, implicit $exec
+ %9.sub0:vreg_128 = V_AND_B32_e32 %44, %7.sub0, implicit $exec
+ %10.sub0:vreg_128 = V_AND_B32_e32 %44, %7.sub3, implicit $exec
+ %11.sub0:vreg_128 = V_AND_B32_e32 %44, %7.sub2, implicit $exec
+ %12.sub0:vreg_128 = V_AND_B32_e32 %44, %5.sub1, implicit $exec
+ %13.sub0:vreg_128 = V_AND_B32_e32 %44, %5.sub0, implicit $exec
+ %14.sub0:vreg_128 = V_AND_B32_e32 %44, %5.sub3, implicit $exec
+ %15.sub0:vreg_128 = V_AND_B32_e32 %44, %5.sub2, implicit $exec
+ %16.sub0:vreg_128 = V_AND_B32_e32 %44, %4.sub1, implicit $exec
+ %17.sub0:vreg_128 = V_AND_B32_e32 %44, %4.sub0, implicit $exec
+ %18.sub0:vreg_128 = V_AND_B32_e32 %44, %4.sub3, implicit $exec
+ %19.sub0:vreg_128 = V_AND_B32_e32 %44, %4.sub2, implicit $exec
+ %20.sub0:vreg_128 = V_AND_B32_e32 %44, %6.sub1, implicit $exec
+ %21.sub0:vreg_128 = V_AND_B32_e32 %44, %6.sub0, implicit $exec
+ %22.sub0:vreg_128 = V_AND_B32_e32 %44, %6.sub3, implicit $exec
+ %23.sub0:vreg_128 = V_AND_B32_e32 %44, %6.sub2, implicit $exec
+ %25.sub0:vreg_128 = V_AND_B32_e32 %44, %24.sub1, implicit $exec
+ %26.sub0:vreg_128 = V_AND_B32_e32 %44, %24.sub0, implicit $exec
+ %27.sub0:vreg_128 = V_AND_B32_e32 %44, %24.sub3, implicit $exec
+ %28.sub0:vreg_128 = V_AND_B32_e32 %44, %24.sub2, implicit $exec
+ %30.sub0:vreg_128 = V_AND_B32_e32 %44, %29.sub1, implicit $exec
+ %31.sub0:vreg_128 = V_AND_B32_e32 %44, %29.sub0, implicit $exec
+ %32.sub0:vreg_128 = V_AND_B32_e32 %44, %29.sub3, implicit $exec
+ %33.sub0:vreg_128 = V_AND_B32_e32 %44, %29.sub2, implicit $exec
+ %35.sub0:vreg_128 = V_AND_B32_e32 %44, %34.sub1, implicit $exec
+ %36.sub0:vreg_128 = V_AND_B32_e32 %44, %34.sub0, implicit $exec
+ %37.sub0:vreg_128 = V_AND_B32_e32 %44, %34.sub3, implicit $exec
+ %38.sub0:vreg_128 = V_AND_B32_e32 %44, %34.sub2, implicit $exec
+ %40.sub0:vreg_128 = V_AND_B32_e32 %44, %39.sub1, implicit $exec
+ %41.sub0:vreg_128 = V_AND_B32_e32 %44, %39.sub0, implicit $exec
+ %42.sub0:vreg_128 = V_AND_B32_e32 %44, %39.sub3, implicit $exec
+ %43.sub0:vreg_128 = V_AND_B32_e32 %44, %39.sub2, implicit $exec
+ %43.sub1:vreg_128 = V_MOV_B32_e32 0, implicit $exec
+ %43.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %43, %2, 0, 480, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+ %42.sub1:vreg_128 = COPY %43.sub1
+ %42.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %42, %2, 0, 496, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ %41.sub1:vreg_128 = COPY %43.sub1
+ %41.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %41, %2, 0, 448, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
+ %40.sub1:vreg_128 = COPY %43.sub1
+ %40.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %40, %2, 0, 464, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ %38.sub1:vreg_128 = COPY %43.sub1
+ %38.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %38, %2, 0, 416, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+ %37.sub1:vreg_128 = COPY %43.sub1
+ %37.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %37, %2, 0, 432, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ %36.sub1:vreg_128 = COPY %43.sub1
+ %36.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %36, %2, 0, 384, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1)
+ %35.sub1:vreg_128 = COPY %43.sub1
+ %35.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %35, %2, 0, 400, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ %33.sub1:vreg_128 = COPY %43.sub1
+ %33.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %33, %2, 0, 352, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+ %32.sub1:vreg_128 = COPY %43.sub1
+ %32.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %32, %2, 0, 368, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ %31.sub1:vreg_128 = COPY %43.sub1
+ %31.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %31, %2, 0, 320, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
+ %30.sub1:vreg_128 = COPY %43.sub1
+ %30.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %30, %2, 0, 336, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ %28.sub1:vreg_128 = COPY %43.sub1
+ %28.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %28, %2, 0, 288, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+ %27.sub1:vreg_128 = COPY %43.sub1
+ %27.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %27, %2, 0, 304, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ %26.sub1:vreg_128 = COPY %43.sub1
+ %26.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %26, %2, 0, 256, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 256, addrspace 1)
+ %25.sub1:vreg_128 = COPY %43.sub1
+ %25.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %25, %2, 0, 272, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ %23.sub1:vreg_128 = COPY %43.sub1
+ %23.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %23, %2, 0, 224, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+ %22.sub1:vreg_128 = COPY %43.sub1
+ %22.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %22, %2, 0, 240, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ %21.sub1:vreg_128 = COPY %43.sub1
+ %21.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %21, %2, 0, 192, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
+ %20.sub1:vreg_128 = COPY %43.sub1
+ %20.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %20, %2, 0, 208, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ %19.sub1:vreg_128 = COPY %43.sub1
+ %19.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %19, %2, 0, 160, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+ %18.sub1:vreg_128 = COPY %43.sub1
+ %18.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %18, %2, 0, 176, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ %17.sub1:vreg_128 = COPY %43.sub1
+ %17.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %17, %2, 0, 128, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1)
+ %16.sub1:vreg_128 = COPY %43.sub1
+ %16.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %16, %2, 0, 144, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ %15.sub1:vreg_128 = COPY %43.sub1
+ %15.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %15, %2, 0, 96, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+ %14.sub1:vreg_128 = COPY %43.sub1
+ %14.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %14, %2, 0, 112, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ %13.sub1:vreg_128 = COPY %43.sub1
+ %13.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %13, %2, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
+ %12.sub1:vreg_128 = COPY %43.sub1
+ %12.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %12, %2, 0, 80, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ %11.sub1:vreg_128 = COPY %43.sub1
+ %11.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %11, %2, 0, 32, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+ %10.sub1:vreg_128 = COPY %43.sub1
+ %10.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %10, %2, 0, 48, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ %9.sub1:vreg_128 = COPY %43.sub1
+ %9.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %9, %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 512, addrspace 1)
+ %8.sub1:vreg_128 = COPY %43.sub1
+ %8.sub3:vreg_128 = COPY %43.sub1
+ BUFFER_STORE_DWORDX4_OFFSET %8, %2, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
index e2d64c105d955..78e1402b1b022 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -41,8 +41,9 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <
 ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo
 ; GCN-NEXT: s_cbranch_execz BB0_2
 ; GCN-NEXT: ; %bb.1: ; %if.then4.i
-; GCN-NEXT: buffer_load_dword v0, v40, s[36:39], s32 offen
-; GCN-NEXT: buffer_load_dword v1, v40, s[36:39], s32 offen offset:4
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: buffer_load_dword v0, v40, s[36:39], 0 offen
+; GCN-NEXT: buffer_load_dword v1, v40, s[36:39], 0 offen offset:4
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v0
 ; GCN-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0
diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
index 3fa202768f483..80658fa9ed756 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
@@ -55,42 +55,42 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: s_lshr_b32 s4, s2, 8
-; GFX9-NEXT: s_lshr_b32 s2, s2, 24
-; GFX9-NEXT: v_mov_b32_e32 v4, s2
-; GFX9-NEXT: s_lshr_b32 s2, s3, 8
-; GFX9-NEXT: v_mov_b32_e32 v6, s0
-; GFX9-NEXT: ds_write_b8 v0, v2 offset:8
-; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:10
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: s_lshr_b32 s2, s0, 8
-; GFX9-NEXT: s_lshr_b32 s0, s0, 24
-; GFX9-NEXT: v_mov_b32_e32 v8, s0
-; GFX9-NEXT: s_lshr_b32 s0, s1, 8
 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: ds_write_b8 v0, v6
-; GFX9-NEXT: ds_write_b8_d16_hi v0, v6 offset:2
-; GFX9-NEXT: v_mov_b32_e32 v6, s0
-; GFX9-NEXT: s_lshr_b32 s0, s3, 24
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:12
-; GFX9-NEXT: ds_write_b8 v0, v5 offset:4
-; GFX9-NEXT: ds_write_b8 v0, v2 offset:13
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: s_lshr_b32 s0, s1, 24
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:14
-; GFX9-NEXT: ds_write_b8 v0, v2 offset:15
-; GFX9-NEXT: ds_write_b8 v0, v3 offset:9
-; GFX9-NEXT: ds_write_b8 v0, v4 offset:11
-; GFX9-NEXT: ds_write_b8 v0, v6 offset:5
-; GFX9-NEXT: v_mov_b32_e32 v7, s2
+; GFX9-NEXT: ds_write_b8 v0, v2 offset:8
+; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:10
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: ds_write_b8 v0, v1 offset:4
+; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6
 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: ds_write_b8_d16_hi v0, v5 offset:6
+; GFX9-NEXT: s_lshr_b32 s4, s3, 8
+; GFX9-NEXT: ds_write_b8 v0, v1
+; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: s_lshr_b32 s3, s3, 24
+; GFX9-NEXT: ds_write_b8 v0, v1 offset:13
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_lshr_b32 s3, s2, 8
+; GFX9-NEXT: ds_write_b8 v0, v1 offset:15
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_lshr_b32 s2, s2, 24
+; GFX9-NEXT: ds_write_b8 v0, v1 offset:9
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: s_lshr_b32 s2, s1, 8
+; GFX9-NEXT: ds_write_b8 v0, v1 offset:11
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: s_lshr_b32 s1, s1, 24
+; GFX9-NEXT: ds_write_b8 v0, v1 offset:5
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_lshr_b32 s1, s0, 8
 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7
-; GFX9-NEXT: ds_write_b8 v0, v7 offset:1
-; GFX9-NEXT: ds_write_b8 v0, v8 offset:3
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_lshr_b32 s0, s0, 24
+; GFX9-NEXT: ds_write_b8 v0, v1 offset:1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: ds_write_b8 v0, v1 offset:3
 ; GFX9-NEXT: s_endpgm
 ;
 ; GFX7-LABEL: store_lds_v4i32_align1:
@@ -100,50 +100,50 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX7-NEXT: s_mov_b32 m0, -1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: s_lshr_b32 s4, s3, 8
-; GFX7-NEXT: v_mov_b32_e32 v5, s4
-; GFX7-NEXT: s_lshr_b32 s4, s3, 16
 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshr_b32 s3, s3, 24
-; GFX7-NEXT: ds_write_b8 v0, v5 offset:13
-; GFX7-NEXT: v_mov_b32_e32 v5, s3
-; GFX7-NEXT: s_lshr_b32 s3, s2, 8
-; GFX7-NEXT: v_mov_b32_e32 v6, s4
-; GFX7-NEXT: ds_write_b8 v0, v5 offset:15
-; GFX7-NEXT: ds_write_b8 v0, v6 offset:14
-; GFX7-NEXT: v_mov_b32_e32 v5, s3
 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: s_lshr_b32 s3, s2, 16
-; GFX7-NEXT: s_lshr_b32 s2, s2, 24
+; GFX7-NEXT: ds_write_b8 v0, v1 offset:12
 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:8
-; GFX7-NEXT: ds_write_b8 v0, v5 offset:9
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: ds_write_b8 v0, v1 offset:4
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_lshr_b32 s4, s3, 8
+; GFX7-NEXT: ds_write_b8 v0, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_lshr_b32 s4, s3, 24
+; GFX7-NEXT: ds_write_b8 v0, v1 offset:13
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_lshr_b32 s3, s3, 16
+; GFX7-NEXT: ds_write_b8 v0, v1 offset:15
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: s_lshr_b32 s3, s2, 8
+; GFX7-NEXT: ds_write_b8 v0, v1 offset:14
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: s_lshr_b32 s3, s2, 24
+; GFX7-NEXT: ds_write_b8 v0, v1 offset:9
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: s_lshr_b32 s2, s2, 16
+; GFX7-NEXT: ds_write_b8 v0, v1 offset:11
+; GFX7-NEXT: v_mov_b32_e32 v1, s2
 ; GFX7-NEXT: s_lshr_b32 s2, s1, 8
-; GFX7-NEXT: v_mov_b32_e32 v6, s3
-; GFX7-NEXT: ds_write_b8 v0, v1 offset:12
-; GFX7-NEXT: ds_write_b8 v0, v2 offset:11
-; GFX7-NEXT: ds_write_b8 v0, v6 offset:10
+; GFX7-NEXT: ds_write_b8 v0, v1 offset:10
 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
-; GFX7-NEXT: s_lshr_b32 s2, s1, 16
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: s_lshr_b32 s1, s1, 24
+; GFX7-NEXT: s_lshr_b32 s2, s1, 24
 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:5
+; GFX7-NEXT: v_mov_b32_e32 v1, s2
+; GFX7-NEXT: s_lshr_b32 s1, s1, 16
+; GFX7-NEXT: ds_write_b8 v0, v1 offset:7
 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
 ; GFX7-NEXT: s_lshr_b32 s1, s0, 8
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: ds_write_b8 v0, v1 offset:7
-; GFX7-NEXT: ds_write_b8 v0, v2 offset:6
+; GFX7-NEXT: ds_write_b8 v0, v1 offset:6
 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: s_lshr_b32 s1, s0, 16
-; GFX7-NEXT: s_lshr_b32 s0, s0, 24
-; GFX7-NEXT: ds_write_b8 v0, v4
+; GFX7-NEXT: s_lshr_b32 s1, s0, 24
 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:1
-; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: v_mov_b32_e32 v1, s0
-; GFX7-NEXT: ds_write_b8 v0, v3 offset:4
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: s_lshr_b32 s0, s0, 16
 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:3
-; GFX7-NEXT: ds_write_b8 v0, v2 offset:2
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: ds_write_b8 v0, v1 offset:2
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX6-LABEL: store_lds_v4i32_align1:
@@ -153,50 +153,50 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX6-NEXT: s_mov_b32 m0, -1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: s_lshr_b32 s4, s3, 8
-; GFX6-NEXT: v_mov_b32_e32 v5, s4
-; GFX6-NEXT: s_lshr_b32 s4, s3, 16
 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
-; GFX6-NEXT: s_lshr_b32 s3, s3, 24
-; GFX6-NEXT: ds_write_b8 v0, v5 offset:13
-; GFX6-NEXT: v_mov_b32_e32 v5, s3
-; GFX6-NEXT: s_lshr_b32 s3, s2, 8
-; GFX6-NEXT: v_mov_b32_e32 v6, s4
-; GFX6-NEXT: ds_write_b8 v0, v5 offset:15
-; GFX6-NEXT: ds_write_b8 v0, v6 offset:14
-; GFX6-NEXT: v_mov_b32_e32 v5, s3
 ; GFX6-NEXT: v_mov_b32_e32 v2, s2
-; GFX6-NEXT: s_lshr_b32 s3, s2, 16
-; GFX6-NEXT: s_lshr_b32 s2, s2, 24
+; GFX6-NEXT: ds_write_b8 v0, v1 offset:12
 ; GFX6-NEXT: ds_write_b8 v0, v2 offset:8
-; GFX6-NEXT: ds_write_b8 v0, v5 offset:9
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: ds_write_b8 v0, v1 offset:4
+; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: s_lshr_b32 s4, s3, 8
+; GFX6-NEXT: ds_write_b8 v0, v1
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_lshr_b32 s4, s3, 24
+; GFX6-NEXT: ds_write_b8 v0, v1 offset:13
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_lshr_b32 s3, s3, 16
+; GFX6-NEXT: ds_write_b8 v0, v1 offset:15
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: s_lshr_b32 s3, s2, 8
+; GFX6-NEXT: ds_write_b8 v0, v1 offset:14
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: s_lshr_b32 s3, s2, 24
+; GFX6-NEXT: ds_write_b8 v0, v1 offset:9
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: s_lshr_b32 s2, s2, 16
+; GFX6-NEXT: ds_write_b8 v0, v1 offset:11
+; GFX6-NEXT: v_mov_b32_e32 v1, s2
 ; GFX6-NEXT: s_lshr_b32 s2, s1, 8
-; GFX6-NEXT: v_mov_b32_e32 v6, s3
-; GFX6-NEXT: ds_write_b8 v0, v1 offset:12
-; GFX6-NEXT: ds_write_b8 v0, v2 offset:11
-; GFX6-NEXT: ds_write_b8 v0, v6 offset:10
+; GFX6-NEXT: ds_write_b8 v0, v1 offset:10
 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: s_lshr_b32 s2, s1, 16
-; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: s_lshr_b32 s1, s1, 24
+; GFX6-NEXT: s_lshr_b32 s2, s1, 24
 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:5
+; GFX6-NEXT: v_mov_b32_e32 v1, s2
+; GFX6-NEXT: s_lshr_b32 s1, s1, 16
+; GFX6-NEXT: ds_write_b8 v0, v1 offset:7
 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
 ; GFX6-NEXT: s_lshr_b32 s1, s0, 8
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
-; GFX6-NEXT: ds_write_b8 v0, v1 offset:7
-; GFX6-NEXT: ds_write_b8 v0, v2 offset:6
+; GFX6-NEXT: ds_write_b8 v0, v1 offset:6
 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: v_mov_b32_e32 v4, s0
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
-; GFX6-NEXT: s_lshr_b32 s0, s0, 24
-; GFX6-NEXT: ds_write_b8 v0, v4
+; GFX6-NEXT: s_lshr_b32 s1, s0, 24
 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:1
-; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: ds_write_b8 v0, v3 offset:4
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:3
-; GFX6-NEXT: ds_write_b8 v0, v2 offset:2
+; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: ds_write_b8 v0, v1 offset:2
 ; GFX6-NEXT: s_endpgm
 store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
 ret void
@@ -210,17 +210,17 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: ds_write_b16 v0, v1 offset:12
 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:14
-; GFX9-NEXT: ds_write_b16 v0, v4
-; GFX9-NEXT: ds_write_b16 v0, v3 offset:4
 ; GFX9-NEXT: ds_write_b16 v0, v2 offset:8
-; GFX9-NEXT: ds_write_b16 v0, v1 offset:12
 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:10
-; GFX9-NEXT: ds_write_b16_d16_hi v0, v3 offset:6
-; GFX9-NEXT: ds_write_b16_d16_hi v0, v4 offset:2
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: ds_write_b16 v0, v1 offset:4
+; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: ds_write_b16 v0, v1
+; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2
 ; GFX9-NEXT: s_endpgm
 ;
 ; GFX7-LABEL: store_lds_v4i32_align2:
@@ -230,26 +230,26 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
 ; GFX7-NEXT: s_mov_b32 m0, -1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: s_lshr_b32 s0, s0, 16
-; GFX7-NEXT: v_mov_b32_e32 v5, s0
-; GFX7-NEXT: s_lshr_b32 s0, s1, 16
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: ds_write_b16 v0, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: s_lshr_b32 s0, s2, 16
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: ds_write_b16 v0, v3 offset:4
-; GFX7-NEXT: v_mov_b32_e32 v3, s0
-; GFX7-NEXT: s_lshr_b32 s0, s3, 16
+; GFX7-NEXT: ds_write_b16 v0, v1 offset:12
 ; GFX7-NEXT: ds_write_b16 v0, v2 offset:8
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: ds_write_b16 v0, v1 offset:4
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_lshr_b32 s3, s3, 16
+; GFX7-NEXT: ds_write_b16 v0, v1
 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: ds_write_b16 v0, v2 offset:14
-; GFX7-NEXT: ds_write_b16 v0, v1 offset:12
-; GFX7-NEXT: ds_write_b16 v0, v3 offset:10
-; GFX7-NEXT: ds_write_b16 v0, v4 offset:6
-; GFX7-NEXT: ds_write_b16 v0, v5 offset:2
+; GFX7-NEXT: s_lshr_b32 s2, s2, 16
+; GFX7-NEXT: ds_write_b16 v0, v1 offset:14
+; GFX7-NEXT: v_mov_b32_e32 v1, s2
+; GFX7-NEXT: s_lshr_b32 s1, s1, 16
+; GFX7-NEXT: ds_write_b16 v0, v1 offset:10
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: s_lshr_b32 s0, s0, 16
+; GFX7-NEXT: ds_write_b16 v0, v1 offset:6
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: ds_write_b16 v0, v1 offset:2
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX6-LABEL: store_lds_v4i32_align2:
@@ -259,26 +259,26 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
 ; GFX6-NEXT: s_mov_b32 m0, -1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: v_mov_b32_e32 v4, s0
-; GFX6-NEXT: s_lshr_b32 s0, s0, 16
-; GFX6-NEXT: v_mov_b32_e32 v5, s0
-; GFX6-NEXT: s_lshr_b32 s0, s1, 16
-; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: ds_write_b16 v0, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, s0
-; GFX6-NEXT: s_lshr_b32 s0, s2, 16
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
 ; GFX6-NEXT: v_mov_b32_e32 v2, s2
-; GFX6-NEXT: ds_write_b16 v0, v3 offset:4
-; GFX6-NEXT: v_mov_b32_e32 v3, s0
-; GFX6-NEXT: s_lshr_b32 s0, s3, 16
+; GFX6-NEXT: ds_write_b16 v0, v1 offset:12
 ; GFX6-NEXT: ds_write_b16 v0, v2 offset:8
-; GFX6-NEXT: v_mov_b32_e32 v2, s0
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: ds_write_b16 v0, v1 offset:4
+; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: s_lshr_b32 s3, s3, 16
+; GFX6-NEXT: ds_write_b16 v0, v1
 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
-; GFX6-NEXT: ds_write_b16 v0, v2 offset:14
-; GFX6-NEXT: ds_write_b16 v0, v1 offset:12
-; GFX6-NEXT: ds_write_b16 v0, v3 offset:10
-; GFX6-NEXT: ds_write_b16 v0, v4 offset:6
-; GFX6-NEXT: ds_write_b16 v0, v5 offset:2
+; GFX6-NEXT: s_lshr_b32 s2, s2, 16
+; GFX6-NEXT: ds_write_b16 v0, v1 offset:14
+; GFX6-NEXT: v_mov_b32_e32 v1, s2
+; GFX6-NEXT: s_lshr_b32 s1, s1, 16
+; GFX6-NEXT: ds_write_b16 v0, v1 offset:10
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: s_lshr_b32 s0, s0, 16
+; GFX6-NEXT: ds_write_b16 v0, v1 offset:6
+; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: ds_write_b16 v0, v1 offset:2
 ; GFX6-NEXT: s_endpgm
 store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2
 ret void
@@ -307,10 +307,10 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out,
 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
 ; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: v_mov_b32_e32 v3, s2
-; GFX7-NEXT: v_mov_b32_e32 v4, s3
 ; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
-; GFX7-NEXT: ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
+; GFX7-NEXT: v_mov_b32_e32 v1, s2
+; GFX7-NEXT: v_mov_b32_e32 v2, s3
+; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX6-LABEL: store_lds_v4i32_align4:
diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll
index 351b632d06479..41fdb1cbd61be 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll
@@ -36,10 +36,10 @@ define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i3
 ; GFX6-NEXT: s_mov_b32 m0, -1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: v_mov_b32_e32 v3, s2
+; GFX6-NEXT: v_mov_b32_e32 v1, s2
+; GFX6-NEXT: ds_write_b32 v2, v1 offset:8
 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: ds_write_b32 v2, v3 offset:8
 ; GFX6-NEXT: ds_write_b64 v2, v[0:1]
 ; GFX6-NEXT: s_endpgm
 store <3 x i32> %x, <3 x i32> addrspace(3)* %out
@@ -53,33 +53,33 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: s_lshr_b32 s3, s2, 8
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: s_lshr_b32 s3, s0, 8
-; GFX9-NEXT: s_lshr_b32 s0, s0, 24
-; GFX9-NEXT: v_mov_b32_e32 v6, s0
-; GFX9-NEXT: s_lshr_b32 s0, s1, 8
 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: ds_write_b8 v0, v4
-; GFX9-NEXT: ds_write_b8_d16_hi v0, v4 offset:2
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: s_lshr_b32 s0, s2, 24
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:8
-; GFX9-NEXT: ds_write_b8 v0, v3 offset:4
-; GFX9-NEXT: ds_write_b8 v0, v2 offset:9
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: s_lshr_b32 s0, s1, 24
 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10
-; GFX9-NEXT: ds_write_b8 v0, v2 offset:11
-; GFX9-NEXT: ds_write_b8 v0, v4 offset:5
-; GFX9-NEXT: v_mov_b32_e32 v5, s3
+; GFX9-NEXT: ds_write_b8 v0, v2 offset:4
+; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6
 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: ds_write_b8_d16_hi v0, v3 offset:6
+; GFX9-NEXT: s_lshr_b32 s3, s2, 8
+; GFX9-NEXT: ds_write_b8 v0, v1
+; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_lshr_b32 s2, s2, 24
+; GFX9-NEXT: ds_write_b8 v0, v1 offset:9
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: s_lshr_b32 s2, s1, 8
+; GFX9-NEXT: ds_write_b8 v0, v1 offset:11
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: s_lshr_b32 s1, s1, 24
+; GFX9-NEXT: ds_write_b8 v0, v1 offset:5
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_lshr_b32 s1, s0, 8
 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7
-; GFX9-NEXT: ds_write_b8 v0, v5 offset:1
-; GFX9-NEXT: ds_write_b8 v0, v6 offset:3
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_lshr_b32 s0, s0, 24
+; GFX9-NEXT: ds_write_b8 v0, v1 offset:1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: ds_write_b8 v0, v1 offset:3
 ; GFX9-NEXT: s_endpgm
 ;
 ; GFX7-LABEL: store_lds_v3i32_align1:
@@ -89,39 +89,39 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX7-NEXT: s_mov_b32 m0, -1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: s_lshr_b32 s3, s2, 8
-; GFX7-NEXT: v_mov_b32_e32 v4, s3
 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
-; GFX7-NEXT: s_lshr_b32 s3, s2, 16
-; GFX7-NEXT: s_lshr_b32 s2, s2, 24
+; GFX7-NEXT: v_mov_b32_e32 v2, s1
 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:8
-; GFX7-NEXT: ds_write_b8 v0, v4 offset:9
+; GFX7-NEXT: ds_write_b8 v0, v2 offset:4
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_lshr_b32 s3, s2, 8
+; GFX7-NEXT: ds_write_b8 v0, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: s_lshr_b32 s3, s2, 24
+; GFX7-NEXT: ds_write_b8 v0, v1 offset:9
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: s_lshr_b32 s2, s2, 16
+; GFX7-NEXT: ds_write_b8 v0, v1 offset:11
 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
 ; GFX7-NEXT: s_lshr_b32 s2, s1, 8
-; GFX7-NEXT: v_mov_b32_e32 v5, s3
-; GFX7-NEXT: ds_write_b8 v0, v1 offset:11
-; GFX7-NEXT: ds_write_b8 v0, v5 offset:10
+; GFX7-NEXT: ds_write_b8 v0, v1 offset:10
 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
-; GFX7-NEXT: s_lshr_b32 s2, s1, 16
-; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_lshr_b32 s1, s1, 24
+; GFX7-NEXT: s_lshr_b32 s2, s1, 24
 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:5
+; GFX7-NEXT: v_mov_b32_e32 v1, s2
+; GFX7-NEXT: s_lshr_b32 s1, s1, 16
+; GFX7-NEXT: ds_write_b8 v0, v1 offset:7
 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
 ; GFX7-NEXT: s_lshr_b32 s1, s0, 8
-; GFX7-NEXT: v_mov_b32_e32 v4, s2
-; GFX7-NEXT: ds_write_b8 v0, v1 offset:7
-; GFX7-NEXT: ds_write_b8 v0, v4 offset:6
+; GFX7-NEXT: ds_write_b8 v0, v1 offset:6
 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v3, s0
-; GFX7-NEXT: s_lshr_b32 s1, s0, 16
-; GFX7-NEXT: s_lshr_b32 s0, s0, 24
-; GFX7-NEXT: ds_write_b8 v0, v3
+; GFX7-NEXT: s_lshr_b32 s1, s0, 24
 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:1
-; GFX7-NEXT: v_mov_b32_e32 v4, s1
-; GFX7-NEXT: v_mov_b32_e32 v1, s0
-; GFX7-NEXT: ds_write_b8 v0, v2 offset:4
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: s_lshr_b32 s0, s0, 16
 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:3
-; GFX7-NEXT: ds_write_b8 v0, v4 offset:2
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: ds_write_b8 v0, v1 offset:2
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX6-LABEL: store_lds_v3i32_align1:
@@ -131,39 +131,39 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX6-NEXT: s_mov_b32 m0, -1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: s_lshr_b32 s3, s2, 8
-; GFX6-NEXT: v_mov_b32_e32 v4, s3
 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: s_lshr_b32 s3, s2, 16
-; GFX6-NEXT: s_lshr_b32 s2, s2, 24
+; GFX6-NEXT: v_mov_b32_e32 v2, s1
 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:8
-; GFX6-NEXT: ds_write_b8 v0, v4 offset:9
+; GFX6-NEXT: ds_write_b8 v0, v2 offset:4
+; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: s_lshr_b32 s3, s2, 8
+; GFX6-NEXT: ds_write_b8 v0, v1
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: s_lshr_b32 s3, s2, 24
+; GFX6-NEXT: ds_write_b8 v0, v1 offset:9
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: s_lshr_b32 s2, s2, 16
+; GFX6-NEXT: ds_write_b8 v0, v1 offset:11
 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
 ; GFX6-NEXT: s_lshr_b32 s2, s1, 8
-; GFX6-NEXT: v_mov_b32_e32 v5, s3
-; GFX6-NEXT: ds_write_b8 v0, v1 offset:11
-; GFX6-NEXT: ds_write_b8 v0, v5 offset:10
+; GFX6-NEXT: ds_write_b8 v0, v1 offset:10
 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: s_lshr_b32 s2, s1, 16
-; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: s_lshr_b32 s1, s1, 24
+; GFX6-NEXT: s_lshr_b32 s2, s1, 24
 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:5
+; GFX6-NEXT: v_mov_b32_e32 v1, s2
+; GFX6-NEXT: s_lshr_b32 s1, s1, 16
+; GFX6-NEXT: ds_write_b8 v0, v1 offset:7
 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
 ; GFX6-NEXT: s_lshr_b32 s1, s0, 8
-; GFX6-NEXT: v_mov_b32_e32 v4, s2
-; GFX6-NEXT: ds_write_b8 v0, v1 offset:7
-; GFX6-NEXT: ds_write_b8 v0, v4 offset:6
+; GFX6-NEXT: ds_write_b8 v0, v1 offset:6
 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: v_mov_b32_e32 v3, s0
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
-; GFX6-NEXT: s_lshr_b32 s0, s0, 24
-; GFX6-NEXT: ds_write_b8 v0, v3
+; GFX6-NEXT: s_lshr_b32 s1, s0, 24
 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:1
-; GFX6-NEXT: v_mov_b32_e32 v4, s1
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: ds_write_b8 v0, v2 offset:4
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:3
-; GFX6-NEXT: ds_write_b8 v0, v4 offset:2
+; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: ds_write_b8 v0, v1 offset:2
 ; GFX6-NEXT: s_endpgm
 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
 ret void
@@ -178,13 +178,13 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: ds_write_b16 v0, v1 offset:8
 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10
-; GFX9-NEXT: ds_write_b16 v0, v3
 ; GFX9-NEXT: ds_write_b16 v0, v2 offset:4
-; GFX9-NEXT: ds_write_b16 v0, v1 offset:8
 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:6
-; GFX9-NEXT: ds_write_b16_d16_hi v0, v3 offset:2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: ds_write_b16 v0, v1
+; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2
 ; GFX9-NEXT: s_endpgm
 ;
 ; GFX7-LABEL: store_lds_v3i32_align2:
@@ -194,21 +194,21 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
 ; GFX7-NEXT: s_mov_b32 m0, -1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s0
-; GFX7-NEXT: s_lshr_b32 s0, s0, 16
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: s_lshr_b32 s0, s1, 16
+; GFX7-NEXT: v_mov_b32_e32 v1, s2
 ; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: ds_write_b16 v0, v3
-; GFX7-NEXT: v_mov_b32_e32 v3, s0
-; GFX7-NEXT: s_lshr_b32 s0, s2, 16
+; GFX7-NEXT: ds_write_b16 v0, v1 offset:8
 ; GFX7-NEXT: ds_write_b16 v0, v2 offset:4
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_lshr_b32 s2, s2, 16
+; GFX7-NEXT: ds_write_b16 v0, v1
 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
-; GFX7-NEXT: ds_write_b16 v0, v2 offset:10
-; GFX7-NEXT: ds_write_b16 v0, v1 offset:8
-; GFX7-NEXT: ds_write_b16 v0, v3 offset:6
-; GFX7-NEXT: ds_write_b16 v0, v4 offset:2
+; GFX7-NEXT: s_lshr_b32 s1, s1, 16
+; GFX7-NEXT: ds_write_b16 v0, v1 offset:10
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: s_lshr_b32 s0, s0, 16
+; GFX7-NEXT: ds_write_b16 v0, v1 offset:6
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: ds_write_b16 v0, v1 offset:2
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX6-LABEL: store_lds_v3i32_align2:
@@ -218,21 +218,21 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
 ; GFX6-NEXT: s_mov_b32 m0, -1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: v_mov_b32_e32 v3, s0
-; GFX6-NEXT: s_lshr_b32 s0, s0, 16
-; GFX6-NEXT: v_mov_b32_e32 v4, s0
-; GFX6-NEXT: s_lshr_b32 s0, s1, 16
+; GFX6-NEXT: v_mov_b32_e32 v1, s2
 ; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: ds_write_b16 v0, v3
-; GFX6-NEXT: v_mov_b32_e32 v3, s0
-; GFX6-NEXT: s_lshr_b32 s0, s2, 16
+; GFX6-NEXT: ds_write_b16 v0, v1 offset:8
 ; GFX6-NEXT: ds_write_b16 v0, v2 offset:4
-; GFX6-NEXT: v_mov_b32_e32 v2, s0
+; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: s_lshr_b32 s2, s2, 16
+; GFX6-NEXT: ds_write_b16 v0, v1
 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: ds_write_b16 v0, v2 offset:10
-; GFX6-NEXT: ds_write_b16 v0, v1 offset:8
-; GFX6-NEXT: ds_write_b16 v0, v3 offset:6
-; GFX6-NEXT: ds_write_b16 v0, v4 offset:2
+; GFX6-NEXT: s_lshr_b32 s1, s1, 16
+; GFX6-NEXT: ds_write_b16 v0, v1 offset:10
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: s_lshr_b32 s0, s0, 16
+; GFX6-NEXT: ds_write_b16 v0, v1 offset:6
+; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: ds_write_b16 v0, v1 offset:2
 ; GFX6-NEXT: s_endpgm
 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2
 ret void
@@ -260,9 +260,9 @@ define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out,
 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
 ; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: v_mov_b32_e32 v3, s2
 ; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
-; GFX7-NEXT: ds_write_b32 v0, v3 offset:8
+; GFX7-NEXT: v_mov_b32_e32 v1, s2
+; GFX7-NEXT: ds_write_b32 v0, v1 offset:8
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX6-LABEL: store_lds_v3i32_align4:
@@ -302,10 +302,10 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out,
 ; GFX7-NEXT: s_mov_b32 m0, -1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s2
+; GFX7-NEXT: ds_write_b32 v2, v1 offset:8
 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: ds_write_b32 v2, v3 offset:8
 ; GFX7-NEXT: ds_write_b64 v2, v[0:1]
 ; GFX7-NEXT: s_endpgm
 ;
@@ -316,10 +316,10 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out,
 ; GFX6-NEXT: s_mov_b32 m0, -1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: v_mov_b32_e32 v3, s2
+; GFX6-NEXT: v_mov_b32_e32 v1, s2
+; GFX6-NEXT: ds_write_b32 v2, v1 offset:8
 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: ds_write_b32 v2, v3 offset:8
 ; GFX6-NEXT: ds_write_b64 v2, v[0:1]
 ; GFX6-NEXT: s_endpgm
 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 8
@@ -359,10 +359,10 @@ define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out,
 ; GFX6-NEXT: s_mov_b32 m0, -1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: v_mov_b32_e32 v3, s2
+; GFX6-NEXT: v_mov_b32_e32 v1, s2
+; GFX6-NEXT: ds_write_b32 v2, v1 offset:8
 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: ds_write_b32 v2, v3 offset:8
 ; GFX6-NEXT: ds_write_b64 v2, v[0:1]
 ; GFX6-NEXT: s_endpgm
 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 16
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index 70c5655fe8117..90336ca79ac29 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -6,14 +6,14 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
 ; CIVI-LABEL: local_store_i56:
 ; CIVI: ; %bb.0:
-; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIVI-NEXT: s_mov_b32 m0, -1
-; CIVI-NEXT: ds_write_b32 v0, v1
-; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; CIVI-NEXT: ds_write_b16 v0, v2 offset:4
-; CIVI-NEXT: ds_write_b8 v0, v1 offset:6
-; CIVI-NEXT: s_waitcnt lgkmcnt(0)
-; CIVI-NEXT: s_setpc_b64 s[30:31]
+; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT: s_mov_b32 m0, -1
+; CIVI-NEXT: ds_write_b16 v0, v2 offset:4
+; CIVI-NEXT: ds_write_b32 v0, v1
+; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; CIVI-NEXT: ds_write_b8 v0, v1 offset:6
+; CIVI-NEXT: s_waitcnt lgkmcnt(0)
+; CIVI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: local_store_i56:
 ; GFX9: ; %bb.0:
@@ -30,70 +30,70 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
 define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 {
 ; HAWAII-LABEL: local_store_i55:
 ; HAWAII: ; %bb.0:
-; HAWAII-NEXT: s_or_b32 s0, s4, 14
-; HAWAII-NEXT: v_mov_b32_e32 v0, s0
-; HAWAII-NEXT: v_mov_b32_e32 v1, s5
-; HAWAII-NEXT: flat_load_ubyte v0, v[0:1]
-; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0
-; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2
-; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3
-; HAWAII-NEXT: s_mov_b32 m0, -1
-; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
-; HAWAII-NEXT: v_mov_b32_e32 v1, s0
-; HAWAII-NEXT: v_mov_b32_e32 v2, s1
-; HAWAII-NEXT: v_mov_b32_e32 v3, s2
-; HAWAII-NEXT: s_waitcnt vmcnt(0)
-; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0
-; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6
-; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4
-; HAWAII-NEXT: ds_write_b32 v1, v2
-; HAWAII-NEXT: s_endpgm
+; HAWAII-NEXT: s_or_b32 s0, s4, 14
+; HAWAII-NEXT: v_mov_b32_e32 v0, s0
+; HAWAII-NEXT: v_mov_b32_e32 v1, s5
+; HAWAII-NEXT: flat_load_ubyte v0, v[0:1]
+; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3
+; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0
+; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2
+; HAWAII-NEXT: s_mov_b32 m0, -1
+; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
+; HAWAII-NEXT: v_mov_b32_e32 v1, s0
+; HAWAII-NEXT: v_mov_b32_e32 v3, s2
+; HAWAII-NEXT: v_mov_b32_e32 v2, s1
+; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4
+; HAWAII-NEXT: s_waitcnt vmcnt(0)
+; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6
+; HAWAII-NEXT: ds_write_b32 v1, v2
+; HAWAII-NEXT: s_endpgm
 ;
 ; FIJI-LABEL: local_store_i55:
 ; FIJI: ; %bb.0:
-; FIJI-NEXT: s_or_b32 s0, s4, 14
-; FIJI-NEXT: v_mov_b32_e32 v0, s0
-; FIJI-NEXT: v_mov_b32_e32 v1, s5
-; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
-; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0
-; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8
-; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc
-; FIJI-NEXT: s_mov_b32 m0, -1
-; FIJI-NEXT: s_waitcnt lgkmcnt(0)
-; FIJI-NEXT: v_mov_b32_e32 v1, s0
-; FIJI-NEXT: v_mov_b32_e32 v3, s1
-; FIJI-NEXT: s_and_b32 s3, s2, 0xffff
-; FIJI-NEXT: v_mov_b32_e32 v2, s2
-; FIJI-NEXT: s_waitcnt vmcnt(0)
-; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; FIJI-NEXT: v_or_b32_e32 v0, s3, v0
-; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7
-; FIJI-NEXT: ds_write_b8 v1, v0 offset:6
-; FIJI-NEXT: ds_write_b16 v1, v2 offset:4
-; FIJI-NEXT: ds_write_b32 v1, v3
-; FIJI-NEXT: s_endpgm
+; FIJI-NEXT: s_or_b32 s0, s4, 14
+; FIJI-NEXT: v_mov_b32_e32 v0, s0
+; FIJI-NEXT: v_mov_b32_e32 v1, s5
+; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
+; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0
+; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8
+; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc
+; FIJI-NEXT: s_mov_b32 m0, -1
+; FIJI-NEXT: s_waitcnt lgkmcnt(0)
+; FIJI-NEXT: v_mov_b32_e32 v1, s0
+; FIJI-NEXT: v_mov_b32_e32 v3, s1
+; FIJI-NEXT: s_and_b32 s3, s2, 0xffff
+; FIJI-NEXT: v_mov_b32_e32 v2, s2
+; FIJI-NEXT: ds_write_b16 v1, v2 offset:4
+; FIJI-NEXT: s_waitcnt vmcnt(0)
+; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; FIJI-NEXT: v_or_b32_e32 v0, s3, v0
+; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7
+; FIJI-NEXT: ds_write_b8 v1, v0 offset:6
+; FIJI-NEXT: ds_write_b32 v1, v3
+; FIJI-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: local_store_i55:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8
-; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: s_and_b32 s3, s2, 0xffff
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_e32 v2, s3, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 0x7fffff, v2
-; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6
-; GFX9-NEXT: ds_write_b16 v0, v1 offset:4
-; GFX9-NEXT: ds_write_b32 v0, v3
-; GFX9-NEXT: s_endpgm
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: s_and_b32 s3, s2, 0xffff
+; GFX9-NEXT: ds_write_b16 v0, v1 offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v1, s3, v2
+; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffff, v1
+; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6
+; GFX9-NEXT: ds_write_b32 v0, v3
+; GFX9-NEXT: s_endpgm
 store i55 %arg, i55 addrspace(3)* %ptr, align 8
 ret void
 }
@@ -101,31 +101,31 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0
 define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 {
 ; HAWAII-LABEL: local_store_i48:
 ; HAWAII: ; %bb.0:
-; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0
-; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2
-; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3
-; HAWAII-NEXT: s_mov_b32 m0, -1
-; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
-; HAWAII-NEXT: v_mov_b32_e32 v0, s0
-; HAWAII-NEXT: v_mov_b32_e32 v2, s1
-; HAWAII-NEXT: v_mov_b32_e32 v1, s2
-; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4
-; HAWAII-NEXT: ds_write_b32 v0, v2
-; HAWAII-NEXT: s_endpgm
+; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0
+; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2
+; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3
+; HAWAII-NEXT: s_mov_b32 m0, -1
+; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
+; HAWAII-NEXT: v_mov_b32_e32 v0, s0
+; HAWAII-NEXT: v_mov_b32_e32 v1, s2
+; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4
+; HAWAII-NEXT: v_mov_b32_e32 v1, s1
+; HAWAII-NEXT: ds_write_b32 v0, v1
+; HAWAII-NEXT: s_endpgm
 ;
 ; FIJI-LABEL: local_store_i48:
 ; FIJI: ; %bb.0:
-; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0
-; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8
-; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc
-; FIJI-NEXT:
s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v2, s1 -; FIJI-NEXT: v_mov_b32_e32 v1, s2 -; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 -; FIJI-NEXT: ds_write_b32 v0, v2 -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 +; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s2 +; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 +; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: ds_write_b32 v0, v1 +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i48: ; GFX9: ; %bb.0: @@ -146,35 +146,35 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 { ; HAWAII-LABEL: local_store_i65: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v2, s2 -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: s_and_b32 s3, s3, 1 -; HAWAII-NEXT: v_mov_b32_e32 v3, s3 -; HAWAII-NEXT: v_mov_b32_e32 v1, s1 -; HAWAII-NEXT: ds_write_b8 v2, v3 offset:8 -; HAWAII-NEXT: ds_write_b64 v2, v[0:1] -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v2, s2 +; HAWAII-NEXT: s_and_b32 s3, s3, 1 +; HAWAII-NEXT: v_mov_b32_e32 v0, s3 +; HAWAII-NEXT: ds_write_b8 v2, v0 offset:8 +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 +; HAWAII-NEXT: ds_write_b64 v2, v[0:1] +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i65: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10 -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v2, s2 -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: s_and_b32 s3, s3, 1 -; FIJI-NEXT: v_mov_b32_e32 v3, s3 -; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: ds_write_b8 v2, v3 offset:8 -; FIJI-NEXT: ds_write_b64 v2, v[0:1] -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10 +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: s_and_b32 s3, s3, 1 +; FIJI-NEXT: v_mov_b32_e32 v0, s3 +; FIJI-NEXT: ds_write_b8 v2, v0 offset:8 +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: ds_write_b64 v2, v[0:1] +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i65: ; GFX9: ; %bb.0: @@ -218,22 +218,22 @@ define void @local_store_i13(i13 addrspace(3)* %ptr, i13 %arg) #0 { define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 { ; CIVI-LABEL: local_store_i17: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; CIVI-NEXT: ds_write_b16 v0, v1 -; CIVI-NEXT: ds_write_b8 v0, v2 offset:2 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[30:31] +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: 
s_mov_b32 m0, -1 +; CIVI-NEXT: ds_write_b16 v0, v1 +; CIVI-NEXT: v_bfe_u32 v1, v1, 16, 1 +; CIVI-NEXT: ds_write_b8 v0, v1 offset:2 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: local_store_i17: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x1ffff, v1 -; GFX9-NEXT: ds_write_b16 v0, v1 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x1ffff, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] store i17 %arg, i17 addrspace(3)* %ptr, align 8 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir index 0fa0ddab4e11f..6759cd1040f85 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir +++ b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir @@ -110,7 +110,7 @@ body: | ; and inserting a spill. Here we just check that the point where the error ; occurs we see a correctly generated spill. ; GCN-LABEL: bb.7: - ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec + ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 @@ -126,7 +126,7 @@ body: | successors: %bb.12(0x80000000) ; GCN-LABEL: bb.9: - ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec + ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 @@ -137,7 +137,7 @@ body: | successors: %bb.12(0x80000000) ; GCN-LABEL: bb.10: - ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec + ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec undef %15.sub0:vreg_128 = V_MOV_B32_e32 2143289344, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 diff --git a/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll b/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll index 1648c7fe37ccb..e10cd44c6f3b0 100644 --- a/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll +++ b/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll @@ -5,37 +5,37 @@ ; GCN-LABEL: {{^}}token_factor_inline_limit_test: ; GCN-TFILD: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 -; GCN-TFILD: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 -; GCN-TFILD: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 -; GCN-TFILD: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; GCN-TFILD: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN-TFILD: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN-TFILD: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN-TFILD: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 ; GCN-TFILD: buffer_store_dword [[REG8]], {{.*$}} +; GCN-TFILD: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 ; GCN-TFILD: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN-TFILD: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 ; GCN-TFILD: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN-TFILD: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 ; GCN-TFILD: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN-TFILD: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 ; GCN-TFILD: 
buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN-TFILD: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 ; GCN-TFILD: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN-TFILD: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 ; GCN-TFILD: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN-TFILD: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 ; GCN-TFILD: buffer_store_dword [[REG15]], {{.*}} offset:28 ; GCN-TFIL7: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; GCN-TFIL7: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN-TFIL7: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN-TFIL7: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN-TFIL7: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; GCN-TFIL7: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 -; GCN-TFIL7: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 -; GCN-TFIL7: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 ; GCN-TFIL7: buffer_store_dword [[REG15]], {{.*}} offset:28 +; GCN-TFIL7: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 ; GCN-TFIL7: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN-TFIL7: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 ; GCN-TFIL7: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN-TFIL7: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 ; GCN-TFIL7: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN-TFIL7: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 ; GCN-TFIL7: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN-TFIL7: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 ; GCN-TFIL7: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN-TFIL7: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 ; GCN-TFIL7: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN-TFIL7: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 ; GCN-TFIL7: buffer_store_dword [[REG8]], {{.*$}} ; GCN: v_mov_b32_e32 v31, 7 diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir new file mode 100644 index 0000000000000..4905bcc06c622 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir @@ -0,0 +1,66 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s + +# Make sure no waitcnt is inserted for meta instruction uses. + +--- + +name: waitcnt_kill + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; GCN-LABEL: name: waitcnt_kill + ; GCN: S_WAITCNT 0 + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + ; GCN: KILL $vgpr0 + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + KILL $vgpr0 +... + +--- + +name: waitcnt_implicit_def + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; GCN-LABEL: name: waitcnt_implicit_def + ; GCN: S_WAITCNT 0 + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = IMPLICIT_DEF + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + $vgpr0 = IMPLICIT_DEF +... + +--- + +name: waitcnt_eh_label + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + ; GCN-LABEL: name: waitcnt_eh_label + ; GCN: S_WAITCNT 0 + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + ; GCN: EH_LABEL <mcsymbol .Ltmp0>, implicit $vgpr0 + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + EH_LABEL <mcsymbol .Ltmp0>, implicit $vgpr0 + +...
+ +--- + +name: waitcnt_cfi + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + ; GCN-LABEL: name: waitcnt_cfi + ; GCN: S_WAITCNT 0 + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + ; GCN: CFI_INSTRUCTION offset $vgpr0_lo16, 16 + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + CFI_INSTRUCTION offset $vgpr0, 16 + +... diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll index 4cbd89147722b..4d9c6a9a540fd 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll @@ -153,7 +153,9 @@ bb: ; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup: ; GCN: flat_load_dword -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8_9: s_waitcnt lgkmcnt(0){{$}} +; GFX8_9: s_waitcnt vmcnt(0){{$}} +; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX10: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_barrier define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) { diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll index bff7cf6809905..a56137757b411 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -135,12 +135,13 @@ define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) { ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s7, s7, 34 ; SI-NEXT: s_or_b32 s7, s7, 4 -; SI-NEXT: s_bfe_u32 s8, s7, 0x10010 ; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: s_bfe_u32 s8, s7, 0x10010 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: widen_i17_constant_load: @@ -157,9 +158,9 @@ define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) { ; VI-NEXT: s_or_b32 s0, s0, 4 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_bfe_u32 s0, s0, 0x10010 -; VI-NEXT: v_mov_b32_e32 v5, s0 ; VI-NEXT: flat_store_short v[0:1], v4 -; VI-NEXT: flat_store_byte v[2:3], v5 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: flat_store_byte v[2:3], v0 ; VI-NEXT: s_endpgm %load = load i17, i17 addrspace(4)* %arg, align 4 %add = add i17 %load, 34 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 127d0bc0fc686..860e58d33abf4 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -650,12 +650,12 @@ main_body: ; CHECK: image_store ; CHECK: s_wqm_b64 exec, exec ; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0 -; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000 +; CHECK-DAG: v_mov_b32_e32 [[SEVEN:v[0-9]+]], 0x40e00000 ; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body ; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]] ; CHECK: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop -; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]] +; CHECK: v_cmp_gt_f32_e32 vcc, [[CTR]], [[SEVEN]] ; CHECK: s_cbranch_vccz [[LOOPHDR]] ; CHECK: ; %break diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 1a48e76a241bb..e4beac77e1be2 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -94,10 +94,10 @@ define i32 @called(i32 %a) noinline { ; GFX9-LABEL: {{^}}call: define amdgpu_kernel void @call(<4 x i32> inreg %tmp14, i32 inreg %arg) { -; GFX9-O0: v_mov_b32_e32 v0, s0 +; GFX9-O0: 
v_mov_b32_e32 v0, s2 ; GFX9-O3: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0) @@ -142,8 +142,8 @@ define amdgpu_kernel void @call_i64(<4 x i32> inreg %tmp14, i64 inreg %arg) { ; GFX9-O0: buffer_store_dword v1 ; GFX9: s_swappc_b64 %tmp134 = call i64 @called_i64(i64 %tmp107) -; GFX9-O0: buffer_load_dword v4 -; GFX9-O0: buffer_load_dword v5 +; GFX9-O0: buffer_load_dword v6 +; GFX9-O0: buffer_load_dword v7 %tmp136 = add i64 %tmp134, %tmp107 %tmp137 = tail call i64 @llvm.amdgcn.wwm.i64(i64 %tmp136) %tmp138 = bitcast i64 %tmp137 to <2 x i32> diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll index 27cc1d3d6b45d..ab63bccd9dedd 100644 --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -87,6 +87,7 @@ ; CHECK-NEXT: Remove dead machine instructions ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Machine Natural Loop Construction +; CHECK-NEXT: Machine Block Frequency Analysis ; CHECK-NEXT: Early Machine Loop Invariant Code Motion ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Machine Block Frequency Analysis diff --git a/llvm/test/CodeGen/ARM/arm-storebytesmerge.ll b/llvm/test/CodeGen/ARM/arm-storebytesmerge.ll index fec6ea7ae8382..c7bd79e7ca1d2 100644 --- a/llvm/test/CodeGen/ARM/arm-storebytesmerge.ll +++ b/llvm/test/CodeGen/ARM/arm-storebytesmerge.ll @@ -1,11 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv7em-arm-none-eabi %s -o - | FileCheck %s -target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" -target triple = "thumbv7em-arm-none-eabi" - -; Function Attrs: nounwind -define arm_aapcs_vfpcc void @test(i8* %v50) #0 { +define arm_aapcs_vfpcc void @test(i8* %v50) { ; CHECK-LABEL: test: ; CHECK: @ %bb.0: ; CHECK-NEXT: movw r1, #65534 @@ -337,5 +333,3 @@ define arm_aapcs_vfpcc void @test(i8* %v50) #0 { ret void } -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m7" "target-features"="-d32,+dsp,+fp-armv8,+hwdiv,+thumb-mode,-crc,-crypto,-dotprod,-fullfp16,-hwdiv-arm,-neon,-ras" "unsafe-fp-math"="false" "use-soft-float"="false" } - diff --git a/llvm/test/CodeGen/ARM/constant-island-SOImm-limit16.mir b/llvm/test/CodeGen/ARM/constant-island-SOImm-limit16.mir new file mode 100644 index 0000000000000..223a3b0b33b13 --- /dev/null +++ b/llvm/test/CodeGen/ARM/constant-island-SOImm-limit16.mir @@ -0,0 +1,62 @@ +# RUN: sed -e "s/SPACEBYTES/100/g" %s | sed -e "s/OFFSET/116/g" > %t.mir +# RUN: llc %t.mir --filetype=obj -start-before=arm-cp-islands -o - | \ +# RUN: llvm-objdump --arch=armv8a --disassemble - | FileCheck %t.mir + +# RUN: sed -e "s/SPACEBYTES/400/g" %s | sed -e "s/OFFSET/12/g" > %t.mir +# RUN: llc %t.mir --filetype=obj -start-before=arm-cp-islands -o - | \ +# RUN: llvm-objdump --arch=armv8a --disassemble - | FileCheck %t.mir + +# RUN: sed -e "s/SPACEBYTES/800/g" %s | sed -e "s/OFFSET/12/g" > %t.mir +# 
RUN: llc %t.mir --filetype=obj -start-before=arm-cp-islands -o - | \ +# RUN: llvm-objdump --arch=armv8a --disassemble - | FileCheck %t.mir + +--- | + target triple = "armv8.2a-arm-none-eabi" + + define dso_local i32 @main() #0 { ret i32 0 } + + attributes #0 = { "frame-pointer"="all" } !4 = !{i32 210} + +... +--- + +name: main +alignment: 4 +tracksRegLiveness: true +constants: + +- + id: 0 + value: half 0xH5440 + alignment: 2 +- + id: 1 + value: half 0xH5441 + alignment: 2 + +machineFunctionInfo: {} +body: | + + bb.0 (%ir-block.0): + liveins: $lr + + $sp = frame-setup STMDB_UPD $sp, 14, $noreg, killed $r11, killed $lr + $r11 = frame-setup MOVr killed $sp, 14, $noreg, $noreg + $sp = frame-setup SUBri killed $sp, 80, 14, $noreg, $noreg + + ; Test handling of 16-bit constant pool entries. + ; 2 consecutive entries: 1 is 4-byte aligned, 1 is not 4-byte aligned. + + renamable $r1 = LEApcrel %const.0, 14, $noreg + renamable $r1 = LDRH killed renamable $r1, $noreg, 0, 14, $noreg :: (load 2 from constant-pool) + renamable $r1 = LEApcrel %const.1, 14, $noreg + renamable $r1 = LDRH killed renamable $r1, $noreg, 0, 14, $noreg :: (load 2 from constant-pool) + + renamable $r0 = SPACE SPACEBYTES, undef renamable $r0 + + $sp = frame-destroy MOVr $r11, 14, $noreg, $noreg + $sp = frame-destroy LDMIA_RET $sp, 14, $noreg, def $r11, def $pc, implicit killed $r0 + + # CHECK: add r1, pc, #OFFSET +--- +... diff --git a/llvm/test/CodeGen/ARM/fminmax-folds.ll b/llvm/test/CodeGen/ARM/fminmax-folds.ll new file mode 100644 index 0000000000000..b13426c7c0500 --- /dev/null +++ b/llvm/test/CodeGen/ARM/fminmax-folds.ll @@ -0,0 +1,598 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=armv8-eabi | FileCheck %s + +declare float @llvm.minnum.f32(float, float) +declare float @llvm.maxnum.f32(float, float) +declare float @llvm.minimum.f32(float, float) +declare float @llvm.maximum.f32(float, float) +declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) +declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) +declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>) +declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>) + +define float @test_minnum_const_nan(float %x) { +; CHECK-LABEL: test_minnum_const_nan: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + %r = call float @llvm.minnum.f32(float %x, float 0x7fff000000000000) + ret float %r +} + +define float @test_maxnum_const_nan(float %x) { +; CHECK-LABEL: test_maxnum_const_nan: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + %r = call float @llvm.maxnum.f32(float %x, float 0x7fff000000000000) + ret float %r +} + +define float @test_maximum_const_nan(float %x) { +; CHECK-LABEL: test_maximum_const_nan: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #0 +; CHECK-NEXT: movt r0, #32760 +; CHECK-NEXT: bx lr + %r = call float @llvm.maximum.f32(float %x, float 0x7fff000000000000) + ret float %r +} + +define float @test_minimum_const_nan(float %x) { +; CHECK-LABEL: test_minimum_const_nan: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #0 +; CHECK-NEXT: movt r0, #32760 +; CHECK-NEXT: bx lr + %r = call float @llvm.minimum.f32(float %x, float 0x7fff000000000000) + ret float %r +} + +define float @test_minnum_const_inf(float %x) { +; CHECK-LABEL: test_minnum_const_inf: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s0, .LCPI4_0 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vminnm.f32 s0, s2, s0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI4_0: +; 
CHECK-NEXT: .long 0x7f800000 @ float +Inf + %r = call float @llvm.minnum.f32(float %x, float 0x7ff0000000000000) + ret float %r +} + +define float @test_maxnum_const_inf(float %x) { +; CHECK-LABEL: test_maxnum_const_inf: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #0 +; CHECK-NEXT: movt r0, #32640 +; CHECK-NEXT: bx lr + %r = call float @llvm.maxnum.f32(float %x, float 0x7ff0000000000000) + ret float %r +} + +define float @test_maximum_const_inf(float %x) { +; CHECK-LABEL: test_maximum_const_inf: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s0, .LCPI6_0 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vmax.f32 d0, d1, d0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI6_0: +; CHECK-NEXT: .long 0x7f800000 @ float +Inf + %r = call float @llvm.maximum.f32(float %x, float 0x7ff0000000000000) + ret float %r +} + +define float @test_minimum_const_inf(float %x) { +; CHECK-LABEL: test_minimum_const_inf: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + %r = call float @llvm.minimum.f32(float %x, float 0x7ff0000000000000) + ret float %r +} + +define float @test_minnum_const_neg_inf(float %x) { +; CHECK-LABEL: test_minnum_const_neg_inf: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #0 +; CHECK-NEXT: movt r0, #65408 +; CHECK-NEXT: bx lr + %r = call float @llvm.minnum.f32(float %x, float 0xfff0000000000000) + ret float %r +} + +define float @test_maxnum_const_neg_inf(float %x) { +; CHECK-LABEL: test_maxnum_const_neg_inf: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s0, .LCPI9_0 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vmaxnm.f32 s0, s2, s0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI9_0: +; CHECK-NEXT: .long 0xff800000 @ float -Inf + %r = call float @llvm.maxnum.f32(float %x, float 0xfff0000000000000) + ret float %r +} + +define float @test_maximum_const_neg_inf(float %x) { +; CHECK-LABEL: test_maximum_const_neg_inf: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + %r = call float @llvm.maximum.f32(float %x, float 0xfff0000000000000) + ret float %r +} + +define float @test_minimum_const_neg_inf(float %x) { +; CHECK-LABEL: test_minimum_const_neg_inf: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s0, .LCPI11_0 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vmin.f32 d0, d1, d0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI11_0: +; CHECK-NEXT: .long 0xff800000 @ float -Inf + %r = call float @llvm.minimum.f32(float %x, float 0xfff0000000000000) + ret float %r +} + +define float @test_minnum_const_inf_nnan(float %x) { +; CHECK-LABEL: test_minnum_const_inf_nnan: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + %r = call nnan float @llvm.minnum.f32(float %x, float 0x7ff0000000000000) + ret float %r +} + +define float @test_maxnum_const_inf_nnan(float %x) { +; CHECK-LABEL: test_maxnum_const_inf_nnan: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #0 +; CHECK-NEXT: movt r0, #32640 +; CHECK-NEXT: bx lr + %r = call nnan float @llvm.maxnum.f32(float %x, float 0x7ff0000000000000) + ret float %r +} + +define float @test_maximum_const_inf_nnan(float %x) { +; CHECK-LABEL: test_maximum_const_inf_nnan: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #0 +; CHECK-NEXT: movt r0, #32640 +; CHECK-NEXT: bx lr + %r = call nnan float @llvm.maximum.f32(float %x, float 0x7ff0000000000000) + ret float %r +} + +define float @test_minimum_const_inf_nnan(float %x) { +; CHECK-LABEL: test_minimum_const_inf_nnan: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + %r = call nnan float 
@llvm.minimum.f32(float %x, float 0x7ff0000000000000) + ret float %r +} + +define float @test_minnum_const_inf_nnan_comm(float %x) { +; CHECK-LABEL: test_minnum_const_inf_nnan_comm: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + %r = call nnan float @llvm.minnum.f32(float 0x7ff0000000000000, float %x) + ret float %r +} + +define float @test_maxnum_const_inf_nnan_comm(float %x) { +; CHECK-LABEL: test_maxnum_const_inf_nnan_comm: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #0 +; CHECK-NEXT: movt r0, #32640 +; CHECK-NEXT: bx lr + %r = call nnan float @llvm.maxnum.f32(float 0x7ff0000000000000, float %x) + ret float %r +} + +define float @test_maximum_const_inf_nnan_comm(float %x) { +; CHECK-LABEL: test_maximum_const_inf_nnan_comm: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #0 +; CHECK-NEXT: movt r0, #32640 +; CHECK-NEXT: bx lr + %r = call nnan float @llvm.maximum.f32(float 0x7ff0000000000000, float %x) + ret float %r +} + +define float @test_minimum_const_inf_nnan_comm(float %x) { +; CHECK-LABEL: test_minimum_const_inf_nnan_comm: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + %r = call nnan float @llvm.minimum.f32(float 0x7ff0000000000000, float %x) + ret float %r +} + +define <2 x float> @test_minnum_const_inf_nnan_comm_vec(<2 x float> %x) { +; CHECK-LABEL: test_minnum_const_inf_nnan_comm_vec: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + %r = call nnan <2 x float> @llvm.minnum.v2f32(<2 x float> <float 0x7ff0000000000000, float 0x7ff0000000000000>, <2 x float> %x) + ret <2 x float> %r +} + +define <2 x float> @test_maxnum_const_inf_nnan_comm_vec(<2 x float> %x) { +; CHECK-LABEL: test_maxnum_const_inf_nnan_comm_vec: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, .LCPI21_0 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 3 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI21_0: +; CHECK-NEXT: .long 0x7f800000 @ float +Inf +; CHECK-NEXT: .long 0x7f800000 @ float +Inf + %r = call nnan <2 x float> @llvm.maxnum.v2f32(<2 x float> <float 0x7ff0000000000000, float 0x7ff0000000000000>, <2 x float> %x) + ret <2 x float> %r +} + +define <2 x float> @test_maximum_const_inf_nnan_comm_vec(<2 x float> %x) { +; CHECK-LABEL: test_maximum_const_inf_nnan_comm_vec: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, .LCPI22_0 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 3 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI22_0: +; CHECK-NEXT: .long 0x7f800000 @ float +Inf +; CHECK-NEXT: .long 0x7f800000 @ float +Inf + %r = call nnan <2 x float> @llvm.maximum.v2f32(<2 x float> <float 0x7ff0000000000000, float 0x7ff0000000000000>, <2 x float> %x) + ret <2 x float> %r +} + +define <2 x float> @test_minimum_const_inf_nnan_comm_vec(<2 x float> %x) { +; CHECK-LABEL: test_minimum_const_inf_nnan_comm_vec: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + %r = call nnan <2 x float> @llvm.minimum.v2f32(<2 x float> <float 0x7ff0000000000000, float 0x7ff0000000000000>, <2 x float> %x) + ret <2 x float> %r +} + +define float @test_minnum_const_neg_inf_nnan(float %x) { +; CHECK-LABEL: test_minnum_const_neg_inf_nnan: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #0 +; CHECK-NEXT: movt r0, #65408 +; CHECK-NEXT: bx lr + %r = call nnan float @llvm.minnum.f32(float %x, float 0xfff0000000000000) + ret float %r +} + +define float @test_maxnum_const_neg_inf_nnan(float %x) { +; CHECK-LABEL: test_maxnum_const_neg_inf_nnan: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + %r = call nnan float @llvm.maxnum.f32(float %x, float 0xfff0000000000000) + ret float %r +} + +define float @test_maximum_const_neg_inf_nnan(float %x) { +; CHECK-LABEL: test_maximum_const_neg_inf_nnan: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + %r = call nnan float @llvm.maximum.f32(float %x, float 0xfff0000000000000) + ret float %r +} + +define float 
@test_minimum_const_neg_inf_nnan(float %x) { +; CHECK-LABEL: test_minimum_const_neg_inf_nnan: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #0 +; CHECK-NEXT: movt r0, #65408 +; CHECK-NEXT: bx lr + %r = call nnan float @llvm.minimum.f32(float %x, float 0xfff0000000000000) + ret float %r +} + +define float @test_minnum_const_max(float %x) { +; CHECK-LABEL: test_minnum_const_max: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s0, .LCPI28_0 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vminnm.f32 s0, s2, s0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI28_0: +; CHECK-NEXT: .long 0x7f7fffff @ float 3.40282347E+38 + %r = call float @llvm.minnum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_maxnum_const_max(float %x) { +; CHECK-LABEL: test_maxnum_const_max: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s0, .LCPI29_0 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vmaxnm.f32 s0, s2, s0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI29_0: +; CHECK-NEXT: .long 0x7f7fffff @ float 3.40282347E+38 + %r = call float @llvm.maxnum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_maximum_const_max(float %x) { +; CHECK-LABEL: test_maximum_const_max: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s0, .LCPI30_0 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vmax.f32 d0, d1, d0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI30_0: +; CHECK-NEXT: .long 0x7f7fffff @ float 3.40282347E+38 + %r = call float @llvm.maximum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_minimum_const_max(float %x) { +; CHECK-LABEL: test_minimum_const_max: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s0, .LCPI31_0 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vmin.f32 d0, d1, d0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI31_0: +; CHECK-NEXT: .long 0x7f7fffff @ float 3.40282347E+38 + %r = call float @llvm.minimum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_minnum_const_neg_max(float %x) { +; CHECK-LABEL: test_minnum_const_neg_max: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s0, .LCPI32_0 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vminnm.f32 s0, s2, s0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI32_0: +; CHECK-NEXT: .long 0xff7fffff @ float -3.40282347E+38 + %r = call float @llvm.minnum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_maxnum_const_neg_max(float %x) { +; CHECK-LABEL: test_maxnum_const_neg_max: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s0, .LCPI33_0 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vmaxnm.f32 s0, s2, s0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI33_0: +; CHECK-NEXT: .long 0xff7fffff @ float -3.40282347E+38 + %r = call float @llvm.maxnum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_maximum_const_neg_max(float %x) { +; CHECK-LABEL: test_maximum_const_neg_max: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s0, .LCPI34_0 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vmax.f32 d0, d1, d0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI34_0: +; CHECK-NEXT: .long 0xff7fffff @ float -3.40282347E+38 + 
%r = call float @llvm.maximum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_minimum_const_neg_max(float %x) { +; CHECK-LABEL: test_minimum_const_neg_max: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s0, .LCPI35_0 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vmin.f32 d0, d1, d0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI35_0: +; CHECK-NEXT: .long 0xff7fffff @ float -3.40282347E+38 + %r = call float @llvm.minimum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_minnum_const_max_ninf(float %x) { +; CHECK-LABEL: test_minnum_const_max_ninf: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s0, .LCPI36_0 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vminnm.f32 s0, s2, s0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI36_0: +; CHECK-NEXT: .long 0x7f7fffff @ float 3.40282347E+38 + %r = call ninf float @llvm.minnum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_maxnum_const_max_ninf(float %x) { +; CHECK-LABEL: test_maxnum_const_max_ninf: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #65535 +; CHECK-NEXT: movt r0, #32639 +; CHECK-NEXT: bx lr + %r = call ninf float @llvm.maxnum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_maximum_const_max_ninf(float %x) { +; CHECK-LABEL: test_maximum_const_max_ninf: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s0, .LCPI38_0 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vmax.f32 d0, d1, d0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI38_0: +; CHECK-NEXT: .long 0x7f7fffff @ float 3.40282347E+38 + %r = call ninf float @llvm.maximum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_minimum_const_max_ninf(float %x) { +; CHECK-LABEL: test_minimum_const_max_ninf: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + %r = call ninf float @llvm.minimum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_minnum_const_neg_max_ninf(float %x) { +; CHECK-LABEL: test_minnum_const_neg_max_ninf: +; CHECK: @ %bb.0: +; CHECK-NEXT: mvn r0, #8388608 +; CHECK-NEXT: bx lr + %r = call ninf float @llvm.minnum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_maxnum_const_neg_max_ninf(float %x) { +; CHECK-LABEL: test_maxnum_const_neg_max_ninf: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s0, .LCPI41_0 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vmaxnm.f32 s0, s2, s0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI41_0: +; CHECK-NEXT: .long 0xff7fffff @ float -3.40282347E+38 + %r = call ninf float @llvm.maxnum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_maximum_const_neg_max_ninf(float %x) { +; CHECK-LABEL: test_maximum_const_neg_max_ninf: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + %r = call ninf float @llvm.maximum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_minimum_const_neg_max_ninf(float %x) { +; CHECK-LABEL: test_minimum_const_neg_max_ninf: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s0, .LCPI43_0 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vmin.f32 d0, d1, d0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI43_0: +; CHECK-NEXT: .long 0xff7fffff @ float -3.40282347E+38 + %r = call ninf float 
@llvm.minimum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_minnum_const_max_nnan_ninf(float %x) { +; CHECK-LABEL: test_minnum_const_max_nnan_ninf: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + %r = call nnan ninf float @llvm.minnum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_maxnum_const_max_nnan_ninf(float %x) { +; CHECK-LABEL: test_maxnum_const_max_nnan_ninf: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #65535 +; CHECK-NEXT: movt r0, #32639 +; CHECK-NEXT: bx lr + %r = call nnan ninf float @llvm.maxnum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_maximum_const_max_nnan_ninf(float %x) { +; CHECK-LABEL: test_maximum_const_max_nnan_ninf: +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, #65535 +; CHECK-NEXT: movt r0, #32639 +; CHECK-NEXT: bx lr + %r = call nnan ninf float @llvm.maximum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_minimum_const_max_nnan_ninf(float %x) { +; CHECK-LABEL: test_minimum_const_max_nnan_ninf: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + %r = call nnan ninf float @llvm.minimum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_minnum_const_neg_max_nnan_ninf(float %x) { +; CHECK-LABEL: test_minnum_const_neg_max_nnan_ninf: +; CHECK: @ %bb.0: +; CHECK-NEXT: mvn r0, #8388608 +; CHECK-NEXT: bx lr + %r = call nnan ninf float @llvm.minnum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_maxnum_const_neg_max_nnan_ninf(float %x) { +; CHECK-LABEL: test_maxnum_const_neg_max_nnan_ninf: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + %r = call nnan ninf float @llvm.maxnum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_maximum_const_neg_max_nnan_ninf(float %x) { +; CHECK-LABEL: test_maximum_const_neg_max_nnan_ninf: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + %r = call nnan ninf float @llvm.maximum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_minimum_const_neg_max_nnan_ninf(float %x) { +; CHECK-LABEL: test_minimum_const_neg_max_nnan_ninf: +; CHECK: @ %bb.0: +; CHECK-NEXT: mvn r0, #8388608 +; CHECK-NEXT: bx lr + %r = call nnan ninf float @llvm.minimum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} diff --git a/llvm/test/CodeGen/ARM/fp16-bitcast.ll b/llvm/test/CodeGen/ARM/fp16-bitcast.ll index d26c2d96614a4..4d450e86d46fe 100644 --- a/llvm/test/CodeGen/ARM/fp16-bitcast.ll +++ b/llvm/test/CodeGen/ARM/fp16-bitcast.ll @@ -129,3 +129,66 @@ entry: %add = add i16 %hc, 1 ret i16 %add } + +define half @constcall() { +; CHECK-VFPV4-SOFT-LABEL: constcall: +; CHECK-VFPV4-SOFT: @ %bb.0: @ %entry +; CHECK-VFPV4-SOFT-NEXT: mov.w r0, #18688 +; CHECK-VFPV4-SOFT-NEXT: b ccc +; +; CHECK-FP16-SOFT-LABEL: constcall: +; CHECK-FP16-SOFT: @ %bb.0: @ %entry +; CHECK-FP16-SOFT-NEXT: vmov.f16 s0, #1.000000e+01 +; CHECK-FP16-SOFT-NEXT: vmov.f16 r0, s0 +; CHECK-FP16-SOFT-NEXT: b ccc +; +; CHECK-VFPV4-HARD-LABEL: constcall: +; CHECK-VFPV4-HARD: @ %bb.0: @ %entry +; CHECK-VFPV4-HARD-NEXT: vldr s0, .LCPI4_0 +; CHECK-VFPV4-HARD-NEXT: b ccc +; CHECK-VFPV4-HARD-NEXT: .p2align 2 +; CHECK-VFPV4-HARD-NEXT: @ %bb.1: +; CHECK-VFPV4-HARD-NEXT: .LCPI4_0: +; CHECK-VFPV4-HARD-NEXT: .long 0x00004900 @ float 2.61874657E-41 +; +; CHECK-FP16-HARD-LABEL: constcall: +; CHECK-FP16-HARD: @ %bb.0: @ %entry +; CHECK-FP16-HARD-NEXT: vmov.f16 s0, #1.000000e+01 +; CHECK-FP16-HARD-NEXT: vmov.f16 r0, s0 +; CHECK-FP16-HARD-NEXT: vmov s0, r0 +; CHECK-FP16-HARD-NEXT: b ccc +entry: + %call = 
tail call fast half @ccc(half 0xH4900) + ret half %call +} + +define half @constret() { +; CHECK-VFPV4-SOFT-LABEL: constret: +; CHECK-VFPV4-SOFT: @ %bb.0: @ %entry +; CHECK-VFPV4-SOFT-NEXT: mov.w r0, #18688 +; CHECK-VFPV4-SOFT-NEXT: bx lr +; +; CHECK-FP16-SOFT-LABEL: constret: +; CHECK-FP16-SOFT: @ %bb.0: @ %entry +; CHECK-FP16-SOFT-NEXT: vmov.f16 s0, #1.000000e+01 +; CHECK-FP16-SOFT-NEXT: vmov r0, s0 +; CHECK-FP16-SOFT-NEXT: bx lr +; +; CHECK-VFPV4-HARD-LABEL: constret: +; CHECK-VFPV4-HARD: @ %bb.0: @ %entry +; CHECK-VFPV4-HARD-NEXT: vldr s0, .LCPI5_0 +; CHECK-VFPV4-HARD-NEXT: bx lr +; CHECK-VFPV4-HARD-NEXT: .p2align 2 +; CHECK-VFPV4-HARD-NEXT: @ %bb.1: +; CHECK-VFPV4-HARD-NEXT: .LCPI5_0: +; CHECK-VFPV4-HARD-NEXT: .long 0x00004900 @ float 2.61874657E-41 +; +; CHECK-FP16-HARD-LABEL: constret: +; CHECK-FP16-HARD: @ %bb.0: @ %entry +; CHECK-FP16-HARD-NEXT: vmov.f16 s0, #1.000000e+01 +; CHECK-FP16-HARD-NEXT: bx lr +entry: + ret half 0xH4900 +} + +declare half @ccc(half) diff --git a/llvm/test/CodeGen/ARM/legalize-bitcast.ll b/llvm/test/CodeGen/ARM/legalize-bitcast.ll index 529775df5fd7d..478ff985bf475 100644 --- a/llvm/test/CodeGen/ARM/legalize-bitcast.ll +++ b/llvm/test/CodeGen/ARM/legalize-bitcast.ll @@ -49,9 +49,9 @@ define i16 @int_to_vec(i80 %in) { ; CHECK-NEXT: vmov.32 d16[0], r0 ; CHECK-NEXT: @ implicit-def: $q9 ; CHECK-NEXT: vmov.f64 d18, d16 -; CHECK-NEXT: vrev32.16 q8, q9 -; CHECK-NEXT: @ kill: def $d16 killed $d16 killed $q8 -; CHECK-NEXT: vmov.u16 r0, d16[0] +; CHECK-NEXT: vrev32.16 q9, q9 +; CHECK-NEXT: @ kill: def $d18 killed $d18 killed $q9 +; CHECK-NEXT: vmov.u16 r0, d18[0] ; CHECK-NEXT: bx lr %vec = bitcast i80 %in to <5 x i16> %e0 = extractelement <5 x i16> %vec, i32 0 diff --git a/llvm/test/CodeGen/ARM/machine-outliner-calls.mir b/llvm/test/CodeGen/ARM/machine-outliner-calls.mir new file mode 100644 index 0000000000000..7880ddfb0051c --- /dev/null +++ b/llvm/test/CodeGen/ARM/machine-outliner-calls.mir @@ -0,0 +1,360 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=arm-- -run-pass=prologepilog -run-pass=machine-outliner \ +# RUN: -verify-machineinstrs %s -o - | FileCheck %s + +--- | + define void @outline_call_arm() #0 { ret void } + define void @outline_call_thumb() #1 { ret void } + define void @outline_call_tailcall_arm() #0 { ret void } + define void @outline_call_tailcall_thumb() #1 { ret void } + define void @outline_call_KO_mcount() #0 { ret void } + define void @bar() #0 { ret void } + declare void @"\01mcount"() + + attributes #0 = { minsize optsize } + attributes #1 = { minsize optsize "target-features"="+armv7-a,+thumb-mode" } +... 
+--- + +name: outline_call_arm +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: outline_call_arm + ; CHECK: bb.0: + ; CHECK: liveins: $r4, $lr + ; CHECK: $sp = frame-setup STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r4, killed $lr + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 + ; CHECK: BL @OUTLINED_FUNCTION_0 + ; CHECK: bb.1: + ; CHECK: BL @OUTLINED_FUNCTION_0 + ; CHECK: bb.2: + ; CHECK: BL @OUTLINED_FUNCTION_0 + ; CHECK: bb.3: + ; CHECK: BL @OUTLINED_FUNCTION_0 + ; CHECK: bb.4: + ; CHECK: BL @OUTLINED_FUNCTION_0 + ; CHECK: bb.5: + ; CHECK: $sp = frame-destroy LDMIA_UPD $sp, 14 /* CC::al */, $noreg, def $r4, def $lr + ; CHECK: BX_RET 14 /* CC::al */, $noreg + bb.0: + BL @bar, implicit-def dead $lr, implicit $sp + $r0 = MOVi 1, 14, $noreg, $noreg + $r1 = MOVi 1, 14, $noreg, $noreg + $r2 = MOVi 1, 14, $noreg, $noreg + $r3 = MOVi 1, 14, $noreg, $noreg + $r4 = MOVi 1, 14, $noreg, $noreg + bb.1: + BL @bar, implicit-def dead $lr, implicit $sp + $r0 = MOVi 1, 14, $noreg, $noreg + $r1 = MOVi 1, 14, $noreg, $noreg + $r2 = MOVi 1, 14, $noreg, $noreg + $r3 = MOVi 1, 14, $noreg, $noreg + $r4 = MOVi 1, 14, $noreg, $noreg + bb.2: + BL @bar, implicit-def dead $lr, implicit $sp + $r0 = MOVi 1, 14, $noreg, $noreg + $r1 = MOVi 1, 14, $noreg, $noreg + $r2 = MOVi 1, 14, $noreg, $noreg + $r3 = MOVi 1, 14, $noreg, $noreg + $r4 = MOVi 1, 14, $noreg, $noreg + bb.3: + BL @bar, implicit-def dead $lr, implicit $sp + $r0 = MOVi 1, 14, $noreg, $noreg + $r1 = MOVi 1, 14, $noreg, $noreg + $r2 = MOVi 1, 14, $noreg, $noreg + $r3 = MOVi 1, 14, $noreg, $noreg + $r4 = MOVi 1, 14, $noreg, $noreg + bb.4: + BL @bar, implicit-def dead $lr, implicit $sp + $r0 = MOVi 1, 14, $noreg, $noreg + $r1 = MOVi 1, 14, $noreg, $noreg + $r2 = MOVi 1, 14, $noreg, $noreg + $r3 = MOVi 1, 14, $noreg, $noreg + $r4 = MOVi 1, 14, $noreg, $noreg + bb.5: + BX_RET 14, $noreg +... 
+--- + +name: outline_call_thumb +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: outline_call_thumb + ; CHECK: bb.0: + ; CHECK: liveins: $r7, $lr + ; CHECK: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: tBL 14 /* CC::al */, $noreg, @OUTLINED_FUNCTION_3 + ; CHECK: bb.1: + ; CHECK: tBL 14 /* CC::al */, $noreg, @OUTLINED_FUNCTION_3 + ; CHECK: bb.2: + ; CHECK: tBL 14 /* CC::al */, $noreg, @OUTLINED_FUNCTION_3 + ; CHECK: bb.3: + ; CHECK: tBL 14 /* CC::al */, $noreg, @OUTLINED_FUNCTION_3 + ; CHECK: bb.4: + ; CHECK: tBL 14 /* CC::al */, $noreg, @OUTLINED_FUNCTION_3 + ; CHECK: bb.5: + ; CHECK: $sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc + bb.0: + tBL 14, $noreg, @bar, implicit-def dead $lr, implicit $sp + $r0 = t2MOVi 1, 14, $noreg, $noreg + $r1 = t2MOVi 1, 14, $noreg, $noreg + $r2 = t2MOVi 1, 14, $noreg, $noreg + bb.1: + tBL 14, $noreg, @bar, implicit-def dead $lr, implicit $sp + $r0 = t2MOVi 1, 14, $noreg, $noreg + $r1 = t2MOVi 1, 14, $noreg, $noreg + $r2 = t2MOVi 1, 14, $noreg, $noreg + bb.2: + tBL 14, $noreg, @bar, implicit-def dead $lr, implicit $sp + $r0 = t2MOVi 1, 14, $noreg, $noreg + $r1 = t2MOVi 1, 14, $noreg, $noreg + $r2 = t2MOVi 1, 14, $noreg, $noreg + bb.3: + tBL 14, $noreg, @bar, implicit-def dead $lr, implicit $sp + $r0 = t2MOVi 1, 14, $noreg, $noreg + $r1 = t2MOVi 1, 14, $noreg, $noreg + $r2 = t2MOVi 1, 14, $noreg, $noreg + bb.4: + tBL 14, $noreg, @bar, implicit-def dead $lr, implicit $sp + $r0 = t2MOVi 1, 14, $noreg, $noreg + $r1 = t2MOVi 1, 14, $noreg, $noreg + $r2 = t2MOVi 1, 14, $noreg, $noreg + bb.5: + tBX_RET 14, $noreg +... +--- + +name: outline_call_tailcall_arm +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: outline_call_tailcall_arm + ; CHECK: bb.0: + ; CHECK: liveins: $r4, $lr + ; CHECK: $sp = frame-setup STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r4, killed $lr + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 + ; CHECK: BL @OUTLINED_FUNCTION_2 + ; CHECK: bb.1: + ; CHECK: BL @OUTLINED_FUNCTION_2 + ; CHECK: bb.2: + ; CHECK: BL @OUTLINED_FUNCTION_2 + ; CHECK: bb.3: + ; CHECK: $sp = frame-destroy LDMIA_UPD $sp, 14 /* CC::al */, $noreg, def $r4, def $lr + ; CHECK: BX_RET 14 /* CC::al */, $noreg + bb.0: + BL @bar, implicit-def dead $lr, implicit $sp + $r0 = MOVi 2, 14, $noreg, $noreg + $r1 = MOVi 2, 14, $noreg, $noreg + $r2 = MOVi 2, 14, $noreg, $noreg + $r3 = MOVi 2, 14, $noreg, $noreg + $r4 = MOVi 2, 14, $noreg, $noreg + BL @bar, implicit-def dead $lr, implicit $sp + bb.1: + BL @bar, implicit-def dead $lr, implicit $sp + $r0 = MOVi 2, 14, $noreg, $noreg + $r1 = MOVi 2, 14, $noreg, $noreg + $r2 = MOVi 2, 14, $noreg, $noreg + $r3 = MOVi 2, 14, $noreg, $noreg + $r4 = MOVi 2, 14, $noreg, $noreg + BL @bar, implicit-def dead $lr, implicit $sp + bb.2: + BL @bar, implicit-def dead $lr, implicit $sp + $r0 = MOVi 2, 14, $noreg, $noreg + $r1 = MOVi 2, 14, $noreg, $noreg + $r2 = MOVi 2, 14, $noreg, $noreg + $r3 = MOVi 2, 14, $noreg, $noreg + $r4 = MOVi 2, 14, $noreg, $noreg + BL @bar, implicit-def dead $lr, implicit $sp + bb.3: + BX_RET 14, $noreg +... 
+--- + +name: outline_call_tailcall_thumb +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: outline_call_tailcall_thumb + ; CHECK: bb.0: + ; CHECK: liveins: $r7, $lr + ; CHECK: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: tBL 14 /* CC::al */, $noreg, @OUTLINED_FUNCTION_4 + ; CHECK: bb.1: + ; CHECK: tBL 14 /* CC::al */, $noreg, @OUTLINED_FUNCTION_4 + ; CHECK: bb.2: + ; CHECK: tBL 14 /* CC::al */, $noreg, @OUTLINED_FUNCTION_4 + ; CHECK: bb.3: + ; CHECK: $sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc + bb.0: + tBL 14, $noreg, @bar, implicit-def dead $lr, implicit $sp + $r0 = t2MOVi 2, 14, $noreg, $noreg + $r1 = t2MOVi 2, 14, $noreg, $noreg + $r2 = t2MOVi 2, 14, $noreg, $noreg + tBL 14, $noreg, @bar, implicit-def dead $lr, implicit $sp + bb.1: + tBL 14, $noreg, @bar, implicit-def dead $lr, implicit $sp + $r0 = t2MOVi 2, 14, $noreg, $noreg + $r1 = t2MOVi 2, 14, $noreg, $noreg + $r2 = t2MOVi 2, 14, $noreg, $noreg + tBL 14, $noreg, @bar, implicit-def dead $lr, implicit $sp + bb.2: + tBL 14, $noreg, @bar, implicit-def dead $lr, implicit $sp + $r0 = t2MOVi 2, 14, $noreg, $noreg + $r1 = t2MOVi 2, 14, $noreg, $noreg + $r2 = t2MOVi 2, 14, $noreg, $noreg + tBL 14, $noreg, @bar, implicit-def dead $lr, implicit $sp + bb.3: + tBX_RET 14, $noreg +... +--- + +name: outline_call_KO_mcount +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: outline_call_KO_mcount + ; CHECK: bb.0: + ; CHECK: liveins: $r4, $lr + ; CHECK: $sp = frame-setup STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r4, killed $lr + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 + ; CHECK: BL @"\01mcount", csr_aapcs, implicit-def dead $lr, implicit $sp + ; CHECK: BL @OUTLINED_FUNCTION_1 + ; CHECK: bb.1: + ; CHECK: BL @"\01mcount", csr_aapcs, implicit-def dead $lr, implicit $sp + ; CHECK: BL @OUTLINED_FUNCTION_1 + ; CHECK: bb.2: + ; CHECK: BL @"\01mcount", csr_aapcs, implicit-def dead $lr, implicit $sp + ; CHECK: BL @OUTLINED_FUNCTION_1 + ; CHECK: bb.3: + ; CHECK: BL @"\01mcount", csr_aapcs, implicit-def dead $lr, implicit $sp + ; CHECK: BL @OUTLINED_FUNCTION_1 + ; CHECK: bb.4: + ; CHECK: BL @"\01mcount", csr_aapcs, implicit-def dead $lr, implicit $sp + ; CHECK: BL @OUTLINED_FUNCTION_1 + ; CHECK: bb.5: + ; CHECK: $sp = frame-destroy LDMIA_UPD $sp, 14 /* CC::al */, $noreg, def $r4, def $lr + ; CHECK: BX_RET 14 /* CC::al */, $noreg + bb.0: + BL @"\01mcount", csr_aapcs, implicit-def dead $lr, implicit $sp + $r0 = MOVi 3, 14, $noreg, $noreg + $r1 = MOVi 3, 14, $noreg, $noreg + $r2 = MOVi 3, 14, $noreg, $noreg + $r3 = MOVi 3, 14, $noreg, $noreg + $r4 = MOVi 3, 14, $noreg, $noreg + bb.1: + BL @"\01mcount", csr_aapcs, implicit-def dead $lr, implicit $sp + $r0 = MOVi 3, 14, $noreg, $noreg + $r1 = MOVi 3, 14, $noreg, $noreg + $r2 = MOVi 3, 14, $noreg, $noreg + $r3 = MOVi 3, 14, $noreg, $noreg + $r4 = MOVi 3, 14, $noreg, $noreg + bb.2: + BL @"\01mcount", csr_aapcs, implicit-def dead $lr, implicit $sp + $r0 = MOVi 3, 14, $noreg, $noreg + $r1 = MOVi 3, 14, $noreg, $noreg + $r2 = MOVi 3, 14, $noreg, $noreg + $r3 = MOVi 3, 14, $noreg, $noreg + $r4 = MOVi 3, 14, $noreg, $noreg + bb.3: + BL @"\01mcount", csr_aapcs, implicit-def dead $lr, implicit $sp + $r0 = MOVi 3, 14, $noreg, $noreg + $r1 = 
MOVi 3, 14, $noreg, $noreg + $r2 = MOVi 3, 14, $noreg, $noreg + $r3 = MOVi 3, 14, $noreg, $noreg + $r4 = MOVi 3, 14, $noreg, $noreg + bb.4: + BL @"\01mcount", csr_aapcs, implicit-def dead $lr, implicit $sp + $r0 = MOVi 3, 14, $noreg, $noreg + $r1 = MOVi 3, 14, $noreg, $noreg + $r2 = MOVi 3, 14, $noreg, $noreg + $r3 = MOVi 3, 14, $noreg, $noreg + $r4 = MOVi 3, 14, $noreg, $noreg + bb.5: + BX_RET 14, $noreg +... +--- + +name: bar +tracksRegLiveness: true +body: | + bb.0: + BX_RET 14, $noreg + + + ; CHECK-LABEL: name: OUTLINED_FUNCTION_0 + ; CHECK: bb.0: + ; CHECK: liveins: $r11, $r10, $r9, $r8, $r7, $r6, $r5, $d15, $d14, $d13, $d12, $d11, $d10, $d9, $d8, $lr + ; CHECK: early-clobber $sp = STR_PRE_IMM killed $lr, $sp, -8, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, 8 + ; CHECK: BL @bar, implicit-def dead $lr, implicit $sp + ; CHECK: $r0 = MOVi 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r1 = MOVi 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r2 = MOVi 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r3 = MOVi 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r4 = MOVi 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $lr, $sp = LDR_POST_IMM $sp, $noreg, 8, 14 /* CC::al */, $noreg + ; CHECK: MOVPCLR 14 /* CC::al */, $noreg + + ; CHECK-LABEL: name: OUTLINED_FUNCTION_1 + ; CHECK: bb.0: + ; CHECK: liveins: $r11, $r10, $r9, $r8, $r7, $r6, $r5, $d15, $d14, $d13, $d12, $d11, $d10, $d9, $d8 + ; CHECK: $r0 = MOVi 3, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r1 = MOVi 3, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r2 = MOVi 3, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r3 = MOVi 3, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r4 = MOVi 3, 14 /* CC::al */, $noreg, $noreg + ; CHECK: MOVPCLR 14 /* CC::al */, $noreg + + ; CHECK-LABEL: name: OUTLINED_FUNCTION_2 + ; CHECK: bb.0: + ; CHECK: liveins: $r11, $r10, $r9, $r8, $r7, $r6, $r5, $d15, $d14, $d13, $d12, $d11, $d10, $d9, $d8, $lr + ; CHECK: early-clobber $sp = STR_PRE_IMM killed $lr, $sp, -8, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, 8 + ; CHECK: BL @bar, implicit-def dead $lr, implicit $sp + ; CHECK: $r0 = MOVi 2, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r1 = MOVi 2, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r2 = MOVi 2, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r3 = MOVi 2, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r4 = MOVi 2, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $lr, $sp = LDR_POST_IMM $sp, $noreg, 8, 14 /* CC::al */, $noreg + ; CHECK: TAILJMPd @bar, implicit $sp + + ; CHECK-LABEL: name: OUTLINED_FUNCTION_3 + ; CHECK: bb.0: + ; CHECK: liveins: $r11, $r10, $r9, $r8, $r6, $r5, $r4, $d15, $d14, $d13, $d12, $d11, $d10, $d9, $d8, $lr + ; CHECK: early-clobber $sp = t2STR_PRE killed $lr, $sp, -8, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, 8 + ; CHECK: tBL 14 /* CC::al */, $noreg, @bar, implicit-def dead $lr, implicit $sp + ; CHECK: $r0 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r1 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r2 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $lr, $sp = t2LDR_POST $sp, 8, 14 /* CC::al */, $noreg + ; CHECK: tBX_RET 14 /* CC::al */, $noreg + + ; CHECK-LABEL: name: OUTLINED_FUNCTION_4 + ; CHECK: bb.0: + ; CHECK: liveins: $r11, $r10, $r9, $r8, $r6, $r5, $r4, $d15, $d14, $d13, $d12, $d11, $d10, $d9, $d8, $lr + ; CHECK: 
early-clobber $sp = t2STR_PRE killed $lr, $sp, -8, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, 8 + ; CHECK: tBL 14 /* CC::al */, $noreg, @bar, implicit-def dead $lr, implicit $sp + ; CHECK: $r0 = t2MOVi 2, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r1 = t2MOVi 2, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r2 = t2MOVi 2, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $lr, $sp = t2LDR_POST $sp, 8, 14 /* CC::al */, $noreg + ; CHECK: tTAILJMPdND @bar, 14 /* CC::al */, $noreg, implicit $sp + + + diff --git a/llvm/test/CodeGen/ARM/machine-outliner-default.mir b/llvm/test/CodeGen/ARM/machine-outliner-default.mir index 452d6a96c5393..9db4207d2df7a 100644 --- a/llvm/test/CodeGen/ARM/machine-outliner-default.mir +++ b/llvm/test/CodeGen/ARM/machine-outliner-default.mir @@ -5,8 +5,6 @@ --- | define void @outline_default_arm() #0 { ret void } define void @outline_default_thumb() #1 { ret void } - define void @outline_default_KO_call_arm() #0 { ret void } - define void @outline_default_KO_call_thumb() #1 { ret void } define void @outline_default_KO_stack_arm() #0 { ret void } define void @outline_default_KO_stack_thumb() #0 { ret void } declare void @bar() @@ -118,120 +116,6 @@ body: | ... --- -name: outline_default_KO_call_arm -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: outline_default_KO_call_arm - ; CHECK: bb.0: - ; CHECK: liveins: $lr - ; CHECK: BL @bar, implicit-def dead $lr, implicit $sp - ; CHECK: $r0 = MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r1 = MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r2 = MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r3 = MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r4 = MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: bb.1: - ; CHECK: liveins: $lr, $r5, $r6, $r7, $r8, $r9, $r10, $r11 - ; CHECK: BL @bar, implicit-def dead $lr, implicit $sp - ; CHECK: $r0 = MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r1 = MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r2 = MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r3 = MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r4 = MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: bb.2: - ; CHECK: liveins: $lr, $r5, $r6, $r7, $r8, $r9, $r10, $r11 - ; CHECK: BL @bar, implicit-def dead $lr, implicit $sp - ; CHECK: $r0 = MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r1 = MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r2 = MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r3 = MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r4 = MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: bb.3: - ; CHECK: liveins: $lr, $r5, $r6, $r7, $r8, $r9, $r10, $r11 - ; CHECK: $r2 = MOVr $lr, 14 /* CC::al */, $noreg, $noreg - ; CHECK: BX_RET 14 /* CC::al */, $noreg - bb.0: - liveins: $lr - BL @bar, implicit-def dead $lr, implicit $sp - $r0 = MOVi 2, 14, $noreg, $noreg - $r1 = MOVi 2, 14, $noreg, $noreg - $r2 = MOVi 2, 14, $noreg, $noreg - $r3 = MOVi 2, 14, $noreg, $noreg - $r4 = MOVi 2, 14, $noreg, $noreg - bb.1: - liveins: $lr, $r5, $r6, $r7, $r8, $r9, $r10, $r11 - BL @bar, implicit-def dead $lr, implicit $sp - $r0 = MOVi 2, 14, $noreg, $noreg - $r1 = MOVi 2, 14, $noreg, $noreg - $r2 = MOVi 2, 14, $noreg, $noreg - $r3 = MOVi 2, 14, $noreg, $noreg - $r4 = MOVi 2, 14, $noreg, $noreg - bb.2: - liveins: $lr, $r5, $r6, $r7, $r8, $r9, $r10, $r11 - BL @bar, implicit-def dead $lr, implicit $sp - $r0 = MOVi 2, 14, $noreg, $noreg - $r1 = MOVi 2, 14, $noreg, $noreg - $r2 = MOVi 2, 14, $noreg, $noreg - $r3 = MOVi 
2, 14, $noreg, $noreg - $r4 = MOVi 2, 14, $noreg, $noreg - bb.3: - liveins: $lr, $r5, $r6, $r7, $r8, $r9, $r10, $r11 - $r2 = MOVr $lr, 14, $noreg, $noreg - BX_RET 14, $noreg -... ---- - -name: outline_default_KO_call_thumb -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: outline_default_KO_call_thumb - ; CHECK: bb.0: - ; CHECK: liveins: $lr - ; CHECK: tBL 14 /* CC::al */, $noreg, @bar, implicit-def dead $lr, implicit $sp - ; CHECK: $r0 = t2MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r1 = t2MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r2 = t2MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: bb.1: - ; CHECK: liveins: $lr, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11 - ; CHECK: tBL 14 /* CC::al */, $noreg, @bar, implicit-def dead $lr, implicit $sp - ; CHECK: $r0 = t2MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r1 = t2MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r2 = t2MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: bb.2: - ; CHECK: liveins: $lr, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11 - ; CHECK: tBL 14 /* CC::al */, $noreg, @bar, implicit-def dead $lr, implicit $sp - ; CHECK: $r0 = t2MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r1 = t2MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r2 = t2MOVi 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: bb.3: - ; CHECK: liveins: $lr, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11 - ; CHECK: $r2 = tMOVr $lr, 14 /* CC::al */, $noreg - ; CHECK: tBX_RET 14 /* CC::al */, $noreg - bb.0: - liveins: $lr - tBL 14, $noreg, @bar, implicit-def dead $lr, implicit $sp - $r0 = t2MOVi 2, 14, $noreg, $noreg - $r1 = t2MOVi 2, 14, $noreg, $noreg - $r2 = t2MOVi 2, 14, $noreg, $noreg - bb.1: - liveins: $lr, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11 - tBL 14, $noreg, @bar, implicit-def dead $lr, implicit $sp - $r0 = t2MOVi 2, 14, $noreg, $noreg - $r1 = t2MOVi 2, 14, $noreg, $noreg - $r2 = t2MOVi 2, 14, $noreg, $noreg - bb.2: - liveins: $lr, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11 - tBL 14, $noreg, @bar, implicit-def dead $lr, implicit $sp - $r0 = t2MOVi 2, 14, $noreg, $noreg - $r1 = t2MOVi 2, 14, $noreg, $noreg - $r2 = t2MOVi 2, 14, $noreg, $noreg - bb.3: - liveins: $lr, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11 - $r2 = tMOVr $lr, 14, $noreg - tBX_RET 14, $noreg -... 
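The MIR tests above exercise the ARM MachineOutliner: identical instruction runs are replaced by calls (or tail calls) to synthesized OUTLINED_FUNCTION_N bodies, but only when doing so actually shrinks the code, which is why sequences that observe $lr (e.g. $r2 = MOVr $lr) or carry per-block register pressure differences stay inline. A minimal C++ sketch of that size trade-off follows; the struct and field names are illustrative assumptions, not LLVM's actual outliner API.

// Outlining pays off only if the call sites plus the single outlined
// body are smaller than keeping every copy of the sequence inline.
struct OutliningCandidate {
  unsigned SequenceSize;  // bytes in one copy of the repeated sequence
  unsigned Occurrences;   // number of copies found
  unsigned CallOverhead;  // bytes of the BL/tBL inserted at each call site
  unsigned FrameOverhead; // bytes of prologue/epilogue in the outlined body
};

bool worthOutlining(const OutliningCandidate &C) {
  unsigned InlineCost   = C.SequenceSize * C.Occurrences;
  unsigned OutlinedCost = C.CallOverhead * C.Occurrences
                        + C.SequenceSize + C.FrameOverhead;
  return OutlinedCost < InlineCost;
}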
---- - name: outline_default_KO_stack_arm tracksRegLiveness: true body: | diff --git a/llvm/test/CodeGen/ARM/parity.ll b/llvm/test/CodeGen/ARM/parity.ll new file mode 100644 index 0000000000000..40c0d7bd32f11 --- /dev/null +++ b/llvm/test/CodeGen/ARM/parity.ll @@ -0,0 +1,162 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple arm-eabi -mattr=+v6t2 | FileCheck %s + +define i4 @parity_4(i4 %x) { +; CHECK-LABEL: parity_4: +; CHECK: @ %bb.0: +; CHECK-NEXT: and r0, r0, #15 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: bx lr + %1 = tail call i4 @llvm.ctpop.i4(i4 %x) + %2 = and i4 %1, 1 + ret i4 %2 +} + +define i8 @parity_8(i8 %x) { +; CHECK-LABEL: parity_8: +; CHECK: @ %bb.0: +; CHECK-NEXT: uxtb r0, r0 +; CHECK-NEXT: eor r0, r0, r0, lsr #4 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: bx lr + %1 = tail call i8 @llvm.ctpop.i8(i8 %x) + %2 = and i8 %1, 1 + ret i8 %2 +} + +define i16 @parity_16(i16 %x) { +; CHECK-LABEL: parity_16: +; CHECK: @ %bb.0: +; CHECK-NEXT: uxth r0, r0 +; CHECK-NEXT: eor r0, r0, r0, lsr #8 +; CHECK-NEXT: eor r0, r0, r0, lsr #4 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: bx lr + %1 = tail call i16 @llvm.ctpop.i16(i16 %x) + %2 = and i16 %1, 1 + ret i16 %2 +} + +define i17 @parity_17(i17 %x) { +; CHECK-LABEL: parity_17: +; CHECK: @ %bb.0: +; CHECK-NEXT: bfc r0, #17, #15 +; CHECK-NEXT: eor r0, r0, r0, lsr #16 +; CHECK-NEXT: eor r0, r0, r0, lsr #8 +; CHECK-NEXT: eor r0, r0, r0, lsr #4 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: bx lr + %1 = tail call i17 @llvm.ctpop.i17(i17 %x) + %2 = and i17 %1, 1 + ret i17 %2 +} + +define i32 @parity_32(i32 %x) { +; CHECK-LABEL: parity_32: +; CHECK: @ %bb.0: +; CHECK-NEXT: eor r0, r0, r0, lsr #16 +; CHECK-NEXT: eor r0, r0, r0, lsr #8 +; CHECK-NEXT: eor r0, r0, r0, lsr #4 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: bx lr + %1 = tail call i32 @llvm.ctpop.i32(i32 %x) + %2 = and i32 %1, 1 + ret i32 %2 +} + +define i64 @parity_64(i64 %x) { +; CHECK-LABEL: parity_64: +; CHECK: @ %bb.0: +; CHECK-NEXT: eor r0, r0, r1 +; CHECK-NEXT: mov r1, #0 +; CHECK-NEXT: eor r0, r0, r0, lsr #16 +; CHECK-NEXT: eor r0, r0, r0, lsr #8 +; CHECK-NEXT: eor r0, r0, r0, lsr #4 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: bx lr + %1 = tail call i64 @llvm.ctpop.i64(i64 %x) + %2 = and i64 %1, 1 + ret i64 %2 +} + +define i32 @parity_64_trunc(i64 %x) { +; CHECK-LABEL: parity_64_trunc: +; CHECK: @ %bb.0: +; CHECK-NEXT: eor r0, r0, r1 +; CHECK-NEXT: eor r0, r0, r0, lsr #16 +; CHECK-NEXT: eor r0, r0, r0, lsr #8 +; CHECK-NEXT: eor r0, r0, r0, lsr #4 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: bx lr + %1 = tail call i64 @llvm.ctpop.i64(i64 %x) + %2 = trunc i64 %1 to i32 + %3 = and i32 %2, 1 + ret i32 %3 +} + +define i8 @parity_32_trunc(i32 %x) { +; CHECK-LABEL: parity_32_trunc: +; CHECK: @ %bb.0: +; CHECK-NEXT: eor r0, r0, r0, lsr #16 +; CHECK-NEXT: eor r0, r0, r0, lsr #8 +; CHECK-NEXT: eor r0, r0, r0, lsr #4 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, 
r0, #1 +; CHECK-NEXT: bx lr + %1 = tail call i32 @llvm.ctpop.i32(i32 %x) + %2 = trunc i32 %1 to i8 + %3 = and i8 %2, 1 + ret i8 %3 +} + +define i32 @parity_8_zext(i8 %x) { +; CHECK-LABEL: parity_8_zext: +; CHECK: @ %bb.0: +; CHECK-NEXT: uxtb r0, r0 +; CHECK-NEXT: eor r0, r0, r0, lsr #4 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: bx lr + %a = zext i8 %x to i32 + %b = tail call i32 @llvm.ctpop.i32(i32 %a) + %c = and i32 %b, 1 + ret i32 %c +} + +define i32 @parity_8_mask(i32 %x) { +; CHECK-LABEL: parity_8_mask: +; CHECK: @ %bb.0: +; CHECK-NEXT: uxtb r0, r0 +; CHECK-NEXT: eor r0, r0, r0, lsr #4 +; CHECK-NEXT: eor r0, r0, r0, lsr #2 +; CHECK-NEXT: eor r0, r0, r0, lsr #1 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: bx lr + %a = and i32 %x, 255 + %b = tail call i32 @llvm.ctpop.i32(i32 %a) + %c = and i32 %b, 1 + ret i32 %c +} + +declare i4 @llvm.ctpop.i4(i4 %x) +declare i8 @llvm.ctpop.i8(i8 %x) +declare i16 @llvm.ctpop.i16(i16 %x) +declare i17 @llvm.ctpop.i17(i17 %x) +declare i32 @llvm.ctpop.i32(i32 %x) +declare i64 @llvm.ctpop.i64(i64 %x) diff --git a/llvm/test/CodeGen/ARM/ssat.ll b/llvm/test/CodeGen/ARM/ssat.ll index f1e11dd33d1fb..a2027435ed291 100644 --- a/llvm/test/CodeGen/ARM/ssat.ll +++ b/llvm/test/CodeGen/ARM/ssat.ll @@ -20,10 +20,10 @@ define i32 @sat_base_32bit(i32 %x) #0 { ; V6T2: ssat r0, #24, r0 ; V4T-NOT: ssat entry: - %cmpLow = icmp slt i32 %x, -8388608 - %cmpUp = icmp sgt i32 %x, 8388607 - %saturateUp = select i1 %cmpUp, i32 8388607, i32 %x - %saturateLow = select i1 %cmpLow, i32 -8388608, i32 %saturateUp + %0 = icmp slt i32 %x, 8388607 + %saturateUp = select i1 %0, i32 %x, i32 8388607 + %1 = icmp sgt i32 %saturateUp, -8388608 + %saturateLow = select i1 %1, i32 %saturateUp, i32 -8388608 ret i32 %saturateLow } @@ -34,10 +34,10 @@ define i16 @sat_base_16bit(i16 %x) #0 { ; V6T2: ssat r0, #12, r0 ; V4T-NOT: ssat entry: - %cmpLow = icmp slt i16 %x, -2048 - %cmpUp = icmp sgt i16 %x, 2047 - %saturateUp = select i1 %cmpUp, i16 2047, i16 %x - %saturateLow = select i1 %cmpLow, i16 -2048, i16 %saturateUp + %0 = icmp slt i16 %x, 2047 + %saturateUp = select i1 %0, i16 %x, i16 2047 + %1 = icmp sgt i16 %saturateUp, -2048 + %saturateLow = select i1 %1, i16 %saturateUp, i16 -2048 ret i16 %saturateLow } @@ -48,10 +48,10 @@ define i8 @sat_base_8bit(i8 %x) #0 { ; V6T2: ssat r0, #6, r0 ; V4T-NOT: ssat entry: - %cmpLow = icmp slt i8 %x, -32 - %cmpUp = icmp sgt i8 %x, 31 - %saturateUp = select i1 %cmpUp, i8 31, i8 %x - %saturateLow = select i1 %cmpLow, i8 -32, i8 %saturateUp + %0 = icmp slt i8 %x, 31 + %saturateUp = select i1 %0, i8 %x, i8 31 + %1 = icmp sgt i8 %saturateUp, -32 + %saturateLow = select i1 %1, i8 %saturateUp, i8 -32 ret i8 %saturateLow } @@ -67,10 +67,10 @@ define i32 @sat_lower_upper_1(i32 %x) #0 { ; V6T2: ssat r0, #24, r0 ; V4T-NOT: ssat entry: - %cmpLow = icmp slt i32 %x, -8388608 %cmpUp = icmp slt i32 %x, 8388607 %saturateUp = select i1 %cmpUp, i32 %x, i32 8388607 - %saturateLow = select i1 %cmpLow, i32 -8388608, i32 %saturateUp + %0 = icmp sgt i32 %saturateUp, -8388608 + %saturateLow = select i1 %0, i32 %saturateUp, i32 -8388608 ret i32 %saturateLow } @@ -80,10 +80,10 @@ define i32 @sat_lower_upper_2(i32 %x) #0 { ; V6T2: ssat r0, #24, r0 ; V4T-NOT: ssat entry: - %cmpLow = icmp sgt i32 %x, -8388608 - %cmpUp = icmp sgt i32 %x, 8388607 - %saturateUp = select i1 %cmpUp, i32 8388607, i32 %x - %saturateLow = select i1 %cmpLow, i32 %saturateUp, i32 -8388608 + %0 = icmp slt i32 %x, 8388607 + %saturateUp = select i1 
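The parity.ll file added above pins down how ctpop(x) & 1 lowers on ARM: once only the low bit of the population count is live, the backend emits the eor/lsr fold seen in the CHECK lines instead of a full popcount, masking narrow inputs first (uxtb, uxth, bfc) so stray high bits cannot reach the fold. A C++ rendering of the 32-bit case, as a sketch of the same bit trick:

// Each step xors the high half of the remaining bits into the low half,
// so after five steps bit 0 holds the xor (parity) of all 32 bits.
unsigned parity32(unsigned x) {
  x ^= x >> 16;
  x ^= x >> 8;
  x ^= x >> 4;
  x ^= x >> 2;
  x ^= x >> 1;
  return x & 1;
}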
%0, i32 %x, i32 8388607 + %1 = icmp sgt i32 %saturateUp, -8388608 + %saturateLow = select i1 %1, i32 %saturateUp, i32 -8388608 ret i32 %saturateLow } @@ -93,10 +93,10 @@ define i32 @sat_upper_lower_1(i32 %x) #0 { ; V6T2: ssat r0, #24, r0 ; V4T-NOT: ssat entry: - %cmpUp = icmp slt i32 %x, 8388607 - %cmpLow = icmp slt i32 %x, -8388608 - %saturateLow = select i1 %cmpLow, i32 -8388608, i32 %x - %saturateUp = select i1 %cmpUp, i32 %saturateLow, i32 8388607 + %0 = icmp sgt i32 %x, -8388608 + %saturateLow = select i1 %0, i32 %x, i32 -8388608 + %1 = icmp slt i32 %saturateLow, 8388607 + %saturateUp = select i1 %1, i32 %saturateLow, i32 8388607 ret i32 %saturateUp } @@ -106,10 +106,10 @@ define i32 @sat_upper_lower_2(i32 %x) #0 { ; V6T2: ssat r0, #24, r0 ; V4T-NOT: ssat entry: - %cmpUp = icmp sgt i32 %x, 8388607 - %cmpLow = icmp slt i32 %x, -8388608 - %saturateLow = select i1 %cmpLow, i32 -8388608, i32 %x - %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow + %0 = icmp sgt i32 %x, -8388608 + %saturateLow = select i1 %0, i32 %x, i32 -8388608 + %1 = icmp slt i32 %saturateLow, 8388607 + %saturateUp = select i1 %1, i32 %saturateLow, i32 8388607 ret i32 %saturateUp } @@ -119,10 +119,10 @@ define i32 @sat_upper_lower_3(i32 %x) #0 { ; V6T2: ssat r0, #24, r0 ; V4T-NOT: ssat entry: - %cmpUp = icmp slt i32 8388607, %x %cmpLow = icmp sgt i32 %x, -8388608 %saturateLow = select i1 %cmpLow, i32 %x, i32 -8388608 - %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow + %0 = icmp slt i32 %saturateLow, 8388607 + %saturateUp = select i1 %0, i32 %saturateLow, i32 8388607 ret i32 %saturateUp } @@ -137,10 +137,10 @@ define i32 @sat_le_ge(i32 %x) #0 { ; V6T2: ssat r0, #24, r0 ; V4T-NOT: ssat entry: - %cmpUp = icmp sle i32 8388607, %x - %cmpLow = icmp sge i32 %x, -8388608 - %saturateLow = select i1 %cmpLow, i32 %x, i32 -8388608 - %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow + %0 = icmp sgt i32 %x, -8388608 + %saturateLow = select i1 %0, i32 %x, i32 -8388608 + %1 = icmp slt i32 %saturateLow, 8388607 + %saturateUp = select i1 %1, i32 %saturateLow, i32 8388607 ret i32 %saturateUp } @@ -156,8 +156,8 @@ define i32 @no_sat_missing_lower(i32 %x) #0 { ; CHECK-NOT: ssat entry: %cmpUp = icmp sgt i32 %x, 8388607 - %cmpLow = icmp sgt i32 %x, -8388608 - %saturateLow = select i1 %cmpLow, i32 -8388608, i32 %x + %0 = icmp slt i32 %x, -8388608 + %saturateLow = select i1 %0, i32 %x, i32 -8388608 %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow ret i32 %saturateUp } @@ -169,8 +169,8 @@ define i32 @no_sat_missing_upper(i32 %x) #0 { ; CHECK-NOT: ssat entry: %cmpUp = icmp slt i32 %x, 8388607 - %cmpLow = icmp slt i32 %x, -8388608 - %saturateLow = select i1 %cmpLow, i32 -8388608, i32 %x + %0 = icmp sgt i32 %x, -8388608 + %saturateLow = select i1 %0, i32 %x, i32 -8388608 %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow ret i32 %saturateUp } @@ -192,10 +192,10 @@ define i32 @no_sat_incorrect_interval(i32 %x) #0 { ; CHECK-LABEL: no_sat_incorrect_interval: ; CHECK-NOT: ssat entry: - %cmpUp = icmp sgt i32 %x, 8388607 - %cmpLow = icmp slt i32 %x, -19088744 - %saturateLow = select i1 %cmpLow, i32 -19088744, i32 %x - %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow + %0 = icmp sgt i32 %x, -19088744 + %saturateLow = select i1 %0, i32 %x, i32 -19088744 + %1 = icmp slt i32 %saturateLow, 8388607 + %saturateUp = select i1 %1, i32 %saturateLow, i32 8388607 ret i32 %saturateUp } diff --git a/llvm/test/CodeGen/ARM/usat.ll b/llvm/test/CodeGen/ARM/usat.ll index 8f19d11ef7bb7..ba4e0dd037649 
100644 --- a/llvm/test/CodeGen/ARM/usat.ll +++ b/llvm/test/CodeGen/ARM/usat.ll @@ -22,10 +22,10 @@ define i32 @unsigned_sat_base_32bit(i32 %x) #0 { ; V6T2: usat r0, #23, r0 ; V4T-NOT: usat entry: - %cmpLow = icmp slt i32 %x, 0 - %cmpUp = icmp sgt i32 %x, 8388607 - %saturateUp = select i1 %cmpUp, i32 8388607, i32 %x - %saturateLow = select i1 %cmpLow, i32 0, i32 %saturateUp + %0 = icmp slt i32 %x, 8388607 + %saturateUp = select i1 %0, i32 %x, i32 8388607 + %1 = icmp sgt i32 %saturateUp, 0 + %saturateLow = select i1 %1, i32 %saturateUp, i32 0 ret i32 %saturateLow } @@ -37,10 +37,10 @@ define i16 @unsigned_sat_base_16bit(i16 %x) #0 { ; V6T2: usat r0, #11, r0 ; V4T-NOT: usat entry: - %cmpLow = icmp slt i16 %x, 0 - %cmpUp = icmp sgt i16 %x, 2047 - %saturateUp = select i1 %cmpUp, i16 2047, i16 %x - %saturateLow = select i1 %cmpLow, i16 0, i16 %saturateUp + %0 = icmp slt i16 %x, 2047 + %saturateUp = select i1 %0, i16 %x, i16 2047 + %1 = icmp sgt i16 %saturateUp, 0 + %saturateLow = select i1 %1, i16 %saturateUp, i16 0 ret i16 %saturateLow } @@ -52,10 +52,10 @@ define i8 @unsigned_sat_base_8bit(i8 %x) #0 { ; V6T2: usat r0, #5, r0 ; V4T-NOT: usat entry: - %cmpLow = icmp slt i8 %x, 0 - %cmpUp = icmp sgt i8 %x, 31 - %saturateUp = select i1 %cmpUp, i8 31, i8 %x - %saturateLow = select i1 %cmpLow, i8 0, i8 %saturateUp + %0 = icmp slt i8 %x, 31 + %saturateUp = select i1 %0, i8 %x, i8 31 + %1 = icmp sgt i8 %saturateUp, 0 + %saturateLow = select i1 %1, i8 %saturateUp, i8 0 ret i8 %saturateLow } @@ -71,10 +71,10 @@ define i32 @unsigned_sat_lower_upper_1(i32 %x) #0 { ; V6T2: usat r0, #23, r0 ; V4T-NOT: usat entry: - %cmpLow = icmp slt i32 %x, 0 %cmpUp = icmp slt i32 %x, 8388607 %saturateUp = select i1 %cmpUp, i32 %x, i32 8388607 - %saturateLow = select i1 %cmpLow, i32 0, i32 %saturateUp + %0 = icmp sgt i32 %saturateUp, 0 + %saturateLow = select i1 %0, i32 %saturateUp, i32 0 ret i32 %saturateLow } @@ -85,10 +85,10 @@ define i32 @unsigned_sat_lower_upper_2(i32 %x) #0 { ; V6T2: usat r0, #23, r0 ; V4T-NOT: usat entry: - %cmpLow = icmp sgt i32 %x, 0 - %cmpUp = icmp sgt i32 %x, 8388607 - %saturateUp = select i1 %cmpUp, i32 8388607, i32 %x - %saturateLow = select i1 %cmpLow, i32 %saturateUp, i32 0 + %0 = icmp slt i32 %x, 8388607 + %saturateUp = select i1 %0, i32 %x, i32 8388607 + %1 = icmp sgt i32 %saturateUp, 0 + %saturateLow = select i1 %1, i32 %saturateUp, i32 0 ret i32 %saturateLow } @@ -99,10 +99,10 @@ define i32 @unsigned_sat_upper_lower_1(i32 %x) #0 { ; V6T2: usat r0, #23, r0 ; V4T-NOT: usat entry: - %cmpUp = icmp slt i32 %x, 8388607 - %cmpLow = icmp slt i32 %x, 0 - %saturateLow = select i1 %cmpLow, i32 0, i32 %x - %saturateUp = select i1 %cmpUp, i32 %saturateLow, i32 8388607 + %0 = icmp sgt i32 %x, 0 + %saturateLow = select i1 %0, i32 %x, i32 0 + %1 = icmp slt i32 %saturateLow, 8388607 + %saturateUp = select i1 %1, i32 %saturateLow, i32 8388607 ret i32 %saturateUp } @@ -113,10 +113,10 @@ define i32 @unsigned_sat_upper_lower_2(i32 %x) #0 { ; V6T2: usat r0, #23, r0 ; V4T-NOT: usat entry: - %cmpUp = icmp sgt i32 %x, 8388607 - %cmpLow = icmp slt i32 %x, 0 - %saturateLow = select i1 %cmpLow, i32 0, i32 %x - %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow + %0 = icmp sgt i32 %x, 0 + %saturateLow = select i1 %0, i32 %x, i32 0 + %1 = icmp slt i32 %saturateLow, 8388607 + %saturateUp = select i1 %1, i32 %saturateLow, i32 8388607 ret i32 %saturateUp } @@ -127,10 +127,10 @@ define i32 @unsigned_sat_upper_lower_3(i32 %x) #0 { ; V6T2: usat r0, #23, r0 ; V4T-NOT: usat entry: - %cmpUp = icmp slt i32 8388607, 
%x %cmpLow = icmp sgt i32 %x, 0 %saturateLow = select i1 %cmpLow, i32 %x, i32 0 - %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow + %0 = icmp slt i32 %saturateLow, 8388607 + %saturateUp = select i1 %0, i32 %saturateLow, i32 8388607 ret i32 %saturateUp } @@ -145,8 +145,8 @@ define i32 @no_unsigned_sat_missing_lower(i32 %x) #0 { ; CHECK-NOT: usat entry: %cmpUp = icmp sgt i32 %x, 8388607 - %cmpLow = icmp sgt i32 %x, 0 - %saturateLow = select i1 %cmpLow, i32 0, i32 %x + %0 = icmp slt i32 %x, 0 + %saturateLow = select i1 %0, i32 %x, i32 0 %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow ret i32 %saturateUp } @@ -158,8 +158,8 @@ define i32 @no_unsigned_sat_missing_upper(i32 %x) #0 { ; CHECK-NOT: usat entry: %cmpUp = icmp slt i32 %x, 8388607 - %cmpLow = icmp slt i32 %x, 0 - %saturateLow = select i1 %cmpLow, i32 0, i32 %x + %0 = icmp sgt i32 %x, 0 + %saturateLow = select i1 %0, i32 %x, i32 0 %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow ret i32 %saturateUp } @@ -169,10 +169,22 @@ define i32 @no_unsigned_sat_incorrect_constant(i32 %x) #0 { ; CHECK-LABEL: no_unsigned_sat_incorrect_constant: ; CHECK-NOT: usat entry: - %cmpUp = icmp sgt i32 %x, 8388607 - %cmpLow = icmp slt i32 %x, 0 - %saturateLow = select i1 %cmpLow, i32 -1, i32 %x - %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow + %cmpLow.inv = icmp sgt i32 %x, -1 + %saturateLow = select i1 %cmpLow.inv, i32 %x, i32 -1 + %0 = icmp slt i32 %saturateLow, 8388607 + %saturateUp = select i1 %0, i32 %saturateLow, i32 8388607 + ret i32 %saturateUp +} + +; The interval is [0, k] but k+1 is not a power of 2 +define i32 @no_unsigned_sat_incorrect_constant2(i32 %x) #0 { +; CHECK-LABEL: no_unsigned_sat_incorrect_constant2: +; CHECK-NOT: usat +entry: + %0 = icmp sgt i32 %x, 0 + %saturateLow = select i1 %0, i32 %x, i32 0 + %1 = icmp slt i32 %saturateLow, 8388609 + %saturateUp = select i1 %1, i32 %saturateLow, i32 8388609 ret i32 %saturateUp } @@ -181,10 +193,10 @@ define i32 @no_unsigned_sat_incorrect_interval(i32 %x) #0 { ; CHECK-LABEL: no_unsigned_sat_incorrect_interval: ; CHECK-NOT: usat entry: - %cmpUp = icmp sgt i32 %x, 8388607 - %cmpLow = icmp slt i32 %x, -4 - %saturateLow = select i1 %cmpLow, i32 -4, i32 %x - %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow + %0 = icmp sgt i32 %x, -4 + %saturateLow = select i1 %0, i32 %x, i32 -4 + %1 = icmp slt i32 %saturateLow, 8388607 + %saturateUp = select i1 %1, i32 %saturateLow, i32 8388607 ret i32 %saturateUp } diff --git a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll index f3eeb11a17fd2..aaa376a0ba6e9 100644 --- a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll @@ -1,27 +1,59 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=-neon | FileCheck %s --check-prefix=CHECK +declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half, <4 x half>) declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>) declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128, <2 x fp128>) +define half @test_v4f16(<4 x half> %a) nounwind { +; CHECK-LABEL: test_v4f16: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: 
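The ssat.ll and usat.ll hunks rewrite every clamp into the canonical two-select chain the SSAT/USAT matchers recognize: clamp against one bound, then clamp the already-clamped value against the other. The shapes reduce to the C++ sketch below; ssat #24 realizes the signed interval [-2^23, 2^23-1] and usat #23 the unsigned interval [0, 2^23-1], which is why the no_*_incorrect_constant and no_*_incorrect_interval tests must keep failing to match.

// Signed saturate to 24 bits, the shape behind "ssat r0, #24, r0".
int ssat24(int x) {
  int t = x < 8388607 ? x : 8388607;    // upper bound 2^23 - 1
  return t > -8388608 ? t : -8388608;   // lower bound -2^23
}

// Unsigned saturate to 23 bits, the shape behind "usat r0, #23, r0".
int usat23(int x) {
  int t = x < 8388607 ? x : 8388607;    // upper bound 2^23 - 1
  return t > 0 ? t : 0;                 // lower bound 0
}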
mov r4, #255 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: orr r4, r4, #65280 +; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: and r0, r3, r4 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: bl __aeabi_h2f +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: and r0, r5, r4 +; CHECK-NEXT: bl __aeabi_h2f +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: and r0, r7, r4 +; CHECK-NEXT: bl __aeabi_h2f +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: and r0, r6, r4 +; CHECK-NEXT: bl __aeabi_h2f +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: bl __aeabi_fadd +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: bl __aeabi_fadd +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: bl __aeabi_fadd +; CHECK-NEXT: bl __aeabi_f2h +; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: mov pc, lr + %b = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half 0.0, <4 x half> %a) + ret half %b +} + define float @test_v4f32(<4 x float> %a) nounwind { ; CHECK-LABEL: test_v4f32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: .save {r4, r5, r11, lr} +; CHECK-NEXT: push {r4, r5, r11, lr} ; CHECK-NEXT: mov r4, r3 +; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: bl __aeabi_fadd -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_fadd -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_fadd -; CHECK-NEXT: pop {r4, r5, r6, lr} +; CHECK-NEXT: pop {r4, r5, r11, lr} ; CHECK-NEXT: mov pc, lr %b = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a) ret float %b diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll new file mode 100644 index 0000000000000..586a02b92bf3c --- /dev/null +++ b/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=arm-none-eabi -mattr=-neon | FileCheck %s --check-prefix=CHECK + +declare half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half>) +declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) +declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>) +declare fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128>) + +define half @test_v4f16(<4 x half> %a) nounwind { +; CHECK-LABEL: test_v4f16: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: mov r4, #255 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: orr r4, r4, #65280 +; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: and r0, r3, r4 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: bl __aeabi_h2f +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: and r0, r5, r4 +; CHECK-NEXT: bl __aeabi_h2f +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: and r0, r7, r4 +; CHECK-NEXT: bl __aeabi_h2f +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: and r0, r6, r4 +; CHECK-NEXT: bl __aeabi_h2f +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: bl fmaxf +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: bl fmaxf +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: bl fmaxf +; CHECK-NEXT: bl __aeabi_f2h +; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: mov pc, lr + %b = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %a) + ret half %b +} + +define float @test_v4f32(<4 x float> %a) nounwind { +; CHECK-LABEL: 
test_v4f32: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r11, lr} +; CHECK-NEXT: push {r4, r5, r11, lr} +; CHECK-NEXT: mov r4, r3 +; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: bl fmaxf +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: bl fmaxf +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: bl fmaxf +; CHECK-NEXT: pop {r4, r5, r11, lr} +; CHECK-NEXT: mov pc, lr + %b = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a) + ret float %b +} + +define double @test_v2f64(<2 x double> %a) nounwind { +; CHECK-LABEL: test_v2f64: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl fmax +; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: mov pc, lr + %b = call fast double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a) + ret double %b +} + +define fp128 @test_v2f128(<2 x fp128> %a) nounwind { +; CHECK-LABEL: test_v2f128: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: ldr r12, [sp, #36] +; CHECK-NEXT: str r12, [sp, #12] +; CHECK-NEXT: ldr r12, [sp, #32] +; CHECK-NEXT: str r12, [sp, #8] +; CHECK-NEXT: ldr r12, [sp, #28] +; CHECK-NEXT: str r12, [sp, #4] +; CHECK-NEXT: ldr r12, [sp, #24] +; CHECK-NEXT: str r12, [sp] +; CHECK-NEXT: bl fmaxl +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: mov pc, lr + %b = call fast fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a) + ret fp128 %b +} diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll new file mode 100644 index 0000000000000..b64e4473981bb --- /dev/null +++ b/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=arm-none-eabi -mattr=-neon | FileCheck %s --check-prefix=CHECK + +declare half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half>) +declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) +declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>) +declare fp128 @llvm.experimental.vector.reduce.fmin.v2f128(<2 x fp128>) + +define half @test_v4f16(<4 x half> %a) nounwind { +; CHECK-LABEL: test_v4f16: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: mov r4, #255 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: orr r4, r4, #65280 +; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: and r0, r3, r4 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: bl __aeabi_h2f +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: and r0, r5, r4 +; CHECK-NEXT: bl __aeabi_h2f +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: and r0, r7, r4 +; CHECK-NEXT: bl __aeabi_h2f +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: and r0, r6, r4 +; CHECK-NEXT: bl __aeabi_h2f +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: bl fminf +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: bl fminf +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: bl fminf +; CHECK-NEXT: bl __aeabi_f2h +; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: mov pc, lr + %b = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %a) + ret half %b +} + +define float @test_v4f32(<4 x float> %a) nounwind { +; CHECK-LABEL: test_v4f32: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r11, lr} +; CHECK-NEXT: push {r4, r5, r11, lr} +; CHECK-NEXT: mov r4, r3 +; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: bl fminf +; CHECK-NEXT: mov r1, r5 
+; CHECK-NEXT: bl fminf +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: bl fminf +; CHECK-NEXT: pop {r4, r5, r11, lr} +; CHECK-NEXT: mov pc, lr + %b = call fast float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a) + ret float %b +} + +define double @test_v2f64(<2 x double> %a) nounwind { +; CHECK-LABEL: test_v2f64: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl fmin +; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: mov pc, lr + %b = call fast double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a) + ret double %b +} + +define fp128 @test_v2f128(<2 x fp128> %a) nounwind { +; CHECK-LABEL: test_v2f128: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: ldr r12, [sp, #36] +; CHECK-NEXT: str r12, [sp, #12] +; CHECK-NEXT: ldr r12, [sp, #32] +; CHECK-NEXT: str r12, [sp, #8] +; CHECK-NEXT: ldr r12, [sp, #28] +; CHECK-NEXT: str r12, [sp, #4] +; CHECK-NEXT: ldr r12, [sp, #24] +; CHECK-NEXT: str r12, [sp] +; CHECK-NEXT: bl fminl +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: mov pc, lr + %b = call fast fp128 @llvm.experimental.vector.reduce.fmin.v2f128(<2 x fp128> %a) + ret fp128 %b +} diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll new file mode 100644 index 0000000000000..62111e5f0f342 --- /dev/null +++ b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=arm-none-eabi -mattr=-neon | FileCheck %s --check-prefix=CHECK + +declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half, <4 x half>) +declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) +declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double, <2 x double>) +declare fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128, <2 x fp128>) + +define half @test_v4f16(<4 x half> %a) nounwind { +; CHECK-LABEL: test_v4f16: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: mov r4, #255 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: orr r4, r4, #65280 +; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: and r0, r3, r4 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: bl __aeabi_h2f +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: and r0, r5, r4 +; CHECK-NEXT: bl __aeabi_h2f +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: and r0, r7, r4 +; CHECK-NEXT: bl __aeabi_h2f +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: and r0, r6, r4 +; CHECK-NEXT: bl __aeabi_h2f +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: bl __aeabi_fmul +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: bl __aeabi_fmul +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: bl __aeabi_fmul +; CHECK-NEXT: bl __aeabi_f2h +; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: mov pc, lr + %b = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half 1.0, <4 x half> %a) + ret half %b +} + +define float @test_v4f32(<4 x float> %a) nounwind { +; CHECK-LABEL: test_v4f32: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r11, lr} +; CHECK-NEXT: push {r4, r5, r11, lr} +; CHECK-NEXT: mov r4, r3 +; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: bl __aeabi_fmul +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: bl __aeabi_fmul +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: bl __aeabi_fmul +; CHECK-NEXT: pop 
{r4, r5, r11, lr} +; CHECK-NEXT: mov pc, lr + %b = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a) + ret float %b +} + +define double @test_v2f64(<2 x double> %a) nounwind { +; CHECK-LABEL: test_v2f64: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: mov pc, lr + %b = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a) + ret double %b +} + +define fp128 @test_v2f128(<2 x fp128> %a) nounwind { +; CHECK-LABEL: test_v2f128: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: ldr r12, [sp, #36] +; CHECK-NEXT: str r12, [sp, #12] +; CHECK-NEXT: ldr r12, [sp, #32] +; CHECK-NEXT: str r12, [sp, #8] +; CHECK-NEXT: ldr r12, [sp, #28] +; CHECK-NEXT: str r12, [sp, #4] +; CHECK-NEXT: ldr r12, [sp, #24] +; CHECK-NEXT: str r12, [sp] +; CHECK-NEXT: bl __multf3 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: mov pc, lr + %b = call fast fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128 0xL00000000000000003fff00000000000000, <2 x fp128> %a) + ret fp128 %b +} diff --git a/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll b/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll index 11abf902eeb3a..e0e3149e35119 100644 --- a/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll +++ b/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll @@ -93,8 +93,8 @@ define float @fadd_f32(<4 x float> %vec) { ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = fadd fast float 0.000000e+00, [[TMP0]] -; CHECK-NEXT: ret float [[TMP1]] +; CHECK-NEXT: [[BIN_RDX3:%.*]] = fadd fast float 0.000000e+00, [[TMP0]] +; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %vec) @@ -109,8 +109,8 @@ define float @fadd_f32_accum(float %accum, <4 x float> %vec) { ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = fadd fast float %accum, [[TMP0]] -; CHECK-NEXT: ret float [[TMP1]] +; CHECK-NEXT: [[BIN_RDX3:%.*]] = fadd fast float [[ACCUM:%.*]], [[TMP0]] +; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %accum, <4 x float> %vec) @@ -161,8 +161,8 @@ define float @fmul_f32(<4 x float> %vec) { ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = fmul fast float 1.000000e+00, [[TMP0]] -; CHECK-NEXT: ret float [[TMP1]] +; CHECK-NEXT: [[BIN_RDX3:%.*]] = fmul fast float 1.000000e+00, [[TMP0]] +; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: %r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> 
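With -mattr=-neon these vecreduce tests all legalize to libcall chains: each f16 lane is masked to 16 bits, widened with __aeabi_h2f, folded one lane at a time (__aeabi_fadd, __aeabi_fmul, fmaxf, fminf), and the result narrowed with __aeabi_f2h. Below is a scalar C++ model of the v4f16 fmax case; the extern declarations mirror the AEABI helper names from the CHECK lines, with assumed uint16_t-based signatures, and the fast flag is what permits folding the lanes in any order.

#include <math.h>
#include <stdint.h>

extern "C" float __aeabi_h2f(uint16_t);   // half -> float runtime helper
extern "C" uint16_t __aeabi_f2h(float);   // float -> half runtime helper

uint16_t fmax_v4f16(const uint16_t h[4]) {
  float acc = __aeabi_h2f(h[0]);
  for (int i = 1; i < 4; ++i)
    acc = fmaxf(acc, __aeabi_h2f(h[i])); // one libcall per lane, as in the CHECKs
  return __aeabi_f2h(acc);
}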
%vec) @@ -177,8 +177,8 @@ define float @fmul_f32_accum(float %accum, <4 x float> %vec) { ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = fmul fast float %accum, [[TMP0]] -; CHECK-NEXT: ret float [[TMP1]] +; CHECK-NEXT: [[BIN_RDX3:%.*]] = fmul fast float [[ACCUM:%.*]], [[TMP0]] +; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: %r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %accum, <4 x float> %vec) @@ -277,40 +277,40 @@ entry: ret i64 %r } +; FIXME: Expand using maxnum intrinsic? + define double @fmax_f64(<2 x double> %vec) { ; CHECK-LABEL: @fmax_f64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[VEC:%.*]], <2 x double> undef, <2 x i32> -; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <2 x double> [[VEC]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select fast <2 x i1> [[RDX_MINMAX_CMP]], <2 x double> [[VEC]], <2 x double> [[RDX_SHUF]] -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[RDX_MINMAX_SELECT]], i32 0 -; CHECK-NEXT: ret double [[TMP0]] +; CHECK-NEXT: [[R:%.*]] = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> [[VEC:%.*]]) +; CHECK-NEXT: ret double [[R]] ; entry: %r = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %vec) ret double %r } +; FIXME: Expand using minnum intrinsic? + define double @fmin_f64(<2 x double> %vec) { ; CHECK-LABEL: @fmin_f64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[VEC:%.*]], <2 x double> undef, <2 x i32> -; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <2 x double> [[VEC]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select fast <2 x i1> [[RDX_MINMAX_CMP]], <2 x double> [[VEC]], <2 x double> [[RDX_SHUF]] -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[RDX_MINMAX_SELECT]], i32 0 -; CHECK-NEXT: ret double [[TMP0]] +; CHECK-NEXT: [[R:%.*]] = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> [[VEC:%.*]]) +; CHECK-NEXT: ret double [[R]] ; entry: %r = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %vec) ret double %r } +; FIXME: Why is this not expanded? + ; Test when the vector size is not power of two. define i8 @test_v3i8(<3 x i8> %a) nounwind { ; CHECK-LABEL: @test_v3i8( ; CHECK-NEXT: entry: -; CHECK-NEXT: %b = call i8 @llvm.experimental.vector.reduce.and.v3i8(<3 x i8> %a) -; CHECK-NEXT: ret i8 %b +; CHECK-NEXT: [[B:%.*]] = call i8 @llvm.experimental.vector.reduce.and.v3i8(<3 x i8> [[A:%.*]]) +; CHECK-NEXT: ret i8 [[B]] ; entry: %b = call i8 @llvm.experimental.vector.reduce.and.i8.v3i8(<3 x i8> %a) diff --git a/llvm/test/CodeGen/Hexagon/autohvx/isel-split-masked.ll b/llvm/test/CodeGen/Hexagon/autohvx/isel-split-masked.ll new file mode 100644 index 0000000000000..61bcbce6e6422 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/isel-split-masked.ll @@ -0,0 +1,32 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; Check that this compiles successfully. 
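expand-experimental-reductions.ll documents the generic expansion of the fadd/fmul reduction intrinsics: log2(n) shuffle-and-combine steps, then one scalar operation against the accumulator operand (0.0 for fadd, 1.0 for fmul); fmax/fmin are now left as intrinsic calls, per the new FIXMEs. The halving schedule for a 4-lane fadd, written out in scalar C++:

// Step 1 pairs lane i with lane i+2 (the <2,3,undef,undef> shuffle),
// step 2 pairs lane 0 with lane 1, and the final add folds in the
// accumulator operand of the intrinsic.
float expand_fadd_v4(float v[4], float accum) {
  v[0] += v[2];
  v[1] += v[3];        // lanes 0..1 now hold partial sums
  v[0] += v[1];        // lane 0 holds the full vector sum
  return accum + v[0];
}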
+; CHECK: vmem + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +define void @f0() #0 { +b0: + %v0 = call <64 x i32> @llvm.masked.load.v64i32.p0v64i32(<64 x i32>* nonnull undef, i32 4, <64 x i1> , <64 x i32> undef) + %v1 = icmp sgt <64 x i32> %v0, zeroinitializer + %v2 = sext <64 x i1> %v1 to <64 x i32> + %v3 = add nsw <64 x i32> zeroinitializer, %v2 + %v4 = add nsw <64 x i32> %v3, zeroinitializer + %v5 = icmp sgt <64 x i32> %v4, zeroinitializer + %v6 = select <64 x i1> %v5, <64 x i32> %v4, <64 x i32> zeroinitializer + %v7 = select <64 x i1> zeroinitializer, <64 x i32> undef, <64 x i32> %v6 + %v8 = trunc <64 x i32> %v7 to <64 x i16> + call void @llvm.masked.store.v64i16.p0v64i16(<64 x i16> %v8, <64 x i16>* undef, i32 2, <64 x i1> ) + ret void +} + +; Function Attrs: argmemonly nounwind readonly willreturn +declare <64 x i32> @llvm.masked.load.v64i32.p0v64i32(<64 x i32>*, i32 immarg, <64 x i1>, <64 x i32>) #1 + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.masked.store.v64i16.p0v64i16(<64 x i16>, <64 x i16>*, i32 immarg, <64 x i1>) #2 + +attributes #0 = { "target-features"="+hvx-length128b,+hvxv67,+v67,-long-calls" } +attributes #1 = { argmemonly nounwind readonly willreturn } +attributes #2 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/Hexagon/autohvx/isel-truncate-legal.ll b/llvm/test/CodeGen/Hexagon/autohvx/isel-truncate-legal.ll new file mode 100644 index 0000000000000..e9c7f9cce771e --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/isel-truncate-legal.ll @@ -0,0 +1,34 @@ +; RUN: llc -march=hexagon -hexagon-hvx-widen=32 < %s | FileCheck %s + +; Truncating a type-to-be-widened to a legal type (v8i8). +; Check that this compiles successfully. +; CHECK-LABEL: f0: +; CHECK: dealloc_return + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +define dllexport void @f0(i8* %a0) local_unnamed_addr #0 { +b0: + %v0 = load i8, i8* undef, align 1 + %v1 = zext i8 %v0 to i16 + %v2 = add i16 0, %v1 + %v3 = icmp sgt i16 %v2, 1 + %v4 = select i1 %v3, i16 %v2, i16 1 + %v5 = udiv i16 -32768, %v4 + %v6 = zext i16 %v5 to i32 + %v7 = insertelement <8 x i32> undef, i32 %v6, i32 0 + %v8 = shufflevector <8 x i32> %v7, <8 x i32> undef, <8 x i32> zeroinitializer + %v9 = load <8 x i16>, <8 x i16>* undef, align 2 + %v10 = sext <8 x i16> %v9 to <8 x i32> + %v11 = mul nsw <8 x i32> %v8, %v10 + %v12 = add nsw <8 x i32> %v11, + %v13 = lshr <8 x i32> %v12, + %v14 = trunc <8 x i32> %v13 to <8 x i8> + %v15 = getelementptr inbounds i8, i8* %a0, i32 undef + %v16 = bitcast i8* %v15 to <8 x i8>* + store <8 x i8> %v14, <8 x i8>* %v16, align 1 + ret void +} + +attributes #0 = { "target-features"="+hvx,+hvx-length128b" } diff --git a/llvm/test/CodeGen/Hexagon/autohvx/isel-widen-truncate-illegal-elem.ll b/llvm/test/CodeGen/Hexagon/autohvx/isel-widen-truncate-illegal-elem.ll new file mode 100644 index 0000000000000..3f55d22308c3d --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/isel-widen-truncate-illegal-elem.ll @@ -0,0 +1,34 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; Check that this does not crash.
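isel-split-masked.ll above feeds HVX a 64-lane masked load and masked store, which the backend must split into legal halves without crashing. Per lane, the two intrinsics behave like the C++ sketch below; the mask is written as an explicit argument for illustration, whereas the test uses constant masks.

#include <stdint.h>

// masked.load: active lanes read memory, inactive lanes take the passthru
// value. masked.store: inactive lanes leave memory untouched.
void masked_store_v64i16(int16_t *p, const int16_t *v, const bool *m) {
  for (int i = 0; i < 64; ++i)
    if (m[i]) p[i] = v[i];
}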
+; CHECK: vmem + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +define dso_local void @f0() local_unnamed_addr #0 { +b0: + %v0 = load i32, i32* undef, align 4 + %v1 = select i1 undef, i32 0, i32 1073741823 + %v2 = shl i32 %v1, 0 + %v3 = sext i32 %v0 to i64 + %v4 = sext i32 %v2 to i64 + %v5 = mul nsw i64 %v4, %v3 + %v6 = lshr i64 %v5, 32 + %v7 = trunc i64 %v6 to i32 + %v8 = sext i32 %v7 to i64 + %v9 = insertelement <32 x i64> undef, i64 %v8, i32 0 + %v10 = shufflevector <32 x i64> %v9, <32 x i64> undef, <32 x i32> zeroinitializer + %v11 = getelementptr i32, i32* null, i32 32 + %v12 = bitcast i32* %v11 to <32 x i32>* + %v13 = load <32 x i32>, <32 x i32>* %v12, align 4 + %v14 = shl <32 x i32> %v13, zeroinitializer + %v15 = sext <32 x i32> %v14 to <32 x i64> + %v16 = mul nsw <32 x i64> %v10, %v15 + %v17 = lshr <32 x i64> %v16, + %v18 = trunc <32 x i64> %v17 to <32 x i32> + store <32 x i32> %v18, <32 x i32>* %v12, align 4 + ret void +} + +attributes #0 = { "target-features"="+hvx-length128b,+hvxv67,+v67,-long-calls" } diff --git a/llvm/test/CodeGen/Hexagon/autohvx/isel-widen-truncate-pair.ll b/llvm/test/CodeGen/Hexagon/autohvx/isel-widen-truncate-pair.ll new file mode 100644 index 0000000000000..23e8b590b2d8a --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/isel-widen-truncate-pair.ll @@ -0,0 +1,17 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; This has a v32i8 = truncate v16i32 (64b mode), which was legalized to +; 64i8 = vpackl v32i32, for which there were no selection patterns provided. +; Check that we generate vpackeh->vpackeb for this. + +; CHECK-LABEL: fred: +; CHECK: v[[V0:[0-9]+]].h = vpacke(v1.w,v0.w) +; CHECK: = vpacke({{.*}},v[[V0]].h) +define void @fred(<32 x i8>* %a0, <32 x i32> %a1) #0 { + %v0 = trunc <32 x i32> %a1 to <32 x i8> + store <32 x i8> %v0, <32 x i8>* %a0, align 32 + ret void +} + +attributes #0 = { "target-cpu"="hexagonv65" "target-features"="+hvx,+hvx-length64b" } + diff --git a/llvm/test/CodeGen/Hexagon/autohvx/widen-ext.ll b/llvm/test/CodeGen/Hexagon/autohvx/widen-ext.ll new file mode 100644 index 0000000000000..eb4f115220820 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/widen-ext.ll @@ -0,0 +1,99 @@ +; RUN: llc -march=hexagon -hexagon-hvx-widen=32 < %s | FileCheck %s + +; v32i8 -> v32i16 +; CHECK-LABEL: f0: +; CHECK: r[[R0:[0-9]+]] = #64 +; CHECK: v[[V0:[0-9]+]] = vmem(r0+#0) +; CHECK: v[[V1:[0-9]+]]:[[V2:[0-9]+]].h = vunpack(v[[V0]].b) +; CHECK: q[[Q0:[0-3]]] = vsetq(r[[R0]]) +; CHECK: if (q[[Q0]]) vmem(r1+#0) = v[[V2]] +define void @f0(<32 x i8>* %a0, <32 x i16>* %a1) #0 { + %v0 = load <32 x i8>, <32 x i8>* %a0, align 128 + %v1 = sext <32 x i8> %v0 to <32 x i16> + store <32 x i16> %v1, <32 x i16>* %a1, align 128 + ret void +} + +; v32i8 -> v32i32 +; CHECK-LABEL: f1: +; CHECK: v[[V0:[0-9]+]] = vmem(r0+#0) +; CHECK: v[[V1:[0-9]+]]:[[V2:[0-9]+]].h = vunpack(v[[V0]].b) +; CHECK: v[[V3:[0-9]+]]:[[V4:[0-9]+]].w = vunpack(v[[V2]].h) +; CHECK: vmem(r1+#0) = v[[V4]] +define void @f1(<32 x i8>* %a0, <32 x i32>* %a1) #0 { + %v0 = load <32 x i8>, <32 x i8>* %a0, align 128 + %v1 = sext <32 x i8> %v0 to <32 x i32> + store <32 x i32> %v1, <32 x i32>* %a1, align 128 + ret void +} + +; v64i8 -> v64i16 +; CHECK-LABEL: f2: +; CHECK: v[[V0:[0-9]+]] = vmem(r0+#0) +; CHECK: v[[V1:[0-9]+]]:[[V2:[0-9]+]].h = vunpack(v[[V0]].b) +; CHECK: vmem(r1+#0) = v[[V2]] +define void @f2(<64 x i8>* %a0, <64 x i16>* %a1) #0 { 
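widen-ext.ll covers sign extension of vectors narrower than an HVX register under -hexagon-hvx-widen: each widening step is a vunpack (b to h, then h to w), and when the result fills only part of a 128-byte register the store is predicated with vsetq so the dead tail is never written. A scalar model of f1 (v32i8 to v32i32):

#include <stdint.h>

// Two unpack steps, mirroring "vunpack(v.b)" then "vunpack(v.h)".
void sext_v32i8_to_v32i32(const int8_t in[32], int32_t out[32]) {
  int16_t mid[32];
  for (int i = 0; i < 32; ++i) mid[i] = in[i];   // .b -> .h
  for (int i = 0; i < 32; ++i) out[i] = mid[i];  // .h -> .w
}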
+ %v0 = load <64 x i8>, <64 x i8>* %a0, align 128 + %v1 = sext <64 x i8> %v0 to <64 x i16> + store <64 x i16> %v1, <64 x i16>* %a1, align 128 + ret void +} + +; v64i8 -> v64i32 +; CHECK-LABEL: f3: +; CHECK: v[[V0:[0-9]+]] = vmem(r0+#0) +; CHECK: v[[V1:[0-9]+]]:[[V2:[0-9]+]].h = vunpack(v[[V0]].b) +; CHECK: v[[V3:[0-9]+]]:[[V4:[0-9]+]].w = vunpack(v[[V2]].h) +; CHECK-DAG: vmem(r1+#0) = v[[V4]] +; CHECK-DAG: vmem(r1+#1) = v[[V3]] +define void @f3(<64 x i8>* %a0, <64 x i32>* %a1) #0 { + %v0 = load <64 x i8>, <64 x i8>* %a0, align 128 + %v1 = sext <64 x i8> %v0 to <64 x i32> + store <64 x i32> %v1, <64 x i32>* %a1, align 128 + ret void +} + +; v16i16 -> v16i32 +; CHECK-LABEL: f4: +; CHECK: r[[R0:[0-9]+]] = #64 +; CHECK: v[[V0:[0-9]+]] = vmem(r0+#0) +; CHECK: v[[V1:[0-9]+]]:[[V2:[0-9]+]].w = vunpack(v[[V0]].h) +; CHECK: q[[Q0:[0-3]]] = vsetq(r[[R0]]) +; CHECK: if (q[[Q0]]) vmem(r1+#0) = v[[V2]] +define void @f4(<16 x i16>* %a0, <16 x i32>* %a1) #0 { + %v0 = load <16 x i16>, <16 x i16>* %a0, align 128 + %v1 = sext <16 x i16> %v0 to <16 x i32> + store <16 x i32> %v1, <16 x i32>* %a1, align 128 + ret void +} + +; v32i16 -> v32i32 +; CHECK-LABEL: f5: +; CHECK: v[[V0:[0-9]+]] = vmem(r0+#0) +; CHECK: v[[V1:[0-9]+]]:[[V2:[0-9]+]].w = vunpack(v[[V0]].h) +; CHECK: vmem(r1+#0) = v[[V2]] +define void @f5(<32 x i16>* %a0, <32 x i32>* %a1) #0 { + %v0 = load <32 x i16>, <32 x i16>* %a0, align 128 + %v1 = sext <32 x i16> %v0 to <32 x i32> + store <32 x i32> %v1, <32 x i32>* %a1, align 128 + ret void +} + +; v8i8 -> v8i32 +; CHECK-LABEL: f6: +; CHECK: r[[R0:[0-9]+]]:[[R1:[0-9]+]] = memd(r0+#0) +; CHECK-DAG: v[[V0:[0-9]+]].w = vinsert(r[[R0]]) +; CHECK-DAG: v[[V0]].w = vinsert(r[[R1]]) +; CHECK-DAG: q[[Q0:[0-3]]] = vsetq +; CHECK: v[[V1:[0-9]+]]:[[V2:[0-9]+]].h = vunpack(v[[V0]].b) +; CHECK: v[[V3:[0-9]+]]:[[V4:[0-9]+]].w = vunpack(v[[V2]].h) +; CHECK: if (q[[Q0]]) vmem(r1+#0) = v[[V4]] +define void @f6(<8 x i8>* %a0, <8 x i32>* %a1) #0 { + %v0 = load <8 x i8>, <8 x i8>* %a0, align 128 + %v1 = sext <8 x i8> %v0 to <8 x i32> + store <8 x i32> %v1, <8 x i32>* %a1, align 128 + ret void +} + +attributes #0 = { "target-cpu"="hexagonv65" "target-features"="+hvx,+hvx-length128b,-packets" } + diff --git a/llvm/test/CodeGen/Hexagon/autohvx/widen-trunc.ll b/llvm/test/CodeGen/Hexagon/autohvx/widen-trunc.ll new file mode 100644 index 0000000000000..71e24bd0d6c0d --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/widen-trunc.ll @@ -0,0 +1,107 @@ +; RUN: llc -march=hexagon -hexagon-hvx-widen=32 < %s | FileCheck %s + +; If the "rx = #N, vsetq(rx)" get reordered with the rest, update the test. 
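widen-trunc.ll is the mirror image: truncations keep the low bytes of each lane via vdeal/vpacke, and results narrower than a register are stored under a vsetq(#N) predicate, as the comment above warns. A scalar model of f3, where a vector pair is narrowed in two pack steps, w to h across the pair and then h to b:

#include <stdint.h>

void trunc_v64i32_to_v64i8(const int32_t in[64], uint8_t out[64]) {
  int16_t mid[64];
  for (int i = 0; i < 64; ++i) mid[i] = (int16_t)in[i];    // vpacke .w -> .h
  for (int i = 0; i < 64; ++i) out[i] = (uint8_t)mid[i];   // vpacke .h -> .b
}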
+ +; v32i16 -> v32i8 +; CHECK-LABEL: f0: +; CHECK: r[[R0:[0-9]+]] = #32 +; CHECK: v[[V0:[0-9]+]] = vmem(r0+#0) +; CHECK: v[[V1:[0-9]+]].b = vdeal(v[[V0]].b) +; CHECK: q[[Q0:[0-3]]] = vsetq(r[[R0]]) +; CHECK: if (q[[Q0]]) vmem(r1+#0) = v[[V1]] +define void @f0(<32 x i16>* %a0, <32 x i8>* %a1) #0 { + %v0 = load <32 x i16>, <32 x i16>* %a0, align 128 + %v1 = trunc <32 x i16> %v0 to <32 x i8> + store <32 x i8> %v1, <32 x i8>* %a1, align 128 + ret void +} + +; v32i32 -> v32i8 +; CHECK-LABEL: f1: +; CHECK: r[[R0:[0-9]+]] = #32 +; CHECK: v[[V0:[0-9]+]] = vmem(r0+#0) +; CHECK: v[[V1:[0-9]+]].b = vdeale({{.*}},v[[V0]].b) +; CHECK: q[[Q0:[0-3]]] = vsetq(r[[R0]]) +; CHECK: if (q[[Q0]]) vmem(r1+#0) = v[[V1]] +define void @f1(<32 x i32>* %a0, <32 x i8>* %a1) #0 { + %v0 = load <32 x i32>, <32 x i32>* %a0, align 128 + %v1 = trunc <32 x i32> %v0 to <32 x i8> + store <32 x i8> %v1, <32 x i8>* %a1, align 128 + ret void +} + +; v64i16 -> v64i8 +; CHECK-LABEL: f2: +; CHECK: r[[R0:[0-9]+]] = #64 +; CHECK: v[[V0:[0-9]+]] = vmem(r0+#0) +; CHECK: v[[V1:[0-9]+]].b = vdeal(v[[V0]].b) +; CHECK: q[[Q0:[0-3]]] = vsetq(r[[R0]]) +; CHECK: if (q[[Q0]]) vmem(r1+#0) = v[[V1]] +define void @f2(<64 x i16>* %a0, <64 x i8>* %a1) #0 { + %v0 = load <64 x i16>, <64 x i16>* %a0, align 128 + %v1 = trunc <64 x i16> %v0 to <64 x i8> + store <64 x i8> %v1, <64 x i8>* %a1, align 128 + ret void +} + +; v64i32 -> v64i8 +; CHECK-LABEL: f3: +; CHECK-DAG: v[[V0:[0-9]+]] = vmem(r0+#0) +; CHECK-DAG: v[[V1:[0-9]+]] = vmem(r0+#1) +; CHECK-DAG: q[[Q0:[0-3]]] = vsetq +; CHECK: v[[V2:[0-9]+]].h = vpacke(v[[V1]].w,v[[V0]].w) +; CHECK: v[[V3:[0-9]+]].b = vpacke({{.*}},v[[V2]].h) +; CHECK: if (q[[Q0]]) vmem(r1+#0) = v[[V3]] +define void @f3(<64 x i32>* %a0, <64 x i8>* %a1) #0 { + %v0 = load <64 x i32>, <64 x i32>* %a0, align 128 + %v1 = trunc <64 x i32> %v0 to <64 x i8> + store <64 x i8> %v1, <64 x i8>* %a1, align 128 + ret void +} + +; v16i32 -> v16i16 +; CHECK-LABEL: f4: +; CHECK: r[[R0:[0-9]+]] = #32 +; CHECK: v[[V0:[0-9]+]] = vmem(r0+#0) +; CHECK: v[[V1:[0-9]+]].h = vdeal(v[[V0]].h) +; CHECK: q[[Q0:[0-3]]] = vsetq(r[[R0]]) +; CHECK: if (q[[Q0]]) vmem(r1+#0) = v[[V1]] +define void @f4(<16 x i32>* %a0, <16 x i16>* %a1) #0 { + %v0 = load <16 x i32>, <16 x i32>* %a0, align 128 + %v1 = trunc <16 x i32> %v0 to <16 x i16> + store <16 x i16> %v1, <16 x i16>* %a1, align 128 + ret void +} + +; v32i32 -> v32i16 +; CHECK-LABEL: f5: +; CHECK: r[[R0:[0-9]+]] = #64 +; CHECK: v[[V0:[0-9]+]] = vmem(r0+#0) +; CHECK: v[[V1:[0-9]+]].h = vdeal(v[[V0]].h) +; CHECK: q[[Q0:[0-3]]] = vsetq(r[[R0]]) +; CHECK: if (q[[Q0]]) vmem(r1+#0) = v[[V1]] +define void @f5(<32 x i32>* %a0, <32 x i16>* %a1) #0 { + %v0 = load <32 x i32>, <32 x i32>* %a0, align 128 + %v1 = trunc <32 x i32> %v0 to <32 x i16> + store <32 x i16> %v1, <32 x i16>* %a1, align 128 + ret void +} + +; v8i32 -> v8i8 +; CHECK-LABEL: f6: +; CHECK: v[[V0:[0-9]+]] = vmem(r0+#0) +; CHECK: v[[V1:[0-9]+]].b = vdeale({{.*}},v[[V0]].b) +; CHECK: vmem(r[[R0:[0-9]+]]+#0) = v[[V1]] +; CHECK-DAG: r[[R1:[0-9]+]] = memw(r[[R0]]+#0) +; CHECK-DAG: r[[R2:[0-9]+]] = memw(r[[R0]]+#4) +; CHECK: memd(r1+#0) = r[[R2]]:[[R1]] +define void @f6(<8 x i32>* %a0, <8 x i8>* %a1) #0 { + %v0 = load <8 x i32>, <8 x i32>* %a0, align 128 + %v1 = trunc <8 x i32> %v0 to <8 x i8> + store <8 x i8> %v1, <8 x i8>* %a1, align 128 + ret void +} + + +attributes #0 = { "target-cpu"="hexagonv65" "target-features"="+hvx,+hvx-length128b,-packets" } + diff --git a/llvm/test/CodeGen/Hexagon/swp-pragma-initiation-interval-reset.ii 
b/llvm/test/CodeGen/Hexagon/swp-pragma-initiation-interval-reset.ii new file mode 100644 index 0000000000000..03c2a13f77f22 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/swp-pragma-initiation-interval-reset.ii @@ -0,0 +1,85 @@ +; RUN: llc -disable-lsr -march=hexagon -enable-pipeliner \ +; RUN: -debug-only=pipeliner < %s 2>&1 > /dev/null -pipeliner-experimental-cg=true | FileCheck %s +; REQUIRES: asserts +; +; Test that the II set by the pragma is reset between loops. + +; CHECK: MII = 10 MAX_II = 10 +; CHECK: MII = 1 MAX_II = 11 (rec=1, res=1) +; CHECK-NOT: MII = 10 MAX_II = 10 + +; Function Attrs: nounwind +define void @f0(i32* nocapture %a0, i32 %a1) #0 { +b0: + %v0 = icmp sgt i32 %a1, 1 + br i1 %v0, label %b1, label %b4 + +b1: ; preds = %b0 + %v1 = load i32, i32* %a0, align 4 + %v2 = add i32 %v1, 10 + %v3 = getelementptr i32, i32* %a0, i32 1 + %v4 = add i32 %a1, -1 + br label %b2 + +b2: ; preds = %b2, %b1 + %v5 = phi i32 [ %v12, %b2 ], [ %v4, %b1 ] + %v6 = phi i32* [ %v11, %b2 ], [ %v3, %b1 ] + %v7 = phi i32 [ %v10, %b2 ], [ %v2, %b1 ] + store i32 %v7, i32* %v6, align 4 + %v8 = add i32 %v7, 10 + %v9 = getelementptr i32, i32* %v6, i32 -1 + store i32 %v8, i32* %v9, align 4 + %v10 = add i32 %v7, 10 + %v11 = getelementptr i32, i32* %v6, i32 1 + %v12 = add i32 %v5, -1 + %v13 = icmp eq i32 %v12, 0 + br i1 %v13, label %b3, label %b2 + +b3: ; preds = %b2 + br label %b4, !llvm.loop !2 + +b4: ; preds = %b3, %b0 + ret void +} + +; Function Attrs: nounwind +define void @f1(i32* nocapture %a0, i32 %a1) #0 { +b0: + %v0 = icmp sgt i32 %a1, 1 + br i1 %v0, label %b1, label %b4 + +b1: ; preds = %b0 + %v1 = load i32, i32* %a0, align 4 + %v2 = add i32 %v1, 10 + %v3 = getelementptr i32, i32* %a0, i32 1 + %v4 = add i32 %a1, -1 + br label %b2 + +b2: ; preds = %b2, %b1 + %v5 = phi i32 [ %v12, %b2 ], [ %v4, %b1 ] + %v6 = phi i32* [ %v11, %b2 ], [ %v3, %b1 ] + %v7 = phi i32 [ %v10, %b2 ], [ %v2, %b1 ] + store i32 %v7, i32* %v6, align 4 + %v8 = add i32 %v7, 10 + %v9 = getelementptr i32, i32* %v6, i32 -1 + store i32 %v8, i32* %v9, align 4 + %v10 = add i32 %v7, 10 + %v11 = getelementptr i32, i32* %v6, i32 1 + %v12 = add i32 %v5, -1 + %v13 = icmp eq i32 %v12, 0 + br i1 %v13, label %b3, label %b2 + +b3: ; preds = %b2 + br label %b4 + +b4: ; preds = %b3, %b0 + ret void +} + +attributes #0 = { nounwind } + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!2, !2, i64 0} +!2 = distinct !{!2, !3} +!3 = !{!"llvm.loop.pipeline.initiationinterval", i32 10} + diff --git a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/fptosi_and_fptoui.ll b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/fptosi_and_fptoui.ll index a98c6eb9fd6cb..c63f24ea692ce 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/fptosi_and_fptoui.ll +++ b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/fptosi_and_fptoui.ll @@ -235,15 +235,15 @@ define i32 @f64tou32(double %a) { ; FP32-NEXT: mfc1 $1, $f0 ; FP32-NEXT: lui $2, 16864 ; FP32-NEXT: ori $3, $zero, 0 -; FP32-NEXT: mtc1 $3, $f0 -; FP32-NEXT: mtc1 $2, $f1 -; FP32-NEXT: sub.d $f2, $f12, $f0 -; FP32-NEXT: trunc.w.d $f2, $f2 -; FP32-NEXT: mfc1 $2, $f2 +; FP32-NEXT: mtc1 $3, $f2 +; FP32-NEXT: mtc1 $2, $f3 +; FP32-NEXT: sub.d $f4, $f12, $f2 +; FP32-NEXT: trunc.w.d $f0, $f4 +; FP32-NEXT: mfc1 $2, $f0 ; FP32-NEXT: lui $3, 32768 ; FP32-NEXT: xor $2, $2, $3 ; FP32-NEXT: addiu $3, $zero, 1 -; FP32-NEXT: c.ult.d $f12, $f0 +; FP32-NEXT: c.ult.d $f12, $f2 ; FP32-NEXT: movf $3, $zero, $fcc0 ; FP32-NEXT: andi $3, $3, 1 ; FP32-NEXT: movn $2, $1, $3 @@ -256,15 +256,15 @@ define i32 @f64tou32(double %a) { ; FP64-NEXT: mfc1 $1,
$f0 ; FP64-NEXT: lui $2, 16864 ; FP64-NEXT: ori $3, $zero, 0 -; FP64-NEXT: mtc1 $3, $f0 -; FP64-NEXT: mthc1 $2, $f0 -; FP64-NEXT: sub.d $f1, $f12, $f0 -; FP64-NEXT: trunc.w.d $f1, $f1 -; FP64-NEXT: mfc1 $2, $f1 +; FP64-NEXT: mtc1 $3, $f1 +; FP64-NEXT: mthc1 $2, $f1 +; FP64-NEXT: sub.d $f2, $f12, $f1 +; FP64-NEXT: trunc.w.d $f0, $f2 +; FP64-NEXT: mfc1 $2, $f0 ; FP64-NEXT: lui $3, 32768 ; FP64-NEXT: xor $2, $2, $3 ; FP64-NEXT: addiu $3, $zero, 1 -; FP64-NEXT: c.ult.d $f12, $f0 +; FP64-NEXT: c.ult.d $f12, $f1 ; FP64-NEXT: movf $3, $zero, $fcc0 ; FP64-NEXT: andi $3, $3, 1 ; FP64-NEXT: movn $2, $1, $3 @@ -282,15 +282,15 @@ define zeroext i16 @f64tou16(double %a) { ; FP32-NEXT: mfc1 $1, $f0 ; FP32-NEXT: lui $2, 16864 ; FP32-NEXT: ori $3, $zero, 0 -; FP32-NEXT: mtc1 $3, $f0 -; FP32-NEXT: mtc1 $2, $f1 -; FP32-NEXT: sub.d $f2, $f12, $f0 -; FP32-NEXT: trunc.w.d $f2, $f2 -; FP32-NEXT: mfc1 $2, $f2 +; FP32-NEXT: mtc1 $3, $f2 +; FP32-NEXT: mtc1 $2, $f3 +; FP32-NEXT: sub.d $f4, $f12, $f2 +; FP32-NEXT: trunc.w.d $f0, $f4 +; FP32-NEXT: mfc1 $2, $f0 ; FP32-NEXT: lui $3, 32768 ; FP32-NEXT: xor $2, $2, $3 ; FP32-NEXT: addiu $3, $zero, 1 -; FP32-NEXT: c.ult.d $f12, $f0 +; FP32-NEXT: c.ult.d $f12, $f2 ; FP32-NEXT: movf $3, $zero, $fcc0 ; FP32-NEXT: andi $3, $3, 1 ; FP32-NEXT: movn $2, $1, $3 @@ -304,15 +304,15 @@ define zeroext i16 @f64tou16(double %a) { ; FP64-NEXT: mfc1 $1, $f0 ; FP64-NEXT: lui $2, 16864 ; FP64-NEXT: ori $3, $zero, 0 -; FP64-NEXT: mtc1 $3, $f0 -; FP64-NEXT: mthc1 $2, $f0 -; FP64-NEXT: sub.d $f1, $f12, $f0 -; FP64-NEXT: trunc.w.d $f1, $f1 -; FP64-NEXT: mfc1 $2, $f1 +; FP64-NEXT: mtc1 $3, $f1 +; FP64-NEXT: mthc1 $2, $f1 +; FP64-NEXT: sub.d $f2, $f12, $f1 +; FP64-NEXT: trunc.w.d $f0, $f2 +; FP64-NEXT: mfc1 $2, $f0 ; FP64-NEXT: lui $3, 32768 ; FP64-NEXT: xor $2, $2, $3 ; FP64-NEXT: addiu $3, $zero, 1 -; FP64-NEXT: c.ult.d $f12, $f0 +; FP64-NEXT: c.ult.d $f12, $f1 ; FP64-NEXT: movf $3, $zero, $fcc0 ; FP64-NEXT: andi $3, $3, 1 ; FP64-NEXT: movn $2, $1, $3 @@ -331,15 +331,15 @@ define zeroext i8 @f64tou8(double %a) { ; FP32-NEXT: mfc1 $1, $f0 ; FP32-NEXT: lui $2, 16864 ; FP32-NEXT: ori $3, $zero, 0 -; FP32-NEXT: mtc1 $3, $f0 -; FP32-NEXT: mtc1 $2, $f1 -; FP32-NEXT: sub.d $f2, $f12, $f0 -; FP32-NEXT: trunc.w.d $f2, $f2 -; FP32-NEXT: mfc1 $2, $f2 +; FP32-NEXT: mtc1 $3, $f2 +; FP32-NEXT: mtc1 $2, $f3 +; FP32-NEXT: sub.d $f4, $f12, $f2 +; FP32-NEXT: trunc.w.d $f0, $f4 +; FP32-NEXT: mfc1 $2, $f0 ; FP32-NEXT: lui $3, 32768 ; FP32-NEXT: xor $2, $2, $3 ; FP32-NEXT: addiu $3, $zero, 1 -; FP32-NEXT: c.ult.d $f12, $f0 +; FP32-NEXT: c.ult.d $f12, $f2 ; FP32-NEXT: movf $3, $zero, $fcc0 ; FP32-NEXT: andi $3, $3, 1 ; FP32-NEXT: movn $2, $1, $3 @@ -353,15 +353,15 @@ define zeroext i8 @f64tou8(double %a) { ; FP64-NEXT: mfc1 $1, $f0 ; FP64-NEXT: lui $2, 16864 ; FP64-NEXT: ori $3, $zero, 0 -; FP64-NEXT: mtc1 $3, $f0 -; FP64-NEXT: mthc1 $2, $f0 -; FP64-NEXT: sub.d $f1, $f12, $f0 -; FP64-NEXT: trunc.w.d $f1, $f1 -; FP64-NEXT: mfc1 $2, $f1 +; FP64-NEXT: mtc1 $3, $f1 +; FP64-NEXT: mthc1 $2, $f1 +; FP64-NEXT: sub.d $f2, $f12, $f1 +; FP64-NEXT: trunc.w.d $f0, $f2 +; FP64-NEXT: mfc1 $2, $f0 ; FP64-NEXT: lui $3, 32768 ; FP64-NEXT: xor $2, $2, $3 ; FP64-NEXT: addiu $3, $zero, 1 -; FP64-NEXT: c.ult.d $f12, $f0 +; FP64-NEXT: c.ult.d $f12, $f1 ; FP64-NEXT: movf $3, $zero, $fcc0 ; FP64-NEXT: andi $3, $3, 1 ; FP64-NEXT: movn $2, $1, $3 diff --git a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/long_ambiguous_chain_s32.ll b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/long_ambiguous_chain_s32.ll index 20e549b81a61a..2dcc174860c10 100644 
--- a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/long_ambiguous_chain_s32.ll +++ b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/long_ambiguous_chain_s32.ll @@ -20,88 +20,100 @@ define void @long_chain_ambiguous_i32_in_gpr(i1 %cnd0, i1 %cnd1, i1 %cnd2, i32* ; MIPS32-NEXT: sw $7, 28($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $2, 24($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $3, 20($sp) # 4-byte Folded Spill -; MIPS32-NEXT: bnez $8, $BB0_9 +; MIPS32-NEXT: bnez $8, $BB0_12 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.1: # %pre.PHI.1 +; MIPS32-NEXT: # %bb.1: # %entry +; MIPS32-NEXT: j $BB0_2 +; MIPS32-NEXT: nop +; MIPS32-NEXT: $BB0_2: # %pre.PHI.1 ; MIPS32-NEXT: lw $1, 36($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB0_4 +; MIPS32-NEXT: bnez $2, $BB0_7 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.3: # %pre.PHI.1 +; MIPS32-NEXT: j $BB0_4 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.2: # %pre.PHI.1.0 +; MIPS32-NEXT: $BB0_4: # %pre.PHI.1.0 ; MIPS32-NEXT: lw $1, 32($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB0_5 +; MIPS32-NEXT: bnez $2, $BB0_8 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.3: # %b.PHI.1.0 +; MIPS32-NEXT: # %bb.5: # %pre.PHI.1.0 +; MIPS32-NEXT: j $BB0_6 +; MIPS32-NEXT: nop +; MIPS32-NEXT: $BB0_6: # %b.PHI.1.0 ; MIPS32-NEXT: lw $1, 28($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: sw $2, 16($sp) # 4-byte Folded Spill -; MIPS32-NEXT: j $BB0_6 +; MIPS32-NEXT: j $BB0_9 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB0_4: # %b.PHI.1.1 +; MIPS32-NEXT: $BB0_7: # %b.PHI.1.1 ; MIPS32-NEXT: lw $1, 44($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: sw $2, 16($sp) # 4-byte Folded Spill -; MIPS32-NEXT: j $BB0_6 +; MIPS32-NEXT: j $BB0_9 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB0_5: # %b.PHI.1.2 +; MIPS32-NEXT: $BB0_8: # %b.PHI.1.2 ; MIPS32-NEXT: lw $1, 24($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: sw $2, 16($sp) # 4-byte Folded Spill -; MIPS32-NEXT: $BB0_6: # %b.PHI.1 +; MIPS32-NEXT: $BB0_9: # %b.PHI.1 ; MIPS32-NEXT: lw $1, 16($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 32($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $3, $2, 1 ; MIPS32-NEXT: move $4, $1 ; MIPS32-NEXT: sw $1, 12($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $4, 8($sp) # 4-byte Folded Spill -; MIPS32-NEXT: bnez $3, $BB0_8 +; MIPS32-NEXT: bnez $3, $BB0_11 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.7: # %b.PHI.1 -; MIPS32-NEXT: j $BB0_15 +; MIPS32-NEXT: # %bb.10: # %b.PHI.1 +; MIPS32-NEXT: j $BB0_19 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB0_8: # %b.PHI.1.end +; MIPS32-NEXT: $BB0_11: # %b.PHI.1.end ; MIPS32-NEXT: lw $1, 12($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 20($sp) # 4-byte Folded Reload ; MIPS32-NEXT: sw $1, 0($2) ; MIPS32-NEXT: addiu $sp, $sp, 48 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB0_9: # %pre.PHI.2 +; MIPS32-NEXT: $BB0_12: # %pre.PHI.2 ; MIPS32-NEXT: lw $1, 40($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB0_11 +; MIPS32-NEXT: bnez $2, $BB0_14 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.10: # %pre.PHI.2 -; MIPS32-NEXT: j $BB0_12 +; MIPS32-NEXT: # %bb.13: # %pre.PHI.2 +; MIPS32-NEXT: j $BB0_15 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB0_11: # %b.PHI.2.0 +; MIPS32-NEXT: $BB0_14: # %b.PHI.2.0 ; MIPS32-NEXT: lw $1, 28($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: sw $2, 4($sp) # 4-byte Folded Spill -; MIPS32-NEXT: j $BB0_13 +; MIPS32-NEXT: j $BB0_16 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB0_12: # %b.PHI.2.1 +; 
MIPS32-NEXT: $BB0_15: # %b.PHI.2.1 ; MIPS32-NEXT: lw $1, 44($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: sw $2, 4($sp) # 4-byte Folded Spill -; MIPS32-NEXT: $BB0_13: # %b.PHI.2 +; MIPS32-NEXT: $BB0_16: # %b.PHI.2 ; MIPS32-NEXT: lw $1, 4($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 36($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $3, $2, 1 ; MIPS32-NEXT: move $4, $1 ; MIPS32-NEXT: sw $1, 0($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $4, 8($sp) # 4-byte Folded Spill -; MIPS32-NEXT: bnez $3, $BB0_15 +; MIPS32-NEXT: bnez $3, $BB0_19 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.14: # %b.PHI.2.end +; MIPS32-NEXT: # %bb.17: # %b.PHI.2 +; MIPS32-NEXT: j $BB0_18 +; MIPS32-NEXT: nop +; MIPS32-NEXT: $BB0_18: # %b.PHI.2.end ; MIPS32-NEXT: lw $1, 0($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 20($sp) # 4-byte Folded Reload ; MIPS32-NEXT: sw $1, 0($2) ; MIPS32-NEXT: addiu $sp, $sp, 48 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB0_15: # %b.PHI.3 +; MIPS32-NEXT: $BB0_19: # %b.PHI.3 ; MIPS32-NEXT: lw $1, 8($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 8($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $3, 32($sp) # 4-byte Folded Reload @@ -197,35 +209,44 @@ define void @long_chain_i32_in_gpr(i1 %cnd0, i1 %cnd1, i1 %cnd2, i32* %a, i32* % ; MIPS32-NEXT: sw $2, 32($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $3, 28($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $8, 24($sp) # 4-byte Folded Spill -; MIPS32-NEXT: bnez $9, $BB1_9 +; MIPS32-NEXT: bnez $9, $BB1_12 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.1: # %entry +; MIPS32-NEXT: j $BB1_2 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.1: # %pre.PHI.1 +; MIPS32-NEXT: $BB1_2: # %pre.PHI.1 ; MIPS32-NEXT: lw $1, 44($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB1_4 +; MIPS32-NEXT: bnez $2, $BB1_7 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.2: # %pre.PHI.1.0 +; MIPS32-NEXT: # %bb.3: # %pre.PHI.1 +; MIPS32-NEXT: j $BB1_4 +; MIPS32-NEXT: nop +; MIPS32-NEXT: $BB1_4: # %pre.PHI.1.0 ; MIPS32-NEXT: lw $1, 40($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB1_5 +; MIPS32-NEXT: bnez $2, $BB1_8 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.5: # %pre.PHI.1.0 +; MIPS32-NEXT: j $BB1_6 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.3: # %b.PHI.1.0 +; MIPS32-NEXT: $BB1_6: # %b.PHI.1.0 ; MIPS32-NEXT: lw $1, 36($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: sw $2, 20($sp) # 4-byte Folded Spill -; MIPS32-NEXT: j $BB1_6 +; MIPS32-NEXT: j $BB1_9 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB1_4: # %b.PHI.1.1 +; MIPS32-NEXT: $BB1_7: # %b.PHI.1.1 ; MIPS32-NEXT: lw $1, 52($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: sw $2, 20($sp) # 4-byte Folded Spill -; MIPS32-NEXT: j $BB1_6 +; MIPS32-NEXT: j $BB1_9 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB1_5: # %b.PHI.1.2 +; MIPS32-NEXT: $BB1_8: # %b.PHI.1.2 ; MIPS32-NEXT: lw $1, 32($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: sw $2, 20($sp) # 4-byte Folded Spill -; MIPS32-NEXT: $BB1_6: # %b.PHI.1 +; MIPS32-NEXT: $BB1_9: # %b.PHI.1 ; MIPS32-NEXT: lw $1, 20($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 40($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $3, $2, 1 @@ -234,37 +255,37 @@ define void @long_chain_i32_in_gpr(i1 %cnd0, i1 %cnd1, i1 %cnd2, i32* %a, i32* % ; MIPS32-NEXT: sw $1, 16($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $4, 12($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $5, 8($sp) # 4-byte Folded Spill -; MIPS32-NEXT: bnez $3, $BB1_8 +; MIPS32-NEXT: bnez $3, $BB1_11 
; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.7: # %b.PHI.1 -; MIPS32-NEXT: j $BB1_15 +; MIPS32-NEXT: # %bb.10: # %b.PHI.1 +; MIPS32-NEXT: j $BB1_19 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB1_8: # %b.PHI.1.end +; MIPS32-NEXT: $BB1_11: # %b.PHI.1.end ; MIPS32-NEXT: lw $1, 16($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 28($sp) # 4-byte Folded Reload ; MIPS32-NEXT: sw $1, 0($2) ; MIPS32-NEXT: addiu $sp, $sp, 56 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB1_9: # %pre.PHI.2 +; MIPS32-NEXT: $BB1_12: # %pre.PHI.2 ; MIPS32-NEXT: lw $1, 48($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB1_11 +; MIPS32-NEXT: bnez $2, $BB1_14 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.10: # %pre.PHI.2 -; MIPS32-NEXT: j $BB1_12 +; MIPS32-NEXT: # %bb.13: # %pre.PHI.2 +; MIPS32-NEXT: j $BB1_15 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB1_11: # %b.PHI.2.0 +; MIPS32-NEXT: $BB1_14: # %b.PHI.2.0 ; MIPS32-NEXT: lw $1, 36($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: sw $2, 4($sp) # 4-byte Folded Spill -; MIPS32-NEXT: j $BB1_13 +; MIPS32-NEXT: j $BB1_16 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB1_12: # %b.PHI.2.1 +; MIPS32-NEXT: $BB1_15: # %b.PHI.2.1 ; MIPS32-NEXT: lw $1, 52($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: sw $2, 4($sp) # 4-byte Folded Spill -; MIPS32-NEXT: $BB1_13: # %b.PHI.2 +; MIPS32-NEXT: $BB1_16: # %b.PHI.2 ; MIPS32-NEXT: lw $1, 4($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 44($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $3, $2, 1 @@ -273,16 +294,19 @@ define void @long_chain_i32_in_gpr(i1 %cnd0, i1 %cnd1, i1 %cnd2, i32* %a, i32* % ; MIPS32-NEXT: sw $1, 0($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $4, 12($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $5, 8($sp) # 4-byte Folded Spill -; MIPS32-NEXT: bnez $3, $BB1_15 +; MIPS32-NEXT: bnez $3, $BB1_19 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.17: # %b.PHI.2 +; MIPS32-NEXT: j $BB1_18 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.14: # %b.PHI.2.end +; MIPS32-NEXT: $BB1_18: # %b.PHI.2.end ; MIPS32-NEXT: lw $1, 0($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 28($sp) # 4-byte Folded Reload ; MIPS32-NEXT: sw $1, 0($2) ; MIPS32-NEXT: addiu $sp, $sp, 56 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB1_15: # %b.PHI.3 +; MIPS32-NEXT: $BB1_19: # %b.PHI.3 ; MIPS32-NEXT: lw $1, 8($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 12($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $3, 40($sp) # 4-byte Folded Reload @@ -375,88 +399,100 @@ define void @long_chain_ambiguous_float_in_fpr(i1 %cnd0, i1 %cnd1, i1 %cnd2, flo ; MIPS32-NEXT: sw $7, 28($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $2, 24($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $3, 20($sp) # 4-byte Folded Spill -; MIPS32-NEXT: bnez $8, $BB2_9 +; MIPS32-NEXT: bnez $8, $BB2_12 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.1: # %pre.PHI.1 +; MIPS32-NEXT: # %bb.1: # %entry +; MIPS32-NEXT: j $BB2_2 +; MIPS32-NEXT: nop +; MIPS32-NEXT: $BB2_2: # %pre.PHI.1 ; MIPS32-NEXT: lw $1, 36($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB2_4 +; MIPS32-NEXT: bnez $2, $BB2_7 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.3: # %pre.PHI.1 +; MIPS32-NEXT: j $BB2_4 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.2: # %pre.PHI.1.0 +; MIPS32-NEXT: $BB2_4: # %pre.PHI.1.0 ; MIPS32-NEXT: lw $1, 32($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB2_5 +; MIPS32-NEXT: bnez $2, $BB2_8 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.3: # %b.PHI.1.0 +; MIPS32-NEXT: # %bb.5: # %pre.PHI.1.0 +; 
MIPS32-NEXT: j $BB2_6 +; MIPS32-NEXT: nop +; MIPS32-NEXT: $BB2_6: # %b.PHI.1.0 ; MIPS32-NEXT: lw $1, 28($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: sw $2, 16($sp) # 4-byte Folded Spill -; MIPS32-NEXT: j $BB2_6 +; MIPS32-NEXT: j $BB2_9 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB2_4: # %b.PHI.1.1 +; MIPS32-NEXT: $BB2_7: # %b.PHI.1.1 ; MIPS32-NEXT: lw $1, 44($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: sw $2, 16($sp) # 4-byte Folded Spill -; MIPS32-NEXT: j $BB2_6 +; MIPS32-NEXT: j $BB2_9 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB2_5: # %b.PHI.1.2 +; MIPS32-NEXT: $BB2_8: # %b.PHI.1.2 ; MIPS32-NEXT: lw $1, 24($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: sw $2, 16($sp) # 4-byte Folded Spill -; MIPS32-NEXT: $BB2_6: # %b.PHI.1 +; MIPS32-NEXT: $BB2_9: # %b.PHI.1 ; MIPS32-NEXT: lw $1, 16($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 32($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $3, $2, 1 ; MIPS32-NEXT: move $4, $1 ; MIPS32-NEXT: sw $1, 12($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $4, 8($sp) # 4-byte Folded Spill -; MIPS32-NEXT: bnez $3, $BB2_8 +; MIPS32-NEXT: bnez $3, $BB2_11 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.7: # %b.PHI.1 -; MIPS32-NEXT: j $BB2_15 +; MIPS32-NEXT: # %bb.10: # %b.PHI.1 +; MIPS32-NEXT: j $BB2_19 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB2_8: # %b.PHI.1.end +; MIPS32-NEXT: $BB2_11: # %b.PHI.1.end ; MIPS32-NEXT: lw $1, 12($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 20($sp) # 4-byte Folded Reload ; MIPS32-NEXT: sw $1, 0($2) ; MIPS32-NEXT: addiu $sp, $sp, 48 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB2_9: # %pre.PHI.2 +; MIPS32-NEXT: $BB2_12: # %pre.PHI.2 ; MIPS32-NEXT: lw $1, 40($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB2_11 +; MIPS32-NEXT: bnez $2, $BB2_14 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.10: # %pre.PHI.2 -; MIPS32-NEXT: j $BB2_12 +; MIPS32-NEXT: # %bb.13: # %pre.PHI.2 +; MIPS32-NEXT: j $BB2_15 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB2_11: # %b.PHI.2.0 +; MIPS32-NEXT: $BB2_14: # %b.PHI.2.0 ; MIPS32-NEXT: lw $1, 28($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: sw $2, 4($sp) # 4-byte Folded Spill -; MIPS32-NEXT: j $BB2_13 +; MIPS32-NEXT: j $BB2_16 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB2_12: # %b.PHI.2.1 +; MIPS32-NEXT: $BB2_15: # %b.PHI.2.1 ; MIPS32-NEXT: lw $1, 44($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: sw $2, 4($sp) # 4-byte Folded Spill -; MIPS32-NEXT: $BB2_13: # %b.PHI.2 +; MIPS32-NEXT: $BB2_16: # %b.PHI.2 ; MIPS32-NEXT: lw $1, 4($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 36($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $3, $2, 1 ; MIPS32-NEXT: move $4, $1 ; MIPS32-NEXT: sw $1, 0($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $4, 8($sp) # 4-byte Folded Spill -; MIPS32-NEXT: bnez $3, $BB2_15 +; MIPS32-NEXT: bnez $3, $BB2_19 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.17: # %b.PHI.2 +; MIPS32-NEXT: j $BB2_18 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.14: # %b.PHI.2.end +; MIPS32-NEXT: $BB2_18: # %b.PHI.2.end ; MIPS32-NEXT: lw $1, 0($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 20($sp) # 4-byte Folded Reload ; MIPS32-NEXT: sw $1, 0($2) ; MIPS32-NEXT: addiu $sp, $sp, 48 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB2_15: # %b.PHI.3 +; MIPS32-NEXT: $BB2_19: # %b.PHI.3 ; MIPS32-NEXT: lw $1, 8($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 8($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $3, 32($sp) # 4-byte Folded Reload @@ -553,35 +589,44 @@ define void 
@long_chain_float_in_fpr(i1 %cnd0, i1 %cnd1, i1 %cnd2, float* %a, fl ; MIPS32-NEXT: sw $2, 32($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $3, 28($sp) # 4-byte Folded Spill ; MIPS32-NEXT: swc1 $f0, 24($sp) # 4-byte Folded Spill -; MIPS32-NEXT: bnez $8, $BB3_9 +; MIPS32-NEXT: bnez $8, $BB3_12 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.1: # %entry +; MIPS32-NEXT: j $BB3_2 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.1: # %pre.PHI.1 +; MIPS32-NEXT: $BB3_2: # %pre.PHI.1 ; MIPS32-NEXT: lw $1, 44($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB3_4 +; MIPS32-NEXT: bnez $2, $BB3_7 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.2: # %pre.PHI.1.0 +; MIPS32-NEXT: # %bb.3: # %pre.PHI.1 +; MIPS32-NEXT: j $BB3_4 +; MIPS32-NEXT: nop +; MIPS32-NEXT: $BB3_4: # %pre.PHI.1.0 ; MIPS32-NEXT: lw $1, 40($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB3_5 +; MIPS32-NEXT: bnez $2, $BB3_8 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.3: # %b.PHI.1.0 +; MIPS32-NEXT: # %bb.5: # %pre.PHI.1.0 +; MIPS32-NEXT: j $BB3_6 +; MIPS32-NEXT: nop +; MIPS32-NEXT: $BB3_6: # %b.PHI.1.0 ; MIPS32-NEXT: lw $1, 36($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lwc1 $f0, 0($1) ; MIPS32-NEXT: swc1 $f0, 20($sp) # 4-byte Folded Spill -; MIPS32-NEXT: j $BB3_6 +; MIPS32-NEXT: j $BB3_9 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB3_4: # %b.PHI.1.1 +; MIPS32-NEXT: $BB3_7: # %b.PHI.1.1 ; MIPS32-NEXT: lw $1, 52($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lwc1 $f0, 0($1) ; MIPS32-NEXT: swc1 $f0, 20($sp) # 4-byte Folded Spill -; MIPS32-NEXT: j $BB3_6 +; MIPS32-NEXT: j $BB3_9 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB3_5: # %b.PHI.1.2 +; MIPS32-NEXT: $BB3_8: # %b.PHI.1.2 ; MIPS32-NEXT: lw $1, 32($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lwc1 $f0, 0($1) ; MIPS32-NEXT: swc1 $f0, 20($sp) # 4-byte Folded Spill -; MIPS32-NEXT: $BB3_6: # %b.PHI.1 +; MIPS32-NEXT: $BB3_9: # %b.PHI.1 ; MIPS32-NEXT: lwc1 $f0, 20($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $1, 40($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 @@ -590,37 +635,37 @@ define void @long_chain_float_in_fpr(i1 %cnd0, i1 %cnd1, i1 %cnd2, float* %a, fl ; MIPS32-NEXT: swc1 $f0, 16($sp) # 4-byte Folded Spill ; MIPS32-NEXT: swc1 $f1, 12($sp) # 4-byte Folded Spill ; MIPS32-NEXT: swc1 $f2, 8($sp) # 4-byte Folded Spill -; MIPS32-NEXT: bnez $2, $BB3_8 +; MIPS32-NEXT: bnez $2, $BB3_11 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.7: # %b.PHI.1 -; MIPS32-NEXT: j $BB3_15 +; MIPS32-NEXT: # %bb.10: # %b.PHI.1 +; MIPS32-NEXT: j $BB3_19 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB3_8: # %b.PHI.1.end +; MIPS32-NEXT: $BB3_11: # %b.PHI.1.end ; MIPS32-NEXT: lwc1 $f0, 16($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $1, 28($sp) # 4-byte Folded Reload ; MIPS32-NEXT: swc1 $f0, 0($1) ; MIPS32-NEXT: addiu $sp, $sp, 56 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB3_9: # %pre.PHI.2 +; MIPS32-NEXT: $BB3_12: # %pre.PHI.2 ; MIPS32-NEXT: lw $1, 48($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB3_11 +; MIPS32-NEXT: bnez $2, $BB3_14 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.10: # %pre.PHI.2 -; MIPS32-NEXT: j $BB3_12 +; MIPS32-NEXT: # %bb.13: # %pre.PHI.2 +; MIPS32-NEXT: j $BB3_15 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB3_11: # %b.PHI.2.0 +; MIPS32-NEXT: $BB3_14: # %b.PHI.2.0 ; MIPS32-NEXT: lw $1, 36($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lwc1 $f0, 0($1) ; MIPS32-NEXT: swc1 $f0, 4($sp) # 4-byte Folded Spill -; MIPS32-NEXT: j $BB3_13 +; MIPS32-NEXT: j $BB3_16 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB3_12: # %b.PHI.2.1 +; MIPS32-NEXT: 
$BB3_15: # %b.PHI.2.1 ; MIPS32-NEXT: lw $1, 52($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lwc1 $f0, 0($1) ; MIPS32-NEXT: swc1 $f0, 4($sp) # 4-byte Folded Spill -; MIPS32-NEXT: $BB3_13: # %b.PHI.2 +; MIPS32-NEXT: $BB3_16: # %b.PHI.2 ; MIPS32-NEXT: lwc1 $f0, 4($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $1, 44($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 @@ -629,16 +674,19 @@ define void @long_chain_float_in_fpr(i1 %cnd0, i1 %cnd1, i1 %cnd2, float* %a, fl ; MIPS32-NEXT: swc1 $f0, 0($sp) # 4-byte Folded Spill ; MIPS32-NEXT: swc1 $f1, 12($sp) # 4-byte Folded Spill ; MIPS32-NEXT: swc1 $f2, 8($sp) # 4-byte Folded Spill -; MIPS32-NEXT: bnez $2, $BB3_15 +; MIPS32-NEXT: bnez $2, $BB3_19 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.17: # %b.PHI.2 +; MIPS32-NEXT: j $BB3_18 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.14: # %b.PHI.2.end +; MIPS32-NEXT: $BB3_18: # %b.PHI.2.end ; MIPS32-NEXT: lwc1 $f0, 0($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $1, 28($sp) # 4-byte Folded Reload ; MIPS32-NEXT: swc1 $f0, 0($1) ; MIPS32-NEXT: addiu $sp, $sp, 56 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB3_15: # %b.PHI.3 +; MIPS32-NEXT: $BB3_19: # %b.PHI.3 ; MIPS32-NEXT: lwc1 $f0, 8($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lwc1 $f1, 12($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $1, 40($sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/long_ambiguous_chain_s64.ll b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/long_ambiguous_chain_s64.ll index a237099eb75ba..bafa309df76a1 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/long_ambiguous_chain_s64.ll +++ b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/long_ambiguous_chain_s64.ll @@ -20,88 +20,100 @@ define void @long_chain_ambiguous_i64_in_fpr(i1 %cnd0, i1 %cnd1, i1 %cnd2, i64* ; MIPS32-NEXT: sw $7, 52($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $2, 48($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $3, 44($sp) # 4-byte Folded Spill -; MIPS32-NEXT: bnez $8, $BB0_9 +; MIPS32-NEXT: bnez $8, $BB0_12 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.1: # %pre.PHI.1 +; MIPS32-NEXT: # %bb.1: # %entry +; MIPS32-NEXT: j $BB0_2 +; MIPS32-NEXT: nop +; MIPS32-NEXT: $BB0_2: # %pre.PHI.1 ; MIPS32-NEXT: lw $1, 60($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB0_4 +; MIPS32-NEXT: bnez $2, $BB0_7 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.3: # %pre.PHI.1 +; MIPS32-NEXT: j $BB0_4 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.2: # %pre.PHI.1.0 +; MIPS32-NEXT: $BB0_4: # %pre.PHI.1.0 ; MIPS32-NEXT: lw $1, 56($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB0_5 +; MIPS32-NEXT: bnez $2, $BB0_8 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.3: # %b.PHI.1.0 +; MIPS32-NEXT: # %bb.5: # %pre.PHI.1.0 +; MIPS32-NEXT: j $BB0_6 +; MIPS32-NEXT: nop +; MIPS32-NEXT: $BB0_6: # %b.PHI.1.0 ; MIPS32-NEXT: lw $1, 52($sp) # 4-byte Folded Reload ; MIPS32-NEXT: ldc1 $f0, 0($1) ; MIPS32-NEXT: sdc1 $f0, 32($sp) # 8-byte Folded Spill -; MIPS32-NEXT: j $BB0_6 +; MIPS32-NEXT: j $BB0_9 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB0_4: # %b.PHI.1.1 +; MIPS32-NEXT: $BB0_7: # %b.PHI.1.1 ; MIPS32-NEXT: lw $1, 68($sp) # 4-byte Folded Reload ; MIPS32-NEXT: ldc1 $f0, 0($1) ; MIPS32-NEXT: sdc1 $f0, 32($sp) # 8-byte Folded Spill -; MIPS32-NEXT: j $BB0_6 +; MIPS32-NEXT: j $BB0_9 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB0_5: # %b.PHI.1.2 +; MIPS32-NEXT: $BB0_8: # %b.PHI.1.2 ; MIPS32-NEXT: lw $1, 48($sp) # 4-byte Folded Reload ; MIPS32-NEXT: ldc1 $f0, 0($1) ; MIPS32-NEXT: sdc1 $f0, 32($sp) # 8-byte Folded Spill -; MIPS32-NEXT: 
$BB0_6: # %b.PHI.1 +; MIPS32-NEXT: $BB0_9: # %b.PHI.1 ; MIPS32-NEXT: ldc1 $f0, 32($sp) # 8-byte Folded Reload ; MIPS32-NEXT: lw $1, 56($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 ; MIPS32-NEXT: mov.d $f2, $f0 ; MIPS32-NEXT: sdc1 $f0, 24($sp) # 8-byte Folded Spill ; MIPS32-NEXT: sdc1 $f2, 16($sp) # 8-byte Folded Spill -; MIPS32-NEXT: bnez $2, $BB0_8 +; MIPS32-NEXT: bnez $2, $BB0_11 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.7: # %b.PHI.1 -; MIPS32-NEXT: j $BB0_15 +; MIPS32-NEXT: # %bb.10: # %b.PHI.1 +; MIPS32-NEXT: j $BB0_19 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB0_8: # %b.PHI.1.end +; MIPS32-NEXT: $BB0_11: # %b.PHI.1.end ; MIPS32-NEXT: ldc1 $f0, 24($sp) # 8-byte Folded Reload ; MIPS32-NEXT: lw $1, 44($sp) # 4-byte Folded Reload ; MIPS32-NEXT: sdc1 $f0, 0($1) ; MIPS32-NEXT: addiu $sp, $sp, 72 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB0_9: # %pre.PHI.2 +; MIPS32-NEXT: $BB0_12: # %pre.PHI.2 ; MIPS32-NEXT: lw $1, 64($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB0_11 +; MIPS32-NEXT: bnez $2, $BB0_14 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.10: # %pre.PHI.2 -; MIPS32-NEXT: j $BB0_12 +; MIPS32-NEXT: # %bb.13: # %pre.PHI.2 +; MIPS32-NEXT: j $BB0_15 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB0_11: # %b.PHI.2.0 +; MIPS32-NEXT: $BB0_14: # %b.PHI.2.0 ; MIPS32-NEXT: lw $1, 52($sp) # 4-byte Folded Reload ; MIPS32-NEXT: ldc1 $f0, 0($1) ; MIPS32-NEXT: sdc1 $f0, 8($sp) # 8-byte Folded Spill -; MIPS32-NEXT: j $BB0_13 +; MIPS32-NEXT: j $BB0_16 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB0_12: # %b.PHI.2.1 +; MIPS32-NEXT: $BB0_15: # %b.PHI.2.1 ; MIPS32-NEXT: lw $1, 68($sp) # 4-byte Folded Reload ; MIPS32-NEXT: ldc1 $f0, 0($1) ; MIPS32-NEXT: sdc1 $f0, 8($sp) # 8-byte Folded Spill -; MIPS32-NEXT: $BB0_13: # %b.PHI.2 +; MIPS32-NEXT: $BB0_16: # %b.PHI.2 ; MIPS32-NEXT: ldc1 $f0, 8($sp) # 8-byte Folded Reload ; MIPS32-NEXT: lw $1, 60($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 ; MIPS32-NEXT: mov.d $f2, $f0 ; MIPS32-NEXT: sdc1 $f0, 0($sp) # 8-byte Folded Spill ; MIPS32-NEXT: sdc1 $f2, 16($sp) # 8-byte Folded Spill -; MIPS32-NEXT: bnez $2, $BB0_15 +; MIPS32-NEXT: bnez $2, $BB0_19 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.14: # %b.PHI.2.end +; MIPS32-NEXT: # %bb.17: # %b.PHI.2 +; MIPS32-NEXT: j $BB0_18 +; MIPS32-NEXT: nop +; MIPS32-NEXT: $BB0_18: # %b.PHI.2.end ; MIPS32-NEXT: ldc1 $f0, 0($sp) # 8-byte Folded Reload ; MIPS32-NEXT: lw $1, 44($sp) # 4-byte Folded Reload ; MIPS32-NEXT: sdc1 $f0, 0($1) ; MIPS32-NEXT: addiu $sp, $sp, 72 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB0_15: # %b.PHI.3 +; MIPS32-NEXT: $BB0_19: # %b.PHI.3 ; MIPS32-NEXT: ldc1 $f0, 16($sp) # 8-byte Folded Reload ; MIPS32-NEXT: ldc1 $f2, 16($sp) # 8-byte Folded Reload ; MIPS32-NEXT: lw $1, 56($sp) # 4-byte Folded Reload @@ -197,41 +209,50 @@ define void @long_chain_i64_in_gpr(i1 %cnd0, i1 %cnd1, i1 %cnd2, i64* %a, i64* % ; MIPS32-NEXT: sw $2, 56($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $3, 52($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $8, 48($sp) # 4-byte Folded Spill -; MIPS32-NEXT: bnez $9, $BB1_9 +; MIPS32-NEXT: bnez $9, $BB1_12 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.1: # %entry +; MIPS32-NEXT: j $BB1_2 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.1: # %pre.PHI.1 +; MIPS32-NEXT: $BB1_2: # %pre.PHI.1 ; MIPS32-NEXT: lw $1, 68($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB1_4 +; MIPS32-NEXT: bnez $2, $BB1_7 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.3: # %pre.PHI.1 +; MIPS32-NEXT: j $BB1_4 ; MIPS32-NEXT: nop -; 
MIPS32-NEXT: # %bb.2: # %pre.PHI.1.0 +; MIPS32-NEXT: $BB1_4: # %pre.PHI.1.0 ; MIPS32-NEXT: lw $1, 64($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB1_5 +; MIPS32-NEXT: bnez $2, $BB1_8 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.3: # %b.PHI.1.0 +; MIPS32-NEXT: # %bb.5: # %pre.PHI.1.0 +; MIPS32-NEXT: j $BB1_6 +; MIPS32-NEXT: nop +; MIPS32-NEXT: $BB1_6: # %b.PHI.1.0 ; MIPS32-NEXT: lw $1, 60($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: lw $3, 4($1) ; MIPS32-NEXT: sw $2, 44($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $3, 40($sp) # 4-byte Folded Spill -; MIPS32-NEXT: j $BB1_6 +; MIPS32-NEXT: j $BB1_9 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB1_4: # %b.PHI.1.1 +; MIPS32-NEXT: $BB1_7: # %b.PHI.1.1 ; MIPS32-NEXT: lw $1, 76($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: lw $3, 4($1) ; MIPS32-NEXT: sw $2, 44($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $3, 40($sp) # 4-byte Folded Spill -; MIPS32-NEXT: j $BB1_6 +; MIPS32-NEXT: j $BB1_9 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB1_5: # %b.PHI.1.2 +; MIPS32-NEXT: $BB1_8: # %b.PHI.1.2 ; MIPS32-NEXT: lw $1, 56($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: lw $3, 4($1) ; MIPS32-NEXT: sw $2, 44($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $3, 40($sp) # 4-byte Folded Spill -; MIPS32-NEXT: $BB1_6: # %b.PHI.1 +; MIPS32-NEXT: $BB1_9: # %b.PHI.1 ; MIPS32-NEXT: lw $1, 40($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 44($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $3, 64($sp) # 4-byte Folded Reload @@ -246,12 +267,12 @@ define void @long_chain_i64_in_gpr(i1 %cnd0, i1 %cnd1, i1 %cnd2, i64* %a, i64* % ; MIPS32-NEXT: sw $6, 24($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $7, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $8, 16($sp) # 4-byte Folded Spill -; MIPS32-NEXT: bnez $4, $BB1_8 +; MIPS32-NEXT: bnez $4, $BB1_11 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.7: # %b.PHI.1 -; MIPS32-NEXT: j $BB1_15 +; MIPS32-NEXT: # %bb.10: # %b.PHI.1 +; MIPS32-NEXT: j $BB1_19 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB1_8: # %b.PHI.1.end +; MIPS32-NEXT: $BB1_11: # %b.PHI.1.end ; MIPS32-NEXT: lw $1, 32($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 52($sp) # 4-byte Folded Reload ; MIPS32-NEXT: sw $1, 0($2) @@ -260,29 +281,29 @@ define void @long_chain_i64_in_gpr(i1 %cnd0, i1 %cnd1, i1 %cnd2, i64* %a, i64* % ; MIPS32-NEXT: addiu $sp, $sp, 80 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB1_9: # %pre.PHI.2 +; MIPS32-NEXT: $BB1_12: # %pre.PHI.2 ; MIPS32-NEXT: lw $1, 72($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB1_11 +; MIPS32-NEXT: bnez $2, $BB1_14 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.10: # %pre.PHI.2 -; MIPS32-NEXT: j $BB1_12 +; MIPS32-NEXT: # %bb.13: # %pre.PHI.2 +; MIPS32-NEXT: j $BB1_15 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB1_11: # %b.PHI.2.0 +; MIPS32-NEXT: $BB1_14: # %b.PHI.2.0 ; MIPS32-NEXT: lw $1, 60($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: lw $3, 4($1) ; MIPS32-NEXT: sw $2, 12($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $3, 8($sp) # 4-byte Folded Spill -; MIPS32-NEXT: j $BB1_13 +; MIPS32-NEXT: j $BB1_16 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB1_12: # %b.PHI.2.1 +; MIPS32-NEXT: $BB1_15: # %b.PHI.2.1 ; MIPS32-NEXT: lw $1, 76($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 0($1) ; MIPS32-NEXT: lw $3, 4($1) ; MIPS32-NEXT: sw $2, 12($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $3, 8($sp) # 4-byte Folded Spill -; MIPS32-NEXT: $BB1_13: # %b.PHI.2 +; MIPS32-NEXT: $BB1_16: # %b.PHI.2 ; 
MIPS32-NEXT: lw $1, 8($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 12($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $3, 68($sp) # 4-byte Folded Reload @@ -297,9 +318,12 @@ define void @long_chain_i64_in_gpr(i1 %cnd0, i1 %cnd1, i1 %cnd2, i64* %a, i64* % ; MIPS32-NEXT: sw $6, 24($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $7, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $8, 16($sp) # 4-byte Folded Spill -; MIPS32-NEXT: bnez $4, $BB1_15 +; MIPS32-NEXT: bnez $4, $BB1_19 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.14: # %b.PHI.2.end +; MIPS32-NEXT: # %bb.17: # %b.PHI.2 +; MIPS32-NEXT: j $BB1_18 +; MIPS32-NEXT: nop +; MIPS32-NEXT: $BB1_18: # %b.PHI.2.end ; MIPS32-NEXT: lw $1, 0($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 52($sp) # 4-byte Folded Reload ; MIPS32-NEXT: sw $1, 0($2) @@ -308,7 +332,7 @@ define void @long_chain_i64_in_gpr(i1 %cnd0, i1 %cnd1, i1 %cnd2, i64* %a, i64* % ; MIPS32-NEXT: addiu $sp, $sp, 80 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB1_15: # %b.PHI.3 +; MIPS32-NEXT: $BB1_19: # %b.PHI.3 ; MIPS32-NEXT: lw $1, 16($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $2, 20($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $3, 24($sp) # 4-byte Folded Reload @@ -408,88 +432,100 @@ define void @long_chain_ambiguous_double_in_fpr(i1 %cnd0, i1 %cnd1, i1 %cnd2, do ; MIPS32-NEXT: sw $7, 52($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $2, 48($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $3, 44($sp) # 4-byte Folded Spill -; MIPS32-NEXT: bnez $8, $BB2_9 +; MIPS32-NEXT: bnez $8, $BB2_12 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.1: # %entry +; MIPS32-NEXT: j $BB2_2 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.1: # %pre.PHI.1 +; MIPS32-NEXT: $BB2_2: # %pre.PHI.1 ; MIPS32-NEXT: lw $1, 60($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB2_4 +; MIPS32-NEXT: bnez $2, $BB2_7 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.3: # %pre.PHI.1 +; MIPS32-NEXT: j $BB2_4 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.2: # %pre.PHI.1.0 +; MIPS32-NEXT: $BB2_4: # %pre.PHI.1.0 ; MIPS32-NEXT: lw $1, 56($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB2_5 +; MIPS32-NEXT: bnez $2, $BB2_8 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.5: # %pre.PHI.1.0 +; MIPS32-NEXT: j $BB2_6 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.3: # %b.PHI.1.0 +; MIPS32-NEXT: $BB2_6: # %b.PHI.1.0 ; MIPS32-NEXT: lw $1, 52($sp) # 4-byte Folded Reload ; MIPS32-NEXT: ldc1 $f0, 0($1) ; MIPS32-NEXT: sdc1 $f0, 32($sp) # 8-byte Folded Spill -; MIPS32-NEXT: j $BB2_6 +; MIPS32-NEXT: j $BB2_9 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB2_4: # %b.PHI.1.1 +; MIPS32-NEXT: $BB2_7: # %b.PHI.1.1 ; MIPS32-NEXT: lw $1, 68($sp) # 4-byte Folded Reload ; MIPS32-NEXT: ldc1 $f0, 0($1) ; MIPS32-NEXT: sdc1 $f0, 32($sp) # 8-byte Folded Spill -; MIPS32-NEXT: j $BB2_6 +; MIPS32-NEXT: j $BB2_9 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB2_5: # %b.PHI.1.2 +; MIPS32-NEXT: $BB2_8: # %b.PHI.1.2 ; MIPS32-NEXT: lw $1, 48($sp) # 4-byte Folded Reload ; MIPS32-NEXT: ldc1 $f0, 0($1) ; MIPS32-NEXT: sdc1 $f0, 32($sp) # 8-byte Folded Spill -; MIPS32-NEXT: $BB2_6: # %b.PHI.1 +; MIPS32-NEXT: $BB2_9: # %b.PHI.1 ; MIPS32-NEXT: ldc1 $f0, 32($sp) # 8-byte Folded Reload ; MIPS32-NEXT: lw $1, 56($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 ; MIPS32-NEXT: mov.d $f2, $f0 ; MIPS32-NEXT: sdc1 $f0, 24($sp) # 8-byte Folded Spill ; MIPS32-NEXT: sdc1 $f2, 16($sp) # 8-byte Folded Spill -; MIPS32-NEXT: bnez $2, $BB2_8 +; MIPS32-NEXT: bnez $2, $BB2_11 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.7: # %b.PHI.1 -; MIPS32-NEXT: j $BB2_15 
+; MIPS32-NEXT: # %bb.10: # %b.PHI.1 +; MIPS32-NEXT: j $BB2_19 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB2_8: # %b.PHI.1.end +; MIPS32-NEXT: $BB2_11: # %b.PHI.1.end ; MIPS32-NEXT: ldc1 $f0, 24($sp) # 8-byte Folded Reload ; MIPS32-NEXT: lw $1, 44($sp) # 4-byte Folded Reload ; MIPS32-NEXT: sdc1 $f0, 0($1) ; MIPS32-NEXT: addiu $sp, $sp, 72 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB2_9: # %pre.PHI.2 +; MIPS32-NEXT: $BB2_12: # %pre.PHI.2 ; MIPS32-NEXT: lw $1, 64($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB2_11 +; MIPS32-NEXT: bnez $2, $BB2_14 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.10: # %pre.PHI.2 -; MIPS32-NEXT: j $BB2_12 +; MIPS32-NEXT: # %bb.13: # %pre.PHI.2 +; MIPS32-NEXT: j $BB2_15 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB2_11: # %b.PHI.2.0 +; MIPS32-NEXT: $BB2_14: # %b.PHI.2.0 ; MIPS32-NEXT: lw $1, 52($sp) # 4-byte Folded Reload ; MIPS32-NEXT: ldc1 $f0, 0($1) ; MIPS32-NEXT: sdc1 $f0, 8($sp) # 8-byte Folded Spill -; MIPS32-NEXT: j $BB2_13 +; MIPS32-NEXT: j $BB2_16 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB2_12: # %b.PHI.2.1 +; MIPS32-NEXT: $BB2_15: # %b.PHI.2.1 ; MIPS32-NEXT: lw $1, 68($sp) # 4-byte Folded Reload ; MIPS32-NEXT: ldc1 $f0, 0($1) ; MIPS32-NEXT: sdc1 $f0, 8($sp) # 8-byte Folded Spill -; MIPS32-NEXT: $BB2_13: # %b.PHI.2 +; MIPS32-NEXT: $BB2_16: # %b.PHI.2 ; MIPS32-NEXT: ldc1 $f0, 8($sp) # 8-byte Folded Reload ; MIPS32-NEXT: lw $1, 60($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 ; MIPS32-NEXT: mov.d $f2, $f0 ; MIPS32-NEXT: sdc1 $f0, 0($sp) # 8-byte Folded Spill ; MIPS32-NEXT: sdc1 $f2, 16($sp) # 8-byte Folded Spill -; MIPS32-NEXT: bnez $2, $BB2_15 +; MIPS32-NEXT: bnez $2, $BB2_19 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.17: # %b.PHI.2 +; MIPS32-NEXT: j $BB2_18 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.14: # %b.PHI.2.end +; MIPS32-NEXT: $BB2_18: # %b.PHI.2.end ; MIPS32-NEXT: ldc1 $f0, 0($sp) # 8-byte Folded Reload ; MIPS32-NEXT: lw $1, 44($sp) # 4-byte Folded Reload ; MIPS32-NEXT: sdc1 $f0, 0($1) ; MIPS32-NEXT: addiu $sp, $sp, 72 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB2_15: # %b.PHI.3 +; MIPS32-NEXT: $BB2_19: # %b.PHI.3 ; MIPS32-NEXT: ldc1 $f0, 16($sp) # 8-byte Folded Reload ; MIPS32-NEXT: ldc1 $f2, 16($sp) # 8-byte Folded Reload ; MIPS32-NEXT: lw $1, 56($sp) # 4-byte Folded Reload @@ -588,35 +624,44 @@ define void @long_chain_double_in_fpr(i1 %cnd0, i1 %cnd1, i1 %cnd2, double* %a, ; MIPS32-NEXT: sw $2, 64($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $3, 60($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sdc1 $f0, 48($sp) # 8-byte Folded Spill -; MIPS32-NEXT: bnez $8, $BB3_9 +; MIPS32-NEXT: bnez $8, $BB3_12 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.1: # %entry +; MIPS32-NEXT: j $BB3_2 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.1: # %pre.PHI.1 +; MIPS32-NEXT: $BB3_2: # %pre.PHI.1 ; MIPS32-NEXT: lw $1, 76($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB3_4 +; MIPS32-NEXT: bnez $2, $BB3_7 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.2: # %pre.PHI.1.0 +; MIPS32-NEXT: # %bb.3: # %pre.PHI.1 +; MIPS32-NEXT: j $BB3_4 +; MIPS32-NEXT: nop +; MIPS32-NEXT: $BB3_4: # %pre.PHI.1.0 ; MIPS32-NEXT: lw $1, 72($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB3_5 +; MIPS32-NEXT: bnez $2, $BB3_8 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.5: # %pre.PHI.1.0 +; MIPS32-NEXT: j $BB3_6 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.3: # %b.PHI.1.0 +; MIPS32-NEXT: $BB3_6: # %b.PHI.1.0 ; MIPS32-NEXT: lw $1, 68($sp) # 4-byte Folded Reload ; MIPS32-NEXT: ldc1 $f0, 0($1) ; 
MIPS32-NEXT: sdc1 $f0, 40($sp) # 8-byte Folded Spill -; MIPS32-NEXT: j $BB3_6 +; MIPS32-NEXT: j $BB3_9 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB3_4: # %b.PHI.1.1 +; MIPS32-NEXT: $BB3_7: # %b.PHI.1.1 ; MIPS32-NEXT: lw $1, 84($sp) # 4-byte Folded Reload ; MIPS32-NEXT: ldc1 $f0, 0($1) ; MIPS32-NEXT: sdc1 $f0, 40($sp) # 8-byte Folded Spill -; MIPS32-NEXT: j $BB3_6 +; MIPS32-NEXT: j $BB3_9 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB3_5: # %b.PHI.1.2 +; MIPS32-NEXT: $BB3_8: # %b.PHI.1.2 ; MIPS32-NEXT: lw $1, 64($sp) # 4-byte Folded Reload ; MIPS32-NEXT: ldc1 $f0, 0($1) ; MIPS32-NEXT: sdc1 $f0, 40($sp) # 8-byte Folded Spill -; MIPS32-NEXT: $BB3_6: # %b.PHI.1 +; MIPS32-NEXT: $BB3_9: # %b.PHI.1 ; MIPS32-NEXT: ldc1 $f0, 40($sp) # 8-byte Folded Reload ; MIPS32-NEXT: lw $1, 72($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 @@ -625,37 +670,37 @@ define void @long_chain_double_in_fpr(i1 %cnd0, i1 %cnd1, i1 %cnd2, double* %a, ; MIPS32-NEXT: sdc1 $f0, 32($sp) # 8-byte Folded Spill ; MIPS32-NEXT: sdc1 $f2, 24($sp) # 8-byte Folded Spill ; MIPS32-NEXT: sdc1 $f4, 16($sp) # 8-byte Folded Spill -; MIPS32-NEXT: bnez $2, $BB3_8 +; MIPS32-NEXT: bnez $2, $BB3_11 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.7: # %b.PHI.1 -; MIPS32-NEXT: j $BB3_15 +; MIPS32-NEXT: # %bb.10: # %b.PHI.1 +; MIPS32-NEXT: j $BB3_19 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB3_8: # %b.PHI.1.end +; MIPS32-NEXT: $BB3_11: # %b.PHI.1.end ; MIPS32-NEXT: ldc1 $f0, 32($sp) # 8-byte Folded Reload ; MIPS32-NEXT: lw $1, 60($sp) # 4-byte Folded Reload ; MIPS32-NEXT: sdc1 $f0, 0($1) ; MIPS32-NEXT: addiu $sp, $sp, 88 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB3_9: # %pre.PHI.2 +; MIPS32-NEXT: $BB3_12: # %pre.PHI.2 ; MIPS32-NEXT: lw $1, 80($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 -; MIPS32-NEXT: bnez $2, $BB3_11 +; MIPS32-NEXT: bnez $2, $BB3_14 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.10: # %pre.PHI.2 -; MIPS32-NEXT: j $BB3_12 +; MIPS32-NEXT: # %bb.13: # %pre.PHI.2 +; MIPS32-NEXT: j $BB3_15 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB3_11: # %b.PHI.2.0 +; MIPS32-NEXT: $BB3_14: # %b.PHI.2.0 ; MIPS32-NEXT: lw $1, 68($sp) # 4-byte Folded Reload ; MIPS32-NEXT: ldc1 $f0, 0($1) ; MIPS32-NEXT: sdc1 $f0, 8($sp) # 8-byte Folded Spill -; MIPS32-NEXT: j $BB3_13 +; MIPS32-NEXT: j $BB3_16 ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB3_12: # %b.PHI.2.1 +; MIPS32-NEXT: $BB3_15: # %b.PHI.2.1 ; MIPS32-NEXT: lw $1, 84($sp) # 4-byte Folded Reload ; MIPS32-NEXT: ldc1 $f0, 0($1) ; MIPS32-NEXT: sdc1 $f0, 8($sp) # 8-byte Folded Spill -; MIPS32-NEXT: $BB3_13: # %b.PHI.2 +; MIPS32-NEXT: $BB3_16: # %b.PHI.2 ; MIPS32-NEXT: ldc1 $f0, 8($sp) # 8-byte Folded Reload ; MIPS32-NEXT: lw $1, 76($sp) # 4-byte Folded Reload ; MIPS32-NEXT: andi $2, $1, 1 @@ -664,16 +709,19 @@ define void @long_chain_double_in_fpr(i1 %cnd0, i1 %cnd1, i1 %cnd2, double* %a, ; MIPS32-NEXT: sdc1 $f0, 0($sp) # 8-byte Folded Spill ; MIPS32-NEXT: sdc1 $f2, 24($sp) # 8-byte Folded Spill ; MIPS32-NEXT: sdc1 $f4, 16($sp) # 8-byte Folded Spill -; MIPS32-NEXT: bnez $2, $BB3_15 +; MIPS32-NEXT: bnez $2, $BB3_19 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.17: # %b.PHI.2 +; MIPS32-NEXT: j $BB3_18 ; MIPS32-NEXT: nop -; MIPS32-NEXT: # %bb.14: # %b.PHI.2.end +; MIPS32-NEXT: $BB3_18: # %b.PHI.2.end ; MIPS32-NEXT: ldc1 $f0, 0($sp) # 8-byte Folded Reload ; MIPS32-NEXT: lw $1, 60($sp) # 4-byte Folded Reload ; MIPS32-NEXT: sdc1 $f0, 0($1) ; MIPS32-NEXT: addiu $sp, $sp, 88 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop -; MIPS32-NEXT: $BB3_15: # %b.PHI.3 +; MIPS32-NEXT: $BB3_19: # %b.PHI.3 ; MIPS32-NEXT: ldc1 $f0, 16($sp) # 
8-byte Folded Reload ; MIPS32-NEXT: ldc1 $f2, 24($sp) # 8-byte Folded Reload ; MIPS32-NEXT: lw $1, 72($sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/Mips/atomic-min-max.ll b/llvm/test/CodeGen/Mips/atomic-min-max.ll index 646af650c00e7..a6200851940cd 100644 --- a/llvm/test/CodeGen/Mips/atomic-min-max.ll +++ b/llvm/test/CodeGen/Mips/atomic-min-max.ll @@ -1154,26 +1154,26 @@ define i16 @test_max_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64-NEXT: sll $2, $2, 3 ; MIPS64-NEXT: ori $3, $zero, 65535 ; MIPS64-NEXT: sllv $3, $3, $2 -; MIPS64-NEXT: nor $4, $zero, $3 +; MIPS64-NEXT: nor $6, $zero, $3 ; MIPS64-NEXT: sllv $5, $5, $2 ; MIPS64-NEXT: .LBB4_1: # %entry ; MIPS64-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64-NEXT: ll $7, 0($1) -; MIPS64-NEXT: slt $10, $7, $5 -; MIPS64-NEXT: move $8, $7 -; MIPS64-NEXT: movn $8, $5, $10 -; MIPS64-NEXT: and $8, $8, $3 -; MIPS64-NEXT: and $9, $7, $4 -; MIPS64-NEXT: or $9, $9, $8 -; MIPS64-NEXT: sc $9, 0($1) -; MIPS64-NEXT: beqz $9, .LBB4_1 +; MIPS64-NEXT: ll $8, 0($1) +; MIPS64-NEXT: slt $11, $8, $5 +; MIPS64-NEXT: move $9, $8 +; MIPS64-NEXT: movn $9, $5, $11 +; MIPS64-NEXT: and $9, $9, $3 +; MIPS64-NEXT: and $10, $8, $6 +; MIPS64-NEXT: or $10, $10, $9 +; MIPS64-NEXT: sc $10, 0($1) +; MIPS64-NEXT: beqz $10, .LBB4_1 ; MIPS64-NEXT: nop ; MIPS64-NEXT: # %bb.2: # %entry -; MIPS64-NEXT: and $6, $7, $3 -; MIPS64-NEXT: srlv $6, $6, $2 -; MIPS64-NEXT: seh $6, $6 +; MIPS64-NEXT: and $7, $8, $3 +; MIPS64-NEXT: srlv $7, $7, $2 +; MIPS64-NEXT: seh $7, $7 ; MIPS64-NEXT: # %bb.3: # %entry -; MIPS64-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64-NEXT: # %bb.4: # %entry ; MIPS64-NEXT: sync ; MIPS64-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -1194,26 +1194,26 @@ define i16 @test_max_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64R6-NEXT: sll $2, $2, 3 ; MIPS64R6-NEXT: ori $3, $zero, 65535 ; MIPS64R6-NEXT: sllv $3, $3, $2 -; MIPS64R6-NEXT: nor $4, $zero, $3 +; MIPS64R6-NEXT: nor $6, $zero, $3 ; MIPS64R6-NEXT: sllv $5, $5, $2 ; MIPS64R6-NEXT: .LBB4_1: # %entry ; MIPS64R6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6-NEXT: ll $7, 0($1) -; MIPS64R6-NEXT: slt $10, $7, $5 -; MIPS64R6-NEXT: seleqz $8, $7, $10 -; MIPS64R6-NEXT: selnez $10, $5, $10 -; MIPS64R6-NEXT: or $8, $8, $10 -; MIPS64R6-NEXT: and $8, $8, $3 -; MIPS64R6-NEXT: and $9, $7, $4 -; MIPS64R6-NEXT: or $9, $9, $8 -; MIPS64R6-NEXT: sc $9, 0($1) -; MIPS64R6-NEXT: beqzc $9, .LBB4_1 +; MIPS64R6-NEXT: ll $8, 0($1) +; MIPS64R6-NEXT: slt $11, $8, $5 +; MIPS64R6-NEXT: seleqz $9, $8, $11 +; MIPS64R6-NEXT: selnez $11, $5, $11 +; MIPS64R6-NEXT: or $9, $9, $11 +; MIPS64R6-NEXT: and $9, $9, $3 +; MIPS64R6-NEXT: and $10, $8, $6 +; MIPS64R6-NEXT: or $10, $10, $9 +; MIPS64R6-NEXT: sc $10, 0($1) +; MIPS64R6-NEXT: beqzc $10, .LBB4_1 ; MIPS64R6-NEXT: # %bb.2: # %entry -; MIPS64R6-NEXT: and $6, $7, $3 -; MIPS64R6-NEXT: srlv $6, $6, $2 -; MIPS64R6-NEXT: seh $6, $6 +; MIPS64R6-NEXT: and $7, $8, $3 +; MIPS64R6-NEXT: srlv $7, $7, $2 +; MIPS64R6-NEXT: seh $7, $7 ; MIPS64R6-NEXT: # %bb.3: # %entry -; MIPS64R6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6-NEXT: # %bb.4: # %entry ; MIPS64R6-NEXT: sync ; MIPS64R6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -1232,28 +1232,28 @@ define i16 @test_max_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64EL-NEXT: sll $2, $2, 3 ; MIPS64EL-NEXT: ori $3, $zero, 65535 ; MIPS64EL-NEXT: sllv $3, $3, $2 -; MIPS64EL-NEXT: nor $4, $zero, $3 +; 
MIPS64EL-NEXT: nor $6, $zero, $3 ; MIPS64EL-NEXT: sllv $5, $5, $2 ; MIPS64EL-NEXT: .LBB4_1: # %entry ; MIPS64EL-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64EL-NEXT: ll $7, 0($1) -; MIPS64EL-NEXT: and $7, $7, $3 -; MIPS64EL-NEXT: and $5, $5, $3 -; MIPS64EL-NEXT: slt $10, $7, $5 -; MIPS64EL-NEXT: move $8, $7 -; MIPS64EL-NEXT: movn $8, $5, $10 +; MIPS64EL-NEXT: ll $8, 0($1) ; MIPS64EL-NEXT: and $8, $8, $3 -; MIPS64EL-NEXT: and $9, $7, $4 -; MIPS64EL-NEXT: or $9, $9, $8 -; MIPS64EL-NEXT: sc $9, 0($1) -; MIPS64EL-NEXT: beqz $9, .LBB4_1 +; MIPS64EL-NEXT: and $5, $5, $3 +; MIPS64EL-NEXT: slt $11, $8, $5 +; MIPS64EL-NEXT: move $9, $8 +; MIPS64EL-NEXT: movn $9, $5, $11 +; MIPS64EL-NEXT: and $9, $9, $3 +; MIPS64EL-NEXT: and $10, $8, $6 +; MIPS64EL-NEXT: or $10, $10, $9 +; MIPS64EL-NEXT: sc $10, 0($1) +; MIPS64EL-NEXT: beqz $10, .LBB4_1 ; MIPS64EL-NEXT: nop ; MIPS64EL-NEXT: # %bb.2: # %entry -; MIPS64EL-NEXT: and $6, $7, $3 -; MIPS64EL-NEXT: srlv $6, $6, $2 -; MIPS64EL-NEXT: seh $6, $6 +; MIPS64EL-NEXT: and $7, $8, $3 +; MIPS64EL-NEXT: srlv $7, $7, $2 +; MIPS64EL-NEXT: seh $7, $7 ; MIPS64EL-NEXT: # %bb.3: # %entry -; MIPS64EL-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64EL-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64EL-NEXT: # %bb.4: # %entry ; MIPS64EL-NEXT: sync ; MIPS64EL-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -1273,28 +1273,28 @@ define i16 @test_max_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64ELR6-NEXT: sll $2, $2, 3 ; MIPS64ELR6-NEXT: ori $3, $zero, 65535 ; MIPS64ELR6-NEXT: sllv $3, $3, $2 -; MIPS64ELR6-NEXT: nor $4, $zero, $3 +; MIPS64ELR6-NEXT: nor $6, $zero, $3 ; MIPS64ELR6-NEXT: sllv $5, $5, $2 ; MIPS64ELR6-NEXT: .LBB4_1: # %entry ; MIPS64ELR6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64ELR6-NEXT: ll $7, 0($1) -; MIPS64ELR6-NEXT: and $7, $7, $3 -; MIPS64ELR6-NEXT: and $5, $5, $3 -; MIPS64ELR6-NEXT: slt $10, $7, $5 -; MIPS64ELR6-NEXT: seleqz $8, $7, $10 -; MIPS64ELR6-NEXT: selnez $10, $5, $10 -; MIPS64ELR6-NEXT: or $8, $8, $10 +; MIPS64ELR6-NEXT: ll $8, 0($1) ; MIPS64ELR6-NEXT: and $8, $8, $3 -; MIPS64ELR6-NEXT: and $9, $7, $4 -; MIPS64ELR6-NEXT: or $9, $9, $8 -; MIPS64ELR6-NEXT: sc $9, 0($1) -; MIPS64ELR6-NEXT: beqzc $9, .LBB4_1 +; MIPS64ELR6-NEXT: and $5, $5, $3 +; MIPS64ELR6-NEXT: slt $11, $8, $5 +; MIPS64ELR6-NEXT: seleqz $9, $8, $11 +; MIPS64ELR6-NEXT: selnez $11, $5, $11 +; MIPS64ELR6-NEXT: or $9, $9, $11 +; MIPS64ELR6-NEXT: and $9, $9, $3 +; MIPS64ELR6-NEXT: and $10, $8, $6 +; MIPS64ELR6-NEXT: or $10, $10, $9 +; MIPS64ELR6-NEXT: sc $10, 0($1) +; MIPS64ELR6-NEXT: beqzc $10, .LBB4_1 ; MIPS64ELR6-NEXT: # %bb.2: # %entry -; MIPS64ELR6-NEXT: and $6, $7, $3 -; MIPS64ELR6-NEXT: srlv $6, $6, $2 -; MIPS64ELR6-NEXT: seh $6, $6 +; MIPS64ELR6-NEXT: and $7, $8, $3 +; MIPS64ELR6-NEXT: srlv $7, $7, $2 +; MIPS64ELR6-NEXT: seh $7, $7 ; MIPS64ELR6-NEXT: # %bb.3: # %entry -; MIPS64ELR6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64ELR6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64ELR6-NEXT: # %bb.4: # %entry ; MIPS64ELR6-NEXT: sync ; MIPS64ELR6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -1635,26 +1635,26 @@ define i16 @test_min_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64-NEXT: sll $2, $2, 3 ; MIPS64-NEXT: ori $3, $zero, 65535 ; MIPS64-NEXT: sllv $3, $3, $2 -; MIPS64-NEXT: nor $4, $zero, $3 +; MIPS64-NEXT: nor $6, $zero, $3 ; MIPS64-NEXT: sllv $5, $5, $2 ; MIPS64-NEXT: .LBB5_1: # %entry ; MIPS64-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64-NEXT: ll $7, 0($1) -; MIPS64-NEXT: slt $10, $7, $5 -; MIPS64-NEXT: move $8, $7 -; 
MIPS64-NEXT: movz $8, $5, $10 -; MIPS64-NEXT: and $8, $8, $3 -; MIPS64-NEXT: and $9, $7, $4 -; MIPS64-NEXT: or $9, $9, $8 -; MIPS64-NEXT: sc $9, 0($1) -; MIPS64-NEXT: beqz $9, .LBB5_1 +; MIPS64-NEXT: ll $8, 0($1) +; MIPS64-NEXT: slt $11, $8, $5 +; MIPS64-NEXT: move $9, $8 +; MIPS64-NEXT: movz $9, $5, $11 +; MIPS64-NEXT: and $9, $9, $3 +; MIPS64-NEXT: and $10, $8, $6 +; MIPS64-NEXT: or $10, $10, $9 +; MIPS64-NEXT: sc $10, 0($1) +; MIPS64-NEXT: beqz $10, .LBB5_1 ; MIPS64-NEXT: nop ; MIPS64-NEXT: # %bb.2: # %entry -; MIPS64-NEXT: and $6, $7, $3 -; MIPS64-NEXT: srlv $6, $6, $2 -; MIPS64-NEXT: seh $6, $6 +; MIPS64-NEXT: and $7, $8, $3 +; MIPS64-NEXT: srlv $7, $7, $2 +; MIPS64-NEXT: seh $7, $7 ; MIPS64-NEXT: # %bb.3: # %entry -; MIPS64-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64-NEXT: # %bb.4: # %entry ; MIPS64-NEXT: sync ; MIPS64-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -1675,26 +1675,26 @@ define i16 @test_min_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64R6-NEXT: sll $2, $2, 3 ; MIPS64R6-NEXT: ori $3, $zero, 65535 ; MIPS64R6-NEXT: sllv $3, $3, $2 -; MIPS64R6-NEXT: nor $4, $zero, $3 +; MIPS64R6-NEXT: nor $6, $zero, $3 ; MIPS64R6-NEXT: sllv $5, $5, $2 ; MIPS64R6-NEXT: .LBB5_1: # %entry ; MIPS64R6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6-NEXT: ll $7, 0($1) -; MIPS64R6-NEXT: slt $10, $7, $5 -; MIPS64R6-NEXT: selnez $8, $7, $10 -; MIPS64R6-NEXT: seleqz $10, $5, $10 -; MIPS64R6-NEXT: or $8, $8, $10 -; MIPS64R6-NEXT: and $8, $8, $3 -; MIPS64R6-NEXT: and $9, $7, $4 -; MIPS64R6-NEXT: or $9, $9, $8 -; MIPS64R6-NEXT: sc $9, 0($1) -; MIPS64R6-NEXT: beqzc $9, .LBB5_1 +; MIPS64R6-NEXT: ll $8, 0($1) +; MIPS64R6-NEXT: slt $11, $8, $5 +; MIPS64R6-NEXT: selnez $9, $8, $11 +; MIPS64R6-NEXT: seleqz $11, $5, $11 +; MIPS64R6-NEXT: or $9, $9, $11 +; MIPS64R6-NEXT: and $9, $9, $3 +; MIPS64R6-NEXT: and $10, $8, $6 +; MIPS64R6-NEXT: or $10, $10, $9 +; MIPS64R6-NEXT: sc $10, 0($1) +; MIPS64R6-NEXT: beqzc $10, .LBB5_1 ; MIPS64R6-NEXT: # %bb.2: # %entry -; MIPS64R6-NEXT: and $6, $7, $3 -; MIPS64R6-NEXT: srlv $6, $6, $2 -; MIPS64R6-NEXT: seh $6, $6 +; MIPS64R6-NEXT: and $7, $8, $3 +; MIPS64R6-NEXT: srlv $7, $7, $2 +; MIPS64R6-NEXT: seh $7, $7 ; MIPS64R6-NEXT: # %bb.3: # %entry -; MIPS64R6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6-NEXT: # %bb.4: # %entry ; MIPS64R6-NEXT: sync ; MIPS64R6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -1713,28 +1713,28 @@ define i16 @test_min_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64EL-NEXT: sll $2, $2, 3 ; MIPS64EL-NEXT: ori $3, $zero, 65535 ; MIPS64EL-NEXT: sllv $3, $3, $2 -; MIPS64EL-NEXT: nor $4, $zero, $3 +; MIPS64EL-NEXT: nor $6, $zero, $3 ; MIPS64EL-NEXT: sllv $5, $5, $2 ; MIPS64EL-NEXT: .LBB5_1: # %entry ; MIPS64EL-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64EL-NEXT: ll $7, 0($1) -; MIPS64EL-NEXT: and $7, $7, $3 -; MIPS64EL-NEXT: and $5, $5, $3 -; MIPS64EL-NEXT: slt $10, $7, $5 -; MIPS64EL-NEXT: move $8, $7 -; MIPS64EL-NEXT: movz $8, $5, $10 +; MIPS64EL-NEXT: ll $8, 0($1) ; MIPS64EL-NEXT: and $8, $8, $3 -; MIPS64EL-NEXT: and $9, $7, $4 -; MIPS64EL-NEXT: or $9, $9, $8 -; MIPS64EL-NEXT: sc $9, 0($1) -; MIPS64EL-NEXT: beqz $9, .LBB5_1 +; MIPS64EL-NEXT: and $5, $5, $3 +; MIPS64EL-NEXT: slt $11, $8, $5 +; MIPS64EL-NEXT: move $9, $8 +; MIPS64EL-NEXT: movz $9, $5, $11 +; MIPS64EL-NEXT: and $9, $9, $3 +; MIPS64EL-NEXT: and $10, $8, $6 +; MIPS64EL-NEXT: or $10, $10, $9 +; MIPS64EL-NEXT: sc $10, 0($1) +; MIPS64EL-NEXT: beqz 
$10, .LBB5_1 ; MIPS64EL-NEXT: nop ; MIPS64EL-NEXT: # %bb.2: # %entry -; MIPS64EL-NEXT: and $6, $7, $3 -; MIPS64EL-NEXT: srlv $6, $6, $2 -; MIPS64EL-NEXT: seh $6, $6 +; MIPS64EL-NEXT: and $7, $8, $3 +; MIPS64EL-NEXT: srlv $7, $7, $2 +; MIPS64EL-NEXT: seh $7, $7 ; MIPS64EL-NEXT: # %bb.3: # %entry -; MIPS64EL-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64EL-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64EL-NEXT: # %bb.4: # %entry ; MIPS64EL-NEXT: sync ; MIPS64EL-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -1754,28 +1754,28 @@ define i16 @test_min_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64ELR6-NEXT: sll $2, $2, 3 ; MIPS64ELR6-NEXT: ori $3, $zero, 65535 ; MIPS64ELR6-NEXT: sllv $3, $3, $2 -; MIPS64ELR6-NEXT: nor $4, $zero, $3 +; MIPS64ELR6-NEXT: nor $6, $zero, $3 ; MIPS64ELR6-NEXT: sllv $5, $5, $2 ; MIPS64ELR6-NEXT: .LBB5_1: # %entry ; MIPS64ELR6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64ELR6-NEXT: ll $7, 0($1) -; MIPS64ELR6-NEXT: and $7, $7, $3 -; MIPS64ELR6-NEXT: and $5, $5, $3 -; MIPS64ELR6-NEXT: slt $10, $7, $5 -; MIPS64ELR6-NEXT: selnez $8, $7, $10 -; MIPS64ELR6-NEXT: seleqz $10, $5, $10 -; MIPS64ELR6-NEXT: or $8, $8, $10 +; MIPS64ELR6-NEXT: ll $8, 0($1) ; MIPS64ELR6-NEXT: and $8, $8, $3 -; MIPS64ELR6-NEXT: and $9, $7, $4 -; MIPS64ELR6-NEXT: or $9, $9, $8 -; MIPS64ELR6-NEXT: sc $9, 0($1) -; MIPS64ELR6-NEXT: beqzc $9, .LBB5_1 +; MIPS64ELR6-NEXT: and $5, $5, $3 +; MIPS64ELR6-NEXT: slt $11, $8, $5 +; MIPS64ELR6-NEXT: selnez $9, $8, $11 +; MIPS64ELR6-NEXT: seleqz $11, $5, $11 +; MIPS64ELR6-NEXT: or $9, $9, $11 +; MIPS64ELR6-NEXT: and $9, $9, $3 +; MIPS64ELR6-NEXT: and $10, $8, $6 +; MIPS64ELR6-NEXT: or $10, $10, $9 +; MIPS64ELR6-NEXT: sc $10, 0($1) +; MIPS64ELR6-NEXT: beqzc $10, .LBB5_1 ; MIPS64ELR6-NEXT: # %bb.2: # %entry -; MIPS64ELR6-NEXT: and $6, $7, $3 -; MIPS64ELR6-NEXT: srlv $6, $6, $2 -; MIPS64ELR6-NEXT: seh $6, $6 +; MIPS64ELR6-NEXT: and $7, $8, $3 +; MIPS64ELR6-NEXT: srlv $7, $7, $2 +; MIPS64ELR6-NEXT: seh $7, $7 ; MIPS64ELR6-NEXT: # %bb.3: # %entry -; MIPS64ELR6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64ELR6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64ELR6-NEXT: # %bb.4: # %entry ; MIPS64ELR6-NEXT: sync ; MIPS64ELR6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -2116,26 +2116,26 @@ define i16 @test_umax_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64-NEXT: sll $2, $2, 3 ; MIPS64-NEXT: ori $3, $zero, 65535 ; MIPS64-NEXT: sllv $3, $3, $2 -; MIPS64-NEXT: nor $4, $zero, $3 +; MIPS64-NEXT: nor $6, $zero, $3 ; MIPS64-NEXT: sllv $5, $5, $2 ; MIPS64-NEXT: .LBB6_1: # %entry ; MIPS64-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64-NEXT: ll $7, 0($1) -; MIPS64-NEXT: sltu $10, $7, $5 -; MIPS64-NEXT: move $8, $7 -; MIPS64-NEXT: movn $8, $5, $10 -; MIPS64-NEXT: and $8, $8, $3 -; MIPS64-NEXT: and $9, $7, $4 -; MIPS64-NEXT: or $9, $9, $8 -; MIPS64-NEXT: sc $9, 0($1) -; MIPS64-NEXT: beqz $9, .LBB6_1 +; MIPS64-NEXT: ll $8, 0($1) +; MIPS64-NEXT: sltu $11, $8, $5 +; MIPS64-NEXT: move $9, $8 +; MIPS64-NEXT: movn $9, $5, $11 +; MIPS64-NEXT: and $9, $9, $3 +; MIPS64-NEXT: and $10, $8, $6 +; MIPS64-NEXT: or $10, $10, $9 +; MIPS64-NEXT: sc $10, 0($1) +; MIPS64-NEXT: beqz $10, .LBB6_1 ; MIPS64-NEXT: nop ; MIPS64-NEXT: # %bb.2: # %entry -; MIPS64-NEXT: and $6, $7, $3 -; MIPS64-NEXT: srlv $6, $6, $2 -; MIPS64-NEXT: seh $6, $6 +; MIPS64-NEXT: and $7, $8, $3 +; MIPS64-NEXT: srlv $7, $7, $2 +; MIPS64-NEXT: seh $7, $7 ; MIPS64-NEXT: # %bb.3: # %entry -; MIPS64-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; 
MIPS64-NEXT: # %bb.4: # %entry ; MIPS64-NEXT: sync ; MIPS64-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -2156,26 +2156,26 @@ define i16 @test_umax_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64R6-NEXT: sll $2, $2, 3 ; MIPS64R6-NEXT: ori $3, $zero, 65535 ; MIPS64R6-NEXT: sllv $3, $3, $2 -; MIPS64R6-NEXT: nor $4, $zero, $3 +; MIPS64R6-NEXT: nor $6, $zero, $3 ; MIPS64R6-NEXT: sllv $5, $5, $2 ; MIPS64R6-NEXT: .LBB6_1: # %entry ; MIPS64R6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6-NEXT: ll $7, 0($1) -; MIPS64R6-NEXT: sltu $10, $7, $5 -; MIPS64R6-NEXT: seleqz $8, $7, $10 -; MIPS64R6-NEXT: selnez $10, $5, $10 -; MIPS64R6-NEXT: or $8, $8, $10 -; MIPS64R6-NEXT: and $8, $8, $3 -; MIPS64R6-NEXT: and $9, $7, $4 -; MIPS64R6-NEXT: or $9, $9, $8 -; MIPS64R6-NEXT: sc $9, 0($1) -; MIPS64R6-NEXT: beqzc $9, .LBB6_1 +; MIPS64R6-NEXT: ll $8, 0($1) +; MIPS64R6-NEXT: sltu $11, $8, $5 +; MIPS64R6-NEXT: seleqz $9, $8, $11 +; MIPS64R6-NEXT: selnez $11, $5, $11 +; MIPS64R6-NEXT: or $9, $9, $11 +; MIPS64R6-NEXT: and $9, $9, $3 +; MIPS64R6-NEXT: and $10, $8, $6 +; MIPS64R6-NEXT: or $10, $10, $9 +; MIPS64R6-NEXT: sc $10, 0($1) +; MIPS64R6-NEXT: beqzc $10, .LBB6_1 ; MIPS64R6-NEXT: # %bb.2: # %entry -; MIPS64R6-NEXT: and $6, $7, $3 -; MIPS64R6-NEXT: srlv $6, $6, $2 -; MIPS64R6-NEXT: seh $6, $6 +; MIPS64R6-NEXT: and $7, $8, $3 +; MIPS64R6-NEXT: srlv $7, $7, $2 +; MIPS64R6-NEXT: seh $7, $7 ; MIPS64R6-NEXT: # %bb.3: # %entry -; MIPS64R6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6-NEXT: # %bb.4: # %entry ; MIPS64R6-NEXT: sync ; MIPS64R6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -2194,28 +2194,28 @@ define i16 @test_umax_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64EL-NEXT: sll $2, $2, 3 ; MIPS64EL-NEXT: ori $3, $zero, 65535 ; MIPS64EL-NEXT: sllv $3, $3, $2 -; MIPS64EL-NEXT: nor $4, $zero, $3 +; MIPS64EL-NEXT: nor $6, $zero, $3 ; MIPS64EL-NEXT: sllv $5, $5, $2 ; MIPS64EL-NEXT: .LBB6_1: # %entry ; MIPS64EL-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64EL-NEXT: ll $7, 0($1) -; MIPS64EL-NEXT: and $7, $7, $3 -; MIPS64EL-NEXT: and $5, $5, $3 -; MIPS64EL-NEXT: sltu $10, $7, $5 -; MIPS64EL-NEXT: move $8, $7 -; MIPS64EL-NEXT: movn $8, $5, $10 +; MIPS64EL-NEXT: ll $8, 0($1) ; MIPS64EL-NEXT: and $8, $8, $3 -; MIPS64EL-NEXT: and $9, $7, $4 -; MIPS64EL-NEXT: or $9, $9, $8 -; MIPS64EL-NEXT: sc $9, 0($1) -; MIPS64EL-NEXT: beqz $9, .LBB6_1 +; MIPS64EL-NEXT: and $5, $5, $3 +; MIPS64EL-NEXT: sltu $11, $8, $5 +; MIPS64EL-NEXT: move $9, $8 +; MIPS64EL-NEXT: movn $9, $5, $11 +; MIPS64EL-NEXT: and $9, $9, $3 +; MIPS64EL-NEXT: and $10, $8, $6 +; MIPS64EL-NEXT: or $10, $10, $9 +; MIPS64EL-NEXT: sc $10, 0($1) +; MIPS64EL-NEXT: beqz $10, .LBB6_1 ; MIPS64EL-NEXT: nop ; MIPS64EL-NEXT: # %bb.2: # %entry -; MIPS64EL-NEXT: and $6, $7, $3 -; MIPS64EL-NEXT: srlv $6, $6, $2 -; MIPS64EL-NEXT: seh $6, $6 +; MIPS64EL-NEXT: and $7, $8, $3 +; MIPS64EL-NEXT: srlv $7, $7, $2 +; MIPS64EL-NEXT: seh $7, $7 ; MIPS64EL-NEXT: # %bb.3: # %entry -; MIPS64EL-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64EL-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64EL-NEXT: # %bb.4: # %entry ; MIPS64EL-NEXT: sync ; MIPS64EL-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -2235,28 +2235,28 @@ define i16 @test_umax_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64ELR6-NEXT: sll $2, $2, 3 ; MIPS64ELR6-NEXT: ori $3, $zero, 65535 ; MIPS64ELR6-NEXT: sllv $3, $3, $2 -; MIPS64ELR6-NEXT: nor $4, $zero, $3 +; MIPS64ELR6-NEXT: nor $6, $zero, $3 ; MIPS64ELR6-NEXT: sllv $5, $5, $2 ; 
MIPS64ELR6-NEXT: .LBB6_1: # %entry ; MIPS64ELR6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64ELR6-NEXT: ll $7, 0($1) -; MIPS64ELR6-NEXT: and $7, $7, $3 -; MIPS64ELR6-NEXT: and $5, $5, $3 -; MIPS64ELR6-NEXT: sltu $10, $7, $5 -; MIPS64ELR6-NEXT: seleqz $8, $7, $10 -; MIPS64ELR6-NEXT: selnez $10, $5, $10 -; MIPS64ELR6-NEXT: or $8, $8, $10 +; MIPS64ELR6-NEXT: ll $8, 0($1) ; MIPS64ELR6-NEXT: and $8, $8, $3 -; MIPS64ELR6-NEXT: and $9, $7, $4 -; MIPS64ELR6-NEXT: or $9, $9, $8 -; MIPS64ELR6-NEXT: sc $9, 0($1) -; MIPS64ELR6-NEXT: beqzc $9, .LBB6_1 +; MIPS64ELR6-NEXT: and $5, $5, $3 +; MIPS64ELR6-NEXT: sltu $11, $8, $5 +; MIPS64ELR6-NEXT: seleqz $9, $8, $11 +; MIPS64ELR6-NEXT: selnez $11, $5, $11 +; MIPS64ELR6-NEXT: or $9, $9, $11 +; MIPS64ELR6-NEXT: and $9, $9, $3 +; MIPS64ELR6-NEXT: and $10, $8, $6 +; MIPS64ELR6-NEXT: or $10, $10, $9 +; MIPS64ELR6-NEXT: sc $10, 0($1) +; MIPS64ELR6-NEXT: beqzc $10, .LBB6_1 ; MIPS64ELR6-NEXT: # %bb.2: # %entry -; MIPS64ELR6-NEXT: and $6, $7, $3 -; MIPS64ELR6-NEXT: srlv $6, $6, $2 -; MIPS64ELR6-NEXT: seh $6, $6 +; MIPS64ELR6-NEXT: and $7, $8, $3 +; MIPS64ELR6-NEXT: srlv $7, $7, $2 +; MIPS64ELR6-NEXT: seh $7, $7 ; MIPS64ELR6-NEXT: # %bb.3: # %entry -; MIPS64ELR6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64ELR6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64ELR6-NEXT: # %bb.4: # %entry ; MIPS64ELR6-NEXT: sync ; MIPS64ELR6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -2597,26 +2597,26 @@ define i16 @test_umin_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64-NEXT: sll $2, $2, 3 ; MIPS64-NEXT: ori $3, $zero, 65535 ; MIPS64-NEXT: sllv $3, $3, $2 -; MIPS64-NEXT: nor $4, $zero, $3 +; MIPS64-NEXT: nor $6, $zero, $3 ; MIPS64-NEXT: sllv $5, $5, $2 ; MIPS64-NEXT: .LBB7_1: # %entry ; MIPS64-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64-NEXT: ll $7, 0($1) -; MIPS64-NEXT: sltu $10, $7, $5 -; MIPS64-NEXT: move $8, $7 -; MIPS64-NEXT: movz $8, $5, $10 -; MIPS64-NEXT: and $8, $8, $3 -; MIPS64-NEXT: and $9, $7, $4 -; MIPS64-NEXT: or $9, $9, $8 -; MIPS64-NEXT: sc $9, 0($1) -; MIPS64-NEXT: beqz $9, .LBB7_1 +; MIPS64-NEXT: ll $8, 0($1) +; MIPS64-NEXT: sltu $11, $8, $5 +; MIPS64-NEXT: move $9, $8 +; MIPS64-NEXT: movz $9, $5, $11 +; MIPS64-NEXT: and $9, $9, $3 +; MIPS64-NEXT: and $10, $8, $6 +; MIPS64-NEXT: or $10, $10, $9 +; MIPS64-NEXT: sc $10, 0($1) +; MIPS64-NEXT: beqz $10, .LBB7_1 ; MIPS64-NEXT: nop ; MIPS64-NEXT: # %bb.2: # %entry -; MIPS64-NEXT: and $6, $7, $3 -; MIPS64-NEXT: srlv $6, $6, $2 -; MIPS64-NEXT: seh $6, $6 +; MIPS64-NEXT: and $7, $8, $3 +; MIPS64-NEXT: srlv $7, $7, $2 +; MIPS64-NEXT: seh $7, $7 ; MIPS64-NEXT: # %bb.3: # %entry -; MIPS64-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64-NEXT: # %bb.4: # %entry ; MIPS64-NEXT: sync ; MIPS64-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -2637,26 +2637,26 @@ define i16 @test_umin_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64R6-NEXT: sll $2, $2, 3 ; MIPS64R6-NEXT: ori $3, $zero, 65535 ; MIPS64R6-NEXT: sllv $3, $3, $2 -; MIPS64R6-NEXT: nor $4, $zero, $3 +; MIPS64R6-NEXT: nor $6, $zero, $3 ; MIPS64R6-NEXT: sllv $5, $5, $2 ; MIPS64R6-NEXT: .LBB7_1: # %entry ; MIPS64R6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6-NEXT: ll $7, 0($1) -; MIPS64R6-NEXT: sltu $10, $7, $5 -; MIPS64R6-NEXT: selnez $8, $7, $10 -; MIPS64R6-NEXT: seleqz $10, $5, $10 -; MIPS64R6-NEXT: or $8, $8, $10 -; MIPS64R6-NEXT: and $8, $8, $3 -; MIPS64R6-NEXT: and $9, $7, $4 -; MIPS64R6-NEXT: or $9, $9, $8 -; MIPS64R6-NEXT: sc $9, 0($1) -; MIPS64R6-NEXT: beqzc $9, 
.LBB7_1 +; MIPS64R6-NEXT: ll $8, 0($1) +; MIPS64R6-NEXT: sltu $11, $8, $5 +; MIPS64R6-NEXT: selnez $9, $8, $11 +; MIPS64R6-NEXT: seleqz $11, $5, $11 +; MIPS64R6-NEXT: or $9, $9, $11 +; MIPS64R6-NEXT: and $9, $9, $3 +; MIPS64R6-NEXT: and $10, $8, $6 +; MIPS64R6-NEXT: or $10, $10, $9 +; MIPS64R6-NEXT: sc $10, 0($1) +; MIPS64R6-NEXT: beqzc $10, .LBB7_1 ; MIPS64R6-NEXT: # %bb.2: # %entry -; MIPS64R6-NEXT: and $6, $7, $3 -; MIPS64R6-NEXT: srlv $6, $6, $2 -; MIPS64R6-NEXT: seh $6, $6 +; MIPS64R6-NEXT: and $7, $8, $3 +; MIPS64R6-NEXT: srlv $7, $7, $2 +; MIPS64R6-NEXT: seh $7, $7 ; MIPS64R6-NEXT: # %bb.3: # %entry -; MIPS64R6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6-NEXT: # %bb.4: # %entry ; MIPS64R6-NEXT: sync ; MIPS64R6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -2675,28 +2675,28 @@ define i16 @test_umin_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64EL-NEXT: sll $2, $2, 3 ; MIPS64EL-NEXT: ori $3, $zero, 65535 ; MIPS64EL-NEXT: sllv $3, $3, $2 -; MIPS64EL-NEXT: nor $4, $zero, $3 +; MIPS64EL-NEXT: nor $6, $zero, $3 ; MIPS64EL-NEXT: sllv $5, $5, $2 ; MIPS64EL-NEXT: .LBB7_1: # %entry ; MIPS64EL-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64EL-NEXT: ll $7, 0($1) -; MIPS64EL-NEXT: and $7, $7, $3 -; MIPS64EL-NEXT: and $5, $5, $3 -; MIPS64EL-NEXT: sltu $10, $7, $5 -; MIPS64EL-NEXT: move $8, $7 -; MIPS64EL-NEXT: movz $8, $5, $10 +; MIPS64EL-NEXT: ll $8, 0($1) ; MIPS64EL-NEXT: and $8, $8, $3 -; MIPS64EL-NEXT: and $9, $7, $4 -; MIPS64EL-NEXT: or $9, $9, $8 -; MIPS64EL-NEXT: sc $9, 0($1) -; MIPS64EL-NEXT: beqz $9, .LBB7_1 +; MIPS64EL-NEXT: and $5, $5, $3 +; MIPS64EL-NEXT: sltu $11, $8, $5 +; MIPS64EL-NEXT: move $9, $8 +; MIPS64EL-NEXT: movz $9, $5, $11 +; MIPS64EL-NEXT: and $9, $9, $3 +; MIPS64EL-NEXT: and $10, $8, $6 +; MIPS64EL-NEXT: or $10, $10, $9 +; MIPS64EL-NEXT: sc $10, 0($1) +; MIPS64EL-NEXT: beqz $10, .LBB7_1 ; MIPS64EL-NEXT: nop ; MIPS64EL-NEXT: # %bb.2: # %entry -; MIPS64EL-NEXT: and $6, $7, $3 -; MIPS64EL-NEXT: srlv $6, $6, $2 -; MIPS64EL-NEXT: seh $6, $6 +; MIPS64EL-NEXT: and $7, $8, $3 +; MIPS64EL-NEXT: srlv $7, $7, $2 +; MIPS64EL-NEXT: seh $7, $7 ; MIPS64EL-NEXT: # %bb.3: # %entry -; MIPS64EL-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64EL-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64EL-NEXT: # %bb.4: # %entry ; MIPS64EL-NEXT: sync ; MIPS64EL-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -2716,28 +2716,28 @@ define i16 @test_umin_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64ELR6-NEXT: sll $2, $2, 3 ; MIPS64ELR6-NEXT: ori $3, $zero, 65535 ; MIPS64ELR6-NEXT: sllv $3, $3, $2 -; MIPS64ELR6-NEXT: nor $4, $zero, $3 +; MIPS64ELR6-NEXT: nor $6, $zero, $3 ; MIPS64ELR6-NEXT: sllv $5, $5, $2 ; MIPS64ELR6-NEXT: .LBB7_1: # %entry ; MIPS64ELR6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64ELR6-NEXT: ll $7, 0($1) -; MIPS64ELR6-NEXT: and $7, $7, $3 -; MIPS64ELR6-NEXT: and $5, $5, $3 -; MIPS64ELR6-NEXT: sltu $10, $7, $5 -; MIPS64ELR6-NEXT: selnez $8, $7, $10 -; MIPS64ELR6-NEXT: seleqz $10, $5, $10 -; MIPS64ELR6-NEXT: or $8, $8, $10 +; MIPS64ELR6-NEXT: ll $8, 0($1) ; MIPS64ELR6-NEXT: and $8, $8, $3 -; MIPS64ELR6-NEXT: and $9, $7, $4 -; MIPS64ELR6-NEXT: or $9, $9, $8 -; MIPS64ELR6-NEXT: sc $9, 0($1) -; MIPS64ELR6-NEXT: beqzc $9, .LBB7_1 +; MIPS64ELR6-NEXT: and $5, $5, $3 +; MIPS64ELR6-NEXT: sltu $11, $8, $5 +; MIPS64ELR6-NEXT: selnez $9, $8, $11 +; MIPS64ELR6-NEXT: seleqz $11, $5, $11 +; MIPS64ELR6-NEXT: or $9, $9, $11 +; MIPS64ELR6-NEXT: and $9, $9, $3 +; MIPS64ELR6-NEXT: and $10, $8, $6 +; 
MIPS64ELR6-NEXT: or $10, $10, $9 +; MIPS64ELR6-NEXT: sc $10, 0($1) +; MIPS64ELR6-NEXT: beqzc $10, .LBB7_1 ; MIPS64ELR6-NEXT: # %bb.2: # %entry -; MIPS64ELR6-NEXT: and $6, $7, $3 -; MIPS64ELR6-NEXT: srlv $6, $6, $2 -; MIPS64ELR6-NEXT: seh $6, $6 +; MIPS64ELR6-NEXT: and $7, $8, $3 +; MIPS64ELR6-NEXT: srlv $7, $7, $2 +; MIPS64ELR6-NEXT: seh $7, $7 ; MIPS64ELR6-NEXT: # %bb.3: # %entry -; MIPS64ELR6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64ELR6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64ELR6-NEXT: # %bb.4: # %entry ; MIPS64ELR6-NEXT: sync ; MIPS64ELR6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -3079,26 +3079,26 @@ define i8 @test_max_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64-NEXT: sll $2, $2, 3 ; MIPS64-NEXT: ori $3, $zero, 255 ; MIPS64-NEXT: sllv $3, $3, $2 -; MIPS64-NEXT: nor $4, $zero, $3 +; MIPS64-NEXT: nor $6, $zero, $3 ; MIPS64-NEXT: sllv $5, $5, $2 ; MIPS64-NEXT: .LBB8_1: # %entry ; MIPS64-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64-NEXT: ll $7, 0($1) -; MIPS64-NEXT: slt $10, $7, $5 -; MIPS64-NEXT: move $8, $7 -; MIPS64-NEXT: movn $8, $5, $10 -; MIPS64-NEXT: and $8, $8, $3 -; MIPS64-NEXT: and $9, $7, $4 -; MIPS64-NEXT: or $9, $9, $8 -; MIPS64-NEXT: sc $9, 0($1) -; MIPS64-NEXT: beqz $9, .LBB8_1 +; MIPS64-NEXT: ll $8, 0($1) +; MIPS64-NEXT: slt $11, $8, $5 +; MIPS64-NEXT: move $9, $8 +; MIPS64-NEXT: movn $9, $5, $11 +; MIPS64-NEXT: and $9, $9, $3 +; MIPS64-NEXT: and $10, $8, $6 +; MIPS64-NEXT: or $10, $10, $9 +; MIPS64-NEXT: sc $10, 0($1) +; MIPS64-NEXT: beqz $10, .LBB8_1 ; MIPS64-NEXT: nop ; MIPS64-NEXT: # %bb.2: # %entry -; MIPS64-NEXT: and $6, $7, $3 -; MIPS64-NEXT: srlv $6, $6, $2 -; MIPS64-NEXT: seh $6, $6 +; MIPS64-NEXT: and $7, $8, $3 +; MIPS64-NEXT: srlv $7, $7, $2 +; MIPS64-NEXT: seh $7, $7 ; MIPS64-NEXT: # %bb.3: # %entry -; MIPS64-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64-NEXT: # %bb.4: # %entry ; MIPS64-NEXT: sync ; MIPS64-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -3119,26 +3119,26 @@ define i8 @test_max_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64R6-NEXT: sll $2, $2, 3 ; MIPS64R6-NEXT: ori $3, $zero, 255 ; MIPS64R6-NEXT: sllv $3, $3, $2 -; MIPS64R6-NEXT: nor $4, $zero, $3 +; MIPS64R6-NEXT: nor $6, $zero, $3 ; MIPS64R6-NEXT: sllv $5, $5, $2 ; MIPS64R6-NEXT: .LBB8_1: # %entry ; MIPS64R6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6-NEXT: ll $7, 0($1) -; MIPS64R6-NEXT: slt $10, $7, $5 -; MIPS64R6-NEXT: seleqz $8, $7, $10 -; MIPS64R6-NEXT: selnez $10, $5, $10 -; MIPS64R6-NEXT: or $8, $8, $10 -; MIPS64R6-NEXT: and $8, $8, $3 -; MIPS64R6-NEXT: and $9, $7, $4 -; MIPS64R6-NEXT: or $9, $9, $8 -; MIPS64R6-NEXT: sc $9, 0($1) -; MIPS64R6-NEXT: beqzc $9, .LBB8_1 +; MIPS64R6-NEXT: ll $8, 0($1) +; MIPS64R6-NEXT: slt $11, $8, $5 +; MIPS64R6-NEXT: seleqz $9, $8, $11 +; MIPS64R6-NEXT: selnez $11, $5, $11 +; MIPS64R6-NEXT: or $9, $9, $11 +; MIPS64R6-NEXT: and $9, $9, $3 +; MIPS64R6-NEXT: and $10, $8, $6 +; MIPS64R6-NEXT: or $10, $10, $9 +; MIPS64R6-NEXT: sc $10, 0($1) +; MIPS64R6-NEXT: beqzc $10, .LBB8_1 ; MIPS64R6-NEXT: # %bb.2: # %entry -; MIPS64R6-NEXT: and $6, $7, $3 -; MIPS64R6-NEXT: srlv $6, $6, $2 -; MIPS64R6-NEXT: seh $6, $6 +; MIPS64R6-NEXT: and $7, $8, $3 +; MIPS64R6-NEXT: srlv $7, $7, $2 +; MIPS64R6-NEXT: seh $7, $7 ; MIPS64R6-NEXT: # %bb.3: # %entry -; MIPS64R6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6-NEXT: # %bb.4: # %entry ; MIPS64R6-NEXT: sync ; MIPS64R6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload 
@@ -3157,28 +3157,28 @@ define i8 @test_max_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64EL-NEXT: sll $2, $2, 3 ; MIPS64EL-NEXT: ori $3, $zero, 255 ; MIPS64EL-NEXT: sllv $3, $3, $2 -; MIPS64EL-NEXT: nor $4, $zero, $3 +; MIPS64EL-NEXT: nor $6, $zero, $3 ; MIPS64EL-NEXT: sllv $5, $5, $2 ; MIPS64EL-NEXT: .LBB8_1: # %entry ; MIPS64EL-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64EL-NEXT: ll $7, 0($1) -; MIPS64EL-NEXT: and $7, $7, $3 -; MIPS64EL-NEXT: and $5, $5, $3 -; MIPS64EL-NEXT: slt $10, $7, $5 -; MIPS64EL-NEXT: move $8, $7 -; MIPS64EL-NEXT: movn $8, $5, $10 +; MIPS64EL-NEXT: ll $8, 0($1) ; MIPS64EL-NEXT: and $8, $8, $3 -; MIPS64EL-NEXT: and $9, $7, $4 -; MIPS64EL-NEXT: or $9, $9, $8 -; MIPS64EL-NEXT: sc $9, 0($1) -; MIPS64EL-NEXT: beqz $9, .LBB8_1 +; MIPS64EL-NEXT: and $5, $5, $3 +; MIPS64EL-NEXT: slt $11, $8, $5 +; MIPS64EL-NEXT: move $9, $8 +; MIPS64EL-NEXT: movn $9, $5, $11 +; MIPS64EL-NEXT: and $9, $9, $3 +; MIPS64EL-NEXT: and $10, $8, $6 +; MIPS64EL-NEXT: or $10, $10, $9 +; MIPS64EL-NEXT: sc $10, 0($1) +; MIPS64EL-NEXT: beqz $10, .LBB8_1 ; MIPS64EL-NEXT: nop ; MIPS64EL-NEXT: # %bb.2: # %entry -; MIPS64EL-NEXT: and $6, $7, $3 -; MIPS64EL-NEXT: srlv $6, $6, $2 -; MIPS64EL-NEXT: seh $6, $6 +; MIPS64EL-NEXT: and $7, $8, $3 +; MIPS64EL-NEXT: srlv $7, $7, $2 +; MIPS64EL-NEXT: seh $7, $7 ; MIPS64EL-NEXT: # %bb.3: # %entry -; MIPS64EL-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64EL-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64EL-NEXT: # %bb.4: # %entry ; MIPS64EL-NEXT: sync ; MIPS64EL-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -3198,28 +3198,28 @@ define i8 @test_max_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64ELR6-NEXT: sll $2, $2, 3 ; MIPS64ELR6-NEXT: ori $3, $zero, 255 ; MIPS64ELR6-NEXT: sllv $3, $3, $2 -; MIPS64ELR6-NEXT: nor $4, $zero, $3 +; MIPS64ELR6-NEXT: nor $6, $zero, $3 ; MIPS64ELR6-NEXT: sllv $5, $5, $2 ; MIPS64ELR6-NEXT: .LBB8_1: # %entry ; MIPS64ELR6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64ELR6-NEXT: ll $7, 0($1) -; MIPS64ELR6-NEXT: and $7, $7, $3 -; MIPS64ELR6-NEXT: and $5, $5, $3 -; MIPS64ELR6-NEXT: slt $10, $7, $5 -; MIPS64ELR6-NEXT: seleqz $8, $7, $10 -; MIPS64ELR6-NEXT: selnez $10, $5, $10 -; MIPS64ELR6-NEXT: or $8, $8, $10 +; MIPS64ELR6-NEXT: ll $8, 0($1) ; MIPS64ELR6-NEXT: and $8, $8, $3 -; MIPS64ELR6-NEXT: and $9, $7, $4 -; MIPS64ELR6-NEXT: or $9, $9, $8 -; MIPS64ELR6-NEXT: sc $9, 0($1) -; MIPS64ELR6-NEXT: beqzc $9, .LBB8_1 +; MIPS64ELR6-NEXT: and $5, $5, $3 +; MIPS64ELR6-NEXT: slt $11, $8, $5 +; MIPS64ELR6-NEXT: seleqz $9, $8, $11 +; MIPS64ELR6-NEXT: selnez $11, $5, $11 +; MIPS64ELR6-NEXT: or $9, $9, $11 +; MIPS64ELR6-NEXT: and $9, $9, $3 +; MIPS64ELR6-NEXT: and $10, $8, $6 +; MIPS64ELR6-NEXT: or $10, $10, $9 +; MIPS64ELR6-NEXT: sc $10, 0($1) +; MIPS64ELR6-NEXT: beqzc $10, .LBB8_1 ; MIPS64ELR6-NEXT: # %bb.2: # %entry -; MIPS64ELR6-NEXT: and $6, $7, $3 -; MIPS64ELR6-NEXT: srlv $6, $6, $2 -; MIPS64ELR6-NEXT: seh $6, $6 +; MIPS64ELR6-NEXT: and $7, $8, $3 +; MIPS64ELR6-NEXT: srlv $7, $7, $2 +; MIPS64ELR6-NEXT: seh $7, $7 ; MIPS64ELR6-NEXT: # %bb.3: # %entry -; MIPS64ELR6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64ELR6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64ELR6-NEXT: # %bb.4: # %entry ; MIPS64ELR6-NEXT: sync ; MIPS64ELR6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -3560,26 +3560,26 @@ define i8 @test_min_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64-NEXT: sll $2, $2, 3 ; MIPS64-NEXT: ori $3, $zero, 255 ; MIPS64-NEXT: sllv $3, $3, $2 -; MIPS64-NEXT: nor $4, $zero, $3 +; MIPS64-NEXT: nor $6, $zero, $3 ; 
MIPS64-NEXT: sllv $5, $5, $2 ; MIPS64-NEXT: .LBB9_1: # %entry ; MIPS64-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64-NEXT: ll $7, 0($1) -; MIPS64-NEXT: slt $10, $7, $5 -; MIPS64-NEXT: move $8, $7 -; MIPS64-NEXT: movz $8, $5, $10 -; MIPS64-NEXT: and $8, $8, $3 -; MIPS64-NEXT: and $9, $7, $4 -; MIPS64-NEXT: or $9, $9, $8 -; MIPS64-NEXT: sc $9, 0($1) -; MIPS64-NEXT: beqz $9, .LBB9_1 +; MIPS64-NEXT: ll $8, 0($1) +; MIPS64-NEXT: slt $11, $8, $5 +; MIPS64-NEXT: move $9, $8 +; MIPS64-NEXT: movz $9, $5, $11 +; MIPS64-NEXT: and $9, $9, $3 +; MIPS64-NEXT: and $10, $8, $6 +; MIPS64-NEXT: or $10, $10, $9 +; MIPS64-NEXT: sc $10, 0($1) +; MIPS64-NEXT: beqz $10, .LBB9_1 ; MIPS64-NEXT: nop ; MIPS64-NEXT: # %bb.2: # %entry -; MIPS64-NEXT: and $6, $7, $3 -; MIPS64-NEXT: srlv $6, $6, $2 -; MIPS64-NEXT: seh $6, $6 +; MIPS64-NEXT: and $7, $8, $3 +; MIPS64-NEXT: srlv $7, $7, $2 +; MIPS64-NEXT: seh $7, $7 ; MIPS64-NEXT: # %bb.3: # %entry -; MIPS64-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64-NEXT: # %bb.4: # %entry ; MIPS64-NEXT: sync ; MIPS64-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -3600,26 +3600,26 @@ define i8 @test_min_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64R6-NEXT: sll $2, $2, 3 ; MIPS64R6-NEXT: ori $3, $zero, 255 ; MIPS64R6-NEXT: sllv $3, $3, $2 -; MIPS64R6-NEXT: nor $4, $zero, $3 +; MIPS64R6-NEXT: nor $6, $zero, $3 ; MIPS64R6-NEXT: sllv $5, $5, $2 ; MIPS64R6-NEXT: .LBB9_1: # %entry ; MIPS64R6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6-NEXT: ll $7, 0($1) -; MIPS64R6-NEXT: slt $10, $7, $5 -; MIPS64R6-NEXT: selnez $8, $7, $10 -; MIPS64R6-NEXT: seleqz $10, $5, $10 -; MIPS64R6-NEXT: or $8, $8, $10 -; MIPS64R6-NEXT: and $8, $8, $3 -; MIPS64R6-NEXT: and $9, $7, $4 -; MIPS64R6-NEXT: or $9, $9, $8 -; MIPS64R6-NEXT: sc $9, 0($1) -; MIPS64R6-NEXT: beqzc $9, .LBB9_1 +; MIPS64R6-NEXT: ll $8, 0($1) +; MIPS64R6-NEXT: slt $11, $8, $5 +; MIPS64R6-NEXT: selnez $9, $8, $11 +; MIPS64R6-NEXT: seleqz $11, $5, $11 +; MIPS64R6-NEXT: or $9, $9, $11 +; MIPS64R6-NEXT: and $9, $9, $3 +; MIPS64R6-NEXT: and $10, $8, $6 +; MIPS64R6-NEXT: or $10, $10, $9 +; MIPS64R6-NEXT: sc $10, 0($1) +; MIPS64R6-NEXT: beqzc $10, .LBB9_1 ; MIPS64R6-NEXT: # %bb.2: # %entry -; MIPS64R6-NEXT: and $6, $7, $3 -; MIPS64R6-NEXT: srlv $6, $6, $2 -; MIPS64R6-NEXT: seh $6, $6 +; MIPS64R6-NEXT: and $7, $8, $3 +; MIPS64R6-NEXT: srlv $7, $7, $2 +; MIPS64R6-NEXT: seh $7, $7 ; MIPS64R6-NEXT: # %bb.3: # %entry -; MIPS64R6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6-NEXT: # %bb.4: # %entry ; MIPS64R6-NEXT: sync ; MIPS64R6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -3638,28 +3638,28 @@ define i8 @test_min_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64EL-NEXT: sll $2, $2, 3 ; MIPS64EL-NEXT: ori $3, $zero, 255 ; MIPS64EL-NEXT: sllv $3, $3, $2 -; MIPS64EL-NEXT: nor $4, $zero, $3 +; MIPS64EL-NEXT: nor $6, $zero, $3 ; MIPS64EL-NEXT: sllv $5, $5, $2 ; MIPS64EL-NEXT: .LBB9_1: # %entry ; MIPS64EL-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64EL-NEXT: ll $7, 0($1) -; MIPS64EL-NEXT: and $7, $7, $3 -; MIPS64EL-NEXT: and $5, $5, $3 -; MIPS64EL-NEXT: slt $10, $7, $5 -; MIPS64EL-NEXT: move $8, $7 -; MIPS64EL-NEXT: movz $8, $5, $10 +; MIPS64EL-NEXT: ll $8, 0($1) ; MIPS64EL-NEXT: and $8, $8, $3 -; MIPS64EL-NEXT: and $9, $7, $4 -; MIPS64EL-NEXT: or $9, $9, $8 -; MIPS64EL-NEXT: sc $9, 0($1) -; MIPS64EL-NEXT: beqz $9, .LBB9_1 +; MIPS64EL-NEXT: and $5, $5, $3 +; MIPS64EL-NEXT: slt $11, $8, $5 +; MIPS64EL-NEXT: move $9, $8 
+; MIPS64EL-NEXT: movz $9, $5, $11 +; MIPS64EL-NEXT: and $9, $9, $3 +; MIPS64EL-NEXT: and $10, $8, $6 +; MIPS64EL-NEXT: or $10, $10, $9 +; MIPS64EL-NEXT: sc $10, 0($1) +; MIPS64EL-NEXT: beqz $10, .LBB9_1 ; MIPS64EL-NEXT: nop ; MIPS64EL-NEXT: # %bb.2: # %entry -; MIPS64EL-NEXT: and $6, $7, $3 -; MIPS64EL-NEXT: srlv $6, $6, $2 -; MIPS64EL-NEXT: seh $6, $6 +; MIPS64EL-NEXT: and $7, $8, $3 +; MIPS64EL-NEXT: srlv $7, $7, $2 +; MIPS64EL-NEXT: seh $7, $7 ; MIPS64EL-NEXT: # %bb.3: # %entry -; MIPS64EL-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64EL-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64EL-NEXT: # %bb.4: # %entry ; MIPS64EL-NEXT: sync ; MIPS64EL-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -3679,28 +3679,28 @@ define i8 @test_min_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64ELR6-NEXT: sll $2, $2, 3 ; MIPS64ELR6-NEXT: ori $3, $zero, 255 ; MIPS64ELR6-NEXT: sllv $3, $3, $2 -; MIPS64ELR6-NEXT: nor $4, $zero, $3 +; MIPS64ELR6-NEXT: nor $6, $zero, $3 ; MIPS64ELR6-NEXT: sllv $5, $5, $2 ; MIPS64ELR6-NEXT: .LBB9_1: # %entry ; MIPS64ELR6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64ELR6-NEXT: ll $7, 0($1) -; MIPS64ELR6-NEXT: and $7, $7, $3 -; MIPS64ELR6-NEXT: and $5, $5, $3 -; MIPS64ELR6-NEXT: slt $10, $7, $5 -; MIPS64ELR6-NEXT: selnez $8, $7, $10 -; MIPS64ELR6-NEXT: seleqz $10, $5, $10 -; MIPS64ELR6-NEXT: or $8, $8, $10 +; MIPS64ELR6-NEXT: ll $8, 0($1) ; MIPS64ELR6-NEXT: and $8, $8, $3 -; MIPS64ELR6-NEXT: and $9, $7, $4 -; MIPS64ELR6-NEXT: or $9, $9, $8 -; MIPS64ELR6-NEXT: sc $9, 0($1) -; MIPS64ELR6-NEXT: beqzc $9, .LBB9_1 +; MIPS64ELR6-NEXT: and $5, $5, $3 +; MIPS64ELR6-NEXT: slt $11, $8, $5 +; MIPS64ELR6-NEXT: selnez $9, $8, $11 +; MIPS64ELR6-NEXT: seleqz $11, $5, $11 +; MIPS64ELR6-NEXT: or $9, $9, $11 +; MIPS64ELR6-NEXT: and $9, $9, $3 +; MIPS64ELR6-NEXT: and $10, $8, $6 +; MIPS64ELR6-NEXT: or $10, $10, $9 +; MIPS64ELR6-NEXT: sc $10, 0($1) +; MIPS64ELR6-NEXT: beqzc $10, .LBB9_1 ; MIPS64ELR6-NEXT: # %bb.2: # %entry -; MIPS64ELR6-NEXT: and $6, $7, $3 -; MIPS64ELR6-NEXT: srlv $6, $6, $2 -; MIPS64ELR6-NEXT: seh $6, $6 +; MIPS64ELR6-NEXT: and $7, $8, $3 +; MIPS64ELR6-NEXT: srlv $7, $7, $2 +; MIPS64ELR6-NEXT: seh $7, $7 ; MIPS64ELR6-NEXT: # %bb.3: # %entry -; MIPS64ELR6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64ELR6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64ELR6-NEXT: # %bb.4: # %entry ; MIPS64ELR6-NEXT: sync ; MIPS64ELR6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -4041,26 +4041,26 @@ define i8 @test_umax_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64-NEXT: sll $2, $2, 3 ; MIPS64-NEXT: ori $3, $zero, 255 ; MIPS64-NEXT: sllv $3, $3, $2 -; MIPS64-NEXT: nor $4, $zero, $3 +; MIPS64-NEXT: nor $6, $zero, $3 ; MIPS64-NEXT: sllv $5, $5, $2 ; MIPS64-NEXT: .LBB10_1: # %entry ; MIPS64-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64-NEXT: ll $7, 0($1) -; MIPS64-NEXT: sltu $10, $7, $5 -; MIPS64-NEXT: move $8, $7 -; MIPS64-NEXT: movn $8, $5, $10 -; MIPS64-NEXT: and $8, $8, $3 -; MIPS64-NEXT: and $9, $7, $4 -; MIPS64-NEXT: or $9, $9, $8 -; MIPS64-NEXT: sc $9, 0($1) -; MIPS64-NEXT: beqz $9, .LBB10_1 +; MIPS64-NEXT: ll $8, 0($1) +; MIPS64-NEXT: sltu $11, $8, $5 +; MIPS64-NEXT: move $9, $8 +; MIPS64-NEXT: movn $9, $5, $11 +; MIPS64-NEXT: and $9, $9, $3 +; MIPS64-NEXT: and $10, $8, $6 +; MIPS64-NEXT: or $10, $10, $9 +; MIPS64-NEXT: sc $10, 0($1) +; MIPS64-NEXT: beqz $10, .LBB10_1 ; MIPS64-NEXT: nop ; MIPS64-NEXT: # %bb.2: # %entry -; MIPS64-NEXT: and $6, $7, $3 -; MIPS64-NEXT: srlv $6, $6, $2 -; MIPS64-NEXT: seh $6, $6 +; MIPS64-NEXT: and $7, $8, $3 +; MIPS64-NEXT: 
srlv $7, $7, $2 +; MIPS64-NEXT: seh $7, $7 ; MIPS64-NEXT: # %bb.3: # %entry -; MIPS64-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64-NEXT: # %bb.4: # %entry ; MIPS64-NEXT: sync ; MIPS64-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -4081,26 +4081,26 @@ define i8 @test_umax_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64R6-NEXT: sll $2, $2, 3 ; MIPS64R6-NEXT: ori $3, $zero, 255 ; MIPS64R6-NEXT: sllv $3, $3, $2 -; MIPS64R6-NEXT: nor $4, $zero, $3 +; MIPS64R6-NEXT: nor $6, $zero, $3 ; MIPS64R6-NEXT: sllv $5, $5, $2 ; MIPS64R6-NEXT: .LBB10_1: # %entry ; MIPS64R6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6-NEXT: ll $7, 0($1) -; MIPS64R6-NEXT: sltu $10, $7, $5 -; MIPS64R6-NEXT: seleqz $8, $7, $10 -; MIPS64R6-NEXT: selnez $10, $5, $10 -; MIPS64R6-NEXT: or $8, $8, $10 -; MIPS64R6-NEXT: and $8, $8, $3 -; MIPS64R6-NEXT: and $9, $7, $4 -; MIPS64R6-NEXT: or $9, $9, $8 -; MIPS64R6-NEXT: sc $9, 0($1) -; MIPS64R6-NEXT: beqzc $9, .LBB10_1 +; MIPS64R6-NEXT: ll $8, 0($1) +; MIPS64R6-NEXT: sltu $11, $8, $5 +; MIPS64R6-NEXT: seleqz $9, $8, $11 +; MIPS64R6-NEXT: selnez $11, $5, $11 +; MIPS64R6-NEXT: or $9, $9, $11 +; MIPS64R6-NEXT: and $9, $9, $3 +; MIPS64R6-NEXT: and $10, $8, $6 +; MIPS64R6-NEXT: or $10, $10, $9 +; MIPS64R6-NEXT: sc $10, 0($1) +; MIPS64R6-NEXT: beqzc $10, .LBB10_1 ; MIPS64R6-NEXT: # %bb.2: # %entry -; MIPS64R6-NEXT: and $6, $7, $3 -; MIPS64R6-NEXT: srlv $6, $6, $2 -; MIPS64R6-NEXT: seh $6, $6 +; MIPS64R6-NEXT: and $7, $8, $3 +; MIPS64R6-NEXT: srlv $7, $7, $2 +; MIPS64R6-NEXT: seh $7, $7 ; MIPS64R6-NEXT: # %bb.3: # %entry -; MIPS64R6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6-NEXT: # %bb.4: # %entry ; MIPS64R6-NEXT: sync ; MIPS64R6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -4119,28 +4119,28 @@ define i8 @test_umax_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64EL-NEXT: sll $2, $2, 3 ; MIPS64EL-NEXT: ori $3, $zero, 255 ; MIPS64EL-NEXT: sllv $3, $3, $2 -; MIPS64EL-NEXT: nor $4, $zero, $3 +; MIPS64EL-NEXT: nor $6, $zero, $3 ; MIPS64EL-NEXT: sllv $5, $5, $2 ; MIPS64EL-NEXT: .LBB10_1: # %entry ; MIPS64EL-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64EL-NEXT: ll $7, 0($1) -; MIPS64EL-NEXT: and $7, $7, $3 -; MIPS64EL-NEXT: and $5, $5, $3 -; MIPS64EL-NEXT: sltu $10, $7, $5 -; MIPS64EL-NEXT: move $8, $7 -; MIPS64EL-NEXT: movn $8, $5, $10 +; MIPS64EL-NEXT: ll $8, 0($1) ; MIPS64EL-NEXT: and $8, $8, $3 -; MIPS64EL-NEXT: and $9, $7, $4 -; MIPS64EL-NEXT: or $9, $9, $8 -; MIPS64EL-NEXT: sc $9, 0($1) -; MIPS64EL-NEXT: beqz $9, .LBB10_1 +; MIPS64EL-NEXT: and $5, $5, $3 +; MIPS64EL-NEXT: sltu $11, $8, $5 +; MIPS64EL-NEXT: move $9, $8 +; MIPS64EL-NEXT: movn $9, $5, $11 +; MIPS64EL-NEXT: and $9, $9, $3 +; MIPS64EL-NEXT: and $10, $8, $6 +; MIPS64EL-NEXT: or $10, $10, $9 +; MIPS64EL-NEXT: sc $10, 0($1) +; MIPS64EL-NEXT: beqz $10, .LBB10_1 ; MIPS64EL-NEXT: nop ; MIPS64EL-NEXT: # %bb.2: # %entry -; MIPS64EL-NEXT: and $6, $7, $3 -; MIPS64EL-NEXT: srlv $6, $6, $2 -; MIPS64EL-NEXT: seh $6, $6 +; MIPS64EL-NEXT: and $7, $8, $3 +; MIPS64EL-NEXT: srlv $7, $7, $2 +; MIPS64EL-NEXT: seh $7, $7 ; MIPS64EL-NEXT: # %bb.3: # %entry -; MIPS64EL-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64EL-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64EL-NEXT: # %bb.4: # %entry ; MIPS64EL-NEXT: sync ; MIPS64EL-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -4160,28 +4160,28 @@ define i8 @test_umax_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64ELR6-NEXT: sll $2, $2, 3 ; MIPS64ELR6-NEXT: 
ori $3, $zero, 255 ; MIPS64ELR6-NEXT: sllv $3, $3, $2 -; MIPS64ELR6-NEXT: nor $4, $zero, $3 +; MIPS64ELR6-NEXT: nor $6, $zero, $3 ; MIPS64ELR6-NEXT: sllv $5, $5, $2 ; MIPS64ELR6-NEXT: .LBB10_1: # %entry ; MIPS64ELR6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64ELR6-NEXT: ll $7, 0($1) -; MIPS64ELR6-NEXT: and $7, $7, $3 -; MIPS64ELR6-NEXT: and $5, $5, $3 -; MIPS64ELR6-NEXT: sltu $10, $7, $5 -; MIPS64ELR6-NEXT: seleqz $8, $7, $10 -; MIPS64ELR6-NEXT: selnez $10, $5, $10 -; MIPS64ELR6-NEXT: or $8, $8, $10 +; MIPS64ELR6-NEXT: ll $8, 0($1) ; MIPS64ELR6-NEXT: and $8, $8, $3 -; MIPS64ELR6-NEXT: and $9, $7, $4 -; MIPS64ELR6-NEXT: or $9, $9, $8 -; MIPS64ELR6-NEXT: sc $9, 0($1) -; MIPS64ELR6-NEXT: beqzc $9, .LBB10_1 +; MIPS64ELR6-NEXT: and $5, $5, $3 +; MIPS64ELR6-NEXT: sltu $11, $8, $5 +; MIPS64ELR6-NEXT: seleqz $9, $8, $11 +; MIPS64ELR6-NEXT: selnez $11, $5, $11 +; MIPS64ELR6-NEXT: or $9, $9, $11 +; MIPS64ELR6-NEXT: and $9, $9, $3 +; MIPS64ELR6-NEXT: and $10, $8, $6 +; MIPS64ELR6-NEXT: or $10, $10, $9 +; MIPS64ELR6-NEXT: sc $10, 0($1) +; MIPS64ELR6-NEXT: beqzc $10, .LBB10_1 ; MIPS64ELR6-NEXT: # %bb.2: # %entry -; MIPS64ELR6-NEXT: and $6, $7, $3 -; MIPS64ELR6-NEXT: srlv $6, $6, $2 -; MIPS64ELR6-NEXT: seh $6, $6 +; MIPS64ELR6-NEXT: and $7, $8, $3 +; MIPS64ELR6-NEXT: srlv $7, $7, $2 +; MIPS64ELR6-NEXT: seh $7, $7 ; MIPS64ELR6-NEXT: # %bb.3: # %entry -; MIPS64ELR6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64ELR6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64ELR6-NEXT: # %bb.4: # %entry ; MIPS64ELR6-NEXT: sync ; MIPS64ELR6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -4522,26 +4522,26 @@ define i8 @test_umin_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64-NEXT: sll $2, $2, 3 ; MIPS64-NEXT: ori $3, $zero, 255 ; MIPS64-NEXT: sllv $3, $3, $2 -; MIPS64-NEXT: nor $4, $zero, $3 +; MIPS64-NEXT: nor $6, $zero, $3 ; MIPS64-NEXT: sllv $5, $5, $2 ; MIPS64-NEXT: .LBB11_1: # %entry ; MIPS64-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64-NEXT: ll $7, 0($1) -; MIPS64-NEXT: sltu $10, $7, $5 -; MIPS64-NEXT: move $8, $7 -; MIPS64-NEXT: movz $8, $5, $10 -; MIPS64-NEXT: and $8, $8, $3 -; MIPS64-NEXT: and $9, $7, $4 -; MIPS64-NEXT: or $9, $9, $8 -; MIPS64-NEXT: sc $9, 0($1) -; MIPS64-NEXT: beqz $9, .LBB11_1 +; MIPS64-NEXT: ll $8, 0($1) +; MIPS64-NEXT: sltu $11, $8, $5 +; MIPS64-NEXT: move $9, $8 +; MIPS64-NEXT: movz $9, $5, $11 +; MIPS64-NEXT: and $9, $9, $3 +; MIPS64-NEXT: and $10, $8, $6 +; MIPS64-NEXT: or $10, $10, $9 +; MIPS64-NEXT: sc $10, 0($1) +; MIPS64-NEXT: beqz $10, .LBB11_1 ; MIPS64-NEXT: nop ; MIPS64-NEXT: # %bb.2: # %entry -; MIPS64-NEXT: and $6, $7, $3 -; MIPS64-NEXT: srlv $6, $6, $2 -; MIPS64-NEXT: seh $6, $6 +; MIPS64-NEXT: and $7, $8, $3 +; MIPS64-NEXT: srlv $7, $7, $2 +; MIPS64-NEXT: seh $7, $7 ; MIPS64-NEXT: # %bb.3: # %entry -; MIPS64-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64-NEXT: # %bb.4: # %entry ; MIPS64-NEXT: sync ; MIPS64-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -4562,26 +4562,26 @@ define i8 @test_umin_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64R6-NEXT: sll $2, $2, 3 ; MIPS64R6-NEXT: ori $3, $zero, 255 ; MIPS64R6-NEXT: sllv $3, $3, $2 -; MIPS64R6-NEXT: nor $4, $zero, $3 +; MIPS64R6-NEXT: nor $6, $zero, $3 ; MIPS64R6-NEXT: sllv $5, $5, $2 ; MIPS64R6-NEXT: .LBB11_1: # %entry ; MIPS64R6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6-NEXT: ll $7, 0($1) -; MIPS64R6-NEXT: sltu $10, $7, $5 -; MIPS64R6-NEXT: selnez $8, $7, $10 -; MIPS64R6-NEXT: seleqz $10, $5, $10 -; MIPS64R6-NEXT: or $8, $8, $10 
-; MIPS64R6-NEXT: and $8, $8, $3 -; MIPS64R6-NEXT: and $9, $7, $4 -; MIPS64R6-NEXT: or $9, $9, $8 -; MIPS64R6-NEXT: sc $9, 0($1) -; MIPS64R6-NEXT: beqzc $9, .LBB11_1 +; MIPS64R6-NEXT: ll $8, 0($1) +; MIPS64R6-NEXT: sltu $11, $8, $5 +; MIPS64R6-NEXT: selnez $9, $8, $11 +; MIPS64R6-NEXT: seleqz $11, $5, $11 +; MIPS64R6-NEXT: or $9, $9, $11 +; MIPS64R6-NEXT: and $9, $9, $3 +; MIPS64R6-NEXT: and $10, $8, $6 +; MIPS64R6-NEXT: or $10, $10, $9 +; MIPS64R6-NEXT: sc $10, 0($1) +; MIPS64R6-NEXT: beqzc $10, .LBB11_1 ; MIPS64R6-NEXT: # %bb.2: # %entry -; MIPS64R6-NEXT: and $6, $7, $3 -; MIPS64R6-NEXT: srlv $6, $6, $2 -; MIPS64R6-NEXT: seh $6, $6 +; MIPS64R6-NEXT: and $7, $8, $3 +; MIPS64R6-NEXT: srlv $7, $7, $2 +; MIPS64R6-NEXT: seh $7, $7 ; MIPS64R6-NEXT: # %bb.3: # %entry -; MIPS64R6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6-NEXT: # %bb.4: # %entry ; MIPS64R6-NEXT: sync ; MIPS64R6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -4600,28 +4600,28 @@ define i8 @test_umin_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64EL-NEXT: sll $2, $2, 3 ; MIPS64EL-NEXT: ori $3, $zero, 255 ; MIPS64EL-NEXT: sllv $3, $3, $2 -; MIPS64EL-NEXT: nor $4, $zero, $3 +; MIPS64EL-NEXT: nor $6, $zero, $3 ; MIPS64EL-NEXT: sllv $5, $5, $2 ; MIPS64EL-NEXT: .LBB11_1: # %entry ; MIPS64EL-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64EL-NEXT: ll $7, 0($1) -; MIPS64EL-NEXT: and $7, $7, $3 -; MIPS64EL-NEXT: and $5, $5, $3 -; MIPS64EL-NEXT: sltu $10, $7, $5 -; MIPS64EL-NEXT: move $8, $7 -; MIPS64EL-NEXT: movz $8, $5, $10 +; MIPS64EL-NEXT: ll $8, 0($1) ; MIPS64EL-NEXT: and $8, $8, $3 -; MIPS64EL-NEXT: and $9, $7, $4 -; MIPS64EL-NEXT: or $9, $9, $8 -; MIPS64EL-NEXT: sc $9, 0($1) -; MIPS64EL-NEXT: beqz $9, .LBB11_1 +; MIPS64EL-NEXT: and $5, $5, $3 +; MIPS64EL-NEXT: sltu $11, $8, $5 +; MIPS64EL-NEXT: move $9, $8 +; MIPS64EL-NEXT: movz $9, $5, $11 +; MIPS64EL-NEXT: and $9, $9, $3 +; MIPS64EL-NEXT: and $10, $8, $6 +; MIPS64EL-NEXT: or $10, $10, $9 +; MIPS64EL-NEXT: sc $10, 0($1) +; MIPS64EL-NEXT: beqz $10, .LBB11_1 ; MIPS64EL-NEXT: nop ; MIPS64EL-NEXT: # %bb.2: # %entry -; MIPS64EL-NEXT: and $6, $7, $3 -; MIPS64EL-NEXT: srlv $6, $6, $2 -; MIPS64EL-NEXT: seh $6, $6 +; MIPS64EL-NEXT: and $7, $8, $3 +; MIPS64EL-NEXT: srlv $7, $7, $2 +; MIPS64EL-NEXT: seh $7, $7 ; MIPS64EL-NEXT: # %bb.3: # %entry -; MIPS64EL-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64EL-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64EL-NEXT: # %bb.4: # %entry ; MIPS64EL-NEXT: sync ; MIPS64EL-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -4641,28 +4641,28 @@ define i8 @test_umin_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64ELR6-NEXT: sll $2, $2, 3 ; MIPS64ELR6-NEXT: ori $3, $zero, 255 ; MIPS64ELR6-NEXT: sllv $3, $3, $2 -; MIPS64ELR6-NEXT: nor $4, $zero, $3 +; MIPS64ELR6-NEXT: nor $6, $zero, $3 ; MIPS64ELR6-NEXT: sllv $5, $5, $2 ; MIPS64ELR6-NEXT: .LBB11_1: # %entry ; MIPS64ELR6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64ELR6-NEXT: ll $7, 0($1) -; MIPS64ELR6-NEXT: and $7, $7, $3 -; MIPS64ELR6-NEXT: and $5, $5, $3 -; MIPS64ELR6-NEXT: sltu $10, $7, $5 -; MIPS64ELR6-NEXT: selnez $8, $7, $10 -; MIPS64ELR6-NEXT: seleqz $10, $5, $10 -; MIPS64ELR6-NEXT: or $8, $8, $10 +; MIPS64ELR6-NEXT: ll $8, 0($1) ; MIPS64ELR6-NEXT: and $8, $8, $3 -; MIPS64ELR6-NEXT: and $9, $7, $4 -; MIPS64ELR6-NEXT: or $9, $9, $8 -; MIPS64ELR6-NEXT: sc $9, 0($1) -; MIPS64ELR6-NEXT: beqzc $9, .LBB11_1 +; MIPS64ELR6-NEXT: and $5, $5, $3 +; MIPS64ELR6-NEXT: sltu $11, $8, $5 +; MIPS64ELR6-NEXT: selnez $9, $8, $11 +; 
MIPS64ELR6-NEXT: seleqz $11, $5, $11 +; MIPS64ELR6-NEXT: or $9, $9, $11 +; MIPS64ELR6-NEXT: and $9, $9, $3 +; MIPS64ELR6-NEXT: and $10, $8, $6 +; MIPS64ELR6-NEXT: or $10, $10, $9 +; MIPS64ELR6-NEXT: sc $10, 0($1) +; MIPS64ELR6-NEXT: beqzc $10, .LBB11_1 ; MIPS64ELR6-NEXT: # %bb.2: # %entry -; MIPS64ELR6-NEXT: and $6, $7, $3 -; MIPS64ELR6-NEXT: srlv $6, $6, $2 -; MIPS64ELR6-NEXT: seh $6, $6 +; MIPS64ELR6-NEXT: and $7, $8, $3 +; MIPS64ELR6-NEXT: srlv $7, $7, $2 +; MIPS64ELR6-NEXT: seh $7, $7 ; MIPS64ELR6-NEXT: # %bb.3: # %entry -; MIPS64ELR6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64ELR6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64ELR6-NEXT: # %bb.4: # %entry ; MIPS64ELR6-NEXT: sync ; MIPS64ELR6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/Mips/atomic.ll b/llvm/test/CodeGen/Mips/atomic.ll index 59ff83e4969cc..3846fda47b138 100644 --- a/llvm/test/CodeGen/Mips/atomic.ll +++ b/llvm/test/CodeGen/Mips/atomic.ll @@ -2559,28 +2559,28 @@ define signext i8 @AtomicLoadAdd8(i8 signext %incr) nounwind { ; MIPS64R6O0-NEXT: ld $1, %got_disp(y)($1) ; MIPS64R6O0-NEXT: daddiu $2, $zero, -4 ; MIPS64R6O0-NEXT: and $2, $1, $2 -; MIPS64R6O0-NEXT: andi $1, $1, 3 -; MIPS64R6O0-NEXT: xori $1, $1, 3 -; MIPS64R6O0-NEXT: sll $1, $1, 3 -; MIPS64R6O0-NEXT: ori $3, $zero, 255 -; MIPS64R6O0-NEXT: sllv $3, $3, $1 -; MIPS64R6O0-NEXT: nor $5, $zero, $3 -; MIPS64R6O0-NEXT: sllv $4, $4, $1 +; MIPS64R6O0-NEXT: andi $3, $1, 3 +; MIPS64R6O0-NEXT: xori $3, $3, 3 +; MIPS64R6O0-NEXT: sll $3, $3, 3 +; MIPS64R6O0-NEXT: ori $5, $zero, 255 +; MIPS64R6O0-NEXT: sllv $5, $5, $3 +; MIPS64R6O0-NEXT: nor $6, $zero, $5 +; MIPS64R6O0-NEXT: sllv $4, $4, $3 ; MIPS64R6O0-NEXT: .LBB8_1: # %entry ; MIPS64R6O0-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6O0-NEXT: ll $7, 0($2) -; MIPS64R6O0-NEXT: addu $8, $7, $4 -; MIPS64R6O0-NEXT: and $8, $8, $3 -; MIPS64R6O0-NEXT: and $9, $7, $5 -; MIPS64R6O0-NEXT: or $9, $9, $8 -; MIPS64R6O0-NEXT: sc $9, 0($2) -; MIPS64R6O0-NEXT: beqzc $9, .LBB8_1 +; MIPS64R6O0-NEXT: ll $8, 0($2) +; MIPS64R6O0-NEXT: addu $9, $8, $4 +; MIPS64R6O0-NEXT: and $9, $9, $5 +; MIPS64R6O0-NEXT: and $10, $8, $6 +; MIPS64R6O0-NEXT: or $10, $10, $9 +; MIPS64R6O0-NEXT: sc $10, 0($2) +; MIPS64R6O0-NEXT: beqzc $10, .LBB8_1 ; MIPS64R6O0-NEXT: # %bb.2: # %entry -; MIPS64R6O0-NEXT: and $6, $7, $3 -; MIPS64R6O0-NEXT: srlv $6, $6, $1 -; MIPS64R6O0-NEXT: seb $6, $6 +; MIPS64R6O0-NEXT: and $7, $8, $5 +; MIPS64R6O0-NEXT: srlv $7, $7, $3 +; MIPS64R6O0-NEXT: seb $7, $7 ; MIPS64R6O0-NEXT: # %bb.3: # %entry -; MIPS64R6O0-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6O0-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6O0-NEXT: # %bb.4: # %entry ; MIPS64R6O0-NEXT: lw $1, 12($sp) # 4-byte Folded Reload ; MIPS64R6O0-NEXT: seb $2, $1 @@ -3075,28 +3075,28 @@ define signext i8 @AtomicLoadSub8(i8 signext %incr) nounwind { ; MIPS64R6O0-NEXT: ld $1, %got_disp(y)($1) ; MIPS64R6O0-NEXT: daddiu $2, $zero, -4 ; MIPS64R6O0-NEXT: and $2, $1, $2 -; MIPS64R6O0-NEXT: andi $1, $1, 3 -; MIPS64R6O0-NEXT: xori $1, $1, 3 -; MIPS64R6O0-NEXT: sll $1, $1, 3 -; MIPS64R6O0-NEXT: ori $3, $zero, 255 -; MIPS64R6O0-NEXT: sllv $3, $3, $1 -; MIPS64R6O0-NEXT: nor $5, $zero, $3 -; MIPS64R6O0-NEXT: sllv $4, $4, $1 +; MIPS64R6O0-NEXT: andi $3, $1, 3 +; MIPS64R6O0-NEXT: xori $3, $3, 3 +; MIPS64R6O0-NEXT: sll $3, $3, 3 +; MIPS64R6O0-NEXT: ori $5, $zero, 255 +; MIPS64R6O0-NEXT: sllv $5, $5, $3 +; MIPS64R6O0-NEXT: nor $6, $zero, $5 +; MIPS64R6O0-NEXT: sllv $4, $4, $3 ; MIPS64R6O0-NEXT: .LBB9_1: # %entry ; MIPS64R6O0-NEXT: # =>This 
Inner Loop Header: Depth=1 -; MIPS64R6O0-NEXT: ll $7, 0($2) -; MIPS64R6O0-NEXT: subu $8, $7, $4 -; MIPS64R6O0-NEXT: and $8, $8, $3 -; MIPS64R6O0-NEXT: and $9, $7, $5 -; MIPS64R6O0-NEXT: or $9, $9, $8 -; MIPS64R6O0-NEXT: sc $9, 0($2) -; MIPS64R6O0-NEXT: beqzc $9, .LBB9_1 +; MIPS64R6O0-NEXT: ll $8, 0($2) +; MIPS64R6O0-NEXT: subu $9, $8, $4 +; MIPS64R6O0-NEXT: and $9, $9, $5 +; MIPS64R6O0-NEXT: and $10, $8, $6 +; MIPS64R6O0-NEXT: or $10, $10, $9 +; MIPS64R6O0-NEXT: sc $10, 0($2) +; MIPS64R6O0-NEXT: beqzc $10, .LBB9_1 ; MIPS64R6O0-NEXT: # %bb.2: # %entry -; MIPS64R6O0-NEXT: and $6, $7, $3 -; MIPS64R6O0-NEXT: srlv $6, $6, $1 -; MIPS64R6O0-NEXT: seb $6, $6 +; MIPS64R6O0-NEXT: and $7, $8, $5 +; MIPS64R6O0-NEXT: srlv $7, $7, $3 +; MIPS64R6O0-NEXT: seb $7, $7 ; MIPS64R6O0-NEXT: # %bb.3: # %entry -; MIPS64R6O0-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6O0-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6O0-NEXT: # %bb.4: # %entry ; MIPS64R6O0-NEXT: lw $1, 12($sp) # 4-byte Folded Reload ; MIPS64R6O0-NEXT: seb $2, $1 @@ -3601,29 +3601,29 @@ define signext i8 @AtomicLoadNand8(i8 signext %incr) nounwind { ; MIPS64R6O0-NEXT: ld $1, %got_disp(y)($1) ; MIPS64R6O0-NEXT: daddiu $2, $zero, -4 ; MIPS64R6O0-NEXT: and $2, $1, $2 -; MIPS64R6O0-NEXT: andi $1, $1, 3 -; MIPS64R6O0-NEXT: xori $1, $1, 3 -; MIPS64R6O0-NEXT: sll $1, $1, 3 -; MIPS64R6O0-NEXT: ori $3, $zero, 255 -; MIPS64R6O0-NEXT: sllv $3, $3, $1 -; MIPS64R6O0-NEXT: nor $5, $zero, $3 -; MIPS64R6O0-NEXT: sllv $4, $4, $1 +; MIPS64R6O0-NEXT: andi $3, $1, 3 +; MIPS64R6O0-NEXT: xori $3, $3, 3 +; MIPS64R6O0-NEXT: sll $3, $3, 3 +; MIPS64R6O0-NEXT: ori $5, $zero, 255 +; MIPS64R6O0-NEXT: sllv $5, $5, $3 +; MIPS64R6O0-NEXT: nor $6, $zero, $5 +; MIPS64R6O0-NEXT: sllv $4, $4, $3 ; MIPS64R6O0-NEXT: .LBB10_1: # %entry ; MIPS64R6O0-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6O0-NEXT: ll $7, 0($2) -; MIPS64R6O0-NEXT: and $8, $7, $4 -; MIPS64R6O0-NEXT: nor $8, $zero, $8 -; MIPS64R6O0-NEXT: and $8, $8, $3 -; MIPS64R6O0-NEXT: and $9, $7, $5 -; MIPS64R6O0-NEXT: or $9, $9, $8 -; MIPS64R6O0-NEXT: sc $9, 0($2) -; MIPS64R6O0-NEXT: beqzc $9, .LBB10_1 +; MIPS64R6O0-NEXT: ll $8, 0($2) +; MIPS64R6O0-NEXT: and $9, $8, $4 +; MIPS64R6O0-NEXT: nor $9, $zero, $9 +; MIPS64R6O0-NEXT: and $9, $9, $5 +; MIPS64R6O0-NEXT: and $10, $8, $6 +; MIPS64R6O0-NEXT: or $10, $10, $9 +; MIPS64R6O0-NEXT: sc $10, 0($2) +; MIPS64R6O0-NEXT: beqzc $10, .LBB10_1 ; MIPS64R6O0-NEXT: # %bb.2: # %entry -; MIPS64R6O0-NEXT: and $6, $7, $3 -; MIPS64R6O0-NEXT: srlv $6, $6, $1 -; MIPS64R6O0-NEXT: seb $6, $6 +; MIPS64R6O0-NEXT: and $7, $8, $5 +; MIPS64R6O0-NEXT: srlv $7, $7, $3 +; MIPS64R6O0-NEXT: seb $7, $7 ; MIPS64R6O0-NEXT: # %bb.3: # %entry -; MIPS64R6O0-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6O0-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6O0-NEXT: # %bb.4: # %entry ; MIPS64R6O0-NEXT: lw $1, 12($sp) # 4-byte Folded Reload ; MIPS64R6O0-NEXT: seb $2, $1 @@ -4115,27 +4115,27 @@ define signext i8 @AtomicSwap8(i8 signext %newval) nounwind { ; MIPS64R6O0-NEXT: ld $1, %got_disp(y)($1) ; MIPS64R6O0-NEXT: daddiu $2, $zero, -4 ; MIPS64R6O0-NEXT: and $2, $1, $2 -; MIPS64R6O0-NEXT: andi $1, $1, 3 -; MIPS64R6O0-NEXT: xori $1, $1, 3 -; MIPS64R6O0-NEXT: sll $1, $1, 3 -; MIPS64R6O0-NEXT: ori $3, $zero, 255 -; MIPS64R6O0-NEXT: sllv $3, $3, $1 -; MIPS64R6O0-NEXT: nor $5, $zero, $3 -; MIPS64R6O0-NEXT: sllv $4, $4, $1 +; MIPS64R6O0-NEXT: andi $3, $1, 3 +; MIPS64R6O0-NEXT: xori $3, $3, 3 +; MIPS64R6O0-NEXT: sll $3, $3, 3 +; MIPS64R6O0-NEXT: ori $5, $zero, 255 +; MIPS64R6O0-NEXT: sllv $5, 
$5, $3 +; MIPS64R6O0-NEXT: nor $6, $zero, $5 +; MIPS64R6O0-NEXT: sllv $4, $4, $3 ; MIPS64R6O0-NEXT: .LBB11_1: # %entry ; MIPS64R6O0-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6O0-NEXT: ll $7, 0($2) -; MIPS64R6O0-NEXT: and $8, $4, $3 -; MIPS64R6O0-NEXT: and $9, $7, $5 -; MIPS64R6O0-NEXT: or $9, $9, $8 -; MIPS64R6O0-NEXT: sc $9, 0($2) -; MIPS64R6O0-NEXT: beqzc $9, .LBB11_1 +; MIPS64R6O0-NEXT: ll $8, 0($2) +; MIPS64R6O0-NEXT: and $9, $4, $5 +; MIPS64R6O0-NEXT: and $10, $8, $6 +; MIPS64R6O0-NEXT: or $10, $10, $9 +; MIPS64R6O0-NEXT: sc $10, 0($2) +; MIPS64R6O0-NEXT: beqzc $10, .LBB11_1 ; MIPS64R6O0-NEXT: # %bb.2: # %entry -; MIPS64R6O0-NEXT: and $6, $7, $3 -; MIPS64R6O0-NEXT: srlv $6, $6, $1 -; MIPS64R6O0-NEXT: seb $6, $6 +; MIPS64R6O0-NEXT: and $7, $8, $5 +; MIPS64R6O0-NEXT: srlv $7, $7, $3 +; MIPS64R6O0-NEXT: seb $7, $7 ; MIPS64R6O0-NEXT: # %bb.3: # %entry -; MIPS64R6O0-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6O0-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6O0-NEXT: # %bb.4: # %entry ; MIPS64R6O0-NEXT: lw $1, 12($sp) # 4-byte Folded Reload ; MIPS64R6O0-NEXT: seb $2, $1 @@ -4666,32 +4666,32 @@ define signext i8 @AtomicCmpSwap8(i8 signext %oldval, i8 signext %newval) nounwi ; MIPS64R6O0-NEXT: ld $1, %got_disp(y)($1) ; MIPS64R6O0-NEXT: daddiu $2, $zero, -4 ; MIPS64R6O0-NEXT: and $2, $1, $2 -; MIPS64R6O0-NEXT: andi $1, $1, 3 -; MIPS64R6O0-NEXT: xori $1, $1, 3 -; MIPS64R6O0-NEXT: sll $1, $1, 3 -; MIPS64R6O0-NEXT: ori $3, $zero, 255 -; MIPS64R6O0-NEXT: sllv $3, $3, $1 -; MIPS64R6O0-NEXT: nor $6, $zero, $3 +; MIPS64R6O0-NEXT: andi $3, $1, 3 +; MIPS64R6O0-NEXT: xori $3, $3, 3 +; MIPS64R6O0-NEXT: sll $3, $3, 3 +; MIPS64R6O0-NEXT: ori $6, $zero, 255 +; MIPS64R6O0-NEXT: sllv $6, $6, $3 +; MIPS64R6O0-NEXT: nor $7, $zero, $6 ; MIPS64R6O0-NEXT: andi $4, $4, 255 -; MIPS64R6O0-NEXT: sllv $4, $4, $1 +; MIPS64R6O0-NEXT: sllv $4, $4, $3 ; MIPS64R6O0-NEXT: andi $5, $5, 255 -; MIPS64R6O0-NEXT: sllv $5, $5, $1 +; MIPS64R6O0-NEXT: sllv $5, $5, $3 ; MIPS64R6O0-NEXT: .LBB12_1: # %entry ; MIPS64R6O0-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6O0-NEXT: ll $8, 0($2) -; MIPS64R6O0-NEXT: and $9, $8, $3 -; MIPS64R6O0-NEXT: bnec $9, $4, .LBB12_3 +; MIPS64R6O0-NEXT: ll $9, 0($2) +; MIPS64R6O0-NEXT: and $10, $9, $6 +; MIPS64R6O0-NEXT: bnec $10, $4, .LBB12_3 ; MIPS64R6O0-NEXT: # %bb.2: # %entry ; MIPS64R6O0-NEXT: # in Loop: Header=BB12_1 Depth=1 -; MIPS64R6O0-NEXT: and $8, $8, $6 -; MIPS64R6O0-NEXT: or $8, $8, $5 -; MIPS64R6O0-NEXT: sc $8, 0($2) -; MIPS64R6O0-NEXT: beqzc $8, .LBB12_1 +; MIPS64R6O0-NEXT: and $9, $9, $7 +; MIPS64R6O0-NEXT: or $9, $9, $5 +; MIPS64R6O0-NEXT: sc $9, 0($2) +; MIPS64R6O0-NEXT: beqzc $9, .LBB12_1 ; MIPS64R6O0-NEXT: .LBB12_3: # %entry -; MIPS64R6O0-NEXT: srlv $7, $9, $1 -; MIPS64R6O0-NEXT: seb $7, $7 +; MIPS64R6O0-NEXT: srlv $8, $10, $3 +; MIPS64R6O0-NEXT: seb $8, $8 ; MIPS64R6O0-NEXT: # %bb.4: # %entry -; MIPS64R6O0-NEXT: sw $7, 12($sp) # 4-byte Folded Spill +; MIPS64R6O0-NEXT: sw $8, 12($sp) # 4-byte Folded Spill ; MIPS64R6O0-NEXT: # %bb.5: # %entry ; MIPS64R6O0-NEXT: lw $2, 12($sp) # 4-byte Folded Reload ; MIPS64R6O0-NEXT: daddiu $sp, $sp, 16 @@ -5236,28 +5236,28 @@ define i1 @AtomicCmpSwapRes8(i8* %ptr, i8 signext %oldval, i8 signext %newval) n ; MIPS64R6O0-NEXT: sll $2, $2, 3 ; MIPS64R6O0-NEXT: ori $3, $zero, 255 ; MIPS64R6O0-NEXT: sllv $3, $3, $2 -; MIPS64R6O0-NEXT: nor $4, $zero, $3 -; MIPS64R6O0-NEXT: andi $7, $5, 255 -; MIPS64R6O0-NEXT: sllv $7, $7, $2 +; MIPS64R6O0-NEXT: nor $7, $zero, $3 +; MIPS64R6O0-NEXT: andi $8, $5, 255 +; MIPS64R6O0-NEXT: sllv $8, 
$8, $2 ; MIPS64R6O0-NEXT: andi $6, $6, 255 ; MIPS64R6O0-NEXT: sllv $6, $6, $2 ; MIPS64R6O0-NEXT: .LBB13_1: # %entry ; MIPS64R6O0-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6O0-NEXT: ll $9, 0($1) -; MIPS64R6O0-NEXT: and $10, $9, $3 -; MIPS64R6O0-NEXT: bnec $10, $7, .LBB13_3 +; MIPS64R6O0-NEXT: ll $10, 0($1) +; MIPS64R6O0-NEXT: and $11, $10, $3 +; MIPS64R6O0-NEXT: bnec $11, $8, .LBB13_3 ; MIPS64R6O0-NEXT: # %bb.2: # %entry ; MIPS64R6O0-NEXT: # in Loop: Header=BB13_1 Depth=1 -; MIPS64R6O0-NEXT: and $9, $9, $4 -; MIPS64R6O0-NEXT: or $9, $9, $6 -; MIPS64R6O0-NEXT: sc $9, 0($1) -; MIPS64R6O0-NEXT: beqzc $9, .LBB13_1 +; MIPS64R6O0-NEXT: and $10, $10, $7 +; MIPS64R6O0-NEXT: or $10, $10, $6 +; MIPS64R6O0-NEXT: sc $10, 0($1) +; MIPS64R6O0-NEXT: beqzc $10, .LBB13_1 ; MIPS64R6O0-NEXT: .LBB13_3: # %entry -; MIPS64R6O0-NEXT: srlv $8, $10, $2 -; MIPS64R6O0-NEXT: seb $8, $8 +; MIPS64R6O0-NEXT: srlv $9, $11, $2 +; MIPS64R6O0-NEXT: seb $9, $9 ; MIPS64R6O0-NEXT: # %bb.4: # %entry ; MIPS64R6O0-NEXT: sw $5, 12($sp) # 4-byte Folded Spill -; MIPS64R6O0-NEXT: sw $8, 8($sp) # 4-byte Folded Spill +; MIPS64R6O0-NEXT: sw $9, 8($sp) # 4-byte Folded Spill ; MIPS64R6O0-NEXT: # %bb.5: # %entry ; MIPS64R6O0-NEXT: lw $1, 8($sp) # 4-byte Folded Reload ; MIPS64R6O0-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -5775,28 +5775,28 @@ define signext i16 @AtomicLoadAdd16(i16 signext %incr) nounwind { ; MIPS64R6O0-NEXT: ld $1, %got_disp(z)($1) ; MIPS64R6O0-NEXT: daddiu $2, $zero, -4 ; MIPS64R6O0-NEXT: and $2, $1, $2 -; MIPS64R6O0-NEXT: andi $1, $1, 3 -; MIPS64R6O0-NEXT: xori $1, $1, 2 -; MIPS64R6O0-NEXT: sll $1, $1, 3 -; MIPS64R6O0-NEXT: ori $3, $zero, 65535 -; MIPS64R6O0-NEXT: sllv $3, $3, $1 -; MIPS64R6O0-NEXT: nor $5, $zero, $3 -; MIPS64R6O0-NEXT: sllv $4, $4, $1 +; MIPS64R6O0-NEXT: andi $3, $1, 3 +; MIPS64R6O0-NEXT: xori $3, $3, 2 +; MIPS64R6O0-NEXT: sll $3, $3, 3 +; MIPS64R6O0-NEXT: ori $5, $zero, 65535 +; MIPS64R6O0-NEXT: sllv $5, $5, $3 +; MIPS64R6O0-NEXT: nor $6, $zero, $5 +; MIPS64R6O0-NEXT: sllv $4, $4, $3 ; MIPS64R6O0-NEXT: .LBB14_1: # %entry ; MIPS64R6O0-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6O0-NEXT: ll $7, 0($2) -; MIPS64R6O0-NEXT: addu $8, $7, $4 -; MIPS64R6O0-NEXT: and $8, $8, $3 -; MIPS64R6O0-NEXT: and $9, $7, $5 -; MIPS64R6O0-NEXT: or $9, $9, $8 -; MIPS64R6O0-NEXT: sc $9, 0($2) -; MIPS64R6O0-NEXT: beqzc $9, .LBB14_1 +; MIPS64R6O0-NEXT: ll $8, 0($2) +; MIPS64R6O0-NEXT: addu $9, $8, $4 +; MIPS64R6O0-NEXT: and $9, $9, $5 +; MIPS64R6O0-NEXT: and $10, $8, $6 +; MIPS64R6O0-NEXT: or $10, $10, $9 +; MIPS64R6O0-NEXT: sc $10, 0($2) +; MIPS64R6O0-NEXT: beqzc $10, .LBB14_1 ; MIPS64R6O0-NEXT: # %bb.2: # %entry -; MIPS64R6O0-NEXT: and $6, $7, $3 -; MIPS64R6O0-NEXT: srlv $6, $6, $1 -; MIPS64R6O0-NEXT: seh $6, $6 +; MIPS64R6O0-NEXT: and $7, $8, $5 +; MIPS64R6O0-NEXT: srlv $7, $7, $3 +; MIPS64R6O0-NEXT: seh $7, $7 ; MIPS64R6O0-NEXT: # %bb.3: # %entry -; MIPS64R6O0-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6O0-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6O0-NEXT: # %bb.4: # %entry ; MIPS64R6O0-NEXT: lw $1, 12($sp) # 4-byte Folded Reload ; MIPS64R6O0-NEXT: seh $2, $1 @@ -6359,33 +6359,33 @@ define {i16, i1} @foo(i16* %addr, i16 %l, i16 %r, i16 %new) { ; MIPS64R6O0-NEXT: sll $3, $5, 0 ; MIPS64R6O0-NEXT: addu $2, $3, $2 ; MIPS64R6O0-NEXT: sync -; MIPS64R6O0-NEXT: daddiu $3, $zero, -4 -; MIPS64R6O0-NEXT: and $3, $4, $3 -; MIPS64R6O0-NEXT: andi $4, $4, 3 -; MIPS64R6O0-NEXT: xori $4, $4, 2 -; MIPS64R6O0-NEXT: sll $4, $4, 3 +; MIPS64R6O0-NEXT: daddiu $8, $zero, -4 +; MIPS64R6O0-NEXT: and $8, $4, 
$8 +; MIPS64R6O0-NEXT: andi $3, $4, 3 +; MIPS64R6O0-NEXT: xori $3, $3, 2 +; MIPS64R6O0-NEXT: sll $3, $3, 3 ; MIPS64R6O0-NEXT: ori $5, $zero, 65535 -; MIPS64R6O0-NEXT: sllv $5, $5, $4 +; MIPS64R6O0-NEXT: sllv $5, $5, $3 ; MIPS64R6O0-NEXT: nor $6, $zero, $5 ; MIPS64R6O0-NEXT: andi $7, $2, 65535 -; MIPS64R6O0-NEXT: sllv $7, $7, $4 +; MIPS64R6O0-NEXT: sllv $7, $7, $3 ; MIPS64R6O0-NEXT: andi $1, $1, 65535 -; MIPS64R6O0-NEXT: sllv $1, $1, $4 +; MIPS64R6O0-NEXT: sllv $1, $1, $3 ; MIPS64R6O0-NEXT: .LBB15_1: # =>This Inner Loop Header: Depth=1 -; MIPS64R6O0-NEXT: ll $9, 0($3) -; MIPS64R6O0-NEXT: and $10, $9, $5 -; MIPS64R6O0-NEXT: bnec $10, $7, .LBB15_3 +; MIPS64R6O0-NEXT: ll $10, 0($8) +; MIPS64R6O0-NEXT: and $11, $10, $5 +; MIPS64R6O0-NEXT: bnec $11, $7, .LBB15_3 ; MIPS64R6O0-NEXT: # %bb.2: # in Loop: Header=BB15_1 Depth=1 -; MIPS64R6O0-NEXT: and $9, $9, $6 -; MIPS64R6O0-NEXT: or $9, $9, $1 -; MIPS64R6O0-NEXT: sc $9, 0($3) -; MIPS64R6O0-NEXT: beqzc $9, .LBB15_1 +; MIPS64R6O0-NEXT: and $10, $10, $6 +; MIPS64R6O0-NEXT: or $10, $10, $1 +; MIPS64R6O0-NEXT: sc $10, 0($8) +; MIPS64R6O0-NEXT: beqzc $10, .LBB15_1 ; MIPS64R6O0-NEXT: .LBB15_3: -; MIPS64R6O0-NEXT: srlv $8, $10, $4 -; MIPS64R6O0-NEXT: seh $8, $8 +; MIPS64R6O0-NEXT: srlv $9, $11, $3 +; MIPS64R6O0-NEXT: seh $9, $9 ; MIPS64R6O0-NEXT: # %bb.4: ; MIPS64R6O0-NEXT: sw $2, 12($sp) # 4-byte Folded Spill -; MIPS64R6O0-NEXT: sw $8, 8($sp) # 4-byte Folded Spill +; MIPS64R6O0-NEXT: sw $9, 8($sp) # 4-byte Folded Spill ; MIPS64R6O0-NEXT: # %bb.5: ; MIPS64R6O0-NEXT: lw $1, 12($sp) # 4-byte Folded Reload ; MIPS64R6O0-NEXT: seh $2, $1 @@ -7145,8 +7145,8 @@ define i32 @zeroreg() nounwind { ; MIPS64R6O0-NEXT: sc $6, 0($1) ; MIPS64R6O0-NEXT: beqzc $6, .LBB17_1 ; MIPS64R6O0-NEXT: .LBB17_3: # %entry -; MIPS64R6O0-NEXT: xor $1, $5, $3 -; MIPS64R6O0-NEXT: sltiu $2, $1, 1 +; MIPS64R6O0-NEXT: xor $2, $5, $3 +; MIPS64R6O0-NEXT: sltiu $2, $2, 1 ; MIPS64R6O0-NEXT: sync ; MIPS64R6O0-NEXT: jrc $ra ; diff --git a/llvm/test/CodeGen/Mips/implicit-sret.ll b/llvm/test/CodeGen/Mips/implicit-sret.ll index b9f6568e40c92..e86cec37d5100 100644 --- a/llvm/test/CodeGen/Mips/implicit-sret.ll +++ b/llvm/test/CodeGen/Mips/implicit-sret.ll @@ -48,8 +48,8 @@ define internal { i32, i128, i64 } @implicit_sret_impl() unnamed_addr nounwind { ; CHECK-NEXT: sd $zero, 8($4) ; CHECK-NEXT: daddiu $3, $zero, 30 ; CHECK-NEXT: sd $3, 24($4) -; CHECK-NEXT: addiu $3, $zero, 10 -; CHECK-NEXT: sw $3, 0($4) +; CHECK-NEXT: addiu $5, $zero, 10 +; CHECK-NEXT: sw $5, 0($4) ; CHECK-NEXT: jr $ra ; CHECK-NEXT: nop ret { i32, i128, i64 } { i32 10, i128 20, i64 30 } @@ -70,12 +70,10 @@ define internal void @test2() unnamed_addr nounwind { ; CHECK-NEXT: lw $3, 4($sp) ; CHECK-NEXT: # implicit-def: $a0_64 ; CHECK-NEXT: move $4, $3 -; CHECK-NEXT: # implicit-def: $v1_64 -; CHECK-NEXT: move $3, $2 -; CHECK-NEXT: # implicit-def: $v0_64 -; CHECK-NEXT: move $2, $1 -; CHECK-NEXT: move $5, $3 -; CHECK-NEXT: move $6, $2 +; CHECK-NEXT: # implicit-def: $a1_64 +; CHECK-NEXT: move $5, $2 +; CHECK-NEXT: # implicit-def: $a2_64 +; CHECK-NEXT: move $6, $1 ; CHECK-NEXT: jal use_sret2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/PowerPC/GlobalISel/irtranslator-ret.ll b/llvm/test/CodeGen/PowerPC/GlobalISel/irtranslator-ret.ll new file mode 100644 index 0000000000000..86f27a126d5a3 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/GlobalISel/irtranslator-ret.ll @@ -0,0 +1,7 @@ +; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -global-isel -verify-machineinstrs -stop-after=irtranslator 
< %s | FileCheck %s + +; CHECK: name: f +; CHECK: BLR8 +define void @f() { + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/GlobalISel/legalize-ret.mir b/llvm/test/CodeGen/PowerPC/GlobalISel/legalize-ret.mir new file mode 100644 index 0000000000000..7226511688105 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/GlobalISel/legalize-ret.mir @@ -0,0 +1,17 @@ +# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -global-isel -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: test_simple +body: | + ; CHECK-LABEL: name: test_simple + ; CHECK: [[IN:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK: $x3 = COPY [[IN]] + ; CHECK: BLR8 implicit $lr8, implicit $rm, implicit $x3 + bb.1.entry: + liveins: $x3 + + %0:_(s64) = COPY $x3 + $x3 = COPY %0(s64) + BLR8 implicit $lr8, implicit $rm, implicit $x3 + +... diff --git a/llvm/test/CodeGen/PowerPC/addegluecrash.ll b/llvm/test/CodeGen/PowerPC/addegluecrash.ll index c38f377869f86..a1d9805458368 100644 --- a/llvm/test/CodeGen/PowerPC/addegluecrash.ll +++ b/llvm/test/CodeGen/PowerPC/addegluecrash.ll @@ -21,11 +21,11 @@ define void @bn_mul_comba8(i64* nocapture %r, i64* nocapture readonly %a, i64* n ; CHECK-NEXT: addze 5, 5 ; CHECK-NEXT: add 4, 5, 4 ; CHECK-NEXT: cmpld 7, 4, 5 -; CHECK-NEXT: mfocrf 4, 1 -; CHECK-NEXT: rlwinm 4, 4, 29, 31, 31 -; CHECK-NEXT: # implicit-def: $x5 -; CHECK-NEXT: mr 5, 4 -; CHECK-NEXT: clrldi 4, 5, 32 +; CHECK-NEXT: mfocrf 10, 1 +; CHECK-NEXT: rlwinm 10, 10, 29, 31, 31 +; CHECK-NEXT: # implicit-def: $x4 +; CHECK-NEXT: mr 4, 10 +; CHECK-NEXT: clrldi 4, 4, 32 ; CHECK-NEXT: std 4, 0(3) ; CHECK-NEXT: blr %1 = load i64, i64* %a, align 8 diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll index 3105f5ba5829a..0682d022c5e3f 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll @@ -1842,23 +1842,23 @@ entry: ; 32BIT-DAG: renamable $r[[SCRATCHREG:[0-9]+]] = LWZtoc %const.10, $r2 :: (load 4 from got) ; 32BIT-DAG: renamable $r[[SCRATCHREG:[0-9]+]] = LWZtoc %const.11, $r2 :: (load 4 from got) ; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 56, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 60, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 60, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 64, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 68, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 68, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 72, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 76, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 76, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 80, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW renamable $r[[SCRATCHREG:[0-9]+]], 84, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW renamable $r[[SCRATCHREG:[0-9]+]], 84, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 88, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 92, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 92, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 96, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 100, 
$r1 :: (store 4 + 4) +; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 100, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 104, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 108, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 108, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 112, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 116, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 116, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 120, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 124, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 124, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[SCRATCHREG:[0-9]+]], 128, $r1 :: (store 4) ; 32BIT-DAG: renamable $r[[REGF1:[0-9]+]] = LWZtoc @f14, $r2 :: (load 4 from got) ; 32BIT-DAG: renamable $r3 = LWZ 0, killed renamable $r[[REGF1]] :: (load 4 from @f14) @@ -2243,33 +2243,33 @@ define void @caller_mix() { ; 32BIT-DAG: $r9 = LI 7 ; 32BIT-DAG: $r10 = LI 8 ; 32BIT-DAG: STW killed renamable $r[[REG1:[0-9]+]], 56, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW renamable $r[[REG2:[0-9]+]], 60, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW renamable $r[[REG2:[0-9]+]], 60, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[REG3:[0-9]+]], 64, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW renamable $r[[REG4:[0-9]+]], 68, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW renamable $r[[REG4:[0-9]+]], 68, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[REG5:[0-9]+]], 72, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW renamable $r[[REG6:[0-9]+]], 76, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW renamable $r[[REG6:[0-9]+]], 76, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[REG7:[0-9]+]], 80, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW renamable $r[[REG8:[0-9]+]], 84, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW renamable $r[[REG8:[0-9]+]], 84, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[REG9:[0-9]+]], 88, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW renamable $r[[REG10:[0-9]+]], 92, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW renamable $r[[REG10:[0-9]+]], 92, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[REG11:[0-9]+]], 96, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW renamable $r[[REG12:[0-9]+]], 100, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW renamable $r[[REG12:[0-9]+]], 100, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[REG13:[0-9]+]], 104, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW renamable $r[[REG14:[0-9]+]], 108, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW renamable $r[[REG14:[0-9]+]], 108, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[REG15:[0-9]+]], 112, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW renamable $r[[REG16:[0-9]+]], 116, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW renamable $r[[REG16:[0-9]+]], 116, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[REG17:[0-9]+]], 120, $r1 :: (store 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[REG18:[0-9]+]], 128, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW renamable $r[[REG19:[0-9]+]], 124, $r1 :: (store 4 + 4) -; 32BIT-DAG: STW killed renamable $r[[REG20:[0-9]+]], 132, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW renamable 
$r[[REG19:[0-9]+]], 124, $r1 :: (store 4 + 4, align 8) +; 32BIT-DAG: STW killed renamable $r[[REG20:[0-9]+]], 132, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[REG21:[0-9]+]], 136, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW killed renamable $r[[REG22:[0-9]+]], 140, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW killed renamable $r[[REG22:[0-9]+]], 140, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[REG23:[0-9]+]], 144, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW killed renamable $r[[REG24:[0-9]+]], 148, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW killed renamable $r[[REG24:[0-9]+]], 148, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[REG25:[0-9]+]], 152, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW killed renamable $r[[REG26:[0-9]+]], 156, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW killed renamable $r[[REG26:[0-9]+]], 156, $r1 :: (store 4 + 4, align 8) ; 32BIT-DAG: STW killed renamable $r[[REG27:[0-9]+]], 160, $r1 :: (store 4, align 8) -; 32BIT-DAG: STW killed renamable $r[[REG28:[0-9]+]], 164, $r1 :: (store 4 + 4) +; 32BIT-DAG: STW killed renamable $r[[REG28:[0-9]+]], 164, $r1 :: (store 4 + 4, align 8) ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $f1, implicit $f2, implicit $f3, implicit $f4, implicit $f5, implicit $f6, implicit $f7, implicit $f8, implicit $f9, implicit $f10, implicit $f11, implicit $f12, implicit $f13, implicit $r2, implicit-def $r1, implicit-def dead $r3 ; 32BIT-NEXT: ADJCALLSTACKUP 168, 0, implicit-def dead $r1, implicit $r1 diff --git a/llvm/test/CodeGen/PowerPC/aix-overflow-toc.py b/llvm/test/CodeGen/PowerPC/aix-overflow-toc.py new file mode 100644 index 0000000000000..e04491bff2fb9 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/aix-overflow-toc.py @@ -0,0 +1,68 @@ +# UNSUPPORTED: expensive_checks, debug + +# RUN: %python %s > %t.ll +# RUN: llc -mtriple powerpc-ibm-aix-xcoff -code-model=small -mcpu=pwr4 -mattr=-altivec -O0 < %t.ll | \ +# RUN: FileCheck --check-prefix=ASM32 %s + +# RUN: llc -mtriple powerpc64-ibm-aix-xcoff -code-model=small -mcpu=pwr4 -mattr=-altivec -O0 < %t.ll | \ +# RUN: FileCheck --check-prefix=ASM64 %s + +# RUN: llc -mtriple powerpc-ibm-aix-xcoff -code-model=small -mcpu=pwr4 -mattr=-altivec -O0 \ +# RUN: -filetype=obj -o %t.o < %t.ll +# RUN: llvm-objdump -D -r --symbol-description %t.o | FileCheck --check-prefix=DIS32 %s + +# RUN: not --crash llc -mtriple powerpc64-ibm-aix-xcoff \ +# RUN: -mcpu=pwr4 -mattr=-altivec -filetype=obj -o %t.o 2>&1 < %t.ll | \ +# RUN: FileCheck --check-prefix=XCOFF64 %s +# XCOFF64: LLVM ERROR: 64-bit XCOFF object files are not supported yet. 
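+
+# An illustrative aside, not required by the checks below: every global gets
+# its own TOC entry, 4 bytes wide in 32-bit mode and 8 bytes wide in 64-bit
+# mode, and TOC loads use a signed 16-bit displacement (max 32767). So the
+# first entry whose byte offset no longer fits is 32768/4 = 8192 in 32-bit
+# mode and 32768/8 = 4096 in 64-bit mode, which is why the checks switch to
+# the biased L..CN-65536 (and later L..CN-131072) spellings at those indices.
+assert 8191 * 4 == 32764 and 8192 * 4 == 32768  # 32-bit: last in-range, first wrapped
+assert 4095 * 8 == 32760 and 4096 * 8 == 32768  # 64-bit: last in-range, first wrapped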
+ +numentries = 12290 +for x in range(0, numentries): + print("@a%d = global i32 0, align 4" % (x)) + +print("define void @foo() {") +print("entry:") +for x in range(0, numentries): + print("store i32 1, i32* @a%d, align 4" % (x)) +print("ret void") +print("}") + +# 32-bit assembly check +# ASM32: lwz 3, L..C0(2) +# ASM32: lwz 3, L..C1(2) + +# ASM32: lwz 3, L..C8191(2) +# ASM32: lwz 3, L..C8192-65536(2) +# ASM32: lwz 3, L..C8193-65536(2) + +# ASM32: lwz 3, L..C12288-65536(2) +# ASM32: lwz 3, L..C12289-65536(2) + +# 64-bit assembly check +# ASM64: ld 3, L..C0(2) +# ASM64: ld 3, L..C1(2) + +# ASM64: ld 3, L..C4095(2) +# ASM64: ld 3, L..C4096-65536(2) +# ASM64: ld 3, L..C4097-65536(2) + +# ASM64: ld 3, L..C12287-65536(2) +# ASM64: ld 3, L..C12288-131072(2) +# ASM64: ld 3, L..C12289-131072(2) + +# DIS32: 0: 80 62 00 00 lwz 3, 0(2) +# DIS32: 00000002: R_TOC (idx: 24590) a0[TC] +# DIS32: c: 80 62 00 04 lwz 3, 4(2) +# DIS32: 0000000e: R_TOC (idx: 24592) a1[TC] + +# DIS32: fffc: 80 62 7f fc lwz 3, 32764(2) +# DIS32: 0000fffe: R_TOC (idx: 40972) a8191[TC] +# DIS32: 10004: 80 62 80 00 lwz 3, -32768(2) +# DIS32: 00010006: R_TOC (idx: 40974) a8192[TC] +# DIS32: 1000c: 80 62 80 04 lwz 3, -32764(2) +# DIS32: 0001000e: R_TOC (idx: 40976) a8193[TC] + +# DIS32: 18004: 80 62 c0 00 lwz 3, -16384(2) +# DIS32: 00018006: R_TOC (idx: 49166) a12288[TC] +# DIS32: 1800c: 80 62 c0 04 lwz 3, -16380(2) +# DIS32: 0001800e: R_TOC (idx: 49168) a12289[TC] diff --git a/llvm/test/CodeGen/PowerPC/atomics-indexed.ll b/llvm/test/CodeGen/PowerPC/atomics-indexed.ll index b4790adfd9088..cf7225a5fc200 100644 --- a/llvm/test/CodeGen/PowerPC/atomics-indexed.ll +++ b/llvm/test/CodeGen/PowerPC/atomics-indexed.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -verify-machineinstrs -ppc-asm-full-reg-names | FileCheck %s --check-prefix=CHECK --check-prefix=PPC32 ; FIXME: -verify-machineinstrs currently fail on ppc64 (mismatched register/instruction). 
; This is already checked for in Atomics-64.ll @@ -8,9 +9,25 @@ ; Indexed version of loads define i8 @load_x_i8_seq_cst([100000 x i8]* %mem) { -; CHECK-LABEL: load_x_i8_seq_cst -; CHECK: sync -; CHECK: lbzx [[VAL:r[0-9]+]] +; PPC32-LABEL: load_x_i8_seq_cst: +; PPC32: # %bb.0: +; PPC32-NEXT: lis r4, 1 +; PPC32-NEXT: sync +; PPC32-NEXT: ori r4, r4, 24464 +; PPC32-NEXT: lbzx r3, r3, r4 +; PPC32-NEXT: lwsync +; PPC32-NEXT: blr +; +; PPC64-LABEL: load_x_i8_seq_cst: +; PPC64: # %bb.0: +; PPC64-NEXT: lis r4, 1 +; PPC64-NEXT: sync +; PPC64-NEXT: ori r4, r4, 24464 +; PPC64-NEXT: lbzx r3, r3, r4 +; PPC64-NEXT: cmpd cr7, r3, r3 +; PPC64-NEXT: bne- cr7, .+4 +; PPC64-NEXT: isync +; PPC64-NEXT: blr ; CHECK-PPC32: lwsync ; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]] ; CHECK-PPC64: bne- [[CR]], .+4 @@ -20,8 +37,23 @@ define i8 @load_x_i8_seq_cst([100000 x i8]* %mem) { ret i8 %val } define i16 @load_x_i16_acquire([100000 x i16]* %mem) { -; CHECK-LABEL: load_x_i16_acquire -; CHECK: lhzx [[VAL:r[0-9]+]] +; PPC32-LABEL: load_x_i16_acquire: +; PPC32: # %bb.0: +; PPC32-NEXT: lis r4, 2 +; PPC32-NEXT: ori r4, r4, 48928 +; PPC32-NEXT: lhzx r3, r3, r4 +; PPC32-NEXT: lwsync +; PPC32-NEXT: blr +; +; PPC64-LABEL: load_x_i16_acquire: +; PPC64: # %bb.0: +; PPC64-NEXT: lis r4, 2 +; PPC64-NEXT: ori r4, r4, 48928 +; PPC64-NEXT: lhzx r3, r3, r4 +; PPC64-NEXT: cmpd cr7, r3, r3 +; PPC64-NEXT: bne- cr7, .+4 +; PPC64-NEXT: isync +; PPC64-NEXT: blr ; CHECK-PPC32: lwsync ; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]] ; CHECK-PPC64: bne- [[CR]], .+4 @@ -31,19 +63,39 @@ define i16 @load_x_i16_acquire([100000 x i16]* %mem) { ret i16 %val } define i32 @load_x_i32_monotonic([100000 x i32]* %mem) { -; CHECK-LABEL: load_x_i32_monotonic -; CHECK: lwzx -; CHECK-NOT: sync +; CHECK-LABEL: load_x_i32_monotonic: +; CHECK: # %bb.0: +; CHECK-NEXT: lis r4, 5 +; CHECK-NEXT: ori r4, r4, 32320 +; CHECK-NEXT: lwzx r3, r3, r4 +; CHECK-NEXT: blr %ptr = getelementptr inbounds [100000 x i32], [100000 x i32]* %mem, i64 0, i64 90000 %val = load atomic i32, i32* %ptr monotonic, align 4 ret i32 %val } define i64 @load_x_i64_unordered([100000 x i64]* %mem) { -; CHECK-LABEL: load_x_i64_unordered -; PPC32: __sync_ -; PPC64-NOT: __sync_ -; PPC64: ldx -; CHECK-NOT: sync +; PPC32-LABEL: load_x_i64_unordered: +; PPC32: # %bb.0: +; PPC32-NEXT: mflr r0 +; PPC32-NEXT: stw r0, 4(r1) +; PPC32-NEXT: stwu r1, -16(r1) +; PPC32-NEXT: .cfi_def_cfa_offset 16 +; PPC32-NEXT: .cfi_offset lr, 4 +; PPC32-NEXT: addi r3, r3, -896 +; PPC32-NEXT: addis r3, r3, 11 +; PPC32-NEXT: li r4, 0 +; PPC32-NEXT: bl __atomic_load_8 +; PPC32-NEXT: lwz r0, 20(r1) +; PPC32-NEXT: addi r1, r1, 16 +; PPC32-NEXT: mtlr r0 +; PPC32-NEXT: blr +; +; PPC64-LABEL: load_x_i64_unordered: +; PPC64: # %bb.0: +; PPC64-NEXT: lis r4, 10 +; PPC64-NEXT: ori r4, r4, 64640 +; PPC64-NEXT: ldx r3, r3, r4 +; PPC64-NEXT: blr %ptr = getelementptr inbounds [100000 x i64], [100000 x i64]* %mem, i64 0, i64 90000 %val = load atomic i64, i64* %ptr unordered, align 8 ret i64 %val @@ -51,35 +103,69 @@ define i64 @load_x_i64_unordered([100000 x i64]* %mem) { ; Indexed version of stores define void @store_x_i8_seq_cst([100000 x i8]* %mem) { -; CHECK-LABEL: store_x_i8_seq_cst -; CHECK: sync -; CHECK: stbx +; CHECK-LABEL: store_x_i8_seq_cst: +; CHECK: # %bb.0: +; CHECK-NEXT: lis r4, 1 +; CHECK-NEXT: ori r4, r4, 24464 +; CHECK-NEXT: li r5, 42 +; CHECK-NEXT: sync +; CHECK-NEXT: stbx r5, r3, r4 +; CHECK-NEXT: blr %ptr = getelementptr inbounds [100000 x i8], [100000 x i8]* %mem, i64 0, i64 90000 store atomic i8 42, i8* %ptr 
seq_cst, align 1 ret void } define void @store_x_i16_release([100000 x i16]* %mem) { -; CHECK-LABEL: store_x_i16_release -; CHECK: lwsync -; CHECK: sthx +; CHECK-LABEL: store_x_i16_release: +; CHECK: # %bb.0: +; CHECK-NEXT: lis r4, 2 +; CHECK-NEXT: ori r4, r4, 48928 +; CHECK-NEXT: li r5, 42 +; CHECK-NEXT: lwsync +; CHECK-NEXT: sthx r5, r3, r4 +; CHECK-NEXT: blr %ptr = getelementptr inbounds [100000 x i16], [100000 x i16]* %mem, i64 0, i64 90000 store atomic i16 42, i16* %ptr release, align 2 ret void } define void @store_x_i32_monotonic([100000 x i32]* %mem) { -; CHECK-LABEL: store_x_i32_monotonic -; CHECK-NOT: sync -; CHECK: stwx +; CHECK-LABEL: store_x_i32_monotonic: +; CHECK: # %bb.0: +; CHECK-NEXT: lis r4, 5 +; CHECK-NEXT: ori r4, r4, 32320 +; CHECK-NEXT: li r5, 42 +; CHECK-NEXT: stwx r5, r3, r4 +; CHECK-NEXT: blr %ptr = getelementptr inbounds [100000 x i32], [100000 x i32]* %mem, i64 0, i64 90000 store atomic i32 42, i32* %ptr monotonic, align 4 ret void } define void @store_x_i64_unordered([100000 x i64]* %mem) { -; CHECK-LABEL: store_x_i64_unordered -; CHECK-NOT: sync -; PPC32: __sync_ -; PPC64-NOT: __sync_ -; PPC64: stdx +; PPC32-LABEL: store_x_i64_unordered: +; PPC32: # %bb.0: +; PPC32-NEXT: mflr r0 +; PPC32-NEXT: stw r0, 4(r1) +; PPC32-NEXT: stwu r1, -16(r1) +; PPC32-NEXT: .cfi_def_cfa_offset 16 +; PPC32-NEXT: .cfi_offset lr, 4 +; PPC32-NEXT: addi r3, r3, -896 +; PPC32-NEXT: addis r3, r3, 11 +; PPC32-NEXT: li r5, 0 +; PPC32-NEXT: li r6, 42 +; PPC32-NEXT: li r7, 0 +; PPC32-NEXT: bl __atomic_store_8 +; PPC32-NEXT: lwz r0, 20(r1) +; PPC32-NEXT: addi r1, r1, 16 +; PPC32-NEXT: mtlr r0 +; PPC32-NEXT: blr +; +; PPC64-LABEL: store_x_i64_unordered: +; PPC64: # %bb.0: +; PPC64-NEXT: lis r4, 10 +; PPC64-NEXT: ori r4, r4, 64640 +; PPC64-NEXT: li r5, 42 +; PPC64-NEXT: stdx r5, r3, r4 +; PPC64-NEXT: blr %ptr = getelementptr inbounds [100000 x i64], [100000 x i64]* %mem, i64 0, i64 90000 store atomic i64 42, i64* %ptr unordered, align 8 ret void diff --git a/llvm/test/CodeGen/PowerPC/atomics.ll b/llvm/test/CodeGen/PowerPC/atomics.ll index c964218cb60bf..008cd4c7157c1 100644 --- a/llvm/test/CodeGen/PowerPC/atomics.ll +++ b/llvm/test/CodeGen/PowerPC/atomics.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update ; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc-unknown-linux-gnu -verify-machineinstrs -ppc-asm-full-reg-names | FileCheck %s --check-prefix=CHECK --check-prefix=PPC32 ; This is already checked for in Atomics-64.ll ; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -ppc-asm-full-reg-names | FileCheck %s --check-prefix=CHECK --check-prefix=PPC64 @@ -9,22 +10,35 @@ ; We first check loads, for all sizes from i8 to i64. ; We also vary orderings to check for barriers. 
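+; In brief, the mapping these autogenerated checks encode (the CHECK lines
+; themselves are authoritative):
+; - unordered/monotonic: a plain load or store, no barriers.
+; - acquire: PPC32 places an lwsync after the load; PPC64 instead emits the
+;   cmpd/bne-/isync sequence, which orders later accesses after the load.
+; - release: an lwsync before the store.
+; - seq_cst: a full sync before the access, plus the acquire sequence after
+;   loads.
+; - 64-bit atomic operations on PPC32 become __atomic_* libcalls.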
define i8 @load_i8_unordered(i8* %mem) { -; CHECK-LABEL: load_i8_unordered -; CHECK: lbz -; CHECK-NOT: sync +; CHECK-LABEL: load_i8_unordered: +; CHECK: # %bb.0: +; CHECK-NEXT: lbz r3, 0(r3) +; CHECK-NEXT: blr %val = load atomic i8, i8* %mem unordered, align 1 ret i8 %val } define i16 @load_i16_monotonic(i16* %mem) { -; CHECK-LABEL: load_i16_monotonic -; CHECK: lhz -; CHECK-NOT: sync +; CHECK-LABEL: load_i16_monotonic: +; CHECK: # %bb.0: +; CHECK-NEXT: lhz r3, 0(r3) +; CHECK-NEXT: blr %val = load atomic i16, i16* %mem monotonic, align 2 ret i16 %val } define i32 @load_i32_acquire(i32* %mem) { -; CHECK-LABEL: load_i32_acquire -; CHECK: lwz [[VAL:r[0-9]+]] +; PPC32-LABEL: load_i32_acquire: +; PPC32: # %bb.0: +; PPC32-NEXT: lwz r3, 0(r3) +; PPC32-NEXT: lwsync +; PPC32-NEXT: blr +; +; PPC64-LABEL: load_i32_acquire: +; PPC64: # %bb.0: +; PPC64-NEXT: lwz r3, 0(r3) +; PPC64-NEXT: cmpd cr7, r3, r3 +; PPC64-NEXT: bne- cr7, .+4 +; PPC64-NEXT: isync +; PPC64-NEXT: blr %val = load atomic i32, i32* %mem acquire, align 4 ; CHECK-PPC32: lwsync ; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]] @@ -33,11 +47,28 @@ define i32 @load_i32_acquire(i32* %mem) { ret i32 %val } define i64 @load_i64_seq_cst(i64* %mem) { -; CHECK-LABEL: load_i64_seq_cst -; CHECK: sync -; PPC32: __sync_ -; PPC64-NOT: __sync_ -; PPC64: ld [[VAL:r[0-9]+]] +; PPC32-LABEL: load_i64_seq_cst: +; PPC32: # %bb.0: +; PPC32-NEXT: mflr r0 +; PPC32-NEXT: stw r0, 4(r1) +; PPC32-NEXT: stwu r1, -16(r1) +; PPC32-NEXT: .cfi_def_cfa_offset 16 +; PPC32-NEXT: .cfi_offset lr, 4 +; PPC32-NEXT: li r4, 5 +; PPC32-NEXT: bl __atomic_load_8 +; PPC32-NEXT: lwz r0, 20(r1) +; PPC32-NEXT: addi r1, r1, 16 +; PPC32-NEXT: mtlr r0 +; PPC32-NEXT: blr +; +; PPC64-LABEL: load_i64_seq_cst: +; PPC64: # %bb.0: +; PPC64-NEXT: sync +; PPC64-NEXT: ld r3, 0(r3) +; PPC64-NEXT: cmpd cr7, r3, r3 +; PPC64-NEXT: bne- cr7, .+4 +; PPC64-NEXT: isync +; PPC64-NEXT: blr %val = load atomic i64, i64* %mem seq_cst, align 8 ; CHECK-PPC32: lwsync ; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]] @@ -48,95 +79,401 @@ define i64 @load_i64_seq_cst(i64* %mem) { ; Stores define void @store_i8_unordered(i8* %mem) { -; CHECK-LABEL: store_i8_unordered -; CHECK-NOT: sync -; CHECK: stb +; CHECK-LABEL: store_i8_unordered: +; CHECK: # %bb.0: +; CHECK-NEXT: li r4, 42 +; CHECK-NEXT: stb r4, 0(r3) +; CHECK-NEXT: blr store atomic i8 42, i8* %mem unordered, align 1 ret void } define void @store_i16_monotonic(i16* %mem) { -; CHECK-LABEL: store_i16_monotonic -; CHECK-NOT: sync -; CHECK: sth +; CHECK-LABEL: store_i16_monotonic: +; CHECK: # %bb.0: +; CHECK-NEXT: li r4, 42 +; CHECK-NEXT: sth r4, 0(r3) +; CHECK-NEXT: blr store atomic i16 42, i16* %mem monotonic, align 2 ret void } define void @store_i32_release(i32* %mem) { -; CHECK-LABEL: store_i32_release -; CHECK: lwsync -; CHECK: stw +; CHECK-LABEL: store_i32_release: +; CHECK: # %bb.0: +; CHECK-NEXT: li r4, 42 +; CHECK-NEXT: lwsync +; CHECK-NEXT: stw r4, 0(r3) +; CHECK-NEXT: blr store atomic i32 42, i32* %mem release, align 4 ret void } define void @store_i64_seq_cst(i64* %mem) { -; CHECK-LABEL: store_i64_seq_cst -; CHECK: sync -; PPC32: __sync_ -; PPC64-NOT: __sync_ -; PPC64: std +; PPC32-LABEL: store_i64_seq_cst: +; PPC32: # %bb.0: +; PPC32-NEXT: mflr r0 +; PPC32-NEXT: stw r0, 4(r1) +; PPC32-NEXT: stwu r1, -16(r1) +; PPC32-NEXT: .cfi_def_cfa_offset 16 +; PPC32-NEXT: .cfi_offset lr, 4 +; PPC32-NEXT: li r5, 0 +; PPC32-NEXT: li r6, 42 +; PPC32-NEXT: li r7, 5 +; PPC32-NEXT: bl __atomic_store_8 +; PPC32-NEXT: lwz r0, 20(r1) +; PPC32-NEXT: addi r1, r1, 16 
+; PPC32-NEXT: mtlr r0 +; PPC32-NEXT: blr +; +; PPC64-LABEL: store_i64_seq_cst: +; PPC64: # %bb.0: +; PPC64-NEXT: li r4, 42 +; PPC64-NEXT: sync +; PPC64-NEXT: std r4, 0(r3) +; PPC64-NEXT: blr store atomic i64 42, i64* %mem seq_cst, align 8 ret void } ; Atomic CmpXchg define i8 @cas_strong_i8_sc_sc(i8* %mem) { -; CHECK-LABEL: cas_strong_i8_sc_sc -; CHECK: sync +; PPC32-LABEL: cas_strong_i8_sc_sc: +; PPC32: # %bb.0: +; PPC32-NEXT: rlwinm r8, r3, 3, 27, 28 +; PPC32-NEXT: li r5, 1 +; PPC32-NEXT: li r6, 0 +; PPC32-NEXT: li r7, 255 +; PPC32-NEXT: rlwinm r4, r3, 0, 0, 29 +; PPC32-NEXT: xori r3, r8, 24 +; PPC32-NEXT: slw r5, r5, r3 +; PPC32-NEXT: slw r8, r6, r3 +; PPC32-NEXT: slw r6, r7, r3 +; PPC32-NEXT: and r7, r5, r6 +; PPC32-NEXT: and r8, r8, r6 +; PPC32-NEXT: sync +; PPC32-NEXT: .LBB8_1: +; PPC32-NEXT: lwarx r9, 0, r4 +; PPC32-NEXT: and r5, r9, r6 +; PPC32-NEXT: cmpw r5, r8 +; PPC32-NEXT: bne cr0, .LBB8_3 +; PPC32-NEXT: # %bb.2: +; PPC32-NEXT: andc r9, r9, r6 +; PPC32-NEXT: or r9, r9, r7 +; PPC32-NEXT: stwcx. r9, 0, r4 +; PPC32-NEXT: bne cr0, .LBB8_1 +; PPC32-NEXT: b .LBB8_4 +; PPC32-NEXT: .LBB8_3: +; PPC32-NEXT: stwcx. r9, 0, r4 +; PPC32-NEXT: .LBB8_4: +; PPC32-NEXT: srw r3, r5, r3 +; PPC32-NEXT: lwsync +; PPC32-NEXT: blr +; +; PPC64-LABEL: cas_strong_i8_sc_sc: +; PPC64: # %bb.0: +; PPC64-NEXT: rlwinm r8, r3, 3, 27, 28 +; PPC64-NEXT: li r5, 1 +; PPC64-NEXT: li r6, 0 +; PPC64-NEXT: li r7, 255 +; PPC64-NEXT: rldicr r4, r3, 0, 61 +; PPC64-NEXT: xori r3, r8, 24 +; PPC64-NEXT: slw r5, r5, r3 +; PPC64-NEXT: slw r8, r6, r3 +; PPC64-NEXT: slw r6, r7, r3 +; PPC64-NEXT: and r7, r5, r6 +; PPC64-NEXT: and r8, r8, r6 +; PPC64-NEXT: sync +; PPC64-NEXT: .LBB8_1: +; PPC64-NEXT: lwarx r9, 0, r4 +; PPC64-NEXT: and r5, r9, r6 +; PPC64-NEXT: cmpw r5, r8 +; PPC64-NEXT: bne cr0, .LBB8_3 +; PPC64-NEXT: # %bb.2: +; PPC64-NEXT: andc r9, r9, r6 +; PPC64-NEXT: or r9, r9, r7 +; PPC64-NEXT: stwcx. r9, 0, r4 +; PPC64-NEXT: bne cr0, .LBB8_1 +; PPC64-NEXT: b .LBB8_4 +; PPC64-NEXT: .LBB8_3: +; PPC64-NEXT: stwcx. r9, 0, r4 +; PPC64-NEXT: .LBB8_4: +; PPC64-NEXT: srw r3, r5, r3 +; PPC64-NEXT: lwsync +; PPC64-NEXT: blr %val = cmpxchg i8* %mem, i8 0, i8 1 seq_cst seq_cst -; CHECK: lwsync %loaded = extractvalue { i8, i1} %val, 0 ret i8 %loaded } define i16 @cas_weak_i16_acquire_acquire(i16* %mem) { -; CHECK-LABEL: cas_weak_i16_acquire_acquire -;CHECK-NOT: sync +; PPC32-LABEL: cas_weak_i16_acquire_acquire: +; PPC32: # %bb.0: +; PPC32-NEXT: li r6, 0 +; PPC32-NEXT: rlwinm r4, r3, 3, 27, 27 +; PPC32-NEXT: li r5, 1 +; PPC32-NEXT: ori r7, r6, 65535 +; PPC32-NEXT: xori r4, r4, 16 +; PPC32-NEXT: slw r8, r5, r4 +; PPC32-NEXT: slw r9, r6, r4 +; PPC32-NEXT: slw r5, r7, r4 +; PPC32-NEXT: rlwinm r3, r3, 0, 0, 29 +; PPC32-NEXT: and r6, r8, r5 +; PPC32-NEXT: and r8, r9, r5 +; PPC32-NEXT: .LBB9_1: +; PPC32-NEXT: lwarx r9, 0, r3 +; PPC32-NEXT: and r7, r9, r5 +; PPC32-NEXT: cmpw r7, r8 +; PPC32-NEXT: bne cr0, .LBB9_3 +; PPC32-NEXT: # %bb.2: +; PPC32-NEXT: andc r9, r9, r5 +; PPC32-NEXT: or r9, r9, r6 +; PPC32-NEXT: stwcx. r9, 0, r3 +; PPC32-NEXT: bne cr0, .LBB9_1 +; PPC32-NEXT: b .LBB9_4 +; PPC32-NEXT: .LBB9_3: +; PPC32-NEXT: stwcx. 
r9, 0, r3 +; PPC32-NEXT: .LBB9_4: +; PPC32-NEXT: srw r3, r7, r4 +; PPC32-NEXT: lwsync +; PPC32-NEXT: blr +; +; PPC64-LABEL: cas_weak_i16_acquire_acquire: +; PPC64: # %bb.0: +; PPC64-NEXT: li r6, 0 +; PPC64-NEXT: rlwinm r4, r3, 3, 27, 27 +; PPC64-NEXT: li r5, 1 +; PPC64-NEXT: ori r7, r6, 65535 +; PPC64-NEXT: xori r4, r4, 16 +; PPC64-NEXT: slw r8, r5, r4 +; PPC64-NEXT: slw r9, r6, r4 +; PPC64-NEXT: slw r5, r7, r4 +; PPC64-NEXT: rldicr r3, r3, 0, 61 +; PPC64-NEXT: and r6, r8, r5 +; PPC64-NEXT: and r8, r9, r5 +; PPC64-NEXT: .LBB9_1: +; PPC64-NEXT: lwarx r9, 0, r3 +; PPC64-NEXT: and r7, r9, r5 +; PPC64-NEXT: cmpw r7, r8 +; PPC64-NEXT: bne cr0, .LBB9_3 +; PPC64-NEXT: # %bb.2: +; PPC64-NEXT: andc r9, r9, r5 +; PPC64-NEXT: or r9, r9, r6 +; PPC64-NEXT: stwcx. r9, 0, r3 +; PPC64-NEXT: bne cr0, .LBB9_1 +; PPC64-NEXT: b .LBB9_4 +; PPC64-NEXT: .LBB9_3: +; PPC64-NEXT: stwcx. r9, 0, r3 +; PPC64-NEXT: .LBB9_4: +; PPC64-NEXT: srw r3, r7, r4 +; PPC64-NEXT: lwsync +; PPC64-NEXT: blr %val = cmpxchg weak i16* %mem, i16 0, i16 1 acquire acquire -; CHECK: lwsync %loaded = extractvalue { i16, i1} %val, 0 ret i16 %loaded } define i32 @cas_strong_i32_acqrel_acquire(i32* %mem) { -; CHECK-LABEL: cas_strong_i32_acqrel_acquire -; CHECK: lwsync +; CHECK-LABEL: cas_strong_i32_acqrel_acquire: +; CHECK: # %bb.0: +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: li r6, 0 +; CHECK-NEXT: lwsync +; CHECK-NEXT: .LBB10_1: +; CHECK-NEXT: lwarx r4, 0, r3 +; CHECK-NEXT: cmpw r6, r4 +; CHECK-NEXT: bne cr0, .LBB10_3 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: stwcx. r5, 0, r3 +; CHECK-NEXT: bne cr0, .LBB10_1 +; CHECK-NEXT: b .LBB10_4 +; CHECK-NEXT: .LBB10_3: +; CHECK-NEXT: stwcx. r4, 0, r3 +; CHECK-NEXT: .LBB10_4: +; CHECK-NEXT: mr r3, r4 +; CHECK-NEXT: lwsync +; CHECK-NEXT: blr %val = cmpxchg i32* %mem, i32 0, i32 1 acq_rel acquire -; CHECK: lwsync %loaded = extractvalue { i32, i1} %val, 0 ret i32 %loaded } define i64 @cas_weak_i64_release_monotonic(i64* %mem) { -; CHECK-LABEL: cas_weak_i64_release_monotonic -; CHECK: lwsync +; PPC32-LABEL: cas_weak_i64_release_monotonic: +; PPC32: # %bb.0: +; PPC32-NEXT: mflr r0 +; PPC32-NEXT: stw r0, 4(r1) +; PPC32-NEXT: stwu r1, -16(r1) +; PPC32-NEXT: .cfi_def_cfa_offset 16 +; PPC32-NEXT: .cfi_offset lr, 4 +; PPC32-NEXT: li r4, 0 +; PPC32-NEXT: stw r4, 12(r1) +; PPC32-NEXT: li r5, 0 +; PPC32-NEXT: stw r4, 8(r1) +; PPC32-NEXT: addi r4, r1, 8 +; PPC32-NEXT: li r6, 1 +; PPC32-NEXT: li r7, 3 +; PPC32-NEXT: li r8, 0 +; PPC32-NEXT: bl __atomic_compare_exchange_8 +; PPC32-NEXT: lwz r4, 12(r1) +; PPC32-NEXT: lwz r3, 8(r1) +; PPC32-NEXT: lwz r0, 20(r1) +; PPC32-NEXT: addi r1, r1, 16 +; PPC32-NEXT: mtlr r0 +; PPC32-NEXT: blr +; +; PPC64-LABEL: cas_weak_i64_release_monotonic: +; PPC64: # %bb.0: +; PPC64-NEXT: li r5, 1 +; PPC64-NEXT: li r6, 0 +; PPC64-NEXT: lwsync +; PPC64-NEXT: .LBB11_1: +; PPC64-NEXT: ldarx r4, 0, r3 +; PPC64-NEXT: cmpd r6, r4 +; PPC64-NEXT: bne cr0, .LBB11_4 +; PPC64-NEXT: # %bb.2: +; PPC64-NEXT: stdcx. r5, 0, r3 +; PPC64-NEXT: bne cr0, .LBB11_1 +; PPC64-NEXT: # %bb.3: +; PPC64-NEXT: mr r3, r4 +; PPC64-NEXT: blr +; PPC64-NEXT: .LBB11_4: +; PPC64-NEXT: stdcx. 
r4, 0, r3 +; PPC64-NEXT: mr r3, r4 +; PPC64-NEXT: blr %val = cmpxchg weak i64* %mem, i64 0, i64 1 release monotonic -; CHECK-NOT: [sync ] %loaded = extractvalue { i64, i1} %val, 0 ret i64 %loaded } ; AtomicRMW define i8 @add_i8_monotonic(i8* %mem, i8 %operand) { -; CHECK-LABEL: add_i8_monotonic -; CHECK-NOT: sync +; PPC32-LABEL: add_i8_monotonic: +; PPC32: # %bb.0: +; PPC32-NEXT: rlwinm r7, r3, 3, 27, 28 +; PPC32-NEXT: li r6, 255 +; PPC32-NEXT: rlwinm r5, r3, 0, 0, 29 +; PPC32-NEXT: xori r3, r7, 24 +; PPC32-NEXT: slw r4, r4, r3 +; PPC32-NEXT: slw r6, r6, r3 +; PPC32-NEXT: .LBB12_1: +; PPC32-NEXT: lwarx r7, 0, r5 +; PPC32-NEXT: add r8, r4, r7 +; PPC32-NEXT: andc r9, r7, r6 +; PPC32-NEXT: and r8, r8, r6 +; PPC32-NEXT: or r8, r8, r9 +; PPC32-NEXT: stwcx. r8, 0, r5 +; PPC32-NEXT: bne cr0, .LBB12_1 +; PPC32-NEXT: # %bb.2: +; PPC32-NEXT: srw r3, r7, r3 +; PPC32-NEXT: blr +; +; PPC64-LABEL: add_i8_monotonic: +; PPC64: # %bb.0: +; PPC64-NEXT: rlwinm r7, r3, 3, 27, 28 +; PPC64-NEXT: li r6, 255 +; PPC64-NEXT: rldicr r5, r3, 0, 61 +; PPC64-NEXT: xori r3, r7, 24 +; PPC64-NEXT: slw r4, r4, r3 +; PPC64-NEXT: slw r6, r6, r3 +; PPC64-NEXT: .LBB12_1: +; PPC64-NEXT: lwarx r7, 0, r5 +; PPC64-NEXT: add r8, r4, r7 +; PPC64-NEXT: andc r9, r7, r6 +; PPC64-NEXT: and r8, r8, r6 +; PPC64-NEXT: or r8, r8, r9 +; PPC64-NEXT: stwcx. r8, 0, r5 +; PPC64-NEXT: bne cr0, .LBB12_1 +; PPC64-NEXT: # %bb.2: +; PPC64-NEXT: srw r3, r7, r3 +; PPC64-NEXT: blr %val = atomicrmw add i8* %mem, i8 %operand monotonic ret i8 %val } define i16 @xor_i16_seq_cst(i16* %mem, i16 %operand) { -; CHECK-LABEL: xor_i16_seq_cst -; CHECK: sync +; PPC32-LABEL: xor_i16_seq_cst: +; PPC32: # %bb.0: +; PPC32-NEXT: li r6, 0 +; PPC32-NEXT: rlwinm r7, r3, 3, 27, 27 +; PPC32-NEXT: rlwinm r5, r3, 0, 0, 29 +; PPC32-NEXT: ori r6, r6, 65535 +; PPC32-NEXT: xori r3, r7, 16 +; PPC32-NEXT: slw r4, r4, r3 +; PPC32-NEXT: slw r6, r6, r3 +; PPC32-NEXT: sync +; PPC32-NEXT: .LBB13_1: +; PPC32-NEXT: lwarx r7, 0, r5 +; PPC32-NEXT: xor r8, r4, r7 +; PPC32-NEXT: andc r9, r7, r6 +; PPC32-NEXT: and r8, r8, r6 +; PPC32-NEXT: or r8, r8, r9 +; PPC32-NEXT: stwcx. r8, 0, r5 +; PPC32-NEXT: bne cr0, .LBB13_1 +; PPC32-NEXT: # %bb.2: +; PPC32-NEXT: srw r3, r7, r3 +; PPC32-NEXT: lwsync +; PPC32-NEXT: blr +; +; PPC64-LABEL: xor_i16_seq_cst: +; PPC64: # %bb.0: +; PPC64-NEXT: li r6, 0 +; PPC64-NEXT: rlwinm r7, r3, 3, 27, 27 +; PPC64-NEXT: rldicr r5, r3, 0, 61 +; PPC64-NEXT: ori r6, r6, 65535 +; PPC64-NEXT: xori r3, r7, 16 +; PPC64-NEXT: slw r4, r4, r3 +; PPC64-NEXT: slw r6, r6, r3 +; PPC64-NEXT: sync +; PPC64-NEXT: .LBB13_1: +; PPC64-NEXT: lwarx r7, 0, r5 +; PPC64-NEXT: xor r8, r4, r7 +; PPC64-NEXT: andc r9, r7, r6 +; PPC64-NEXT: and r8, r8, r6 +; PPC64-NEXT: or r8, r8, r9 +; PPC64-NEXT: stwcx. r8, 0, r5 +; PPC64-NEXT: bne cr0, .LBB13_1 +; PPC64-NEXT: # %bb.2: +; PPC64-NEXT: srw r3, r7, r3 +; PPC64-NEXT: lwsync +; PPC64-NEXT: blr %val = atomicrmw xor i16* %mem, i16 %operand seq_cst -; CHECK: lwsync ret i16 %val } define i32 @xchg_i32_acq_rel(i32* %mem, i32 %operand) { -; CHECK-LABEL: xchg_i32_acq_rel -; CHECK: lwsync +; CHECK-LABEL: xchg_i32_acq_rel: +; CHECK: # %bb.0: +; CHECK-NEXT: lwsync +; CHECK-NEXT: .LBB14_1: +; CHECK-NEXT: lwarx r5, 0, r3 +; CHECK-NEXT: stwcx. 
r4, 0, r3 +; CHECK-NEXT: bne cr0, .LBB14_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: mr r3, r5 +; CHECK-NEXT: lwsync +; CHECK-NEXT: blr %val = atomicrmw xchg i32* %mem, i32 %operand acq_rel -; CHECK: lwsync ret i32 %val } define i64 @and_i64_release(i64* %mem, i64 %operand) { -; CHECK-LABEL: and_i64_release -; CHECK: lwsync +; PPC32-LABEL: and_i64_release: +; PPC32: # %bb.0: +; PPC32-NEXT: mflr r0 +; PPC32-NEXT: stw r0, 4(r1) +; PPC32-NEXT: stwu r1, -16(r1) +; PPC32-NEXT: .cfi_def_cfa_offset 16 +; PPC32-NEXT: .cfi_offset lr, 4 +; PPC32-NEXT: li r7, 3 +; PPC32-NEXT: bl __atomic_fetch_and_8 +; PPC32-NEXT: lwz r0, 20(r1) +; PPC32-NEXT: addi r1, r1, 16 +; PPC32-NEXT: mtlr r0 +; PPC32-NEXT: blr +; +; PPC64-LABEL: and_i64_release: +; PPC64: # %bb.0: +; PPC64-NEXT: lwsync +; PPC64-NEXT: .LBB15_1: +; PPC64-NEXT: ldarx r5, 0, r3 +; PPC64-NEXT: and r6, r4, r5 +; PPC64-NEXT: stdcx. r6, 0, r3 +; PPC64-NEXT: bne cr0, .LBB15_1 +; PPC64-NEXT: # %bb.2: +; PPC64-NEXT: mr r3, r5 +; PPC64-NEXT: blr %val = atomicrmw and i64* %mem, i64 %operand release -; CHECK-NOT: [sync ] ret i64 %val } diff --git a/llvm/test/CodeGen/PowerPC/constants-i64.ll b/llvm/test/CodeGen/PowerPC/constants-i64.ll index 956845f5a5b35..38a765343fc74 100644 --- a/llvm/test/CodeGen/PowerPC/constants-i64.ll +++ b/llvm/test/CodeGen/PowerPC/constants-i64.ll @@ -80,47 +80,93 @@ entry: ; CHECK: blr } -define i64 @cn32_1() #0 { +define i64 @uint32_1() #0 { entry: ret i64 3900000000 -; CHECK-LABEL: @cn32_1 +; CHECK-LABEL: @uint32_1 ; CHECK: lis [[REG1:[0-9]+]], 232 ; CHECK: ori [[REG2:[0-9]+]], [[REG1]], 30023 -; CHECK: sldi 3, [[REG1]], 8 +; CHECK: sldi 3, [[REG2]], 8 ; CHECK: blr } -define i32 @cn32_1_i32() #0 { +define i32 @uint32_1_i32() #0 { entry: ret i32 -394967296 -; CHECK-LABEL: @cn32_1_i32 +; CHECK-LABEL: @uint32_1_i32 ; CHECK: lis [[REG1:[0-9]+]], 232 ; CHECK: ori [[REG2:[0-9]+]], [[REG1]], 30023 -; CHECK: sldi 3, [[REG1]], 8 +; CHECK: sldi 3, [[REG2]], 8 ; CHECK: blr } -define i64 @cn32_2() #0 { +define i64 @uint32_2() #0 { entry: ret i64 4294967295 -; CHECK-LABEL: @cn32_2 +; CHECK-LABEL: @uint32_2 ; CHECK: li [[REG1:[0-9]+]], 0 ; CHECK: oris [[REG2:[0-9]+]], [[REG1]], 65535 -; CHECK: ori [[REG2:[0-9]+]], [[REG1]], 65535 +; CHECK: ori 3, [[REG2]], 65535 ; CHECK: blr } -define i32 @cn32_2_i32() #0 { +define i32 @uint32_2_i32() #0 { entry: ret i32 -1 -; CHECK-LABEL: @cn32_2_i32 +; CHECK-LABEL: @uint32_2_i32 ; CHECK: li [[REG1:[0-9]+]], 0 ; CHECK: oris [[REG2:[0-9]+]], [[REG1]], 65535 -; CHECK: ori [[REG2:[0-9]+]], [[REG1]], 65535 +; CHECK: ori 3, [[REG2]], 65535 +; CHECK: blr +} + +define i64 @uint32_3() #0 { +entry: + ret i64 2147483648 + +; CHECK-LABEL: @uint32_3 +; CHECK: li [[REG1:[0-9]+]], 1 +; CHECK: sldi 3, [[REG1]], 31 +; CHECK: blr +} + +define i64 @uint32_4() #0 { +entry: + ret i64 124800000032 + +; CHECK-LABEL: @uint32_4 +; CHECK: li [[REG1:[0-9]+]], 29 +; CHECK: sldi [[REG2:[0-9]+]], [[REG1]], 32 +; CHECK: oris [[REG3:[0-9]+]], [[REG2]], 3752 +; CHECK: ori 3, [[REG3]], 57376 +; CHECK: blr +} + +define i64 @cn_ones_1() #0 { +entry: + ret i64 10460594175 + +; CHECK-LABEL: @cn_ones_1 +; CHECK: li [[REG1:[0-9]+]], 2 +; CHECK: sldi [[REG2:[0-9]+]], [[REG1]], 32 +; CHECK: oris [[REG3:[0-9]+]], [[REG2]], 28543 +; CHECK: ori 3, [[REG3]], 65535 +; CHECK: blr +} + +define i64 @cn_ones_2() #0 { +entry: + ret i64 10459119615 + +; CHECK-LABEL: @cn_ones_2 +; CHECK: li [[REG1:[0-9]+]], 2 +; CHECK: sldi [[REG2:[0-9]+]], [[REG1]], 32 +; CHECK: oris [[REG3:[0-9]+]], [[REG2]], 28521 +; CHECK: ori 3, [[REG3]], 32767 ; CHECK: blr } diff --git 
a/llvm/test/CodeGen/PowerPC/fma-combine.ll b/llvm/test/CodeGen/PowerPC/fma-combine.ll
index bf2abe0b6b837..217d520f89187 100644
--- a/llvm/test/CodeGen/PowerPC/fma-combine.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-combine.ll
@@ -243,17 +243,18 @@ define double @getNegatedExpression_crash(double %x, double %y) {
define double @fma_flag_propagation(double %a) {
; CHECK-FAST-LABEL: fma_flag_propagation:
; CHECK-FAST: # %bb.0: # %entry
-; CHECK-FAST-NEXT: xssubdp 1, 1, 1
+; CHECK-FAST-NEXT: xxlxor 1, 1, 1
; CHECK-FAST-NEXT: blr
;
; CHECK-FAST-NOVSX-LABEL: fma_flag_propagation:
; CHECK-FAST-NOVSX: # %bb.0: # %entry
-; CHECK-FAST-NOVSX-NEXT: fsub 1, 1, 1
+; CHECK-FAST-NOVSX-NEXT: addis 3, 2, .LCPI6_0@toc@ha
+; CHECK-FAST-NOVSX-NEXT: lfs 1, .LCPI6_0@toc@l(3)
; CHECK-FAST-NOVSX-NEXT: blr
;
; CHECK-LABEL: fma_flag_propagation:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xssubdp 1, 1, 1
+; CHECK-NEXT: xxlxor 1, 1, 1
; CHECK-NEXT: blr
entry:
%0 = fneg double %a
@@ -261,4 +262,56 @@ entry:
ret double %1
}
+define double @neg_fma_flag_propagation(double %a) {
+; CHECK-FAST-LABEL: neg_fma_flag_propagation:
+; CHECK-FAST: # %bb.0: # %entry
+; CHECK-FAST-NEXT: xxlxor 1, 1, 1
+; CHECK-FAST-NEXT: blr
+;
+; CHECK-FAST-NOVSX-LABEL: neg_fma_flag_propagation:
+; CHECK-FAST-NOVSX: # %bb.0: # %entry
+; CHECK-FAST-NOVSX-NEXT: addis 3, 2, .LCPI7_0@toc@ha
+; CHECK-FAST-NOVSX-NEXT: lfs 1, .LCPI7_0@toc@l(3)
+; CHECK-FAST-NOVSX-NEXT: blr
+;
+; CHECK-LABEL: neg_fma_flag_propagation:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xxlxor 1, 1, 1
+; CHECK-NEXT: blr
+entry:
+ %0 = call reassoc nnan double @llvm.fma.f64(double %a, double -1.0, double %a)
+ ret double %0
+}
+
+define <2 x double> @vec_neg_fma_flag_propagation(<2 x double> %a) {
+; CHECK-FAST-LABEL: vec_neg_fma_flag_propagation:
+; CHECK-FAST: # %bb.0: # %entry
+; CHECK-FAST-NEXT: addis 3, 2, .LCPI8_0@toc@ha
+; CHECK-FAST-NEXT: addi 3, 3, .LCPI8_0@toc@l
+; CHECK-FAST-NEXT: lxvd2x 0, 0, 3
+; CHECK-FAST-NEXT: xxswapd 0, 0
+; CHECK-FAST-NEXT: xvmaddadp 34, 34, 0
+; CHECK-FAST-NEXT: blr
+;
+; CHECK-FAST-NOVSX-LABEL: vec_neg_fma_flag_propagation:
+; CHECK-FAST-NOVSX: # %bb.0: # %entry
+; CHECK-FAST-NOVSX-NEXT: addis 3, 2, .LCPI8_0@toc@ha
+; CHECK-FAST-NOVSX-NEXT: lfs 1, .LCPI8_0@toc@l(3)
+; CHECK-FAST-NOVSX-NEXT: fmr 2, 1
+; CHECK-FAST-NOVSX-NEXT: blr
+;
+; CHECK-LABEL: vec_neg_fma_flag_propagation:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addis 3, 2, .LCPI8_0@toc@ha
+; CHECK-NEXT: addi 3, 3, .LCPI8_0@toc@l
+; CHECK-NEXT: lxvd2x 0, 0, 3
+; CHECK-NEXT: xxswapd 0, 0
+; CHECK-NEXT: xvmaddadp 34, 34, 0
+; CHECK-NEXT: blr
+entry:
+ %0 = call reassoc nnan <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> <double -1.0, double -1.0>, <2 x double> %a)
+ ret <2 x double> %0
+}
+
declare double @llvm.fma.f64(double, double, double) nounwind readnone
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
diff --git a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
index 90ea31b26916e..91745b4b3ea21 100644
--- a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
+++ b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
@@ -557,13 +557,13 @@ define double @fcmp_nnan(double %a, double %y, double %z) {
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'log2_approx:'
; FMFDEBUG: ch,glue = PPCISD::CALL_NOP t11, TargetGlobalAddress:i64
; FMFDEBUG: ch,glue = callseq_end t15, TargetConstant:i64<32>, TargetConstant:i64<0>, t15:1
-; FMFDEBUG: f64,ch,glue = CopyFromReg afn t16, Register:f64 $f1, t16:1
+; FMFDEBUG: f64,ch,glue = 
CopyFromReg t16, Register:f64 $f1, t16:1 ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'log2_approx:' ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'log2_approx:' ; GLOBALDEBUG: ch,glue = PPCISD::CALL_NOP t11, TargetGlobalAddress:i64 ; GLOBALDEBUG: ch,glue = callseq_end t15, TargetConstant:i64<32>, TargetConstant:i64<0>, t15:1 -; GLOBALDEBUG: f64,ch,glue = CopyFromReg afn t16, Register:f64 $f1, t16:1 +; GLOBALDEBUG: f64,ch,glue = CopyFromReg t16, Register:f64 $f1, t16:1 ; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'log2_approx:' declare double @log2(double) diff --git a/llvm/test/CodeGen/PowerPC/fp-strict-round.ll b/llvm/test/CodeGen/PowerPC/fp-strict-round.ll index 3a43b3584caf8..fa36f244d6239 100644 --- a/llvm/test/CodeGen/PowerPC/fp-strict-round.ll +++ b/llvm/test/CodeGen/PowerPC/fp-strict-round.ll @@ -170,12 +170,30 @@ define <2 x double> @floor_v2f64(<2 x double> %vf1) { define double @nearbyint_f64(double %f1, double %f2) { ; P8-LABEL: nearbyint_f64: ; P8: # %bb.0: -; P8-NEXT: xsrdpic f1, f1 +; P8-NEXT: mflr r0 +; P8-NEXT: std r0, 16(r1) +; P8-NEXT: stdu r1, -112(r1) +; P8-NEXT: .cfi_def_cfa_offset 112 +; P8-NEXT: .cfi_offset lr, 16 +; P8-NEXT: bl nearbyint +; P8-NEXT: nop +; P8-NEXT: addi r1, r1, 112 +; P8-NEXT: ld r0, 16(r1) +; P8-NEXT: mtlr r0 ; P8-NEXT: blr ; ; P9-LABEL: nearbyint_f64: ; P9: # %bb.0: -; P9-NEXT: xsrdpic f1, f1 +; P9-NEXT: mflr r0 +; P9-NEXT: std r0, 16(r1) +; P9-NEXT: stdu r1, -32(r1) +; P9-NEXT: .cfi_def_cfa_offset 32 +; P9-NEXT: .cfi_offset lr, 16 +; P9-NEXT: bl nearbyint +; P9-NEXT: nop +; P9-NEXT: addi r1, r1, 32 +; P9-NEXT: ld r0, 16(r1) +; P9-NEXT: mtlr r0 ; P9-NEXT: blr %res = call double @llvm.experimental.constrained.nearbyint.f64( double %f1, @@ -187,12 +205,104 @@ define double @nearbyint_f64(double %f1, double %f2) { define <4 x float> @nearbyint_v4f32(<4 x float> %vf1, <4 x float> %vf2) { ; P8-LABEL: nearbyint_v4f32: ; P8: # %bb.0: -; P8-NEXT: xvrspic v2, v2 +; P8-NEXT: mflr r0 +; P8-NEXT: std r0, 16(r1) +; P8-NEXT: stdu r1, -176(r1) +; P8-NEXT: .cfi_def_cfa_offset 176 +; P8-NEXT: .cfi_offset lr, 16 +; P8-NEXT: .cfi_offset v30, -32 +; P8-NEXT: .cfi_offset v31, -16 +; P8-NEXT: xxsldwi vs0, v2, v2, 3 +; P8-NEXT: li r3, 144 +; P8-NEXT: stxvd2x v30, r1, r3 # 16-byte Folded Spill +; P8-NEXT: li r3, 160 +; P8-NEXT: stxvd2x v31, r1, r3 # 16-byte Folded Spill +; P8-NEXT: vmr v31, v2 +; P8-NEXT: xscvspdpn f1, vs0 +; P8-NEXT: bl nearbyintf +; P8-NEXT: nop +; P8-NEXT: xxsldwi vs0, v31, v31, 1 +; P8-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; P8-NEXT: li r3, 128 +; P8-NEXT: stxvd2x vs1, r1, r3 # 16-byte Folded Spill +; P8-NEXT: xscvspdpn f1, vs0 +; P8-NEXT: bl nearbyintf +; P8-NEXT: nop +; P8-NEXT: li r3, 128 +; P8-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; P8-NEXT: lxvd2x vs0, r1, r3 # 16-byte Folded Reload +; P8-NEXT: xxmrghd vs0, vs1, vs0 +; P8-NEXT: xscvspdpn f1, v31 +; P8-NEXT: xvcvdpsp v30, vs0 +; P8-NEXT: bl nearbyintf +; P8-NEXT: nop +; P8-NEXT: xxswapd vs0, v31 +; P8-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; P8-NEXT: li r3, 128 +; P8-NEXT: stxvd2x vs1, r1, r3 # 16-byte Folded Spill +; P8-NEXT: xscvspdpn f1, vs0 +; P8-NEXT: bl nearbyintf +; P8-NEXT: nop +; P8-NEXT: li r3, 128 +; P8-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; P8-NEXT: lxvd2x vs0, r1, r3 # 16-byte Folded Reload +; P8-NEXT: li r3, 160 +; P8-NEXT: lxvd2x v31, r1, r3 # 16-byte Folded Reload +; P8-NEXT: li r3, 144 +; P8-NEXT: xxmrghd vs0, vs0, vs1 +; P8-NEXT: xvcvdpsp v2, vs0 +; P8-NEXT: vmrgew v2, v2, v30 +; P8-NEXT: lxvd2x v30, r1, r3 # 16-byte Folded Reload 
+; P8-NEXT: addi r1, r1, 176 +; P8-NEXT: ld r0, 16(r1) +; P8-NEXT: mtlr r0 ; P8-NEXT: blr ; ; P9-LABEL: nearbyint_v4f32: ; P9: # %bb.0: -; P9-NEXT: xvrspic v2, v2 +; P9-NEXT: mflr r0 +; P9-NEXT: std r0, 16(r1) +; P9-NEXT: stdu r1, -80(r1) +; P9-NEXT: .cfi_def_cfa_offset 80 +; P9-NEXT: .cfi_offset lr, 16 +; P9-NEXT: .cfi_offset v30, -32 +; P9-NEXT: .cfi_offset v31, -16 +; P9-NEXT: xxsldwi vs0, v2, v2, 3 +; P9-NEXT: stxv v30, 48(r1) # 16-byte Folded Spill +; P9-NEXT: xscvspdpn f1, vs0 +; P9-NEXT: stxv v31, 64(r1) # 16-byte Folded Spill +; P9-NEXT: vmr v31, v2 +; P9-NEXT: bl nearbyintf +; P9-NEXT: nop +; P9-NEXT: xxsldwi vs0, v31, v31, 1 +; P9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; P9-NEXT: stxv vs1, 32(r1) # 16-byte Folded Spill +; P9-NEXT: xscvspdpn f1, vs0 +; P9-NEXT: bl nearbyintf +; P9-NEXT: nop +; P9-NEXT: lxv vs0, 32(r1) # 16-byte Folded Reload +; P9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; P9-NEXT: xxmrghd vs0, vs1, vs0 +; P9-NEXT: xscvspdpn f1, v31 +; P9-NEXT: xvcvdpsp v30, vs0 +; P9-NEXT: bl nearbyintf +; P9-NEXT: nop +; P9-NEXT: xxswapd vs0, v31 +; P9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; P9-NEXT: stxv vs1, 32(r1) # 16-byte Folded Spill +; P9-NEXT: xscvspdpn f1, vs0 +; P9-NEXT: bl nearbyintf +; P9-NEXT: nop +; P9-NEXT: lxv vs0, 32(r1) # 16-byte Folded Reload +; P9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; P9-NEXT: lxv v31, 64(r1) # 16-byte Folded Reload +; P9-NEXT: xxmrghd vs0, vs0, vs1 +; P9-NEXT: xvcvdpsp v2, vs0 +; P9-NEXT: vmrgew v2, v2, v30 +; P9-NEXT: lxv v30, 48(r1) # 16-byte Folded Reload +; P9-NEXT: addi r1, r1, 80 +; P9-NEXT: ld r0, 16(r1) +; P9-NEXT: mtlr r0 ; P9-NEXT: blr %res = call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32( <4 x float> %vf1, @@ -204,12 +314,62 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %vf1, <4 x float> %vf2) { define <2 x double> @nearbyint_v2f64(<2 x double> %vf1, <2 x double> %vf2) { ; P8-LABEL: nearbyint_v2f64: ; P8: # %bb.0: -; P8-NEXT: xvrdpic v2, v2 +; P8-NEXT: mflr r0 +; P8-NEXT: std r0, 16(r1) +; P8-NEXT: stdu r1, -160(r1) +; P8-NEXT: .cfi_def_cfa_offset 160 +; P8-NEXT: .cfi_offset lr, 16 +; P8-NEXT: .cfi_offset v31, -16 +; P8-NEXT: li r3, 144 +; P8-NEXT: stxvd2x v31, r1, r3 # 16-byte Folded Spill +; P8-NEXT: vmr v31, v2 +; P8-NEXT: xxlor f1, v31, v31 +; P8-NEXT: bl nearbyint +; P8-NEXT: nop +; P8-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; P8-NEXT: li r3, 128 +; P8-NEXT: stxvd2x vs1, r1, r3 # 16-byte Folded Spill +; P8-NEXT: xxswapd vs1, v31 +; P8-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; P8-NEXT: bl nearbyint +; P8-NEXT: nop +; P8-NEXT: li r3, 128 +; P8-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; P8-NEXT: lxvd2x vs0, r1, r3 # 16-byte Folded Reload +; P8-NEXT: li r3, 144 +; P8-NEXT: lxvd2x v31, r1, r3 # 16-byte Folded Reload +; P8-NEXT: xxmrghd v2, vs0, vs1 +; P8-NEXT: addi r1, r1, 160 +; P8-NEXT: ld r0, 16(r1) +; P8-NEXT: mtlr r0 ; P8-NEXT: blr ; ; P9-LABEL: nearbyint_v2f64: ; P9: # %bb.0: -; P9-NEXT: xvrdpic v2, v2 +; P9-NEXT: mflr r0 +; P9-NEXT: std r0, 16(r1) +; P9-NEXT: stdu r1, -64(r1) +; P9-NEXT: .cfi_def_cfa_offset 64 +; P9-NEXT: .cfi_offset lr, 16 +; P9-NEXT: .cfi_offset v31, -16 +; P9-NEXT: stxv v31, 48(r1) # 16-byte Folded Spill +; P9-NEXT: vmr v31, v2 +; P9-NEXT: xscpsgndp f1, v31, v31 +; P9-NEXT: bl nearbyint +; P9-NEXT: nop +; P9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; P9-NEXT: stxv vs1, 32(r1) # 16-byte Folded Spill +; P9-NEXT: xxswapd vs1, v31 +; P9-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; P9-NEXT: bl nearbyint +; P9-NEXT: nop +; P9-NEXT: lxv vs0, 
32(r1) # 16-byte Folded Reload +; P9-NEXT: lxv v31, 48(r1) # 16-byte Folded Reload +; P9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; P9-NEXT: xxmrghd v2, vs0, vs1 +; P9-NEXT: addi r1, r1, 64 +; P9-NEXT: ld r0, 16(r1) +; P9-NEXT: mtlr r0 ; P9-NEXT: blr %res = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64( <2 x double> %vf1, diff --git a/llvm/test/CodeGen/PowerPC/fusion-load-store.ll b/llvm/test/CodeGen/PowerPC/fusion-load-store.ll new file mode 100644 index 0000000000000..75b2eca2168c0 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/fusion-load-store.ll @@ -0,0 +1,268 @@ +; Test if several consecutive loads/stores can be clustered(fused) by scheduler. The +; scheduler will print "Cluster ld/st SU(x) - SU(y)" if SU(x) and SU(y) are fused. + +; REQUIRES: asserts +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr10 \ +; RUN: -mattr=-paired-vector-memops,-pcrelative-memops -verify-misched \ +; RUN: -debug-only=machine-scheduler 2>&1 | FileCheck %s + +define i64 @store_i64(i64* nocapture %P, i64 %v) { +entry: +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: store_i64:%bb.0 +; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]]) +; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]]) +; CHECK: SU([[SU2]]): STD %[[REG:[0-9]+]]:g8rc, 24 +; CHECK: SU([[SU3]]): STD %[[REG]]:g8rc, 16 +; CHECK: SU([[SU4]]): STD %[[REG]]:g8rc, 8 +; CHECK: SU([[SU5]]): STD %[[REG]]:g8rc, 32 +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: store_i64:%bb.0 +; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]]) +; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]]) +; CHECK: SU([[SU0]]): STD renamable $x[[REG:[0-9]+]], 16 +; CHECK: SU([[SU1]]): STD renamable $x[[REG]], 8 +; CHECK: SU([[SU2]]): STD renamable $x[[REG]], 24 +; CHECK: SU([[SU3]]): STD renamable $x[[REG]], 32 + %arrayidx = getelementptr inbounds i64, i64* %P, i64 3 + store i64 %v, i64* %arrayidx + %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2 + store i64 %v, i64* %arrayidx1 + %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx2 + %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4 + store i64 %v, i64* %arrayidx3 + ret i64 %v +} + +define i32 @store_i32(i32* nocapture %P, i32 %v) { +entry: +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: store_i32:%bb.0 +; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]]) +; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]]) +; CHECK: SU([[SU2]]): STW %[[REG:[0-9]+]].sub_32:g8rc, 52 +; CHECK: SU([[SU3]]): STW %[[REG]].sub_32:g8rc, 48 +; CHECK: SU([[SU4]]): STW %[[REG]].sub_32:g8rc, 44 +; CHECK: SU([[SU5]]): STW %[[REG]].sub_32:g8rc, 56 +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: store_i32:%bb.0 +; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]]) +; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]]) +; CHECK: SU([[SU0]]): STW renamable $r[[REG:[0-9]+]], 48 +; CHECK: SU([[SU1]]): STW renamable $r[[REG]], 44 +; CHECK: SU([[SU2]]): STW renamable $r[[REG]], 52 +; CHECK: SU([[SU3]]): STW renamable $r[[REG]], 56 + %arrayidx = getelementptr inbounds i32, i32* %P, i32 13 + store i32 %v, i32* %arrayidx + %arrayidx1 = getelementptr inbounds i32, i32* %P, i32 12 + store i32 %v, i32* %arrayidx1 + %arrayidx2 = getelementptr inbounds i32, i32* %P, i32 11 + store i32 %v, i32* %arrayidx2 + %arrayidx3 = getelementptr inbounds i32, i32* %P, i32 14 + store i32 %v, i32* %arrayidx3 + ret i32 %v +} + +define void @store_i64_neg(i64* 
nocapture %P, i64 %v) #0 { +entry: +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: store_i64_neg:%bb.0 +; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]]) +; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]]) +; CHECK: SU([[SU2]]): STD %[[REG:[0-9]+]]:g8rc, -24 +; CHECK: SU([[SU3]]): STD %[[REG]]:g8rc, -8 +; CHECK: SU([[SU4]]): STD %[[REG]]:g8rc, -16 +; CHECK: SU([[SU5]]): STD %[[REG]]:g8rc, -32 +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: store_i64_neg:%bb.0 +; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]]) +; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]]) +; CHECK: SU([[SU0]]): STD renamable $x[[REG:[0-9]+]], -8 +; CHECK: SU([[SU1]]): STD renamable $x[[REG]], -16 +; CHECK: SU([[SU2]]): STD renamable $x[[REG]], -24 +; CHECK: SU([[SU3]]): STD renamable $x[[REG]], -32 + %arrayidx = getelementptr inbounds i64, i64* %P, i64 -3 + store i64 %v, i64* %arrayidx + %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx1 + %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 -2 + store i64 %v, i64* %arrayidx2 + %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 -4 + store i64 %v, i64* %arrayidx3 + ret void +} + +define void @store_i32_neg(i32* nocapture %P, i32 %v) #0 { +entry: +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: store_i32_neg:%bb.0 +; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]]) +; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]]) +; CHECK: SU([[SU2]]): STW %[[REG:[0-9]+]].sub_32:g8rc, -12 +; CHECK: SU([[SU3]]): STW %[[REG]].sub_32:g8rc, -4 +; CHECK: SU([[SU4]]): STW %[[REG]].sub_32:g8rc, -8 +; CHECK: SU([[SU5]]): STW %[[REG]].sub_32:g8rc, -16 +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: store_i32_neg:%bb.0 +; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]]) +; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]]) +; CHECK:SU([[SU0]]): STW renamable $r[[REG:[0-9]+]], -4 +; CHECK:SU([[SU1]]): STW renamable $r[[REG]], -8 +; CHECK:SU([[SU2]]): STW renamable $r[[REG]], -12 +; CHECK:SU([[SU3]]): STW renamable $r[[REG]], -16 + %arrayidx = getelementptr inbounds i32, i32* %P, i32 -3 + store i32 %v, i32* %arrayidx + %arrayidx1 = getelementptr inbounds i32, i32* %P, i32 -1 + store i32 %v, i32* %arrayidx1 + %arrayidx2 = getelementptr inbounds i32, i32* %P, i32 -2 + store i32 %v, i32* %arrayidx2 + %arrayidx3 = getelementptr inbounds i32, i32* %P, i32 -4 + store i32 %v, i32* %arrayidx3 + ret void +} + +define void @store_double(double* nocapture %P, double %v) { +entry: +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: store_double:%bb.0 +; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]]) +; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]]) +; CHECK: SU([[SU2]]): DFSTOREf64 %[[REG:[0-9]+]]:vsfrc, 24 +; CHECK: SU([[SU3]]): DFSTOREf64 %[[REG]]:vsfrc, 8 +; CHECK: SU([[SU4]]): DFSTOREf64 %[[REG]]:vsfrc, 16 +; CHECK: SU([[SU5]]): DFSTOREf64 %[[REG]]:vsfrc, 32 +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: store_double:%bb.0 +; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]]) +; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]]) +; CHECK: SU([[SU0]]): STFD renamable $f[[REG:[0-9]+]], 8 +; CHECK: SU([[SU1]]): STFD renamable $f[[REG]], 16 +; CHECK: SU([[SU2]]): STFD renamable $f[[REG]], 24 +; CHECK: SU([[SU3]]): STFD renamable $f[[REG]], 32 + %arrayidx = getelementptr inbounds double, double* %P, i64 3 + store double %v, double* %arrayidx + 
%arrayidx1 = getelementptr inbounds double, double* %P, i64 1 + store double %v, double* %arrayidx1 + %arrayidx2 = getelementptr inbounds double, double* %P, i64 2 + store double %v, double* %arrayidx2 + %arrayidx3 = getelementptr inbounds double, double* %P, i64 4 + store double %v, double* %arrayidx3 + ret void +} + +define void @store_float(float* nocapture %P, float %v) { +entry: +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: store_float:%bb.0 +; CHECK-NOT: Cluster ld/st +; CHECK-NOT: Cluster ld/st +; CHECK: SU([[SU2]]): DFSTOREf32 %[[REG:[0-9]+]]:vssrc, 12 +; CHECK: SU([[SU3]]): DFSTOREf32 %[[REG]]:vssrc, 4 +; CHECK: SU([[SU4]]): DFSTOREf32 %[[REG]]:vssrc, 8 +; CHECK: SU([[SU5]]): DFSTOREf32 %[[REG]]:vssrc, 16 +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: store_float:%bb.0 +; CHECK-NOT: Cluster ld/st +; CHECK-NOT: Cluster ld/st +; CHECK: SU([[SU0]]): STFS renamable $f[[REG:[0-9]+]], 12 +; CHECK: SU([[SU1]]): STFS renamable $f[[REG]], 4 +; CHECK: SU([[SU2]]): STFS renamable $f[[REG]], 8 +; CHECK: SU([[SU3]]): STFS renamable $f[[REG]], 16 + %arrayidx = getelementptr inbounds float, float* %P, i64 3 + store float %v, float* %arrayidx + %arrayidx1 = getelementptr inbounds float, float* %P, i64 1 + store float %v, float* %arrayidx1 + %arrayidx2 = getelementptr inbounds float, float* %P, i64 2 + store float %v, float* %arrayidx2 + %arrayidx3 = getelementptr inbounds float, float* %P, i64 4 + store float %v, float* %arrayidx3 + ret void +} + +; Cannot fuse the store/load if there is volatile in between +define i64 @store_volatile(i64* nocapture %P, i64 %v) { +entry: +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: store_volatile:%bb.0 +; CHECK-NOT: Cluster ld/st +; CHECK: SU([[SU2]]): STD %[[REG:[0-9]+]]:g8rc, 24 +; CHECK: SU([[SU3]]): STD %[[REG]]:g8rc, 16 +; CHECK: SU([[SU4]]): STD %[[REG]]:g8rc, 8 +; CHECK: SU([[SU5]]): STD %[[REG]]:g8rc, 32 +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: store_volatile:%bb.0 +; CHECK-NOT: Cluster ld/st +; CHECK: SU([[SU0]]): STD renamable $x[[REG:[0-9]+]], 24 +; CHECK: SU([[SU1]]): STD renamable $x[[REG]], 16 +; CHECK: SU([[SU2]]): STD renamable $x[[REG]], 8 +; CHECK: SU([[SU3]]): STD renamable $x[[REG]], 32 + %arrayidx = getelementptr inbounds i64, i64* %P, i64 3 + store volatile i64 %v, i64* %arrayidx + %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2 + store volatile i64 %v, i64* %arrayidx1 + %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1 + store volatile i64 %v, i64* %arrayidx2 + %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4 + store volatile i64 %v, i64* %arrayidx3 + ret i64 %v +} + +@p = common local_unnamed_addr global [100 x i32] zeroinitializer, align 4 + +define void @store_i32_stw_stw8(i32 signext %m, i32 signext %n) { +entry: +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: store_i32_stw_stw8:%bb.0 +; CHECK: Cluster ld/st SU([[SU5:[0-9]+]]) - SU([[SU8:[0-9]+]]) +; CHECK: SU([[SU5]]): STW8 %{{[0-9]+}}:g8rc, 24 +; CHECK: SU([[SU8]]): STW %{{[0-9]+}}:gprc, 20 +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: store_i32_stw_stw8:%bb.0 +; CHECK: Cluster ld/st SU([[SU5:[0-9]+]]) - SU([[SU6:[0-9]+]]) +; CHECK: SU([[SU5]]): STW8 renamable $x{{[0-9]+}}, 24 +; CHECK: SU([[SU6]]): STW renamable $r{{[0-9]+}}, 20 + store i32 9, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 6), align 4 + store i32 %n, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 7), align 4 + %add = add nsw i32 %n, %m + store i32 %add, 
i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 5), align 4 + ret void +} + +define void @store_i32_stw8(i32 signext %m, i32 signext %n) { +entry: +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: store_i32_stw8:%bb.0 +; CHECK: Cluster ld/st SU([[SU4:[0-9]+]]) - SU([[SU5:[0-9]+]]) +; CHECK: SU([[SU4]]): STW8 %{{[0-9]+}}:g8rc, 24 +; CHECK: SU([[SU5]]): STW8 %{{[0-9]+}}:g8rc, 28 +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: store_i32_stw8:%bb.0 +; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]]) +; CHECK: SU([[SU3]]): STW8 renamable $x{{[0-9]+}}, 24 +; CHECK: SU([[SU4]]): STW8 renamable $x{{[0-9]+}}, 28 + store i32 9, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 6), align 4 + store i32 %n, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 7), align 4 + ret void +} + +declare void @bar(i64*) + +define void @store_frame_index(i32 %a, i32 %b) { +entry: +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: store_frame_index:%bb.0 +; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]]) +; CHECK: SU([[SU2]]): STD %{{[0-9]+}}:g8rc, 0, %stack.0.buf +; CHECK: SU([[SU3]]): STD %{{[0-9]+}}:g8rc, 8, %stack.0.buf + %buf = alloca [8 x i64], align 8 + %0 = bitcast [8 x i64]* %buf to i8* + %conv = zext i32 %a to i64 + %arrayidx = getelementptr inbounds [8 x i64], [8 x i64]* %buf, i64 0, i64 0 + store i64 %conv, i64* %arrayidx, align 8 + %conv1 = zext i32 %b to i64 + %arrayidx2 = getelementptr inbounds [8 x i64], [8 x i64]* %buf, i64 0, i64 1 + store i64 %conv1, i64* %arrayidx2, align 8 + call void @bar(i64* nonnull %arrayidx) + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/lit.local.cfg b/llvm/test/CodeGen/PowerPC/lit.local.cfg index 091332439b186..1dbbf92fcf5e3 100644 --- a/llvm/test/CodeGen/PowerPC/lit.local.cfg +++ b/llvm/test/CodeGen/PowerPC/lit.local.cfg @@ -1,2 +1,4 @@ if not 'PowerPC' in config.root.targets: config.unsupported = True + +config.suffixes.add('.py') diff --git a/llvm/test/CodeGen/PowerPC/nofpexcept.ll b/llvm/test/CodeGen/PowerPC/nofpexcept.ll new file mode 100644 index 0000000000000..e15b06e0babea --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/nofpexcept.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu < %s \ +; RUN: -stop-after=finalize-isel -verify-machineinstrs | FileCheck %s + +; Verify if the mayRaiseFPException is set for FCMPD/FCMPS +define i32 @fcmpu(double %a, double %b) { + ; CHECK-LABEL: name: fcmpu + ; CHECK: bb.0.entry: + ; CHECK: liveins: $f1, $f2 + ; CHECK: [[COPY:%[0-9]+]]:f8rc = COPY $f2 + ; CHECK: [[COPY1:%[0-9]+]]:f8rc = COPY $f1 + ; CHECK: %2:crrc = nofpexcept FCMPUD [[COPY1]], [[COPY]] + ; CHECK: [[COPY2:%[0-9]+]]:crbitrc = COPY %2.sub_gt + ; CHECK: [[LI8_:%[0-9]+]]:g8rc_and_g8rc_nox0 = LI8 0 + ; CHECK: [[LI8_1:%[0-9]+]]:g8rc_and_g8rc_nox0 = LI8 1 + ; CHECK: [[ISEL8_:%[0-9]+]]:g8rc = ISEL8 [[LI8_1]], [[LI8_]], [[COPY2]] + ; CHECK: $x3 = COPY [[ISEL8_]] + ; CHECK: BLR8 implicit $lr8, implicit $rm, implicit $x3 +entry: + %r = fcmp ogt double %a, %b + %g = zext i1 %r to i32 + ret i32 %g +} diff --git a/llvm/test/CodeGen/PowerPC/p10-vector-divide.ll b/llvm/test/CodeGen/PowerPC/p10-vector-divide.ll index dc21b4fb49eef..b5f36a78b2b26 100644 --- a/llvm/test/CodeGen/PowerPC/p10-vector-divide.ll +++ b/llvm/test/CodeGen/PowerPC/p10-vector-divide.ll @@ -76,6 +76,24 @@ entry: ret <4 x i32> %div } +define <1 x i128> @test_vdivsq(<1 x i128> %x, 
<1 x i128> %y) nounwind readnone { +; CHECK-LABEL: test_vdivsq: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivsq v2, v2, v3 +; CHECK-NEXT: blr + %tmp = sdiv <1 x i128> %x, %y + ret <1 x i128> %tmp +} + +define <1 x i128> @test_vdivuq(<1 x i128> %x, <1 x i128> %y) nounwind readnone { +; CHECK-LABEL: test_vdivuq: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivuq v2, v2, v3 +; CHECK-NEXT: blr + %tmp = udiv <1 x i128> %x, %y + ret <1 x i128> %tmp +} + define <2 x i64> @test_vdivesd(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vdivesd: ; CHECK: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/PowerPC/p10-vector-mask-ops.ll b/llvm/test/CodeGen/PowerPC/p10-vector-mask-ops.ll index 637361f7b1c96..65e9abd657ad1 100644 --- a/llvm/test/CodeGen/PowerPC/p10-vector-mask-ops.ll +++ b/llvm/test/CodeGen/PowerPC/p10-vector-mask-ops.ll @@ -120,3 +120,48 @@ entry: %exp = tail call <1 x i128> @llvm.ppc.altivec.vexpandqm(<1 x i128> %a) ret <1 x i128> %exp } + +declare i64 @llvm.ppc.altivec.vcntmbb(<16 x i8>, i32) +declare i64 @llvm.ppc.altivec.vcntmbh(<8 x i16>, i32) +declare i64 @llvm.ppc.altivec.vcntmbw(<4 x i32>, i32) +declare i64 @llvm.ppc.altivec.vcntmbd(<2 x i64>, i32) + +define i64 @test_vcntmbb(<16 x i8> %a) { +; CHECK-LABEL: test_vcntmbb: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcntmbb r3, v2, 1 +; CHECK-NEXT: blr +entry: + %cnt = tail call i64 @llvm.ppc.altivec.vcntmbb(<16 x i8> %a, i32 1) + ret i64 %cnt +} + +define i64 @test_vcntmbh(<8 x i16> %a) { +; CHECK-LABEL: test_vcntmbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcntmbh r3, v2, 0 +; CHECK-NEXT: blr +entry: + %cnt = tail call i64 @llvm.ppc.altivec.vcntmbh(<8 x i16> %a, i32 0) + ret i64 %cnt +} + +define i64 @test_vcntmbw(<4 x i32> %a) { +; CHECK-LABEL: test_vcntmbw: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcntmbw r3, v2, 1 +; CHECK-NEXT: blr +entry: + %cnt = tail call i64 @llvm.ppc.altivec.vcntmbw(<4 x i32> %a, i32 1) + ret i64 %cnt +} + +define i64 @test_vcntmbd(<2 x i64> %a) { +; CHECK-LABEL: test_vcntmbd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcntmbd r3, v2, 0 +; CHECK-NEXT: blr +entry: + %cnt = tail call i64 @llvm.ppc.altivec.vcntmbd(<2 x i64> %a, i32 0) + ret i64 %cnt +} diff --git a/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll b/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll index 9141fdc735a0e..f2da036a37c50 100644 --- a/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll +++ b/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll @@ -45,12 +45,12 @@ define dso_local signext i32 @AsmClobberX2WithTOC(i32 signext %a, i32 signext %b ; CHECK-LARGE: ld r2, .Lfunc_toc2-.Lfunc_gep2(r12) ; CHECK-LARGE: add r2, r2, r12 ; CHECK-S: .localentry AsmClobberX2WithTOC -; CHECK-S: #APP +; CHECK-S: add r3, r4, r3 +; CHECK-S-NEXT: #APP ; CHECK-S-NEXT: li r2, 0 ; CHECK-S-NEXT: #NO_APP -; CHECK-S-NEXT: plwz r5, global_int@PCREL(0), 1 -; CHECK-S-NEXT: add r3, r4, r3 -; CHECK-S-NEXT: add r3, r3, r5 +; CHECK-S-NEXT: plwz r4, global_int@PCREL(0), 1 +; CHECK-S-NEXT: add r3, r3, r4 ; CHECK-S-NEXT: extsw r3, r3 ; CHECK-S-NEXT: blr entry: @@ -67,10 +67,10 @@ define dso_local signext i32 @AsmClobberX5(i32 signext %a, i32 signext %b) local ; CHECK-P9-NOT: .localentry ; CHECK-ALL: # %bb.0: # %entry ; CHECK-S-NEXT: add r3, r4, r3 -; CHECK-S-NEXT: extsw r3, r3 ; CHECK-S-NEXT: #APP ; CHECK-S-NEXT: nop ; CHECK-S-NEXT: #NO_APP +; CHECK-S-NEXT: extsw r3, r3 ; CHECK-S-NEXT: blr entry: %add = add nsw i32 %b, %a @@ -104,29 +104,29 @@ define dso_local signext i32 @X2IsCallerSaved(i32 signext %a, i32 signext %b, i3 ; CHECK-P9-NOT: .localentry ; CHECK-ALL: # %bb.0: 
# %entry ; CHECK-S-NEXT: std r29, -24(r1) # 8-byte Folded Spill +; CHECK-S-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-S-NEXT: add r11, r4, r3 ; CHECK-S-NEXT: sub r29, r8, r9 ; CHECK-S-NEXT: add r9, r10, r9 ; CHECK-S-NEXT: sub r10, r10, r3 +; CHECK-S-NEXT: sub r12, r4, r5 +; CHECK-S-NEXT: add r0, r6, r5 +; CHECK-S-NEXT: sub r2, r6, r7 ; CHECK-S-NEXT: mullw r3, r4, r3 +; CHECK-S-NEXT: add r30, r8, r7 ; CHECK-S-NEXT: mullw r3, r3, r11 ; CHECK-S-NEXT: mullw r3, r3, r5 -; CHECK-S-NEXT: sub r12, r4, r5 ; CHECK-S-NEXT: mullw r3, r3, r6 -; CHECK-S-NEXT: add r0, r6, r5 ; CHECK-S-NEXT: mullw r3, r3, r12 ; CHECK-S-NEXT: mullw r3, r3, r0 ; CHECK-S-NEXT: mullw r3, r3, r7 -; CHECK-S-NEXT: sub r2, r6, r7 ; CHECK-S-NEXT: mullw r3, r3, r8 -; CHECK-S-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; CHECK-S-NEXT: add r30, r8, r7 ; CHECK-S-NEXT: mullw r3, r3, r2 ; CHECK-S-NEXT: mullw r3, r3, r30 -; CHECK-S-NEXT: mullw r3, r3, r29 -; CHECK-S-NEXT: mullw r3, r3, r9 ; CHECK-S-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-S-NEXT: mullw r3, r3, r29 ; CHECK-S-NEXT: ld r29, -24(r1) # 8-byte Folded Reload +; CHECK-S-NEXT: mullw r3, r3, r9 ; CHECK-S-NEXT: mullw r3, r3, r10 ; CHECK-S-NEXT: extsw r3, r3 ; CHECK-S-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-with-calls.ll b/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-with-calls.ll index 0a4f2f38c816b..8fa86ef50ea57 100644 --- a/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-with-calls.ll +++ b/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-with-calls.ll @@ -353,10 +353,10 @@ define dso_local signext i32 @IndirectCall3(i32 signext %a, i32 signext %b, i32 ; CHECK-S-NEXT: stdu r1, -32(r1) ; CHECK-S-NEXT: .cfi_def_cfa_offset 32 ; CHECK-S-NEXT: .cfi_offset lr, 16 -; CHECK-S-NEXT: add r3, r4, r3 -; CHECK-S-NEXT: extsw r3, r3 ; CHECK-S-NEXT: mtctr r5 +; CHECK-S-NEXT: add r3, r4, r3 ; CHECK-S-NEXT: mr r12, r5 +; CHECK-S-NEXT: extsw r3, r3 ; CHECK-S-NEXT: bctrl ; CHECK-S-NEXT: plwz r4, globalVar@PCREL(0), 1 ; CHECK-S-NEXT: mullw r3, r4, r3 diff --git a/llvm/test/CodeGen/PowerPC/pcrel-tail-calls.ll b/llvm/test/CodeGen/PowerPC/pcrel-tail-calls.ll index 56e49780c5f0f..1340197b3ccba 100644 --- a/llvm/test/CodeGen/PowerPC/pcrel-tail-calls.ll +++ b/llvm/test/CodeGen/PowerPC/pcrel-tail-calls.ll @@ -185,8 +185,8 @@ define dso_local signext i32 @TailCallAbs() local_unnamed_addr { ; CHECK: .localentry TailCallAbs, 1 ; CHECK-NEXT: # %bb.0: # %entry ; CHECK-NEXT: li r3, 400 -; CHECK-NEXT: mtctr r3 ; CHECK-NEXT: li r12, 400 +; CHECK-NEXT: mtctr r3 ; CHECK-NEXT: bctr ; CHECK-NEXT: #TC_RETURNr8 ctr 0 entry: @@ -207,8 +207,8 @@ define dso_local signext i32 @NoTailCallAbs(i32 signext %a) local_unnamed_addr { ; CHECK-NEXT: stdu r1, -48(r1) ; CHECK-NEXT: mr r30, r3 ; CHECK-NEXT: li r3, 400 -; CHECK-NEXT: mtctr r3 ; CHECK-NEXT: li r12, 400 +; CHECK-NEXT: mtctr r3 ; CHECK-NEXT: bctrl ; CHECK-NEXT: add r3, r3, r30 ; CHECK-NEXT: extsw r3, r3 diff --git a/llvm/test/CodeGen/PowerPC/pcrel-tls-local-exec.ll b/llvm/test/CodeGen/PowerPC/pcrel-tls-local-exec.ll new file mode 100644 index 0000000000000..47245991d82fc --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/pcrel-tls-local-exec.ll @@ -0,0 +1,74 @@ +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -enable-ppc-pcrel-tls -mcpu=pwr10 -ppc-asm-full-reg-names \ +; RUN: < %s | FileCheck %s --check-prefix=CHECK-S +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -enable-ppc-pcrel-tls -mcpu=pwr10 -ppc-asm-full-reg-names \ +; RUN: --filetype=obj < %s | 
llvm-objdump --no-show-raw-insn --mcpu=pwr10 -dr - \
+; RUN: | FileCheck %s --check-prefix=CHECK-O
+
+; These test cases are to ensure that when using pc relative memory operations
+; ABI correct code and relocations are produced for the Local Exec TLS Model.
+
+@x = thread_local global i32 0, align 4
+@y = thread_local global [5 x i32] [i32 0, i32 0, i32 0, i32 0, i32 0], align 4
+
+define i32* @LocalExecAddressLoad() {
+; CHECK-S-LABEL: LocalExecAddressLoad:
+; CHECK-S: # %bb.0: # %entry
+; CHECK-S-NEXT: paddi r3, r13, x@TPREL, 0
+; CHECK-S-NEXT: blr
+; CHECK-O-LABEL: <LocalExecAddressLoad>:
+; CHECK-O: 0: paddi 3, 13, 0, 0
+; CHECK-O-NEXT: 0000000000000000: R_PPC64_TPREL34 x
+; CHECK-O-NEXT: 8: blr
+entry:
+ ret i32* @x
+}
+
+define i32 @LocalExecValueLoad() {
+; CHECK-S-LABEL: LocalExecValueLoad:
+; CHECK-S: # %bb.0: # %entry
+; CHECK-S-NEXT: paddi r3, r13, x@TPREL, 0
+; CHECK-S-NEXT: lwz r3, 0(r3)
+; CHECK-S-NEXT: blr
+; CHECK-O-LABEL: <LocalExecValueLoad>:
+; CHECK-O: 20: paddi 3, 13, 0, 0
+; CHECK-O-NEXT: 0000000000000020: R_PPC64_TPREL34 x
+; CHECK-O-NEXT: 28: lwz 3, 0(3)
+; CHECK-O-NEXT: 2c: blr
+entry:
+ %0 = load i32, i32* @x, align 4
+ ret i32 %0
+}
+
+define i32 @LocalExecValueLoadOffset() {
+; CHECK-S-LABEL: LocalExecValueLoadOffset:
+; CHECK-S: # %bb.0: # %entry
+; CHECK-S-NEXT: paddi r3, r13, y@TPREL, 0
+; CHECK-S-NEXT: lwz r3, 12(r3)
+; CHECK-S-NEXT: blr
+; CHECK-O-LABEL: <LocalExecValueLoadOffset>:
+; CHECK-O: 40: paddi 3, 13, 0, 0
+; CHECK-O-NEXT: 0000000000000040: R_PPC64_TPREL34 y
+; CHECK-O-NEXT: 48: lwz 3, 12(3)
+; CHECK-O-NEXT: 4c: blr
+entry:
+ %0 = load i32, i32* getelementptr inbounds ([5 x i32], [5 x i32]* @y, i64 0, i64 3), align 4
+ ret i32 %0
+}
+
+
+define i32* @LocalExecValueLoadOffsetNoLoad() {
+; CHECK-S-LABEL: LocalExecValueLoadOffsetNoLoad:
+; CHECK-S: # %bb.0: # %entry
+; CHECK-S-NEXT: paddi r3, r13, y@TPREL, 0
+; CHECK-S-NEXT: addi r3, r3, 12
+; CHECK-S-NEXT: blr
+; CHECK-O-LABEL: <LocalExecValueLoadOffsetNoLoad>:
+; CHECK-O: 60: paddi 3, 13, 0, 0
+; CHECK-O-NEXT: 0000000000000060: R_PPC64_TPREL34 y
+; CHECK-O-NEXT: 68: addi 3, 3, 12
+; CHECK-O-NEXT: 6c: blr
+entry:
+ ret i32* getelementptr inbounds ([5 x i32], [5 x i32]* @y, i64 0, i64 3)
+}
diff --git a/llvm/test/CodeGen/PowerPC/popcount.ll b/llvm/test/CodeGen/PowerPC/popcount.ll
index fb20f1d3ee43b..170d3d77d0886 100644
--- a/llvm/test/CodeGen/PowerPC/popcount.ll
+++ b/llvm/test/CodeGen/PowerPC/popcount.ll
@@ -58,17 +58,17 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) {
 ; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vsl0
 ; CHECK-NEXT: mffprd 3, 0
 ; CHECK-NEXT: popcntd 3, 3
-; CHECK-NEXT: xxswapd 0, 34
-; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vsl0
-; CHECK-NEXT: mffprd 4, 0
+; CHECK-NEXT: xxswapd 1, 34
+; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; CHECK-NEXT: mffprd 4, 1
 ; CHECK-NEXT: popcntd 4, 4
 ; CHECK-NEXT: add 3, 4, 3
 ; CHECK-NEXT: mtfprd 0, 3
-; CHECK-NEXT: # kill: def $vsl0 killed $f0
+; CHECK-NEXT: fmr 2, 0
 ; CHECK-NEXT: li 3, 0
-; CHECK-NEXT: mtfprd 1, 3
-; CHECK-NEXT: # kill: def $vsl1 killed $f1
-; CHECK-NEXT: xxmrghd 34, 1, 0
+; CHECK-NEXT: mtfprd 0, 3
+; CHECK-NEXT: fmr 3, 0
+; CHECK-NEXT: xxmrghd 34, 3, 2
 ; CHECK-NEXT: blr
 Entry:
 %1 = tail call <1 x i128> @llvm.ctpop.v1.i128(<1 x i128> %0)
diff --git a/llvm/test/CodeGen/PowerPC/sink-down-more-instructions-1.mir b/llvm/test/CodeGen/PowerPC/sink-down-more-instructions-1.mir
new file mode 100644
index 0000000000000..5e19b9d005e4e
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/sink-down-more-instructions-1.mir
@@ -0,0 +1,597 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# 
RUN: llc -mtriple powerpc64le-unknown-linux-gnu -o - %s -verify-machineinstrs \ +# RUN: -run-pass=machine-sink | FileCheck %s + +--- | + ; ModuleID = 'sink-down-more-instructions-1.ll' + source_filename = "sink-down-more-instructions-1.c" + target datalayout = "e-m:e-i64:64-n32:64" + target triple = "powerpc64le-unknown-linux-gnu" + + ; Function Attrs: nofree norecurse nounwind + define dso_local signext i32 @foo(i32 signext %0, i32 signext %1, i32* nocapture readonly %2, i32* nocapture %3, i32 signext %4) local_unnamed_addr #0 { + %6 = icmp sgt i32 %4, 0 + br i1 %6, label %7, label %37 + + 7: ; preds = %5 + %8 = zext i32 %4 to i64 + %9 = icmp eq i32 %4, 1 + br i1 %9, label %17, label %10 + + 10: ; preds = %7 + %11 = and i64 %8, 4294967294 + %scevgep20 = getelementptr i32, i32* %2, i64 -2 + %scevgep2021 = bitcast i32* %scevgep20 to i8* + %scevgep22 = getelementptr i32, i32* %3, i64 -2 + %scevgep2223 = bitcast i32* %scevgep22 to i8* + %12 = add nsw i64 %11, -2 + %13 = lshr i64 %12, 1 + %14 = add nuw i64 %13, 1 + call void @llvm.set.loop.iterations.i64(i64 %14) + br label %38 + + 15: ; preds = %74 + %16 = add nuw i32 %tmp18, 102 + br label %17 + + 17: ; preds = %15, %7 + %18 = phi i64 [ 0, %7 ], [ %78, %15 ] + %19 = phi i32 [ 100, %7 ], [ %16, %15 ] + %20 = phi i32 [ 0, %7 ], [ %66, %15 ] + %21 = and i64 %8, 1 + %22 = icmp eq i64 %21, 0 + br i1 %22, label %37, label %23 + + 23: ; preds = %17 + %24 = getelementptr inbounds i32, i32* %2, i64 %18 + %25 = load i32, i32* %24, align 4, !tbaa !2 + %26 = add nsw i32 %25, %20 + switch i32 %0, label %30 [ + i32 1, label %27 + i32 3, label %33 + ] + + 27: ; preds = %23 + %28 = trunc i64 %18 to i32 + %29 = shl i32 %28, 1 + br label %33 + + 30: ; preds = %23 + %31 = trunc i64 %18 to i32 + %32 = urem i32 %31, 30 + br label %33 + + 33: ; preds = %30, %27, %23 + %34 = phi i32 [ %32, %30 ], [ %29, %27 ], [ %19, %23 ] + %35 = add nsw i32 %34, %26 + %36 = getelementptr inbounds i32, i32* %3, i64 %18 + store i32 %35, i32* %36, align 4, !tbaa !2 + br label %37 + + 37: ; preds = %33, %17, %5 + ret i32 undef + + 38: ; preds = %74, %10 + %39 = phi i64 [ 0, %10 ], [ %78, %74 ] + %40 = phi i32 [ 0, %10 ], [ %66, %74 ] + %41 = phi i8* [ %scevgep2021, %10 ], [ %45, %74 ] + %42 = phi i8* [ %scevgep2223, %10 ], [ %43, %74 ] + %43 = getelementptr i8, i8* %42, i64 8 + %44 = bitcast i8* %43 to i32* + %45 = getelementptr i8, i8* %41, i64 8 + %46 = bitcast i8* %45 to i32* + %lsr19 = trunc i64 %39 to i32 + %47 = udiv i32 %lsr19, 30 + %48 = mul nsw i32 %47, -30 + %49 = zext i32 %48 to i64 + %50 = add nuw nsw i64 %49, 1 + %51 = load i32, i32* %46, align 4, !tbaa !2 + %52 = add nsw i32 %51, %40 + switch i32 %0, label %58 [ + i32 1, label %53 + i32 3, label %56 + ] + + 53: ; preds = %38 + %54 = trunc i64 %39 to i32 + %55 = shl i32 %54, 1 + br label %60 + + 56: ; preds = %38 + %57 = add nuw nsw i32 %lsr19, 100 + br label %60 + + 58: ; preds = %38 + %59 = add i64 %39, %49 + %tmp15 = trunc i64 %59 to i32 + br label %60 + + 60: ; preds = %58, %56, %53 + %61 = phi i32 [ %tmp15, %58 ], [ %57, %56 ], [ %55, %53 ] + %62 = add nsw i32 %61, %52 + store i32 %62, i32* %44, align 4, !tbaa !2 + %63 = or i64 %39, 1 + %64 = getelementptr i8, i8* %45, i64 4 + %uglygep1112.cast = bitcast i8* %64 to i32* + %65 = load i32, i32* %uglygep1112.cast, align 4, !tbaa !2 + %66 = add nsw i32 %65, %52 + switch i32 %0, label %72 [ + i32 1, label %69 + i32 3, label %67 + ] + + 67: ; preds = %60 + %68 = add nuw nsw i32 %lsr19, 101 + br label %74 + + 69: ; preds = %60 + %70 = trunc i64 %63 to i32 + %71 = shl i32 
%70, 1 + br label %74 + + 72: ; preds = %60 + %73 = add i64 %39, %50 + %tmp = trunc i64 %73 to i32 + br label %74 + + 74: ; preds = %72, %69, %67 + %75 = phi i32 [ %tmp, %72 ], [ %68, %67 ], [ %71, %69 ] + %76 = add nsw i32 %75, %66 + %77 = getelementptr i8, i8* %43, i64 4 + %uglygep78.cast = bitcast i8* %77 to i32* + store i32 %76, i32* %uglygep78.cast, align 4, !tbaa !2 + %78 = add nuw nsw i64 %39, 2 + %79 = add i64 %78, -2 + %tmp18 = trunc i64 %79 to i32 + %80 = call i1 @llvm.loop.decrement.i64(i64 1) + br i1 %80, label %38, label %15 + } + + ; Function Attrs: noduplicate nounwind + declare void @llvm.set.loop.iterations.i64(i64) #1 + + ; Function Attrs: noduplicate nounwind + declare i1 @llvm.loop.decrement.i64(i64) #1 + + attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-spe" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { noduplicate nounwind } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 12.0.0"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + +... +--- +name: foo +alignment: 16 +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc } + - { id: 1, class: g8rc } + - { id: 2, class: g8rc } + - { id: 3, class: gprc } + - { id: 4, class: g8rc } + - { id: 5, class: gprc } + - { id: 6, class: gprc } + - { id: 7, class: gprc } + - { id: 8, class: gprc_and_gprc_nor0 } + - { id: 9, class: gprc } + - { id: 10, class: gprc } + - { id: 11, class: g8rc_and_g8rc_nox0 } + - { id: 12, class: gprc } + - { id: 13, class: g8rc_and_g8rc_nox0 } + - { id: 14, class: g8rc_and_g8rc_nox0 } + - { id: 15, class: g8rc_and_g8rc_nox0 } + - { id: 16, class: g8rc_and_g8rc_nox0 } + - { id: 17, class: g8rc_and_g8rc_nox0 } + - { id: 18, class: gprc_and_gprc_nor0 } + - { id: 19, class: g8rc } + - { id: 20, class: g8rc } + - { id: 21, class: gprc } + - { id: 22, class: gprc_and_gprc_nor0 } + - { id: 23, class: gprc } + - { id: 24, class: gprc } + - { id: 25, class: gprc } + - { id: 26, class: g8rc } + - { id: 27, class: gprc } + - { id: 28, class: gprc } + - { id: 29, class: gprc } + - { id: 30, class: gprc } + - { id: 31, class: gprc } + - { id: 32, class: g8rc } + - { id: 33, class: gprc_and_gprc_nor0 } + - { id: 34, class: g8rc } + - { id: 35, class: g8rc } + - { id: 36, class: g8rc_and_g8rc_nox0 } + - { id: 37, class: g8rc_and_g8rc_nox0 } + - { id: 38, class: g8rc } + - { id: 39, class: gprc } + - { id: 40, class: gprc } + - { id: 41, class: crrc } + - { id: 42, class: g8rc } + - { id: 43, class: gprc } + - { id: 44, class: gprc } + - { id: 45, class: g8rc } + - { id: 46, class: g8rc } + - { id: 47, class: crrc } + - { id: 48, class: g8rc } + - { id: 49, class: gprc } + - { id: 50, class: g8rc_and_g8rc_nox0 } + - { id: 51, class: g8rc } + - { id: 52, class: g8rc_and_g8rc_nox0 } + - { id: 53, class: g8rc } + - { id: 54, class: gprc } + - { id: 55, class: g8rc_and_g8rc_nox0 } + - { id: 56, class: gprc } + - { id: 57, class: gprc } + - { id: 58, class: gprc } + - { id: 59, class: gprc } + - { id: 
60, class: gprc } + - { id: 61, class: g8rc } + - { id: 62, class: g8rc } + - { id: 63, class: crrc } + - { id: 64, class: crrc } + - { id: 65, class: gprc } + - { id: 66, class: g8rc } + - { id: 67, class: gprc } + - { id: 68, class: gprc } + - { id: 69, class: crrc } + - { id: 70, class: crrc } + - { id: 71, class: gprc } + - { id: 72, class: g8rc } + - { id: 73, class: gprc } + - { id: 74, class: gprc_and_gprc_nor0 } + - { id: 75, class: crbitrc } + - { id: 76, class: g8rc } + - { id: 77, class: gprc } + - { id: 78, class: crrc } + - { id: 79, class: crrc } + - { id: 80, class: gprc } + - { id: 81, class: gprc } + - { id: 82, class: gprc } + - { id: 83, class: gprc } + - { id: 84, class: gprc } + - { id: 85, class: gprc } + - { id: 86, class: gprc } + - { id: 87, class: gprc } + - { id: 88, class: g8rc } + - { id: 89, class: g8rc } + - { id: 90, class: g8rc } + - { id: 91, class: gprc } + - { id: 92, class: gprc_nor0 } + - { id: 93, class: gprc } + - { id: 94, class: gprc_nor0 } + - { id: 95, class: crrc } +liveins: + - { reg: '$x3', virtual-reg: '%34' } + - { reg: '$x5', virtual-reg: '%36' } + - { reg: '$x6', virtual-reg: '%37' } + - { reg: '$x7', virtual-reg: '%38' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: foo + ; CHECK: bb.0 (%ir-block.5): + ; CHECK: successors: %bb.1(0x50000000), %bb.8(0x30000000) + ; CHECK: liveins: $x3, $x5, $x6, $x7 + ; CHECK: [[COPY:%[0-9]+]]:g8rc = COPY $x7 + ; CHECK: [[COPY1:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY $x6 + ; CHECK: [[COPY2:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY $x5 + ; CHECK: [[COPY3:%[0-9]+]]:g8rc = COPY $x3 + ; CHECK: [[COPY4:%[0-9]+]]:gprc = COPY [[COPY]].sub_32 + ; CHECK: [[CMPWI:%[0-9]+]]:crrc = CMPWI [[COPY4]], 1 + ; CHECK: BCC 12, killed [[CMPWI]], %bb.8 + ; CHECK: B %bb.1 + ; CHECK: bb.1 (%ir-block.7): + ; CHECK: successors: %bb.18(0x40000000), %bb.2(0x40000000) + ; CHECK: [[COPY5:%[0-9]+]]:gprc = COPY [[COPY3]].sub_32 + ; CHECK: [[DEF:%[0-9]+]]:g8rc = IMPLICIT_DEF + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:g8rc = INSERT_SUBREG [[DEF]], [[COPY4]], %subreg.sub_32 + ; CHECK: [[RLDICL:%[0-9]+]]:g8rc = RLDICL killed [[INSERT_SUBREG]], 0, 32 + ; CHECK: [[CMPLWI:%[0-9]+]]:crrc = CMPLWI [[COPY4]], 1 + ; CHECK: [[CMPLWI1:%[0-9]+]]:crrc = CMPLWI [[COPY5]], 3 + ; CHECK: BCC 68, killed [[CMPLWI]], %bb.2 + ; CHECK: bb.18: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: [[LI:%[0-9]+]]:gprc = LI 0 + ; CHECK: [[LI1:%[0-9]+]]:gprc = LI 100 + ; CHECK: [[LI8_:%[0-9]+]]:g8rc = LI8 0 + ; CHECK: B %bb.4 + ; CHECK: bb.2 (%ir-block.10): + ; CHECK: successors: %bb.9(0x80000000) + ; CHECK: [[RLWINM8_:%[0-9]+]]:g8rc_and_g8rc_nox0 = RLWINM8 [[RLDICL]], 0, 0, 30 + ; CHECK: [[ADDI8_:%[0-9]+]]:g8rc = ADDI8 [[COPY2]], -8 + ; CHECK: [[ADDI8_1:%[0-9]+]]:g8rc = ADDI8 [[COPY1]], -8 + ; CHECK: [[ADDI8_2:%[0-9]+]]:g8rc = nsw ADDI8 killed [[RLWINM8_]], -2 + ; CHECK: [[RLDICL1:%[0-9]+]]:g8rc_and_g8rc_nox0 = RLDICL [[ADDI8_2]], 63, 1 + ; CHECK: [[ADDI8_3:%[0-9]+]]:g8rc = nuw ADDI8 killed [[RLDICL1]], 1 + ; CHECK: MTCTR8loop killed [[ADDI8_3]], implicit-def dead $ctr8 + ; CHECK: [[LI2:%[0-9]+]]:gprc = LI 0 + ; CHECK: [[LI8_1:%[0-9]+]]:g8rc = LI8 0 + ; CHECK: [[LIS:%[0-9]+]]:gprc = LIS 34952 + ; CHECK: [[ORI:%[0-9]+]]:gprc = ORI [[LIS]], 34953 + ; CHECK: [[DEF1:%[0-9]+]]:g8rc = IMPLICIT_DEF + ; CHECK: [[CMPLWI2:%[0-9]+]]:crrc = CMPLWI [[COPY5]], 1 + ; CHECK: B %bb.9 + ; CHECK: bb.3 (%ir-block.15): + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: [[COPY6:%[0-9]+]]:gprc_and_gprc_nor0 = COPY %32.sub_32 + ; CHECK: 
[[ADDI:%[0-9]+]]:gprc_and_gprc_nor0 = ADDI [[COPY6]], -2 + ; CHECK: [[ADDI1:%[0-9]+]]:gprc = nuw ADDI [[ADDI]], 102 + ; CHECK: bb.4 (%ir-block.17): + ; CHECK: successors: %bb.8(0x40000000), %bb.5(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:g8rc = PHI [[LI8_]], %bb.18, %32, %bb.3 + ; CHECK: [[PHI1:%[0-9]+]]:gprc = PHI [[LI1]], %bb.18, [[ADDI1]], %bb.3 + ; CHECK: [[PHI2:%[0-9]+]]:gprc = PHI [[LI]], %bb.18, %27, %bb.3 + ; CHECK: [[ANDI8_rec:%[0-9]+]]:g8rc = ANDI8_rec [[RLDICL]], 1, implicit-def $cr0 + ; CHECK: [[COPY7:%[0-9]+]]:crbitrc = COPY $cr0gt + ; CHECK: BCn killed [[COPY7]], %bb.8 + ; CHECK: B %bb.5 + ; CHECK: bb.5 (%ir-block.23): + ; CHECK: successors: %bb.7(0x2aaaaaab), %bb.6(0x55555555) + ; CHECK: [[RLDICR:%[0-9]+]]:g8rc = RLDICR [[PHI]], 2, 61 + ; CHECK: [[LWZX:%[0-9]+]]:gprc = LWZX [[COPY2]], [[RLDICR]] :: (load 4 from %ir.24, !tbaa !2) + ; CHECK: [[ADD4_:%[0-9]+]]:gprc = nsw ADD4 killed [[LWZX]], [[PHI2]] + ; CHECK: BCC 76, [[CMPLWI1]], %bb.7 + ; CHECK: B %bb.6 + ; CHECK: bb.6 (%ir-block.23): + ; CHECK: successors: %bb.7(0x80000000) + ; CHECK: [[CMPLWI3:%[0-9]+]]:crrc = CMPLWI [[COPY5]], 1 + ; CHECK: [[COPY8:%[0-9]+]]:gprc = COPY [[PHI]].sub_32 + ; CHECK: [[LIS1:%[0-9]+]]:gprc = LIS 34952 + ; CHECK: [[ORI1:%[0-9]+]]:gprc = ORI killed [[LIS1]], 34953 + ; CHECK: [[MULHWU:%[0-9]+]]:gprc = MULHWU [[COPY8]], killed [[ORI1]] + ; CHECK: [[RLWINM:%[0-9]+]]:gprc = RLWINM [[MULHWU]], 28, 4, 31 + ; CHECK: [[MULLI:%[0-9]+]]:gprc = MULLI killed [[RLWINM]], 30 + ; CHECK: [[SUBF:%[0-9]+]]:gprc = SUBF killed [[MULLI]], [[COPY8]] + ; CHECK: [[COPY9:%[0-9]+]]:gprc = COPY [[PHI]].sub_32 + ; CHECK: [[RLWINM1:%[0-9]+]]:gprc_and_gprc_nor0 = RLWINM [[COPY9]], 1, 0, 30 + ; CHECK: [[ISEL:%[0-9]+]]:gprc = ISEL [[RLWINM1]], [[SUBF]], [[CMPLWI3]].sub_eq + ; CHECK: B %bb.7 + ; CHECK: bb.7 (%ir-block.33): + ; CHECK: successors: %bb.8(0x80000000) + ; CHECK: [[PHI3:%[0-9]+]]:gprc = PHI [[PHI1]], %bb.5, [[ISEL]], %bb.6 + ; CHECK: [[ADD4_1:%[0-9]+]]:gprc = nsw ADD4 [[PHI3]], [[ADD4_]] + ; CHECK: STWX killed [[ADD4_1]], [[COPY1]], [[RLDICR]] :: (store 4 into %ir.36, !tbaa !2) + ; CHECK: bb.8 (%ir-block.37): + ; CHECK: [[LI8_2:%[0-9]+]]:g8rc = LI8 0 + ; CHECK: $x3 = COPY [[LI8_2]] + ; CHECK: BLR8 implicit $lr8, implicit $rm, implicit $x3 + ; CHECK: bb.9 (%ir-block.38): + ; CHECK: successors: %bb.11(0x2aaaaaab), %bb.10(0x55555555) + ; CHECK: [[PHI4:%[0-9]+]]:g8rc_and_g8rc_nox0 = PHI [[LI8_1]], %bb.2, %32, %bb.17 + ; CHECK: [[PHI5:%[0-9]+]]:gprc = PHI [[LI2]], %bb.2, %27, %bb.17 + ; CHECK: [[PHI6:%[0-9]+]]:g8rc_and_g8rc_nox0 = PHI [[ADDI8_]], %bb.2, %55, %bb.17 + ; CHECK: [[PHI7:%[0-9]+]]:g8rc_and_g8rc_nox0 = PHI [[ADDI8_1]], %bb.2, %15, %bb.17 + ; CHECK: [[ADDI8_4:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[PHI7]], 8 + ; CHECK: [[LWZU:%[0-9]+]]:gprc, [[LWZU1:%[0-9]+]]:g8rc_and_g8rc_nox0 = LWZU 8, [[PHI6]] :: (load 4 from %ir.46, !tbaa !2) + ; CHECK: [[COPY10:%[0-9]+]]:gprc_and_gprc_nor0 = COPY [[PHI4]].sub_32 + ; CHECK: [[MULHWU1:%[0-9]+]]:gprc = MULHWU [[COPY10]], [[ORI]] + ; CHECK: [[RLWINM2:%[0-9]+]]:gprc = RLWINM [[MULHWU1]], 28, 4, 31 + ; CHECK: [[MULLI1:%[0-9]+]]:gprc = nsw MULLI killed [[RLWINM2]], -30 + ; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:g8rc = INSERT_SUBREG [[DEF1]], killed [[MULLI1]], %subreg.sub_32 + ; CHECK: [[RLDICL2:%[0-9]+]]:g8rc = RLDICL killed [[INSERT_SUBREG1]], 0, 32 + ; CHECK: [[ADD4_2:%[0-9]+]]:gprc = nsw ADD4 killed [[LWZU]], [[PHI5]] + ; CHECK: BCC 76, [[CMPLWI1]], %bb.11 + ; CHECK: B %bb.10 + ; CHECK: bb.10 (%ir-block.38): + ; CHECK: successors: %bb.12(0x80000000) + ; CHECK: [[ADD8_:%[0-9]+]]:g8rc = 
ADD8 [[PHI4]], [[RLDICL2]] + ; CHECK: [[COPY11:%[0-9]+]]:gprc = COPY [[ADD8_]].sub_32 + ; CHECK: [[COPY12:%[0-9]+]]:gprc = COPY [[PHI4]].sub_32 + ; CHECK: [[RLWINM3:%[0-9]+]]:gprc_and_gprc_nor0 = RLWINM [[COPY12]], 1, 0, 30 + ; CHECK: [[ISEL1:%[0-9]+]]:gprc = ISEL [[RLWINM3]], [[COPY11]], [[CMPLWI2]].sub_eq + ; CHECK: B %bb.12 + ; CHECK: bb.11 (%ir-block.56): + ; CHECK: successors: %bb.12(0x80000000) + ; CHECK: [[ADDI2:%[0-9]+]]:gprc = nuw nsw ADDI [[COPY10]], 100 + ; CHECK: B %bb.12 + ; CHECK: bb.12 (%ir-block.60): + ; CHECK: successors: %bb.15(0x2aaaaaab), %bb.13(0x55555555) + ; CHECK: [[PHI8:%[0-9]+]]:gprc = PHI [[ADDI2]], %bb.11, [[ISEL1]], %bb.10 + ; CHECK: [[COPY13:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY [[ADDI8_4]] + ; CHECK: [[ADD4_3:%[0-9]+]]:gprc = nsw ADD4 [[PHI8]], [[ADD4_2]] + ; CHECK: STW killed [[ADD4_3]], 0, [[ADDI8_4]] :: (store 4 into %ir.44, !tbaa !2) + ; CHECK: [[LWZ:%[0-9]+]]:gprc = LWZ 4, [[LWZU1]] :: (load 4 from %ir.uglygep1112.cast, !tbaa !2) + ; CHECK: [[ADD4_4:%[0-9]+]]:gprc = nsw ADD4 killed [[LWZ]], [[ADD4_2]] + ; CHECK: BCC 76, [[CMPLWI2]], %bb.15 + ; CHECK: B %bb.13 + ; CHECK: bb.13 (%ir-block.60): + ; CHECK: successors: %bb.14(0x40000001), %bb.16(0x3fffffff) + ; CHECK: BCC 68, [[CMPLWI1]], %bb.16 + ; CHECK: B %bb.14 + ; CHECK: bb.14 (%ir-block.67): + ; CHECK: successors: %bb.17(0x80000000) + ; CHECK: [[ADDI3:%[0-9]+]]:gprc = nuw nsw ADDI [[COPY10]], 101 + ; CHECK: B %bb.17 + ; CHECK: bb.15 (%ir-block.69): + ; CHECK: successors: %bb.17(0x80000000) + ; CHECK: [[ORI8_:%[0-9]+]]:g8rc = ORI8 [[PHI4]], 1 + ; CHECK: [[COPY14:%[0-9]+]]:gprc = COPY [[ORI8_]].sub_32 + ; CHECK: [[RLWINM4:%[0-9]+]]:gprc = RLWINM [[COPY14]], 1, 0, 30 + ; CHECK: B %bb.17 + ; CHECK: bb.16 (%ir-block.72): + ; CHECK: successors: %bb.17(0x80000000) + ; CHECK: [[ORI8_1:%[0-9]+]]:g8rc = ORI8 [[RLDICL2]], 1 + ; CHECK: [[ADD8_1:%[0-9]+]]:g8rc = ADD8 [[PHI4]], [[ORI8_1]] + ; CHECK: [[COPY15:%[0-9]+]]:gprc = COPY [[ADD8_1]].sub_32 + ; CHECK: bb.17 (%ir-block.74): + ; CHECK: successors: %bb.9(0x7c000000), %bb.3(0x04000000) + ; CHECK: [[PHI9:%[0-9]+]]:gprc = PHI [[ADDI3]], %bb.14, [[RLWINM4]], %bb.15, [[COPY15]], %bb.16 + ; CHECK: [[ADD4_5:%[0-9]+]]:gprc = nsw ADD4 [[PHI9]], [[ADD4_4]] + ; CHECK: STW killed [[ADD4_5]], 4, [[COPY13]] :: (store 4 into %ir.uglygep78.cast, !tbaa !2) + ; CHECK: [[ADDI8_5:%[0-9]+]]:g8rc = nuw nsw ADDI8 [[PHI4]], 2 + ; CHECK: BDNZ8 %bb.9, implicit-def dead $ctr8, implicit $ctr8 + ; CHECK: B %bb.3 + bb.0 (%ir-block.5): + successors: %bb.1(0x50000000), %bb.9(0x30000000) + liveins: $x3, $x5, $x6, $x7 + + %38:g8rc = COPY $x7 + %37:g8rc_and_g8rc_nox0 = COPY $x6 + %36:g8rc_and_g8rc_nox0 = COPY $x5 + %34:g8rc = COPY $x3 + %39:gprc = COPY %34.sub_32 + %40:gprc = COPY %38.sub_32 + %41:crrc = CMPWI %40, 1 + BCC 12, killed %41, %bb.9 + B %bb.1 + + bb.1 (%ir-block.7): + %46:g8rc = IMPLICIT_DEF + %45:g8rc = INSERT_SUBREG %46, %40, %subreg.sub_32 + %0:g8rc = RLDICL killed %45, 0, 32 + %44:gprc = LI 0 + %43:gprc = LI 100 + %42:g8rc = LI8 0 + %47:crrc = CMPLWI %40, 1 + %95:crrc = CMPLWI %39, 3 + BCC 76, killed %47, %bb.4 + B %bb.2 + + bb.2 (%ir-block.10): + %50:g8rc_and_g8rc_nox0 = RLWINM8 %0, 0, 0, 30 + %1:g8rc = ADDI8 %36, -8 + %2:g8rc = ADDI8 %37, -8 + %51:g8rc = nsw ADDI8 killed %50, -2 + %52:g8rc_and_g8rc_nox0 = RLDICL %51, 63, 1 + %53:g8rc = nuw ADDI8 killed %52, 1 + MTCTR8loop killed %53, implicit-def dead $ctr8 + %49:gprc = LI 0 + %48:g8rc = LI8 0 + %56:gprc = LIS 34952 + %57:gprc = ORI %56, 34953 + %62:g8rc = IMPLICIT_DEF + %69:crrc = CMPLWI %39, 1 + B %bb.10 + + bb.3 (%ir-block.15): 
+ %3:gprc = nuw ADDI %33, 102 + + bb.4 (%ir-block.17): + %4:g8rc = PHI %42, %bb.1, %32, %bb.3 + %5:gprc = PHI %43, %bb.1, %3, %bb.3 + %6:gprc = PHI %44, %bb.1, %27, %bb.3 + %90:g8rc = ANDI8_rec %0, 1, implicit-def $cr0 + %75:crbitrc = COPY $cr0gt + BCn killed %75, %bb.9 + B %bb.5 + + bb.5 (%ir-block.23): + successors: %bb.8(0x2aaaaaab), %bb.21(0x55555555) + + %76:g8rc = RLDICR %4, 2, 61 + %77:gprc = LWZX %36, %76 :: (load 4 from %ir.24, !tbaa !2) + %7:gprc = nsw ADD4 killed %77, %6 + BCC 76, %95, %bb.8 + B %bb.21 + + bb.21 (%ir-block.23): + %79:crrc = CMPLWI %39, 1 + %81:gprc = COPY %4.sub_32 + %82:gprc = LIS 34952 + %83:gprc = ORI killed %82, 34953 + %84:gprc = MULHWU %81, killed %83 + %85:gprc = RLWINM %84, 28, 4, 31 + %86:gprc = MULLI killed %85, 30 + %9:gprc = SUBF killed %86, %81 + %80:gprc = COPY %4.sub_32 + %8:gprc_and_gprc_nor0 = RLWINM %80, 1, 0, 30 + %91:gprc = ISEL %8, %9, %79.sub_eq + B %bb.8 + + bb.8 (%ir-block.33): + %10:gprc = PHI %5, %bb.5, %91, %bb.21 + %87:gprc = nsw ADD4 %10, %7 + STWX killed %87, %37, %76 :: (store 4 into %ir.36, !tbaa !2) + + bb.9 (%ir-block.37): + %89:g8rc = LI8 0 + $x3 = COPY %89 + BLR8 implicit $lr8, implicit $rm, implicit $x3 + + bb.10 (%ir-block.38): + successors: %bb.12(0x2aaaaaab), %bb.19(0x55555555) + + %11:g8rc_and_g8rc_nox0 = PHI %48, %bb.2, %32, %bb.18 + %12:gprc = PHI %49, %bb.2, %27, %bb.18 + %13:g8rc_and_g8rc_nox0 = PHI %1, %bb.2, %17, %bb.18 + %14:g8rc_and_g8rc_nox0 = PHI %2, %bb.2, %15, %bb.18 + %16:g8rc_and_g8rc_nox0 = ADDI8 %14, 8 + %15:g8rc_and_g8rc_nox0 = COPY %16 + %54:gprc, %55:g8rc_and_g8rc_nox0 = LWZU 8, %13 :: (load 4 from %ir.46, !tbaa !2) + %17:g8rc_and_g8rc_nox0 = COPY %55 + %18:gprc_and_gprc_nor0 = COPY %11.sub_32 + %58:gprc = MULHWU %18, %57 + %59:gprc = RLWINM %58, 28, 4, 31 + %60:gprc = nsw MULLI killed %59, -30 + %61:g8rc = INSERT_SUBREG %62, killed %60, %subreg.sub_32 + %19:g8rc = RLDICL killed %61, 0, 32 + %20:g8rc = ORI8 %19, 1 + %21:gprc = nsw ADD4 killed %54, %12 + BCC 76, %95, %bb.12 + B %bb.19 + + bb.19 (%ir-block.38): + %66:g8rc = ADD8 %11, %19 + %24:gprc = COPY %66.sub_32 + %65:gprc = COPY %11.sub_32 + %22:gprc_and_gprc_nor0 = RLWINM %65, 1, 0, 30 + %93:gprc = ISEL %22, %24, %69.sub_eq + B %bb.14 + + bb.12 (%ir-block.56): + %23:gprc = nuw nsw ADDI %18, 100 + B %bb.14 + + bb.14 (%ir-block.60): + successors: %bb.16(0x2aaaaaab), %bb.20(0x55555555) + + %25:gprc = PHI %23, %bb.12, %93, %bb.19 + %67:gprc = nsw ADD4 %25, %21 + STW killed %67, 0, %16 :: (store 4 into %ir.44, !tbaa !2) + %26:g8rc = ORI8 %11, 1 + %68:gprc = LWZ 4, %17 :: (load 4 from %ir.uglygep1112.cast, !tbaa !2) + %27:gprc = nsw ADD4 killed %68, %21 + BCC 76, %69, %bb.16 + B %bb.20 + + bb.20 (%ir-block.60): + successors: %bb.15(0x40000001), %bb.17(0x3fffffff) + + BCC 68, %95, %bb.17 + B %bb.15 + + bb.15 (%ir-block.67): + %28:gprc = nuw nsw ADDI %18, 101 + B %bb.18 + + bb.16 (%ir-block.69): + %71:gprc = COPY %26.sub_32 + %29:gprc = RLWINM %71, 1, 0, 30 + B %bb.18 + + bb.17 (%ir-block.72): + %72:g8rc = ADD8 %11, %20 + %30:gprc = COPY %72.sub_32 + + bb.18 (%ir-block.74): + successors: %bb.10(0x7c000000), %bb.3(0x04000000) + + %31:gprc = PHI %28, %bb.15, %29, %bb.16, %30, %bb.17 + %73:gprc = nsw ADD4 %31, %27 + STW killed %73, 4, %15 :: (store 4 into %ir.uglygep78.cast, !tbaa !2) + %32:g8rc = nuw nsw ADDI8 %11, 2 + %74:gprc_and_gprc_nor0 = COPY %32.sub_32 + %33:gprc_and_gprc_nor0 = ADDI killed %74, -2 + BDNZ8 %bb.10, implicit-def dead $ctr8, implicit $ctr8 + B %bb.3 + +... 
diff --git a/llvm/test/CodeGen/PowerPC/store_fptoi.ll b/llvm/test/CodeGen/PowerPC/store_fptoi.ll index e4f47ab7628fd..1e5b8414243b1 100644 --- a/llvm/test/CodeGen/PowerPC/store_fptoi.ll +++ b/llvm/test/CodeGen/PowerPC/store_fptoi.ll @@ -7,6 +7,82 @@ ; Tests for store of fp_to_sint converstions ; ========================================== +; Function Attrs: norecurse nounwind +define void @qpConv2sdw(fp128* nocapture readonly %a, i64* nocapture %b) { +entry: + %0 = load fp128, fp128* %a, align 16 + %conv = fptosi fp128 %0 to i64 + store i64 %conv, i64* %b, align 8 + ret void + +; CHECK-LABEL: qpConv2sdw +; CHECK: lxv [[LD:[0-9]+]], 0(3) +; CHECK-NEXT: xscvqpsdz [[CONV:[0-9]+]], [[LD]] +; CHECK-NEXT: stxsd [[CONV]], 0(4) +; CHECK-NEXT: blr + +; CHECK-PWR8-LABEL: qpConv2sdw +; CHECK-PWR8: bl __fixkfdi +; CHECK-PWR8: blr +} + +; Function Attrs: norecurse nounwind +define void @qpConv2sw(fp128* nocapture readonly %a, i32* nocapture %b) { +entry: + %0 = load fp128, fp128* %a, align 16 + %conv = fptosi fp128 %0 to i32 + store i32 %conv, i32* %b, align 4 + ret void + +; CHECK-LABEL: qpConv2sw +; CHECK: lxv [[LD:[0-9]+]], 0(3) +; CHECK-NEXT: xscvqpswz [[CONV:[0-9]+]], [[LD]] +; CHECK-NEXT: stxsiwx [[CONV]], 0, 4 +; CHECK-NEXT: blr + +; CHECK-PWR8-LABEL: qpConv2sw +; CHECK-PWR8: bl __fixkfsi +; CHECK-PWR8: blr +} + +; Function Attrs: norecurse nounwind +define void @qpConv2udw(fp128* nocapture readonly %a, i64* nocapture %b) { +entry: + %0 = load fp128, fp128* %a, align 16 + %conv = fptoui fp128 %0 to i64 + store i64 %conv, i64* %b, align 8 + ret void + +; CHECK-LABEL: qpConv2udw +; CHECK: lxv [[LD:[0-9]+]], 0(3) +; CHECK-NEXT: xscvqpudz [[CONV:[0-9]+]], [[LD]] +; CHECK-NEXT: stxsd [[CONV]], 0(4) +; CHECK-NEXT: blr + +; CHECK-PWR8-LABEL: qpConv2udw +; CHECK-PWR8: bl __fixunskfdi +; CHECK-PWR8: blr +} + +; Function Attrs: norecurse nounwind +define void @qpConv2uw(fp128* nocapture readonly %a, i32* nocapture %b) { +entry: + %0 = load fp128, fp128* %a, align 16 + %conv = fptoui fp128 %0 to i32 + store i32 %conv, i32* %b, align 4 + ret void + +; CHECK-LABEL: qpConv2uw +; CHECK: lxv [[LD:[0-9]+]], 0(3) +; CHECK-NEXT: xscvqpuwz [[CONV:[0-9]+]], [[LD]] +; CHECK-NEXT: stxsiwx [[CONV]], 0, 4 +; CHECK-NEXT: blr + +; CHECK-PWR8-LABEL: qpConv2uw +; CHECK-PWR8: bl __fixunskfsi +; CHECK-PWR8: blr +} + ; Function Attrs: norecurse nounwind define void @dpConv2sdw(double* nocapture readonly %a, i64* nocapture %b) { entry: diff --git a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll index 1acf71e8f1597..21fc855aa8547 100644 --- a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll @@ -4899,19 +4899,50 @@ entry: define <2 x double> @constrained_vector_nearbyint_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_nearbyint_v2f64: ; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -64(1) ; PC64LE-NEXT: addis 3, 2, .LCPI81_0@toc@ha -; PC64LE-NEXT: addi 3, 3, .LCPI81_0@toc@l -; PC64LE-NEXT: lxvd2x 0, 0, 3 -; PC64LE-NEXT: xxswapd 0, 0 -; PC64LE-NEXT: xvrdpic 34, 0 +; PC64LE-NEXT: lfd 1, .LCPI81_0@toc@l(3) +; PC64LE-NEXT: bl nearbyint +; PC64LE-NEXT: nop +; PC64LE-NEXT: li 3, 48 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE-NEXT: stxvd2x 1, 1, 3 # 16-byte Folded Spill +; PC64LE-NEXT: addis 3, 2, .LCPI81_1@toc@ha +; PC64LE-NEXT: lfs 1, .LCPI81_1@toc@l(3) +; PC64LE-NEXT: bl nearbyint +; PC64LE-NEXT: nop +; 
PC64LE-NEXT: li 3, 48 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE-NEXT: lxvd2x 0, 1, 3 # 16-byte Folded Reload +; PC64LE-NEXT: xxmrghd 34, 1, 0 +; PC64LE-NEXT: addi 1, 1, 64 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 ; PC64LE-NEXT: blr ; ; PC64LE9-LABEL: constrained_vector_nearbyint_v2f64: ; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -48(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI81_0@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI81_0@toc@l -; PC64LE9-NEXT: lxvx 0, 0, 3 -; PC64LE9-NEXT: xvrdpic 34, 0 +; PC64LE9-NEXT: lfd 1, .LCPI81_0@toc@l(3) +; PC64LE9-NEXT: bl nearbyint +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addis 3, 2, .LCPI81_1@toc@ha +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfs 1, .LCPI81_1@toc@l(3) +; PC64LE9-NEXT: bl nearbyint +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: xxmrghd 34, 1, 0 +; PC64LE9-NEXT: addi 1, 1, 48 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 ; PC64LE9-NEXT: blr entry: %nearby = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64( @@ -5010,31 +5041,72 @@ entry: define <3 x double> @constrained_vector_nearby_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_nearby_v3f64: ; PC64LE: # %bb.0: # %entry -; PC64LE-NEXT: addis 3, 2, .LCPI83_1@toc@ha -; PC64LE-NEXT: addi 3, 3, .LCPI83_1@toc@l -; PC64LE-NEXT: lxvd2x 0, 0, 3 +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -80(1) +; PC64LE-NEXT: li 3, 64 +; PC64LE-NEXT: stxvd2x 63, 1, 3 # 16-byte Folded Spill ; PC64LE-NEXT: addis 3, 2, .LCPI83_0@toc@ha ; PC64LE-NEXT: lfd 1, .LCPI83_0@toc@l(3) -; PC64LE-NEXT: xxswapd 0, 0 -; PC64LE-NEXT: xsrdpic 3, 1 -; PC64LE-NEXT: xvrdpic 2, 0 -; PC64LE-NEXT: xxswapd 1, 2 -; PC64LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2 -; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PC64LE-NEXT: bl nearbyint +; PC64LE-NEXT: nop +; PC64LE-NEXT: li 3, 48 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE-NEXT: stxvd2x 1, 1, 3 # 16-byte Folded Spill +; PC64LE-NEXT: addis 3, 2, .LCPI83_1@toc@ha +; PC64LE-NEXT: lfs 1, .LCPI83_1@toc@l(3) +; PC64LE-NEXT: bl nearbyint +; PC64LE-NEXT: nop +; PC64LE-NEXT: li 3, 48 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE-NEXT: lxvd2x 0, 1, 3 # 16-byte Folded Reload +; PC64LE-NEXT: addis 3, 2, .LCPI83_2@toc@ha +; PC64LE-NEXT: xxmrghd 63, 0, 1 +; PC64LE-NEXT: lfd 1, .LCPI83_2@toc@l(3) +; PC64LE-NEXT: bl nearbyint +; PC64LE-NEXT: nop +; PC64LE-NEXT: xxswapd 0, 63 +; PC64LE-NEXT: li 3, 64 +; PC64LE-NEXT: xxlor 2, 63, 63 +; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload +; PC64LE-NEXT: fmr 3, 1 +; PC64LE-NEXT: fmr 1, 0 +; PC64LE-NEXT: addi 1, 1, 80 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 ; PC64LE-NEXT: blr ; ; PC64LE9-LABEL: constrained_vector_nearby_v3f64: ; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI83_0@toc@ha -; PC64LE9-NEXT: lfd 0, .LCPI83_0@toc@l(3) +; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI83_0@toc@l(3) +; PC64LE9-NEXT: bl nearbyint +; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI83_1@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI83_1@toc@l -; PC64LE9-NEXT: xsrdpic 3, 0 -; PC64LE9-NEXT: lxvx 0, 0, 3 -; PC64LE9-NEXT: xvrdpic 2, 0 -; PC64LE9-NEXT: xxswapd 1, 2 +; 
PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfs 1, .LCPI83_1@toc@l(3) +; PC64LE9-NEXT: bl nearbyint +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload +; PC64LE9-NEXT: addis 3, 2, .LCPI83_2@toc@ha +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: xxmrghd 63, 0, 1 +; PC64LE9-NEXT: lfd 1, .LCPI83_2@toc@l(3) +; PC64LE9-NEXT: bl nearbyint +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: fmr 3, 1 +; PC64LE9-NEXT: xxswapd 1, 63 +; PC64LE9-NEXT: xscpsgndp 2, 63, 63 +; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload ; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1 -; PC64LE9-NEXT: # kill: def $f2 killed $f2 killed $vsl2 +; PC64LE9-NEXT: addi 1, 1, 64 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 ; PC64LE9-NEXT: blr entry: %nearby = call <3 x double> @llvm.experimental.constrained.nearbyint.v3f64( @@ -5047,28 +5119,86 @@ entry: define <4 x double> @constrained_vector_nearbyint_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_nearbyint_v4f64: ; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -80(1) +; PC64LE-NEXT: li 3, 64 +; PC64LE-NEXT: stxvd2x 63, 1, 3 # 16-byte Folded Spill ; PC64LE-NEXT: addis 3, 2, .LCPI84_0@toc@ha -; PC64LE-NEXT: addis 4, 2, .LCPI84_1@toc@ha -; PC64LE-NEXT: addi 3, 3, .LCPI84_0@toc@l -; PC64LE-NEXT: lxvd2x 0, 0, 3 -; PC64LE-NEXT: addi 3, 4, .LCPI84_1@toc@l -; PC64LE-NEXT: lxvd2x 1, 0, 3 -; PC64LE-NEXT: xxswapd 0, 0 -; PC64LE-NEXT: xxswapd 1, 1 -; PC64LE-NEXT: xvrdpic 35, 0 -; PC64LE-NEXT: xvrdpic 34, 1 +; PC64LE-NEXT: lfd 1, .LCPI84_0@toc@l(3) +; PC64LE-NEXT: bl nearbyint +; PC64LE-NEXT: nop +; PC64LE-NEXT: li 3, 48 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE-NEXT: stxvd2x 1, 1, 3 # 16-byte Folded Spill +; PC64LE-NEXT: addis 3, 2, .LCPI84_1@toc@ha +; PC64LE-NEXT: lfd 1, .LCPI84_1@toc@l(3) +; PC64LE-NEXT: bl nearbyint +; PC64LE-NEXT: nop +; PC64LE-NEXT: li 3, 48 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE-NEXT: lxvd2x 0, 1, 3 # 16-byte Folded Reload +; PC64LE-NEXT: addis 3, 2, .LCPI84_2@toc@ha +; PC64LE-NEXT: xxmrghd 63, 1, 0 +; PC64LE-NEXT: lfd 1, .LCPI84_2@toc@l(3) +; PC64LE-NEXT: bl nearbyint +; PC64LE-NEXT: nop +; PC64LE-NEXT: li 3, 48 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE-NEXT: stxvd2x 1, 1, 3 # 16-byte Folded Spill +; PC64LE-NEXT: addis 3, 2, .LCPI84_3@toc@ha +; PC64LE-NEXT: lfd 1, .LCPI84_3@toc@l(3) +; PC64LE-NEXT: bl nearbyint +; PC64LE-NEXT: nop +; PC64LE-NEXT: li 3, 48 +; PC64LE-NEXT: vmr 2, 31 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE-NEXT: lxvd2x 0, 1, 3 # 16-byte Folded Reload +; PC64LE-NEXT: li 3, 64 +; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload +; PC64LE-NEXT: xxmrghd 35, 1, 0 +; PC64LE-NEXT: addi 1, 1, 80 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 ; PC64LE-NEXT: blr ; ; PC64LE9-LABEL: constrained_vector_nearbyint_v4f64: ; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI84_0@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI84_0@toc@l -; PC64LE9-NEXT: lxvx 0, 0, 3 +; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI84_0@toc@l(3) +; PC64LE9-NEXT: bl nearbyint +; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI84_1@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI84_1@toc@l -; PC64LE9-NEXT: xvrdpic 35, 0 -; PC64LE9-NEXT: lxvx 0, 0, 3 -; PC64LE9-NEXT: xvrdpic 34, 0 +; 
PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI84_1@toc@l(3) +; PC64LE9-NEXT: bl nearbyint +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload +; PC64LE9-NEXT: addis 3, 2, .LCPI84_2@toc@ha +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: xxmrghd 63, 1, 0 +; PC64LE9-NEXT: lfd 1, .LCPI84_2@toc@l(3) +; PC64LE9-NEXT: bl nearbyint +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addis 3, 2, .LCPI84_3@toc@ha +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI84_3@toc@l(3) +; PC64LE9-NEXT: bl nearbyint +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload +; PC64LE9-NEXT: vmr 2, 31 +; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: xxmrghd 35, 1, 0 +; PC64LE9-NEXT: addi 1, 1, 64 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 ; PC64LE9-NEXT: blr entry: %nearby = call <4 x double> @llvm.experimental.constrained.nearbyint.v4f64( @@ -7038,19 +7168,19 @@ define <3 x double> @constrained_vector_fpext_v3f32() #0 { ; PC64LE-NEXT: addis 3, 2, .LCPI133_0@toc@ha ; PC64LE-NEXT: addis 4, 2, .LCPI133_1@toc@ha ; PC64LE-NEXT: addis 5, 2, .LCPI133_2@toc@ha -; PC64LE-NEXT: lfs 1, .LCPI133_0@toc@l(3) +; PC64LE-NEXT: lfs 3, .LCPI133_0@toc@l(3) ; PC64LE-NEXT: lfs 2, .LCPI133_1@toc@l(4) -; PC64LE-NEXT: lfs 3, .LCPI133_2@toc@l(5) +; PC64LE-NEXT: lfs 1, .LCPI133_2@toc@l(5) ; PC64LE-NEXT: blr ; ; PC64LE9-LABEL: constrained_vector_fpext_v3f32: ; PC64LE9: # %bb.0: # %entry ; PC64LE9-NEXT: addis 3, 2, .LCPI133_0@toc@ha -; PC64LE9-NEXT: lfs 1, .LCPI133_0@toc@l(3) +; PC64LE9-NEXT: lfs 3, .LCPI133_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI133_1@toc@ha ; PC64LE9-NEXT: lfs 2, .LCPI133_1@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI133_2@toc@ha -; PC64LE9-NEXT: lfs 3, .LCPI133_2@toc@l(3) +; PC64LE9-NEXT: lfs 1, .LCPI133_2@toc@l(3) ; PC64LE9-NEXT: blr entry: %result = call <3 x double> @llvm.experimental.constrained.fpext.v3f64.v3f32( diff --git a/llvm/test/CodeGen/PowerPC/vsx.ll b/llvm/test/CodeGen/PowerPC/vsx.ll index 4a78218262ca0..39469d63b9078 100644 --- a/llvm/test/CodeGen/PowerPC/vsx.ll +++ b/llvm/test/CodeGen/PowerPC/vsx.ll @@ -1548,8 +1548,8 @@ define <2 x i64> @test46(<2 x float> %a) { ; CHECK-FISL-NEXT: ld r3, -24(r1) ; CHECK-FISL-NEXT: std r3, -16(r1) ; CHECK-FISL-NEXT: addi r3, r1, -16 -; CHECK-FISL-NEXT: lxvd2x vs0, 0, r3 -; CHECK-FISL-NEXT: xxlor v2, vs0, vs0 +; CHECK-FISL-NEXT: lxvd2x vs1, 0, r3 +; CHECK-FISL-NEXT: xxlor v2, vs1, vs1 ; CHECK-FISL-NEXT: blr ; ; CHECK-LE-LABEL: test46: @@ -1616,8 +1616,8 @@ define <2 x i64> @test47(<2 x float> %a) { ; CHECK-FISL-NEXT: ld r3, -24(r1) ; CHECK-FISL-NEXT: std r3, -16(r1) ; CHECK-FISL-NEXT: addi r3, r1, -16 -; CHECK-FISL-NEXT: lxvd2x vs0, 0, r3 -; CHECK-FISL-NEXT: xxlor v2, vs0, vs0 +; CHECK-FISL-NEXT: lxvd2x vs1, 0, r3 +; CHECK-FISL-NEXT: xxlor v2, vs1, vs1 ; CHECK-FISL-NEXT: blr ; ; CHECK-LE-LABEL: test47: @@ -1859,13 +1859,13 @@ define <2 x i64> @test60(<2 x i64> %a, <2 x i64> %b) { ; CHECK-FISL-NEXT: stxvd2x v3, 0, r3 ; CHECK-FISL-NEXT: addi r3, r1, -48 ; CHECK-FISL-NEXT: stxvd2x v2, 0, r3 -; CHECK-FISL-NEXT: lwz r3, -20(r1) -; CHECK-FISL-NEXT: ld r4, -40(r1) -; CHECK-FISL-NEXT: sld r3, r4, r3 +; CHECK-FISL-NEXT: lwz r4, -20(r1) +; CHECK-FISL-NEXT: ld r3, -40(r1) +; CHECK-FISL-NEXT: sld r3, r3, r4 ; CHECK-FISL-NEXT: std r3, -8(r1) -; CHECK-FISL-NEXT: lwz r3, -28(r1) 
-; CHECK-FISL-NEXT: ld r4, -48(r1) -; CHECK-FISL-NEXT: sld r3, r4, r3 +; CHECK-FISL-NEXT: lwz r4, -28(r1) +; CHECK-FISL-NEXT: ld r3, -48(r1) +; CHECK-FISL-NEXT: sld r3, r3, r4 ; CHECK-FISL-NEXT: std r3, -16(r1) ; CHECK-FISL-NEXT: addi r3, r1, -16 ; CHECK-FISL-NEXT: lxvd2x vs0, 0, r3 @@ -1925,13 +1925,13 @@ define <2 x i64> @test61(<2 x i64> %a, <2 x i64> %b) { ; CHECK-FISL-NEXT: stxvd2x v3, 0, r3 ; CHECK-FISL-NEXT: addi r3, r1, -48 ; CHECK-FISL-NEXT: stxvd2x v2, 0, r3 -; CHECK-FISL-NEXT: lwz r3, -20(r1) -; CHECK-FISL-NEXT: ld r4, -40(r1) -; CHECK-FISL-NEXT: srd r3, r4, r3 +; CHECK-FISL-NEXT: lwz r4, -20(r1) +; CHECK-FISL-NEXT: ld r3, -40(r1) +; CHECK-FISL-NEXT: srd r3, r3, r4 ; CHECK-FISL-NEXT: std r3, -8(r1) -; CHECK-FISL-NEXT: lwz r3, -28(r1) -; CHECK-FISL-NEXT: ld r4, -48(r1) -; CHECK-FISL-NEXT: srd r3, r4, r3 +; CHECK-FISL-NEXT: lwz r4, -28(r1) +; CHECK-FISL-NEXT: ld r3, -48(r1) +; CHECK-FISL-NEXT: srd r3, r3, r4 ; CHECK-FISL-NEXT: std r3, -16(r1) ; CHECK-FISL-NEXT: addi r3, r1, -16 ; CHECK-FISL-NEXT: lxvd2x vs0, 0, r3 @@ -1991,13 +1991,13 @@ define <2 x i64> @test62(<2 x i64> %a, <2 x i64> %b) { ; CHECK-FISL-NEXT: stxvd2x v3, 0, r3 ; CHECK-FISL-NEXT: addi r3, r1, -48 ; CHECK-FISL-NEXT: stxvd2x v2, 0, r3 -; CHECK-FISL-NEXT: lwz r3, -20(r1) -; CHECK-FISL-NEXT: ld r4, -40(r1) -; CHECK-FISL-NEXT: srad r3, r4, r3 +; CHECK-FISL-NEXT: lwz r4, -20(r1) +; CHECK-FISL-NEXT: ld r3, -40(r1) +; CHECK-FISL-NEXT: srad r3, r3, r4 ; CHECK-FISL-NEXT: std r3, -8(r1) -; CHECK-FISL-NEXT: lwz r3, -28(r1) -; CHECK-FISL-NEXT: ld r4, -48(r1) -; CHECK-FISL-NEXT: srad r3, r4, r3 +; CHECK-FISL-NEXT: lwz r4, -28(r1) +; CHECK-FISL-NEXT: ld r3, -48(r1) +; CHECK-FISL-NEXT: srad r3, r3, r4 ; CHECK-FISL-NEXT: std r3, -16(r1) ; CHECK-FISL-NEXT: addi r3, r1, -16 ; CHECK-FISL-NEXT: lxvd2x vs0, 0, r3 @@ -2426,12 +2426,12 @@ define <2 x i32> @test80(i32 %v) { ; CHECK-FISL: # %bb.0: ; CHECK-FISL-NEXT: # kill: def $r3 killed $r3 killed $x3 ; CHECK-FISL-NEXT: stw r3, -16(r1) -; CHECK-FISL-NEXT: addi r3, r1, -16 -; CHECK-FISL-NEXT: lxvw4x vs0, 0, r3 +; CHECK-FISL-NEXT: addi r4, r1, -16 +; CHECK-FISL-NEXT: lxvw4x vs0, 0, r4 ; CHECK-FISL-NEXT: xxspltw v2, vs0, 0 -; CHECK-FISL-NEXT: addis r3, r2, .LCPI65_0@toc@ha -; CHECK-FISL-NEXT: addi r3, r3, .LCPI65_0@toc@l -; CHECK-FISL-NEXT: lxvw4x v3, 0, r3 +; CHECK-FISL-NEXT: addis r4, r2, .LCPI65_0@toc@ha +; CHECK-FISL-NEXT: addi r4, r4, .LCPI65_0@toc@l +; CHECK-FISL-NEXT: lxvw4x v3, 0, r4 ; CHECK-FISL-NEXT: vadduwm v2, v2, v3 ; CHECK-FISL-NEXT: blr ; diff --git a/llvm/test/CodeGen/RISCV/shadowcallstack.ll b/llvm/test/CodeGen/RISCV/shadowcallstack.ll new file mode 100644 index 0000000000000..0c9c17ac7a4a7 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/shadowcallstack.ll @@ -0,0 +1,174 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+reserve-x18 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefix=RV32 +; RUN: llc -mtriple=riscv64 -mattr=+reserve-x18 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefix=RV64 + +define void @f1() shadowcallstack { +; RV32-LABEL: f1: +; RV32: # %bb.0: +; RV32-NEXT: ret +; +; RV64-LABEL: f1: +; RV64: # %bb.0: +; RV64-NEXT: ret + ret void +} + +declare void @foo() + +define void @f2() shadowcallstack { +; RV32-LABEL: f2: +; RV32: # %bb.0: +; RV32-NEXT: tail foo +; +; RV64-LABEL: f2: +; RV64: # %bb.0: +; RV64-NEXT: tail foo + tail call void @foo() + ret void +} + +declare i32 @bar() + +define i32 @f3() shadowcallstack { +; RV32-LABEL: f3: +; RV32: # %bb.0: +; 
RV32-NEXT: sw ra, 0(s2) +; RV32-NEXT: addi s2, s2, 4 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw ra, 12(sp) +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: call bar +; RV32-NEXT: lw ra, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: lw ra, -4(s2) +; RV32-NEXT: addi s2, s2, -4 +; RV32-NEXT: ret +; +; RV64-LABEL: f3: +; RV64: # %bb.0: +; RV64-NEXT: sd ra, 0(s2) +; RV64-NEXT: addi s2, s2, 8 +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: sd ra, 8(sp) +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: call bar +; RV64-NEXT: ld ra, 8(sp) +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ld ra, -8(s2) +; RV64-NEXT: addi s2, s2, -8 +; RV64-NEXT: ret + %res = call i32 @bar() + %res1 = add i32 %res, 1 + ret i32 %res +} + +define i32 @f4() shadowcallstack { +; RV32-LABEL: f4: +; RV32: # %bb.0: +; RV32-NEXT: sw ra, 0(s2) +; RV32-NEXT: addi s2, s2, 4 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw ra, 12(sp) +; RV32-NEXT: sw s0, 8(sp) +; RV32-NEXT: sw s1, 4(sp) +; RV32-NEXT: sw s3, 0(sp) +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: .cfi_offset s1, -12 +; RV32-NEXT: .cfi_offset s3, -16 +; RV32-NEXT: call bar +; RV32-NEXT: mv s3, a0 +; RV32-NEXT: call bar +; RV32-NEXT: mv s1, a0 +; RV32-NEXT: call bar +; RV32-NEXT: mv s0, a0 +; RV32-NEXT: call bar +; RV32-NEXT: add a1, s3, s1 +; RV32-NEXT: add a0, s0, a0 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: lw s3, 0(sp) +; RV32-NEXT: lw s1, 4(sp) +; RV32-NEXT: lw s0, 8(sp) +; RV32-NEXT: lw ra, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: lw ra, -4(s2) +; RV32-NEXT: addi s2, s2, -4 +; RV32-NEXT: ret +; +; RV64-LABEL: f4: +; RV64: # %bb.0: +; RV64-NEXT: sd ra, 0(s2) +; RV64-NEXT: addi s2, s2, 8 +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 +; RV64-NEXT: sd ra, 24(sp) +; RV64-NEXT: sd s0, 16(sp) +; RV64-NEXT: sd s1, 8(sp) +; RV64-NEXT: sd s3, 0(sp) +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: .cfi_offset s1, -24 +; RV64-NEXT: .cfi_offset s3, -32 +; RV64-NEXT: call bar +; RV64-NEXT: mv s3, a0 +; RV64-NEXT: call bar +; RV64-NEXT: mv s1, a0 +; RV64-NEXT: call bar +; RV64-NEXT: mv s0, a0 +; RV64-NEXT: call bar +; RV64-NEXT: add a1, s3, s1 +; RV64-NEXT: add a0, s0, a0 +; RV64-NEXT: addw a0, a1, a0 +; RV64-NEXT: ld s3, 0(sp) +; RV64-NEXT: ld s1, 8(sp) +; RV64-NEXT: ld s0, 16(sp) +; RV64-NEXT: ld ra, 24(sp) +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: ld ra, -8(s2) +; RV64-NEXT: addi s2, s2, -8 +; RV64-NEXT: ret + %res1 = call i32 @bar() + %res2 = call i32 @bar() + %res3 = call i32 @bar() + %res4 = call i32 @bar() + %res12 = add i32 %res1, %res2 + %res34 = add i32 %res3, %res4 + %res1234 = add i32 %res12, %res34 + ret i32 %res1234 +} + +define i32 @f5() shadowcallstack nounwind { +; RV32-LABEL: f5: +; RV32: # %bb.0: +; RV32-NEXT: sw ra, 0(s2) +; RV32-NEXT: addi s2, s2, 4 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) +; RV32-NEXT: call bar +; RV32-NEXT: lw ra, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: lw ra, -4(s2) +; RV32-NEXT: addi s2, s2, -4 +; RV32-NEXT: ret +; +; RV64-LABEL: f5: +; RV64: # %bb.0: +; RV64-NEXT: sd ra, 0(s2) +; RV64-NEXT: addi s2, s2, 8 +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) +; RV64-NEXT: call bar +; RV64-NEXT: ld ra, 8(sp) +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ld ra, -8(s2) +; RV64-NEXT: addi s2, s2, -8 +; RV64-NEXT: ret + %res = call i32 @bar() + %res1 = add i32 %res, 1 + ret i32 %res +} diff --git 
a/llvm/test/CodeGen/SPARC/fp16-promote.ll b/llvm/test/CodeGen/SPARC/fp16-promote.ll index 0c402430dadc1..9709322f48a57 100644 --- a/llvm/test/CodeGen/SPARC/fp16-promote.ll +++ b/llvm/test/CodeGen/SPARC/fp16-promote.ll @@ -182,11 +182,11 @@ define void @test_fptrunc_double(double %d, half* %p) nounwind { ; V8-UNOPT-NEXT: std %i4, [%fp+-8] ; V8-UNOPT-NEXT: ldd [%fp+-8], %f0 ; V8-UNOPT-NEXT: std %f0, [%fp+-16] -; V8-UNOPT-NEXT: ldd [%fp+-16], %i0 -; V8-UNOPT-NEXT: mov %i0, %i3 -; V8-UNOPT-NEXT: ! kill: def $i1 killed $i1 killed $i0_i1 -; V8-UNOPT-NEXT: mov %i3, %o0 -; V8-UNOPT-NEXT: mov %i1, %o1 +; V8-UNOPT-NEXT: ldd [%fp+-16], %i4 +; V8-UNOPT-NEXT: mov %i4, %i0 +; V8-UNOPT-NEXT: ! kill: def $i5 killed $i5 killed $i4_i5 +; V8-UNOPT-NEXT: mov %i0, %o0 +; V8-UNOPT-NEXT: mov %i5, %o1 ; V8-UNOPT-NEXT: call __truncdfhf2 ; V8-UNOPT-NEXT: st %i2, [%fp+-20] ; V8-UNOPT-NEXT: ld [%fp+-20], %i0 ! 4-byte Folded Reload diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-14.ll b/llvm/test/CodeGen/SystemZ/fp-mul-14.ll index 8bab2135739c4..363511655ad91 100644 --- a/llvm/test/CodeGen/SystemZ/fp-mul-14.ll +++ b/llvm/test/CodeGen/SystemZ/fp-mul-14.ll @@ -2,9 +2,6 @@ ; ; Check that a multiply-and-add results. -; FIXME: This test is xfailed temporarily -; XFAIL: * - define void @f1(float %arg, float* %Dst) { ; CHECK-LABEL: f1: ; CHECK: maeb diff --git a/llvm/test/CodeGen/SystemZ/vec-zext.ll b/llvm/test/CodeGen/SystemZ/vec-zext.ll index b4c8f2307b0b7..cb61d31e5ebe3 100644 --- a/llvm/test/CodeGen/SystemZ/vec-zext.ll +++ b/llvm/test/CodeGen/SystemZ/vec-zext.ll @@ -92,3 +92,19 @@ define <8 x i16> @fun10(<8 x i8> %val1) { ret <8 x i16> %z } +define <2 x i32> @fun11(<2 x i64> %Arg1, <2 x i64> %Arg2) { +; CHECK-LABEL: fun11: +; CHECK: vgbm %v0, 0 +; CHECK-NEXT: vceqg %v1, %v24, %v0 +; CHECK-NEXT: vceqg %v0, %v26, %v0 +; CHECK-NEXT: vo %v0, %v1, %v0 +; CHECK-NEXT: vrepig %v1, 1 +; CHECK-NEXT: vn %v0, %v0, %v1 +; CHECK-NEXT: vpkg %v24, %v0, %v0 +; CHECK-NEXT: br %r14 + %i3 = icmp eq <2 x i64> %Arg1, zeroinitializer + %i5 = icmp eq <2 x i64> %Arg2, zeroinitializer + %i6 = or <2 x i1> %i3, %i5 + %i7 = zext <2 x i1> %i6 to <2 x i32> + ret <2 x i32> %i7 +} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll index 2fa8a4d8ed7ef..a60ad09dd360d 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -10,7 +10,6 @@ define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: add.w r12, r3, #3 ; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: bic r12, r12, #3 @@ -21,28 +20,26 @@ define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: and r4, r12, #15 -; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill ; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r2], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 ; CHECK-NEXT: vdup.32 q3, r4 ; CHECK-NEXT: vpt.i32 eq, q3, zr ; CHECK-NEXT: vmovt q1, q2 -; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload +; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 ; 
CHECK-NEXT: vmul.i32 q1, q1, q2 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp eq i32 %N, 0 @@ -101,8 +98,7 @@ define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: ldr.w r12, [sp, #40] +; CHECK-NEXT: ldr.w r12, [sp, #32] ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq .LBB1_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph @@ -116,10 +112,9 @@ define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: and r5, r4, #15 -; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vpsttt ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r3], #16 @@ -127,22 +122,21 @@ define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, ; CHECK-NEXT: vdup.32 q4, r5 ; CHECK-NEXT: vpt.i32 eq, q4, zr ; CHECK-NEXT: vsubt.i32 q1, q3, q2 -; CHECK-NEXT: adds r4, #4 -; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 ; CHECK-NEXT: vmul.i32 q1, q1, q2 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: b .LBB1_5 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .LBB1_4: ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r7, pc} i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) { @@ -414,8 +408,7 @@ define dso_local void @continue_on_zero(i32* noalias nocapture %arg, i32* noalia ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpst +; CHECK-NEXT: vpt.i32 ne, q0, zr ; CHECK-NEXT: vldrwt.u32 q1, [r0] ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vpst @@ -471,19 +464,28 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB5_1: @ %bb4 +; CHECK-NEXT: add.w r12, r3, #3 +; CHECK-NEXT: mov.w lr, #1 +; CHECK-NEXT: bic r12, r12, #3 +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: add.w lr, lr, r12, lsr #2 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_2: @ %bb12 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vptt.i32 ne, q0, zr +; CHECK-NEXT: vctp.32 r3 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: vpttt.i32 ne, q0, zr ; CHECK-NEXT: vcmpt.s32 le, q0, r2 +; CHECK-NEXT: vctpt.32 r3 ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16 ; CHECK-NEXT: add.w r12, r12, #4 +; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [r0], #16 -; CHECK-NEXT: letp lr, .LBB5_2 +; 
CHECK-NEXT: le lr, .LBB5_2 ; CHECK-NEXT: @ %bb.3: @ %bb32 ; CHECK-NEXT: pop {r7, pc} bb: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir index 37a7b7bd010dd..550972e4a4f45 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir @@ -135,27 +135,34 @@ body: | ; CHECK: successors: %bb.2(0x80000000) ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: $r12 = t2MOVi16 target-flags(arm-lo16) @mask, 14 /* CC::al */, $noreg + ; CHECK: renamable $r4, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: $r12 = t2MOVTi16 killed $r12, target-flags(arm-hi16) @mask, 14 /* CC::al */, $noreg + ; CHECK: renamable $r4 = t2BICri killed renamable $r4, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r5 = t2LDRHi12 killed renamable $r12, 0, 14 /* CC::al */, $noreg :: (dereferenceable load 2 from %ir.mask.gep9) + ; CHECK: renamable $r12 = t2SUBri killed renamable $r4, 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r4, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg ; CHECK: $vpr = VMSR_P0 $r5, 14 /* CC::al */, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 16, 14 /* CC::al */, $noreg, $noreg ; CHECK: VSTR_P0_off killed renamable $vpr, $sp, 0, 14 /* CC::al */, $noreg :: (store 4 into %stack.0) ; CHECK: renamable $q0 = MVE_VDUP32 killed renamable $r5, 0, $noreg, undef renamable $q0 ; CHECK: $r3 = tMOVr $r0, 14 /* CC::al */, $noreg - ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 + ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: bb.2.bb9: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q0, $r0, $r1, $r3, $r12 + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r12 ; CHECK: renamable $vpr = VLDR_P0_off $sp, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.0) - ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: MVE_VPST 2, implicit $vpr + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr ; CHECK: renamable $r1, renamable $q1 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4) ; CHECK: renamable $r3, renamable $q2 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) + ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 ; CHECK: renamable $r12, renamable $q2 = MVE_VLDRWU32_pre killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.scevgep2, align 8) ; CHECK: MVE_VPTv4u32 8, renamable $q0, killed renamable $q2, 2, implicit-def $vpr ; CHECK: MVE_VSTRWU32 killed renamable $q1, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) ; CHECK: $r0 = tMOVr $r3, 14 /* CC::al */, $noreg - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.bb27: ; CHECK: $sp = tADDspi $sp, 1, 14 /* CC::al */, $noreg ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $r5, def $r7, def $pc diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp.mir index f754559c4f264..29ebd7bd6cf13 100644 --- 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp.mir @@ -118,8 +118,7 @@ body: | ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $q0, $q1, $q2, $q3, $r0, $r1 - ; CHECK: renamable $vpr = MVE_VCMPu32 renamable $q1, renamable $q0, 8, 0, killed $noreg - ; CHECK: MVE_VPST 2, implicit $vpr + ; CHECK: MVE_VPTv4u32 2, renamable $q1, renamable $q0, 8, implicit-def $vpr ; CHECK: renamable $vpr = MVE_VCMPu32 renamable $q0, renamable $q2, 2, 1, killed renamable $vpr ; CHECK: renamable $r1, renamable $q4 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv35, align 4) ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q4, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv12, align 4) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir index 5ec6079e6cbfd..a1a1e785672db 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir @@ -110,8 +110,7 @@ body: | ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $lr, $q0, $q1, $q2, $r0, $r1 - ; CHECK: renamable $vpr = MVE_VCMPu32 renamable $q1, renamable $q0, 8, 0, killed $noreg - ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: MVE_VPTv4u32 4, renamable $q1, renamable $q0, 8, implicit-def $vpr ; CHECK: renamable $r1, renamable $q3 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv35, align 4) ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q3, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv12, align 4) ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q0, renamable $q2, 0, $noreg, undef renamable $q0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir index 9a5856335dfc6..210eae9e64350 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir @@ -173,11 +173,10 @@ body: | ; CHECK: renamable $r3 = VMOVRS killed renamable $s0, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1 - ; CHECK: $r3 = tMOVr $r1, 14 /* CC::al */, $noreg + ; CHECK: dead $r3 = tMOVr $r1, 14 /* CC::al */, $noreg ; CHECK: bb.3.do.body: ; CHECK: successors: %bb.3(0x7c000000), %bb.4(0x04000000) - ; CHECK: liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3 - ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + ; CHECK: liveins: $lr, $q0, $q1, $r0, $r1, $r2 ; CHECK: renamable $r0, renamable $q2 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.01, align 4) ; CHECK: renamable $q2 = nnan ninf nsz arcp contract afn reassoc MVE_VSUBf32 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2 ; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VFMAf32 killed renamable $q0, killed renamable $q2, killed renamable $q2, 0, killed $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll index 5a370e5f96e76..3cd24f8f52471 100644 --- 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll @@ -17,20 +17,18 @@ define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %arm_mean_f32_mve.exit ; CHECK-NEXT: vmov s4, r1 -; CHECK-NEXT: dlstp.32 lr, r1 +; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: vadd.f32 s0, s3, s3 ; CHECK-NEXT: vcvt.f32.u32 s4, s4 ; CHECK-NEXT: vdiv.f32 s0, s0, s4 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: vdup.32 q1, r3 -; CHECK-NEXT: mov r3, r1 ; CHECK-NEXT: .LBB0_3: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vldrw.u32 q2, [r0], #16 -; CHECK-NEXT: vsub.f32 q2, q2, q1 -; CHECK-NEXT: vfma.f32 q0, q2, q2 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vsub.f32 q1, q1, r12 +; CHECK-NEXT: vfma.f32 q0, q1, q1 ; CHECK-NEXT: letp lr, .LBB0_3 ; CHECK-NEXT: @ %bb.4: @ %do.end ; CHECK-NEXT: subs r0, r1, #1 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout-unknown-lanes.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout-unknown-lanes.ll new file mode 100644 index 0000000000000..f6e175d792d14 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout-unknown-lanes.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -O3 -tail-predication=force-enabled-no-reductions %s -o - | FileCheck %s + +define arm_aapcs_vfpcc <4 x float> @arm_max_no_idx_f32_mve(float* %pSrc, i32 %blockSize, float* nocapture %pResult) { +; CHECK-LABEL: arm_max_no_idx_f32_mve: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: subs r2, r1, #4 +; CHECK-NEXT: adr r3, .LCPI0_0 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: dlstp.32 lr, r1 +; CHECK-NEXT: .LBB0_1: @ %do.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vmaxnm.f32 q0, q1, q0 +; CHECK-NEXT: letp lr, .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %do.end +; CHECK-NEXT: pop {r7, pc} +entry: + br label %do.body + +do.body: ; preds = %do.body, %entry + %blockSize.addr.0 = phi i32 [ %blockSize, %entry ], [ %sub, %do.body ] + %curExtremValVec.0 = phi <4 x float> [ , %entry ], [ %3, %do.body ] + %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ] + %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blockSize.addr.0) + %1 = bitcast float* %pSrc.addr.0 to <4 x float>* + %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) + %3 = tail call fast <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %curExtremValVec.0, i32 0, <4 x i1> %0, <4 x float> %curExtremValVec.0) + %add.ptr = getelementptr inbounds float, float* %pSrc.addr.0, i32 4 + %sub = add i32 %blockSize.addr.0, -4 + %cmp = icmp sgt i32 %sub, 0 + br i1 %cmp, label %do.body, label %do.end + +do.end: ; preds = %do.body + ret <4 x float> %3 +} + +declare <4 x i1> @llvm.arm.mve.vctp32(i32) + +declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) + +declare <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float>, <4 x float>, i32, <4 x i1>, <4 x float>) diff --git 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll index 0554742369fdc..b5cac5d6a3cf8 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -9,7 +9,7 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_add_add_v16i8(i8* nocaptur ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dlstp.8 lr, r2 -; CHECK: .LBB0_2: @ %vector.body +; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16 ; CHECK-NEXT: vldrb.u8 q2, [r0], #16 @@ -75,7 +75,7 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_add_add_v8i16(i8* nocaptu ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #3 ; CHECK-NEXT: dls lr, lr -; CHECK: .LBB1_2: @ %vector.body +; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 ; CHECK-NEXT: vmov q0, q1 @@ -148,7 +148,7 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_sub_add_v16i8(i8* nocaptur ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #4 ; CHECK-NEXT: dls lr, lr -; CHECK: .LBB2_2: @ %vector.body +; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.8 r2 ; CHECK-NEXT: vmov q0, q1 @@ -218,7 +218,7 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_sub_add_v8i16(i8* nocaptu ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #3 ; CHECK-NEXT: dls lr, lr -; CHECK: .LBB3_2: @ %vector.body +; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 ; CHECK-NEXT: vmov q0, q1 @@ -290,7 +290,7 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_mul_add_v16i8(i8* nocaptur ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #4 ; CHECK-NEXT: dls lr, lr -; CHECK: .LBB4_2: @ %vector.body +; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.8 r2 ; CHECK-NEXT: vmov q0, q1 @@ -360,7 +360,7 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_mul_add_v8i16(i8* nocaptu ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #3 ; CHECK-NEXT: dls lr, lr -; CHECK: .LBB5_2: @ %vector.body +; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 ; CHECK-NEXT: vmov q0, q1 @@ -432,7 +432,7 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture read ; CHECK-NEXT: add.w lr, r3, r6, lsr #2 ; CHECK-NEXT: mov r3, r2 ; CHECK-NEXT: dls lr, lr -; CHECK: .LBB6_2: @ %vector.body +; CHECK-NEXT: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vmov q0, q1 @@ -454,7 +454,7 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture read ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: vmov.32 q0[0], r12 -; CHECK: .LBB6_5: @ %vector.body46 +; CHECK-NEXT: .LBB6_5: @ %vector.body46 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: vmov q1, q0 @@ -559,7 +559,7 @@ define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(i8* nocaptur ; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: mov r4, r1 ; CHECK-NEXT: dls lr, lr -; CHECK: .LBB7_2: @ %vector.body +; CHECK-NEXT: .LBB7_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 ; 
CHECK-NEXT: vmov q0, q1 @@ -670,32 +670,31 @@ define i32 @wrongop(%struct.date* nocapture readonly %pd) { ; CHECK-NEXT: cmp r1, r2 ; CHECK-NEXT: cset r4, lo ; CHECK-NEXT: .LBB8_4: @ %lor.end -; CHECK-NEXT: ldr.w r3, [r12, #4] -; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: ldr.w r1, [r12, #4] +; CHECK-NEXT: cmp r1, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB8_5: @ %vector.ph -; CHECK-NEXT: adds r1, r3, #3 +; CHECK-NEXT: adds r3, r1, #3 ; CHECK-NEXT: movs r2, #1 -; CHECK-NEXT: bic r1, r1, #3 -; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: add.w lr, r2, r1, lsr #2 -; CHECK-NEXT: movw r1, :lower16:days -; CHECK-NEXT: movt r1, :upper16:days -; CHECK-NEXT: movs r2, #52 -; CHECK-NEXT: mla r1, r4, r2, r1 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: vdup.32 q0, r2 +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: add.w lr, r2, r3, lsr #2 +; CHECK-NEXT: movw r2, :lower16:days +; CHECK-NEXT: movt r2, :upper16:days +; CHECK-NEXT: movs r3, #52 +; CHECK-NEXT: mla r2, r4, r3, r2 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: subs r0, r3, #1 -; CHECK: .LBB8_6: @ %vector.body +; CHECK-NEXT: .LBB8_6: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r0 +; CHECK-NEXT: vctp.32 r1 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r1], #16 -; CHECK-NEXT: subs r0, #4 +; CHECK-NEXT: vldrwt.u32 q0, [r2], #16 +; CHECK-NEXT: subs r1, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB8_6 ; CHECK-NEXT: @ %bb.7: @ %middle.block @@ -738,7 +737,7 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ %5, %vector.ph ], [ %8, %vector.body ] %6 = getelementptr inbounds [2 x [13 x i32]], [2 x [13 x i32]]* @days, i32 0, i32 %3, i32 %index - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %4) %7 = bitcast i32* %6 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %7, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %8 = add <4 x i32> %wide.masked.load, %vec.phi diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll new file mode 100644 index 0000000000000..6ce2b9f5f1c02 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll @@ -0,0 +1,223 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp %s -o - | FileCheck %s + +define void @remat_vctp(i32* %arg, i32* %arg1, i32* %arg2, i32* %arg3, i32* %arg4, i16 zeroext %arg5) { +; CHECK-LABEL: remat_vctp: +; CHECK: @ %bb.0: @ %bb +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: ldrd r5, r12, [sp, #80] +; CHECK-NEXT: cmp.w r12, #4 +; CHECK-NEXT: mov r4, r12 +; CHECK-NEXT: vmvn.i32 q0, #0x80000000 +; CHECK-NEXT: it ge +; CHECK-NEXT: movge r4, #4 +; CHECK-NEXT: vmov.i32 q1, #0x3f +; CHECK-NEXT: sub.w r4, r12, r4 +; CHECK-NEXT: vmov.i32 q2, #0x1 +; CHECK-NEXT: add.w lr, r4, #3 +; CHECK-NEXT: movs r4, #1 +; CHECK-NEXT: add.w lr, r4, lr, lsr #2 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB0_1: @ %bb6 +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: 
vpst +; CHECK-NEXT: vldrwt.u32 q4, [r1], #16 +; CHECK-NEXT: vabs.s32 q5, q4 +; CHECK-NEXT: vcls.s32 q3, q5 +; CHECK-NEXT: vshl.u32 q5, q5, q3 +; CHECK-NEXT: vadd.i32 q3, q3, q2 +; CHECK-NEXT: vshr.u32 q6, q5, #24 +; CHECK-NEXT: vand q6, q6, q1 +; CHECK-NEXT: vldrw.u32 q7, [r5, q6, uxtw #2] +; CHECK-NEXT: vqrdmulh.s32 q6, q7, q5 +; CHECK-NEXT: vqsub.s32 q6, q0, q6 +; CHECK-NEXT: vqrdmulh.s32 q6, q7, q6 +; CHECK-NEXT: vqshl.s32 q6, q6, #1 +; CHECK-NEXT: vqrdmulh.s32 q5, q6, q5 +; CHECK-NEXT: vqsub.s32 q5, q0, q5 +; CHECK-NEXT: vqrdmulh.s32 q5, q6, q5 +; CHECK-NEXT: vqshl.s32 q5, q5, #1 +; CHECK-NEXT: vpt.s32 lt, q4, zr +; CHECK-NEXT: vnegt.s32 q5, q5 +; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q4, [r0], #16 +; CHECK-NEXT: vqrdmulh.s32 q4, q4, q5 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vstrwt.32 q4, [r2], #16 +; CHECK-NEXT: vstrwt.32 q3, [r3], #16 +; CHECK-NEXT: le lr, .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %bb44 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: pop {r4, r5, r7, pc} +bb: + %i = zext i16 %arg5 to i32 + br label %bb6 + +bb6: ; preds = %bb6, %bb + %i7 = phi i32* [ %arg3, %bb ], [ %i38, %bb6 ] + %i8 = phi i32 [ %i, %bb ], [ %i42, %bb6 ] + %i9 = phi i32* [ %arg2, %bb ], [ %i41, %bb6 ] + %i10 = phi i32* [ %arg1, %bb ], [ %i40, %bb6 ] + %i11 = phi i32* [ %arg, %bb ], [ %i39, %bb6 ] + %i12 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i8) + %i13 = bitcast i32* %i11 to <4 x i32>* + %i14 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %i13, i32 4, <4 x i1> %i12, <4 x i32> zeroinitializer) + %i15 = bitcast i32* %i10 to <4 x i32>* + %i16 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %i15, i32 4, <4 x i1> %i12, <4 x i32> zeroinitializer) + %i17 = icmp slt <4 x i32> %i16, zeroinitializer + %i18 = sub <4 x i32> zeroinitializer, %i16 + %i19 = select <4 x i1> %i17, <4 x i32> %i18, <4 x i32> %i16 + %i20 = tail call <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32> %i19) + %i21 = shl <4 x i32> %i19, %i20 + %i22 = add <4 x i32> %i20, + %i23 = lshr <4 x i32> %i21, + %i24 = and <4 x i32> %i23, + %i25 = tail call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* %arg4, <4 x i32> %i24, i32 32, i32 2, i32 0) + %i26 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i25, <4 x i32> %i21) + %i27 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> , <4 x i32> %i26) + %i28 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i25, <4 x i32> %i27) + %i29 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> %i28, i32 1, i32 0) + %i30 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i29, <4 x i32> %i21) + %i31 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> , <4 x i32> %i30) + %i32 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i29, <4 x i32> %i31) + %i33 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> %i32, i32 1, i32 0) + %i34 = tail call <4 x i32> @llvm.arm.mve.neg.predicated.v4i32.v4i1(<4 x i32> %i33, <4 x i1> %i17, <4 x i32> %i33) + %i35 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i14, <4 x i32> %i34) + %i36 = bitcast i32* %i9 to <4 x i32>* + %i37 = bitcast i32* %i7 to <4 x i32>* + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %i35, <4 x i32>* %i36, i32 4, <4 x i1> %i12) + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %i22, <4 x i32>* %i37, i32 4, <4 x i1> %i12) + %i38 = getelementptr inbounds i32, i32* %i7, i32 4 + %i39 = getelementptr inbounds i32, i32* %i11, 
i32 4 + %i40 = getelementptr inbounds i32, i32* %i10, i32 4 + %i41 = getelementptr inbounds i32, i32* %i9, i32 4 + %i42 = add nsw i32 %i8, -4 + %i43 = icmp sgt i32 %i8, 4 + br i1 %i43, label %bb6, label %bb44 + +bb44: ; preds = %bb6 + ret void +} + +define void @dont_remat_predicated_vctp(i32* %arg, i32* %arg1, i32* %arg2, i32* %arg3, i32* %arg4, i16 zeroext %arg5, i32 %conv.mask) { +; CHECK-LABEL: dont_remat_predicated_vctp: +; CHECK: @ %bb.0: @ %bb +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: ldrd r6, r12, [sp, #88] +; CHECK-NEXT: movs r4, #4 +; CHECK-NEXT: cmp.w r12, #4 +; CHECK-NEXT: vmvn.i32 q0, #0x80000000 +; CHECK-NEXT: csel r5, r12, r4, lt +; CHECK-NEXT: vmov.i32 q1, #0x3f +; CHECK-NEXT: sub.w r5, r12, r5 +; CHECK-NEXT: vmov.i32 q2, #0x1 +; CHECK-NEXT: add.w lr, r5, #3 +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: add.w lr, r5, lr, lsr #2 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB1_1: @ %bb6 +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vpst +; CHECK-NEXT: vctpt.32 r4 +; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q4, [r1], #16 +; CHECK-NEXT: vabs.s32 q5, q4 +; CHECK-NEXT: vcls.s32 q3, q5 +; CHECK-NEXT: vshl.u32 q5, q5, q3 +; CHECK-NEXT: vadd.i32 q3, q3, q2 +; CHECK-NEXT: vshr.u32 q6, q5, #24 +; CHECK-NEXT: vand q6, q6, q1 +; CHECK-NEXT: vldrw.u32 q7, [r6, q6, uxtw #2] +; CHECK-NEXT: vqrdmulh.s32 q6, q7, q5 +; CHECK-NEXT: vqsub.s32 q6, q0, q6 +; CHECK-NEXT: vqrdmulh.s32 q6, q7, q6 +; CHECK-NEXT: vqshl.s32 q6, q6, #1 +; CHECK-NEXT: vqrdmulh.s32 q5, q6, q5 +; CHECK-NEXT: vqsub.s32 q5, q0, q5 +; CHECK-NEXT: vqrdmulh.s32 q5, q6, q5 +; CHECK-NEXT: vqshl.s32 q5, q5, #1 +; CHECK-NEXT: vpt.s32 lt, q4, zr +; CHECK-NEXT: vnegt.s32 q5, q5 +; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q4, [r0], #16 +; CHECK-NEXT: vqrdmulh.s32 q4, q4, q5 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vstrwt.32 q4, [r2], #16 +; CHECK-NEXT: vstrwt.32 q3, [r3], #16 +; CHECK-NEXT: le lr, .LBB1_1 +; CHECK-NEXT: @ %bb.2: @ %bb44 +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: pop {r4, r5, r6, pc} +bb: + %i = zext i16 %arg5 to i32 + br label %bb6 + +bb6: ; preds = %bb6, %bb + %i7 = phi i32* [ %arg3, %bb ], [ %i38, %bb6 ] + %i8 = phi i32 [ %i, %bb ], [ %i42, %bb6 ] + %i9 = phi i32* [ %arg2, %bb ], [ %i41, %bb6 ] + %i10 = phi i32* [ %arg1, %bb ], [ %i40, %bb6 ] + %i11 = phi i32* [ %arg, %bb ], [ %i39, %bb6 ] + %i12 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 4) + %mask = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i8) + %pred = and <4 x i1> %i12, %mask + %i13 = bitcast i32* %i11 to <4 x i32>* + %i14 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %i13, i32 4, <4 x i1> %pred, <4 x i32> zeroinitializer) + %i15 = bitcast i32* %i10 to <4 x i32>* + %i16 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %i15, i32 4, <4 x i1> %pred, <4 x i32> zeroinitializer) + %i17 = icmp slt <4 x i32> %i16, zeroinitializer + %i18 = sub <4 x i32> zeroinitializer, %i16 + %i19 = select <4 x i1> %i17, <4 x i32> %i18, <4 x i32> %i16 + %i20 = tail call <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32> %i19) + %i21 = shl <4 x i32> %i19, %i20 + %i22 = add <4 x i32> %i20, + %i23 = lshr <4 x i32> %i21, + %i24 = and <4 x i32> %i23, + %i25 = tail call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* 
%arg4, <4 x i32> %i24, i32 32, i32 2, i32 0) + %i26 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i25, <4 x i32> %i21) + %i27 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> , <4 x i32> %i26) + %i28 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i25, <4 x i32> %i27) + %i29 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> %i28, i32 1, i32 0) + %i30 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i29, <4 x i32> %i21) + %i31 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> , <4 x i32> %i30) + %i32 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i29, <4 x i32> %i31) + %i33 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> %i32, i32 1, i32 0) + %i34 = tail call <4 x i32> @llvm.arm.mve.neg.predicated.v4i32.v4i1(<4 x i32> %i33, <4 x i1> %i17, <4 x i32> %i33) + %i35 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i14, <4 x i32> %i34) + %i36 = bitcast i32* %i9 to <4 x i32>* + %i37 = bitcast i32* %i7 to <4 x i32>* + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %i35, <4 x i32>* %i36, i32 4, <4 x i1> %pred) + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %i22, <4 x i32>* %i37, i32 4, <4 x i1> %pred) + %i38 = getelementptr inbounds i32, i32* %i7, i32 4 + %i39 = getelementptr inbounds i32, i32* %i11, i32 4 + %i40 = getelementptr inbounds i32, i32* %i10, i32 4 + %i41 = getelementptr inbounds i32, i32* %i9, i32 4 + %i42 = add nsw i32 %i8, -4 + %i43 = icmp sgt i32 %i8, 4 + br i1 %i43, label %bb6, label %bb44 + +bb44: ; preds = %bb6 + ret void +} + +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) +declare <4 x i1> @llvm.arm.mve.vctp32(i32) +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) +declare <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) +declare <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32>) +declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32*, <4 x i32>, i32, i32, i32) +declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32>, i32, i32) +declare <4 x i32> @llvm.arm.mve.neg.predicated.v4i32.v4i1(<4 x i32>, <4 x i1>, <4 x i32>) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll similarity index 74% rename from llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll rename to llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll index fb974048b1ef4..22ffa12c93ea4 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s ; CHECK-LABEL: mul_v16i8 @@ -431,6 +430,195 @@ for.cond.cleanup: ret void } +; CHECK-LABEL: const_expected_in_set_loop +; CHECK: call <4 x i1> @llvm.get.active.lane.mask +; CHECK-NOT: vctp +; CHECK: ret void +; +define dso_local void @const_expected_in_set_loop(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { +entry: + %cmp8 = icmp sgt i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr 
i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp8, label %vector.ph, label %for.cond.cleanup + +vector.ph: + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ] + %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ] + %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* + %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* + %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>* + + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 42) + + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask) + %index.next = add i32 %index, 4 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 + %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4 + %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1) + %9 = icmp ne i32 %8, 0 + br i1 %9, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +; CHECK-LABEL: wrong_tripcount_arg +; CHECK: vector.body: +; CHECK: call <4 x i1> @llvm.arm.mve.vctp32 +; CHECK-NOT: call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32 +; CHECK: vector.body35: +; CHECK: call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32 +; CHECK-NOT: call <4 x i1> @llvm.arm.mve.vctp32 +; CHECK: ret void +; +define dso_local void @wrong_tripcount_arg(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture %D, i32 %N1, i32 %N2) local_unnamed_addr #0 { +entry: + %cmp29 = icmp sgt i32 %N1, 0 + %0 = add i32 %N1, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp29, label %vector.ph, label %for.cond4.preheader + +vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv62 = phi i32* [ %scevgep63, %vector.body ], [ %D, %vector.ph ] + %lsr.iv59 = phi i32* [ %scevgep60, %vector.body ], [ %C, %vector.ph ] + %lsr.iv56 = phi i32* [ %scevgep57, %vector.body ], [ %B, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ] + %lsr.iv5658 = bitcast i32* %lsr.iv56 to <4 x i32>* + %lsr.iv5961 = bitcast i32* %lsr.iv59 to <4 x i32>* + %lsr.iv6264 = bitcast i32* %lsr.iv62 to <4 x i32>* + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N1) + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv5658, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %wide.masked.load32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv5961, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %7 = add nsw <4 x i32> 
%wide.masked.load32, %wide.masked.load + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv6264, i32 4, <4 x i1> %active.lane.mask) + %index.next = add i32 %index, 4 + %scevgep57 = getelementptr i32, i32* %lsr.iv56, i32 4 + %scevgep60 = getelementptr i32, i32* %lsr.iv59, i32 4 + %scevgep63 = getelementptr i32, i32* %lsr.iv62, i32 4 + %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1) + %9 = icmp ne i32 %8, 0 + br i1 %9, label %vector.body, label %for.cond4.preheader + +for.cond4.preheader: ; preds = %vector.body, %entry + %cmp527 = icmp sgt i32 %N2, 0 + %10 = add i32 %N2, 3 + %11 = lshr i32 %10, 2 + %12 = shl nuw i32 %11, 2 + %13 = add i32 %12, -4 + %14 = lshr i32 %13, 2 + %15 = add nuw nsw i32 %14, 1 + br i1 %cmp527, label %vector.ph36, label %for.cond.cleanup6 + +vector.ph36: ; preds = %for.cond4.preheader + call void @llvm.set.loop.iterations.i32(i32 %15) + br label %vector.body35 + +vector.body35: ; preds = %vector.body35, %vector.ph36 + %lsr.iv53 = phi i32* [ %scevgep54, %vector.body35 ], [ %A, %vector.ph36 ] + %lsr.iv50 = phi i32* [ %scevgep51, %vector.body35 ], [ %C, %vector.ph36 ] + %lsr.iv = phi i32* [ %scevgep, %vector.body35 ], [ %B, %vector.ph36 ] + %index40 = phi i32 [ 0, %vector.ph36 ], [ %index.next41, %vector.body35 ] + %16 = phi i32 [ %15, %vector.ph36 ], [ %18, %vector.body35 ] + %lsr.iv49 = bitcast i32* %lsr.iv to <4 x i32>* + %lsr.iv5052 = bitcast i32* %lsr.iv50 to <4 x i32>* + %lsr.iv5355 = bitcast i32* %lsr.iv53 to <4 x i32>* + +; This has N1 as the tripcount / element count, which is the tripcount of the +; first loop and not this one: + %active.lane.mask46 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index40, i32 %N1) + + %wide.masked.load47 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv49, i32 4, <4 x i1> %active.lane.mask46, <4 x i32> undef) + %wide.masked.load48 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv5052, i32 4, <4 x i1> %active.lane.mask46, <4 x i32> undef) + %17 = add nsw <4 x i32> %wide.masked.load48, %wide.masked.load47 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %17, <4 x i32>* %lsr.iv5355, i32 4, <4 x i1> %active.lane.mask46) + %index.next41 = add i32 %index40, 4 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep51 = getelementptr i32, i32* %lsr.iv50, i32 4 + %scevgep54 = getelementptr i32, i32* %lsr.iv53, i32 4 + %18 = call i32 @llvm.loop.decrement.reg.i32(i32 %16, i32 1) + %19 = icmp ne i32 %18, 0 + br i1 %19, label %vector.body35, label %for.cond.cleanup6 + +for.cond.cleanup6: ; preds = %vector.body35, %for.cond4.preheader + ret void +} + +; CHECK-LABEL: tripcount_arg_not_invariant +; CHECK: call <4 x i1> @llvm.get.active.lane.mask +; CHECK-NOT: vctp +; CHECK: ret void +; +define dso_local void @tripcount_arg_not_invariant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { +entry: + %cmp8 = icmp sgt i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp8, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %trip.count.minus.1 = add i32 %N, -1 + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ] + %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] + %lsr.iv = phi i32* [ 
%scevgep, %vector.body ], [ %B, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ] + + %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* + %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* + %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>* + + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %index) + + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask) + %index.next = add i32 %index, 4 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 + %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4 + %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1) + %9 = icmp ne i32 %8, 0 + ;br i1 %9, label %vector.body, label %for.cond.cleanup + br i1 %9, label %vector.body, label %vector.ph + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll index 4cd0c54c666c8..8bf15aba9d975 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll @@ -265,13 +265,13 @@ for.cond.cleanup: ret void } -; CHECK-LABEL: @overflow_BTC_plus_1( +; CHECK-LABEL: @inconsistent_tripcounts( ; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 ; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; -define dso_local void @overflow_BTC_plus_1(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { +define dso_local void @inconsistent_tripcounts(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { entry: call void @llvm.set.loop.iterations.i32(i32 8001) br label %vector.body @@ -316,63 +316,7 @@ for.cond.cleanup: ; define dso_local void @overflow_in_sub(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { entry: - call void @llvm.set.loop.iterations.i32(i32 8001) - br label %vector.body - -vector.body: - %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ] - %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] - %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] - %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] - %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* - %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* - %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* - %broadcast.splatinsert = 
insertelement <4 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, - -; Overflow in the subtraction. This should hold: -; -; ceil(ElementCount / VectorWidth) >= TripCount -; -; But we have: -; -; ceil(31999 / 4) >= 8001 -; 8000 >= 8001 -; - %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 31999) - - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef) - %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef) - %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load - call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1) - %index.next = add i32 %index, 4 - %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 - %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4 - %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 - %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) - %4 = icmp ne i32 %3, 0 - br i1 %4, label %vector.body, label %for.cond.cleanup - -for.cond.cleanup: - ret void -} - -; CHECK-LABEL: @overflow_in_rounding_tripcount( -; CHECK: vector.body: -; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK: @llvm.get.active.lane.mask -; CHECK: ret void -; -define dso_local void @overflow_in_rounding_tripcount(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { -entry: - -; TC = 4294967292 -; 4294967292 <= 4294967291 (MAX - vectorwidth) -; False -; - call void @llvm.set.loop.iterations.i32(i32 4294967291) + call void @llvm.set.loop.iterations.i32(i32 1073741824) br label %vector.body vector.body: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-forced.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-forced.ll new file mode 100644 index 0000000000000..e2fa8ea77071d --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-forced.ll @@ -0,0 +1,61 @@ +; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s --check-prefixes=CHECK,ENABLED +; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=force-enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s --check-prefixes=CHECK,FORCED + +; CHECK-LABEL: set_iterations_not_rounded_up +; +; ENABLED: call <4 x i1> @llvm.get.active.lane.mask +; ENABLED-NOT: vctp +; +; FORCED-NOT: call <4 x i1> @llvm.get.active.lane.mask +; FORCED: vctp +; +; CHECK: ret void +; +define dso_local void @set_iterations_not_rounded_up(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { +entry: + %cmp8 = icmp sgt i32 %N, 0 + +; Here, %v5 is used in set.loop.iterations. When emitted from the vectoriser, +; that value is usually rounded up to the next multiple of the VF, which means +; a bound can be put on the expression. Without that rounding, we can't, and +; should flag this as potentially overflowing behaviour.
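+;
+; A rough sketch of that bound, assuming VF = 4 as in the neighbouring tests:
+; a vectoriser-emitted iteration count normally has the shape
+;
+;   TripCount = ceil(ElementCount / VF) = (%N + 3) >> 2
+;
+; for which ceil(ElementCount / VectorWidth) >= TripCount holds by
+; construction, so the get.active.lane.mask below can safely be turned into a
+; vctp. %v5 = %N + 1 does not have that shape, so no such bound follows.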
+ + %v5 = add nuw nsw i32 %N, 1 + br i1 %cmp8, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %trip.count.minus.1 = add i32 %N, -1 + call void @llvm.set.loop.iterations.i32(i32 %v5) + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ] + %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %v6 = phi i32 [ %v5, %vector.ph ], [ %v8, %vector.body ] + %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* + %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* + %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>* + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %v7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask) + %index.next = add i32 %index, 4 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 + %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4 + %v8 = call i32 @llvm.loop.decrement.reg.i32(i32 %v6, i32 1) + %v9 = icmp ne i32 %v8, 0 + br i1 %v9, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) +declare void @llvm.set.loop.iterations.i32(i32) +declare i32 @llvm.loop.decrement.reg.i32(i32, i32) +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll index 5b2f3a7c98e8a..98d48d49539c5 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll @@ -10,7 +10,6 @@ define arm_aapcs_vfpcc void @usub_sat(i16* noalias nocapture readonly %pSrcA, i1 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB0_1: @ %vector.ph -; CHECK-NEXT: subs r3, #1 ; CHECK-NEXT: dlstp.16 lr, r3 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -36,7 +35,7 @@ vector.body: ; preds = %vector.body, %vecto %next.gep = getelementptr i16, i16* %pSrcA, i32 %index %next.gep20 = getelementptr i16, i16* %pDst, i32 %index %next.gep21 = getelementptr i16, i16* %pSrcB, i32 %index - %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %blockSize) %0 = bitcast i16* %next.gep to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) %1 = bitcast i16* %next.gep21 to <8 x i16>* @@ -61,7 +60,6 @@ define arm_aapcs_vfpcc void @ssub_sat(i16* noalias nocapture readonly %pSrcA, 
i1 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB1_1: @ %vector.ph -; CHECK-NEXT: subs r3, #1 ; CHECK-NEXT: dlstp.16 lr, r3 ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -87,7 +85,7 @@ vector.body: ; preds = %vector.body, %vecto %next.gep = getelementptr i16, i16* %pSrcA, i32 %index %next.gep20 = getelementptr i16, i16* %pDst, i32 %index %next.gep21 = getelementptr i16, i16* %pSrcB, i32 %index - %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1) + %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %blockSize) %0 = bitcast i16* %next.gep to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) %1 = bitcast i16* %next.gep21 to <8 x i16>* diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll similarity index 98% rename from llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll rename to llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll index 0c85e89133374..338c980eeb9b0 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll @@ -135,8 +135,9 @@ for.cond.cleanup: } ; The vector loop is not guarded with an entry check (N == 0). Check that -; despite this we can still calculate a precise enough range for the -; backedge count to safely insert a vctp here. +; despite this we can still calculate a precise enough range so that the +; overflow checks for get.active.lane.mask don't reject +; tail-predication. ; ; CHECK-LABEL: @reduction_not_guarded ; diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll new file mode 100644 index 0000000000000..222c2f036ca8b --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll @@ -0,0 +1,49 @@ +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -tail-predication=force-enabled-no-reductions -o - %s | FileCheck %s + +define arm_aapcs_vfpcc <16 x i8> @vcmp_vpst_combination(<16 x i8>* %pSrc, i16 zeroext %blockSize, i8* nocapture %pResult, i32* nocapture %pIndex) { +; CHECK-LABEL: vcmp_vpst_combination: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov.i8 q0, #0x7f +; CHECK-NEXT: dlstp.8 lr, r1 +; CHECK-NEXT: .LBB0_1: @ %do.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q1, [r0] +; CHECK-NEXT: vpt.s8 ge, q0, q1 +; CHECK-NEXT: vmovt q0, q1 +; CHECK-NEXT: letp lr, .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %do.end +; CHECK-NEXT: pop {r7, pc} +entry: + %conv = zext i16 %blockSize to i32 + %0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32 0, i32 1) + %1 = extractvalue { <16 x i8>, i32 } %0, 0 + br label %do.body + +do.body: ; preds = %do.body, %entry + %indexVec.0 = phi <16 x i8> [ %1, %entry ], [ %add, %do.body ] + %curExtremIdxVec.0 = phi <16 x i8> [ zeroinitializer, %entry ], [ %6, %do.body ] + %curExtremValVec.0 = phi <16 x i8> [ , %entry ], [ %6, %do.body ] + %blkCnt.0 = phi i32 [ %conv, %entry ], [ %sub2, %do.body ] + %2 = tail call <16 x i1> @llvm.arm.mve.vctp8(i32 %blkCnt.0) + %3 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %pSrc, i32 1, <16 x i1> %2, <16 x i8> zeroinitializer) + %4 =
icmp sle <16 x i8> %3, %curExtremValVec.0 + %5 = and <16 x i1> %4, %2 + %6 = tail call <16 x i8> @llvm.arm.mve.orr.predicated.v16i8.v16i1(<16 x i8> %3, <16 x i8> %3, <16 x i1> %5, <16 x i8> %curExtremValVec.0) + %add = add <16 x i8> %indexVec.0, + %sub2 = add nsw i32 %blkCnt.0, -16 + %cmp = icmp sgt i32 %blkCnt.0, 16 + br i1 %cmp, label %do.body, label %do.end + +do.end: ; preds = %do.body + ret <16 x i8> %6 +} + +declare { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32, i32) + +declare <16 x i1> @llvm.arm.mve.vctp8(i32) + +declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) + +declare <16 x i8> @llvm.arm.mve.orr.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir index cdc9d7e7be9c6..4f80869de3ccb 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir @@ -122,18 +122,28 @@ body: | ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 - ; CHECK: $lr = MVE_DLSTP_32 renamable $r2 + ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg + ; CHECK: dead $lr = t2DLS renamable $r12 + ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 + ; CHECK: liveins: $q1, $r0, $r1, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 - ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv17, align 2) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv1820, align 2) + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) + ; CHECK: $lr = tMOVr $r3, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0, $q1, $r2 ; CHECK: renamable $r0, dead $cpsr = tADDi3 
killed renamable $r2, 4, 14 /* CC::al */, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir index 2f1641516a0d9..6df9702ca01dc 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir @@ -118,16 +118,24 @@ body: | ; CHECK: bb.1.bb3: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: liveins: $r0, $r1, $r2, $r3 + ; CHECK: renamable $r12 = t2ADDri renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $lr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: $vpr = VMSR_P0 killed $r3, 14 /* CC::al */, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg ; CHECK: VSTR_P0_off killed renamable $vpr, $sp, 0, 14 /* CC::al */, $noreg :: (store 4 into %stack.0) ; CHECK: $r3 = tMOVr $r0, 14 /* CC::al */, $noreg - ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: bb.2.bb9: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $r0, $r1, $r3 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $vpr = VLDR_P0_off $sp, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.0) - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4) + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4) + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: MVE_VPST 4, implicit $vpr ; CHECK: renamable $vpr = MVE_VCMPi32r renamable $q0, $zr, 1, 1, killed renamable $vpr ; CHECK: renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) @@ -135,7 +143,7 @@ body: | ; CHECK: MVE_VPST 8, implicit $vpr ; CHECK: MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) ; CHECK: $r0 = tMOVr $r3, 14 /* CC::al */, $noreg - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.bb27: ; CHECK: $sp = tADDspi $sp, 1, 14 /* CC::al */, $noreg ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir index 60a578d81594f..74f1e05684449 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir @@ -215,17 +215,26 @@ body: | ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + 
; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg - ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r1 + ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q0, $r0, $r2, $r3 - ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg - ; CHECK: MVE_VPTv4s32r 4, renamable $q1, renamable $r2, 11, implicit-def $vpr + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr + ; CHECK: MVE_VPTv4s32r 2, renamable $q1, renamable $r2, 11, implicit-def $vpr ; CHECK: renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed renamable $vpr ; CHECK: renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.for.cond.cleanup: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: @@ -593,17 +602,26 @@ body: | ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg - ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r1 + ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q0, $r0, $r2, $r3 - ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg - ; CHECK: MVE_VPTv4s32r 12, renamable $q1, renamable $r2, 10, implicit-def $vpr + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr + ; CHECK: MVE_VPTv4s32r 14, renamable $q1, renamable $r2, 10, implicit-def $vpr ; CHECK: renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 13, 1, killed renamable $vpr + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r1, 2, killed renamable $vpr ; CHECK: renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 2, killed renamable $vpr - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* 
CC::al */, $noreg + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.for.cond.cleanup: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ; @@ -713,17 +731,26 @@ body: | ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg - ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r1 + ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q0, $r0, $r2, $r3 - ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg - ; CHECK: MVE_VPTv4s32r 4, renamable $q0, renamable $r2, 11, implicit-def $vpr + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr + ; CHECK: MVE_VPTv4s32r 2, renamable $q0, renamable $r2, 11, implicit-def $vpr ; CHECK: renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed renamable $vpr ; CHECK: renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.for.cond.cleanup: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir index 7578b429790be..23cdf73263b01 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir @@ -425,8 +425,13 @@ body: | ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg + ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $lr = MVE_WLSTP_32 $r2, %bb.1 + ; CHECK: $lr = t2WLS killed renamable $lr, %bb.1 ; CHECK: tB %bb.4, 14 /* CC::al */, $noreg ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) @@ -436,15 +441,18 @@ body: | ; CHECK: successors: %bb.3(0x04000000), %bb.2(0x7c000000) ; CHECK: liveins: 
$lr, $q1, $r0, $r1, $r2 ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 - ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, $noreg :: (load 16 from %ir.lsr.iv24, align 4) - ; CHECK: renamable $q2 = MVE_VLDRWU32 renamable $r1, 0, 0, $noreg :: (load 16 from %ir.lsr.iv1, align 4) + ; CHECK: renamable $vpr = MVE_VCTP32 $r2, 0, $noreg + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4) + ; CHECK: renamable $q2 = MVE_VLDRWU32 renamable $r1, 0, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) ; CHECK: $r3 = tMOVr $r2, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 ; CHECK: renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 16, 14 /* CC::al */, $noreg ; CHECK: renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed $r2, 4, 14 /* CC::al */, $noreg - ; CHECK: renamable $q1 = nsw MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $q1 = nsw MVE_VADDi32 killed renamable $q1, renamable $q0, 1, killed renamable $vpr, undef renamable $q1 + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: successors: %bb.4(0x80000000) ; CHECK: liveins: $q0, $q1, $r3 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir index e377b06fea9f8..d91556e3e70b9 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir @@ -133,21 +133,23 @@ body: | ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg ; CHECK: renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg + ; CHECK: dead $lr = t2DLS renamable $r3 ; CHECK: $r12 = tMOVr killed $r3, 14 /* CC::al */, $noreg ; CHECK: $r3 = tMOVr $r2, 14 /* CC::al */, $noreg - ; CHECK: dead $lr = MVE_DLSTP_32 renamable $r3 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $q1, $r0, $r1, $r2, $r3, $r12 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 - ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv17, align 2) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv1820, align 2) + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) ; CHECK: $lr = tMOVr $r12, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 ; CHECK: renamable $r12 = nsw t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg ; CHECK: 
renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 - ; CHECK: dead $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0, $q1, $r2, $r3 ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r2, 1, 14 /* CC::al */, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir index 05bfdbb2fc0f8..337816146e5f0 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir @@ -119,18 +119,28 @@ body: | ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 - ; CHECK: $lr = MVE_DLSTP_32 renamable $r2 + ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg + ; CHECK: dead $lr = t2DLS renamable $r12 + ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 + ; CHECK: liveins: $q1, $r0, $r1, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 - ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv17, align 2) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv1820, align 2) + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) + ; CHECK: $lr = tMOVr $r3, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0, $q1, $r2 ; CHECK: renamable $vpr = MVE_VCTP32 killed renamable $r2, 0, $noreg diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll index 116031cb895ff..2a5d32013d473 100644 --- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll +++ 
b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve %s -o - | FileCheck %s -define <4 x i32> @v4i32(i32 %index, i32 %BTC, <4 x i32> %V1, <4 x i32> %V2) { +define <4 x i32> @v4i32(i32 %index, i32 %TC, <4 x i32> %V1, <4 x i32> %V2) { ; CHECK-LABEL: v4i32: ; CHECK: @ %bb.0: ; CHECK-NEXT: adr.w r12, .LCPI0_0 @@ -28,12 +28,12 @@ define <4 x i32> @v4i32(i32 %index, i32 %BTC, <4 x i32> %V1, <4 x i32> %V2) { ; CHECK-NEXT: .long 1 @ 0x1 ; CHECK-NEXT: .long 2 @ 0x2 ; CHECK-NEXT: .long 3 @ 0x3 - %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %BTC) + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %TC) %select = select <4 x i1> %active.lane.mask, <4 x i32> %V1, <4 x i32> %V2 ret <4 x i32> %select } -define <7 x i32> @v7i32(i32 %index, i32 %BTC, <7 x i32> %V1, <7 x i32> %V2) { +define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) { ; CHECK-LABEL: v7i32: ; CHECK: @ %bb.0: ; CHECK-NEXT: adr r3, .LCPI1_0 @@ -105,12 +105,12 @@ define <7 x i32> @v7i32(i32 %index, i32 %BTC, <7 x i32> %V1, <7 x i32> %V2) { ; CHECK-NEXT: .long 5 @ 0x5 ; CHECK-NEXT: .long 6 @ 0x6 ; CHECK-NEXT: .zero 4 - %active.lane.mask = call <7 x i1> @llvm.get.active.lane.mask.v7i1.i32(i32 %index, i32 %BTC) + %active.lane.mask = call <7 x i1> @llvm.get.active.lane.mask.v7i1.i32(i32 %index, i32 %TC) %select = select <7 x i1> %active.lane.mask, <7 x i32> %V1, <7 x i32> %V2 ret <7 x i32> %select } -define <8 x i16> @v8i16(i32 %index, i32 %BTC, <8 x i16> %V1, <8 x i16> %V2) { +define <8 x i16> @v8i16(i32 %index, i32 %TC, <8 x i16> %V1, <8 x i16> %V2) { ; CHECK-LABEL: v8i16: ; CHECK: @ %bb.0: ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} @@ -189,12 +189,12 @@ define <8 x i16> @v8i16(i32 %index, i32 %BTC, <8 x i16> %V1, <8 x i16> %V2) { ; CHECK-NEXT: .long 5 @ 0x5 ; CHECK-NEXT: .long 6 @ 0x6 ; CHECK-NEXT: .long 7 @ 0x7 - %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %BTC) + %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %TC) %select = select <8 x i1> %active.lane.mask, <8 x i16> %V1, <8 x i16> %V2 ret <8 x i16> %select } -define <16 x i8> @v16i8(i32 %index, i32 %BTC, <16 x i8> %V1, <16 x i8> %V2) { +define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) { ; CHECK-LABEL: v16i8: ; CHECK: @ %bb.0: ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} @@ -405,7 +405,7 @@ define <16 x i8> @v16i8(i32 %index, i32 %BTC, <16 x i8> %V1, <16 x i8> %V2) { ; CHECK-NEXT: .long 13 @ 0xd ; CHECK-NEXT: .long 14 @ 0xe ; CHECK-NEXT: .long 15 @ 0xf - %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %BTC) + %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %TC) %select = select <16 x i1> %active.lane.mask, <16 x i8> %V1, <16 x i8> %V2 ret <16 x i8> %select } diff --git a/llvm/test/CodeGen/Thumb2/mve-abs.ll b/llvm/test/CodeGen/Thumb2/mve-abs.ll index 0b5dcbced1a56..8a9b8814ef2ec 100644 --- a/llvm/test/CodeGen/Thumb2/mve-abs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-abs.ll @@ -40,33 +40,24 @@ entry: define arm_aapcs_vfpcc <2 x i64> @abs_v2i64(<2 x i64> %s1) { ; CHECK-LABEL: abs_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: vmov r0, s1 -; 
CHECK-NEXT: rsbs.w lr, r1, #0 -; CHECK-NEXT: sbc.w r2, r12, r0 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: cset r3, mi -; CHECK-NEXT: ands r3, r3, #1 -; CHECK-NEXT: csel r1, lr, r1, ne -; CHECK-NEXT: csel r0, r2, r0, ne -; CHECK-NEXT: vmov.32 q1[0], r1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: adds.w r1, r1, r0, asr #31 +; CHECK-NEXT: adc.w r2, r0, r0, asr #31 +; CHECK-NEXT: eor.w r2, r2, r0, asr #31 +; CHECK-NEXT: eor.w r0, r1, r0, asr #31 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: rsbs r2, r1, #0 -; CHECK-NEXT: sbc.w r12, r12, r0 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: cset r3, mi -; CHECK-NEXT: ands r3, r3, #1 -; CHECK-NEXT: csel r1, r2, r1, ne -; CHECK-NEXT: csel r0, r12, r0, ne +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: adds.w r1, r1, r0, asr #31 +; CHECK-NEXT: eor.w r1, r1, r0, asr #31 ; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: adc.w r1, r0, r0, asr #31 +; CHECK-NEXT: eor.w r0, r1, r0, asr #31 ; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %0 = icmp slt <2 x i64> %s1, zeroinitializer %1 = sub nsw <2 x i64> zeroinitializer, %s1 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.mir b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.mir index 5fc89549ec923..77ca49378e63d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.mir +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.mir @@ -33,6 +33,43 @@ define i32* @addUseDom(i32* %x) { unreachable } define i32* @addUseKilled(i32* %x) { unreachable } + define i32* @MVE_VLDRWU32_post(i32* %x) { unreachable } + define i32* @MVE_VLDRHU16_post(i32* %x) { unreachable } + define i32* @MVE_VLDRBU8_post(i32* %x) { unreachable } + define i32* @MVE_VLDRBS32_post(i32* %x) { unreachable } + define i32* @MVE_VLDRBU32_post(i32* %x) { unreachable } + define i32* @MVE_VLDRHS32_post(i32* %x) { unreachable } + define i32* @MVE_VLDRHU32_post(i32* %x) { unreachable } + define i32* @MVE_VLDRBS16_post(i32* %x) { unreachable } + define i32* @MVE_VLDRBU16_post(i32* %x) { unreachable } + define i32* @MVE_VSTRWU32_post(i32* %x, <4 x i32> %y) { unreachable } + define i32* @MVE_VSTRHU16_post(i32* %x, <4 x i32> %y) { unreachable } + define i32* @MVE_VSTRBU8_post(i32* %x, <4 x i32> %y) { unreachable } + define i32* @MVE_VSTRH32_post(i32* %x, <4 x i32> %y) { unreachable } + define i32* @MVE_VSTRB32_post(i32* %x, <4 x i32> %y) { unreachable } + define i32* @MVE_VSTRB16_post(i32* %x, <4 x i32> %y) { unreachable } + define i32* @MVE_VLDRWU32_pre(i32* %x) { unreachable } + define i32* @MVE_VLDRHU16_pre(i32* %x) { unreachable } + define i32* @MVE_VLDRBU8_pre(i32* %x) { unreachable } + define i32* @MVE_VLDRBS32_pre(i32* %x) { unreachable } + define i32* @MVE_VLDRBU32_pre(i32* %x) { unreachable } + define i32* @MVE_VLDRHS32_pre(i32* %x) { unreachable } + define i32* @MVE_VLDRHU32_pre(i32* %x) { unreachable } + define i32* @MVE_VLDRBS16_pre(i32* %x) { unreachable } + define i32* @MVE_VLDRBU16_pre(i32* %x) { unreachable } + define i32* @MVE_VSTRWU32_pre(i32* %x, <4 x i32> %y) { unreachable } + define i32* @MVE_VSTRHU16_pre(i32* %x, <4 x i32> %y) { unreachable } + define i32* @MVE_VSTRBU8_pre(i32* %x, <4 x i32> %y) { unreachable } + define i32* @MVE_VSTRH32_pre(i32* %x, <4 x i32> %y) { unreachable } + define i32* @MVE_VSTRB32_pre(i32* %x, <4 x i32> %y) { unreachable } + define i32* @MVE_VSTRB16_pre(i32* %x, <4 x i32> %y) { unreachable } + + define i32* @multiple2(i32* %x) 
{ unreachable } + define i32* @multiple3(i32* %x) { unreachable } + define i32* @multiple4(i32* %x) { unreachable } + define i32* @badScale2(i32* %x) { unreachable } + define i32* @badRange2(i32* %x) { unreachable } + ... --- name: MVE_VLDRWU32 @@ -864,3 +901,1027 @@ body: | tBX_RET 14, $noreg, implicit $r0 ... +--- +name: MVE_VLDRWU32_post +tracksRegLiveness: true +registers: + - { id: 0, class: gprnopc, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: rgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VLDRWU32_post + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0 + ; CHECK: [[MVE_VLDRWU32_post:%[0-9]+]]:rgpr, [[MVE_VLDRWU32_post1:%[0-9]+]]:mqpr = MVE_VLDRWU32_post [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRWU32_:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[MVE_VLDRWU32_post]], -16, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRWU32_post]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:gprnopc = COPY $r0 + %2:rgpr, %1:mqpr = MVE_VLDRWU32_post %0, 32, 0, $noreg :: (load 16, align 8) + %1:mqpr = MVE_VLDRWU32 %0, 16, 0, $noreg :: (load 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VLDRHU16_post +tracksRegLiveness: true +registers: + - { id: 0, class: gprnopc, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: rgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VLDRHU16_post + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0 + ; CHECK: [[MVE_VLDRHU16_post:%[0-9]+]]:rgpr, [[MVE_VLDRHU16_post1:%[0-9]+]]:mqpr = MVE_VLDRHU16_post [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRHU16_:%[0-9]+]]:mqpr = MVE_VLDRHU16 [[MVE_VLDRHU16_post]], -16, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRHU16_post]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:gprnopc = COPY $r0 + %2:rgpr, %1:mqpr = MVE_VLDRHU16_post %0, 32, 0, $noreg :: (load 16, align 8) + %1:mqpr = MVE_VLDRHU16 %0, 16, 0, $noreg :: (load 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VLDRBU8_post +tracksRegLiveness: true +registers: + - { id: 0, class: gprnopc, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: rgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VLDRBU8_post + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0 + ; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRBU8_:%[0-9]+]]:mqpr = MVE_VLDRBU8 [[MVE_VLDRBU8_post]], -16, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRBU8_post]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:gprnopc = COPY $r0 + %2:rgpr, %1:mqpr = MVE_VLDRBU8_post %0, 32, 0, $noreg :: (load 16, align 8) + %1:mqpr = MVE_VLDRBU8 %0, 16, 0, $noreg :: (load 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... 
+--- +name: MVE_VLDRBS32_post +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VLDRBS32_post + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VLDRBS32_post:%[0-9]+]]:tgpr, [[MVE_VLDRBS32_post1:%[0-9]+]]:mqpr = MVE_VLDRBS32_post [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRBS32_:%[0-9]+]]:mqpr = MVE_VLDRBS32 [[MVE_VLDRBS32_post]], -16, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRBS32_post]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:tgpr = COPY $r0 + %2:tgpr, %1:mqpr = MVE_VLDRBS32_post %0, 32, 0, $noreg :: (load 16, align 8) + %1:mqpr = MVE_VLDRBS32 %0, 16, 0, $noreg :: (load 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VLDRBU32_post +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VLDRBU32_post + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VLDRBU32_post:%[0-9]+]]:tgpr, [[MVE_VLDRBU32_post1:%[0-9]+]]:mqpr = MVE_VLDRBU32_post [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRBU32_:%[0-9]+]]:mqpr = MVE_VLDRBU32 [[MVE_VLDRBU32_post]], -16, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRBU32_post]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:tgpr = COPY $r0 + %2:tgpr, %1:mqpr = MVE_VLDRBU32_post %0, 32, 0, $noreg :: (load 16, align 8) + %1:mqpr = MVE_VLDRBU32 %0, 16, 0, $noreg :: (load 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VLDRHS32_post +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VLDRHS32_post + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VLDRHS32_post:%[0-9]+]]:tgpr, [[MVE_VLDRHS32_post1:%[0-9]+]]:mqpr = MVE_VLDRHS32_post [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRHS32_:%[0-9]+]]:mqpr = MVE_VLDRHS32 [[MVE_VLDRHS32_post]], -16, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRHS32_post]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:tgpr = COPY $r0 + %2:tgpr, %1:mqpr = MVE_VLDRHS32_post %0, 32, 0, $noreg :: (load 16, align 8) + %1:mqpr = MVE_VLDRHS32 %0, 16, 0, $noreg :: (load 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... 
+--- +name: MVE_VLDRHU32_post +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VLDRHU32_post + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VLDRHU32_post:%[0-9]+]]:tgpr, [[MVE_VLDRHU32_post1:%[0-9]+]]:mqpr = MVE_VLDRHU32_post [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRHU32_:%[0-9]+]]:mqpr = MVE_VLDRHU32 [[MVE_VLDRHU32_post]], -16, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRHU32_post]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:tgpr = COPY $r0 + %2:tgpr, %1:mqpr = MVE_VLDRHU32_post %0, 32, 0, $noreg :: (load 16, align 8) + %1:mqpr = MVE_VLDRHU32 %0, 16, 0, $noreg :: (load 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VLDRBS16_post +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VLDRBS16_post + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VLDRBS16_post:%[0-9]+]]:tgpr, [[MVE_VLDRBS16_post1:%[0-9]+]]:mqpr = MVE_VLDRBS16_post [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRBS16_:%[0-9]+]]:mqpr = MVE_VLDRBS16 [[MVE_VLDRBS16_post]], -16, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRBS16_post]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:tgpr = COPY $r0 + %2:tgpr, %1:mqpr = MVE_VLDRBS16_post %0, 32, 0, $noreg :: (load 16, align 8) + %1:mqpr = MVE_VLDRBS16 %0, 16, 0, $noreg :: (load 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VLDRBU16_post +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VLDRBU16_post + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VLDRBU16_post:%[0-9]+]]:tgpr, [[MVE_VLDRBU16_post1:%[0-9]+]]:mqpr = MVE_VLDRBU16_post [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRBU16_:%[0-9]+]]:mqpr = MVE_VLDRBU16 [[MVE_VLDRBU16_post]], -16, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRBU16_post]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:tgpr = COPY $r0 + %2:tgpr, %1:mqpr = MVE_VLDRBU16_post %0, 32, 0, $noreg :: (load 16, align 8) + %1:mqpr = MVE_VLDRBU16 %0, 16, 0, $noreg :: (load 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... 
+--- +name: MVE_VSTRWU32_post +tracksRegLiveness: true +registers: + - { id: 0, class: rgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: rgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VSTRWU32_post + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: [[MVE_VSTRWU32_post:%[0-9]+]]:rgpr = MVE_VSTRWU32_post [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 16, align 8) + ; CHECK: MVE_VSTRWU32 [[COPY]], [[MVE_VSTRWU32_post]], -16, 0, $noreg :: (store 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRWU32_post]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %1:mqpr = COPY $q0 + %0:rgpr = COPY $r0 + %2:rgpr = MVE_VSTRWU32_post %1, %0, 32, 0, $noreg :: (store 16, align 8) + MVE_VSTRWU32 %1, %0, 16, 0, $noreg :: (store 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VSTRHU16_post +tracksRegLiveness: true +registers: + - { id: 0, class: rgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: rgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VSTRHU16_post + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: [[MVE_VSTRHU16_post:%[0-9]+]]:rgpr = MVE_VSTRHU16_post [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 16, align 8) + ; CHECK: MVE_VSTRHU16 [[COPY]], [[MVE_VSTRHU16_post]], -16, 0, $noreg :: (store 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRHU16_post]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %1:mqpr = COPY $q0 + %0:rgpr = COPY $r0 + %2:rgpr = MVE_VSTRHU16_post %1, %0, 32, 0, $noreg :: (store 16, align 8) + MVE_VSTRHU16 %1, %0, 16, 0, $noreg :: (store 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VSTRBU8_post +tracksRegLiveness: true +registers: + - { id: 0, class: rgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: rgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VSTRBU8_post + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 16, align 8) + ; CHECK: MVE_VSTRBU8 [[COPY]], [[MVE_VSTRBU8_post]], -16, 0, $noreg :: (store 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRBU8_post]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %1:mqpr = COPY $q0 + %0:rgpr = COPY $r0 + %2:rgpr = MVE_VSTRBU8_post %1, %0, 32, 0, $noreg :: (store 16, align 8) + MVE_VSTRBU8 %1, %0, 16, 0, $noreg :: (store 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... 
+--- +name: MVE_VSTRH32_post +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VSTRH32_post + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VSTRH32_post:%[0-9]+]]:tgpr = MVE_VSTRH32_post [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 16, align 8) + ; CHECK: MVE_VSTRH32 [[COPY]], [[MVE_VSTRH32_post]], -16, 0, $noreg :: (store 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRH32_post]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %1:mqpr = COPY $q0 + %0:tgpr = COPY $r0 + %2:tgpr = MVE_VSTRH32_post %1, %0, 32, 0, $noreg :: (store 16, align 8) + MVE_VSTRH32 %1, %0, 16, 0, $noreg :: (store 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VSTRB32_post +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VSTRB32_post + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VSTRB32_post:%[0-9]+]]:tgpr = MVE_VSTRB32_post [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 16, align 8) + ; CHECK: MVE_VSTRB32 [[COPY]], [[MVE_VSTRB32_post]], -16, 0, $noreg :: (store 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRB32_post]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %1:mqpr = COPY $q0 + %0:tgpr = COPY $r0 + %2:tgpr = MVE_VSTRB32_post %1, %0, 32, 0, $noreg :: (store 16, align 8) + MVE_VSTRB32 %1, %0, 16, 0, $noreg :: (store 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VSTRB16_post +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VSTRB16_post + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VSTRB16_post:%[0-9]+]]:tgpr = MVE_VSTRB16_post [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 16, align 8) + ; CHECK: MVE_VSTRB16 [[COPY]], [[MVE_VSTRB16_post]], -16, 0, $noreg :: (store 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRB16_post]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %1:mqpr = COPY $q0 + %0:tgpr = COPY $r0 + %2:tgpr = MVE_VSTRB16_post %1, %0, 32, 0, $noreg :: (store 16, align 8) + MVE_VSTRB16 %1, %0, 16, 0, $noreg :: (store 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... 
+--- +name: MVE_VLDRWU32_pre +tracksRegLiveness: true +registers: + - { id: 0, class: rgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: rgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VLDRWU32_pre + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: [[MVE_VLDRWU32_pre:%[0-9]+]]:rgpr, [[MVE_VLDRWU32_pre1:%[0-9]+]]:mqpr = MVE_VLDRWU32_pre [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRWU32_:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[MVE_VLDRWU32_pre]], -16, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRWU32_pre]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:rgpr = COPY $r0 + %2:rgpr, %1:mqpr = MVE_VLDRWU32_pre %0, 32, 0, $noreg :: (load 16, align 8) + %1:mqpr = MVE_VLDRWU32 %0, 16, 0, $noreg :: (load 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VLDRHU16_pre +tracksRegLiveness: true +registers: + - { id: 0, class: rgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: rgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VLDRHU16_pre + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: [[MVE_VLDRHU16_pre:%[0-9]+]]:rgpr, [[MVE_VLDRHU16_pre1:%[0-9]+]]:mqpr = MVE_VLDRHU16_pre [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRHU16_:%[0-9]+]]:mqpr = MVE_VLDRHU16 [[MVE_VLDRHU16_pre]], -16, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRHU16_pre]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:rgpr = COPY $r0 + %2:rgpr, %1:mqpr = MVE_VLDRHU16_pre %0, 32, 0, $noreg :: (load 16, align 8) + %1:mqpr = MVE_VLDRHU16 %0, 16, 0, $noreg :: (load 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VLDRBU8_pre +tracksRegLiveness: true +registers: + - { id: 0, class: rgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: rgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VLDRBU8_pre + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: [[MVE_VLDRBU8_pre:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_pre1:%[0-9]+]]:mqpr = MVE_VLDRBU8_pre [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRBU8_:%[0-9]+]]:mqpr = MVE_VLDRBU8 [[MVE_VLDRBU8_pre]], -16, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRBU8_pre]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:rgpr = COPY $r0 + %2:rgpr, %1:mqpr = MVE_VLDRBU8_pre %0, 32, 0, $noreg :: (load 16, align 8) + %1:mqpr = MVE_VLDRBU8 %0, 16, 0, $noreg :: (load 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... 
+--- +name: MVE_VLDRBS32_pre +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VLDRBS32_pre + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VLDRBS32_pre:%[0-9]+]]:tgpr, [[MVE_VLDRBS32_pre1:%[0-9]+]]:mqpr = MVE_VLDRBS32_pre [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRBS32_:%[0-9]+]]:mqpr = MVE_VLDRBS32 [[MVE_VLDRBS32_pre]], -16, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRBS32_pre]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:tgpr = COPY $r0 + %2:tgpr, %1:mqpr = MVE_VLDRBS32_pre %0, 32, 0, $noreg :: (load 16, align 8) + %1:mqpr = MVE_VLDRBS32 %0, 16, 0, $noreg :: (load 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VLDRBU32_pre +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VLDRBU32_pre + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VLDRBU32_pre:%[0-9]+]]:tgpr, [[MVE_VLDRBU32_pre1:%[0-9]+]]:mqpr = MVE_VLDRBU32_pre [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRBU32_:%[0-9]+]]:mqpr = MVE_VLDRBU32 [[MVE_VLDRBU32_pre]], -16, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRBU32_pre]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:tgpr = COPY $r0 + %2:tgpr, %1:mqpr = MVE_VLDRBU32_pre %0, 32, 0, $noreg :: (load 16, align 8) + %1:mqpr = MVE_VLDRBU32 %0, 16, 0, $noreg :: (load 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VLDRHS32_pre +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VLDRHS32_pre + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VLDRHS32_pre:%[0-9]+]]:tgpr, [[MVE_VLDRHS32_pre1:%[0-9]+]]:mqpr = MVE_VLDRHS32_pre [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRHS32_:%[0-9]+]]:mqpr = MVE_VLDRHS32 [[MVE_VLDRHS32_pre]], -16, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRHS32_pre]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:tgpr = COPY $r0 + %2:tgpr, %1:mqpr = MVE_VLDRHS32_pre %0, 32, 0, $noreg :: (load 16, align 8) + %1:mqpr = MVE_VLDRHS32 %0, 16, 0, $noreg :: (load 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... 
+--- +name: MVE_VLDRHU32_pre +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VLDRHU32_pre + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VLDRHU32_pre:%[0-9]+]]:tgpr, [[MVE_VLDRHU32_pre1:%[0-9]+]]:mqpr = MVE_VLDRHU32_pre [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRHU32_:%[0-9]+]]:mqpr = MVE_VLDRHU32 [[MVE_VLDRHU32_pre]], -16, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRHU32_pre]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:tgpr = COPY $r0 + %2:tgpr, %1:mqpr = MVE_VLDRHU32_pre %0, 32, 0, $noreg :: (load 16, align 8) + %1:mqpr = MVE_VLDRHU32 %0, 16, 0, $noreg :: (load 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VLDRBS16_pre +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VLDRBS16_pre + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VLDRBS16_pre:%[0-9]+]]:tgpr, [[MVE_VLDRBS16_pre1:%[0-9]+]]:mqpr = MVE_VLDRBS16_pre [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRBS16_:%[0-9]+]]:mqpr = MVE_VLDRBS16 [[MVE_VLDRBS16_pre]], -16, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRBS16_pre]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:tgpr = COPY $r0 + %2:tgpr, %1:mqpr = MVE_VLDRBS16_pre %0, 32, 0, $noreg :: (load 16, align 8) + %1:mqpr = MVE_VLDRBS16 %0, 16, 0, $noreg :: (load 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VLDRBU16_pre +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VLDRBU16_pre + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VLDRBU16_pre:%[0-9]+]]:tgpr, [[MVE_VLDRBU16_pre1:%[0-9]+]]:mqpr = MVE_VLDRBU16_pre [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRBU16_:%[0-9]+]]:mqpr = MVE_VLDRBU16 [[MVE_VLDRBU16_pre]], -16, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRBU16_pre]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:tgpr = COPY $r0 + %2:tgpr, %1:mqpr = MVE_VLDRBU16_pre %0, 32, 0, $noreg :: (load 16, align 8) + %1:mqpr = MVE_VLDRBU16 %0, 16, 0, $noreg :: (load 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... 
+--- +name: MVE_VSTRWU32_pre +tracksRegLiveness: true +registers: + - { id: 0, class: rgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: rgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VSTRWU32_pre + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: [[MVE_VSTRWU32_pre:%[0-9]+]]:rgpr = MVE_VSTRWU32_pre [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 16, align 8) + ; CHECK: MVE_VSTRWU32 [[COPY]], [[MVE_VSTRWU32_pre]], -16, 0, $noreg :: (store 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRWU32_pre]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %1:mqpr = COPY $q0 + %0:rgpr = COPY $r0 + %2:rgpr = MVE_VSTRWU32_pre %1, %0, 32, 0, $noreg :: (store 16, align 8) + MVE_VSTRWU32 %1, %0, 16, 0, $noreg :: (store 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VSTRHU16_pre +tracksRegLiveness: true +registers: + - { id: 0, class: rgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: rgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VSTRHU16_pre + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: [[MVE_VSTRHU16_pre:%[0-9]+]]:rgpr = MVE_VSTRHU16_pre [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 16, align 8) + ; CHECK: MVE_VSTRHU16 [[COPY]], [[MVE_VSTRHU16_pre]], -16, 0, $noreg :: (store 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRHU16_pre]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %1:mqpr = COPY $q0 + %0:rgpr = COPY $r0 + %2:rgpr = MVE_VSTRHU16_pre %1, %0, 32, 0, $noreg :: (store 16, align 8) + MVE_VSTRHU16 %1, %0, 16, 0, $noreg :: (store 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VSTRBU8_pre +tracksRegLiveness: true +registers: + - { id: 0, class: rgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: rgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VSTRBU8_pre + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: [[MVE_VSTRBU8_pre:%[0-9]+]]:rgpr = MVE_VSTRBU8_pre [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 16, align 8) + ; CHECK: MVE_VSTRBU8 [[COPY]], [[MVE_VSTRBU8_pre]], -16, 0, $noreg :: (store 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRBU8_pre]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %1:mqpr = COPY $q0 + %0:rgpr = COPY $r0 + %2:rgpr = MVE_VSTRBU8_pre %1, %0, 32, 0, $noreg :: (store 16, align 8) + MVE_VSTRBU8 %1, %0, 16, 0, $noreg :: (store 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... 
+--- +name: MVE_VSTRH32_pre +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VSTRH32_pre + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VSTRH32_pre:%[0-9]+]]:tgpr = MVE_VSTRH32_pre [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 16, align 8) + ; CHECK: MVE_VSTRH32 [[COPY]], [[MVE_VSTRH32_pre]], -16, 0, $noreg :: (store 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRH32_pre]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %1:mqpr = COPY $q0 + %0:tgpr = COPY $r0 + %2:tgpr = MVE_VSTRH32_pre %1, %0, 32, 0, $noreg :: (store 16, align 8) + MVE_VSTRH32 %1, %0, 16, 0, $noreg :: (store 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VSTRB32_pre +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VSTRB32_pre + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VSTRB32_pre:%[0-9]+]]:tgpr = MVE_VSTRB32_pre [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 16, align 8) + ; CHECK: MVE_VSTRB32 [[COPY]], [[MVE_VSTRB32_pre]], -16, 0, $noreg :: (store 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRB32_pre]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %1:mqpr = COPY $q0 + %0:tgpr = COPY $r0 + %2:tgpr = MVE_VSTRB32_pre %1, %0, 32, 0, $noreg :: (store 16, align 8) + MVE_VSTRB32 %1, %0, 16, 0, $noreg :: (store 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: MVE_VSTRB16_pre +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: MVE_VSTRB16_pre + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VSTRB16_pre:%[0-9]+]]:tgpr = MVE_VSTRB16_pre [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 16, align 8) + ; CHECK: MVE_VSTRB16 [[COPY]], [[MVE_VSTRB16_pre]], -16, 0, $noreg :: (store 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRB16_pre]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %1:mqpr = COPY $q0 + %0:tgpr = COPY $r0 + %2:tgpr = MVE_VSTRB16_pre %1, %0, 32, 0, $noreg :: (store 16, align 8) + MVE_VSTRB16 %1, %0, 16, 0, $noreg :: (store 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... 
+--- +name: multiple2 +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: multiple2 + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VSTRB16_pre:%[0-9]+]]:tgpr = MVE_VSTRB16_pre [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 16, align 8) + ; CHECK: MVE_VSTRB16 [[COPY]], [[MVE_VSTRB16_pre]], -16, 0, $noreg :: (store 16, align 8) + ; CHECK: MVE_VSTRB16 [[COPY]], [[MVE_VSTRB16_pre]], -48, 0, $noreg :: (store 16, align 8) + ; CHECK: MVE_VSTRB16 [[COPY]], [[MVE_VSTRB16_pre]], 2, 0, $noreg :: (store 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRB16_pre]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %1:mqpr = COPY $q0 + %0:tgpr = COPY $r0 + %2:tgpr = MVE_VSTRB16_pre %1, %0, 32, 0, $noreg :: (store 16, align 8) + MVE_VSTRB16 %1, %0, 16, 0, $noreg :: (store 16, align 8) + MVE_VSTRB16 %1, %0, -16, 0, $noreg :: (store 16, align 8) + MVE_VSTRB16 %1, %0, 34, 0, $noreg :: (store 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: multiple3 +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } + - { id: 3, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: multiple3 + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VSTRB16_pre:%[0-9]+]]:tgpr = MVE_VSTRB16_pre [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 16, align 8) + ; CHECK: [[MVE_VSTRB16_pre1:%[0-9]+]]:tgpr = MVE_VSTRB16_pre [[COPY]], [[COPY1]], 64, 0, $noreg :: (store 16, align 8) + ; CHECK: MVE_VSTRB16 [[COPY]], [[MVE_VSTRB16_pre1]], -48, 0, $noreg :: (store 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRB16_pre1]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %1:mqpr = COPY $q0 + %0:tgpr = COPY $r0 + %2:tgpr = MVE_VSTRB16_pre %1, %0, 32, 0, $noreg :: (store 16, align 8) + %3:tgpr = MVE_VSTRB16_pre %1, %0, 64, 0, $noreg :: (store 16, align 8) + MVE_VSTRB16 %1, %0, 16, 0, $noreg :: (store 16, align 8) + $r0 = COPY %3 + tBX_RET 14, $noreg, implicit $r0 + +... 
+--- +name: multiple4 +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } + - { id: 3, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: multiple4 + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VSTRB16_pre:%[0-9]+]]:tgpr = MVE_VSTRB16_pre [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 16, align 8) + ; CHECK: MVE_VSTRB16 [[COPY]], [[COPY1]], 0, 0, $noreg :: (store 16, align 8) + ; CHECK: [[t2ADDri:%[0-9]+]]:tgpr = nuw t2ADDri [[COPY1]], 32, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %1:mqpr = COPY $q0 + %0:tgpr = COPY $r0 + %2:tgpr = MVE_VSTRB16_pre %1, %0, 32, 0, $noreg :: (store 16, align 8) + MVE_VSTRB16 %1, %0, 0, 0, $noreg :: (store 16, align 8) + %3:tgpr = nuw t2ADDri %0, 32, 14, $noreg, $noreg + $r0 = COPY %3 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: badScale2 +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: badScale2 + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VSTRBU8_pre:%[0-9]+]]:tgpr = MVE_VSTRBU8_pre [[COPY]], [[COPY1]], 33, 0, $noreg :: (store 16, align 8) + ; CHECK: MVE_VSTRWU32 [[COPY]], [[COPY1]], 0, 0, $noreg :: (store 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRBU8_pre]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %1:mqpr = COPY $q0 + %0:tgpr = COPY $r0 + %2:tgpr = MVE_VSTRBU8_pre %1, %0, 33, 0, $noreg :: (store 16, align 8) + MVE_VSTRWU32 %1, %0, 0, 0, $noreg :: (store 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: badRange2 +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr, preferred-register: '' } + - { id: 1, class: mqpr, preferred-register: '' } + - { id: 2, class: tgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$q0', virtual-reg: '%1' } +body: | + bb.0: + liveins: $r0, $q0 + + ; CHECK-LABEL: name: badRange2 + ; CHECK: liveins: $r0, $q0 + ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:tgpr = COPY $r0 + ; CHECK: [[MVE_VSTRB16_pre:%[0-9]+]]:tgpr = MVE_VSTRB16_pre [[COPY]], [[COPY1]], 100, 0, $noreg :: (store 16, align 8) + ; CHECK: MVE_VSTRB16 [[COPY]], [[COPY1]], -100, 0, $noreg :: (store 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRB16_pre]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %1:mqpr = COPY $q0 + %0:tgpr = COPY $r0 + %2:tgpr = MVE_VSTRB16_pre %1, %0, 100, 0, $noreg :: (store 16, align 8) + MVE_VSTRB16 %1, %0, -100, 0, $noreg :: (store 16, align 8) + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... 
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll index 646124e0cf983..0f3e893fd8017 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -683,84 +683,86 @@ define i8* @signext(i8* %input_row, i8* %input_col, i16 zeroext %output_ch, i16 ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #20 -; CHECK-NEXT: sub sp, #20 +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: add.w r12, sp, #12 ; CHECK-NEXT: cmp r3, #4 -; CHECK-NEXT: strd r0, r1, [sp, #12] @ 8-byte Folded Spill +; CHECK-NEXT: stm.w r12, {r0, r1, r2} @ 12-byte Folded Spill ; CHECK-NEXT: bne .LBB5_8 ; CHECK-NEXT: @ %bb.1: @ %for.cond.preheader -; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: beq .LBB5_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph -; CHECK-NEXT: ldr r7, [sp, #84] -; CHECK-NEXT: mov.w r11, #0 -; CHECK-NEXT: ldr r3, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr r0, [sp, #68] -; CHECK-NEXT: add.w r1, r3, r7, lsl #1 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: adds r1, r3, r7 -; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: add.w r1, r7, r7, lsl #1 -; CHECK-NEXT: vdup.16 q0, r0 -; CHECK-NEXT: adds r0, r3, r1 +; CHECK-NEXT: ldr r2, [sp, #88] +; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r4, [sp, #72] +; CHECK-NEXT: add.w r0, r1, r2, lsl #1 +; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: adds r0, r1, r2 +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w r0, r2, r2, lsl #1 +; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-NEXT: adds r0, r7, #7 -; CHECK-NEXT: lsr.w r9, r0, #3 +; CHECK-NEXT: adds r0, r2, #7 +; CHECK-NEXT: lsrs r2, r0, #3 ; CHECK-NEXT: b .LBB5_5 ; CHECK-NEXT: .LBB5_3: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: .LBB5_4: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #92] -; CHECK-NEXT: add.w r0, r8, r10 +; CHECK-NEXT: add.w r0, r10, r8 +; CHECK-NEXT: ldr r1, [sp, #96] ; CHECK-NEXT: add r0, r6 ; CHECK-NEXT: add r0, r12 -; CHECK-NEXT: strb.w r0, [r1, r11] -; CHECK-NEXT: add.w r11, r11, #1 -; CHECK-NEXT: cmp r11, r2 +; CHECK-NEXT: strb.w r0, [r1, r9] +; CHECK-NEXT: add.w r9, r9, #1 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: cmp r9, r0 ; CHECK-NEXT: beq .LBB5_8 ; CHECK-NEXT: .LBB5_5: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB5_7 Depth 2 -; CHECK-NEXT: ldr r0, [sp, #88] -; CHECK-NEXT: subs.w lr, r9, r9 -; CHECK-NEXT: ldr.w r12, [r0, r11, lsl #2] +; CHECK-NEXT: ldr r0, [sp, #92] +; CHECK-NEXT: subs.w lr, r2, r2 +; CHECK-NEXT: ldr.w r12, [r0, r9, lsl #2] ; CHECK-NEXT: ble .LBB5_3 ; CHECK-NEXT: @ %bb.6: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: ldr r3, [sp, #84] +; CHECK-NEXT: ldr.w r11, [sp, #88] ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: dlstp.16 lr, r3 +; CHECK-NEXT: dlstp.16 lr, r11 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: mla r5, r11, r3, r0 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: ldrd r4, r7, [sp] @ 
8-byte Folded Reload ; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mla r3, r9, r11, r0 +; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldrd r7, r0, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: mov r8, r12 ; CHECK-NEXT: .LBB5_7: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB5_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrb.s16 q1, [r4], #8 -; CHECK-NEXT: vadd.i16 q2, q1, q0 +; CHECK-NEXT: vldrb.s16 q0, [r7], #8 +; CHECK-NEXT: vadd.i16 q1, q0, r4 +; CHECK-NEXT: vldrb.s16 q0, [r3], #8 +; CHECK-NEXT: vmlava.s16 r12, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r5], #8 -; CHECK-NEXT: vmlava.s16 r12, q1, q2 -; CHECK-NEXT: vldrb.s16 q2, [r0], #8 -; CHECK-NEXT: vadd.i16 q2, q2, q0 -; CHECK-NEXT: vmlava.s16 r6, q1, q2 -; CHECK-NEXT: vldrb.s16 q2, [r7], #8 -; CHECK-NEXT: vadd.i16 q2, q2, q0 -; CHECK-NEXT: vmlava.s16 r8, q1, q2 -; CHECK-NEXT: vldrb.s16 q2, [r1], #8 -; CHECK-NEXT: vadd.i16 q2, q2, q0 -; CHECK-NEXT: vmlava.s16 r10, q1, q2 +; CHECK-NEXT: vadd.i16 q1, q1, r4 +; CHECK-NEXT: vmlava.s16 r6, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r0], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r4 +; CHECK-NEXT: vmlava.s16 r10, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r1], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r4 +; CHECK-NEXT: vmlava.s16 r8, q0, q1 ; CHECK-NEXT: letp lr, .LBB5_7 ; CHECK-NEXT: b .LBB5_4 ; CHECK-NEXT: .LBB5_8: @ %if.end -; CHECK-NEXT: ldr r0, [sp, #92] -; CHECK-NEXT: add sp, #20 +; CHECK-NEXT: ldr r0, [sp, #96] +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %cmp = icmp eq i16 %num_cols, 4 @@ -869,83 +871,85 @@ define i8* @signext_optsize(i8* %input_row, i8* %input_col, i16 zeroext %output_ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #20 -; CHECK-NEXT: sub sp, #20 +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: add.w r12, sp, #12 ; CHECK-NEXT: cmp r3, #4 -; CHECK-NEXT: strd r0, r1, [sp, #12] @ 8-byte Folded Spill +; CHECK-NEXT: stm.w r12, {r0, r1, r2} @ 12-byte Folded Spill ; CHECK-NEXT: bne .LBB6_8 ; CHECK-NEXT: @ %bb.1: @ %for.cond.preheader -; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: beq .LBB6_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph -; CHECK-NEXT: ldr r7, [sp, #84] -; CHECK-NEXT: mov.w r11, #0 -; CHECK-NEXT: ldr r3, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr r0, [sp, #68] -; CHECK-NEXT: add.w r1, r3, r7, lsl #1 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: adds r1, r3, r7 -; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: add.w r1, r7, r7, lsl #1 -; CHECK-NEXT: vdup.16 q0, r0 -; CHECK-NEXT: adds r0, r3, r1 +; CHECK-NEXT: ldr r2, [sp, #88] +; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r4, [sp, #72] +; CHECK-NEXT: add.w r0, r1, r2, lsl #1 +; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: adds r0, r1, r2 +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w r0, r2, r2, lsl #1 +; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-NEXT: adds r0, r7, #7 -; CHECK-NEXT: lsr.w r9, r0, #3 +; CHECK-NEXT: adds r0, r2, #7 +; CHECK-NEXT: lsrs r2, r0, #3 ; CHECK-NEXT: .LBB6_3: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB6_5 Depth 2 -; CHECK-NEXT: ldr r0, [sp, #88] -; CHECK-NEXT: subs.w lr, r9, r9 -; CHECK-NEXT: ldr.w r12, [r0, r11, lsl #2] +; CHECK-NEXT: ldr r0, [sp, 
#92] +; CHECK-NEXT: subs.w lr, r2, r2 +; CHECK-NEXT: ldr.w r12, [r0, r9, lsl #2] ; CHECK-NEXT: ble .LBB6_6 ; CHECK-NEXT: @ %bb.4: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: ldr r3, [sp, #84] +; CHECK-NEXT: ldr.w r11, [sp, #88] ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: dlstp.16 lr, r3 +; CHECK-NEXT: dlstp.16 lr, r11 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: mla r5, r11, r3, r0 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: ldrd r4, r7, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mla r3, r9, r11, r0 +; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldrd r7, r0, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: mov r8, r12 ; CHECK-NEXT: .LBB6_5: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB6_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrb.s16 q1, [r4], #8 -; CHECK-NEXT: vadd.i16 q2, q1, q0 +; CHECK-NEXT: vldrb.s16 q0, [r7], #8 +; CHECK-NEXT: vadd.i16 q1, q0, r4 +; CHECK-NEXT: vldrb.s16 q0, [r3], #8 +; CHECK-NEXT: vmlava.s16 r12, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r5], #8 -; CHECK-NEXT: vmlava.s16 r12, q1, q2 -; CHECK-NEXT: vldrb.s16 q2, [r0], #8 -; CHECK-NEXT: vadd.i16 q2, q2, q0 -; CHECK-NEXT: vmlava.s16 r6, q1, q2 -; CHECK-NEXT: vldrb.s16 q2, [r7], #8 -; CHECK-NEXT: vadd.i16 q2, q2, q0 -; CHECK-NEXT: vmlava.s16 r8, q1, q2 -; CHECK-NEXT: vldrb.s16 q2, [r1], #8 -; CHECK-NEXT: vadd.i16 q2, q2, q0 -; CHECK-NEXT: vmlava.s16 r10, q1, q2 +; CHECK-NEXT: vadd.i16 q1, q1, r4 +; CHECK-NEXT: vmlava.s16 r6, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r0], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r4 +; CHECK-NEXT: vmlava.s16 r10, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r1], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r4 +; CHECK-NEXT: vmlava.s16 r8, q0, q1 ; CHECK-NEXT: letp lr, .LBB6_5 ; CHECK-NEXT: b .LBB6_7 ; CHECK-NEXT: .LBB6_6: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: .LBB6_7: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #92] -; CHECK-NEXT: add.w r0, r8, r10 +; CHECK-NEXT: add.w r0, r10, r8 +; CHECK-NEXT: ldr r1, [sp, #96] ; CHECK-NEXT: add r0, r6 ; CHECK-NEXT: add r0, r12 -; CHECK-NEXT: strb.w r0, [r1, r11] -; CHECK-NEXT: add.w r11, r11, #1 -; CHECK-NEXT: cmp r11, r2 +; CHECK-NEXT: strb.w r0, [r1, r9] +; CHECK-NEXT: add.w r9, r9, #1 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: cmp r9, r0 ; CHECK-NEXT: bne .LBB6_3 ; CHECK-NEXT: .LBB6_8: @ %if.end -; CHECK-NEXT: ldr r0, [sp, #92] -; CHECK-NEXT: add sp, #20 +; CHECK-NEXT: ldr r0, [sp, #96] +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %cmp = icmp eq i16 %num_cols, 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll index ed7e84a899d24..2d890aaac331e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll @@ -9,32 +9,21 @@ define void @arm_min_helium_f32(float* %pSrc, i32 %blockSize, float* nocapture % ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: mov r12, r1 ; CHECK-NEXT: vidup.u32 q2, r6, #1 -; CHECK-NEXT: cmp r1, #4 -; CHECK-NEXT: it ge -; CHECK-NEXT: movge.w r12, #4 -; CHECK-NEXT: sub.w r6, r1, r12 -; CHECK-NEXT: adds r6, #3 -; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: 
adr r4, .LCPI0_0 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w lr, lr, r6, lsr #2 ; CHECK-NEXT: vldrw.u32 q1, [r4] ; CHECK-NEXT: vmov.i32 q3, #0x4 ; CHECK-NEXT: mov r12, r1 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r12 -; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: vpstttt -; CHECK-NEXT: vldrwt.u32 q4, [r0], #16 -; CHECK-NEXT: vcmpt.f32 ge, q1, q4 +; CHECK-NEXT: vldrw.u32 q4, [r0], #16 +; CHECK-NEXT: vptt.f32 ge, q1, q4 ; CHECK-NEXT: vmovt q1, q4 ; CHECK-NEXT: vmovt q0, q2 ; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: le lr, .LBB0_1 +; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: vldr s8, .LCPI0_1 ; CHECK-NEXT: vdup.32 q3, r1 diff --git a/llvm/test/CodeGen/Thumb2/mve-qrintr.ll b/llvm/test/CodeGen/Thumb2/mve-qrintr.ll new file mode 100644 index 0000000000000..31f3378fc23fc --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-qrintr.ll @@ -0,0 +1,693 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s + +define void @vadd(i32* %s1, i32 %c0, i32 %N) { +; CHECK-LABEL: vadd: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB0_1: @ %while.body.lr.ph +; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: .LBB0_2: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vadd.i32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %while.end +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp11 = icmp sgt i32 %N, 0 + br i1 %cmp11, label %while.body.lr.ph, label %while.end + +while.body.lr.ph: ; preds = %entry + %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] + %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) + %1 = bitcast i32* %s1.addr.013 to <4 x i32>* + %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) + %3 = tail call <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2) + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0) + %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 + %sub = add nsw i32 %N.addr.012, -4 + %cmp = icmp sgt i32 %N.addr.012, 4 + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @vsub(i32* %s1, i32 %c0, i32 %N) { +; CHECK-LABEL: vsub: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB1_1: @ %while.body.lr.ph +; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: .LBB1_2: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vsub.i32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], 
#16 +; CHECK-NEXT: letp lr, .LBB1_2 +; CHECK-NEXT: @ %bb.3: @ %while.end +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp11 = icmp sgt i32 %N, 0 + br i1 %cmp11, label %while.body.lr.ph, label %while.end + +while.body.lr.ph: ; preds = %entry + %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] + %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) + %1 = bitcast i32* %s1.addr.013 to <4 x i32>* + %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) + %3 = tail call <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2) + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0) + %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 + %sub = add nsw i32 %N.addr.012, -4 + %cmp = icmp sgt i32 %N.addr.012, 4 + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @vmul(i32* %s1, i32 %c0, i32 %N) { +; CHECK-LABEL: vmul: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB2_1: @ %while.body.lr.ph +; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: .LBB2_2: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmul.i32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB2_2 +; CHECK-NEXT: @ %bb.3: @ %while.end +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp11 = icmp sgt i32 %N, 0 + br i1 %cmp11, label %while.body.lr.ph, label %while.end + +while.body.lr.ph: ; preds = %entry + %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] + %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) + %1 = bitcast i32* %s1.addr.013 to <4 x i32>* + %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) + %3 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2) + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0) + %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 + %sub = add nsw i32 %N.addr.012, -4 + %cmp = icmp sgt i32 %N.addr.012, 4 + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @vqadd(i32* %s1, i32 %c0, i32 %N) { +; CHECK-LABEL: vqadd: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB3_1: @ %while.body.lr.ph +; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: .LBB3_2: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop 
Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vqadd.s32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB3_2 +; CHECK-NEXT: @ %bb.3: @ %while.end +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp11 = icmp sgt i32 %N, 0 + br i1 %cmp11, label %while.body.lr.ph, label %while.end + +while.body.lr.ph: ; preds = %entry + %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] + %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) + %1 = bitcast i32* %s1.addr.013 to <4 x i32>* + %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) + %3 = tail call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2) + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0) + %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 + %sub = add nsw i32 %N.addr.012, -4 + %cmp = icmp sgt i32 %N.addr.012, 4 + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @vqsub(i32* %s1, i32 %c0, i32 %N) { +; CHECK-LABEL: vqsub: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB4_1: @ %while.body.lr.ph +; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: .LBB4_2: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vqsub.s32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB4_2 +; CHECK-NEXT: @ %bb.3: @ %while.end +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp11 = icmp sgt i32 %N, 0 + br i1 %cmp11, label %while.body.lr.ph, label %while.end + +while.body.lr.ph: ; preds = %entry + %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] + %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) + %1 = bitcast i32* %s1.addr.013 to <4 x i32>* + %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) + %3 = tail call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2) + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0) + %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 + %sub = add nsw i32 %N.addr.012, -4 + %cmp = icmp sgt i32 %N.addr.012, 4 + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @vhadd(i32* %s1, i32 %c0, i32 %N) { +; CHECK-LABEL: vhadd: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r7, pc} +; 
CHECK-NEXT: .LBB5_1: @ %while.body.lr.ph +; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: .LBB5_2: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vhadd.s32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB5_2 +; CHECK-NEXT: @ %bb.3: @ %while.end +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp11 = icmp sgt i32 %N, 0 + br i1 %cmp11, label %while.body.lr.ph, label %while.end + +while.body.lr.ph: ; preds = %entry + %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] + %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) + %1 = bitcast i32* %s1.addr.013 to <4 x i32>* + %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) + %3 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2) + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0) + %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 + %sub = add nsw i32 %N.addr.012, -4 + %cmp = icmp sgt i32 %N.addr.012, 4 + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @vhsub(i32* %s1, i32 %c0, i32 %N) { +; CHECK-LABEL: vhsub: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB6_1: @ %while.body.lr.ph +; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: .LBB6_2: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vhsub.s32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB6_2 +; CHECK-NEXT: @ %bb.3: @ %while.end +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp11 = icmp sgt i32 %N, 0 + br i1 %cmp11, label %while.body.lr.ph, label %while.end + +while.body.lr.ph: ; preds = %entry + %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] + %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) + %1 = bitcast i32* %s1.addr.013 to <4 x i32>* + %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) + %3 = tail call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2) + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0) + %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 + %sub = add nsw i32 %N.addr.012, -4 + %cmp = icmp sgt i32 %N.addr.012, 4 + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @vqdmull(i32* %s1, i32 %c0, i32 %N) { +; CHECK-LABEL: vqdmull: +; CHECK: @ %bb.0: @ %entry 
+; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB7_1: @ %while.body.lr.ph +; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: .LBB7_2: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.s32 q0, [r0] +; CHECK-NEXT: vqdmullb.s16 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB7_2 +; CHECK-NEXT: @ %bb.3: @ %while.end +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp11 = icmp sgt i32 %N, 0 + br i1 %cmp11, label %while.body.lr.ph, label %while.end + +while.body.lr.ph: ; preds = %entry + %conv = trunc i32 %c0 to i16 + %.splatinsert = insertelement <8 x i16> undef, i16 %conv, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] + %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) + %1 = bitcast i32* %s1.addr.013 to <4 x i16>* + %2 = tail call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %1, i32 2, <4 x i1> %0, <4 x i16> zeroinitializer) + %3 = sext <4 x i16> %2 to <4 x i32> + %4 = bitcast <4 x i32> %3 to <8 x i16> + %5 = tail call <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16> %4, <8 x i16> %.splat, i32 0, <4 x i1> %0, <4 x i32> %3) + %6 = bitcast i32* %s1.addr.013 to <4 x i32>* + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %5, <4 x i32>* %6, i32 4, <4 x i1> %0) + %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 + %sub = add nsw i32 %N.addr.012, -4 + %cmp = icmp sgt i32 %N.addr.012, 4 + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @vqdmulh(i32* %s1, i32 %c0, i32 %N) { +; CHECK-LABEL: vqdmulh: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB8_1: @ %while.body.lr.ph +; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: .LBB8_2: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vqdmulh.s32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB8_2 +; CHECK-NEXT: @ %bb.3: @ %while.end +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp11 = icmp sgt i32 %N, 0 + br i1 %cmp11, label %while.body.lr.ph, label %while.end + +while.body.lr.ph: ; preds = %entry + %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] + %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) + %1 = bitcast i32* %s1.addr.013 to <4 x i32>* + %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) + %3 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2) + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0) + %add.ptr = getelementptr inbounds i32, 
i32* %s1.addr.013, i32 4 + %sub = add nsw i32 %N.addr.012, -4 + %cmp = icmp sgt i32 %N.addr.012, 4 + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @vqrdmulh(i32* %s1, i32 %c0, i32 %N) { +; CHECK-LABEL: vqrdmulh: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB9_1: @ %while.body.lr.ph +; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: .LBB9_2: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vqrdmulh.s32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB9_2 +; CHECK-NEXT: @ %bb.3: @ %while.end +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp11 = icmp sgt i32 %N, 0 + br i1 %cmp11, label %while.body.lr.ph, label %while.end + +while.body.lr.ph: ; preds = %entry + %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] + %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) + %1 = bitcast i32* %s1.addr.013 to <4 x i32>* + %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) + %3 = tail call <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2) + tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0) + %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 + %sub = add nsw i32 %N.addr.012, -4 + %cmp = icmp sgt i32 %N.addr.012, 4 + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @vaddf(float* %s1, float %c0, i32 %N) { +; CHECK-LABEL: vaddf: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB10_1: @ %while.body.lr.ph +; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: .LBB10_2: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vadd.f32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB10_2 +; CHECK-NEXT: @ %bb.3: @ %while.end +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp11 = icmp sgt i32 %N, 0 + br i1 %cmp11, label %while.body.lr.ph, label %while.end + +while.body.lr.ph: ; preds = %entry + %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0 + %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] + %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) + %1 = bitcast float* %s1.addr.013 to <4 x float>* + %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) + %3 = tail call fast <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> 
%.splat, <4 x i1> %0, <4 x float> %2) + tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0) + %add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4 + %sub = add nsw i32 %N.addr.012, -4 + %cmp = icmp sgt i32 %N.addr.012, 4 + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @vsubf(float* %s1, float %c0, i32 %N) { +; CHECK-LABEL: vsubf: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB11_1: @ %while.body.lr.ph +; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: .LBB11_2: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vsub.f32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB11_2 +; CHECK-NEXT: @ %bb.3: @ %while.end +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp11 = icmp sgt i32 %N, 0 + br i1 %cmp11, label %while.body.lr.ph, label %while.end + +while.body.lr.ph: ; preds = %entry + %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0 + %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] + %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) + %1 = bitcast float* %s1.addr.013 to <4 x float>* + %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) + %3 = tail call fast <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %.splat, <4 x i1> %0, <4 x float> %2) + tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0) + %add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4 + %sub = add nsw i32 %N.addr.012, -4 + %cmp = icmp sgt i32 %N.addr.012, 4 + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @vmulf(float* %s1, float %c0, i32 %N) { +; CHECK-LABEL: vmulf: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB12_1: @ %while.body.lr.ph +; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: .LBB12_2: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmul.f32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB12_2 +; CHECK-NEXT: @ %bb.3: @ %while.end +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp11 = icmp sgt i32 %N, 0 + br i1 %cmp11, label %while.body.lr.ph, label %while.end + +while.body.lr.ph: ; preds = %entry + %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0 + %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] + %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) + %1 = bitcast float* %s1.addr.013 to <4 x float>* + %2 = 
tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) + %3 = tail call fast <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %.splat, <4 x i1> %0, <4 x float> %2) + tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0) + %add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4 + %sub = add nsw i32 %N.addr.012, -4 + %cmp = icmp sgt i32 %N.addr.012, 4 + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @vfma(float* %s1, float* %s2, float %c0, i32 %N) { +; CHECK-LABEL: vfma: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB13_1: @ %while.body.lr.ph +; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: .LBB13_2: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vfma.f32 q1, q0, r2 +; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: letp lr, .LBB13_2 +; CHECK-NEXT: @ %bb.3: @ %while.end +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp12 = icmp sgt i32 %N, 0 + br i1 %cmp12, label %while.body.lr.ph, label %while.end + +while.body.lr.ph: ; preds = %entry + %0 = bitcast float* %s2 to <4 x float>* + %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0 + %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %s1.addr.014 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] + %1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013) + %2 = bitcast float* %s1.addr.014 to <4 x float>* + %3 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> zeroinitializer) + %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %1, <4 x float> zeroinitializer) + %5 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %4, <4 x float> %.splat, <4 x float> %3, <4 x i1> %1) + tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %2, i32 4, <4 x i1> %1) + %add.ptr = getelementptr inbounds float, float* %s1.addr.014, i32 4 + %sub = add nsw i32 %N.addr.013, -4 + %cmp = icmp sgt i32 %N.addr.013, 4 + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @vfmas(float* %s1, float* %s2, float %c0, i32 %N) { +; CHECK-LABEL: vfmas: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r7, pc} +; CHECK-NEXT: .LBB14_1: @ %while.body.lr.ph +; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: .LBB14_2: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vfmas.f32 q1, q0, r2 +; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: letp lr, .LBB14_2 +; CHECK-NEXT: @ %bb.3: @ %while.end +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp12 = icmp sgt i32 %N, 0 + br i1 %cmp12, label %while.body.lr.ph, label %while.end + +while.body.lr.ph: ; preds = %entry + %0 = bitcast float* %s2 to <4 x 
float>* + %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0 + %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %s1.addr.014 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] + %1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013) + %2 = bitcast float* %s1.addr.014 to <4 x float>* + %3 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> zeroinitializer) + %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %1, <4 x float> zeroinitializer) + %5 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %3, <4 x float> %4, <4 x float> %.splat, <4 x i1> %1) + tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %2, i32 4, <4 x i1> %1) + %add.ptr = getelementptr inbounds float, float* %s1.addr.014, i32 4 + %sub = add nsw i32 %N.addr.013, -4 + %cmp = icmp sgt i32 %N.addr.013, 4 + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +declare <4 x i1> @llvm.arm.mve.vctp32(i32) +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) +declare <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) +declare <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) +declare <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) +declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) +declare <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) +declare <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) +declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) +declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) +declare <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16>, <8 x i16>, i32, <4 x i1>, <4 x i32>) +declare <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) +declare <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) +declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) +declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) +declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) +declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) +declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) +declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll index 2ea70f1b06de2..36e620d50758e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ 
b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -2240,15 +2240,9 @@ define arm_aapcs_vfpcc void @ssatmul_4_q7(i8* nocapture readonly %pSrcA, i8* noc ; CHECK-NEXT: ldrsb r0, [r12], #1 ; CHECK-NEXT: ldrsb r1, [r6], #1 ; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: asrs r1, r0, #7 -; CHECK-NEXT: cmn.w r1, #128 -; CHECK-NEXT: mvn r1, #127 -; CHECK-NEXT: it gt -; CHECK-NEXT: asrgt r1, r0, #7 -; CHECK-NEXT: cmp r1, #127 -; CHECK-NEXT: it ge -; CHECK-NEXT: movge r1, #127 -; CHECK-NEXT: strb r1, [r4], #1 +; CHECK-NEXT: asrs r0, r0, #7 +; CHECK-NEXT: ssat r0, #8, r0 +; CHECK-NEXT: strb r0, [r4], #1 ; CHECK-NEXT: le lr, .LBB13_7 ; CHECK-NEXT: .LBB13_8: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, r5, r6, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll index a1f25e0f33342..77f0c77033f95 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll @@ -3,30 +3,11 @@ ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16,+fp64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOFP define arm_aapcs_vfpcc float @fadd_v2f32(<2 x float> %x, float %y) { -; CHECK-FP-LABEL: fadd_v2f32: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vadd.f32 s0, s0, s1 -; CHECK-FP-NEXT: vldr s2, .LCPI0_0 -; CHECK-FP-NEXT: vadd.f32 s0, s0, s2 -; CHECK-FP-NEXT: vadd.f32 s0, s4, s0 -; CHECK-FP-NEXT: bx lr -; CHECK-FP-NEXT: .p2align 2 -; CHECK-FP-NEXT: @ %bb.1: -; CHECK-FP-NEXT: .LCPI0_0: -; CHECK-FP-NEXT: .long 0x00000000 @ float 0 -; -; CHECK-NOFP-LABEL: fadd_v2f32: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NOFP-NEXT: vldr s2, .LCPI0_0 -; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s2 -; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s2 -; CHECK-NOFP-NEXT: vadd.f32 s0, s4, s0 -; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 2 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI0_0: -; CHECK-NOFP-NEXT: .long 0x00000000 @ float 0 +; CHECK-LABEL: fadd_v2f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vadd.f32 s0, s0, s1 +; CHECK-NEXT: vadd.f32 s0, s4, s0 +; CHECK-NEXT: bx lr entry: %z = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %y, <2 x float> %x) ret float %z @@ -80,34 +61,14 @@ entry: } define arm_aapcs_vfpcc void @fadd_v2f16(<2 x half> %x, half* %yy) { -; CHECK-FP-LABEL: fadd_v2f16: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s4, s0 -; CHECK-FP-NEXT: vadd.f16 s0, s0, s4 -; CHECK-FP-NEXT: vldr.16 s2, [r0] -; CHECK-FP-NEXT: vadd.f16 s0, s2, s0 -; CHECK-FP-NEXT: vstr.16 s0, [r0] -; CHECK-FP-NEXT: bx lr -; -; CHECK-NOFP-LABEL: fadd_v2f16: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI3_0 -; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vadd.f16 s0, s2, s0 -; CHECK-NOFP-NEXT: vstr.16 s0, [r0] -; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 1 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI3_0: -; CHECK-NOFP-NEXT: .short 0x0000 @ half 0 +; CHECK-LABEL: fadd_v2f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vadd.f16 s0, s0, s4 +; CHECK-NEXT: vldr.16 s2, [r0] +; CHECK-NEXT: vadd.f16 s0, s2, s0 +; CHECK-NEXT: 
vstr.16 s0, [r0] +; CHECK-NEXT: bx lr entry: %y = load half, half* %yy %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v2f16(half %y, <2 x half> %x) @@ -134,20 +95,11 @@ define arm_aapcs_vfpcc void @fadd_v4f16(<4 x half> %x, half* %yy) { ; CHECK-NOFP-NEXT: vadd.f16 s4, s0, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 ; CHECK-NOFP-NEXT: vadd.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI4_0 -; CHECK-NOFP-NEXT: vadd.f16 s0, s4, s0 -; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] +; CHECK-NOFP-NEXT: vadd.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: vadd.f16 s0, s2, s0 ; CHECK-NOFP-NEXT: vstr.16 s0, [r0] ; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 1 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI4_0: -; CHECK-NOFP-NEXT: .short 0x0000 @ half 0 entry: %y = load half, half* %yy %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half %y, <4 x half> %x) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll index 6936b7ea3ad1f..a83fa6882cb90 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll @@ -2,30 +2,11 @@ ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16,+fp64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOFP -; FIXME minnum nonan X, +Inf -> X ? define arm_aapcs_vfpcc float @fmin_v2f32(<2 x float> %x) { -; CHECK-FP-LABEL: fmin_v2f32: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vldr s4, .LCPI0_0 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 -; CHECK-FP-NEXT: bx lr -; CHECK-FP-NEXT: .p2align 2 -; CHECK-FP-NEXT: @ %bb.1: -; CHECK-FP-NEXT: .LCPI0_0: -; CHECK-FP-NEXT: .long 0x7f800000 @ float +Inf -; -; CHECK-NOFP-LABEL: fmin_v2f32: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vldr s4, .LCPI0_0 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 2 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI0_0: -; CHECK-NOFP-NEXT: .long 0x7f800000 @ float +Inf +; CHECK-LABEL: fmin_v2f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr entry: %z = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) ret float %z @@ -99,17 +80,8 @@ define arm_aapcs_vfpcc half @fmin_v4f16(<4 x half> %x) { ; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 ; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI3_0 ; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 1 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI3_0: -; CHECK-NOFP-NEXT: .short 0x7c00 @ half +Inf entry: %z = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x) ret half %z @@ -237,23 +209,11 @@ entry: ret double %z } -; FIXME should not be vminnm -; FIXME better reductions (no vmovs/vdups) define 
arm_aapcs_vfpcc float @fmin_v2f32_nofast(<2 x float> %x) { -; CHECK-FP-LABEL: fmin_v2f32_nofast: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 -; CHECK-FP-NEXT: bx lr -; -; CHECK-NOFP-LABEL: fmin_v2f32_nofast: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s1, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s1 -; CHECK-NOFP-NEXT: bx lr +; CHECK-LABEL: fmin_v2f32_nofast: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) ret float %z @@ -262,28 +222,16 @@ entry: define arm_aapcs_vfpcc float @fmin_v4f32_nofast(<4 x float> %x) { ; CHECK-FP-LABEL: fmin_v4f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s3, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f64 d2, d1 -; CHECK-NOFP-NEXT: vmov.f32 s5, s3 -; CHECK-NOFP-NEXT: vcmp.f32 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f32 s8, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f32 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s8 +; CHECK-NOFP-NEXT: vminnm.f32 s4, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s4, s4, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s3 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x) @@ -294,38 +242,20 @@ define arm_aapcs_vfpcc float @fmin_v8f32_nofast(<8 x float> %x) { ; CHECK-FP-LABEL: fmin_v8f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s7, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s5, s1 -; CHECK-NOFP-NEXT: vselgt.f32 s8, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s6, s2 -; CHECK-NOFP-NEXT: vselgt.f32 s10, s1, s5 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s8, s10 -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s12, s0 -; CHECK-NOFP-NEXT: vselgt.f32 s2, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s12 -; CHECK-NOFP-NEXT: vcmp.f32 s2, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; 
CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s10, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f32 s8, s1, s5 +; CHECK-NOFP-NEXT: vminnm.f32 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f32 s10, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %x) @@ -335,30 +265,20 @@ entry: define arm_aapcs_vfpcc half @fmin_v4f16_nofast(<4 x half> %x) { ; CHECK-FP-LABEL: fmin_v4f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r0, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r0 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vmovx.f16 s4, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vminnm.f16 s4, s1, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov r0, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 -; CHECK-NOFP-NEXT: vdup.32 q1, r0 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x) @@ -368,47 +288,26 @@ entry: define arm_aapcs_vfpcc half @fmin_v8f16_nofast(<8 x half> %x) { ; CHECK-FP-LABEL: fmin_v8f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r0, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r0 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s3 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmov.f64 d2, d1 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f32 s5, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s3, s1 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; 
CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x) @@ -419,73 +318,38 @@ define arm_aapcs_vfpcc half @fmin_v16f16_nofast(<16 x half> %x) { ; CHECK-FP-LABEL: fmin_v16f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r0, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r0 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v16f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s3 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmovx.f16 s14, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s1, s5 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s14 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s7, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s5, s1 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, 
fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s6, s2 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x) @@ -504,9 +368,7 @@ entry: define arm_aapcs_vfpcc double @fmin_v2f64_nofast(<2 x double> %x) { ; CHECK-LABEL: fmin_v2f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d1, d0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d1 +; CHECK-NEXT: vminnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %x) @@ -516,15 +378,9 @@ entry: define arm_aapcs_vfpcc double @fmin_v4f64_nofast(<4 x double> %x) { ; CHECK-LABEL: fmin_v4f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d3, d1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f64 d2, d0 -; CHECK-NEXT: vselgt.f64 d4, d1, d3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vcmp.f64 d4, d0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d4 +; CHECK-NEXT: vminnm.f64 d4, d1, d3 +; CHECK-NEXT: vminnm.f64 d0, d0, d2 +; CHECK-NEXT: vminnm.f64 d0, d0, d4 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %x) @@ -532,30 +388,11 @@ entry: } define arm_aapcs_vfpcc float @fmin_v2f32_acc(<2 x float> %x, float %y) { -; CHECK-FP-LABEL: fmin_v2f32_acc: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vldr s6, .LCPI18_0 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s6 -; CHECK-FP-NEXT: vminnm.f32 s0, s4, s0 -; CHECK-FP-NEXT: bx lr -; CHECK-FP-NEXT: .p2align 2 -; CHECK-FP-NEXT: @ %bb.1: -; CHECK-FP-NEXT: .LCPI18_0: -; CHECK-FP-NEXT: .long 0x7f800000 @ float +Inf -; -; CHECK-NOFP-LABEL: fmin_v2f32_acc: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vldr s6, .LCPI18_0 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s6 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s6 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s0 -; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 2 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI18_0: -; CHECK-NOFP-NEXT: .long 0x7f800000 @ float +Inf +; CHECK-LABEL: fmin_v2f32_acc: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NEXT: vminnm.f32 s0, s4, s0 +; CHECK-NEXT: bx lr entry: %z = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) %c = fcmp fast olt float %y, %z @@ -641,20 +478,11 @@ define arm_aapcs_vfpcc void @fmin_v4f16_acc(<4 x 
half> %x, half* %yy) { ; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 ; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI21_0 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] +; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: vminnm.f16 s0, s2, s0 ; CHECK-NOFP-NEXT: vstr.16 s0, [r0] ; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 1 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI21_0: -; CHECK-NOFP-NEXT: .short 0x7c00 @ half +Inf entry: %y = load half, half* %yy %z = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x) @@ -665,34 +493,14 @@ entry: } define arm_aapcs_vfpcc void @fmin_v2f16_acc(<2 x half> %x, half* %yy) { -; CHECK-FP-LABEL: fmin_v2f16_acc: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s4, s0 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 -; CHECK-FP-NEXT: vldr.16 s2, [r0] -; CHECK-FP-NEXT: vminnm.f16 s0, s2, s0 -; CHECK-FP-NEXT: vstr.16 s0, [r0] -; CHECK-FP-NEXT: bx lr -; -; CHECK-NOFP-LABEL: fmin_v2f16_acc: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI22_0 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vminnm.f16 s0, s2, s0 -; CHECK-NOFP-NEXT: vstr.16 s0, [r0] -; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 1 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI22_0: -; CHECK-NOFP-NEXT: .short 0x7c00 @ half +Inf +; CHECK-LABEL: fmin_v2f16_acc: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NEXT: vldr.16 s2, [r0] +; CHECK-NEXT: vminnm.f16 s0, s2, s0 +; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: bx lr entry: %y = load half, half* %yy %z = call fast half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half> %x) @@ -854,25 +662,13 @@ entry: } define arm_aapcs_vfpcc float @fmin_v2f32_acc_nofast(<2 x float> %x, float %y) { -; CHECK-FP-LABEL: fmin_v2f32_acc_nofast: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q2, r0 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q2 -; CHECK-FP-NEXT: vcmp.f32 s0, s4 -; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f32 s0, s4, s0 -; CHECK-FP-NEXT: bx lr -; -; CHECK-NOFP-LABEL: fmin_v2f32_acc_nofast: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s1, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s1 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s4 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 -; CHECK-NOFP-NEXT: bx lr +; CHECK-LABEL: fmin_v2f32_acc_nofast: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NEXT: vcmp.f32 s0, s4 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vselgt.f32 s0, s4, s0 +; CHECK-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) %c = fcmp olt float %y, %z @@ -883,12 +679,9 @@ entry: define arm_aapcs_vfpcc float @fmin_v4f32_acc_nofast(<4 x float> %x, float %y) { ; CHECK-FP-LABEL: fmin_v4f32_acc_nofast: ; 
CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d4, d1 -; CHECK-FP-NEXT: vmov.f32 s9, s3 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q2 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q2, r0 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q2 +; CHECK-FP-NEXT: vminnm.f32 s6, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s6 ; CHECK-FP-NEXT: vcmp.f32 s0, s4 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s4, s0 @@ -896,17 +689,9 @@ define arm_aapcs_vfpcc float @fmin_v4f32_acc_nofast(<4 x float> %x, float %y) { ; ; CHECK-NOFP-LABEL: fmin_v4f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s3, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f64 d4, d1 -; CHECK-NOFP-NEXT: vmov.f32 s9, s3 -; CHECK-NOFP-NEXT: vcmp.f32 s8, s0 -; CHECK-NOFP-NEXT: vselgt.f32 s6, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s8 -; CHECK-NOFP-NEXT: vcmp.f32 s6, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s6, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s6, s6, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s6, s3 ; CHECK-NOFP-NEXT: vcmp.f32 s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 @@ -922,12 +707,9 @@ define arm_aapcs_vfpcc float @fmin_v8f32_acc_nofast(<8 x float> %x, float %y) { ; CHECK-FP-LABEL: fmin_v8f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 +; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: vcmp.f32 s0, s8 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s8, s0 @@ -935,27 +717,13 @@ define arm_aapcs_vfpcc float @fmin_v8f32_acc_nofast(<8 x float> %x, float %y) { ; ; CHECK-NOFP-LABEL: fmin_v8f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s7, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s5, s1 -; CHECK-NOFP-NEXT: vselgt.f32 s10, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s6, s2 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s1, s5 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f32 s14, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s10, s12 -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s14, s0 -; CHECK-NOFP-NEXT: vselgt.f32 s2, s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s14 -; CHECK-NOFP-NEXT: vcmp.f32 s2, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s12, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f32 s10, s1, s5 +; CHECK-NOFP-NEXT: vminnm.f32 s10, s12, s10 +; CHECK-NOFP-NEXT: vminnm.f32 s12, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s10, s10, s12 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s10, s0 ; CHECK-NOFP-NEXT: vcmp.f32 s0, s8 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s8, s0 @@ -970,35 +738,26 @@ entry: define arm_aapcs_vfpcc void @fmin_v4f16_acc_nofast(<4 x half> %x, half* %yy) { ; CHECK-FP-LABEL: 
fmin_v4f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vldr.16 s4, [r0] -; CHECK-FP-NEXT: vcmp.f16 s0, s4 +; CHECK-FP-NEXT: vmovx.f16 s4, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f16 s4, s1, s4 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vcmp.f16 s0, s2 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 +; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov r1, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 -; CHECK-NOFP-NEXT: vdup.32 q1, r1 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s0, s2 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0 @@ -1016,52 +775,32 @@ entry: define arm_aapcs_vfpcc void @fmin_v8f16_acc_nofast(<8 x half> %x, half* %yy) { ; CHECK-FP-LABEL: fmin_v8f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vldr.16 s4, [r0] -; CHECK-FP-NEXT: vcmp.f16 s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vcmp.f16 s0, s2 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 +; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s3 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmov.f64 d2, d1 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f32 s5, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s3, s1 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; 
CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s2 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vcmp.f16 s10, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s0, s2 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0 @@ -1080,78 +819,44 @@ define arm_aapcs_vfpcc void @fmin_v16f16_acc_nofast(<16 x half> %x, half* %yy) { ; CHECK-FP-LABEL: fmin_v16f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vldr.16 s4, [r0] -; CHECK-FP-NEXT: vcmp.f16 s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vcmp.f16 s0, s2 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 +; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v16f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s3 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmovx.f16 s14, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s1, s5 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s14 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; 
CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s7, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s5, s1 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s6, s2 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vcmp.f16 s12, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s0, s2 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0 @@ -1183,9 +888,7 @@ entry: define arm_aapcs_vfpcc double @fmin_v2f64_acc_nofast(<2 x double> %x, double %y) { ; CHECK-LABEL: fmin_v2f64_acc_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d1, d0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d1 +; CHECK-NEXT: vminnm.f64 d0, d0, d1 ; CHECK-NEXT: vcmp.f64 d0, d2 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d2, d0 @@ -1200,15 +903,9 @@ entry: define arm_aapcs_vfpcc double @fmin_v4f64_acc_nofast(<4 x double> %x, double %y) { ; CHECK-LABEL: fmin_v4f64_acc_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d3, d1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f64 d2, d0 -; CHECK-NEXT: vselgt.f64 d5, d1, d3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vcmp.f64 d5, d0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d5 +; CHECK-NEXT: vminnm.f64 d5, d1, d3 +; CHECK-NEXT: vminnm.f64 d0, d0, d2 +; CHECK-NEXT: vminnm.f64 d0, d0, d5 ; CHECK-NEXT: vcmp.f64 d0, d4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d4, d0 @@ -1221,28 +918,10 @@ entry: } define arm_aapcs_vfpcc float @fmax_v2f32(<2 x float> %x) { -; CHECK-FP-LABEL: fmax_v2f32: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vldr s4, .LCPI37_0 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 -; CHECK-FP-NEXT: bx lr -; CHECK-FP-NEXT: .p2align 2 -; CHECK-FP-NEXT: @ %bb.1: -; CHECK-FP-NEXT: .LCPI37_0: -; CHECK-FP-NEXT: .long 0xff800000 @ float -Inf -; -; CHECK-NOFP-LABEL: fmax_v2f32: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vldr s4, .LCPI37_0 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 2 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI37_0: -; CHECK-NOFP-NEXT: .long 0xff800000 @ float -Inf +; CHECK-LABEL: fmax_v2f32: +; CHECK: @ %bb.0: @ %entry +; 
CHECK-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr entry: %z = call fast float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) ret float %z @@ -1315,17 +994,8 @@ define arm_aapcs_vfpcc half @fmax_v4f16(<4 x half> %x) { ; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 ; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI40_0 ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 1 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI40_0: -; CHECK-NOFP-NEXT: .short 0xfc00 @ half -Inf entry: %z = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x) ret half %z @@ -1454,20 +1124,10 @@ entry: } define arm_aapcs_vfpcc float @fmax_v2f32_nofast(<2 x float> %x) { -; CHECK-FP-LABEL: fmax_v2f32_nofast: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 -; CHECK-FP-NEXT: bx lr -; -; CHECK-NOFP-LABEL: fmax_v2f32_nofast: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s0, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s1 -; CHECK-NOFP-NEXT: bx lr +; CHECK-LABEL: fmax_v2f32_nofast: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) ret float %z @@ -1476,28 +1136,16 @@ entry: define arm_aapcs_vfpcc float @fmax_v4f32_nofast(<4 x float> %x) { ; CHECK-FP-LABEL: fmax_v4f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f64 d2, d1 -; CHECK-NOFP-NEXT: vmov.f32 s5, s3 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f32 s8, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s8 +; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s4, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s3 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x) @@ -1508,38 +1156,20 @@ define arm_aapcs_vfpcc float @fmax_v8f32_nofast(<8 x float> %x) { ; CHECK-FP-LABEL: fmax_v8f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 ; 
CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v8f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s1, s5 -; CHECK-NOFP-NEXT: vselgt.f32 s8, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f32 s10, s1, s5 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s10, s8 -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s0, s12 -; CHECK-NOFP-NEXT: vselgt.f32 s2, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s12 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s2 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f32 s8, s1, s5 +; CHECK-NOFP-NEXT: vmaxnm.f32 s8, s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %x) @@ -1549,30 +1179,20 @@ entry: define arm_aapcs_vfpcc half @fmax_v4f16_nofast(<4 x half> %x) { ; CHECK-FP-LABEL: fmax_v4f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r0, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vmovx.f16 s4, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s1, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov r0, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 -; CHECK-NOFP-NEXT: vdup.32 q1, r0 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x) @@ -1582,47 +1202,26 @@ entry: define arm_aapcs_vfpcc half @fmax_v8f16_nofast(<8 x half> %x) { ; CHECK-FP-LABEL: fmax_v8f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r0, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r0 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f16 
s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v8f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s3 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmov.f64 d2, d1 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f32 s5, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s1, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x) @@ -1633,73 +1232,38 @@ define arm_aapcs_vfpcc half @fmax_v16f16_nofast(<16 x half> %x) { ; CHECK-FP-LABEL: fmax_v16f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r0, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r0 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v16f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s3 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmovx.f16 s14, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s1, s5 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s8, 
s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s14, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s3, s7 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s1, s5 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x) @@ -1718,9 +1282,7 @@ entry: define arm_aapcs_vfpcc double @fmax_v2f64_nofast(<2 x double> %x) { ; CHECK-LABEL: fmax_v2f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d0, d1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d1 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %x) @@ -1730,15 +1292,9 @@ entry: define arm_aapcs_vfpcc double @fmax_v4f64_nofast(<4 x double> %x) { ; CHECK-LABEL: fmax_v4f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d1, d3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f64 d0, d2 -; CHECK-NEXT: vselgt.f64 d4, d1, d3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vcmp.f64 d0, d4 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d4 +; CHECK-NEXT: vmaxnm.f64 d4, d1, d3 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d2 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d4 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %x) @@ -1746,30 +1302,11 @@ entry: } define arm_aapcs_vfpcc float @fmax_v2f32_acc(<2 x float> %x, float %y) { -; CHECK-FP-LABEL: fmax_v2f32_acc: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vldr s6, .LCPI55_0 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s6 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s4, s0 -; CHECK-FP-NEXT: bx lr -; CHECK-FP-NEXT: 
.p2align 2 -; CHECK-FP-NEXT: @ %bb.1: -; CHECK-FP-NEXT: .LCPI55_0: -; CHECK-FP-NEXT: .long 0xff800000 @ float -Inf -; -; CHECK-NOFP-LABEL: fmax_v2f32_acc: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vldr s6, .LCPI55_0 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s6 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s6 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s0 -; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 2 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI55_0: -; CHECK-NOFP-NEXT: .long 0xff800000 @ float -Inf +; CHECK-LABEL: fmax_v2f32_acc: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NEXT: vmaxnm.f32 s0, s4, s0 +; CHECK-NEXT: bx lr entry: %z = call fast float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) %c = fcmp fast ogt float %y, %z @@ -1837,34 +1374,14 @@ entry: } define arm_aapcs_vfpcc void @fmax_v2f16_acc(<2 x half> %x, half* %yy) { -; CHECK-FP-LABEL: fmax_v2f16_acc: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s4, s0 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 -; CHECK-FP-NEXT: vldr.16 s2, [r0] -; CHECK-FP-NEXT: vmaxnm.f16 s0, s2, s0 -; CHECK-FP-NEXT: vstr.16 s0, [r0] -; CHECK-FP-NEXT: bx lr -; -; CHECK-NOFP-LABEL: fmax_v2f16_acc: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI58_0 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s2, s0 -; CHECK-NOFP-NEXT: vstr.16 s0, [r0] -; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 1 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI58_0: -; CHECK-NOFP-NEXT: .short 0xfc00 @ half -Inf +; CHECK-LABEL: fmax_v2f16_acc: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NEXT: vldr.16 s2, [r0] +; CHECK-NEXT: vmaxnm.f16 s0, s2, s0 +; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: bx lr entry: %y = load half, half* %yy %z = call fast half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half> %x) @@ -1893,20 +1410,11 @@ define arm_aapcs_vfpcc void @fmax_v4f16_acc(<4 x half> %x, half* %yy) { ; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 ; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI59_0 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s2, s0 ; CHECK-NOFP-NEXT: vstr.16 s0, [r0] ; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 1 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI59_0: -; CHECK-NOFP-NEXT: .short 0xfc00 @ half -Inf entry: %y = load half, half* %yy %z = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x) @@ -2068,25 +1576,13 @@ entry: } define arm_aapcs_vfpcc float @fmax_v2f32_acc_nofast(<2 x float> %x, float %y) { -; CHECK-FP-LABEL: fmax_v2f32_acc_nofast: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q2, r0 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q2 -; CHECK-FP-NEXT: vcmp.f32 s4, s0 -; CHECK-FP-NEXT: vmrs 
APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f32 s0, s4, s0 -; CHECK-FP-NEXT: bx lr -; -; CHECK-NOFP-LABEL: fmax_v2f32_acc_nofast: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s0, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s1 -; CHECK-NOFP-NEXT: vcmp.f32 s4, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 -; CHECK-NOFP-NEXT: bx lr +; CHECK-LABEL: fmax_v2f32_acc_nofast: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NEXT: vcmp.f32 s4, s0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vselgt.f32 s0, s4, s0 +; CHECK-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) %c = fcmp ogt float %y, %z @@ -2097,12 +1593,9 @@ entry: define arm_aapcs_vfpcc float @fmax_v4f32_acc_nofast(<4 x float> %x, float %y) { ; CHECK-FP-LABEL: fmax_v4f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d4, d1 -; CHECK-FP-NEXT: vmov.f32 s9, s3 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q2 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q2, r0 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q2 +; CHECK-FP-NEXT: vmaxnm.f32 s6, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s6 ; CHECK-FP-NEXT: vcmp.f32 s4, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s4, s0 @@ -2110,17 +1603,9 @@ define arm_aapcs_vfpcc float @fmax_v4f32_acc_nofast(<4 x float> %x, float %y) { ; ; CHECK-NOFP-LABEL: fmax_v4f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f64 d4, d1 -; CHECK-NOFP-NEXT: vmov.f32 s9, s3 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s8 -; CHECK-NOFP-NEXT: vselgt.f32 s6, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s8 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s6, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s6, s3 ; CHECK-NOFP-NEXT: vcmp.f32 s4, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 @@ -2136,12 +1621,9 @@ define arm_aapcs_vfpcc float @fmax_v8f32_acc_nofast(<8 x float> %x, float %y) { ; CHECK-FP-LABEL: fmax_v8f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 +; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: vcmp.f32 s8, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s8, s0 @@ -2149,27 +1631,13 @@ define arm_aapcs_vfpcc float @fmax_v8f32_acc_nofast(<8 x float> %x, float %y) { ; ; CHECK-NOFP-LABEL: fmax_v8f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s1, s5 -; CHECK-NOFP-NEXT: vselgt.f32 s10, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s1, s5 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f32 s14, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 
s12, s10 -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s0, s14 -; CHECK-NOFP-NEXT: vselgt.f32 s2, s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s14 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s2 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s12, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s1, s5 +; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s12, s10 +; CHECK-NOFP-NEXT: vmaxnm.f32 s12, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s10, s12 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s10, s0 ; CHECK-NOFP-NEXT: vcmp.f32 s8, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s8, s0 @@ -2184,35 +1652,26 @@ entry: define arm_aapcs_vfpcc void @fmax_v4f16_acc_nofast(<4 x half> %x, half* %yy) { ; CHECK-FP-LABEL: fmax_v4f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vldr.16 s4, [r0] -; CHECK-FP-NEXT: vcmp.f16 s4, s0 +; CHECK-FP-NEXT: vmovx.f16 s4, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s1, s4 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vcmp.f16 s2, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 +; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov r1, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 -; CHECK-NOFP-NEXT: vdup.32 q1, r1 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s2, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0 @@ -2230,52 +1689,32 @@ entry: define arm_aapcs_vfpcc void @fmax_v8f16_acc_nofast(<8 x half> %x, half* %yy) { ; CHECK-FP-LABEL: fmax_v8f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vldr.16 s4, [r0] -; CHECK-FP-NEXT: vcmp.f16 s4, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vcmp.f16 s2, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 +; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0 ; 
CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v8f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s3 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmov.f64 d2, d1 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f32 s5, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s1, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s2 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vcmp.f16 s0, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s2, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0 @@ -2294,78 +1733,44 @@ define arm_aapcs_vfpcc void @fmax_v16f16_acc_nofast(<16 x half> %x, half* %yy) { ; CHECK-FP-LABEL: fmax_v16f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vldr.16 s4, [r0] -; CHECK-FP-NEXT: vcmp.f16 s4, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vcmp.f16 s2, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 +; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v16f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s3 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmovx.f16 s14, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s1, s5 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs 
APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s14, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s3, s7 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s1, s5 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vcmp.f16 s0, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s2, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0 @@ -2397,9 +1802,7 @@ entry: define arm_aapcs_vfpcc double @fmax_v2f64_acc_nofast(<2 x double> %x, double %y) { ; CHECK-LABEL: fmax_v2f64_acc_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d0, d1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d1 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d1 ; CHECK-NEXT: vcmp.f64 d2, d0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d2, d0 @@ -2414,15 +1817,9 @@ entry: define arm_aapcs_vfpcc double @fmax_v4f64_acc_nofast(<4 x double> %x, double %y) { ; CHECK-LABEL: fmax_v4f64_acc_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d1, d3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f64 d0, d2 -; CHECK-NEXT: vselgt.f64 d5, d1, d3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vcmp.f64 d0, d5 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d5 +; CHECK-NEXT: vmaxnm.f64 d5, d1, d3 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d2 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d5 ; CHECK-NEXT: vcmp.f64 d4, d0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; 
CHECK-NEXT: vselgt.f64 d0, d4, d0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll index 64a76f38920a7..382c32dbe2bf5 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll @@ -1512,13 +1512,10 @@ define float @fmin_f32(float* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: le lr, .LBB15_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-NEXT: cmp r2, r1 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vdup.32 q1, r3 -; CHECK-NEXT: vminnm.f32 q0, q0, q1 ; CHECK-NEXT: beq .LBB15_9 ; CHECK-NEXT: .LBB15_7: @ %for.body.preheader1 ; CHECK-NEXT: sub.w lr, r1, r2 @@ -1526,10 +1523,10 @@ define float @fmin_f32(float* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB15_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldmia r0!, {s4} -; CHECK-NEXT: vcmp.f32 s0, s4 +; CHECK-NEXT: vldmia r0!, {s2} +; CHECK-NEXT: vcmp.f32 s0, s2 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselge.f32 s0, s4, s0 +; CHECK-NEXT: vselge.f32 s0, s2, s0 ; CHECK-NEXT: le lr, .LBB15_8 ; CHECK-NEXT: .LBB15_9: @ %for.cond.cleanup ; CHECK-NEXT: vmov r0, s0 @@ -1620,13 +1617,10 @@ define float @fmax_f32(float* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: le lr, .LBB16_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 ; CHECK-NEXT: cmp r2, r1 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vdup.32 q1, r3 -; CHECK-NEXT: vmaxnm.f32 q0, q0, q1 ; CHECK-NEXT: beq .LBB16_9 ; CHECK-NEXT: .LBB16_7: @ %for.body.preheader1 ; CHECK-NEXT: sub.w lr, r1, r2 @@ -1634,10 +1628,10 @@ define float @fmax_f32(float* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB16_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldmia r0!, {s4} -; CHECK-NEXT: vcmp.f32 s4, s0 +; CHECK-NEXT: vldmia r0!, {s2} +; CHECK-NEXT: vcmp.f32 s2, s0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselge.f32 s0, s4, s0 +; CHECK-NEXT: vselge.f32 s0, s2, s0 ; CHECK-NEXT: le lr, .LBB16_8 ; CHECK-NEXT: .LBB16_9: @ %for.cond.cleanup ; CHECK-NEXT: vmov r0, s0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll index 93e3b16590b32..b83b51b6f564f 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll @@ -170,6 +170,47 @@ entry: ret i64 %z } +define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_zext(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: add_v8i16_v8i32_v8i64_zext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlalv.u16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %xx = zext <8 x i16> %x to <8 x i32> + %yy = zext <8 x i16> %y to <8 x i32> + %m = mul <8 x i32> %xx, %yy + %ma = zext <8 x i32> %m to <8 x i64> + %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma) + ret i64 %z +} + +define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sext(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: add_v8i16_v8i32_v8i64_sext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlalv.s16 r0, 
r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %xx = sext <8 x i16> %x to <8 x i32> + %yy = sext <8 x i16> %y to <8 x i32> + %m = mul <8 x i32> %xx, %yy + %ma = sext <8 x i32> %m to <8 x i64> + %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma) + ret i64 %z +} + +define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sextzext(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: add_v8i16_v8i32_v8i64_sextzext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlalv.s16 r0, r1, q0, q0 +; CHECK-NEXT: bx lr +entry: + %xx = sext <8 x i16> %x to <8 x i32> + %m = mul <8 x i32> %xx, %xx + %ma = zext <8 x i32> %m to <8 x i64> + %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma) + ret i64 %z +} + define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) { ; CHECK-LABEL: add_v2i16_v2i64_zext: ; CHECK: @ %bb.0: @ %entry @@ -239,6 +280,47 @@ entry: ret i32 %z } +define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_zext(<16 x i8> %x, <16 x i8> %y) { +; CHECK-LABEL: add_v16i8_v16i16_v16i32_zext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlav.u8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %xx = zext <16 x i8> %x to <16 x i16> + %yy = zext <16 x i8> %y to <16 x i16> + %m = mul <16 x i16> %xx, %yy + %ma = zext <16 x i16> %m to <16 x i32> + %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma) + ret i32 %z +} + +define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sext(<16 x i8> %x, <16 x i8> %y) { +; CHECK-LABEL: add_v16i8_v16i16_v16i32_sext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlav.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %xx = sext <16 x i8> %x to <16 x i16> + %yy = sext <16 x i8> %y to <16 x i16> + %m = mul <16 x i16> %xx, %yy + %ma = sext <16 x i16> %m to <16 x i32> + %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma) + ret i32 %z +} + +define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sextzext(<16 x i8> %x, <16 x i8> %y) { +; CHECK-LABEL: add_v16i8_v16i16_v16i32_sextzext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlav.s8 r0, q0, q0 +; CHECK-NEXT: bx lr +entry: + %xx = sext <16 x i8> %x to <16 x i16> + %m = mul <16 x i16> %xx, %xx + %ma = zext <16 x i16> %m to <16 x i32> + %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma) + ret i32 %z +} + define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) { ; CHECK-LABEL: add_v4i8_v4i32_zext: ; CHECK: @ %bb.0: @ %entry @@ -990,6 +1072,50 @@ entry: ret i64 %r } +define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, i64 %a) { +; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_zext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlalva.u16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %xx = zext <8 x i16> %x to <8 x i32> + %yy = zext <8 x i16> %y to <8 x i32> + %m = mul <8 x i32> %xx, %yy + %ma = zext <8 x i32> %m to <8 x i64> + %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma) + %r = add i64 %z, %a + ret i64 %r +} + +define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, i64 %a) { +; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlalva.s16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %xx = sext <8 x i16> %x to <8 x i32> + %yy = sext <8 x i16> %y to <8 x i32> + %m = mul <8 x i32> %xx, %yy + %ma = sext <8 x i32> %m to <8 x i64> + %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma) + %r = add i64 %z, %a + ret i64 %r +} + +define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sextzext(<8 x i16> %x, 
<8 x i16> %y, i64 %a) { +; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sextzext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlalva.s16 r0, r1, q0, q0 +; CHECK-NEXT: bx lr +entry: + %xx = sext <8 x i16> %x to <8 x i32> + %m = mul <8 x i32> %xx, %xx + %ma = zext <8 x i32> %m to <8 x i64> + %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma) + %r = add i64 %z, %a + ret i64 %r +} + define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, i64 %a) { ; CHECK-LABEL: add_v2i16_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry @@ -1071,6 +1197,50 @@ entry: ret i32 %r } +define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, i32 %a) { +; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_zext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlava.u8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %xx = zext <16 x i8> %x to <16 x i16> + %yy = zext <16 x i8> %y to <16 x i16> + %m = mul <16 x i16> %xx, %yy + %ma = zext <16 x i16> %m to <16 x i32> + %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma) + %r = add i32 %z, %a + ret i32 %r +} + +define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, i32 %a) { +; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlava.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %xx = sext <16 x i8> %x to <16 x i16> + %yy = sext <16 x i8> %y to <16 x i16> + %m = mul <16 x i16> %xx, %yy + %ma = sext <16 x i16> %m to <16 x i32> + %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma) + %r = add i32 %z, %a + ret i32 %r +} + +define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sextzext(<16 x i8> %x, i32 %a) { +; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sextzext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlava.s8 r0, q0, q0 +; CHECK-NEXT: bx lr +entry: + %xx = sext <16 x i8> %x to <16 x i16> + %m = mul <16 x i16> %xx, %xx + %ma = zext <16 x i16> %m to <16 x i32> + %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma) + %r = add i32 %z, %a + ret i32 %r +} + define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, <4 x i8> %y, i32 %a) { ; CHECK-LABEL: add_v4i8_v4i32_acc_zext: ; CHECK: @ %bb.0: @ %entry diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll index f30856d32b113..02d124890c6bb 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll @@ -236,6 +236,56 @@ entry: ret i64 %z } +define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) { +; CHECK-LABEL: add_v8i16_v8i32_v8i64_zext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpt.i16 eq, q2, zr +; CHECK-NEXT: vmlalvt.u16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <8 x i16> %b, zeroinitializer + %xx = zext <8 x i16> %x to <8 x i32> + %yy = zext <8 x i16> %y to <8 x i32> + %m = mul <8 x i32> %xx, %yy + %ma = zext <8 x i32> %m to <8 x i64> + %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer + %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + ret i64 %z +} + +define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) { +; CHECK-LABEL: add_v8i16_v8i32_v8i64_sext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpt.i16 eq, q2, zr +; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <8 x i16> %b, zeroinitializer + %xx = sext <8 x i16> %x to <8 x i32> + %yy = sext 
<8 x i16> %y to <8 x i32> + %m = mul <8 x i32> %xx, %yy + %ma = sext <8 x i32> %m to <8 x i64> + %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer + %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + ret i64 %z +} + +define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sextzext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) { +; CHECK-LABEL: add_v8i16_v8i32_v8i64_sextzext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpt.i16 eq, q2, zr +; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q0 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <8 x i16> %b, zeroinitializer + %xx = sext <8 x i16> %x to <8 x i32> + %m = mul <8 x i32> %xx, %xx + %ma = zext <8 x i32> %m to <8 x i64> + %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer + %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + ret i64 %z +} + define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) { ; CHECK-LABEL: add_v2i16_v2i64_zext: ; CHECK: @ %bb.0: @ %entry @@ -371,6 +421,56 @@ entry: ret i32 %z } +define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) { +; CHECK-LABEL: add_v16i8_v16i16_v16i32_zext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpt.i8 eq, q2, zr +; CHECK-NEXT: vmlavt.u8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <16 x i8> %b, zeroinitializer + %xx = zext <16 x i8> %x to <16 x i16> + %yy = zext <16 x i8> %y to <16 x i16> + %m = mul <16 x i16> %xx, %yy + %ma = zext <16 x i16> %m to <16 x i32> + %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer + %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + ret i32 %z +} + +define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) { +; CHECK-LABEL: add_v16i8_v16i16_v16i32_sext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpt.i8 eq, q2, zr +; CHECK-NEXT: vmlavt.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <16 x i8> %b, zeroinitializer + %xx = sext <16 x i8> %x to <16 x i16> + %yy = sext <16 x i8> %y to <16 x i16> + %m = mul <16 x i16> %xx, %yy + %ma = sext <16 x i16> %m to <16 x i32> + %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer + %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + ret i32 %z +} + +define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sextzext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) { +; CHECK-LABEL: add_v16i8_v16i16_v16i32_sextzext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpt.i8 eq, q2, zr +; CHECK-NEXT: vmlavt.s8 r0, q0, q0 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <16 x i8> %b, zeroinitializer + %xx = sext <16 x i8> %x to <16 x i16> + %m = mul <16 x i16> %xx, %xx + %ma = zext <16 x i16> %m to <16 x i32> + %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer + %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + ret i32 %z +} + define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) { ; CHECK-LABEL: add_v4i8_v4i32_zext: ; CHECK: @ %bb.0: @ %entry @@ -1668,6 +1768,59 @@ entry: ret i64 %r } +define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) { +; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_zext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpt.i16 eq, q2, zr +; CHECK-NEXT: vmlalvat.u16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <8 x i16> %b, zeroinitializer + %xx = zext <8 x i16> %x to <8 x i32> + %yy = zext <8 x i16> %y to <8 x i32> + %m = mul <8 x i32> 
%xx, %yy + %ma = zext <8 x i32> %m to <8 x i64> + %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer + %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %r = add i64 %z, %a + ret i64 %r +} + +define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) { +; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpt.i16 eq, q2, zr +; CHECK-NEXT: vmlalvat.s16 r0, r1, q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <8 x i16> %b, zeroinitializer + %xx = sext <8 x i16> %x to <8 x i32> + %yy = sext <8 x i16> %y to <8 x i32> + %m = mul <8 x i32> %xx, %yy + %ma = sext <8 x i32> %m to <8 x i64> + %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer + %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %r = add i64 %z, %a + ret i64 %r +} + +define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sextzext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) { +; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sextzext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpt.i16 eq, q2, zr +; CHECK-NEXT: vmlalvat.s16 r0, r1, q0, q0 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <8 x i16> %b, zeroinitializer + %xx = sext <8 x i16> %x to <8 x i32> + %m = mul <8 x i32> %xx, %xx + %ma = zext <8 x i32> %m to <8 x i64> + %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer + %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %r = add i64 %z, %a + ret i64 %r +} + define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b, i64 %a) { ; CHECK-LABEL: add_v2i16_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry @@ -1815,6 +1968,59 @@ entry: ret i32 %r } +define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) { +; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_zext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpt.i8 eq, q2, zr +; CHECK-NEXT: vmlavat.u8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <16 x i8> %b, zeroinitializer + %xx = zext <16 x i8> %x to <16 x i16> + %yy = zext <16 x i8> %y to <16 x i16> + %m = mul <16 x i16> %xx, %yy + %ma = zext <16 x i16> %m to <16 x i32> + %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer + %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %r = add i32 %z, %a + ret i32 %r +} + +define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) { +; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpt.i8 eq, q2, zr +; CHECK-NEXT: vmlavat.s8 r0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <16 x i8> %b, zeroinitializer + %xx = sext <16 x i8> %x to <16 x i16> + %yy = sext <16 x i8> %y to <16 x i16> + %m = mul <16 x i16> %xx, %yy + %ma = sext <16 x i16> %m to <16 x i32> + %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer + %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %r = add i32 %z, %a + ret i32 %r +} + +define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sextzext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) { +; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sextzext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpt.i8 eq, q2, zr +; CHECK-NEXT: vmlavat.s8 r0, q0, q0 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <16 x i8> %b, zeroinitializer + %xx = sext <16 x i8> %x to <16 x i16> + %m = mul <16 x i16> %xx, %xx + %ma = zext <16 x i16> %m to <16 x i32> + 
%s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
+ %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s)
+ %r = add i32 %z, %a
+ ret i32 %r
+}
+
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b, i32 %a) {
; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
index 9b68f7d4c0744..b815ed24ae263 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
@@ -303,25 +303,24 @@ define void @vst2_v4i64(<4 x i64> *%src, <8 x i64> *%dst) {
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
; CHECK-NEXT: vmov.f64 d6, d1
-; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: vmov.f64 d10, d3
; CHECK-NEXT: vmov.f32 s13, s3
; CHECK-NEXT: vmov.f32 s21, s7
+; CHECK-NEXT: vmov.f32 s2, s16
; CHECK-NEXT: vmov.f32 s6, s8
; CHECK-NEXT: vmov.f32 s14, s18
; CHECK-NEXT: vmov.f32 s22, s10
-; CHECK-NEXT: vmov.f32 s2, s16
-; CHECK-NEXT: vmov.f32 s7, s9
-; CHECK-NEXT: vmov.f32 s23, s11
-; CHECK-NEXT: vstrb.8 q1, [r0], #48
-; CHECK-NEXT: vmov.f32 s15, s19
-; CHECK-NEXT: vstrw.32 q5, [r1, #16]
; CHECK-NEXT: vmov.f32 s3, s17
-; CHECK-NEXT: vstrw.32 q3, [r0]
+; CHECK-NEXT: vmov.f32 s7, s9
; CHECK-NEXT: vstrw.32 q0, [r1, #32]
+; CHECK-NEXT: vmov.f32 s15, s19
+; CHECK-NEXT: vstrb.8 q1, [r1], #48
+; CHECK-NEXT: vmov.f32 s23, s11
+; CHECK-NEXT: vstrw.32 q3, [r1]
+; CHECK-NEXT: vstrw.32 q5, [r1, #-32]
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index 52de7a45e85b6..600c5279ca917 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -1085,7 +1085,6 @@ define void @vst3_v2i64(<2 x i64> *%src, <6 x i64> *%dst) {
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: vmov.f64 d6, d5
; CHECK-NEXT: vmov.f32 s13, s11
; CHECK-NEXT: vmov.f32 s14, s2
@@ -1093,10 +1092,10 @@ define void @vst3_v2i64(<2 x i64> *%src, <6 x i64> *%dst) {
; CHECK-NEXT: vmov.f32 s2, s6
; CHECK-NEXT: vmov.f32 s3, s7
; CHECK-NEXT: vmov.f32 s6, s8
-; CHECK-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-NEXT: vmov.f32 s7, s9
-; CHECK-NEXT: vstrb.8 q1, [r0], #32
-; CHECK-NEXT: vstrw.32 q3, [r0]
+; CHECK-NEXT: vstrb.8 q1, [r1], #32
+; CHECK-NEXT: vstrw.32 q3, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r1, #-16]
; CHECK-NEXT: bx lr
entry:
%s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0
diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll
index 887dc470b3bc8..f78d56ca0b962 100644
--- a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll
+++ b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll
@@ -1023,6 +1023,54 @@ while.end: ; preds = %while.body, %while.
ret void
}
+; When the function return type is non-void and 'end' instructions are at the
+; very end of a function, CFGStackify's fixEndsAtEndOfFunction function fixes
+; the corresponding block/loop/try's type to match the function's return type.
+; But when a `try`'s type is fixed, we should also check `end` instructions
+; before its corresponding `catch`, because both the `try` and `catch` bodies
+; should satisfy the return type requirements.
+ +; NOSORT-LABEL: test19 +; NOSORT: try i32 +; NOSORT: loop i32 +; NOSORT: end_loop +; NOSORT: catch +; NOSORT: end_try +; NOSORT-NEXT: end_function +define i32 @test19(i32 %n) personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) { +entry: + %t = alloca %class.Object, align 1 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %n + br label %for.body + +for.body: ; preds = %for.cond + %div = sdiv i32 %n, 2 + %cmp1 = icmp eq i32 %i.0, %div + br i1 %cmp1, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %call = invoke i32 @baz() + to label %invoke.cont unwind label %ehcleanup + +invoke.cont: ; preds = %if.then + %call2 = call %class.Object* @_ZN6ObjectD2Ev(%class.Object* %t) #4 + ret i32 %call + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +ehcleanup: ; preds = %if.then + %0 = cleanuppad within none [] + %call3 = call %class.Object* @_ZN6ObjectD2Ev(%class.Object* %t) #4 [ "funclet"(token %0) ] + cleanupret from %0 unwind to caller +} + + ; Check if the unwind destination mismatch stats are correct ; NOSORT-STAT: 17 wasm-cfg-stackify - Number of EH pad unwind mismatches found diff --git a/llvm/test/CodeGen/X86/2009-04-14-IllegalRegs.ll b/llvm/test/CodeGen/X86/2009-04-14-IllegalRegs.ll index b5635c7e0f067..48ad2a2c07770 100644 --- a/llvm/test/CodeGen/X86/2009-04-14-IllegalRegs.ll +++ b/llvm/test/CodeGen/X86/2009-04-14-IllegalRegs.ll @@ -8,34 +8,34 @@ define i32 @z() nounwind ssp { ; CHECK-LABEL: z: ; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: pushl %edi ; CHECK-NEXT: pushl %esi -; CHECK-NEXT: subl $148, %esp +; CHECK-NEXT: subl $144, %esp ; CHECK-NEXT: movl L___stack_chk_guard$non_lazy_ptr, %eax ; CHECK-NEXT: movl (%eax), %eax ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movb $48, {{[0-9]+}}(%esp) -; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%esp) +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movb %cl, {{[0-9]+}}(%esp) ; CHECK-NEXT: movb $15, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl $8, %ecx -; CHECK-NEXT: leal {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: movl $8, %edx +; CHECK-NEXT: leal {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl %edx, %ecx ; CHECK-NEXT: movl %eax, %edi -; CHECK-NEXT: movl %edx, %esi +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: rep;movsl (%esi), %es:(%edi) ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: addl $36, %ecx -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload ; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: movl %edx, %ecx ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; CHECK-NEXT: movl %edx, %esi +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload ; CHECK-NEXT: rep;movsl (%esi), %es:(%edi) -; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl -; CHECK-NEXT: movb %cl, 32(%eax) -; CHECK-NEXT: movb %cl, 68(%eax) +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %bl +; CHECK-NEXT: movb %bl, 32(%eax) +; CHECK-NEXT: movb %bl, 68(%eax) ; CHECK-NEXT: calll _f ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -50,9 +50,10 @@ define i32 @z() nounwind ssp { ; CHECK-NEXT: jne LBB0_3 ; CHECK-NEXT: ## %bb.2: ## %SP_return ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; 
CHECK-NEXT: addl $148, %esp +; CHECK-NEXT: addl $144, %esp ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %edi +; CHECK-NEXT: popl %ebx ; CHECK-NEXT: retl ; CHECK-NEXT: LBB0_3: ## %CallStackCheckFailBlk ; CHECK-NEXT: calll ___stack_chk_fail diff --git a/llvm/test/CodeGen/X86/GlobalISel/phi.ll b/llvm/test/CodeGen/X86/GlobalISel/phi.ll index 28e65c73acae5..d2ce98d0fb41a 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/phi.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/phi.ll @@ -71,10 +71,11 @@ define i32 @test_i32(i32 %a, i32 %f, i32 %t) { ; ALL-NEXT: cmpl %ecx, %edi ; ALL-NEXT: setg %cl ; ALL-NEXT: testb $1, %cl -; ALL-NEXT: jne .LBB2_2 -; ALL-NEXT: # %bb.1: # %cond.false +; ALL-NEXT: je .LBB2_1 +; ALL-NEXT: # %bb.2: # %cond.end +; ALL-NEXT: retq +; ALL-NEXT: .LBB2_1: # %cond.false ; ALL-NEXT: movl %edx, %eax -; ALL-NEXT: .LBB2_2: # %cond.end ; ALL-NEXT: retq entry: %cmp = icmp sgt i32 %a, 0 @@ -99,10 +100,11 @@ define i64 @test_i64(i32 %a, i64 %f, i64 %t) { ; ALL-NEXT: cmpl %ecx, %edi ; ALL-NEXT: setg %cl ; ALL-NEXT: testb $1, %cl -; ALL-NEXT: jne .LBB3_2 -; ALL-NEXT: # %bb.1: # %cond.false +; ALL-NEXT: je .LBB3_1 +; ALL-NEXT: # %bb.2: # %cond.end +; ALL-NEXT: retq +; ALL-NEXT: .LBB3_1: # %cond.false ; ALL-NEXT: movq %rdx, %rax -; ALL-NEXT: .LBB3_2: # %cond.end ; ALL-NEXT: retq entry: %cmp = icmp sgt i32 %a, 0 @@ -126,10 +128,11 @@ define float @test_float(i32 %a, float %f, float %t) { ; ALL-NEXT: cmpl %eax, %edi ; ALL-NEXT: setg %al ; ALL-NEXT: testb $1, %al -; ALL-NEXT: jne .LBB4_2 -; ALL-NEXT: # %bb.1: # %cond.false +; ALL-NEXT: je .LBB4_1 +; ALL-NEXT: # %bb.2: # %cond.end +; ALL-NEXT: retq +; ALL-NEXT: .LBB4_1: # %cond.false ; ALL-NEXT: movaps %xmm1, %xmm0 -; ALL-NEXT: .LBB4_2: # %cond.end ; ALL-NEXT: retq entry: %cmp = icmp sgt i32 %a, 0 @@ -153,10 +156,11 @@ define double @test_double(i32 %a, double %f, double %t) { ; ALL-NEXT: cmpl %eax, %edi ; ALL-NEXT: setg %al ; ALL-NEXT: testb $1, %al -; ALL-NEXT: jne .LBB5_2 -; ALL-NEXT: # %bb.1: # %cond.false +; ALL-NEXT: je .LBB5_1 +; ALL-NEXT: # %bb.2: # %cond.end +; ALL-NEXT: retq +; ALL-NEXT: .LBB5_1: # %cond.false ; ALL-NEXT: movaps %xmm1, %xmm0 -; ALL-NEXT: .LBB5_2: # %cond.end ; ALL-NEXT: retq entry: %cmp = icmp sgt i32 %a, 0 diff --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll index 63faafc10ec8d..8e20b001cc3e8 100644 --- a/llvm/test/CodeGen/X86/abs.ll +++ b/llvm/test/CodeGen/X86/abs.ll @@ -144,35 +144,31 @@ define i128 @test_i128(i128 %a) nounwind { ; ; X86-LABEL: test_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: addl %edx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: negl %edi -; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %edx, %ebx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %eax, %esi -; X86-NEXT: testl %eax, %eax -; X86-NEXT: cmovnsl %eax, %esi -; X86-NEXT: cmovnsl %ecx, %ebp -; X86-NEXT: cmovnsl %edx, %ebx -; X86-NEXT: cmovnsl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: movl %ebp, 8(%eax) -; X86-NEXT: movl %esi, 12(%eax) +; X86-NEXT: adcl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: adcl %edx, %ebx 
+; X86-NEXT: adcl %edx, %ecx
+; X86-NEXT: xorl %edx, %ecx
+; X86-NEXT: xorl %edx, %ebx
+; X86-NEXT: xorl %edx, %edi
+; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: movl %esi, (%eax)
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl %ebx, 8(%eax)
+; X86-NEXT: movl %ecx, 12(%eax)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl $4
%r = call i128 @llvm.abs.i128(i128 %a, i1 false)
ret i128 %r
diff --git a/llvm/test/CodeGen/X86/asm-reject-x87-int.ll b/llvm/test/CodeGen/X86/asm-reject-x87-int.ll
new file mode 100644
index 0000000000000..ec5c35abc7679
--- /dev/null
+++ b/llvm/test/CodeGen/X86/asm-reject-x87-int.ll
@@ -0,0 +1,39 @@
+; RUN: not llc -o /dev/null %s -mtriple=i386-unknown-unknown 2>&1 | FileCheck %s
+
+; This test was derived from this C code. The frontend sees that the constraint
+; doesn't accept memory, but the argument is a struct. So it tries to bitcast
+; to an integer of the same size. SelectionDAGBuilder doesn't know how to copy
+; between integers and fp80, so it asserts or crashes.
+;
+; gcc accepts the code, but rejects it if the struct is replaced by an int. From
+; the InlineAsm block those two cases look the same in LLVM IR. So if the single
+; element struct case is valid, then the frontend needs to emit different IR.
+
+; typedef struct float4 {
+; float f;
+; } float4;
+;
+; int main() {
+; float4 f4;
+; f4.f = 4.0f;
+; __asm ("fadd %%st(0), %%st(0)" : "+t" (f4));
+; return 0;
+; }
+
+%struct.float4 = type { float }
+
+; CHECK: error: couldn't allocate output register for constraint '{st}'
+define dso_local i32 @foo() {
+entry:
+ %retval = alloca i32, align 4
+ %f4 = alloca %struct.float4, align 4
+ store i32 0, i32* %retval, align 4
+ %f = getelementptr inbounds %struct.float4, %struct.float4* %f4, i32 0, i32 0
+ store float 4.000000e+00, float* %f, align 4
+ %0 = bitcast %struct.float4* %f4 to i32*
+ %1 = load i32, i32* %0, align 4
+ %2 = call i32 asm "fadd %st(0), %st(0)", "={st},0,~{dirflag},~{fpsr},~{flags}"(i32 %1)
+ %3 = bitcast %struct.float4* %f4 to i32*
+ store i32 %2, i32* %3, align 4
+ ret i32 0
+}
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index 7a1f34c65c183..16fde4074ea0e 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -126,8 +126,8 @@ define void @narrow_writeback_and(i64* %ptr) {
; CHECK-O0-NEXT: movq (%rdi), %rax
; CHECK-O0-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-O0-NEXT: andl $-256, %eax
-; CHECK-O0-NEXT: # kill: def $rax killed $eax
-; CHECK-O0-NEXT: movq %rax, (%rdi)
+; CHECK-O0-NEXT: movl %eax, %ecx
+; CHECK-O0-NEXT: movq %rcx, (%rdi)
; CHECK-O0-NEXT: retq
;
; CHECK-O3-LABEL: narrow_writeback_and:
@@ -231,10 +231,10 @@ define i128 @load_i128(i128* %ptr) {
; CHECK-O0-NEXT: .cfi_def_cfa_offset 16
; CHECK-O0-NEXT: .cfi_offset %rbx, -16
; CHECK-O0-NEXT: xorl %eax, %eax
-; CHECK-O0-NEXT: # kill: def $rax killed $eax
-; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; CHECK-O0-NEXT: movl %eax, %ecx
+; CHECK-O0-NEXT: movq %rcx, %rax
+; CHECK-O0-NEXT: movq %rcx, %rdx
+; CHECK-O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
; CHECK-O0-NEXT: lock cmpxchg16b (%rdi)
; CHECK-O0-NEXT: popq %rbx
@@ -326,14 +326,14 @@ define i256 @load_i256(i256*
%ptr) { ; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-O0-NEXT: callq __atomic_load ; CHECK-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-O0-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; CHECK-O0-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; CHECK-O0-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; CHECK-O0-NEXT: movq %rsi, 24(%rdi) -; CHECK-O0-NEXT: movq %rdx, 16(%rdi) -; CHECK-O0-NEXT: movq %rcx, 8(%rdi) -; CHECK-O0-NEXT: movq %rax, (%rdi) +; CHECK-O0-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; CHECK-O0-NEXT: movq %rdi, 24(%r9) +; CHECK-O0-NEXT: movq %rsi, 16(%r9) +; CHECK-O0-NEXT: movq %rdx, 8(%r9) +; CHECK-O0-NEXT: movq %rax, (%r9) ; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; CHECK-O0-NEXT: addq $56, %rsp ; CHECK-O0-NEXT: .cfi_def_cfa_offset 8 @@ -831,8 +831,8 @@ define i64 @load_fold_udiv1(i64* %p) { ; CHECK-O0-NEXT: movq (%rdi), %rax ; CHECK-O0-NEXT: xorl %ecx, %ecx ; CHECK-O0-NEXT: movl %ecx, %edx -; CHECK-O0-NEXT: movl $15, %ecx -; CHECK-O0-NEXT: divq %rcx +; CHECK-O0-NEXT: movl $15, %esi +; CHECK-O0-NEXT: divq %rsi ; CHECK-O0-NEXT: retq ; ; CHECK-O3-CUR-LABEL: load_fold_udiv1: @@ -1024,8 +1024,8 @@ define i64 @load_fold_urem1(i64* %p) { ; CHECK-O0-NEXT: movq (%rdi), %rax ; CHECK-O0-NEXT: xorl %ecx, %ecx ; CHECK-O0-NEXT: movl %ecx, %edx -; CHECK-O0-NEXT: movl $15, %ecx -; CHECK-O0-NEXT: divq %rcx +; CHECK-O0-NEXT: movl $15, %esi +; CHECK-O0-NEXT: divq %rsi ; CHECK-O0-NEXT: movq %rdx, %rax ; CHECK-O0-NEXT: retq ; @@ -1475,9 +1475,9 @@ define i1 @load_fold_icmp3(i64* %p1, i64* %p2) { ; CHECK-O0-NEXT: movq (%rdi), %rax ; CHECK-O0-NEXT: movq (%rsi), %rcx ; CHECK-O0-NEXT: subq %rcx, %rax -; CHECK-O0-NEXT: sete %cl +; CHECK-O0-NEXT: sete %dl ; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-O0-NEXT: movb %cl, %al +; CHECK-O0-NEXT: movb %dl, %al ; CHECK-O0-NEXT: retq ; ; CHECK-O3-CUR-LABEL: load_fold_icmp3: @@ -2076,8 +2076,8 @@ define void @rmw_fold_and1(i64* %p, i64 %v) { ; CHECK-O0-NEXT: movq (%rdi), %rax ; CHECK-O0-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-O0-NEXT: andl $15, %eax -; CHECK-O0-NEXT: # kill: def $rax killed $eax -; CHECK-O0-NEXT: movq %rax, (%rdi) +; CHECK-O0-NEXT: movl %eax, %ecx +; CHECK-O0-NEXT: movq %rcx, (%rdi) ; CHECK-O0-NEXT: retq ; ; CHECK-O3-LABEL: rmw_fold_and1: @@ -2541,8 +2541,9 @@ define i16 @load_i8_anyext_i16(i8* %ptr) { ; CHECK-O0-CUR-LABEL: load_i8_anyext_i16: ; CHECK-O0-CUR: # %bb.0: ; CHECK-O0-CUR-NEXT: movb (%rdi), %al -; CHECK-O0-CUR-NEXT: movzbl %al, %eax -; CHECK-O0-CUR-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-O0-CUR-NEXT: movzbl %al, %ecx +; CHECK-O0-CUR-NEXT: # kill: def $cx killed $cx killed $ecx +; CHECK-O0-CUR-NEXT: movw %cx, %ax ; CHECK-O0-CUR-NEXT: retq ; ; CHECK-O3-CUR-LABEL: load_i8_anyext_i16: @@ -2670,12 +2671,13 @@ define i16 @load_combine(i8* %p) { ; CHECK-O0: # %bb.0: ; CHECK-O0-NEXT: movb (%rdi), %al ; CHECK-O0-NEXT: movb 1(%rdi), %cl -; CHECK-O0-NEXT: movzbl %al, %eax -; CHECK-O0-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-O0-NEXT: movzbl %cl, %ecx -; CHECK-O0-NEXT: # kill: def $cx killed $cx killed $ecx -; CHECK-O0-NEXT: shlw $8, %cx -; CHECK-O0-NEXT: orw %cx, %ax +; CHECK-O0-NEXT: movzbl %al, %edx +; CHECK-O0-NEXT: # kill: def $dx killed $dx killed $edx +; CHECK-O0-NEXT: movzbl %cl, %esi +; CHECK-O0-NEXT: # kill: def $si killed $si killed $esi +; CHECK-O0-NEXT: shlw $8, %si +; CHECK-O0-NEXT: orw %si, %dx +; 
CHECK-O0-NEXT: movw %dx, %ax ; CHECK-O0-NEXT: retq ; ; CHECK-O3-LABEL: load_combine: diff --git a/llvm/test/CodeGen/X86/atomic32.ll b/llvm/test/CodeGen/X86/atomic32.ll index 05a10966a4f1a..24aebbba60d19 100644 --- a/llvm/test/CodeGen/X86/atomic32.ll +++ b/llvm/test/CodeGen/X86/atomic32.ll @@ -70,8 +70,8 @@ define void @atomic_fetch_and32() nounwind { ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $5, %ecx ; X64-NEXT: lock cmpxchgl %ecx, {{.*}}(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl +; X64-NEXT: sete %dl +; X64-NEXT: testb $1, %dl ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill @@ -94,8 +94,8 @@ define void @atomic_fetch_and32() nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $5, %ecx ; X86-NEXT: lock cmpxchgl %ecx, sc32 -; X86-NEXT: sete %cl -; X86-NEXT: testb $1, %cl +; X86-NEXT: sete %dl +; X86-NEXT: testb $1, %dl ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill @@ -124,8 +124,8 @@ define void @atomic_fetch_or32() nounwind { ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: orl $5, %ecx ; X64-NEXT: lock cmpxchgl %ecx, {{.*}}(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl +; X64-NEXT: sete %dl +; X64-NEXT: testb $1, %dl ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill @@ -148,8 +148,8 @@ define void @atomic_fetch_or32() nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: orl $5, %ecx ; X86-NEXT: lock cmpxchgl %ecx, sc32 -; X86-NEXT: sete %cl -; X86-NEXT: testb $1, %cl +; X86-NEXT: sete %dl +; X86-NEXT: testb $1, %dl ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill @@ -178,8 +178,8 @@ define void @atomic_fetch_xor32() nounwind { ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: xorl $5, %ecx ; X64-NEXT: lock cmpxchgl %ecx, {{.*}}(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl +; X64-NEXT: sete %dl +; X64-NEXT: testb $1, %dl ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill @@ -202,8 +202,8 @@ define void @atomic_fetch_xor32() nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: xorl $5, %ecx ; X86-NEXT: lock cmpxchgl %ecx, sc32 -; X86-NEXT: sete %cl -; X86-NEXT: testb $1, %cl +; X86-NEXT: sete %dl +; X86-NEXT: testb $1, %dl ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill @@ -234,8 +234,8 @@ define void @atomic_fetch_nand32(i32 %x) nounwind { ; X64-NEXT: andl %edx, %ecx ; X64-NEXT: notl %ecx ; X64-NEXT: lock cmpxchgl %ecx, {{.*}}(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB5_2 ; X64-NEXT: jmp .LBB5_1 @@ -244,6 +244,7 @@ define void @atomic_fetch_nand32(i32 %x) nounwind { ; ; X86-LABEL: atomic_fetch_nand32: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: subl $8, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl sc32, %ecx @@ -257,13 +258,14 @@ define void @atomic_fetch_nand32(i32 %x) nounwind { ; X86-NEXT: andl %edx, %ecx ; X86-NEXT: notl %ecx ; X86-NEXT: lock cmpxchgl %ecx, sc32 -; X86-NEXT: sete %cl -; X86-NEXT: testb $1, %cl +; X86-NEXT: sete %bl +; 
X86-NEXT: testb $1, %bl ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: jne .LBB5_2 ; X86-NEXT: jmp .LBB5_1 ; X86-NEXT: .LBB5_2: # %atomicrmw.end ; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %ebx ; X86-NEXT: retl %t1 = atomicrmw nand i32* @sc32, i32 %x acquire ret void @@ -283,8 +285,8 @@ define void @atomic_fetch_max32(i32 %x) nounwind { ; X64-NEXT: subl %edx, %ecx ; X64-NEXT: cmovgl %eax, %edx ; X64-NEXT: lock cmpxchgl %edx, {{.*}}(%rip) -; X64-NEXT: sete %dl -; X64-NEXT: testb $1, %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB6_2 @@ -294,6 +296,7 @@ define void @atomic_fetch_max32(i32 %x) nounwind { ; ; X86-CMOV-LABEL: atomic_fetch_max32: ; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: pushl %ebx ; X86-CMOV-NEXT: subl $12, %esp ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-CMOV-NEXT: movl sc32, %ecx @@ -307,18 +310,20 @@ define void @atomic_fetch_max32(i32 %x) nounwind { ; X86-CMOV-NEXT: subl %edx, %ecx ; X86-CMOV-NEXT: cmovgl %eax, %edx ; X86-CMOV-NEXT: lock cmpxchgl %edx, sc32 -; X86-CMOV-NEXT: sete %dl -; X86-CMOV-NEXT: testb $1, %dl +; X86-CMOV-NEXT: sete %bl +; X86-CMOV-NEXT: testb $1, %bl ; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-CMOV-NEXT: jne .LBB6_2 ; X86-CMOV-NEXT: jmp .LBB6_1 ; X86-CMOV-NEXT: .LBB6_2: # %atomicrmw.end ; X86-CMOV-NEXT: addl $12, %esp +; X86-CMOV-NEXT: popl %ebx ; X86-CMOV-NEXT: retl ; ; X86-NOCMOV-LABEL: atomic_fetch_max32: ; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: pushl %ebx ; X86-NOCMOV-NEXT: pushl %esi ; X86-NOCMOV-NEXT: subl $20, %esp ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -347,18 +352,20 @@ define void @atomic_fetch_max32(i32 %x) nounwind { ; X86-NOCMOV-NEXT: movl %ecx, %eax ; X86-NOCMOV-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %edx, sc32 -; X86-NOCMOV-NEXT: sete %dl -; X86-NOCMOV-NEXT: testb $1, %dl +; X86-NOCMOV-NEXT: sete %bl +; X86-NOCMOV-NEXT: testb $1, %bl ; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jne .LBB6_2 ; X86-NOCMOV-NEXT: jmp .LBB6_1 ; X86-NOCMOV-NEXT: .LBB6_2: # %atomicrmw.end ; X86-NOCMOV-NEXT: addl $20, %esp ; X86-NOCMOV-NEXT: popl %esi +; X86-NOCMOV-NEXT: popl %ebx ; X86-NOCMOV-NEXT: retl ; ; X86-NOX87-LABEL: atomic_fetch_max32: ; X86-NOX87: # %bb.0: +; X86-NOX87-NEXT: pushl %ebx ; X86-NOX87-NEXT: pushl %esi ; X86-NOX87-NEXT: subl $20, %esp ; X86-NOX87-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -387,14 +394,15 @@ define void @atomic_fetch_max32(i32 %x) nounwind { ; X86-NOX87-NEXT: movl %ecx, %eax ; X86-NOX87-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOX87-NEXT: lock cmpxchgl %edx, sc32 -; X86-NOX87-NEXT: sete %dl -; X86-NOX87-NEXT: testb $1, %dl +; X86-NOX87-NEXT: sete %bl +; X86-NOX87-NEXT: testb $1, %bl ; X86-NOX87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOX87-NEXT: jne .LBB6_2 ; X86-NOX87-NEXT: jmp .LBB6_1 ; X86-NOX87-NEXT: .LBB6_2: # %atomicrmw.end ; X86-NOX87-NEXT: addl $20, %esp ; X86-NOX87-NEXT: popl %esi +; X86-NOX87-NEXT: popl %ebx ; X86-NOX87-NEXT: retl %t1 = atomicrmw max i32* @sc32, i32 %x acquire ret void @@ -414,8 +422,8 @@ define void @atomic_fetch_min32(i32 %x) nounwind { ; X64-NEXT: subl %edx, %ecx ; X64-NEXT: cmovlel %eax, %edx ; X64-NEXT: lock cmpxchgl %edx, {{.*}}(%rip) -; X64-NEXT: sete %dl -; X64-NEXT: testb $1, %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; 
X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB7_2 @@ -425,6 +433,7 @@ define void @atomic_fetch_min32(i32 %x) nounwind { ; ; X86-CMOV-LABEL: atomic_fetch_min32: ; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: pushl %ebx ; X86-CMOV-NEXT: subl $12, %esp ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-CMOV-NEXT: movl sc32, %ecx @@ -438,18 +447,20 @@ define void @atomic_fetch_min32(i32 %x) nounwind { ; X86-CMOV-NEXT: subl %edx, %ecx ; X86-CMOV-NEXT: cmovlel %eax, %edx ; X86-CMOV-NEXT: lock cmpxchgl %edx, sc32 -; X86-CMOV-NEXT: sete %dl -; X86-CMOV-NEXT: testb $1, %dl +; X86-CMOV-NEXT: sete %bl +; X86-CMOV-NEXT: testb $1, %bl ; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-CMOV-NEXT: jne .LBB7_2 ; X86-CMOV-NEXT: jmp .LBB7_1 ; X86-CMOV-NEXT: .LBB7_2: # %atomicrmw.end ; X86-CMOV-NEXT: addl $12, %esp +; X86-CMOV-NEXT: popl %ebx ; X86-CMOV-NEXT: retl ; ; X86-NOCMOV-LABEL: atomic_fetch_min32: ; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: pushl %ebx ; X86-NOCMOV-NEXT: pushl %esi ; X86-NOCMOV-NEXT: subl $20, %esp ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -478,18 +489,20 @@ define void @atomic_fetch_min32(i32 %x) nounwind { ; X86-NOCMOV-NEXT: movl %ecx, %eax ; X86-NOCMOV-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %edx, sc32 -; X86-NOCMOV-NEXT: sete %dl -; X86-NOCMOV-NEXT: testb $1, %dl +; X86-NOCMOV-NEXT: sete %bl +; X86-NOCMOV-NEXT: testb $1, %bl ; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jne .LBB7_2 ; X86-NOCMOV-NEXT: jmp .LBB7_1 ; X86-NOCMOV-NEXT: .LBB7_2: # %atomicrmw.end ; X86-NOCMOV-NEXT: addl $20, %esp ; X86-NOCMOV-NEXT: popl %esi +; X86-NOCMOV-NEXT: popl %ebx ; X86-NOCMOV-NEXT: retl ; ; X86-NOX87-LABEL: atomic_fetch_min32: ; X86-NOX87: # %bb.0: +; X86-NOX87-NEXT: pushl %ebx ; X86-NOX87-NEXT: pushl %esi ; X86-NOX87-NEXT: subl $20, %esp ; X86-NOX87-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -518,14 +531,15 @@ define void @atomic_fetch_min32(i32 %x) nounwind { ; X86-NOX87-NEXT: movl %ecx, %eax ; X86-NOX87-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOX87-NEXT: lock cmpxchgl %edx, sc32 -; X86-NOX87-NEXT: sete %dl -; X86-NOX87-NEXT: testb $1, %dl +; X86-NOX87-NEXT: sete %bl +; X86-NOX87-NEXT: testb $1, %bl ; X86-NOX87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOX87-NEXT: jne .LBB7_2 ; X86-NOX87-NEXT: jmp .LBB7_1 ; X86-NOX87-NEXT: .LBB7_2: # %atomicrmw.end ; X86-NOX87-NEXT: addl $20, %esp ; X86-NOX87-NEXT: popl %esi +; X86-NOX87-NEXT: popl %ebx ; X86-NOX87-NEXT: retl %t1 = atomicrmw min i32* @sc32, i32 %x acquire ret void @@ -545,8 +559,8 @@ define void @atomic_fetch_umax32(i32 %x) nounwind { ; X64-NEXT: subl %edx, %ecx ; X64-NEXT: cmoval %eax, %edx ; X64-NEXT: lock cmpxchgl %edx, {{.*}}(%rip) -; X64-NEXT: sete %dl -; X64-NEXT: testb $1, %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB8_2 @@ -556,6 +570,7 @@ define void @atomic_fetch_umax32(i32 %x) nounwind { ; ; X86-CMOV-LABEL: atomic_fetch_umax32: ; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: pushl %ebx ; X86-CMOV-NEXT: subl $12, %esp ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-CMOV-NEXT: movl sc32, %ecx @@ -569,18 +584,20 @@ define void @atomic_fetch_umax32(i32 %x) nounwind { ; X86-CMOV-NEXT: subl %edx, %ecx ; X86-CMOV-NEXT: cmoval 
%eax, %edx ; X86-CMOV-NEXT: lock cmpxchgl %edx, sc32 -; X86-CMOV-NEXT: sete %dl -; X86-CMOV-NEXT: testb $1, %dl +; X86-CMOV-NEXT: sete %bl +; X86-CMOV-NEXT: testb $1, %bl ; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-CMOV-NEXT: jne .LBB8_2 ; X86-CMOV-NEXT: jmp .LBB8_1 ; X86-CMOV-NEXT: .LBB8_2: # %atomicrmw.end ; X86-CMOV-NEXT: addl $12, %esp +; X86-CMOV-NEXT: popl %ebx ; X86-CMOV-NEXT: retl ; ; X86-NOCMOV-LABEL: atomic_fetch_umax32: ; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: pushl %ebx ; X86-NOCMOV-NEXT: pushl %esi ; X86-NOCMOV-NEXT: subl $20, %esp ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -609,18 +626,20 @@ define void @atomic_fetch_umax32(i32 %x) nounwind { ; X86-NOCMOV-NEXT: movl %ecx, %eax ; X86-NOCMOV-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %edx, sc32 -; X86-NOCMOV-NEXT: sete %dl -; X86-NOCMOV-NEXT: testb $1, %dl +; X86-NOCMOV-NEXT: sete %bl +; X86-NOCMOV-NEXT: testb $1, %bl ; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jne .LBB8_2 ; X86-NOCMOV-NEXT: jmp .LBB8_1 ; X86-NOCMOV-NEXT: .LBB8_2: # %atomicrmw.end ; X86-NOCMOV-NEXT: addl $20, %esp ; X86-NOCMOV-NEXT: popl %esi +; X86-NOCMOV-NEXT: popl %ebx ; X86-NOCMOV-NEXT: retl ; ; X86-NOX87-LABEL: atomic_fetch_umax32: ; X86-NOX87: # %bb.0: +; X86-NOX87-NEXT: pushl %ebx ; X86-NOX87-NEXT: pushl %esi ; X86-NOX87-NEXT: subl $20, %esp ; X86-NOX87-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -649,14 +668,15 @@ define void @atomic_fetch_umax32(i32 %x) nounwind { ; X86-NOX87-NEXT: movl %ecx, %eax ; X86-NOX87-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOX87-NEXT: lock cmpxchgl %edx, sc32 -; X86-NOX87-NEXT: sete %dl -; X86-NOX87-NEXT: testb $1, %dl +; X86-NOX87-NEXT: sete %bl +; X86-NOX87-NEXT: testb $1, %bl ; X86-NOX87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOX87-NEXT: jne .LBB8_2 ; X86-NOX87-NEXT: jmp .LBB8_1 ; X86-NOX87-NEXT: .LBB8_2: # %atomicrmw.end ; X86-NOX87-NEXT: addl $20, %esp ; X86-NOX87-NEXT: popl %esi +; X86-NOX87-NEXT: popl %ebx ; X86-NOX87-NEXT: retl %t1 = atomicrmw umax i32* @sc32, i32 %x acquire ret void @@ -676,8 +696,8 @@ define void @atomic_fetch_umin32(i32 %x) nounwind { ; X64-NEXT: subl %edx, %ecx ; X64-NEXT: cmovbel %eax, %edx ; X64-NEXT: lock cmpxchgl %edx, {{.*}}(%rip) -; X64-NEXT: sete %dl -; X64-NEXT: testb $1, %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB9_2 @@ -687,6 +707,7 @@ define void @atomic_fetch_umin32(i32 %x) nounwind { ; ; X86-CMOV-LABEL: atomic_fetch_umin32: ; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: pushl %ebx ; X86-CMOV-NEXT: subl $12, %esp ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-CMOV-NEXT: movl sc32, %ecx @@ -700,18 +721,20 @@ define void @atomic_fetch_umin32(i32 %x) nounwind { ; X86-CMOV-NEXT: subl %edx, %ecx ; X86-CMOV-NEXT: cmovbel %eax, %edx ; X86-CMOV-NEXT: lock cmpxchgl %edx, sc32 -; X86-CMOV-NEXT: sete %dl -; X86-CMOV-NEXT: testb $1, %dl +; X86-CMOV-NEXT: sete %bl +; X86-CMOV-NEXT: testb $1, %bl ; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-CMOV-NEXT: jne .LBB9_2 ; X86-CMOV-NEXT: jmp .LBB9_1 ; X86-CMOV-NEXT: .LBB9_2: # %atomicrmw.end ; X86-CMOV-NEXT: addl $12, %esp +; X86-CMOV-NEXT: popl %ebx ; X86-CMOV-NEXT: retl ; ; X86-NOCMOV-LABEL: atomic_fetch_umin32: ; X86-NOCMOV: # %bb.0: +; 
X86-NOCMOV-NEXT: pushl %ebx ; X86-NOCMOV-NEXT: pushl %esi ; X86-NOCMOV-NEXT: subl $20, %esp ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -740,18 +763,20 @@ define void @atomic_fetch_umin32(i32 %x) nounwind { ; X86-NOCMOV-NEXT: movl %ecx, %eax ; X86-NOCMOV-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %edx, sc32 -; X86-NOCMOV-NEXT: sete %dl -; X86-NOCMOV-NEXT: testb $1, %dl +; X86-NOCMOV-NEXT: sete %bl +; X86-NOCMOV-NEXT: testb $1, %bl ; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jne .LBB9_2 ; X86-NOCMOV-NEXT: jmp .LBB9_1 ; X86-NOCMOV-NEXT: .LBB9_2: # %atomicrmw.end ; X86-NOCMOV-NEXT: addl $20, %esp ; X86-NOCMOV-NEXT: popl %esi +; X86-NOCMOV-NEXT: popl %ebx ; X86-NOCMOV-NEXT: retl ; ; X86-NOX87-LABEL: atomic_fetch_umin32: ; X86-NOX87: # %bb.0: +; X86-NOX87-NEXT: pushl %ebx ; X86-NOX87-NEXT: pushl %esi ; X86-NOX87-NEXT: subl $20, %esp ; X86-NOX87-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -780,14 +805,15 @@ define void @atomic_fetch_umin32(i32 %x) nounwind { ; X86-NOX87-NEXT: movl %ecx, %eax ; X86-NOX87-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOX87-NEXT: lock cmpxchgl %edx, sc32 -; X86-NOX87-NEXT: sete %dl -; X86-NOX87-NEXT: testb $1, %dl +; X86-NOX87-NEXT: sete %bl +; X86-NOX87-NEXT: testb $1, %bl ; X86-NOX87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOX87-NEXT: jne .LBB9_2 ; X86-NOX87-NEXT: jmp .LBB9_1 ; X86-NOX87-NEXT: .LBB9_2: # %atomicrmw.end ; X86-NOX87-NEXT: addl $20, %esp ; X86-NOX87-NEXT: popl %esi +; X86-NOX87-NEXT: popl %ebx ; X86-NOX87-NEXT: retl %t1 = atomicrmw umin i32* @sc32, i32 %x acquire ret void diff --git a/llvm/test/CodeGen/X86/atomic64.ll b/llvm/test/CodeGen/X86/atomic64.ll index 963561dc8deb2..8b40380afcb2a 100644 --- a/llvm/test/CodeGen/X86/atomic64.ll +++ b/llvm/test/CodeGen/X86/atomic64.ll @@ -137,12 +137,12 @@ define void @atomic_fetch_and64() nounwind { ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $5, %ecx -; X64-NEXT: # kill: def $rcx killed $ecx -; X64-NEXT: lock cmpxchgq %rcx, {{.*}}(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movl %ecx, %edx +; X64-NEXT: lock cmpxchgq %rdx, {{.*}}(%rip) +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: jne .LBB2_2 ; X64-NEXT: jmp .LBB2_1 @@ -202,8 +202,8 @@ define void @atomic_fetch_or64() nounwind { ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: orq $5, %rcx ; X64-NEXT: lock cmpxchgq %rcx, {{.*}}(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl +; X64-NEXT: sete %dl +; X64-NEXT: testb $1, %dl ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -265,8 +265,8 @@ define void @atomic_fetch_xor64() nounwind { ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: xorq $5, %rcx ; X64-NEXT: lock cmpxchgq %rcx, {{.*}}(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl +; X64-NEXT: sete %dl +; X64-NEXT: testb $1, %dl ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -330,8 +330,8 @@ define void @atomic_fetch_nand64(i64 %x) nounwind { ; X64-NEXT: andq %rdx, %rcx ; X64-NEXT: notq %rcx ; X64-NEXT: lock 
cmpxchgq %rcx, {{.*}}(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: jne .LBB5_2 ; X64-NEXT: jmp .LBB5_1 @@ -373,8 +373,8 @@ define void @atomic_fetch_max64(i64 %x) nounwind { ; X64-NEXT: subq %rdx, %rcx ; X64-NEXT: cmovgq %rax, %rdx ; X64-NEXT: lock cmpxchgq %rdx, {{.*}}(%rip) -; X64-NEXT: sete %dl -; X64-NEXT: testb $1, %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: jne .LBB6_2 @@ -471,8 +471,8 @@ define void @atomic_fetch_min64(i64 %x) nounwind { ; X64-NEXT: subq %rdx, %rcx ; X64-NEXT: cmovleq %rax, %rdx ; X64-NEXT: lock cmpxchgq %rdx, {{.*}}(%rip) -; X64-NEXT: sete %dl -; X64-NEXT: testb $1, %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: jne .LBB7_2 @@ -569,8 +569,8 @@ define void @atomic_fetch_umax64(i64 %x) nounwind { ; X64-NEXT: subq %rdx, %rcx ; X64-NEXT: cmovaq %rax, %rdx ; X64-NEXT: lock cmpxchgq %rdx, {{.*}}(%rip) -; X64-NEXT: sete %dl -; X64-NEXT: testb $1, %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: jne .LBB8_2 @@ -667,8 +667,8 @@ define void @atomic_fetch_umin64(i64 %x) nounwind { ; X64-NEXT: subq %rdx, %rcx ; X64-NEXT: cmovbeq %rax, %rdx ; X64-NEXT: lock cmpxchgq %rdx, {{.*}}(%rip) -; X64-NEXT: sete %dl -; X64-NEXT: testb $1, %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: jne .LBB9_2 diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index 051493a4ab57a..e2139fd20d32c 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -90,157 +90,29 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind { define void @avg_v24i8(<24 x i8>* %a, <24 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v24i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm5 -; SSE2-NEXT: movdqa 16(%rdi), %xmm6 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] -; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; SSE2-NEXT: movdqa %xmm6, 
%xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; SSE2-NEXT: paddd %xmm9, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE2-NEXT: paddd %xmm5, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; SSE2-NEXT: paddd %xmm10, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE2-NEXT: paddd %xmm6, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: psubd %xmm6, %xmm3 -; SSE2-NEXT: psubd %xmm6, %xmm2 -; SSE2-NEXT: psubd %xmm6, %xmm4 -; SSE2-NEXT: psubd %xmm6, %xmm0 -; SSE2-NEXT: psubd %xmm6, %xmm5 -; SSE2-NEXT: psubd %xmm6, %xmm1 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,0,255,0,255,0] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: packuswb %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: packuswb %xmm5, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: pavgb (%rdi), %xmm0 +; SSE2-NEXT: pavgb 16(%rdi), %xmm1 ; SSE2-NEXT: movq %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v24i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; 
AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vmovdqa (%rsi), %xmm6 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm5, %xmm8, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpsubd %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpsubd %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpsubd %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpsubd %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5 -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1 ; AVX1-NEXT: vmovq %xmm1, (%rax) ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v24i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm0 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpbroadcastq 8(%rsi), %xmm3 -; AVX2-NEXT: 
vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; AVX2-NEXT: vpsubd %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovq %xmm1, (%rax) ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -248,17 +120,11 @@ define void @avg_v24i8(<24 x i8>* %a, <24 x i8>* %b) nounwind { ; ; AVX512-LABEL: avg_v24i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512-NEXT: vpsubd %ymm2, %ymm1, %ymm1 -; AVX512-NEXT: vpsrld $1, %ymm1, %ymm1 -; AVX512-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmovq %xmm1, (%rax) +; AVX512-NEXT: vmovdqu %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = load <24 x i8>, <24 x i8>* %a @@ -324,314 +190,60 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind { define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v48i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm6 -; SSE2-NEXT: movdqa 32(%rdi), %xmm11 -; SSE2-NEXT: movdqa (%rsi), %xmm12 -; SSE2-NEXT: movdqa 16(%rsi), %xmm13 -; SSE2-NEXT: movdqa 
32(%rsi), %xmm0 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE2-NEXT: movdqa %xmm1, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] -; SSE2-NEXT: movdqa %xmm5, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; SSE2-NEXT: movdqa %xmm6, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE2-NEXT: movdqa %xmm12, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] -; SSE2-NEXT: movdqa %xmm3, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; SSE2-NEXT: paddd %xmm2, %xmm8 -; SSE2-NEXT: movdqa %xmm11, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; SSE2-NEXT: movdqa %xmm12, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; SSE2-NEXT: paddd %xmm10, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] -; SSE2-NEXT: paddd %xmm1, %xmm12 -; SSE2-NEXT: movdqa %xmm13, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] -; SSE2-NEXT: movdqa %xmm4, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; SSE2-NEXT: paddd %xmm15, %xmm10 -; SSE2-NEXT: movdqa %xmm2, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = 
xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE2-NEXT: paddd %xmm5, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] -; SSE2-NEXT: movdqa %xmm13, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE2-NEXT: paddd %xmm14, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] -; SSE2-NEXT: paddd %xmm6, %xmm13 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; SSE2-NEXT: movdqa %xmm6, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; SSE2-NEXT: paddd %xmm15, %xmm14 -; SSE2-NEXT: movdqa %xmm11, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE2-NEXT: paddd %xmm2, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE2-NEXT: paddd %xmm11, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: psubd %xmm5, %xmm8 -; SSE2-NEXT: psubd %xmm5, %xmm3 -; SSE2-NEXT: psubd %xmm5, %xmm9 -; SSE2-NEXT: psubd %xmm5, %xmm12 -; SSE2-NEXT: psubd %xmm5, %xmm10 -; SSE2-NEXT: psubd %xmm5, %xmm4 -; SSE2-NEXT: psubd %xmm5, %xmm1 -; SSE2-NEXT: psubd %xmm5, %xmm13 -; SSE2-NEXT: psubd %xmm5, %xmm14 -; SSE2-NEXT: psubd %xmm5, %xmm6 -; SSE2-NEXT: psubd %xmm5, %xmm2 -; SSE2-NEXT: psubd %xmm5, %xmm0 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,0,255,0,255,0] -; SSE2-NEXT: pand %xmm7, %xmm8 -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: packuswb %xmm8, %xmm3 -; SSE2-NEXT: psrld $1, %xmm12 -; SSE2-NEXT: psrld $1, %xmm9 -; SSE2-NEXT: pand %xmm7, %xmm9 -; SSE2-NEXT: pand %xmm7, %xmm12 -; SSE2-NEXT: packuswb %xmm9, %xmm12 -; SSE2-NEXT: packuswb %xmm3, %xmm12 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: psrld $1, %xmm10 -; SSE2-NEXT: pand %xmm7, %xmm10 -; SSE2-NEXT: pand %xmm7, %xmm4 -; SSE2-NEXT: packuswb %xmm10, %xmm4 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: pand %xmm7, %xmm13 -; SSE2-NEXT: packuswb %xmm1, %xmm13 -; SSE2-NEXT: packuswb %xmm4, %xmm13 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm14 -; SSE2-NEXT: pand %xmm7, %xmm14 -; 
SSE2-NEXT: pand %xmm7, %xmm6 -; SSE2-NEXT: packuswb %xmm14, %xmm6 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm6, %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa 32(%rsi), %xmm2 +; SSE2-NEXT: pavgb (%rdi), %xmm0 +; SSE2-NEXT: pavgb 16(%rdi), %xmm1 +; SSE2-NEXT: pavgb 32(%rdi), %xmm2 +; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: movdqu %xmm13, (%rax) -; SSE2-NEXT: movdqu %xmm12, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v48i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm4 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm12 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,3,3] -; 
AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm10 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm9 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm8 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm3, %xmm15, %xmm15 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm11, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm14 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm13 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,3,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero -; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpsubd %xmm3, %xmm12, %xmm11 -; AVX1-NEXT: vpsubd %xmm3, %xmm10, %xmm10 -; AVX1-NEXT: vpsubd %xmm3, %xmm9, %xmm9 -; AVX1-NEXT: vpsubd %xmm3, %xmm8, %xmm8 -; AVX1-NEXT: vpsubd %xmm3, %xmm15, %xmm12 -; AVX1-NEXT: vpsubd %xmm3, %xmm7, %xmm7 -; AVX1-NEXT: vpsubd %xmm3, %xmm14, %xmm0 -; AVX1-NEXT: vpsubd %xmm3, %xmm13, %xmm2 -; AVX1-NEXT: vpsubd %xmm3, %xmm5, %xmm5 -; AVX1-NEXT: vpsubd %xmm3, %xmm6, %xmm6 -; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm6, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm5, %xmm4 -; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm7, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm12, %xmm4 -; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm8, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm9, %xmm5 -; 
AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm10, %xmm5 -; AVX1-NEXT: vpsrld $1, %xmm11, %xmm6 -; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm2 -; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX1-NEXT: vpavgb 32(%rdi), %xmm2, %xmm2 +; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vmovdqu %xmm4, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v48i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq 24(%rdi), %xmm0 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpbroadcastq 40(%rdi), %xmm4 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpbroadcastq 24(%rsi), %xmm6 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm6, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastq 8(%rsi), %xmm6 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 
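; The pattern every one of these avg_* diffs exercises is the unsigned-average
; idiom: widen both inputs, add them plus a rounding 1, shift right by one, and
; narrow again. A minimal sketch (a hypothetical <16 x i8> reduction of the
; <48 x i8> test, not part of the test file) of the IR shape that the backend
; now matches to a single pavgb/vpavgb instead of the widened add/shift
; sequences being deleted here:
;
; define <16 x i8> @avg_v16i8_sketch(<16 x i8> %a, <16 x i8> %b) {
;   %za  = zext <16 x i8> %a to <16 x i16>
;   %zb  = zext <16 x i8> %b to <16 x i16>
;   %sum = add <16 x i16> %za, %zb
;   %rnd = add <16 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
;   %srl = lshr <16 x i16> %rnd, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
;   %avg = trunc <16 x i16> %srl to <16 x i8>
;   ret <16 x i8> %avg
; }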
-; AVX2-NEXT: vpaddd %ymm6, %ymm3, %ymm3 -; AVX2-NEXT: vpbroadcastq 40(%rsi), %xmm6 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 -; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3 -; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 -; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 -; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 -; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm0[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,2,1,3] -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX2-NEXT: vpavgb 32(%rdi), %xmm1, %xmm1 ; AVX2-NEXT: vmovdqu %xmm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: avg_v48i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2 -; AVX512-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX512-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 -; AVX512-NEXT: vmovdqu %xmm1, (%rax) -; AVX512-NEXT: vmovdqu %xmm0, (%rax) -; AVX512-NEXT: vmovdqu %xmm2, (%rax) -; AVX512-NEXT: retq +; AVX512F-LABEL: avg_v48i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512F-NEXT: vpavgb 32(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vmovdqu %xmm1, (%rax) +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: avg_v48i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, (%rax) +; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %1 = load <48 x i8>, <48 x i8>* %a %2 = load <48 x i8>, <48 x i8>* %b %3 = zext <48 x i8> %1 to <48 x i32> @@ -897,193 +509,78 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind { define void @avg_v40i16(<40 x i16>* %a, <40 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v40i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa 
64(%rdi), %xmm10 -; SSE2-NEXT: movdqa (%rdi), %xmm5 -; SSE2-NEXT: movdqa 16(%rdi), %xmm6 -; SSE2-NEXT: movdqa 32(%rdi), %xmm13 -; SSE2-NEXT: movdqa 48(%rdi), %xmm12 -; SSE2-NEXT: movdqa 64(%rsi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm1 -; SSE2-NEXT: movdqa 16(%rsi), %xmm14 -; SSE2-NEXT: movdqa 32(%rsi), %xmm11 -; SSE2-NEXT: movdqa 48(%rsi), %xmm9 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE2-NEXT: paddd %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm14, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: paddd %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm13, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] -; SSE2-NEXT: paddd %xmm6, %xmm14 -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; SSE2-NEXT: paddd %xmm5, %xmm7 -; SSE2-NEXT: movdqa %xmm12, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] -; SSE2-NEXT: paddd %xmm13, %xmm11 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; SSE2-NEXT: paddd %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; SSE2-NEXT: paddd %xmm12, %xmm9 -; SSE2-NEXT: movdqa %xmm8, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; SSE2-NEXT: paddd %xmm10, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: psubd %xmm0, %xmm4 -; SSE2-NEXT: psubd %xmm0, %xmm1 -; SSE2-NEXT: psubd %xmm0, %xmm3 -; SSE2-NEXT: psubd %xmm0, %xmm14 -; SSE2-NEXT: psubd %xmm0, %xmm7 -; SSE2-NEXT: psubd %xmm0, %xmm11 -; SSE2-NEXT: psubd %xmm0, %xmm6 -; SSE2-NEXT: psubd %xmm0, %xmm9 -; SSE2-NEXT: psubd %xmm0, %xmm5 -; SSE2-NEXT: psubd %xmm0, %xmm8 -; SSE2-NEXT: psrld 
$1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: psrld $1, %xmm14 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: psrld $1, %xmm11 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE2-NEXT: psrld $1, %xmm9 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE2-NEXT: psrld $1, %xmm8 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE2-NEXT: movdqu %xmm5, (%rax) +; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa 32(%rsi), %xmm2 +; SSE2-NEXT: movdqa 48(%rsi), %xmm3 +; SSE2-NEXT: pavgw (%rdi), %xmm0 +; SSE2-NEXT: pavgw 16(%rdi), %xmm1 +; SSE2-NEXT: pavgw 32(%rdi), %xmm2 +; SSE2-NEXT: pavgw 48(%rdi), %xmm3 +; SSE2-NEXT: movdqa 64(%rsi), %xmm4 +; SSE2-NEXT: pavgw 64(%rdi), %xmm4 ; SSE2-NEXT: movdqu %xmm4, (%rax) ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v40i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX1-NEXT: vpavgw 64(%rsi), %xmm4, %xmm4 -; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vpavgw 32(%rsi), %xmm2, %xmm2 -; AVX1-NEXT: vpavgw 48(%rsi), %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX1-NEXT: vpavgw 64(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa (%rsi), %xmm1 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm2 +; AVX1-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX1-NEXT: vmovdqa 48(%rsi), %xmm4 +; 
AVX1-NEXT: vpavgw (%rdi), %xmm1, %xmm1 +; AVX1-NEXT: vpavgw 16(%rdi), %xmm2, %xmm2 +; AVX1-NEXT: vpavgw 32(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vpavgw 48(%rdi), %xmm4, %xmm4 +; AVX1-NEXT: vmovdqu %xmm4, (%rax) ; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: vmovdqu %xmm2, (%rax) ; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vmovdqu %xmm4, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v40i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-NEXT: vpavgw 64(%rsi), %xmm4, %xmm4 -; AVX2-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpavgw 32(%rsi), %xmm2, %xmm2 -; AVX2-NEXT: vpavgw 48(%rsi), %xmm3, %xmm3 -; AVX2-NEXT: vmovdqu %xmm3, (%rax) +; AVX2-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX2-NEXT: vpavgw 64(%rdi), %xmm2, %xmm2 ; AVX2-NEXT: vmovdqu %xmm2, (%rax) -; AVX2-NEXT: vmovdqu %xmm1, (%rax) -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: vmovdqu %xmm4, (%rax) +; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v40i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpaddd %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; AVX512F-NEXT: vpsubd %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrld $1, %ymm2, %ymm2 -; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 -; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512F-NEXT: vpavgw 64(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512F-NEXT: vpavgw (%rdi), %ymm1, %ymm1 +; AVX512F-NEXT: vpavgw 32(%rdi), %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqu %ymm2, (%rax) ; AVX512F-NEXT: vmovdqu %ymm1, (%rax) -; AVX512F-NEXT: vmovdqu %ymm0, (%rax) -; AVX512F-NEXT: vmovdqu %xmm2, (%rax) +; AVX512F-NEXT: vmovdqu %xmm0, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v40i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512BW-NEXT: vpaddd %ymm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; AVX512BW-NEXT: vpsubd %ymm3, 
%ymm2, %ymm2 -; AVX512BW-NEXT: vpsrld $1, %ymm2, %ymm2 -; AVX512BW-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 -; AVX512BW-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdw %zmm2, %ymm1 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512BW-NEXT: vpavgw 64(%rdi), %xmm1, %xmm1 ; AVX512BW-NEXT: vmovdqu %xmm1, (%rax) +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = load <40 x i16>, <40 x i16>* %a diff --git a/llvm/test/CodeGen/X86/avx-load-store.ll b/llvm/test/CodeGen/X86/avx-load-store.ll index f448bfec2ec99..718449d7a771f 100644 --- a/llvm/test/CodeGen/X86/avx-load-store.ll +++ b/llvm/test/CodeGen/X86/avx-load-store.ll @@ -175,8 +175,8 @@ define void @double_save(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind ssp ; CHECK_O0: # %bb.0: ; CHECK_O0-NEXT: # implicit-def: $ymm2 ; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2 -; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 -; CHECK_O0-NEXT: vmovdqu %ymm0, (%rdi) +; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 +; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi) ; CHECK_O0-NEXT: vzeroupper ; CHECK_O0-NEXT: retq %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> @@ -197,8 +197,8 @@ define void @double_save_volatile(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nou ; CHECK_O0: # %bb.0: ; CHECK_O0-NEXT: # implicit-def: $ymm2 ; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2 -; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 -; CHECK_O0-NEXT: vmovdqu %ymm0, (%rdi) +; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 +; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi) ; CHECK_O0-NEXT: vzeroupper ; CHECK_O0-NEXT: retq %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> @@ -239,10 +239,10 @@ define void @f_f() nounwind { ; CHECK_O0-NEXT: .LBB9_3: # %cif_mixed_test_all ; CHECK_O0-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967295,0,0,0] ; CHECK_O0-NEXT: vmovdqa %xmm0, %xmm0 -; CHECK_O0-NEXT: # kill: def $ymm0 killed $xmm0 +; CHECK_O0-NEXT: vmovaps %xmm0, %xmm1 ; CHECK_O0-NEXT: # implicit-def: $rax -; CHECK_O0-NEXT: # implicit-def: $ymm1 -; CHECK_O0-NEXT: vmaskmovps %ymm1, %ymm0, (%rax) +; CHECK_O0-NEXT: # implicit-def: $ymm2 +; CHECK_O0-NEXT: vmaskmovps %ymm2, %ymm1, (%rax) ; CHECK_O0-NEXT: .LBB9_4: # %cif_mixed_test_any_check allocas: br i1 undef, label %cif_mask_all, label %cif_mask_mixed @@ -276,8 +276,8 @@ define void @add8i32(<8 x i32>* %ret, <8 x i32>* %bp) nounwind { ; CHECK_O0-NEXT: vmovdqu 16(%rsi), %xmm1 ; CHECK_O0-NEXT: # implicit-def: $ymm2 ; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2 -; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 -; CHECK_O0-NEXT: vmovdqu %ymm0, (%rdi) +; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 +; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi) ; CHECK_O0-NEXT: vzeroupper ; CHECK_O0-NEXT: retq %b = load <8 x i32>, <8 x i32>* %bp, align 1 @@ -321,8 +321,8 @@ define void @add4i64a16(<4 x i64>* %ret, <4 x i64>* %bp) nounwind { ; CHECK_O0-NEXT: vmovdqa 16(%rsi), %xmm1 ; CHECK_O0-NEXT: # implicit-def: $ymm2 ; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2 -; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 -; CHECK_O0-NEXT: vmovdqu %ymm0, (%rdi) +; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 +; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi) ; CHECK_O0-NEXT: vzeroupper ; CHECK_O0-NEXT: retq %b = load <4 x i64>, <4 x i64>* %bp, align 16 diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll 
b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll index 0fe9d0b0d35c8..49f6c2b849b65 100644 --- a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll @@ -1632,11 +1632,11 @@ define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <32 x i8> %arg1 = bitcast <4 x i64> %a1 to <32 x i8> - %cmp = icmp sgt <32 x i8> %arg0, %arg1 - %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1 + %sel = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1) %bc = bitcast <32 x i8> %sel to <4 x i64> ret <4 x i64> %bc } +declare <32 x i8> @llvm.smax.v32i8(<32 x i8>, <32 x i8>) define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_max_epi16: @@ -1645,11 +1645,11 @@ define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <16 x i16> %arg1 = bitcast <4 x i64> %a1 to <16 x i16> - %cmp = icmp sgt <16 x i16> %arg0, %arg1 - %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1 + %sel = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1) %bc = bitcast <16 x i16> %sel to <4 x i64> ret <4 x i64> %bc } +declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>) define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_max_epi32: @@ -1658,11 +1658,11 @@ define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <8 x i32> %arg1 = bitcast <4 x i64> %a1 to <8 x i32> - %cmp = icmp sgt <8 x i32> %arg0, %arg1 - %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1 + %sel = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1) %bc = bitcast <8 x i32> %sel to <4 x i64> ret <4 x i64> %bc } +declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>) define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_max_epu8: @@ -1671,11 +1671,11 @@ define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <32 x i8> %arg1 = bitcast <4 x i64> %a1 to <32 x i8> - %cmp = icmp ugt <32 x i8> %arg0, %arg1 - %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1 + %sel = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1) %bc = bitcast <32 x i8> %sel to <4 x i64> ret <4 x i64> %bc } +declare <32 x i8> @llvm.umax.v32i8(<32 x i8>, <32 x i8>) define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_max_epu16: @@ -1684,11 +1684,11 @@ define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <16 x i16> %arg1 = bitcast <4 x i64> %a1 to <16 x i16> - %cmp = icmp ugt <16 x i16> %arg0, %arg1 - %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1 + %sel = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1) %bc = bitcast <16 x i16> %sel to <4 x i64> ret <4 x i64> %bc } +declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>) define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_max_epu32: @@ -1697,11 +1697,11 @@ define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <8 x i32> %arg1 = bitcast <4 x i64> %a1 to <8 x i32> - %cmp = icmp ugt <8 x i32> 
%arg0, %arg1 - %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1 + %sel = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1) %bc = bitcast <8 x i32> %sel to <4 x i64> ret <4 x i64> %bc } +declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>) define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_min_epi8: @@ -1710,11 +1710,11 @@ define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <32 x i8> %arg1 = bitcast <4 x i64> %a1 to <32 x i8> - %cmp = icmp slt <32 x i8> %arg0, %arg1 - %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1 + %sel = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1) %bc = bitcast <32 x i8> %sel to <4 x i64> ret <4 x i64> %bc } +declare <32 x i8> @llvm.smin.v32i8(<32 x i8>, <32 x i8>) define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_min_epi16: @@ -1723,11 +1723,11 @@ define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <16 x i16> %arg1 = bitcast <4 x i64> %a1 to <16 x i16> - %cmp = icmp slt <16 x i16> %arg0, %arg1 - %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1 + %sel = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1) %bc = bitcast <16 x i16> %sel to <4 x i64> ret <4 x i64> %bc } +declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>) define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_min_epi32: @@ -1736,11 +1736,11 @@ define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <8 x i32> %arg1 = bitcast <4 x i64> %a1 to <8 x i32> - %cmp = icmp slt <8 x i32> %arg0, %arg1 - %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1 + %sel = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1) %bc = bitcast <8 x i32> %sel to <4 x i64> ret <4 x i64> %bc } +declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>) define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_min_epu8: @@ -1749,11 +1749,11 @@ define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <32 x i8> %arg1 = bitcast <4 x i64> %a1 to <32 x i8> - %cmp = icmp ult <32 x i8> %arg0, %arg1 - %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1 + %sel = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1) %bc = bitcast <32 x i8> %sel to <4 x i64> ret <4 x i64> %bc } +declare <32 x i8> @llvm.umin.v32i8(<32 x i8>, <32 x i8>) define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_min_epu16: @@ -1762,11 +1762,11 @@ define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <16 x i16> %arg1 = bitcast <4 x i64> %a1 to <16 x i16> - %cmp = icmp ult <16 x i16> %arg0, %arg1 - %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1 + %sel = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1) %bc = bitcast <16 x i16> %sel to <4 x i64> ret <4 x i64> %bc } +declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>) define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_min_epu32: @@ -1775,11 +1775,11 @@ define <4 x i64> @test_mm256_min_epu32(<4 x 
i64> %a0, <4 x i64> %a1) { ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <8 x i32> %arg1 = bitcast <4 x i64> %a1 to <8 x i32> - %cmp = icmp ult <8 x i32> %arg0, %arg1 - %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1 + %sel = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1) %bc = bitcast <8 x i32> %sel to <4 x i64> ret <4 x i64> %bc } +declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>) define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind { ; CHECK-LABEL: test_mm256_movemask_epi8: diff --git a/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll b/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll index 186370ca675c7..c4e009d54ec7a 100755 --- a/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll @@ -40,20 +40,22 @@ define void @test_xmm(i32 %shift, i32 %mulp, <2 x i64> %a,i8* %arraydecay,i8* %f ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; CHECK-NEXT: vpmovd2m %xmm0, %k0 ; CHECK-NEXT: kmovq %k0, %k1 -; CHECK-NEXT: kmovd %k0, %ecx -; CHECK-NEXT: ## kill: def $cl killed $cl killed $ecx -; CHECK-NEXT: movzbl %cl, %ecx -; CHECK-NEXT: ## kill: def $cx killed $cx killed $ecx -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload -; CHECK-NEXT: movl $4, %edx -; CHECK-NEXT: movl %edx, %esi +; CHECK-NEXT: kmovd %k0, %esi +; CHECK-NEXT: ## kill: def $sil killed $sil killed $esi +; CHECK-NEXT: movzbl %sil, %edi +; CHECK-NEXT: ## kill: def $di killed $di killed $edi +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload +; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; CHECK-NEXT: movq %rcx, %rdi +; CHECK-NEXT: movl $4, %r8d +; CHECK-NEXT: movl %r8d, %esi +; CHECK-NEXT: movl %r8d, %edx ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK-NEXT: callq _calc_expected_mask_val ; CHECK-NEXT: ## kill: def $ax killed $ax killed $rax -; CHECK-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx ## 2-byte Reload -; CHECK-NEXT: movzwl %cx, %edi +; CHECK-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %r9w ## 2-byte Reload +; CHECK-NEXT: movzwl %r9w, %edi ; CHECK-NEXT: movzwl %ax, %esi ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload diff --git a/llvm/test/CodeGen/X86/basic-block-sections-labels-functions-sections.ll b/llvm/test/CodeGen/X86/basic-block-sections-labels-functions-sections.ll new file mode 100644 index 0000000000000..1142a8a1ec1ba --- /dev/null +++ b/llvm/test/CodeGen/X86/basic-block-sections-labels-functions-sections.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -mtriple=x86_64 -function-sections -basic-block-sections=labels | FileCheck %s + +$_Z4fooTIiET_v = comdat any + +define dso_local i32 @_Z3barv() { + ret i32 0 +} +;; Check we add SHF_LINK_ORDER for .bb_addr_map and link it with the corresponding .text sections. 
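;; As background for the checks that follow (a generic ELF sketch, not extra
;; test input): the "o" section flag sets SHF_LINK_ORDER, and the trailing
;; section name fills in sh_link with the associated section, so a linker that
;; discards or reorders .text._Z3barv is expected to do the same with its
;; .bb_addr_map piece rather than leaving the metadata dangling:
;;
;;   .section .bb_addr_map,"o",@progbits,.text._Z3barv
;;   .quad .Lfunc_begin0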
+; CHECK: .section .text._Z3barv,"ax",@progbits +; CHECK-LABEL: _Z3barv: +; CHECK-NEXT: [[BAR_BEGIN:.Lfunc_begin[0-9]+]]: +; CHECK: .section .bb_addr_map,"o",@progbits,.text._Z3barv{{$}} +; CHECK-NEXT: .quad [[BAR_BEGIN]] + + +define dso_local i32 @_Z3foov() { + %1 = call i32 @_Z4fooTIiET_v() + ret i32 %1 +} +; CHECK: .section .text._Z3foov,"ax",@progbits +; CHECK-LABEL: _Z3foov: +; CHECK-NEXT: [[FOO_BEGIN:.Lfunc_begin[0-9]+]]: +; CHECK: .section .bb_addr_map,"o",@progbits,.text._Z3foov{{$}} +; CHECK-NEXT: .quad [[FOO_BEGIN]] + + +define linkonce_odr dso_local i32 @_Z4fooTIiET_v() comdat { + ret i32 0 +} +;; Check we add .bb_addr_map section to a COMDAT group with the corresponding .text section if such a COMDAT exists. +; CHECK: .section .text._Z4fooTIiET_v,"axG",@progbits,_Z4fooTIiET_v,comdat +; CHECK-LABEL: _Z4fooTIiET_v: +; CHECK-NEXT: [[FOOCOMDAT_BEGIN:.Lfunc_begin[0-9]+]]: +; CHECK: .section .bb_addr_map,"Go",@progbits,_Z4fooTIiET_v,comdat,.text._Z4fooTIiET_v{{$}} +; CHECK-NEXT: .quad [[FOOCOMDAT_BEGIN]] diff --git a/llvm/test/CodeGen/X86/basic-block-sections-labels.ll b/llvm/test/CodeGen/X86/basic-block-sections-labels.ll index 80aaf79c115a4..267132c92e982 100644 --- a/llvm/test/CodeGen/X86/basic-block-sections-labels.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-labels.ll @@ -1,23 +1,24 @@ ; Check the basic block sections labels option -; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=labels | FileCheck %s -check-prefix=LINUX-LABELS +; RUN: llc < %s -mtriple=x86_64 -function-sections -basic-block-sections=labels | FileCheck %s -define void @_Z3bazb(i1 zeroext) { - %2 = alloca i8, align 1 - %3 = zext i1 %0 to i8 - store i8 %3, i8* %2, align 1 - %4 = load i8, i8* %2, align 1 - %5 = trunc i8 %4 to i1 - br i1 %5, label %6, label %8 +define void @_Z3bazb(i1 zeroext) personality i32 (...)* @__gxx_personality_v0 { + br i1 %0, label %2, label %7 -6: ; preds = %1 - %7 = call i32 @_Z3barv() - br label %10 +2: + %3 = invoke i32 @_Z3barv() + to label %7 unwind label %5 + br label %9 -8: ; preds = %1 - %9 = call i32 @_Z3foov() - br label %10 +5: + landingpad { i8*, i32 } + catch i8* null + br label %9 -10: ; preds = %8, %6 +7: + %8 = call i32 @_Z3foov() + br label %9 + +9: ret void } @@ -25,9 +26,31 @@ declare i32 @_Z3barv() #1 declare i32 @_Z3foov() #1 -; LINUX-LABELS: .section -; LINUX-LABELS: _Z3bazb: -; LINUX-LABELS-NOT: .section -; LINUX-LABELS: r.BB._Z3bazb: -; LINUX-LABELS-NOT: .section -; LINUX-LABELS: rr.BB._Z3bazb: +declare i32 @__gxx_personality_v0(...) 
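; The .bb_addr_map checks below decode as: the function's entry address, a
; basic-block count, then one (offset, size, metadata) triple per block, with
; offsets and sizes emitted as ULEB128 label differences. A hedged sketch for
; a hypothetical two-block function (label names reused from the checks below
; for readability; field meanings as exercised here, not a normative layout):
;
;   .quad    .Lfunc_begin0                 # function entry address
;   .byte    2                             # number of basic blocks
;   .uleb128 .Lfunc_begin0-.Lfunc_begin0   # BB 0: offset from function entry
;   .uleb128 .LBB_END0_0-.Lfunc_begin0     # BB 0: size
;   .byte    0                             # BB 0: metadata flags
;   .uleb128 .LBB0_1-.Lfunc_begin0         # BB 1: offset from function entry
;   .uleb128 .LBB_END0_1-.LBB0_1           # BB 1: size
;   .byte    1                             # BB 1: metadata flags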
+ +; CHECK-LABEL: _Z3bazb: +; CHECK-LABEL: .Lfunc_begin0: +; CHECK-LABEL: .LBB_END0_0: +; CHECK-LABEL: .LBB0_1: +; CHECK-LABEL: .LBB_END0_1: +; CHECK-LABEL: .LBB0_2: +; CHECK-LABEL: .LBB_END0_2: +; CHECK-LABEL: .LBB0_3: +; CHECK-LABEL: .LBB_END0_3: +; CHECK-LABEL: .Lfunc_end0: + +; CHECK: .section .bb_addr_map,"o",@progbits,.text +; CHECK-NEXT: .quad .Lfunc_begin0 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .uleb128 .Lfunc_begin0-.Lfunc_begin0 +; CHECK-NEXT: .uleb128 .LBB_END0_0-.Lfunc_begin0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .uleb128 .LBB0_1-.Lfunc_begin0 +; CHECK-NEXT: .uleb128 .LBB_END0_1-.LBB0_1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .uleb128 .LBB0_2-.Lfunc_begin0 +; CHECK-NEXT: .uleb128 .LBB_END0_2-.LBB0_2 +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .uleb128 .LBB0_3-.Lfunc_begin0 +; CHECK-NEXT: .uleb128 .LBB_END0_3-.LBB0_3 +; CHECK-NEXT: .byte 5 diff --git a/llvm/test/CodeGen/X86/bmi2-x86_64.ll b/llvm/test/CodeGen/X86/bmi2-x86_64.ll index bb03138ccf763..9f8214d5b3b5d 100644 --- a/llvm/test/CodeGen/X86/bmi2-x86_64.ll +++ b/llvm/test/CodeGen/X86/bmi2-x86_64.ll @@ -41,6 +41,18 @@ define i64 @pdep64_load(i64 %x, i64* %y) { ret i64 %tmp } +define i64 @pdep64_anyext(i32 %x) { +; CHECK-LABEL: pdep64_anyext: +; CHECK: # %bb.0: +; CHECK-NEXT: movslq %edi, %rax +; CHECK-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; CHECK-NEXT: pdepq %rcx, %rax, %rax +; CHECK-NEXT: retq + %x1 = sext i32 %x to i64 + %tmp = tail call i64 @llvm.x86.bmi.pdep.64(i64 %x1, i64 6148914691236517205) + ret i64 %tmp +} + declare i64 @llvm.x86.bmi.pdep.64(i64, i64) define i64 @pext64(i64 %x, i64 %y) { diff --git a/llvm/test/CodeGen/X86/bmi2.ll b/llvm/test/CodeGen/X86/bmi2.ll index bf78cb4f72efb..94bddf4cd6038 100644 --- a/llvm/test/CodeGen/X86/bmi2.ll +++ b/llvm/test/CodeGen/X86/bmi2.ll @@ -76,6 +76,25 @@ define i32 @pdep32_load(i32 %x, i32* %y) { ret i32 %tmp } +define i32 @pdep32_anyext(i16 %x) { +; X86-LABEL: pdep32_anyext: +; X86: # %bb.0: +; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $-1431655766, %ecx # imm = 0xAAAAAAAA +; X86-NEXT: pdepl %ecx, %eax, %eax +; X86-NEXT: retl +; +; X64-LABEL: pdep32_anyext: +; X64: # %bb.0: +; X64-NEXT: movswl %di, %eax +; X64-NEXT: movl $-1431655766, %ecx # imm = 0xAAAAAAAA +; X64-NEXT: pdepl %ecx, %eax, %eax +; X64-NEXT: retq + %x1 = sext i16 %x to i32 + %tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x1, i32 -1431655766) + ret i32 %tmp +} + declare i32 @llvm.x86.bmi.pdep.32(i32, i32) define i32 @pext32(i32 %x, i32 %y) { diff --git a/llvm/test/CodeGen/X86/crash-O0.ll b/llvm/test/CodeGen/X86/crash-O0.ll index 9f9e5584d6f21..a93d3dd267b52 100644 --- a/llvm/test/CodeGen/X86/crash-O0.ll +++ b/llvm/test/CodeGen/X86/crash-O0.ll @@ -79,12 +79,11 @@ define i64 @addressModeWith32bitIndex(i32 %V) { ; CHECK-NEXT: movq %rsp, %rbp ; CHECK-NEXT: .cfi_def_cfa_register %rbp ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: ## kill: def $rax killed $eax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: movq %rcx, %rax ; CHECK-NEXT: cqto -; CHECK-NEXT: movslq %edi, %rcx -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload -; CHECK-NEXT: idivq (%rsi,%rcx,8) +; CHECK-NEXT: movslq %edi, %rsi +; CHECK-NEXT: idivq (%rcx,%rsi,8) ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq %gep = getelementptr i64, i64* null, i32 %V diff --git a/llvm/test/CodeGen/X86/debug-loclists-lto.ll b/llvm/test/CodeGen/X86/debug-loclists-lto.ll index 7578e09c84a20..fde8e00920adf 100644 --- a/llvm/test/CodeGen/X86/debug-loclists-lto.ll 
+++ b/llvm/test/CodeGen/X86/debug-loclists-lto.ll @@ -1,10 +1,18 @@ -; RUN: llc -mtriple=x86_64-pc-linux -filetype=asm -function-sections < %s | FileCheck --implicit-check-not=loclists_table_base %s +; RUN: llc -mtriple=x86_64-pc-linux -filetype=asm -function-sections < %s | \ +; RUN: FileCheck --check-prefixes=CHECK,DWARF32 --implicit-check-not=loclists_table_base %s +; RUN: llc -dwarf64 -mtriple=x86_64-pc-linux -filetype=asm -function-sections < %s | \ +; RUN: FileCheck --check-prefixes=CHECK,DWARF64 --implicit-check-not=loclists_table_base %s -; CHECK: {{^}}.Lloclists_table_base0: -; CHECK-NEXT: .long .Ldebug_loc0-.Lloclists_table_base0 -; CHECK-NEXT: .long .Ldebug_loc1-.Lloclists_table_base0 -; CHECK: .long .Lloclists_table_base0 # DW_AT_loclists_base -; CHECK: .long .Lloclists_table_base0 # DW_AT_loclists_base +; CHECK: {{^}}.Lloclists_table_base0: +; DWARF32-NEXT: .long .Ldebug_loc0-.Lloclists_table_base0 +; DWARF32-NEXT: .long .Ldebug_loc1-.Lloclists_table_base0 +; DWARF64-NEXT: .quad .Ldebug_loc0-.Lloclists_table_base0 +; DWARF64-NEXT: .quad .Ldebug_loc1-.Lloclists_table_base0 + +; DWARF32: .long .Lloclists_table_base0 # DW_AT_loclists_base +; DWARF32: .long .Lloclists_table_base0 # DW_AT_loclists_base +; DWARF64: .quad .Lloclists_table_base0 # DW_AT_loclists_base +; DWARF64: .quad .Lloclists_table_base0 # DW_AT_loclists_base ; Function Attrs: uwtable define dso_local void @_Z2f2v() local_unnamed_addr #0 !dbg !15 { diff --git a/llvm/test/CodeGen/X86/debug-loclists.ll b/llvm/test/CodeGen/X86/debug-loclists.ll index 59f244e62669d..d13ad6a11262e 100644 --- a/llvm/test/CodeGen/X86/debug-loclists.ll +++ b/llvm/test/CodeGen/X86/debug-loclists.ll @@ -1,42 +1,61 @@ ; RUN: llc -mtriple=x86_64-pc-linux -filetype=obj -function-sections -o %t < %s -; RUN: llvm-dwarfdump -v -debug-info -debug-loclists %t | FileCheck %s +; RUN: llvm-dwarfdump -v -debug-info -debug-loclists %t | \ +; RUN: FileCheck %s --check-prefixes=CHECK,DWARF32 -; RUN: llc -dwarf-version=5 -split-dwarf-file=foo.dwo -mtriple=x86_64-pc-linux -filetype=obj -function-sections -o %t < %s -; RUN: llvm-dwarfdump -v -debug-info -debug-loclists %t | FileCheck %s --check-prefix=DWO - -; CHECK: DW_TAG_variable -; CHECK-NEXT: DW_AT_location [DW_FORM_loclistx] (indexed (0x0) loclist = 0x00000018: -; CHECK-NEXT: [0x0000000000000000, 0x0000000000000003) ".text._Z2f1ii": DW_OP_consts +3, DW_OP_stack_value -; CHECK-NEXT: [0x0000000000000003, 0x0000000000000004) ".text._Z2f1ii": DW_OP_consts +4, DW_OP_stack_value) -; CHECK-NEXT: DW_AT_name {{.*}} "y" - -; CHECK: DW_TAG_variable -; CHECK-NEXT: DW_AT_location [DW_FORM_loclistx] (indexed (0x1) loclist = 0x00000029: -; CHECK-NEXT: [0x0000000000000000, 0x0000000000000003) ".text._Z2f1ii": DW_OP_consts +5, DW_OP_stack_value) -; CHECK-NEXT: DW_AT_name {{.*}} "x" +; RUN: llc -dwarf64 -mtriple=x86_64-pc-linux -filetype=obj -function-sections -o %t < %s +; RUN: llvm-dwarfdump -v -debug-info -debug-loclists %t | \ +; RUN: FileCheck %s --check-prefixes=CHECK,DWARF64 -; CHECK: DW_TAG_variable -; CHECK-NEXT: DW_AT_location [DW_FORM_loclistx] (indexed (0x2) loclist = 0x00000031: -; CHECK-NEXT: [0x0000000000000003, 0x0000000000000004) ".text._Z2f1ii": DW_OP_reg0 RAX) -; CHECK-NEXT: DW_AT_name {{.*}} "r" - -; CHECK: .debug_loclists contents: -; CHECK-NEXT: 0x00000000: locations list header: length = 0x00000035, format = DWARF32, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000003 +; RUN: llc -dwarf-version=5 -split-dwarf-file=foo.dwo -mtriple=x86_64-pc-linux -filetype=obj 
-function-sections -o %t < %s +; RUN: llvm-dwarfdump -v -debug-info -debug-loclists %t | \ +; RUN: FileCheck %s --check-prefixes=DWO,DWO32 + +; RUN: llc -dwarf64 -dwarf-version=5 -split-dwarf-file=foo.dwo -mtriple=x86_64-pc-linux -filetype=obj -function-sections -o %t < %s +; RUN: llvm-dwarfdump -v -debug-info -debug-loclists %t | \ +; RUN: FileCheck %s --check-prefixes=DWO,DWO64 + +; CHECK: DW_TAG_variable +; DWARF32-NEXT: DW_AT_location [DW_FORM_loclistx] (indexed (0x0) loclist = 0x00000018: +; DWARF64-NEXT: DW_AT_location [DW_FORM_loclistx] (indexed (0x0) loclist = 0x0000002c: +; CHECK-NEXT: [0x0000000000000000, 0x0000000000000003) ".text._Z2f1ii": DW_OP_consts +3, DW_OP_stack_value +; CHECK-NEXT: [0x0000000000000003, 0x0000000000000004) ".text._Z2f1ii": DW_OP_consts +4, DW_OP_stack_value) +; CHECK-NEXT: DW_AT_name {{.*}} "y" + +; CHECK: DW_TAG_variable +; DWARF32-NEXT: DW_AT_location [DW_FORM_loclistx] (indexed (0x1) loclist = 0x00000029: +; DWARF64-NEXT: DW_AT_location [DW_FORM_loclistx] (indexed (0x1) loclist = 0x0000003d: +; CHECK-NEXT: [0x0000000000000000, 0x0000000000000003) ".text._Z2f1ii": DW_OP_consts +5, DW_OP_stack_value) +; CHECK-NEXT: DW_AT_name {{.*}} "x" + +; CHECK: DW_TAG_variable +; DWARF32-NEXT: DW_AT_location [DW_FORM_loclistx] (indexed (0x2) loclist = 0x00000031: +; DWARF64-NEXT: DW_AT_location [DW_FORM_loclistx] (indexed (0x2) loclist = 0x00000045: +; CHECK-NEXT: [0x0000000000000003, 0x0000000000000004) ".text._Z2f1ii": DW_OP_reg0 RAX) +; CHECK-NEXT: DW_AT_name {{.*}} "r" + +; CHECK: .debug_loclists contents: +; DWARF32-NEXT: 0x00000000: locations list header: length = 0x00000035, format = DWARF32, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000003 +; DWARF64-NEXT: 0x00000000: locations list header: length = 0x0000000000000041, format = DWARF64, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000003 ; DWO: .debug_loclists.dwo contents: -; DWO-NEXT: 0x00000000: locations list header: length = 0x00000035, format = DWARF32, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000003 - -; CHECK-NEXT: offsets: [ -; CHECK-NEXT: 0x0000000c => 0x00000018 -; CHECK-NEXT: 0x0000001d => 0x00000029 -; CHECK-NEXT: 0x00000025 => 0x00000031 -; CHECK-NEXT: ] +; DWO32-NEXT: 0x00000000: locations list header: length = 0x00000035, format = DWARF32, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000003 +; DWO64-NEXT: 0x00000000: locations list header: length = 0x0000000000000041, format = DWARF64, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000003 + +; CHECK-NEXT: offsets: [ +; DWARF32-NEXT: 0x0000000c => 0x00000018 +; DWARF32-NEXT: 0x0000001d => 0x00000029 +; DWARF32-NEXT: 0x00000025 => 0x00000031 +; DWARF64-NEXT: 0x0000000000000018 => 0x0000002c +; DWARF64-NEXT: 0x0000000000000029 => 0x0000003d +; DWARF64-NEXT: 0x0000000000000031 => 0x00000045 +; CHECK-NEXT: ] ; Don't use startx_length if there's more than one entry, because the shared ; base address will be useful for both the range that does start at the start of ; the function, and the one that doesn't. 
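; To make that tradeoff concrete, a hedged sketch of the two location-list
; shapes compared in this test (entry kinds as dumped below; index 0 stands
; for the function's slot in the address pool):
;
;   DW_LLE_base_addressx (0)                ; shared base reused by both ranges
;   DW_LLE_offset_pair   (lo1, hi1): <expr>
;   DW_LLE_offset_pair   (lo2, hi2): <expr>
;   DW_LLE_end_of_list   ()
;
; versus the more compact single-range form, usable only when the range starts
; exactly at the function entry:
;
;   DW_LLE_startx_length (0, len): <expr>
;   DW_LLE_end_of_list   ()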
-; CHECK-NEXT: 0x00000018: +; DWARF32-NEXT: 0x00000018: +; DWARF64-NEXT: 0x0000002c: ; CHECK-NEXT: DW_LLE_base_addressx (0x0000000000000000) ; CHECK-NEXT: DW_LLE_offset_pair (0x0000000000000000, 0x0000000000000003): DW_OP_consts +3, DW_OP_stack_value ; CHECK-NEXT: DW_LLE_offset_pair (0x0000000000000003, 0x0000000000000004): DW_OP_consts +4, DW_OP_stack_value @@ -44,14 +63,16 @@ ; Show that startx_length can be used when the address range starts at the start of the function. -; CHECK: 0x00000029: +; DWARF32: 0x00000029: +; DWARF64: 0x0000003d: ; CHECK-NEXT: DW_LLE_startx_length (0x0000000000000000, 0x0000000000000003): DW_OP_consts +5, DW_OP_stack_value ; CHECK-NEXT: DW_LLE_end_of_list () ; And use a base address when the range doesn't start at an existing/useful ; address in the pool. -; CHECK: 0x00000031: +; DWARF32: 0x00000031: +; DWARF64: 0x00000045: ; CHECK-NEXT: DW_LLE_base_addressx (0x0000000000000000) ; CHECK-NEXT: DW_LLE_offset_pair (0x0000000000000003, 0x0000000000000004): DW_OP_reg0 RAX ; CHECK-NEXT: DW_LLE_end_of_list () diff --git a/llvm/test/CodeGen/X86/extend-set-cc-uses-dbg.ll b/llvm/test/CodeGen/X86/extend-set-cc-uses-dbg.ll index 664d9ded1e0e1..7d05a869be893 100644 --- a/llvm/test/CodeGen/X86/extend-set-cc-uses-dbg.ll +++ b/llvm/test/CodeGen/X86/extend-set-cc-uses-dbg.ll @@ -7,8 +7,8 @@ define void @foo(i32* %p) !dbg !4 { bb: %tmp = load i32, i32* %p, align 4, !dbg !7 ; CHECK: $eax = MOV32rm killed {{.*}} $rdi, {{.*}} debug-location !7 :: (load 4 from %ir.p) - ; CHECK-NEXT: $rax = KILL killed renamable $eax, debug-location !7 - ; CHECK-NEXT: $rcx = MOV64rr $rax, debug-location !7 + ; CHECK-NEXT: $ecx = MOV32rr killed $eax, implicit-def $rcx, debug-location !7 + ; CHECK-NEXT: $rdx = MOV64rr $rcx, debug-location !7 switch i32 %tmp, label %bb7 [ i32 0, label %bb1 diff --git a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll index 7fffa21f0d24d..5d7c83fa19d44 100644 --- a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll +++ b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll @@ -1013,11 +1013,11 @@ define <16 x float> @test_load_nt16xfloat(<16 x float>* nocapture %ptr) { ; AVX1-NEXT: vmovaps %xmm0, %xmm1 ; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1 -; AVX1-NEXT: # implicit-def: $ymm2 -; AVX1-NEXT: vmovaps %xmm1, %xmm2 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: # implicit-def: $ymm1 +; AVX1-NEXT: vmovaps %xmm2, %xmm1 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_load_nt16xfloat: @@ -1067,11 +1067,11 @@ define <8 x double> @test_load_nt8xdouble(<8 x double>* nocapture %ptr) { ; AVX1-NEXT: vmovaps %xmm0, %xmm1 ; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1 -; AVX1-NEXT: # implicit-def: $ymm2 -; AVX1-NEXT: vmovaps %xmm1, %xmm2 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: # implicit-def: $ymm1 +; AVX1-NEXT: vmovaps %xmm2, %xmm1 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_load_nt8xdouble: @@ -1121,11 +1121,11 @@ define <64 x i8> @test_load_nt64xi8(<64 x i8>* nocapture %ptr) { ; AVX1-NEXT: vmovaps %xmm0, %xmm1 ; 
AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1 -; AVX1-NEXT: # implicit-def: $ymm2 -; AVX1-NEXT: vmovaps %xmm1, %xmm2 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: # implicit-def: $ymm1 +; AVX1-NEXT: vmovaps %xmm2, %xmm1 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_load_nt64xi8: @@ -1175,11 +1175,11 @@ define <32 x i16> @test_load_nt32xi16(<32 x i16>* nocapture %ptr) { ; AVX1-NEXT: vmovaps %xmm0, %xmm1 ; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1 -; AVX1-NEXT: # implicit-def: $ymm2 -; AVX1-NEXT: vmovaps %xmm1, %xmm2 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: # implicit-def: $ymm1 +; AVX1-NEXT: vmovaps %xmm2, %xmm1 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_load_nt32xi16: @@ -1229,11 +1229,11 @@ define <16 x i32> @test_load_nt16xi32(<16 x i32>* nocapture %ptr) { ; AVX1-NEXT: vmovaps %xmm0, %xmm1 ; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1 -; AVX1-NEXT: # implicit-def: $ymm2 -; AVX1-NEXT: vmovaps %xmm1, %xmm2 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: # implicit-def: $ymm1 +; AVX1-NEXT: vmovaps %xmm2, %xmm1 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_load_nt16xi32: @@ -1283,11 +1283,11 @@ define <8 x i64> @test_load_nt8xi64(<8 x i64>* nocapture %ptr) { ; AVX1-NEXT: vmovaps %xmm0, %xmm1 ; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1 -; AVX1-NEXT: # implicit-def: $ymm2 -; AVX1-NEXT: vmovaps %xmm1, %xmm2 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: # implicit-def: $ymm1 +; AVX1-NEXT: vmovaps %xmm2, %xmm1 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_load_nt8xi64: diff --git a/llvm/test/CodeGen/X86/fmaxnum.ll b/llvm/test/CodeGen/X86/fmaxnum.ll index 2a7bb25164d31..fd5b638a146da 100644 --- a/llvm/test/CodeGen/X86/fmaxnum.ll +++ b/llvm/test/CodeGen/X86/fmaxnum.ll @@ -609,5 +609,13 @@ define float @test_maxnum_const_op2(float %x) { ret float %r } +define float @test_maxnum_const_nan(float %x) { +; CHECK-LABEL: test_maxnum_const_nan: +; CHECK: # %bb.0: +; CHECK-NEXT: retq + %r = call float @llvm.maxnum.f32(float %x, float 0x7fff000000000000) + ret float %r +} + attributes #0 = { "no-nans-fp-math"="true" } diff --git a/llvm/test/CodeGen/X86/fminnum.ll b/llvm/test/CodeGen/X86/fminnum.ll index fc4c48686a953..dc1b8ca8eb4db 100644 --- a/llvm/test/CodeGen/X86/fminnum.ll +++ b/llvm/test/CodeGen/X86/fminnum.ll @@ -609,5 +609,13 @@ define float @test_minnum_const_op2(float %x) { ret float %r } +define float @test_minnum_const_nan(float %x) { +; CHECK-LABEL: test_minnum_const_nan: +; CHECK: # %bb.0: +; CHECK-NEXT: retq + %r = call float @llvm.minnum.f32(float %x, 
float 0x7fff000000000000) + ret float %r +} + attributes #0 = { "no-nans-fp-math"="true" } diff --git a/llvm/test/CodeGen/X86/fp-undef.ll b/llvm/test/CodeGen/X86/fp-undef.ll index d46bea703fdf0..95049d16a7bf4 100644 --- a/llvm/test/CodeGen/X86/fp-undef.ll +++ b/llvm/test/CodeGen/X86/fp-undef.ll @@ -100,7 +100,6 @@ define float @frem_undef_op1(float %x) { define float @fadd_undef_op0_nnan(float %x) { ; ANY-LABEL: fadd_undef_op0_nnan: ; ANY: # %bb.0: -; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ANY-NEXT: retq %r = fadd nnan float undef, %x ret float %r @@ -109,7 +108,6 @@ define float @fadd_undef_op0_nnan(float %x) { define float @fadd_undef_op1_fast(float %x) { ; ANY-LABEL: fadd_undef_op1_fast: ; ANY: # %bb.0: -; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ANY-NEXT: retq %r = fadd fast float %x, undef ret float %r @@ -118,7 +116,6 @@ define float @fadd_undef_op1_fast(float %x) { define float @fsub_undef_op0_fast(float %x) { ; ANY-LABEL: fsub_undef_op0_fast: ; ANY: # %bb.0: -; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ANY-NEXT: retq %r = fsub fast float undef, %x ret float %r @@ -127,7 +124,6 @@ define float @fsub_undef_op0_fast(float %x) { define float @fsub_undef_op1_nnan(float %x) { ; ANY-LABEL: fsub_undef_op1_nnan: ; ANY: # %bb.0: -; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ANY-NEXT: retq %r = fsub nnan float %x, undef ret float %r @@ -136,7 +132,6 @@ define float @fsub_undef_op1_nnan(float %x) { define float @fmul_undef_op0_nnan(float %x) { ; ANY-LABEL: fmul_undef_op0_nnan: ; ANY: # %bb.0: -; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ANY-NEXT: retq %r = fmul nnan float undef, %x ret float %r @@ -145,7 +140,6 @@ define float @fmul_undef_op0_nnan(float %x) { define float @fmul_undef_op1_fast(float %x) { ; ANY-LABEL: fmul_undef_op1_fast: ; ANY: # %bb.0: -; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ANY-NEXT: retq %r = fmul fast float %x, undef ret float %r @@ -154,7 +148,6 @@ define float @fmul_undef_op1_fast(float %x) { define float @fdiv_undef_op0_fast(float %x) { ; ANY-LABEL: fdiv_undef_op0_fast: ; ANY: # %bb.0: -; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ANY-NEXT: retq %r = fdiv fast float undef, %x ret float %r @@ -163,7 +156,6 @@ define float @fdiv_undef_op0_fast(float %x) { define float @fdiv_undef_op1_nnan(float %x) { ; ANY-LABEL: fdiv_undef_op1_nnan: ; ANY: # %bb.0: -; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ANY-NEXT: retq %r = fdiv nnan float %x, undef ret float %r @@ -172,7 +164,6 @@ define float @fdiv_undef_op1_nnan(float %x) { define float @frem_undef_op0_nnan(float %x) { ; ANY-LABEL: frem_undef_op0_nnan: ; ANY: # %bb.0: -; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ANY-NEXT: retq %r = frem nnan float undef, %x ret float %r @@ -181,7 +172,6 @@ define float @frem_undef_op0_nnan(float %x) { define float @frem_undef_op1_fast(float %x) { ; ANY-LABEL: frem_undef_op1_fast: ; ANY: # %bb.0: -; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ANY-NEXT: retq %r = frem fast float %x, undef ret float %r @@ -234,7 +224,6 @@ define double @frem_undef_undef(double %x) { define float @fadd_undef_op0_nnan_constant(float %x) { ; ANY-LABEL: fadd_undef_op0_nnan_constant: ; ANY: # %bb.0: -; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ANY-NEXT: retq %r = fadd nnan float undef, 1.0 ret float %r @@ -252,7 +241,6 @@ define float @fadd_undef_op1_constant(float %x) { define float @fsub_undef_op0_fast_constant(float %x) { ; ANY-LABEL: 
fsub_undef_op0_fast_constant: ; ANY: # %bb.0: -; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ANY-NEXT: retq %r = fsub fast float undef, 3.0 ret float %r @@ -270,7 +258,6 @@ define float @fsub_undef_op1_constant(float %x) { define float @fmul_undef_op0_nnan_constant(float %x) { ; ANY-LABEL: fmul_undef_op0_nnan_constant: ; ANY: # %bb.0: -; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ANY-NEXT: retq %r = fmul nnan float undef, 5.0 ret float %r @@ -288,7 +275,6 @@ define float @fmul_undef_op1_constant(float %x) { define float @fdiv_undef_op0_fast_constant(float %x) { ; ANY-LABEL: fdiv_undef_op0_fast_constant: ; ANY: # %bb.0: -; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ANY-NEXT: retq %r = fdiv fast float undef, 7.0 ret float %r @@ -306,7 +292,6 @@ define float @fdiv_undef_op1_constant(float %x) { define float @frem_undef_op0_nnan_constant(float %x) { ; ANY-LABEL: frem_undef_op0_nnan_constant: ; ANY: # %bb.0: -; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ANY-NEXT: retq %r = frem nnan float undef, 9.0 ret float %r @@ -335,7 +320,6 @@ define double @fadd_undef_op0_constant_nan(double %x) { define double @fadd_undef_op1_fast_constant_nan(double %x) { ; ANY-LABEL: fadd_undef_op1_fast_constant_nan: ; ANY: # %bb.0: -; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; ANY-NEXT: retq %r = fadd fast double 0xFFF0000000000001, undef ret double %r @@ -353,7 +337,6 @@ define double @fsub_undef_op0_constant_nan(double %x) { define double @fsub_undef_op1_nnan_constant_nan(double %x) { ; ANY-LABEL: fsub_undef_op1_nnan_constant_nan: ; ANY: # %bb.0: -; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; ANY-NEXT: retq %r = fsub nnan double 0x7FF0000000000011, undef ret double %r @@ -371,7 +354,6 @@ define double @fmul_undef_op0_constant_nan(double %x) { define double @fmul_undef_op1_fast_constant_nan(double %x) { ; ANY-LABEL: fmul_undef_op1_fast_constant_nan: ; ANY: # %bb.0: -; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; ANY-NEXT: retq %r = fmul fast double 0xFFF0000000000101, undef ret double %r @@ -389,7 +371,6 @@ define double @fdiv_undef_op0_constant_nan(double %x) { define double @fdiv_undef_op1_nnan_constant_nan(double %x) { ; ANY-LABEL: fdiv_undef_op1_nnan_constant_nan: ; ANY: # %bb.0: -; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; ANY-NEXT: retq %r = fdiv nnan double 0x7FF0000000000111, undef ret double %r @@ -407,7 +388,6 @@ define double @frem_undef_op0_constant_nan(double %x) { define double @frem_undef_op1_fast_constant_nan(double %x) { ; ANY-LABEL: frem_undef_op1_fast_constant_nan: ; ANY: # %bb.0: -; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; ANY-NEXT: retq %r = frem fast double 0xFFF0000000001001, undef ret double %r @@ -427,7 +407,6 @@ define double @fadd_undef_op0_constant_inf(double %x) { define double @fadd_undef_op1_fast_constant_inf(double %x) { ; ANY-LABEL: fadd_undef_op1_fast_constant_inf: ; ANY: # %bb.0: -; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; ANY-NEXT: retq %r = fadd fast double 0xFFF0000000000000, undef ret double %r @@ -445,7 +424,6 @@ define double @fsub_undef_op0_constant_inf(double %x) { define double @fsub_undef_op1_ninf_constant_inf(double %x) { ; ANY-LABEL: fsub_undef_op1_ninf_constant_inf: ; ANY: # %bb.0: -; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; ANY-NEXT: retq %r = fsub ninf double 0x7FF0000000000000, undef ret double %r @@ -463,7 +441,6 @@ define double @fmul_undef_op0_constant_inf(double %x) { define double @fmul_undef_op1_fast_constant_inf(double %x) { ; ANY-LABEL: fmul_undef_op1_fast_constant_inf: ; ANY: # 
%bb.0: -; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; ANY-NEXT: retq %r = fmul fast double 0xFFF0000000000000, undef ret double %r @@ -481,7 +458,6 @@ define double @fdiv_undef_op0_constant_inf(double %x) { define double @fdiv_undef_op1_ninf_constant_inf(double %x) { ; ANY-LABEL: fdiv_undef_op1_ninf_constant_inf: ; ANY: # %bb.0: -; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; ANY-NEXT: retq %r = fdiv ninf double 0x7FF0000000000000, undef ret double %r @@ -499,7 +475,6 @@ define double @frem_undef_op0_constant_inf(double %x) { define double @frem_undef_op1_fast_constant_inf(double %x) { ; ANY-LABEL: frem_undef_op1_fast_constant_inf: ; ANY: # %bb.0: -; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; ANY-NEXT: retq %r = frem fast double 0xFFF0000000000000, undef ret double %r diff --git a/llvm/test/CodeGen/X86/iabs.ll b/llvm/test/CodeGen/X86/iabs.ll index f052718d98400..319eb6f5edc32 100644 --- a/llvm/test/CodeGen/X86/iabs.ll +++ b/llvm/test/CodeGen/X86/iabs.ll @@ -121,73 +121,34 @@ define i64 @test_i64(i64 %a) nounwind { } define i128 @test_i128(i128 %a) nounwind { -; X86-NO-CMOV-LABEL: test_i128: -; X86-NO-CMOV: # %bb.0: -; X86-NO-CMOV-NEXT: pushl %ebp -; X86-NO-CMOV-NEXT: pushl %ebx -; X86-NO-CMOV-NEXT: pushl %edi -; X86-NO-CMOV-NEXT: pushl %esi -; X86-NO-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-CMOV-NEXT: xorl %ecx, %ecx -; X86-NO-CMOV-NEXT: negl %ebp -; X86-NO-CMOV-NEXT: movl $0, %ebx -; X86-NO-CMOV-NEXT: sbbl %edx, %ebx -; X86-NO-CMOV-NEXT: movl $0, %edi -; X86-NO-CMOV-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NO-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NO-CMOV-NEXT: sbbl %esi, %ecx -; X86-NO-CMOV-NEXT: testl %esi, %esi -; X86-NO-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-CMOV-NEXT: js .LBB4_2 -; X86-NO-CMOV-NEXT: # %bb.1: -; X86-NO-CMOV-NEXT: movl %esi, %ecx -; X86-NO-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NO-CMOV-NEXT: movl %edx, %ebx -; X86-NO-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-CMOV-NEXT: .LBB4_2: -; X86-NO-CMOV-NEXT: movl %ebp, (%eax) -; X86-NO-CMOV-NEXT: movl %ebx, 4(%eax) -; X86-NO-CMOV-NEXT: movl %edi, 8(%eax) -; X86-NO-CMOV-NEXT: movl %ecx, 12(%eax) -; X86-NO-CMOV-NEXT: popl %esi -; X86-NO-CMOV-NEXT: popl %edi -; X86-NO-CMOV-NEXT: popl %ebx -; X86-NO-CMOV-NEXT: popl %ebp -; X86-NO-CMOV-NEXT: retl $4 -; -; X86-CMOV-LABEL: test_i128: -; X86-CMOV: # %bb.0: -; X86-CMOV-NEXT: pushl %ebp -; X86-CMOV-NEXT: pushl %ebx -; X86-CMOV-NEXT: pushl %edi -; X86-CMOV-NEXT: pushl %esi -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-CMOV-NEXT: xorl %esi, %esi -; X86-CMOV-NEXT: negl %edi -; X86-CMOV-NEXT: movl $0, %ebx -; X86-CMOV-NEXT: sbbl %edx, %ebx -; X86-CMOV-NEXT: movl $0, %ebp -; X86-CMOV-NEXT: sbbl %ecx, %ebp -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-CMOV-NEXT: sbbl %eax, %esi -; X86-CMOV-NEXT: testl %eax, %eax -; X86-CMOV-NEXT: cmovnsl %eax, %esi -; X86-CMOV-NEXT: cmovnsl %ecx, %ebp -; X86-CMOV-NEXT: cmovnsl %edx, %ebx -; X86-CMOV-NEXT: cmovnsl {{[0-9]+}}(%esp), %edi -; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-CMOV-NEXT: movl %edi, (%eax) -; X86-CMOV-NEXT: movl %ebx, 4(%eax) -; X86-CMOV-NEXT: movl %ebp, 8(%eax) -; X86-CMOV-NEXT: movl %esi, 12(%eax) -; X86-CMOV-NEXT: popl %esi -; X86-CMOV-NEXT: popl %edi -; X86-CMOV-NEXT: popl %ebx -; X86-CMOV-NEXT: popl %ebp -; X86-CMOV-NEXT: retl $4 +; X86-LABEL: test_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: 
pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: addl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: adcl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: adcl %edx, %ebx +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: xorl %edx, %ebx +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: xorl %edx, %esi +; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %ebx, 8(%eax) +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl $4 ; ; X64-LABEL: test_i128: ; X64: # %bb.0: diff --git a/llvm/test/CodeGen/X86/implicit-null-check-negative.ll b/llvm/test/CodeGen/X86/implicit-null-check-negative.ll index c05b4a072adfd..d7eae8c98173a 100644 --- a/llvm/test/CodeGen/X86/implicit-null-check-negative.ll +++ b/llvm/test/CodeGen/X86/implicit-null-check-negative.ll @@ -109,4 +109,24 @@ define i32 @imp_null_check_add_result(i32* %x, i32* %y) { ret i32 %p } +; This redefines the null check reg by doing a zero-extend, a shift on +; itself and then an add. +; Cannot be converted to implicit check since the zero reg is no longer zero. +define i64 @imp_null_check_load_shift_add_addr(i64* %x, i64 %r) { + entry: + %c = icmp eq i64* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret i64 42 + + not_null: + %y = ptrtoint i64* %x to i64 + %shry = shl i64 %y, 6 + %shry.add = add i64 %shry, %r + %y.ptr = inttoptr i64 %shry.add to i64* + %x.loc = getelementptr i64, i64* %y.ptr, i64 1 + %t = load i64, i64* %x.loc + ret i64 %t +} !0 = !{} diff --git a/llvm/test/CodeGen/X86/implicit-null-check.ll b/llvm/test/CodeGen/X86/implicit-null-check.ll index 6d6b31f86dbe9..c6241b18f785e 100644 --- a/llvm/test/CodeGen/X86/implicit-null-check.ll +++ b/llvm/test/CodeGen/X86/implicit-null-check.ll @@ -48,6 +48,8 @@ define i32 @imp_null_check_unordered_load(i32* %x) { ret i32 %t } + +; TODO: Can be converted into implicit check. ;; Probably could be implicit, but we're conservative for now define i32 @imp_null_check_seq_cst_load(i32* %x) { ; CHECK-LABEL: imp_null_check_seq_cst_load: @@ -557,4 +559,66 @@ define i32 @imp_null_check_neg_gep_load(i32* %x) { ret i32 %t } +; This redefines the null check reg by doing a zero-extend and a shift on +; itself. +; Converted into implicit null check since both of these operations do not +; change the nullness of %x (i.e. if it is null, it remains null). +define i64 @imp_null_check_load_shift_addr(i64* %x) { +; CHECK-LABEL: imp_null_check_load_shift_addr: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: shlq $6, %rdi +; CHECK-NEXT: Ltmp17: +; CHECK-NEXT: movq 8(%rdi), %rax ## on-fault: LBB21_1 +; CHECK-NEXT: ## %bb.2: ## %not_null +; CHECK-NEXT: retq +; CHECK-NEXT: LBB21_1: ## %is_null +; CHECK-NEXT: movl $42, %eax +; CHECK-NEXT: retq + + entry: + %c = icmp eq i64* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret i64 42 + + not_null: + %y = ptrtoint i64* %x to i64 + %shry = shl i64 %y, 6 + %y.ptr = inttoptr i64 %shry to i64* + %x.loc = getelementptr i64, i64* %y.ptr, i64 1 + %t = load i64, i64* %x.loc + ret i64 %t +} + +; Same as imp_null_check_load_shift_addr but shift is by 3 and this is now +; converted into complex addressing. 
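+; (A hedged note: with a shift of 3 the scaled address presumably folds into
+; the x86 addressing mode itself, so the faulting access becomes
+; 'movq 8(,%rdi,8), %rax' as the CHECK lines below show; the pass likely does
+; not yet reason about a scaled-index address with no plain base register.)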
+; TODO: Can be converted into implicit null check +define i64 @imp_null_check_load_shift_by_3_addr(i64* %x) { +; CHECK-LABEL: imp_null_check_load_shift_by_3_addr: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: je LBB22_1 +; CHECK-NEXT: ## %bb.2: ## %not_null +; CHECK-NEXT: movq 8(,%rdi,8), %rax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB22_1: ## %is_null +; CHECK-NEXT: movl $42, %eax +; CHECK-NEXT: retq + + entry: + %c = icmp eq i64* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret i64 42 + + not_null: + %y = ptrtoint i64* %x to i64 + %shry = shl i64 %y, 3 + %y.ptr = inttoptr i64 %shry to i64* + %x.loc = getelementptr i64, i64* %y.ptr, i64 1 + %t = load i64, i64* %x.loc + ret i64 %t +} !0 = !{} diff --git a/llvm/test/CodeGen/X86/lvi-hardening-loads.ll b/llvm/test/CodeGen/X86/lvi-hardening-loads.ll index ff8276f6f1c22..e660f306ef75b 100644 --- a/llvm/test/CodeGen/X86/lvi-hardening-loads.ll +++ b/llvm/test/CodeGen/X86/lvi-hardening-loads.ll @@ -117,9 +117,9 @@ if.then: ; preds = %for.body ; X64-NOOPT-NEXT: lfence ; X64-NOOPT-NEXT: movq (%rax,%rcx,8), %rax ; X64-NOOPT-NEXT: lfence -; X64-NOOPT-NEXT: movl (%rax), %eax +; X64-NOOPT-NEXT: movl (%rax), %edx ; X64-NOOPT-NEXT: lfence -; X64-NOOPT-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; X64-NOOPT-NEXT: movl %edx, -{{[0-9]+}}(%rsp) if.end: ; preds = %if.then, %for.body br label %for.inc diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index 88418fd85fe52..948928099d38e 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -1629,182 +1629,122 @@ define <16 x float> @test29(float* %base, <16 x i32> %ind) { ret <16 x float>%res } -; Check non-power-of-2 case. It should be scalarized. 
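+; The <3 x i32> gather is no longer scalarized: as the updated CHECK lines
+; below show, it is now widened and emitted as a single masked vpgatherqd /
+; vpgatherdd under an assembled k-register mask.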
declare <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>) define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) { ; KNL_64-LABEL: test30: ; KNL_64: # %bb.0: -; KNL_64-NEXT: andb $1, %dil -; KNL_64-NEXT: andb $1, %sil -; KNL_64-NEXT: addb %sil, %sil -; KNL_64-NEXT: orb %dil, %sil -; KNL_64-NEXT: andb $1, %dl -; KNL_64-NEXT: shlb $2, %dl -; KNL_64-NEXT: orb %sil, %dl +; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; KNL_64-NEXT: movw $-3, %ax +; KNL_64-NEXT: kmovw %eax, %k0 +; KNL_64-NEXT: andl $1, %edi +; KNL_64-NEXT: kmovw %edi, %k1 +; KNL_64-NEXT: kandw %k0, %k1, %k0 +; KNL_64-NEXT: kmovw %esi, %k1 +; KNL_64-NEXT: kshiftlw $15, %k1, %k1 +; KNL_64-NEXT: kshiftrw $14, %k1, %k1 +; KNL_64-NEXT: korw %k1, %k0, %k0 +; KNL_64-NEXT: movw $-5, %ax +; KNL_64-NEXT: kmovw %eax, %k1 +; KNL_64-NEXT: kandw %k1, %k0, %k0 +; KNL_64-NEXT: kmovw %edx, %k1 +; KNL_64-NEXT: kshiftlw $15, %k1, %k1 +; KNL_64-NEXT: kshiftrw $13, %k1, %k1 +; KNL_64-NEXT: korw %k1, %k0, %k0 +; KNL_64-NEXT: kshiftlw $12, %k0, %k0 +; KNL_64-NEXT: kshiftrw $12, %k0, %k1 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 ; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1 ; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; KNL_64-NEXT: testb $1, %dl -; KNL_64-NEXT: jne .LBB31_1 -; KNL_64-NEXT: # %bb.2: # %else -; KNL_64-NEXT: testb $2, %dl -; KNL_64-NEXT: jne .LBB31_3 -; KNL_64-NEXT: .LBB31_4: # %else2 -; KNL_64-NEXT: testb $4, %dl -; KNL_64-NEXT: jne .LBB31_5 -; KNL_64-NEXT: .LBB31_6: # %else5 -; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 -; KNL_64-NEXT: vzeroupper -; KNL_64-NEXT: retq -; KNL_64-NEXT: .LBB31_1: # %cond.load -; KNL_64-NEXT: vmovq %xmm0, %rax -; KNL_64-NEXT: vpinsrd $0, (%rax), %xmm2, %xmm2 -; KNL_64-NEXT: testb $2, %dl -; KNL_64-NEXT: je .LBB31_4 -; KNL_64-NEXT: .LBB31_3: # %cond.load1 -; KNL_64-NEXT: vpextrq $1, %xmm0, %rax -; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2 -; KNL_64-NEXT: testb $4, %dl -; KNL_64-NEXT: je .LBB31_6 -; KNL_64-NEXT: .LBB31_5: # %cond.load4 -; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL_64-NEXT: vmovq %xmm0, %rax -; KNL_64-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm2 +; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1} ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test30: ; KNL_32: # %bb.0: -; KNL_32-NEXT: pushl %eax -; KNL_32-NEXT: .cfi_def_cfa_offset 8 +; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; KNL_32-NEXT: movw $-3, %ax +; KNL_32-NEXT: kmovw %eax, %k0 ; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_32-NEXT: andb $1, %al -; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %cl -; KNL_32-NEXT: andb $1, %cl -; KNL_32-NEXT: addb %cl, %cl -; KNL_32-NEXT: orb %al, %cl +; KNL_32-NEXT: andl $1, %eax +; KNL_32-NEXT: kmovw %eax, %k1 +; KNL_32-NEXT: kandw %k0, %k1, %k0 ; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_32-NEXT: andb $1, %al -; KNL_32-NEXT: shlb $2, %al -; KNL_32-NEXT: orb %cl, %al +; KNL_32-NEXT: kmovw %eax, %k1 +; KNL_32-NEXT: kshiftlw $15, %k1, %k1 +; KNL_32-NEXT: kshiftrw $14, %k1, %k1 +; KNL_32-NEXT: korw %k1, %k0, %k0 +; KNL_32-NEXT: movw $-5, %ax +; KNL_32-NEXT: kmovw %eax, %k1 +; KNL_32-NEXT: kandw %k1, %k0, %k0 +; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_32-NEXT: kmovw %eax, %k1 +; KNL_32-NEXT: kshiftlw $15, %k1, %k1 +; KNL_32-NEXT: kshiftrw $13, %k1, %k1 +; KNL_32-NEXT: korw %k1, %k0, %k0 +; KNL_32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_32-NEXT: kshiftrw $12, %k0, %k1 ; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1 ; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; KNL_32-NEXT: testb $1, %al -; 
KNL_32-NEXT: jne .LBB31_1 -; KNL_32-NEXT: # %bb.2: # %else -; KNL_32-NEXT: testb $2, %al -; KNL_32-NEXT: jne .LBB31_3 -; KNL_32-NEXT: .LBB31_4: # %else2 -; KNL_32-NEXT: testb $4, %al -; KNL_32-NEXT: je .LBB31_6 -; KNL_32-NEXT: .LBB31_5: # %cond.load4 -; KNL_32-NEXT: vpextrd $2, %xmm0, %eax -; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm2 -; KNL_32-NEXT: .LBB31_6: # %else5 +; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 -; KNL_32-NEXT: popl %eax -; KNL_32-NEXT: .cfi_def_cfa_offset 4 +; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl -; KNL_32-NEXT: .LBB31_1: # %cond.load -; KNL_32-NEXT: .cfi_def_cfa_offset 8 -; KNL_32-NEXT: vmovd %xmm0, %ecx -; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2 -; KNL_32-NEXT: testb $2, %al -; KNL_32-NEXT: je .LBB31_4 -; KNL_32-NEXT: .LBB31_3: # %cond.load1 -; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx -; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm2, %xmm2 -; KNL_32-NEXT: testb $4, %al -; KNL_32-NEXT: jne .LBB31_5 -; KNL_32-NEXT: jmp .LBB31_6 ; ; SKX-LABEL: test30: ; SKX: # %bb.0: -; SKX-NEXT: andb $1, %dil -; SKX-NEXT: andb $1, %sil -; SKX-NEXT: addb %sil, %sil -; SKX-NEXT: orb %dil, %sil -; SKX-NEXT: andb $1, %dl -; SKX-NEXT: shlb $2, %dl -; SKX-NEXT: orb %sil, %dl +; SKX-NEXT: movb $-3, %al +; SKX-NEXT: kmovw %eax, %k0 +; SKX-NEXT: kmovw %edi, %k1 +; SKX-NEXT: kshiftlb $7, %k1, %k1 +; SKX-NEXT: kshiftrb $7, %k1, %k1 +; SKX-NEXT: kandw %k0, %k1, %k0 +; SKX-NEXT: kmovw %esi, %k1 +; SKX-NEXT: kshiftlb $7, %k1, %k1 +; SKX-NEXT: kshiftrb $6, %k1, %k1 +; SKX-NEXT: korw %k1, %k0, %k0 +; SKX-NEXT: movb $-5, %al +; SKX-NEXT: kmovw %eax, %k1 +; SKX-NEXT: kandw %k1, %k0, %k0 +; SKX-NEXT: kmovw %edx, %k1 +; SKX-NEXT: kshiftlb $7, %k1, %k1 +; SKX-NEXT: kshiftrb $5, %k1, %k1 +; SKX-NEXT: korw %k1, %k0, %k1 ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1 ; SKX-NEXT: vpsllq $2, %ymm1, %ymm1 ; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; SKX-NEXT: testb $1, %dl -; SKX-NEXT: jne .LBB31_1 -; SKX-NEXT: # %bb.2: # %else -; SKX-NEXT: testb $2, %dl -; SKX-NEXT: jne .LBB31_3 -; SKX-NEXT: .LBB31_4: # %else2 -; SKX-NEXT: testb $4, %dl -; SKX-NEXT: jne .LBB31_5 -; SKX-NEXT: .LBB31_6: # %else5 -; SKX-NEXT: vmovdqa %xmm2, %xmm0 -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq -; SKX-NEXT: .LBB31_1: # %cond.load -; SKX-NEXT: vmovq %xmm0, %rax -; SKX-NEXT: vpinsrd $0, (%rax), %xmm2, %xmm2 -; SKX-NEXT: testb $2, %dl -; SKX-NEXT: je .LBB31_4 -; SKX-NEXT: .LBB31_3: # %cond.load1 -; SKX-NEXT: vpextrq $1, %xmm0, %rax -; SKX-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2 -; SKX-NEXT: testb $4, %dl -; SKX-NEXT: je .LBB31_6 -; SKX-NEXT: .LBB31_5: # %cond.load4 -; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 -; SKX-NEXT: vmovq %xmm0, %rax -; SKX-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm2 +; SKX-NEXT: vpgatherqd (,%ymm0), %xmm2 {%k1} ; SKX-NEXT: vmovdqa %xmm2, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; SKX_32-LABEL: test30: ; SKX_32: # %bb.0: -; SKX_32-NEXT: pushl %eax -; SKX_32-NEXT: .cfi_def_cfa_offset 8 +; SKX_32-NEXT: movb $-3, %al +; SKX_32-NEXT: kmovw %eax, %k0 ; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al -; SKX_32-NEXT: andb $1, %al -; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %cl -; SKX_32-NEXT: andb $1, %cl -; SKX_32-NEXT: addb %cl, %cl -; SKX_32-NEXT: orb %al, %cl +; SKX_32-NEXT: kmovw %eax, %k1 +; SKX_32-NEXT: kshiftlb $7, %k1, %k1 +; SKX_32-NEXT: kshiftrb $7, %k1, %k1 +; SKX_32-NEXT: kandw %k0, %k1, %k0 ; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al -; SKX_32-NEXT: andb $1, %al -; SKX_32-NEXT: shlb $2, %al -; SKX_32-NEXT: orb %cl, %al +; SKX_32-NEXT: kmovw %eax, %k1 +; SKX_32-NEXT: kshiftlb $7, %k1, %k1 +; 
SKX_32-NEXT: kshiftrb $6, %k1, %k1 +; SKX_32-NEXT: korw %k1, %k0, %k0 +; SKX_32-NEXT: movb $-5, %al +; SKX_32-NEXT: kmovw %eax, %k1 +; SKX_32-NEXT: kandw %k1, %k0, %k0 +; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al +; SKX_32-NEXT: kmovw %eax, %k1 +; SKX_32-NEXT: kshiftlb $7, %k1, %k1 +; SKX_32-NEXT: kshiftrb $5, %k1, %k1 +; SKX_32-NEXT: korw %k1, %k0, %k1 ; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1 ; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; SKX_32-NEXT: testb $1, %al -; SKX_32-NEXT: jne .LBB31_1 -; SKX_32-NEXT: # %bb.2: # %else -; SKX_32-NEXT: testb $2, %al -; SKX_32-NEXT: jne .LBB31_3 -; SKX_32-NEXT: .LBB31_4: # %else2 -; SKX_32-NEXT: testb $4, %al -; SKX_32-NEXT: je .LBB31_6 -; SKX_32-NEXT: .LBB31_5: # %cond.load4 -; SKX_32-NEXT: vpextrd $2, %xmm0, %eax -; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm2 -; SKX_32-NEXT: .LBB31_6: # %else5 +; SKX_32-NEXT: vpgatherdd (,%xmm0), %xmm2 {%k1} ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 -; SKX_32-NEXT: popl %eax -; SKX_32-NEXT: .cfi_def_cfa_offset 4 ; SKX_32-NEXT: retl -; SKX_32-NEXT: .LBB31_1: # %cond.load -; SKX_32-NEXT: .cfi_def_cfa_offset 8 -; SKX_32-NEXT: vmovd %xmm0, %ecx -; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2 -; SKX_32-NEXT: testb $2, %al -; SKX_32-NEXT: je .LBB31_4 -; SKX_32-NEXT: .LBB31_3: # %cond.load1 -; SKX_32-NEXT: vpextrd $1, %xmm0, %ecx -; SKX_32-NEXT: vpinsrd $1, (%ecx), %xmm2, %xmm2 -; SKX_32-NEXT: testb $4, %al -; SKX_32-NEXT: jne .LBB31_5 -; SKX_32-NEXT: jmp .LBB31_6 %sext_ind = sext <3 x i32> %ind to <3 x i64> %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind @@ -1812,6 +1752,125 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x ret <3 x i32>%res } +; Non-power of 2 scatter +declare void @llvm.masked.scatter.v3i32.v3p0i32(<3 x i32>, <3 x i32*>, i32, <3 x i1>) +define void @test30b(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) { +; KNL_64-LABEL: test30b: +; KNL_64: # %bb.0: +; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; KNL_64-NEXT: movw $-3, %ax +; KNL_64-NEXT: kmovw %eax, %k0 +; KNL_64-NEXT: andl $1, %edi +; KNL_64-NEXT: kmovw %edi, %k1 +; KNL_64-NEXT: kandw %k0, %k1, %k0 +; KNL_64-NEXT: kmovw %esi, %k1 +; KNL_64-NEXT: kshiftlw $15, %k1, %k1 +; KNL_64-NEXT: kshiftrw $14, %k1, %k1 +; KNL_64-NEXT: korw %k1, %k0, %k0 +; KNL_64-NEXT: movw $-5, %ax +; KNL_64-NEXT: kmovw %eax, %k1 +; KNL_64-NEXT: kandw %k1, %k0, %k0 +; KNL_64-NEXT: kmovw %edx, %k1 +; KNL_64-NEXT: kshiftlw $15, %k1, %k1 +; KNL_64-NEXT: kshiftrw $13, %k1, %k1 +; KNL_64-NEXT: korw %k1, %k0, %k0 +; KNL_64-NEXT: kshiftlw $12, %k0, %k0 +; KNL_64-NEXT: kshiftrw $12, %k0, %k1 +; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 +; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1 +; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; KNL_64-NEXT: vpscatterqd %ymm2, (,%zmm0) {%k1} +; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test30b: +; KNL_32: # %bb.0: +; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; KNL_32-NEXT: movw $-3, %ax +; KNL_32-NEXT: kmovw %eax, %k0 +; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_32-NEXT: andl $1, %eax +; KNL_32-NEXT: kmovw %eax, %k1 +; KNL_32-NEXT: kandw %k0, %k1, %k0 +; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_32-NEXT: kmovw %eax, %k1 +; KNL_32-NEXT: kshiftlw $15, %k1, %k1 +; KNL_32-NEXT: kshiftrw $14, %k1, %k1 +; KNL_32-NEXT: korw %k1, %k0, %k0 +; KNL_32-NEXT: movw $-5, %ax +; KNL_32-NEXT: kmovw %eax, %k1 +; KNL_32-NEXT: kandw %k1, %k0, %k0 +; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_32-NEXT: kmovw %eax, %k1 +; KNL_32-NEXT: kshiftlw $15, %k1, %k1 +; 
KNL_32-NEXT: kshiftrw $13, %k1, %k1 +; KNL_32-NEXT: korw %k1, %k0, %k0 +; KNL_32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_32-NEXT: kshiftrw $12, %k0, %k1 +; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1 +; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1} +; KNL_32-NEXT: vzeroupper +; KNL_32-NEXT: retl +; +; SKX-LABEL: test30b: +; SKX: # %bb.0: +; SKX-NEXT: movb $-3, %al +; SKX-NEXT: kmovw %eax, %k0 +; SKX-NEXT: kmovw %edi, %k1 +; SKX-NEXT: kshiftlb $7, %k1, %k1 +; SKX-NEXT: kshiftrb $7, %k1, %k1 +; SKX-NEXT: kandw %k0, %k1, %k0 +; SKX-NEXT: kmovw %esi, %k1 +; SKX-NEXT: kshiftlb $7, %k1, %k1 +; SKX-NEXT: kshiftrb $6, %k1, %k1 +; SKX-NEXT: korw %k1, %k0, %k0 +; SKX-NEXT: movb $-5, %al +; SKX-NEXT: kmovw %eax, %k1 +; SKX-NEXT: kandw %k1, %k0, %k0 +; SKX-NEXT: kmovw %edx, %k1 +; SKX-NEXT: kshiftlb $7, %k1, %k1 +; SKX-NEXT: kshiftrb $5, %k1, %k1 +; SKX-NEXT: korw %k1, %k0, %k1 +; SKX-NEXT: vpmovsxdq %xmm1, %ymm1 +; SKX-NEXT: vpsllq $2, %ymm1, %ymm1 +; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; SKX-NEXT: vpscatterqd %xmm2, (,%ymm0) {%k1} +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq +; +; SKX_32-LABEL: test30b: +; SKX_32: # %bb.0: +; SKX_32-NEXT: movb $-3, %al +; SKX_32-NEXT: kmovw %eax, %k0 +; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al +; SKX_32-NEXT: kmovw %eax, %k1 +; SKX_32-NEXT: kshiftlb $7, %k1, %k1 +; SKX_32-NEXT: kshiftrb $7, %k1, %k1 +; SKX_32-NEXT: kandw %k0, %k1, %k0 +; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al +; SKX_32-NEXT: kmovw %eax, %k1 +; SKX_32-NEXT: kshiftlb $7, %k1, %k1 +; SKX_32-NEXT: kshiftrb $6, %k1, %k1 +; SKX_32-NEXT: korw %k1, %k0, %k0 +; SKX_32-NEXT: movb $-5, %al +; SKX_32-NEXT: kmovw %eax, %k1 +; SKX_32-NEXT: kandw %k1, %k0, %k0 +; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al +; SKX_32-NEXT: kmovw %eax, %k1 +; SKX_32-NEXT: kshiftlb $7, %k1, %k1 +; SKX_32-NEXT: kshiftrb $5, %k1, %k1 +; SKX_32-NEXT: korw %k1, %k0, %k1 +; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1 +; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; SKX_32-NEXT: vpscatterdd %xmm2, (,%xmm0) {%k1} +; SKX_32-NEXT: retl + %sext_ind = sext <3 x i32> %ind to <3 x i64> + %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind + call void @llvm.masked.scatter.v3i32.v3p0i32(<3 x i32> %src0, <3 x i32*> %gep.random, i32 4, <3 x i1> %mask) + ret void +} + declare <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>) define <16 x float*> @test31(<16 x float**> %ptrs) { ; KNL_64-LABEL: test31: @@ -2483,41 +2542,41 @@ define void @v1_scatter(<1 x i32>%a1, <1 x i32*> %ptr, <1 x i1> %mask) { ; KNL_64-LABEL: v1_scatter: ; KNL_64: # %bb.0: ; KNL_64-NEXT: testb $1, %dl -; KNL_64-NEXT: je .LBB44_2 +; KNL_64-NEXT: je .LBB45_2 ; KNL_64-NEXT: # %bb.1: # %cond.store ; KNL_64-NEXT: movl %edi, (%rsi) -; KNL_64-NEXT: .LBB44_2: # %else +; KNL_64-NEXT: .LBB45_2: # %else ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: v1_scatter: ; KNL_32: # %bb.0: ; KNL_32-NEXT: testb $1, {{[0-9]+}}(%esp) -; KNL_32-NEXT: je .LBB44_2 +; KNL_32-NEXT: je .LBB45_2 ; KNL_32-NEXT: # %bb.1: # %cond.store ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; KNL_32-NEXT: movl %ecx, (%eax) -; KNL_32-NEXT: .LBB44_2: # %else +; KNL_32-NEXT: .LBB45_2: # %else ; KNL_32-NEXT: retl ; ; SKX-LABEL: v1_scatter: ; SKX: # %bb.0: ; SKX-NEXT: testb $1, %dl -; SKX-NEXT: je .LBB44_2 +; SKX-NEXT: je .LBB45_2 ; SKX-NEXT: # %bb.1: # %cond.store ; SKX-NEXT: movl %edi, (%rsi) -; SKX-NEXT: .LBB44_2: # %else +; SKX-NEXT: .LBB45_2: # %else ; SKX-NEXT: retq ; ; SKX_32-LABEL: v1_scatter: ; SKX_32: # 
%bb.0: ; SKX_32-NEXT: testb $1, {{[0-9]+}}(%esp) -; SKX_32-NEXT: je .LBB44_2 +; SKX_32-NEXT: je .LBB45_2 ; SKX_32-NEXT: # %bb.1: # %cond.store ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; SKX_32-NEXT: movl %ecx, (%eax) -; SKX_32-NEXT: .LBB44_2: # %else +; SKX_32-NEXT: .LBB45_2: # %else ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32> %a1, <1 x i32*> %ptr, i32 4, <1 x i1> %mask) ret void @@ -3421,3 +3480,50 @@ define void @splat_ptr_scatter(i32* %ptr, <4 x i1> %mask, <4 x i32> %val) { ret void } +%struct.foo = type { i8*, i64, i16, i16, i32 } + +; This used to cause fast-isel to generate bad copy instructions that would +; cause an error in copyPhysReg. +define <8 x i64> @pr45906(<8 x %struct.foo*> %ptr) { +; KNL_64-LABEL: pr45906: +; KNL_64: # %bb.0: # %bb +; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1 +; KNL_64-NEXT: kxnorw %k0, %k0, %k1 +; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm0 {%k1} +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: pr45906: +; KNL_32: # %bb.0: # %bb +; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] +; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; KNL_32-NEXT: kxnorw %k0, %k0, %k1 +; KNL_32-NEXT: vpgatherdq (,%ymm1), %zmm0 {%k1} +; KNL_32-NEXT: retl +; +; SKX_SMALL-LABEL: pr45906: +; SKX_SMALL: # %bb.0: # %bb +; SKX_SMALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1 +; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1 +; SKX_SMALL-NEXT: vpgatherqq (,%zmm1), %zmm0 {%k1} +; SKX_SMALL-NEXT: retq +; +; SKX_LARGE-LABEL: pr45906: +; SKX_LARGE: # %bb.0: # %bb +; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax +; SKX_LARGE-NEXT: vpaddq (%rax){1to8}, %zmm0, %zmm1 +; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1 +; SKX_LARGE-NEXT: vpgatherqq (,%zmm1), %zmm0 {%k1} +; SKX_LARGE-NEXT: retq +; +; SKX_32-LABEL: pr45906: +; SKX_32: # %bb.0: # %bb +; SKX_32-NEXT: vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1 +; SKX_32-NEXT: kxnorw %k0, %k0, %k1 +; SKX_32-NEXT: vpgatherdq (,%ymm1), %zmm0 {%k1} +; SKX_32-NEXT: retl +bb: + %tmp = getelementptr inbounds %struct.foo, <8 x %struct.foo*> %ptr, i64 0, i32 1 + %tmp1 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %tmp, i32 8, <8 x i1> , <8 x i64> undef) + ret <8 x i64> %tmp1 +} +declare <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*>, i32, <8 x i1>, <8 x i64>) diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll index 75e41618263ea..d15b7f4d0c649 100644 --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -6171,25 +6171,10 @@ define <4 x float> @mload_constmask_v4f32_all(<4 x float>* %addr) { ; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: mload_constmask_v4f32_all: -; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vmovups (%rdi), %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512F-LABEL: mload_constmask_v4f32_all: -; AVX512F: ## %bb.0: -; AVX512F-NEXT: movw $15, %ax -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} -; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: mload_constmask_v4f32_all: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: kxnorw %k0, %k0, %k1 -; AVX512VL-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} -; AVX512VL-NEXT: retq +; AVX-LABEL: mload_constmask_v4f32_all: +; AVX: ## %bb.0: +; AVX-NEXT: vmovups (%rdi), %xmm0 +; AVX-NEXT: retq %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> , <4 x float>undef) ret <4 x float> %res } @@ -6573,6 +6558,69 @@ 
define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %ds ret <8 x double> %res } +; Make sure we detect the mask is all ones after type +; legalization to use an unmasked load for some of the avx512 instructions. +define <16 x double> @mload_constmask_v16f64_allones_split(<16 x double>* %addr, <16 x double> %dst) { +; SSE-LABEL: mload_constmask_v16f64_allones_split: +; SSE: ## %bb.0: +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: movups (%rsi), %xmm0 +; SSE-NEXT: movups 16(%rsi), %xmm1 +; SSE-NEXT: movups 32(%rsi), %xmm2 +; SSE-NEXT: movups 48(%rsi), %xmm3 +; SSE-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] +; SSE-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; SSE-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] +; SSE-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] +; SSE-NEXT: movaps %xmm7, 112(%rdi) +; SSE-NEXT: movaps %xmm6, 96(%rdi) +; SSE-NEXT: movaps %xmm5, 80(%rdi) +; SSE-NEXT: movaps %xmm4, 64(%rdi) +; SSE-NEXT: movaps %xmm3, 48(%rdi) +; SSE-NEXT: movaps %xmm2, 32(%rdi) +; SSE-NEXT: movaps %xmm1, 16(%rdi) +; SSE-NEXT: movaps %xmm0, (%rdi) +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: mload_constmask_v16f64_allones_split: +; AVX1OR2: ## %bb.0: +; AVX1OR2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0] +; AVX1OR2-NEXT: ## ymm0 = mem[0,1,0,1] +; AVX1OR2-NEXT: vmaskmovpd 64(%rdi), %ymm0, %ymm1 +; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] +; AVX1OR2-NEXT: vmaskmovpd 96(%rdi), %ymm0, %ymm0 +; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] +; AVX1OR2-NEXT: vmovups (%rdi), %ymm0 +; AVX1OR2-NEXT: vmovups 32(%rdi), %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512F-LABEL: mload_constmask_v16f64_allones_split: +; AVX512F: ## %bb.0: +; AVX512F-NEXT: movb $85, %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovupd 64(%rdi), %zmm1 {%k1} +; AVX512F-NEXT: vmovups (%rdi), %zmm0 +; AVX512F-NEXT: retq +; +; AVX512VLDQ-LABEL: mload_constmask_v16f64_allones_split: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: movb $85, %al +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: vmovupd 64(%rdi), %zmm1 {%k1} +; AVX512VLDQ-NEXT: vmovups (%rdi), %zmm0 +; AVX512VLDQ-NEXT: retq +; +; AVX512VLBW-LABEL: mload_constmask_v16f64_allones_split: +; AVX512VLBW: ## %bb.0: +; AVX512VLBW-NEXT: movb $85, %al +; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: vmovupd 64(%rdi), %zmm1 {%k1} +; AVX512VLBW-NEXT: vmovups (%rdi), %zmm0 +; AVX512VLBW-NEXT: retq + %res = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %addr, i32 4, <16 x i1> , <16 x double> %dst) + ret <16 x double> %res +} + ; If the pass-through operand is undef, no blend is needed. 
define <4 x double> @mload_constmask_v4f64_undef_passthrough(<4 x double>* %addr) { @@ -6788,20 +6836,20 @@ define i32 @pr38986(i1 %c, i32* %p) { ; SSE: ## %bb.0: ; SSE-NEXT: testb $1, %dil ; SSE-NEXT: ## implicit-def: $eax -; SSE-NEXT: je LBB43_2 +; SSE-NEXT: je LBB44_2 ; SSE-NEXT: ## %bb.1: ## %cond.load ; SSE-NEXT: movl (%rsi), %eax -; SSE-NEXT: LBB43_2: ## %else +; SSE-NEXT: LBB44_2: ## %else ; SSE-NEXT: retq ; ; AVX-LABEL: pr38986: ; AVX: ## %bb.0: ; AVX-NEXT: testb $1, %dil ; AVX-NEXT: ## implicit-def: $eax -; AVX-NEXT: je LBB43_2 +; AVX-NEXT: je LBB44_2 ; AVX-NEXT: ## %bb.1: ## %cond.load ; AVX-NEXT: movl (%rsi), %eax -; AVX-NEXT: LBB43_2: ## %else +; AVX-NEXT: LBB44_2: ## %else ; AVX-NEXT: retq %vc = insertelement <1 x i1> undef, i1 %c, i32 0 %vp = bitcast i32* %p to <1 x i32>* @@ -6822,6 +6870,7 @@ define <2 x double> @zero_mask(<2 x double>* %addr, <2 x double> %dst) { ret <2 x double> %res } +declare <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>*, i32, <16 x i1>, <16 x double>) declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>) declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>) declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>) diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll index 380891847a5c2..992ef96fd2e87 100644 --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -4504,34 +4504,102 @@ define void @mstore_constmask_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, ; SSE-NEXT: movups %xmm1, (%rdi) ; SSE-NEXT: retq ; -; AVX1-LABEL: mstore_constmask_v4i32_v4i32: +; AVX-LABEL: mstore_constmask_v4i32_v4i32: +; AVX: ## %bb.0: +; AVX-NEXT: vmovups %xmm1, (%rdi) +; AVX-NEXT: retq + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1>) + ret void +} + +; Make sure we are able to detect all ones constant mask after type legalization +; to avoid masked stores. 
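+; (The <16 x i1> constant mask below is split in half during legalization;
+; the upper half is all ones, so, as the AVX1/AVX2 CHECK lines show, it
+; lowers to a plain unmasked vmovups of the high 32 bytes while the lower
+; half keeps the masked-store path.)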
+define void @mstore_constmask_allones_split(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) { +; SSE2-LABEL: mstore_constmask_allones_split: +; SSE2: ## %bb.0: +; SSE2-NEXT: movd %xmm4, (%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE2-NEXT: movd %xmm0, 4(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] +; SSE2-NEXT: movd %xmm0, 12(%rdi) +; SSE2-NEXT: movd %xmm5, 16(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE2-NEXT: movd %xmm0, 24(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE2-NEXT: movd %xmm0, 28(%rdi) +; SSE2-NEXT: movd %xmm6, 32(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE2-NEXT: movd %xmm0, 36(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE2-NEXT: movd %xmm0, 40(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE2-NEXT: movd %xmm0, 44(%rdi) +; SSE2-NEXT: movd %xmm7, 48(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE2-NEXT: movd %xmm0, 52(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE2-NEXT: movd %xmm0, 56(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE2-NEXT: movd %xmm0, 60(%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: mstore_constmask_allones_split: +; SSE4: ## %bb.0: +; SSE4-NEXT: movss %xmm4, (%rdi) +; SSE4-NEXT: extractps $1, %xmm4, 4(%rdi) +; SSE4-NEXT: extractps $3, %xmm4, 12(%rdi) +; SSE4-NEXT: movd %xmm5, 16(%rdi) +; SSE4-NEXT: movdqa %xmm7, %xmm0 +; SSE4-NEXT: palignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; SSE4-NEXT: palignr {{.*#+}} xmm6 = xmm5[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; SSE4-NEXT: movdqu %xmm6, 24(%rdi) +; SSE4-NEXT: movdqu %xmm0, 40(%rdi) +; SSE4-NEXT: pextrd $2, %xmm7, 56(%rdi) +; SSE4-NEXT: pextrd $3, %xmm7, 60(%rdi) +; SSE4-NEXT: retq +; +; AVX1-LABEL: mstore_constmask_allones_split: ; AVX1: ## %bb.0: -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967295,0,4294967295,4294967295,0,4294967295,4294967295] +; AVX1-NEXT: vmaskmovps %ymm2, %ymm0, (%rdi) +; AVX1-NEXT: vmovups %ymm3, 32(%rdi) +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: mstore_constmask_v4i32_v4i32: +; AVX2-LABEL: mstore_constmask_allones_split: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [4294967295,4294967295,0,4294967295,4294967295,0,4294967295,4294967295] +; AVX2-NEXT: vpmaskmovd %ymm2, %ymm0, (%rdi) +; AVX2-NEXT: vmovups %ymm3, 32(%rdi) +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: mstore_constmask_v4i32_v4i32: +; AVX512F-LABEL: mstore_constmask_allones_split: ; AVX512F: ## %bb.0: -; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: movw $15, %ax +; AVX512F-NEXT: movw $-37, %ax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: mstore_constmask_v4i32_v4i32: -; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: kxnorw %k0, %k0, %k1 -; AVX512VL-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} -; AVX512VL-NEXT: retq - %mask = icmp eq <4 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1>) +; AVX512VLDQ-LABEL: mstore_constmask_allones_split: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: movw $-37, %ax +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} +; AVX512VLDQ-NEXT: 
vzeroupper +; AVX512VLDQ-NEXT: retq +; +; AVX512VLBW-LABEL: mstore_constmask_allones_split: +; AVX512VLBW: ## %bb.0: +; AVX512VLBW-NEXT: movw $-37, %ax +; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} +; AVX512VLBW-NEXT: vzeroupper +; AVX512VLBW-NEXT: retq + %mask = icmp eq <16 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %val, <16 x i32>* %addr, i32 4, <16 x i1>) ret void } @@ -4642,31 +4710,31 @@ define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, <4 x doub ; SSE-NEXT: pslld $31, %xmm2 ; SSE-NEXT: movmskps %xmm2, %eax ; SSE-NEXT: testb $1, %al -; SSE-NEXT: jne LBB23_1 +; SSE-NEXT: jne LBB24_1 ; SSE-NEXT: ## %bb.2: ## %else ; SSE-NEXT: testb $2, %al -; SSE-NEXT: jne LBB23_3 -; SSE-NEXT: LBB23_4: ## %else2 +; SSE-NEXT: jne LBB24_3 +; SSE-NEXT: LBB24_4: ## %else2 ; SSE-NEXT: testb $4, %al -; SSE-NEXT: jne LBB23_5 -; SSE-NEXT: LBB23_6: ## %else4 +; SSE-NEXT: jne LBB24_5 +; SSE-NEXT: LBB24_6: ## %else4 ; SSE-NEXT: testb $8, %al -; SSE-NEXT: jne LBB23_7 -; SSE-NEXT: LBB23_8: ## %else6 +; SSE-NEXT: jne LBB24_7 +; SSE-NEXT: LBB24_8: ## %else6 ; SSE-NEXT: retq -; SSE-NEXT: LBB23_1: ## %cond.store +; SSE-NEXT: LBB24_1: ## %cond.store ; SSE-NEXT: movlps %xmm0, (%rdi) ; SSE-NEXT: testb $2, %al -; SSE-NEXT: je LBB23_4 -; SSE-NEXT: LBB23_3: ## %cond.store1 +; SSE-NEXT: je LBB24_4 +; SSE-NEXT: LBB24_3: ## %cond.store1 ; SSE-NEXT: movhps %xmm0, 8(%rdi) ; SSE-NEXT: testb $4, %al -; SSE-NEXT: je LBB23_6 -; SSE-NEXT: LBB23_5: ## %cond.store3 +; SSE-NEXT: je LBB24_6 +; SSE-NEXT: LBB24_5: ## %cond.store3 ; SSE-NEXT: movlps %xmm1, 16(%rdi) ; SSE-NEXT: testb $8, %al -; SSE-NEXT: je LBB23_8 -; SSE-NEXT: LBB23_7: ## %cond.store5 +; SSE-NEXT: je LBB24_8 +; SSE-NEXT: LBB24_7: ## %cond.store5 ; SSE-NEXT: movhps %xmm1, 24(%rdi) ; SSE-NEXT: retq ; @@ -4728,35 +4796,35 @@ define void @one_mask_bit_set1_variable(<4 x float>* %addr, <4 x float> %val, <4 ; SSE2: ## %bb.0: ; SSE2-NEXT: movmskps %xmm1, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: jne LBB24_1 +; SSE2-NEXT: jne LBB25_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %al -; SSE2-NEXT: jne LBB24_3 -; SSE2-NEXT: LBB24_4: ## %else2 +; SSE2-NEXT: jne LBB25_3 +; SSE2-NEXT: LBB25_4: ## %else2 ; SSE2-NEXT: testb $4, %al -; SSE2-NEXT: jne LBB24_5 -; SSE2-NEXT: LBB24_6: ## %else4 +; SSE2-NEXT: jne LBB25_5 +; SSE2-NEXT: LBB25_6: ## %else4 ; SSE2-NEXT: testb $8, %al -; SSE2-NEXT: jne LBB24_7 -; SSE2-NEXT: LBB24_8: ## %else6 +; SSE2-NEXT: jne LBB25_7 +; SSE2-NEXT: LBB25_8: ## %else6 ; SSE2-NEXT: retq -; SSE2-NEXT: LBB24_1: ## %cond.store +; SSE2-NEXT: LBB25_1: ## %cond.store ; SSE2-NEXT: movss %xmm0, (%rdi) ; SSE2-NEXT: testb $2, %al -; SSE2-NEXT: je LBB24_4 -; SSE2-NEXT: LBB24_3: ## %cond.store1 +; SSE2-NEXT: je LBB25_4 +; SSE2-NEXT: LBB25_3: ## %cond.store1 ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; SSE2-NEXT: movss %xmm1, 4(%rdi) ; SSE2-NEXT: testb $4, %al -; SSE2-NEXT: je LBB24_6 -; SSE2-NEXT: LBB24_5: ## %cond.store3 +; SSE2-NEXT: je LBB25_6 +; SSE2-NEXT: LBB25_5: ## %cond.store3 ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE2-NEXT: movss %xmm1, 8(%rdi) ; SSE2-NEXT: testb $8, %al -; SSE2-NEXT: je LBB24_8 -; SSE2-NEXT: LBB24_7: ## %cond.store5 +; SSE2-NEXT: je LBB25_8 +; SSE2-NEXT: LBB25_7: ## %cond.store5 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: movss %xmm0, 12(%rdi) ; SSE2-NEXT: retq @@ -4765,31 +4833,31 @@ define void 
@one_mask_bit_set1_variable(<4 x float>* %addr, <4 x float> %val, <4 ; SSE4: ## %bb.0: ; SSE4-NEXT: movmskps %xmm1, %eax ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: jne LBB24_1 +; SSE4-NEXT: jne LBB25_1 ; SSE4-NEXT: ## %bb.2: ## %else ; SSE4-NEXT: testb $2, %al -; SSE4-NEXT: jne LBB24_3 -; SSE4-NEXT: LBB24_4: ## %else2 +; SSE4-NEXT: jne LBB25_3 +; SSE4-NEXT: LBB25_4: ## %else2 ; SSE4-NEXT: testb $4, %al -; SSE4-NEXT: jne LBB24_5 -; SSE4-NEXT: LBB24_6: ## %else4 +; SSE4-NEXT: jne LBB25_5 +; SSE4-NEXT: LBB25_6: ## %else4 ; SSE4-NEXT: testb $8, %al -; SSE4-NEXT: jne LBB24_7 -; SSE4-NEXT: LBB24_8: ## %else6 +; SSE4-NEXT: jne LBB25_7 +; SSE4-NEXT: LBB25_8: ## %else6 ; SSE4-NEXT: retq -; SSE4-NEXT: LBB24_1: ## %cond.store +; SSE4-NEXT: LBB25_1: ## %cond.store ; SSE4-NEXT: movss %xmm0, (%rdi) ; SSE4-NEXT: testb $2, %al -; SSE4-NEXT: je LBB24_4 -; SSE4-NEXT: LBB24_3: ## %cond.store1 +; SSE4-NEXT: je LBB25_4 +; SSE4-NEXT: LBB25_3: ## %cond.store1 ; SSE4-NEXT: extractps $1, %xmm0, 4(%rdi) ; SSE4-NEXT: testb $4, %al -; SSE4-NEXT: je LBB24_6 -; SSE4-NEXT: LBB24_5: ## %cond.store3 +; SSE4-NEXT: je LBB25_6 +; SSE4-NEXT: LBB25_5: ## %cond.store3 ; SSE4-NEXT: extractps $2, %xmm0, 8(%rdi) ; SSE4-NEXT: testb $8, %al -; SSE4-NEXT: je LBB24_8 -; SSE4-NEXT: LBB24_7: ## %cond.store5 +; SSE4-NEXT: je LBB25_8 +; SSE4-NEXT: LBB25_7: ## %cond.store5 ; SSE4-NEXT: extractps $3, %xmm0, 12(%rdi) ; SSE4-NEXT: retq ; @@ -4834,25 +4902,25 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) { ; SSE2-NEXT: shlb $2, %cl ; SSE2-NEXT: orb %dl, %cl ; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: jne LBB25_1 +; SSE2-NEXT: jne LBB26_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %cl -; SSE2-NEXT: jne LBB25_3 -; SSE2-NEXT: LBB25_4: ## %else2 +; SSE2-NEXT: jne LBB26_3 +; SSE2-NEXT: LBB26_4: ## %else2 ; SSE2-NEXT: testb $4, %cl -; SSE2-NEXT: jne LBB25_5 -; SSE2-NEXT: LBB25_6: ## %else4 +; SSE2-NEXT: jne LBB26_5 +; SSE2-NEXT: LBB26_6: ## %else4 ; SSE2-NEXT: retq -; SSE2-NEXT: LBB25_1: ## %cond.store +; SSE2-NEXT: LBB26_1: ## %cond.store ; SSE2-NEXT: movd %xmm0, (%rdi) ; SSE2-NEXT: testb $2, %cl -; SSE2-NEXT: je LBB25_4 -; SSE2-NEXT: LBB25_3: ## %cond.store1 +; SSE2-NEXT: je LBB26_4 +; SSE2-NEXT: LBB26_3: ## %cond.store1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: movd %xmm1, 4(%rdi) ; SSE2-NEXT: testb $4, %cl -; SSE2-NEXT: je LBB25_6 -; SSE2-NEXT: LBB25_5: ## %cond.store3 +; SSE2-NEXT: je LBB26_6 +; SSE2-NEXT: LBB26_5: ## %cond.store3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movd %xmm0, 8(%rdi) ; SSE2-NEXT: retq @@ -4867,24 +4935,24 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) { ; SSE4-NEXT: shlb $2, %cl ; SSE4-NEXT: orb %dl, %cl ; SSE4-NEXT: testb $1, %cl -; SSE4-NEXT: jne LBB25_1 +; SSE4-NEXT: jne LBB26_1 ; SSE4-NEXT: ## %bb.2: ## %else ; SSE4-NEXT: testb $2, %cl -; SSE4-NEXT: jne LBB25_3 -; SSE4-NEXT: LBB25_4: ## %else2 +; SSE4-NEXT: jne LBB26_3 +; SSE4-NEXT: LBB26_4: ## %else2 ; SSE4-NEXT: testb $4, %cl -; SSE4-NEXT: jne LBB25_5 -; SSE4-NEXT: LBB25_6: ## %else4 +; SSE4-NEXT: jne LBB26_5 +; SSE4-NEXT: LBB26_6: ## %else4 ; SSE4-NEXT: retq -; SSE4-NEXT: LBB25_1: ## %cond.store +; SSE4-NEXT: LBB26_1: ## %cond.store ; SSE4-NEXT: movss %xmm0, (%rdi) ; SSE4-NEXT: testb $2, %cl -; SSE4-NEXT: je LBB25_4 -; SSE4-NEXT: LBB25_3: ## %cond.store1 +; SSE4-NEXT: je LBB26_4 +; SSE4-NEXT: LBB26_3: ## %cond.store1 ; SSE4-NEXT: extractps $1, %xmm0, 4(%rdi) ; SSE4-NEXT: testb $4, %cl -; SSE4-NEXT: je LBB25_6 -; SSE4-NEXT: LBB25_5: ## %cond.store3 
+; SSE4-NEXT: je LBB26_6 +; SSE4-NEXT: LBB26_5: ## %cond.store3 ; SSE4-NEXT: extractps $2, %xmm0, 8(%rdi) ; SSE4-NEXT: retq ; @@ -4998,68 +5066,68 @@ define void @PR11210(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y, <2 x i64 ; SSE2: ## %bb.0: ; SSE2-NEXT: movmskps %xmm2, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: jne LBB27_1 +; SSE2-NEXT: jne LBB28_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %al -; SSE2-NEXT: jne LBB27_3 -; SSE2-NEXT: LBB27_4: ## %else2 +; SSE2-NEXT: jne LBB28_3 +; SSE2-NEXT: LBB28_4: ## %else2 ; SSE2-NEXT: testb $4, %al -; SSE2-NEXT: jne LBB27_5 -; SSE2-NEXT: LBB27_6: ## %else4 +; SSE2-NEXT: jne LBB28_5 +; SSE2-NEXT: LBB28_6: ## %else4 ; SSE2-NEXT: testb $8, %al -; SSE2-NEXT: jne LBB27_7 -; SSE2-NEXT: LBB27_8: ## %else6 +; SSE2-NEXT: jne LBB28_7 +; SSE2-NEXT: LBB28_8: ## %else6 ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: jne LBB27_9 -; SSE2-NEXT: LBB27_10: ## %else9 +; SSE2-NEXT: jne LBB28_9 +; SSE2-NEXT: LBB28_10: ## %else9 ; SSE2-NEXT: testb $2, %al -; SSE2-NEXT: jne LBB27_11 -; SSE2-NEXT: LBB27_12: ## %else11 +; SSE2-NEXT: jne LBB28_11 +; SSE2-NEXT: LBB28_12: ## %else11 ; SSE2-NEXT: testb $4, %al -; SSE2-NEXT: jne LBB27_13 -; SSE2-NEXT: LBB27_14: ## %else13 +; SSE2-NEXT: jne LBB28_13 +; SSE2-NEXT: LBB28_14: ## %else13 ; SSE2-NEXT: testb $8, %al -; SSE2-NEXT: jne LBB27_15 -; SSE2-NEXT: LBB27_16: ## %else15 +; SSE2-NEXT: jne LBB28_15 +; SSE2-NEXT: LBB28_16: ## %else15 ; SSE2-NEXT: retq -; SSE2-NEXT: LBB27_1: ## %cond.store +; SSE2-NEXT: LBB28_1: ## %cond.store ; SSE2-NEXT: movss %xmm0, (%rdi) ; SSE2-NEXT: testb $2, %al -; SSE2-NEXT: je LBB27_4 -; SSE2-NEXT: LBB27_3: ## %cond.store1 +; SSE2-NEXT: je LBB28_4 +; SSE2-NEXT: LBB28_3: ## %cond.store1 ; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movss %xmm2, 4(%rdi) ; SSE2-NEXT: testb $4, %al -; SSE2-NEXT: je LBB27_6 -; SSE2-NEXT: LBB27_5: ## %cond.store3 +; SSE2-NEXT: je LBB28_6 +; SSE2-NEXT: LBB28_5: ## %cond.store3 ; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE2-NEXT: movss %xmm2, 8(%rdi) ; SSE2-NEXT: testb $8, %al -; SSE2-NEXT: je LBB27_8 -; SSE2-NEXT: LBB27_7: ## %cond.store5 +; SSE2-NEXT: je LBB28_8 +; SSE2-NEXT: LBB28_7: ## %cond.store5 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: movss %xmm0, 12(%rdi) ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB27_10 -; SSE2-NEXT: LBB27_9: ## %cond.store8 +; SSE2-NEXT: je LBB28_10 +; SSE2-NEXT: LBB28_9: ## %cond.store8 ; SSE2-NEXT: movss %xmm1, (%rdi) ; SSE2-NEXT: testb $2, %al -; SSE2-NEXT: je LBB27_12 -; SSE2-NEXT: LBB27_11: ## %cond.store10 +; SSE2-NEXT: je LBB28_12 +; SSE2-NEXT: LBB28_11: ## %cond.store10 ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] ; SSE2-NEXT: movss %xmm0, 4(%rdi) ; SSE2-NEXT: testb $4, %al -; SSE2-NEXT: je LBB27_14 -; SSE2-NEXT: LBB27_13: ## %cond.store12 +; SSE2-NEXT: je LBB28_14 +; SSE2-NEXT: LBB28_13: ## %cond.store12 ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE2-NEXT: movss %xmm0, 8(%rdi) ; SSE2-NEXT: testb $8, %al -; SSE2-NEXT: je LBB27_16 -; SSE2-NEXT: LBB27_15: ## %cond.store14 +; SSE2-NEXT: je LBB28_16 +; SSE2-NEXT: LBB28_15: ## %cond.store14 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE2-NEXT: movss %xmm1, 12(%rdi) ; SSE2-NEXT: retq @@ -5068,59 +5136,59 @@ define void @PR11210(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y, <2 x i64 ; SSE4: ## %bb.0: ; SSE4-NEXT: movmskps %xmm2, %eax ; SSE4-NEXT: testb $1, %al 
-; SSE4-NEXT: jne LBB27_1 +; SSE4-NEXT: jne LBB28_1 ; SSE4-NEXT: ## %bb.2: ## %else ; SSE4-NEXT: testb $2, %al -; SSE4-NEXT: jne LBB27_3 -; SSE4-NEXT: LBB27_4: ## %else2 +; SSE4-NEXT: jne LBB28_3 +; SSE4-NEXT: LBB28_4: ## %else2 ; SSE4-NEXT: testb $4, %al -; SSE4-NEXT: jne LBB27_5 -; SSE4-NEXT: LBB27_6: ## %else4 +; SSE4-NEXT: jne LBB28_5 +; SSE4-NEXT: LBB28_6: ## %else4 ; SSE4-NEXT: testb $8, %al -; SSE4-NEXT: jne LBB27_7 -; SSE4-NEXT: LBB27_8: ## %else6 +; SSE4-NEXT: jne LBB28_7 +; SSE4-NEXT: LBB28_8: ## %else6 ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: jne LBB27_9 -; SSE4-NEXT: LBB27_10: ## %else9 +; SSE4-NEXT: jne LBB28_9 +; SSE4-NEXT: LBB28_10: ## %else9 ; SSE4-NEXT: testb $2, %al -; SSE4-NEXT: jne LBB27_11 -; SSE4-NEXT: LBB27_12: ## %else11 +; SSE4-NEXT: jne LBB28_11 +; SSE4-NEXT: LBB28_12: ## %else11 ; SSE4-NEXT: testb $4, %al -; SSE4-NEXT: jne LBB27_13 -; SSE4-NEXT: LBB27_14: ## %else13 +; SSE4-NEXT: jne LBB28_13 +; SSE4-NEXT: LBB28_14: ## %else13 ; SSE4-NEXT: testb $8, %al -; SSE4-NEXT: jne LBB27_15 -; SSE4-NEXT: LBB27_16: ## %else15 +; SSE4-NEXT: jne LBB28_15 +; SSE4-NEXT: LBB28_16: ## %else15 ; SSE4-NEXT: retq -; SSE4-NEXT: LBB27_1: ## %cond.store +; SSE4-NEXT: LBB28_1: ## %cond.store ; SSE4-NEXT: movss %xmm0, (%rdi) ; SSE4-NEXT: testb $2, %al -; SSE4-NEXT: je LBB27_4 -; SSE4-NEXT: LBB27_3: ## %cond.store1 +; SSE4-NEXT: je LBB28_4 +; SSE4-NEXT: LBB28_3: ## %cond.store1 ; SSE4-NEXT: extractps $1, %xmm0, 4(%rdi) ; SSE4-NEXT: testb $4, %al -; SSE4-NEXT: je LBB27_6 -; SSE4-NEXT: LBB27_5: ## %cond.store3 +; SSE4-NEXT: je LBB28_6 +; SSE4-NEXT: LBB28_5: ## %cond.store3 ; SSE4-NEXT: extractps $2, %xmm0, 8(%rdi) ; SSE4-NEXT: testb $8, %al -; SSE4-NEXT: je LBB27_8 -; SSE4-NEXT: LBB27_7: ## %cond.store5 +; SSE4-NEXT: je LBB28_8 +; SSE4-NEXT: LBB28_7: ## %cond.store5 ; SSE4-NEXT: extractps $3, %xmm0, 12(%rdi) ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je LBB27_10 -; SSE4-NEXT: LBB27_9: ## %cond.store8 +; SSE4-NEXT: je LBB28_10 +; SSE4-NEXT: LBB28_9: ## %cond.store8 ; SSE4-NEXT: movss %xmm1, (%rdi) ; SSE4-NEXT: testb $2, %al -; SSE4-NEXT: je LBB27_12 -; SSE4-NEXT: LBB27_11: ## %cond.store10 +; SSE4-NEXT: je LBB28_12 +; SSE4-NEXT: LBB28_11: ## %cond.store10 ; SSE4-NEXT: extractps $1, %xmm1, 4(%rdi) ; SSE4-NEXT: testb $4, %al -; SSE4-NEXT: je LBB27_14 -; SSE4-NEXT: LBB27_13: ## %cond.store12 +; SSE4-NEXT: je LBB28_14 +; SSE4-NEXT: LBB28_13: ## %cond.store12 ; SSE4-NEXT: extractps $2, %xmm1, 8(%rdi) ; SSE4-NEXT: testb $8, %al -; SSE4-NEXT: je LBB27_16 -; SSE4-NEXT: LBB27_15: ## %cond.store14 +; SSE4-NEXT: je LBB28_16 +; SSE4-NEXT: LBB28_15: ## %cond.store14 ; SSE4-NEXT: extractps $3, %xmm1, 12(%rdi) ; SSE4-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/memcpy-inline-fsrm.ll b/llvm/test/CodeGen/X86/memcpy-inline-fsrm.ll new file mode 100644 index 0000000000000..54f7973dea39a --- /dev/null +++ b/llvm/test/CodeGen/X86/memcpy-inline-fsrm.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-linux-gnu -x86-use-fsrm-for-memcpy -mattr=-fsrm < %s -o - | FileCheck %s --check-prefix=NOFSRM +; RUN: llc -mtriple=x86_64-linux-gnu -x86-use-fsrm-for-memcpy -mattr=+fsrm < %s -o - | FileCheck %s --check-prefix=FSRM +; RUN: llc -mtriple=x86_64-linux-gnu -x86-use-fsrm-for-memcpy -mcpu=haswell < %s | FileCheck %s --check-prefix=NOFSRM +; RUN: llc -mtriple=x86_64-linux-gnu -x86-use-fsrm-for-memcpy -mcpu=icelake-client < %s | FileCheck %s --check-prefix=FSRM +; RUN: llc -mtriple=x86_64-linux-gnu -x86-use-fsrm-for-memcpy -mcpu=icelake-server < %s | 
FileCheck %s --check-prefix=FSRM + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind + +define void @test1(i8* %a, i8* %b, i64 %s) nounwind { +; NOFSRM-LABEL: test1 +; NOFSRM: # %bb.0: +; NOFSRM: jmp memcpy +; +; FSRM-LABEL: test1 +; FSRM: # %bb.0: +; FSRM-NEXT: movq %rdx, %rcx +; FSRM-NEXT: rep;movsb (%rsi), %es:(%rdi) +; FSRM-NEXT: retq + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 %s, i1 0) + ret void +} + +; Check that we don't crash due to a memcpy size type mismatch error ("Cannot +; emit physreg copy instruction") in X86InstrInfo::copyPhysReg. +%struct = type { [4096 x i8] } +declare void @foo(%struct* byval) +define void @test2(%struct* %x) { + call void @foo(%struct* byval %x) + ret void +} diff --git a/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll b/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll index ac55e1a1fc653..a1ad7f3c0f534 100644 --- a/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll +++ b/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll @@ -69,8 +69,8 @@ define dso_local void @test_zero_ext(%struct.Foo* %f, i32 addrspace(271)* %i) { ; CHECK-O0-LABEL: test_zero_ext: ; CHECK-O0: # %bb.0: # %entry ; CHECK-O0-NEXT: movl %edx, %eax -; CHECK-O0-NEXT: # kill: def $rax killed $eax -; CHECK-O0-NEXT: movq %rax, 8(%rcx) +; CHECK-O0-NEXT: movl %eax, %r8d +; CHECK-O0-NEXT: movq %r8, 8(%rcx) ; CHECK-O0-NEXT: jmp use_foo # TAILCALL entry: %0 = addrspacecast i32 addrspace(271)* %i to i32* @@ -125,23 +125,19 @@ entry: ; Test that null can be passed as a 32-bit pointer. define dso_local void @test_null_arg(%struct.Foo* %f) { -; CHECK-LABEL: test_null_arg: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $40, %rsp -; CHECK: xorl %edx, %edx -; CHECK-NEXT: callq test_noop1 -; CHECK-NEXT: nop -; CHECK-NEXT: addq $40, %rsp -; CHECK-NEXT: retq -; -; CHECK-O0-LABEL: test_null_arg: -; CHECK-O0: # %bb.0: # %entry -; CHECK-O0-NEXT: subq $40, %rsp -; CHECK-O0: xorl %edx, %edx -; CHECK-O0-NEXT: callq test_noop1 -; CHECK-O0-NEXT: nop -; CHECK-O0-NEXT: addq $40, %rsp -; CHECK-O0-NEXT: retq +; ALL-LABEL: test_null_arg: +; ALL: # %bb.0: # %entry +; ALL-NEXT: subq $40, %rsp +; ALL-NEXT: .seh_stackalloc 40 +; ALL-NEXT: .seh_endprologue +; ALL-NEXT: xorl %edx, %edx +; ALL-NEXT: callq test_noop1 +; ALL-NEXT: nop +; ALL-NEXT: addq $40, %rsp +; ALL-NEXT: retq +; ALL-NEXT: .seh_handlerdata +; ALL-NEXT: .text +; ALL-NEXT: .seh_endproc entry: call void @test_noop1(%struct.Foo* %f, i32 addrspace(270)* null) ret void @@ -177,8 +173,8 @@ define void @test_unrecognized2(%struct.Foo* %f, i32 addrspace(271)* %i) { ; CHECK-O0-LABEL: test_unrecognized2: ; CHECK-O0: # %bb.0: # %entry ; CHECK-O0-NEXT: movl %edx, %eax -; CHECK-O0-NEXT: # kill: def $rax killed $eax -; CHECK-O0-NEXT: movq %rax, 16(%rcx) +; CHECK-O0-NEXT: movl %eax, %r8d +; CHECK-O0-NEXT: movq %r8, 16(%rcx) ; CHECK-O0-NEXT: jmp use_foo # TAILCALL entry: %0 = addrspacecast i32 addrspace(271)* %i to i32 addrspace(9)* @@ -189,16 +185,11 @@ entry: } define i32 @test_load_sptr32(i32 addrspace(270)* %i) { -; CHECK-LABEL: test_load_sptr32: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movslq %ecx, %rax -; CHECK-NEXT: movl (%rax), %eax -; CHECK-NEXT: retq -; CHECK-O0-LABEL: test_load_sptr32: -; CHECK-O0: # %bb.0: # %entry -; CHECK-O0-NEXT: movslq %ecx, %rax -; CHECK-O0-NEXT: movl (%rax), %eax -; CHECK-O0-NEXT: retq +; ALL-LABEL: test_load_sptr32: +; ALL: # %bb.0: # %entry +; ALL-NEXT: movslq %ecx, %rax +; ALL-NEXT: movl (%rax), %eax +; ALL-NEXT: retq entry: %0 = load i32, i32 addrspace(270)* %i, align 4 ret i32 %0 @@ -210,11 +201,12 @@ define 
i32 @test_load_uptr32(i32 addrspace(271)* %i) { ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movl (%rax), %eax ; CHECK-NEXT: retq +; ; CHECK-O0-LABEL: test_load_uptr32: ; CHECK-O0: # %bb.0: # %entry ; CHECK-O0-NEXT: movl %ecx, %eax -; CHECK-O0-NEXT: # kill: def $rax killed $eax -; CHECK-O0-NEXT: movl (%rax), %eax +; CHECK-O0-NEXT: movl %eax, %edx +; CHECK-O0-NEXT: movl (%rdx), %eax ; CHECK-O0-NEXT: retq entry: %0 = load i32, i32 addrspace(271)* %i, align 4 @@ -222,30 +214,21 @@ entry: } define i32 @test_load_ptr64(i32 addrspace(272)* %i) { -; CHECK-LABEL: test_load_ptr64: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl (%rcx), %eax -; CHECK-NEXT: retq -; CHECK-O0-LABEL: test_load_ptr64: -; CHECK-O0: # %bb.0: # %entry -; CHECK-O0-NEXT: movl (%rcx), %eax -; CHECK-O0-NEXT: retq +; ALL-LABEL: test_load_ptr64: +; ALL: # %bb.0: # %entry +; ALL-NEXT: movl (%rcx), %eax +; ALL-NEXT: retq entry: %0 = load i32, i32 addrspace(272)* %i, align 8 ret i32 %0 } define void @test_store_sptr32(i32 addrspace(270)* %s, i32 %i) { -; CHECK-LABEL: test_store_sptr32: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movslq %ecx, %rax -; CHECK-NEXT: movl %edx, (%rax) -; CHECK-NEXT: retq -; CHECK-O0-LABEL: test_store_sptr32: -; CHECK-O0: # %bb.0: # %entry -; CHECK-O0-NEXT: movslq %ecx, %rax -; CHECK-O0-NEXT: movl %edx, (%rax) -; CHECK-O0-NEXT: retq +; ALL-LABEL: test_store_sptr32: +; ALL: # %bb.0: # %entry +; ALL-NEXT: movslq %ecx, %rax +; ALL-NEXT: movl %edx, (%rax) +; ALL-NEXT: retq entry: store i32 %i, i32 addrspace(270)* %s, align 4 ret void @@ -257,11 +240,12 @@ define void @test_store_uptr32(i32 addrspace(271)* %s, i32 %i) { ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movl %edx, (%rax) ; CHECK-NEXT: retq +; ; CHECK-O0-LABEL: test_store_uptr32: ; CHECK-O0: # %bb.0: # %entry ; CHECK-O0-NEXT: movl %ecx, %eax -; CHECK-O0-NEXT: # kill: def $rax killed $eax -; CHECK-O0-NEXT: movl %edx, (%rax) +; CHECK-O0-NEXT: movl %eax, %r8d +; CHECK-O0-NEXT: movl %edx, (%r8) ; CHECK-O0-NEXT: retq entry: store i32 %i, i32 addrspace(271)* %s, align 4 @@ -269,14 +253,10 @@ entry: } define void @test_store_ptr64(i32 addrspace(272)* %s, i32 %i) { -; CHECK-LABEL: test_store_ptr64: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl %edx, (%rcx) -; CHECK-NEXT: retq -; CHECK-O0-LABEL: test_store_ptr64: -; CHECK-O0: # %bb.0: # %entry -; CHECK-O0-NEXT: movl %edx, (%rcx) -; CHECK-O0-NEXT: retq +; ALL-LABEL: test_store_ptr64: +; ALL: # %bb.0: # %entry +; ALL-NEXT: movl %edx, (%rcx) +; ALL-NEXT: retq entry: store i32 %i, i32 addrspace(272)* %s, align 8 ret void diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll index 2aa88abd2db8c..f44a7cdad3c7a 100644 --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -99,6 +99,7 @@ ; CHECK-NEXT: X86 cmov Conversion ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Machine Natural Loop Construction +; CHECK-NEXT: Machine Block Frequency Analysis ; CHECK-NEXT: Early Machine Loop Invariant Code Motion ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Machine Block Frequency Analysis diff --git a/llvm/test/CodeGen/X86/parity.ll b/llvm/test/CodeGen/X86/parity.ll index 6289ab482426c..4bc225cba5476 100644 --- a/llvm/test/CodeGen/X86/parity.ll +++ b/llvm/test/CodeGen/X86/parity.ll @@ -4,6 +4,187 @@ ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X86-POPCNT ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X64-POPCNT +define i4 @parity_4(i4 
%x) { +; X86-NOPOPCNT-LABEL: parity_4: +; X86-NOPOPCNT: # %bb.0: +; X86-NOPOPCNT-NEXT: testb $15, {{[0-9]+}}(%esp) +; X86-NOPOPCNT-NEXT: setnp %al +; X86-NOPOPCNT-NEXT: retl +; +; X64-NOPOPCNT-LABEL: parity_4: +; X64-NOPOPCNT: # %bb.0: +; X64-NOPOPCNT-NEXT: testb $15, %dil +; X64-NOPOPCNT-NEXT: setnp %al +; X64-NOPOPCNT-NEXT: retq +; +; X86-POPCNT-LABEL: parity_4: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: testb $15, {{[0-9]+}}(%esp) +; X86-POPCNT-NEXT: setnp %al +; X86-POPCNT-NEXT: retl +; +; X64-POPCNT-LABEL: parity_4: +; X64-POPCNT: # %bb.0: +; X64-POPCNT-NEXT: testb $15, %dil +; X64-POPCNT-NEXT: setnp %al +; X64-POPCNT-NEXT: retq + %1 = tail call i4 @llvm.ctpop.i4(i4 %x) + %2 = and i4 %1, 1 + ret i4 %2 +} + +define i8 @parity_8(i8 %x) { +; X86-NOPOPCNT-LABEL: parity_8: +; X86-NOPOPCNT: # %bb.0: +; X86-NOPOPCNT-NEXT: cmpb $0, {{[0-9]+}}(%esp) +; X86-NOPOPCNT-NEXT: setnp %al +; X86-NOPOPCNT-NEXT: retl +; +; X64-NOPOPCNT-LABEL: parity_8: +; X64-NOPOPCNT: # %bb.0: +; X64-NOPOPCNT-NEXT: testb %dil, %dil +; X64-NOPOPCNT-NEXT: setnp %al +; X64-NOPOPCNT-NEXT: retq +; +; X86-POPCNT-LABEL: parity_8: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: cmpb $0, {{[0-9]+}}(%esp) +; X86-POPCNT-NEXT: setnp %al +; X86-POPCNT-NEXT: retl +; +; X64-POPCNT-LABEL: parity_8: +; X64-POPCNT: # %bb.0: +; X64-POPCNT-NEXT: testb %dil, %dil +; X64-POPCNT-NEXT: setnp %al +; X64-POPCNT-NEXT: retq + %1 = tail call i8 @llvm.ctpop.i8(i8 %x) + %2 = and i8 %1, 1 + ret i8 %2 +} + +define i16 @parity_16(i16 %x) { +; X86-NOPOPCNT-LABEL: parity_16: +; X86-NOPOPCNT: # %bb.0: +; X86-NOPOPCNT-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOPOPCNT-NEXT: xorl %eax, %eax +; X86-NOPOPCNT-NEXT: xorb %ch, %cl +; X86-NOPOPCNT-NEXT: setnp %al +; X86-NOPOPCNT-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NOPOPCNT-NEXT: retl +; +; X64-NOPOPCNT-LABEL: parity_16: +; X64-NOPOPCNT: # %bb.0: +; X64-NOPOPCNT-NEXT: movl %edi, %ecx +; X64-NOPOPCNT-NEXT: xorl %eax, %eax +; X64-NOPOPCNT-NEXT: xorb %ch, %cl +; X64-NOPOPCNT-NEXT: setnp %al +; X64-NOPOPCNT-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NOPOPCNT-NEXT: retq +; +; X86-POPCNT-LABEL: parity_16: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: popcntw {{[0-9]+}}(%esp), %ax +; X86-POPCNT-NEXT: andl $1, %eax +; X86-POPCNT-NEXT: # kill: def $ax killed $ax killed $eax +; X86-POPCNT-NEXT: retl +; +; X64-POPCNT-LABEL: parity_16: +; X64-POPCNT: # %bb.0: +; X64-POPCNT-NEXT: popcntw %di, %ax +; X64-POPCNT-NEXT: andl $1, %eax +; X64-POPCNT-NEXT: # kill: def $ax killed $ax killed $eax +; X64-POPCNT-NEXT: retq + %1 = tail call i16 @llvm.ctpop.i16(i16 %x) + %2 = and i16 %1, 1 + ret i16 %2 +} + +define i16 @parity_16_load(i16* %x) { +; X86-NOPOPCNT-LABEL: parity_16_load: +; X86-NOPOPCNT: # %bb.0: +; X86-NOPOPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOPOPCNT-NEXT: movzwl (%eax), %ecx +; X86-NOPOPCNT-NEXT: xorl %eax, %eax +; X86-NOPOPCNT-NEXT: xorb %ch, %cl +; X86-NOPOPCNT-NEXT: setnp %al +; X86-NOPOPCNT-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NOPOPCNT-NEXT: retl +; +; X64-NOPOPCNT-LABEL: parity_16_load: +; X64-NOPOPCNT: # %bb.0: +; X64-NOPOPCNT-NEXT: movzwl (%rdi), %ecx +; X64-NOPOPCNT-NEXT: xorl %eax, %eax +; X64-NOPOPCNT-NEXT: xorb %ch, %cl +; X64-NOPOPCNT-NEXT: setnp %al +; X64-NOPOPCNT-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NOPOPCNT-NEXT: retq +; +; X86-POPCNT-LABEL: parity_16_load: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: popcntw (%eax), %ax +; X86-POPCNT-NEXT: andl $1, %eax +; X86-POPCNT-NEXT: # kill: def $ax killed $ax 
killed $eax +; X86-POPCNT-NEXT: retl +; +; X64-POPCNT-LABEL: parity_16_load: +; X64-POPCNT: # %bb.0: +; X64-POPCNT-NEXT: popcntw (%rdi), %ax +; X64-POPCNT-NEXT: andl $1, %eax +; X64-POPCNT-NEXT: # kill: def $ax killed $ax killed $eax +; X64-POPCNT-NEXT: retq + %1 = load i16, i16* %x + %2 = tail call i16 @llvm.ctpop.i16(i16 %1) + %3 = and i16 %2, 1 + ret i16 %3 +} + +define i17 @parity_17(i17 %x) { +; X86-NOPOPCNT-LABEL: parity_17: +; X86-NOPOPCNT: # %bb.0: +; X86-NOPOPCNT-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOPOPCNT-NEXT: movl %ecx, %eax +; X86-NOPOPCNT-NEXT: andl $131071, %eax # imm = 0x1FFFF +; X86-NOPOPCNT-NEXT: movl %eax, %edx +; X86-NOPOPCNT-NEXT: shrl $16, %edx +; X86-NOPOPCNT-NEXT: xorl %eax, %edx +; X86-NOPOPCNT-NEXT: xorl %eax, %eax +; X86-NOPOPCNT-NEXT: xorb %dl, %ch +; X86-NOPOPCNT-NEXT: setnp %al +; X86-NOPOPCNT-NEXT: retl +; +; X64-NOPOPCNT-LABEL: parity_17: +; X64-NOPOPCNT: # %bb.0: +; X64-NOPOPCNT-NEXT: movl %edi, %eax +; X64-NOPOPCNT-NEXT: andl $131071, %eax # imm = 0x1FFFF +; X64-NOPOPCNT-NEXT: movl %eax, %ecx +; X64-NOPOPCNT-NEXT: shrl $16, %ecx +; X64-NOPOPCNT-NEXT: xorl %eax, %ecx +; X64-NOPOPCNT-NEXT: shrl $8, %edi +; X64-NOPOPCNT-NEXT: xorl %eax, %eax +; X64-NOPOPCNT-NEXT: xorb %cl, %dil +; X64-NOPOPCNT-NEXT: setnp %al +; X64-NOPOPCNT-NEXT: retq +; +; X86-POPCNT-LABEL: parity_17: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: movl $131071, %eax # imm = 0x1FFFF +; X86-POPCNT-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: popcntl %eax, %eax +; X86-POPCNT-NEXT: andl $1, %eax +; X86-POPCNT-NEXT: retl +; +; X64-POPCNT-LABEL: parity_17: +; X64-POPCNT: # %bb.0: +; X64-POPCNT-NEXT: andl $131071, %edi # imm = 0x1FFFF +; X64-POPCNT-NEXT: popcntl %edi, %eax +; X64-POPCNT-NEXT: andl $1, %eax +; X64-POPCNT-NEXT: retq + %1 = tail call i17 @llvm.ctpop.i17(i17 %x) + %2 = and i17 %1, 1 + ret i17 %2 +} + define i32 @parity_32(i32 %x) { ; X86-NOPOPCNT-LABEL: parity_32: ; X86-NOPOPCNT: # %bb.0: @@ -157,14 +338,14 @@ define i8 @parity_32_trunc(i32 %x) { ; X86-POPCNT-LABEL: parity_32_trunc: ; X86-POPCNT: # %bb.0: ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax -; X86-POPCNT-NEXT: andb $1, %al +; X86-POPCNT-NEXT: andl $1, %eax ; X86-POPCNT-NEXT: # kill: def $al killed $al killed $eax ; X86-POPCNT-NEXT: retl ; ; X64-POPCNT-LABEL: parity_32_trunc: ; X64-POPCNT: # %bb.0: ; X64-POPCNT-NEXT: popcntl %edi, %eax -; X64-POPCNT-NEXT: andb $1, %al +; X64-POPCNT-NEXT: andl $1, %eax ; X64-POPCNT-NEXT: # kill: def $al killed $al killed $eax ; X64-POPCNT-NEXT: retq %1 = tail call i32 @llvm.ctpop.i32(i32 %x) @@ -241,5 +422,103 @@ define i32 @parity_8_mask(i32 %x) { ret i32 %c } +define i32 @parity_32_shift(i32 %0) { +; X86-NOPOPCNT-LABEL: parity_32_shift: +; X86-NOPOPCNT: # %bb.0: +; X86-NOPOPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOPOPCNT-NEXT: movl %eax, %ecx +; X86-NOPOPCNT-NEXT: shrl $16, %ecx +; X86-NOPOPCNT-NEXT: xorl %eax, %ecx +; X86-NOPOPCNT-NEXT: xorl %eax, %eax +; X86-NOPOPCNT-NEXT: xorb %ch, %cl +; X86-NOPOPCNT-NEXT: setnp %al +; X86-NOPOPCNT-NEXT: addl %eax, %eax +; X86-NOPOPCNT-NEXT: retl +; +; X64-NOPOPCNT-LABEL: parity_32_shift: +; X64-NOPOPCNT: # %bb.0: +; X64-NOPOPCNT-NEXT: movl %edi, %ecx +; X64-NOPOPCNT-NEXT: shrl $16, %ecx +; X64-NOPOPCNT-NEXT: xorl %edi, %ecx +; X64-NOPOPCNT-NEXT: xorl %eax, %eax +; X64-NOPOPCNT-NEXT: xorb %ch, %cl +; X64-NOPOPCNT-NEXT: setnp %al +; X64-NOPOPCNT-NEXT: addl %eax, %eax +; X64-NOPOPCNT-NEXT: retq +; +; X86-POPCNT-LABEL: parity_32_shift: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: andl 
$1, %eax +; X86-POPCNT-NEXT: addl %eax, %eax +; X86-POPCNT-NEXT: retl +; +; X64-POPCNT-LABEL: parity_32_shift: +; X64-POPCNT: # %bb.0: +; X64-POPCNT-NEXT: popcntl %edi, %eax +; X64-POPCNT-NEXT: andl $1, %eax +; X64-POPCNT-NEXT: addl %eax, %eax +; X64-POPCNT-NEXT: retq + %2 = tail call i32 @llvm.ctpop.i32(i32 %0) + %3 = shl nuw nsw i32 %2, 1 + %4 = and i32 %3, 2 + ret i32 %4 +} + +define i64 @parity_64_shift(i64 %0) { +; X86-NOPOPCNT-LABEL: parity_64_shift: +; X86-NOPOPCNT: # %bb.0: +; X86-NOPOPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOPOPCNT-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-NOPOPCNT-NEXT: movl %eax, %ecx +; X86-NOPOPCNT-NEXT: shrl $16, %ecx +; X86-NOPOPCNT-NEXT: xorl %eax, %ecx +; X86-NOPOPCNT-NEXT: xorl %eax, %eax +; X86-NOPOPCNT-NEXT: xorb %ch, %cl +; X86-NOPOPCNT-NEXT: setnp %al +; X86-NOPOPCNT-NEXT: addl %eax, %eax +; X86-NOPOPCNT-NEXT: xorl %edx, %edx +; X86-NOPOPCNT-NEXT: retl +; +; X64-NOPOPCNT-LABEL: parity_64_shift: +; X64-NOPOPCNT: # %bb.0: +; X64-NOPOPCNT-NEXT: movq %rdi, %rax +; X64-NOPOPCNT-NEXT: shrq $32, %rax +; X64-NOPOPCNT-NEXT: xorl %edi, %eax +; X64-NOPOPCNT-NEXT: movl %eax, %ecx +; X64-NOPOPCNT-NEXT: shrl $16, %ecx +; X64-NOPOPCNT-NEXT: xorl %eax, %ecx +; X64-NOPOPCNT-NEXT: xorl %eax, %eax +; X64-NOPOPCNT-NEXT: xorb %ch, %cl +; X64-NOPOPCNT-NEXT: setnp %al +; X64-NOPOPCNT-NEXT: addq %rax, %rax +; X64-NOPOPCNT-NEXT: retq +; +; X86-POPCNT-LABEL: parity_64_shift: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: popcntl %eax, %eax +; X86-POPCNT-NEXT: andl $1, %eax +; X86-POPCNT-NEXT: addl %eax, %eax +; X86-POPCNT-NEXT: xorl %edx, %edx +; X86-POPCNT-NEXT: retl +; +; X64-POPCNT-LABEL: parity_64_shift: +; X64-POPCNT: # %bb.0: +; X64-POPCNT-NEXT: popcntq %rdi, %rax +; X64-POPCNT-NEXT: andl $1, %eax +; X64-POPCNT-NEXT: addq %rax, %rax +; X64-POPCNT-NEXT: retq + %2 = tail call i64 @llvm.ctpop.i64(i64 %0) + %3 = shl nuw nsw i64 %2, 1 + %4 = and i64 %3, 2 + ret i64 %4 +} + +declare i4 @llvm.ctpop.i4(i4 %x) +declare i8 @llvm.ctpop.i8(i8 %x) +declare i16 @llvm.ctpop.i16(i16 %x) +declare i17 @llvm.ctpop.i17(i17 %x) declare i32 @llvm.ctpop.i32(i32 %x) declare i64 @llvm.ctpop.i64(i64 %x) diff --git a/llvm/test/CodeGen/X86/pr1489.ll b/llvm/test/CodeGen/X86/pr1489.ll index d1148eecb0da9..6226ea6caf90f 100644 --- a/llvm/test/CodeGen/X86/pr1489.ll +++ b/llvm/test/CodeGen/X86/pr1489.ll @@ -16,9 +16,9 @@ define i32 @quux() nounwind { ; CHECK-NEXT: movl $1082126238, (%eax) ## imm = 0x407FEF9E ; CHECK-NEXT: calll _lrintf ; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: setl %al -; CHECK-NEXT: andb $1, %al -; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: setl %cl +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movzbl %cl, %eax ; CHECK-NEXT: addl $8, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl @@ -42,9 +42,9 @@ define i32 @foo() nounwind { ; CHECK-NEXT: movl $-1236950581, (%eax) ## imm = 0xB645A1CB ; CHECK-NEXT: calll _lrint ; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: setl %al -; CHECK-NEXT: andb $1, %al -; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: setl %cl +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movzbl %cl, %eax ; CHECK-NEXT: addl $8, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl @@ -67,9 +67,9 @@ define i32 @bar() nounwind { ; CHECK-NEXT: movl $1082126238, (%eax) ## imm = 0x407FEF9E ; CHECK-NEXT: calll _lrintf ; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: setl %al -; CHECK-NEXT: andb $1, %al -; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: setl %cl +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movzbl 
%cl, %eax ; CHECK-NEXT: addl $8, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl @@ -90,9 +90,9 @@ define i32 @baz() nounwind { ; CHECK-NEXT: movl $1082126238, (%eax) ## imm = 0x407FEF9E ; CHECK-NEXT: calll _lrintf ; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: setl %al -; CHECK-NEXT: andb $1, %al -; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: setl %cl +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movzbl %cl, %eax ; CHECK-NEXT: addl $8, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/pr27591.ll b/llvm/test/CodeGen/X86/pr27591.ll index 7455584ac698a..97ad6814f1926 100644 --- a/llvm/test/CodeGen/X86/pr27591.ll +++ b/llvm/test/CodeGen/X86/pr27591.ll @@ -9,9 +9,9 @@ define void @test1(i32 %x) #0 { ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: cmpl $0, %edi ; CHECK-NEXT: setne %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: movl %eax, %edi +; CHECK-NEXT: movzbl %al, %ecx +; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: movl %ecx, %edi ; CHECK-NEXT: callq callee1 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -27,10 +27,10 @@ define void @test2(i32 %x) #0 { ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: cmpl $0, %edi ; CHECK-NEXT: setne %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: negl %eax -; CHECK-NEXT: movl %eax, %edi +; CHECK-NEXT: movzbl %al, %ecx +; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: negl %ecx +; CHECK-NEXT: movl %ecx, %edi ; CHECK-NEXT: callq callee2 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr30430.ll b/llvm/test/CodeGen/X86/pr30430.ll index e524245daa112..4d40aa09eeab1 100644 --- a/llvm/test/CodeGen/X86/pr30430.ll +++ b/llvm/test/CodeGen/X86/pr30430.ll @@ -75,28 +75,28 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; CHECK-NEXT: # implicit-def: $ymm2 ; CHECK-NEXT: vmovaps %xmm1, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] ; CHECK-NEXT: # 
implicit-def: $ymm3 -; CHECK-NEXT: vmovaps %xmm2, %xmm3 -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; CHECK-NEXT: # implicit-def: $zmm2 -; CHECK-NEXT: vmovaps %ymm1, %ymm2 -; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 -; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %xmm1, %xmm3 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 +; CHECK-NEXT: # implicit-def: $zmm24 +; CHECK-NEXT: vmovaps %zmm3, %zmm24 +; CHECK-NEXT: vinsertf64x4 $1, %ymm2, %zmm24, %zmm24 +; CHECK-NEXT: vmovaps %zmm24, {{[0-9]+}}(%rsp) ; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0 ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/pr30813.ll b/llvm/test/CodeGen/X86/pr30813.ll index 7266c5bd8d015..e3e096bda6c28 100644 --- a/llvm/test/CodeGen/X86/pr30813.ll +++ b/llvm/test/CodeGen/X86/pr30813.ll @@ -1,8 +1,9 @@ ; RUN: llc -mtriple=x86_64-linux-gnu -O0 %s -o - | FileCheck %s ; CHECK: patatino: ; CHECK: .cfi_startproc -; CHECK: movzwl (%rax), %e[[REG0:[abcd]x]] -; CHECK: movq %r[[REG0]], ({{%r[abcd]x}}) +; CHECK: movzwl (%rax), [[REG0:%e[abcd]x]] +; CHECK: movl [[REG0]], %e[[REG1C:[abcd]]]x +; CHECK: movq %r[[REG1C]]x, ({{%r[abcd]x}}) ; CHECK: retq define void @patatino() { diff --git a/llvm/test/CodeGen/X86/pr32241.ll b/llvm/test/CodeGen/X86/pr32241.ll index 1f3d273dfc416..6d628e6962eda 100644 --- a/llvm/test/CodeGen/X86/pr32241.ll +++ b/llvm/test/CodeGen/X86/pr32241.ll @@ -23,14 +23,14 @@ define i32 @_Z3foov() { ; CHECK-NEXT: .LBB0_2: # %lor.end ; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload ; CHECK-NEXT: andb $1, %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: cmpl %eax, %ecx +; CHECK-NEXT: movzbl %al, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; CHECK-NEXT: cmpl %ecx, %edx ; CHECK-NEXT: setl %al ; CHECK-NEXT: andb $1, %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: xorl $-1, %eax -; CHECK-NEXT: cmpl $0, %eax +; CHECK-NEXT: movzbl %al, %ecx +; CHECK-NEXT: xorl $-1, %ecx +; CHECK-NEXT: cmpl $0, %ecx ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; CHECK-NEXT: jne .LBB0_4 @@ -42,9 +42,9 @@ define i32 @_Z3foov() { ; CHECK-NEXT: .LBB0_4: # %lor.end5 ; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload ; CHECK-NEXT: andb $1, %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: movzbl %al, %ecx +; CHECK-NEXT: # kill: def $cx killed $cx killed $ecx +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%esp) ; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: addl $16, %esp ; CHECK-NEXT: .cfi_def_cfa_offset 4 diff --git a/llvm/test/CodeGen/X86/pr32284.ll b/llvm/test/CodeGen/X86/pr32284.ll index 533473663d73b..a1041ab889c23 100644 --- a/llvm/test/CodeGen/X86/pr32284.ll +++ b/llvm/test/CodeGen/X86/pr32284.ll @@ -10,28 +10,28 @@ define void @foo() { ; X86-O0-LABEL: foo: ; X86-O0: # %bb.0: # %entry ; X86-O0-NEXT: xorl %eax, %eax -; X86-O0-NEXT: # kill: def $rax killed $eax -; X86-O0-NEXT: xorl %ecx, %ecx +; X86-O0-NEXT: movl %eax, %ecx +; X86-O0-NEXT: xorl %eax, %eax ; X86-O0-NEXT: movzbl c, %edx -; X86-O0-NEXT: subl %edx, %ecx -; X86-O0-NEXT: movslq %ecx, %rcx -; X86-O0-NEXT: subq %rcx, %rax -; X86-O0-NEXT: # kill: def $al killed $al killed $rax -; X86-O0-NEXT: cmpb $0, %al -; X86-O0-NEXT: setne %al -; X86-O0-NEXT: andb $1, %al -; X86-O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; X86-O0-NEXT: subl %edx, %eax +; 
X86-O0-NEXT: movslq %eax, %rsi +; X86-O0-NEXT: subq %rsi, %rcx +; X86-O0-NEXT: # kill: def $cl killed $cl killed $rcx +; X86-O0-NEXT: cmpb $0, %cl +; X86-O0-NEXT: setne %cl +; X86-O0-NEXT: andb $1, %cl +; X86-O0-NEXT: movb %cl, -{{[0-9]+}}(%rsp) ; X86-O0-NEXT: cmpb $0, c -; X86-O0-NEXT: setne %al -; X86-O0-NEXT: xorb $-1, %al -; X86-O0-NEXT: xorb $-1, %al -; X86-O0-NEXT: andb $1, %al -; X86-O0-NEXT: movzbl %al, %eax -; X86-O0-NEXT: movzbl c, %ecx -; X86-O0-NEXT: cmpl %ecx, %eax -; X86-O0-NEXT: setle %al -; X86-O0-NEXT: andb $1, %al -; X86-O0-NEXT: movzbl %al, %eax +; X86-O0-NEXT: setne %cl +; X86-O0-NEXT: xorb $-1, %cl +; X86-O0-NEXT: xorb $-1, %cl +; X86-O0-NEXT: andb $1, %cl +; X86-O0-NEXT: movzbl %cl, %eax +; X86-O0-NEXT: movzbl c, %edx +; X86-O0-NEXT: cmpl %edx, %eax +; X86-O0-NEXT: setle %cl +; X86-O0-NEXT: andb $1, %cl +; X86-O0-NEXT: movzbl %cl, %eax ; X86-O0-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ; X86-O0-NEXT: retq ; @@ -63,13 +63,13 @@ define void @foo() { ; 686-O0-NEXT: xorb $-1, %al ; 686-O0-NEXT: xorb $-1, %al ; 686-O0-NEXT: andb $1, %al -; 686-O0-NEXT: movzbl %al, %eax -; 686-O0-NEXT: movzbl c, %ecx -; 686-O0-NEXT: cmpl %ecx, %eax +; 686-O0-NEXT: movzbl %al, %ecx +; 686-O0-NEXT: movzbl c, %edx +; 686-O0-NEXT: cmpl %edx, %ecx ; 686-O0-NEXT: setle %al ; 686-O0-NEXT: andb $1, %al -; 686-O0-NEXT: movzbl %al, %eax -; 686-O0-NEXT: movl %eax, (%esp) +; 686-O0-NEXT: movzbl %al, %ecx +; 686-O0-NEXT: movl %ecx, (%esp) ; 686-O0-NEXT: addl $8, %esp ; 686-O0-NEXT: .cfi_def_cfa_offset 4 ; 686-O0-NEXT: retl @@ -126,33 +126,33 @@ define void @f1() { ; X86-O0-NEXT: movabsq $8381627093, %rcx # imm = 0x1F3957AD5 ; X86-O0-NEXT: addq %rcx, %rax ; X86-O0-NEXT: cmpq $0, %rax -; X86-O0-NEXT: setne %al -; X86-O0-NEXT: andb $1, %al -; X86-O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; X86-O0-NEXT: movl var_5, %eax -; X86-O0-NEXT: xorl $-1, %eax -; X86-O0-NEXT: cmpl $0, %eax -; X86-O0-NEXT: setne %al -; X86-O0-NEXT: xorb $-1, %al -; X86-O0-NEXT: andb $1, %al -; X86-O0-NEXT: movzbl %al, %eax -; X86-O0-NEXT: # kill: def $rax killed $eax +; X86-O0-NEXT: setne %dl +; X86-O0-NEXT: andb $1, %dl +; X86-O0-NEXT: movb %dl, -{{[0-9]+}}(%rsp) +; X86-O0-NEXT: movl var_5, %esi +; X86-O0-NEXT: xorl $-1, %esi +; X86-O0-NEXT: cmpl $0, %esi +; X86-O0-NEXT: setne %dl +; X86-O0-NEXT: xorb $-1, %dl +; X86-O0-NEXT: andb $1, %dl +; X86-O0-NEXT: movzbl %dl, %esi +; X86-O0-NEXT: movl %esi, %eax ; X86-O0-NEXT: movslq var_5, %rcx ; X86-O0-NEXT: addq $7093, %rcx # imm = 0x1BB5 ; X86-O0-NEXT: cmpq %rcx, %rax -; X86-O0-NEXT: setg %al -; X86-O0-NEXT: andb $1, %al -; X86-O0-NEXT: movzbl %al, %eax -; X86-O0-NEXT: # kill: def $rax killed $eax +; X86-O0-NEXT: setg %dl +; X86-O0-NEXT: andb $1, %dl +; X86-O0-NEXT: movzbl %dl, %esi +; X86-O0-NEXT: movl %esi, %eax ; X86-O0-NEXT: movq %rax, var_57 -; X86-O0-NEXT: movl var_5, %eax -; X86-O0-NEXT: xorl $-1, %eax -; X86-O0-NEXT: cmpl $0, %eax -; X86-O0-NEXT: setne %al -; X86-O0-NEXT: xorb $-1, %al -; X86-O0-NEXT: andb $1, %al -; X86-O0-NEXT: movzbl %al, %eax -; X86-O0-NEXT: # kill: def $rax killed $eax +; X86-O0-NEXT: movl var_5, %esi +; X86-O0-NEXT: xorl $-1, %esi +; X86-O0-NEXT: cmpl $0, %esi +; X86-O0-NEXT: setne %dl +; X86-O0-NEXT: xorb $-1, %dl +; X86-O0-NEXT: andb $1, %dl +; X86-O0-NEXT: movzbl %dl, %esi +; X86-O0-NEXT: movl %esi, %eax ; X86-O0-NEXT: movq %rax, _ZN8struct_210member_2_0E ; X86-O0-NEXT: retq ; @@ -178,17 +178,20 @@ define void @f1() { ; ; 686-O0-LABEL: f1: ; 686-O0: # %bb.0: # %entry -; 686-O0-NEXT: pushl %ebx +; 686-O0-NEXT: pushl %ebp ; 686-O0-NEXT: .cfi_def_cfa_offset 8 -; 
686-O0-NEXT: pushl %edi +; 686-O0-NEXT: pushl %ebx ; 686-O0-NEXT: .cfi_def_cfa_offset 12 -; 686-O0-NEXT: pushl %esi +; 686-O0-NEXT: pushl %edi ; 686-O0-NEXT: .cfi_def_cfa_offset 16 +; 686-O0-NEXT: pushl %esi +; 686-O0-NEXT: .cfi_def_cfa_offset 20 ; 686-O0-NEXT: subl $1, %esp -; 686-O0-NEXT: .cfi_def_cfa_offset 17 -; 686-O0-NEXT: .cfi_offset %esi, -16 -; 686-O0-NEXT: .cfi_offset %edi, -12 -; 686-O0-NEXT: .cfi_offset %ebx, -8 +; 686-O0-NEXT: .cfi_def_cfa_offset 21 +; 686-O0-NEXT: .cfi_offset %esi, -20 +; 686-O0-NEXT: .cfi_offset %edi, -16 +; 686-O0-NEXT: .cfi_offset %ebx, -12 +; 686-O0-NEXT: .cfi_offset %ebp, -8 ; 686-O0-NEXT: movl var_5, %eax ; 686-O0-NEXT: movl %eax, %ecx ; 686-O0-NEXT: sarl $31, %ecx @@ -214,16 +217,18 @@ define void @f1() { ; 686-O0-NEXT: movl var_5, %edi ; 686-O0-NEXT: subl $-1, %edi ; 686-O0-NEXT: sete %bl -; 686-O0-NEXT: movzbl %bl, %ebx -; 686-O0-NEXT: movl %ebx, _ZN8struct_210member_2_0E +; 686-O0-NEXT: movzbl %bl, %ebp +; 686-O0-NEXT: movl %ebp, _ZN8struct_210member_2_0E ; 686-O0-NEXT: movl $0, _ZN8struct_210member_2_0E+4 ; 686-O0-NEXT: addl $1, %esp -; 686-O0-NEXT: .cfi_def_cfa_offset 16 +; 686-O0-NEXT: .cfi_def_cfa_offset 20 ; 686-O0-NEXT: popl %esi -; 686-O0-NEXT: .cfi_def_cfa_offset 12 +; 686-O0-NEXT: .cfi_def_cfa_offset 16 ; 686-O0-NEXT: popl %edi -; 686-O0-NEXT: .cfi_def_cfa_offset 8 +; 686-O0-NEXT: .cfi_def_cfa_offset 12 ; 686-O0-NEXT: popl %ebx +; 686-O0-NEXT: .cfi_def_cfa_offset 8 +; 686-O0-NEXT: popl %ebp ; 686-O0-NEXT: .cfi_def_cfa_offset 4 ; 686-O0-NEXT: retl ; @@ -305,25 +310,25 @@ define void @f2() { ; X86-O0-NEXT: setne %cl ; X86-O0-NEXT: xorb $-1, %cl ; X86-O0-NEXT: andb $1, %cl -; X86-O0-NEXT: movzbl %cl, %ecx -; X86-O0-NEXT: xorl %ecx, %eax +; X86-O0-NEXT: movzbl %cl, %edx +; X86-O0-NEXT: xorl %edx, %eax ; X86-O0-NEXT: # kill: def $ax killed $ax killed $eax ; X86-O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; X86-O0-NEXT: movzbl var_7, %eax -; X86-O0-NEXT: # kill: def $ax killed $ax killed $eax -; X86-O0-NEXT: cmpw $0, %ax -; X86-O0-NEXT: setne %al -; X86-O0-NEXT: xorb $-1, %al -; X86-O0-NEXT: andb $1, %al -; X86-O0-NEXT: movzbl %al, %eax -; X86-O0-NEXT: movzbl var_7, %ecx -; X86-O0-NEXT: cmpl %ecx, %eax -; X86-O0-NEXT: sete %al -; X86-O0-NEXT: andb $1, %al -; X86-O0-NEXT: movzbl %al, %eax -; X86-O0-NEXT: # kill: def $ax killed $ax killed $eax -; X86-O0-NEXT: # implicit-def: $rcx -; X86-O0-NEXT: movw %ax, (%rcx) +; X86-O0-NEXT: movzbl var_7, %edx +; X86-O0-NEXT: # kill: def $dx killed $dx killed $edx +; X86-O0-NEXT: cmpw $0, %dx +; X86-O0-NEXT: setne %cl +; X86-O0-NEXT: xorb $-1, %cl +; X86-O0-NEXT: andb $1, %cl +; X86-O0-NEXT: movzbl %cl, %esi +; X86-O0-NEXT: movzbl var_7, %edi +; X86-O0-NEXT: cmpl %edi, %esi +; X86-O0-NEXT: sete %cl +; X86-O0-NEXT: andb $1, %cl +; X86-O0-NEXT: movzbl %cl, %esi +; X86-O0-NEXT: # kill: def $si killed $si killed $esi +; X86-O0-NEXT: # implicit-def: $r8 +; X86-O0-NEXT: movw %si, (%r8) ; X86-O0-NEXT: retq ; ; X64-LABEL: f2: @@ -345,33 +350,43 @@ define void @f2() { ; ; 686-O0-LABEL: f2: ; 686-O0: # %bb.0: # %entry +; 686-O0-NEXT: pushl %edi +; 686-O0-NEXT: .cfi_def_cfa_offset 8 +; 686-O0-NEXT: pushl %esi +; 686-O0-NEXT: .cfi_def_cfa_offset 12 ; 686-O0-NEXT: subl $2, %esp -; 686-O0-NEXT: .cfi_def_cfa_offset 6 +; 686-O0-NEXT: .cfi_def_cfa_offset 14 +; 686-O0-NEXT: .cfi_offset %esi, -12 +; 686-O0-NEXT: .cfi_offset %edi, -8 ; 686-O0-NEXT: movzbl var_7, %eax ; 686-O0-NEXT: cmpb $0, var_7 ; 686-O0-NEXT: setne %cl ; 686-O0-NEXT: xorb $-1, %cl ; 686-O0-NEXT: andb $1, %cl -; 686-O0-NEXT: movzbl %cl, %ecx -; 686-O0-NEXT: xorl %ecx, 
%eax +; 686-O0-NEXT: movzbl %cl, %edx +; 686-O0-NEXT: xorl %edx, %eax ; 686-O0-NEXT: # kill: def $ax killed $ax killed $eax ; 686-O0-NEXT: movw %ax, (%esp) -; 686-O0-NEXT: movzbl var_7, %eax -; 686-O0-NEXT: # kill: def $ax killed $ax killed $eax -; 686-O0-NEXT: cmpw $0, %ax -; 686-O0-NEXT: setne %al -; 686-O0-NEXT: xorb $-1, %al -; 686-O0-NEXT: andb $1, %al -; 686-O0-NEXT: movzbl %al, %eax -; 686-O0-NEXT: movzbl var_7, %ecx -; 686-O0-NEXT: cmpl %ecx, %eax -; 686-O0-NEXT: sete %al -; 686-O0-NEXT: andb $1, %al -; 686-O0-NEXT: movzbl %al, %eax -; 686-O0-NEXT: # kill: def $ax killed $ax killed $eax -; 686-O0-NEXT: # implicit-def: $ecx -; 686-O0-NEXT: movw %ax, (%ecx) +; 686-O0-NEXT: movzbl var_7, %edx +; 686-O0-NEXT: # kill: def $dx killed $dx killed $edx +; 686-O0-NEXT: cmpw $0, %dx +; 686-O0-NEXT: setne %cl +; 686-O0-NEXT: xorb $-1, %cl +; 686-O0-NEXT: andb $1, %cl +; 686-O0-NEXT: movzbl %cl, %esi +; 686-O0-NEXT: movzbl var_7, %edi +; 686-O0-NEXT: cmpl %edi, %esi +; 686-O0-NEXT: sete %cl +; 686-O0-NEXT: andb $1, %cl +; 686-O0-NEXT: movzbl %cl, %esi +; 686-O0-NEXT: # kill: def $si killed $si killed $esi +; 686-O0-NEXT: # implicit-def: $edi +; 686-O0-NEXT: movw %si, (%edi) ; 686-O0-NEXT: addl $2, %esp +; 686-O0-NEXT: .cfi_def_cfa_offset 12 +; 686-O0-NEXT: popl %esi +; 686-O0-NEXT: .cfi_def_cfa_offset 8 +; 686-O0-NEXT: popl %edi ; 686-O0-NEXT: .cfi_def_cfa_offset 4 ; 686-O0-NEXT: retl ; @@ -431,35 +446,35 @@ define void @f3() #0 { ; X86-O0-NEXT: movl var_13, %eax ; X86-O0-NEXT: xorl $-1, %eax ; X86-O0-NEXT: movl %eax, %eax -; X86-O0-NEXT: # kill: def $rax killed $eax +; X86-O0-NEXT: movl %eax, %ecx ; X86-O0-NEXT: cmpl $0, var_13 -; X86-O0-NEXT: setne %cl -; X86-O0-NEXT: xorb $-1, %cl -; X86-O0-NEXT: andb $1, %cl -; X86-O0-NEXT: movzbl %cl, %ecx -; X86-O0-NEXT: # kill: def $rcx killed $ecx -; X86-O0-NEXT: movl var_13, %edx -; X86-O0-NEXT: xorl $-1, %edx -; X86-O0-NEXT: xorl var_16, %edx -; X86-O0-NEXT: movl %edx, %edx -; X86-O0-NEXT: # kill: def $rdx killed $edx -; X86-O0-NEXT: andq %rdx, %rcx -; X86-O0-NEXT: orq %rcx, %rax -; X86-O0-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X86-O0-NEXT: setne %dl +; X86-O0-NEXT: xorb $-1, %dl +; X86-O0-NEXT: andb $1, %dl +; X86-O0-NEXT: movzbl %dl, %eax +; X86-O0-NEXT: movl %eax, %esi ; X86-O0-NEXT: movl var_13, %eax ; X86-O0-NEXT: xorl $-1, %eax +; X86-O0-NEXT: xorl var_16, %eax ; X86-O0-NEXT: movl %eax, %eax -; X86-O0-NEXT: # kill: def $rax killed $eax +; X86-O0-NEXT: movl %eax, %edi +; X86-O0-NEXT: andq %rdi, %rsi +; X86-O0-NEXT: orq %rsi, %rcx +; X86-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X86-O0-NEXT: movl var_13, %eax +; X86-O0-NEXT: xorl $-1, %eax +; X86-O0-NEXT: movl %eax, %eax +; X86-O0-NEXT: movl %eax, %ecx ; X86-O0-NEXT: cmpl $0, var_13 -; X86-O0-NEXT: setne %cl -; X86-O0-NEXT: xorb $-1, %cl -; X86-O0-NEXT: andb $1, %cl -; X86-O0-NEXT: movzbl %cl, %ecx -; X86-O0-NEXT: # kill: def $rcx killed $ecx -; X86-O0-NEXT: andq $0, %rcx -; X86-O0-NEXT: orq %rcx, %rax -; X86-O0-NEXT: # kill: def $eax killed $eax killed $rax -; X86-O0-NEXT: movl %eax, var_46 +; X86-O0-NEXT: setne %dl +; X86-O0-NEXT: xorb $-1, %dl +; X86-O0-NEXT: andb $1, %dl +; X86-O0-NEXT: movzbl %dl, %eax +; X86-O0-NEXT: movl %eax, %esi +; X86-O0-NEXT: andq $0, %rsi +; X86-O0-NEXT: orq %rsi, %rcx +; X86-O0-NEXT: # kill: def $ecx killed $ecx killed $rcx +; X86-O0-NEXT: movl %ecx, var_46 ; X86-O0-NEXT: retq ; ; X64-LABEL: f3: @@ -484,28 +499,31 @@ define void @f3() #0 { ; 686-O0-NEXT: .cfi_offset %ebp, -8 ; 686-O0-NEXT: movl %esp, %ebp ; 686-O0-NEXT: .cfi_def_cfa_register %ebp +; 686-O0-NEXT: pushl 
%edi ; 686-O0-NEXT: pushl %esi ; 686-O0-NEXT: andl $-8, %esp -; 686-O0-NEXT: subl $16, %esp -; 686-O0-NEXT: .cfi_offset %esi, -12 +; 686-O0-NEXT: subl $8, %esp +; 686-O0-NEXT: .cfi_offset %esi, -16 +; 686-O0-NEXT: .cfi_offset %edi, -12 ; 686-O0-NEXT: movl var_13, %eax ; 686-O0-NEXT: movl %eax, %ecx ; 686-O0-NEXT: notl %ecx ; 686-O0-NEXT: testl %eax, %eax -; 686-O0-NEXT: sete %al -; 686-O0-NEXT: movzbl %al, %eax -; 686-O0-NEXT: movl var_16, %edx -; 686-O0-NEXT: movl %ecx, %esi -; 686-O0-NEXT: xorl %edx, %esi -; 686-O0-NEXT: andl %esi, %eax +; 686-O0-NEXT: sete %dl +; 686-O0-NEXT: movzbl %dl, %eax +; 686-O0-NEXT: movl var_16, %esi +; 686-O0-NEXT: movl %ecx, %edi +; 686-O0-NEXT: xorl %esi, %edi +; 686-O0-NEXT: andl %edi, %eax ; 686-O0-NEXT: orl %eax, %ecx ; 686-O0-NEXT: movl %ecx, (%esp) ; 686-O0-NEXT: movl $0, {{[0-9]+}}(%esp) ; 686-O0-NEXT: movl var_13, %eax ; 686-O0-NEXT: notl %eax ; 686-O0-NEXT: movl %eax, var_46 -; 686-O0-NEXT: leal -4(%ebp), %esp +; 686-O0-NEXT: leal -8(%ebp), %esp ; 686-O0-NEXT: popl %esi +; 686-O0-NEXT: popl %edi ; 686-O0-NEXT: popl %ebp ; 686-O0-NEXT: .cfi_def_cfa %esp, 4 ; 686-O0-NEXT: retl diff --git a/llvm/test/CodeGen/X86/pr32340.ll b/llvm/test/CodeGen/X86/pr32340.ll index 98685b959f642..1e428ac7d83a6 100644 --- a/llvm/test/CodeGen/X86/pr32340.ll +++ b/llvm/test/CodeGen/X86/pr32340.ll @@ -14,37 +14,37 @@ define void @foo() { ; X64-LABEL: foo: ; X64: # %bb.0: # %entry ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: # kill: def $rax killed $eax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: movw $0, var_825 -; X64-NEXT: movzwl var_32, %ecx +; X64-NEXT: movzwl var_32, %eax ; X64-NEXT: movzwl var_901, %edx -; X64-NEXT: movl %ecx, %esi +; X64-NEXT: movl %eax, %esi ; X64-NEXT: xorl %edx, %esi -; X64-NEXT: movl %ecx, %edx +; X64-NEXT: movl %eax, %edx ; X64-NEXT: xorl %esi, %edx -; X64-NEXT: addl %ecx, %edx -; X64-NEXT: movslq %edx, %rcx -; X64-NEXT: movq %rcx, var_826 -; X64-NEXT: movzwl var_32, %ecx -; X64-NEXT: # kill: def $rcx killed $ecx -; X64-NEXT: movzwl var_901, %edx -; X64-NEXT: xorl $51981, %edx # imm = 0xCB0D -; X64-NEXT: movslq %edx, %rdx -; X64-NEXT: movabsq $-1142377792914660288, %rsi # imm = 0xF02575732E06E440 -; X64-NEXT: xorq %rsi, %rdx -; X64-NEXT: movq %rcx, %rsi -; X64-NEXT: xorq %rdx, %rsi -; X64-NEXT: xorq $-1, %rsi -; X64-NEXT: xorq %rsi, %rcx -; X64-NEXT: movq %rcx, %rdx -; X64-NEXT: orq var_57, %rdx -; X64-NEXT: orq %rdx, %rcx -; X64-NEXT: # kill: def $cx killed $cx killed $rcx -; X64-NEXT: movw %cx, var_900 -; X64-NEXT: cmpq var_28, %rax -; X64-NEXT: setne %al -; X64-NEXT: andb $1, %al -; X64-NEXT: movzbl %al, %eax +; X64-NEXT: addl %eax, %edx +; X64-NEXT: movslq %edx, %rdi +; X64-NEXT: movq %rdi, var_826 +; X64-NEXT: movzwl var_32, %eax +; X64-NEXT: movl %eax, %edi +; X64-NEXT: movzwl var_901, %eax +; X64-NEXT: xorl $51981, %eax # imm = 0xCB0D +; X64-NEXT: movslq %eax, %r8 +; X64-NEXT: movabsq $-1142377792914660288, %r9 # imm = 0xF02575732E06E440 +; X64-NEXT: xorq %r9, %r8 +; X64-NEXT: movq %rdi, %r9 +; X64-NEXT: xorq %r8, %r9 +; X64-NEXT: xorq $-1, %r9 +; X64-NEXT: xorq %r9, %rdi +; X64-NEXT: movq %rdi, %r8 +; X64-NEXT: orq var_57, %r8 +; X64-NEXT: orq %r8, %rdi +; X64-NEXT: # kill: def $di killed $di killed $rdi +; X64-NEXT: movw %di, var_900 +; X64-NEXT: cmpq var_28, %rcx +; X64-NEXT: setne %r10b +; X64-NEXT: andb $1, %r10b +; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: movw %ax, var_827 ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr32345.ll b/llvm/test/CodeGen/X86/pr32345.ll index 
165e0292d4648..d5f7fde77f6d2 100644 --- a/llvm/test/CodeGen/X86/pr32345.ll +++ b/llvm/test/CodeGen/X86/pr32345.ll @@ -15,23 +15,23 @@ define void @foo() { ; X640-NEXT: xorl %ecx, %eax ; X640-NEXT: movzwl var_27, %ecx ; X640-NEXT: xorl %ecx, %eax -; X640-NEXT: cltq -; X640-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X640-NEXT: movslq %eax, %rdx +; X640-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; X640-NEXT: movzwl var_22, %eax ; X640-NEXT: movzwl var_27, %ecx ; X640-NEXT: xorl %ecx, %eax ; X640-NEXT: movzwl var_27, %ecx ; X640-NEXT: xorl %ecx, %eax -; X640-NEXT: cltq -; X640-NEXT: movzwl var_27, %ecx -; X640-NEXT: subl $16610, %ecx # imm = 0x40E2 -; X640-NEXT: movl %ecx, %ecx -; X640-NEXT: # kill: def $rcx killed $ecx +; X640-NEXT: movslq %eax, %rdx +; X640-NEXT: movzwl var_27, %eax +; X640-NEXT: subl $16610, %eax # imm = 0x40E2 +; X640-NEXT: movl %eax, %eax +; X640-NEXT: movl %eax, %ecx ; X640-NEXT: # kill: def $cl killed $rcx -; X640-NEXT: sarq %cl, %rax -; X640-NEXT: # kill: def $al killed $al killed $rax -; X640-NEXT: # implicit-def: $rcx -; X640-NEXT: movb %al, (%rcx) +; X640-NEXT: sarq %cl, %rdx +; X640-NEXT: # kill: def $dl killed $dl killed $rdx +; X640-NEXT: # implicit-def: $rsi +; X640-NEXT: movb %dl, (%rsi) ; X640-NEXT: retq ; ; 6860-LABEL: foo: @@ -41,37 +41,43 @@ define void @foo() { ; 6860-NEXT: .cfi_offset %ebp, -8 ; 6860-NEXT: movl %esp, %ebp ; 6860-NEXT: .cfi_def_cfa_register %ebp +; 6860-NEXT: pushl %ebx +; 6860-NEXT: pushl %edi +; 6860-NEXT: pushl %esi ; 6860-NEXT: andl $-8, %esp -; 6860-NEXT: subl $24, %esp +; 6860-NEXT: subl $32, %esp +; 6860-NEXT: .cfi_offset %esi, -20 +; 6860-NEXT: .cfi_offset %edi, -16 +; 6860-NEXT: .cfi_offset %ebx, -12 ; 6860-NEXT: movw var_22, %ax ; 6860-NEXT: movzwl var_27, %ecx ; 6860-NEXT: movw %cx, %dx ; 6860-NEXT: xorw %dx, %ax -; 6860-NEXT: # implicit-def: $edx -; 6860-NEXT: movw %ax, %dx -; 6860-NEXT: xorl %ecx, %edx -; 6860-NEXT: # kill: def $dx killed $dx killed $edx -; 6860-NEXT: movzwl %dx, %eax -; 6860-NEXT: movl %eax, {{[0-9]+}}(%esp) +; 6860-NEXT: # implicit-def: $esi +; 6860-NEXT: movw %ax, %si +; 6860-NEXT: xorl %ecx, %esi +; 6860-NEXT: # kill: def $si killed $si killed $esi +; 6860-NEXT: movzwl %si, %ecx +; 6860-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; 6860-NEXT: movl $0, {{[0-9]+}}(%esp) ; 6860-NEXT: movw var_22, %ax ; 6860-NEXT: movzwl var_27, %ecx ; 6860-NEXT: movw %cx, %dx ; 6860-NEXT: xorw %dx, %ax -; 6860-NEXT: # implicit-def: $edx -; 6860-NEXT: movw %ax, %dx -; 6860-NEXT: xorl %ecx, %edx -; 6860-NEXT: # kill: def $dx killed $dx killed $edx -; 6860-NEXT: movzwl %dx, %eax +; 6860-NEXT: # implicit-def: $edi +; 6860-NEXT: movw %ax, %di +; 6860-NEXT: xorl %ecx, %edi +; 6860-NEXT: # kill: def $di killed $di killed $edi +; 6860-NEXT: movzwl %di, %ebx ; 6860-NEXT: # kill: def $cl killed $cl killed $ecx ; 6860-NEXT: addb $30, %cl -; 6860-NEXT: xorl %edx, %edx +; 6860-NEXT: xorl %eax, %eax ; 6860-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; 6860-NEXT: shrdl %cl, %edx, %eax +; 6860-NEXT: shrdl %cl, %eax, %ebx ; 6860-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload ; 6860-NEXT: testb $32, %cl +; 6860-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 6860-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 6860-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 6860-NEXT: jne .LBB0_2 ; 6860-NEXT: # %bb.1: # %bb ; 6860-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -81,7 +87,10 @@ define void @foo() { ; 6860-NEXT: # kill: def $al killed $al killed $eax ; 6860-NEXT: # implicit-def: $ecx ; 
6860-NEXT: movb %al, (%ecx) -; 6860-NEXT: movl %ebp, %esp +; 6860-NEXT: leal -12(%ebp), %esp +; 6860-NEXT: popl %esi +; 6860-NEXT: popl %edi +; 6860-NEXT: popl %ebx ; 6860-NEXT: popl %ebp ; 6860-NEXT: .cfi_def_cfa %esp, 4 ; 6860-NEXT: retl diff --git a/llvm/test/CodeGen/X86/pr32451.ll b/llvm/test/CodeGen/X86/pr32451.ll index 3b1997234ce55..4754d8e4cf6cb 100644 --- a/llvm/test/CodeGen/X86/pr32451.ll +++ b/llvm/test/CodeGen/X86/pr32451.ll @@ -9,24 +9,29 @@ target triple = "x86_64-unknown-linux-gnu" define i8** @japi1_convert_690(i8**, i8***, i32) { ; CHECK-LABEL: japi1_convert_690: ; CHECK: # %bb.0: # %top +; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: subl $16, %esp -; CHECK-NEXT: .cfi_def_cfa_offset 20 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: .cfi_offset %ebx, -8 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill ; CHECK-NEXT: calll julia.gc_root_decl -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill ; CHECK-NEXT: calll jl_get_ptls_states -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload ; CHECK-NEXT: movl 4(%ecx), %edx -; CHECK-NEXT: movb (%edx), %dl -; CHECK-NEXT: andb $1, %dl -; CHECK-NEXT: movzbl %dl, %edx +; CHECK-NEXT: movb (%edx), %bl +; CHECK-NEXT: andb $1, %bl +; CHECK-NEXT: movzbl %bl, %edx ; CHECK-NEXT: movl %edx, (%esp) -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill ; CHECK-NEXT: calll jl_box_int32 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload ; CHECK-NEXT: movl %eax, (%ecx) ; CHECK-NEXT: addl $16, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: popl %ebx ; CHECK-NEXT: .cfi_def_cfa_offset 4 ; CHECK-NEXT: retl top: diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll index 25b068c8fad6f..0f73036a4c6c9 100644 --- a/llvm/test/CodeGen/X86/pr34592.ll +++ b/llvm/test/CodeGen/X86/pr34592.ll @@ -10,7 +10,7 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1 ; CHECK-NEXT: movq %rsp, %rbp ; CHECK-NEXT: .cfi_def_cfa_register %rbp ; CHECK-NEXT: andq $-32, %rsp -; CHECK-NEXT: subq $160, %rsp +; CHECK-NEXT: subq $192, %rsp ; CHECK-NEXT: vmovaps 240(%rbp), %ymm8 ; CHECK-NEXT: vmovaps 208(%rbp), %ymm9 ; CHECK-NEXT: vmovaps 176(%rbp), %ymm10 @@ -27,14 +27,14 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1 ; CHECK-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,0] ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; CHECK-NEXT: vmovaps %xmm7, %xmm2 -; CHECK-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; CHECK-NEXT: # implicit-def: $ymm9 -; CHECK-NEXT: vmovaps %xmm2, %xmm9 -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; CHECK-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,3] -; CHECK-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; CHECK-NEXT: 
vmovaps %xmm7, %xmm9 +; CHECK-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7] +; CHECK-NEXT: # implicit-def: $ymm2 +; CHECK-NEXT: vmovaps %xmm9, %xmm2 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; CHECK-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; CHECK-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,3] +; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; CHECK-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] ; CHECK-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,1,3] ; CHECK-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,0,1,4,5,4,5] @@ -43,11 +43,14 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1 ; CHECK-NEXT: vmovq {{.*#+}} xmm7 = xmm7[0],zero ; CHECK-NEXT: # implicit-def: $ymm8 ; CHECK-NEXT: vmovaps %xmm7, %xmm8 -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[0,1],ymm6[0,1] +; CHECK-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm8[0,1],ymm6[0,1] ; CHECK-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: vmovaps %ymm5, %ymm1 +; CHECK-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovaps %ymm6, %ymm2 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; CHECK-NEXT: vmovaps %ymm3, (%rsp) # 32-byte Spill -; CHECK-NEXT: vmovaps %ymm9, %ymm3 +; CHECK-NEXT: vmovaps %ymm5, %ymm3 ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 diff --git a/llvm/test/CodeGen/X86/pr39733.ll b/llvm/test/CodeGen/X86/pr39733.ll index 31bd5b71d0a6e..cfe5832d7ad66 100644 --- a/llvm/test/CodeGen/X86/pr39733.ll +++ b/llvm/test/CodeGen/X86/pr39733.ll @@ -23,8 +23,8 @@ define void @test55() { ; CHECK-NEXT: vmovaps %xmm1, %xmm2 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; CHECK-NEXT: vmovdqa %ymm0, (%rsp) +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 +; CHECK-NEXT: vmovdqa %ymm2, (%rsp) ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 diff --git a/llvm/test/CodeGen/X86/pr44749.ll b/llvm/test/CodeGen/X86/pr44749.ll index 1012d8c723b13..d465009c7c38a 100644 --- a/llvm/test/CodeGen/X86/pr44749.ll +++ b/llvm/test/CodeGen/X86/pr44749.ll @@ -14,22 +14,20 @@ define i32 @a() { ; CHECK-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: callq _b ; CHECK-NEXT: cvtsi2sd %eax, %xmm0 -; CHECK-NEXT: movq _calloc@{{.*}}(%rip), %rax -; CHECK-NEXT: subq $-1, %rax -; CHECK-NEXT: setne %cl -; CHECK-NEXT: movzbl %cl, %ecx -; CHECK-NEXT: ## kill: def $rcx killed $ecx -; CHECK-NEXT: leaq {{.*}}(%rip), %rdx +; CHECK-NEXT: movq _calloc@{{.*}}(%rip), %rcx +; CHECK-NEXT: subq $-1, %rcx +; CHECK-NEXT: setne %dl +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: leaq {{.*}}(%rip), %rdi ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: ucomisd %xmm1, %xmm0 -; CHECK-NEXT: setae %cl -; CHECK-NEXT: movzbl %cl, %ecx -; CHECK-NEXT: ## kill: def $rcx killed $ecx -; CHECK-NEXT: leaq {{.*}}(%rip), %rdx +; CHECK-NEXT: setae %dl +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: leaq {{.*}}(%rip), %rdi ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: cvttsd2si %xmm0, %ecx -; CHECK-NEXT: movq %rax, (%rsp) ## 8-byte Spill -; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: cvttsd2si %xmm0, %eax ; CHECK-NEXT: 
addq $24, %rsp ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/pr47000.ll b/llvm/test/CodeGen/X86/pr47000.ll index 083aa780a07c2..922b6403cc4f4 100755 --- a/llvm/test/CodeGen/X86/pr47000.ll +++ b/llvm/test/CodeGen/X86/pr47000.ll @@ -12,47 +12,51 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind { ; CHECK-NEXT: pushl %edi ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: subl $124, %esp -; CHECK-NEXT: movl 144(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: movw 176(%esp), %dx -; CHECK-NEXT: movw 172(%esp), %si -; CHECK-NEXT: movw 168(%esp), %di -; CHECK-NEXT: movw 164(%esp), %bx -; CHECK-NEXT: movw 160(%esp), %bp +; CHECK-NEXT: movw {{[0-9]+}}(%esp), %dx +; CHECK-NEXT: movw {{[0-9]+}}(%esp), %si +; CHECK-NEXT: movw {{[0-9]+}}(%esp), %di +; CHECK-NEXT: movw {{[0-9]+}}(%esp), %bx +; CHECK-NEXT: movw {{[0-9]+}}(%esp), %bp +; CHECK-NEXT: movw %dx, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movw {{[0-9]+}}(%esp), %dx +; CHECK-NEXT: movw %dx, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movw {{[0-9]+}}(%esp), %dx +; CHECK-NEXT: movw %dx, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movw {{[0-9]+}}(%esp), %dx +; CHECK-NEXT: movw %dx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %dx # 2-byte Reload +; CHECK-NEXT: movw %dx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %dx # 2-byte Reload +; CHECK-NEXT: movw %dx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movw %bp, {{[0-9]+}}(%esp) +; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %bp # 2-byte Reload +; CHECK-NEXT: movw %bp, {{[0-9]+}}(%esp) +; CHECK-NEXT: movw %si, {{[0-9]+}}(%esp) +; CHECK-NEXT: movw %di, {{[0-9]+}}(%esp) +; CHECK-NEXT: movw %bx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %ebx ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movw 156(%esp), %ax -; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill -; CHECK-NEXT: movw 152(%esp), %ax -; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill -; CHECK-NEXT: movw 148(%esp), %ax -; CHECK-NEXT: movw %ax, 112(%esp) -; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %ax # 2-byte Reload -; CHECK-NEXT: movw %ax, 114(%esp) -; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %ax # 2-byte Reload -; CHECK-NEXT: movw %ax, 116(%esp) -; CHECK-NEXT: movw %bp, 118(%esp) -; CHECK-NEXT: movw %dx, 110(%esp) -; CHECK-NEXT: movw %si, 108(%esp) -; CHECK-NEXT: movw %di, 106(%esp) -; CHECK-NEXT: movw %bx, 104(%esp) -; CHECK-NEXT: movzwl 118(%esp), %edx -; CHECK-NEXT: movzwl 116(%esp), %esi -; CHECK-NEXT: movzwl 114(%esp), %edi -; CHECK-NEXT: movzwl 112(%esp), %ebx -; CHECK-NEXT: movzwl 110(%esp), %ebp -; CHECK-NEXT: movzwl 108(%esp), %eax +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl 106(%esp), %eax +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl 104(%esp), %eax +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl %ebx, (%eax) ; CHECK-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, (%eax) ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: calll __gnu_h2f_ieee ; CHECK-NEXT: movl %esp, %eax ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -68,58 +72,58 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind { ; CHECK-NEXT: fstps (%eax) ; CHECK-NEXT: calll __gnu_f2h_ieee ; CHECK-NEXT: movl %esp, %ecx -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; CHECK-NEXT: movl %edx, (%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; CHECK-NEXT: movl %esi, (%ecx) ; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill ; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; CHECK-NEXT: movl %esi, (%ecx) ; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill ; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fstps 4(%eax) +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: fstps 4(%ecx) ; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload -; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: fstps (%ecx) ; CHECK-NEXT: calll fmodf -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: fstps (%ecx) ; CHECK-NEXT: calll __gnu_f2h_ieee ; CHECK-NEXT: movl %esp, %ecx -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; CHECK-NEXT: movl %edx, (%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; CHECK-NEXT: movl %esi, (%ecx) ; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill ; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; CHECK-NEXT: movl %esi, (%ecx) ; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill ; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fstps 4(%eax) +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: fstps 4(%ecx) ; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload -; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: fstps (%ecx) ; CHECK-NEXT: calll fmodf -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: fstps (%ecx) ; CHECK-NEXT: calll __gnu_f2h_ieee ; CHECK-NEXT: movl %esp, %ecx -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; CHECK-NEXT: movl %edx, (%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; CHECK-NEXT: movl %esi, (%ecx) ; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill ; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; 
CHECK-NEXT: movl %esi, (%ecx) ; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill ; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fstps 4(%eax) +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: fstps 4(%ecx) ; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload -; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: fstps (%ecx) ; CHECK-NEXT: calll fmodf -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: fstps (%ecx) ; CHECK-NEXT: calll __gnu_f2h_ieee ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; CHECK-NEXT: movw %ax, 6(%ecx) @@ -127,9 +131,10 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind { ; CHECK-NEXT: movw %ax, 4(%ecx) ; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %dx # 2-byte Reload ; CHECK-NEXT: movw %dx, 2(%ecx) -; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %si # 2-byte Reload -; CHECK-NEXT: movw %si, (%ecx) -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %bp # 2-byte Reload +; CHECK-NEXT: movw %bp, (%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: addl $124, %esp ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/pr47482.ll b/llvm/test/CodeGen/X86/pr47482.ll new file mode 100644 index 0000000000000..e0f01f3c51152 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr47482.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=bmi | FileCheck %s + +@a = external local_unnamed_addr global i32, align 4 +@f = external local_unnamed_addr global i32, align 4 + +define void @g(i32* %x, i32* %y, i32* %z) { +; CHECK-LABEL: g: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl {{.*}}(%rip), %eax +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testl %eax, %eax +; CHECK-NEXT: sete %cl +; CHECK-NEXT: addl %ecx, %ecx +; CHECK-NEXT: orl (%rdi), %ecx +; CHECK-NEXT: movl $0, (%rsi) +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: shll $8, %eax +; CHECK-NEXT: bextrl %eax, {{.*}}(%rip), %eax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: movl %eax, (%rdx) +; CHECK-NEXT: retq +entry: + %0 = load i32, i32* @a, align 4 + %1 = tail call i32 asm "", "=r,r,~{dirflag},~{fpsr},~{flags}"(i32 %0) + %2 = icmp eq i32 %1, 0 + %shl1 = select i1 %2, i32 2, i32 0 + %3 = load i32, i32* %x, align 4 + %or = or i32 %3, %shl1 + store i32 0, i32* %y, align 4 + %4 = tail call i32 asm "", "=r,~{dirflag},~{fpsr},~{flags}"() + %notmask = shl nsw i32 -1, %4 + %sub = xor i32 %notmask, -1 + %5 = load i32, i32* @f, align 4 + %and4 = and i32 %5, %sub + %or6 = or i32 %and4, %or + store i32 %or6, i32* %z, align 4 + ret void +} diff --git a/llvm/test/CodeGen/X86/pr47517.ll b/llvm/test/CodeGen/X86/pr47517.ll new file mode 100644 index 0000000000000..5672fbc69a41d --- /dev/null +++ b/llvm/test/CodeGen/X86/pr47517.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple x86_64 < %s | FileCheck %s + +; To ensure unused floating point constant is correctly removed +define float @test(float %src, float* %p) { +; CHECK-LABEL: test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq $0, (%rdi) +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: retq +entry: + %a0 = getelementptr inbounds float, float* %p, i32 0 + %a1 = getelementptr inbounds 
float, float* %p, i32 1 + store float 0.000000e+00, float* %a0 + store float 0.000000e+00, float* %a1 + %zero = load float, float* %a0 + %fmul1 = fmul fast float %zero, %src + %fadd1 = fadd fast float %fmul1, %zero + %fmul2 = fmul fast float %fadd1, 2.000000e+00 + %fmul3 = fmul fast float %fmul2, %fmul2 + %fmul4 = fmul fast float %fmul2, 2.000000e+00 + %fadd2 = fadd fast float %fmul4, -3.000000e+00 + %fmul5 = fmul fast float %fadd2, %fmul2 + %fadd3 = fadd fast float %fmul2, %src + %fadd4 = fadd fast float %fadd3, %fmul5 + %fmul6 = fmul fast float %fmul3, %fadd4 + ret float %fmul6 +} diff --git a/llvm/test/CodeGen/X86/regalloc-fast-missing-live-out-spill.mir b/llvm/test/CodeGen/X86/regalloc-fast-missing-live-out-spill.mir index 2821f00940ecf..0fe9f60897fd1 100644 --- a/llvm/test/CodeGen/X86/regalloc-fast-missing-live-out-spill.mir +++ b/llvm/test/CodeGen/X86/regalloc-fast-missing-live-out-spill.mir @@ -23,15 +23,15 @@ body: | ; CHECK: successors: %bb.3(0x80000000) ; CHECK: $rax = MOV64rm %stack.1, 1, $noreg, 0, $noreg :: (load 8 from %stack.1) ; CHECK: renamable $ecx = MOV32r0 implicit-def $eflags - ; CHECK: renamable $rcx = SUBREG_TO_REG 0, killed renamable $ecx, %subreg.sub_32bit + ; CHECK: renamable $rdx = SUBREG_TO_REG 0, killed renamable $ecx, %subreg.sub_32bit ; CHECK: MOV64mi32 killed renamable $rax, 1, $noreg, 0, $noreg, 0 :: (volatile store 8) - ; CHECK: MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed $rcx :: (store 8 into %stack.0) + ; CHECK: MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed $rdx :: (store 8 into %stack.0) ; CHECK: bb.3: ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK: $rax = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load 8 from %stack.0) ; CHECK: renamable $ecx = MOV32r0 implicit-def dead $eflags - ; CHECK: renamable $rcx = SUBREG_TO_REG 0, killed renamable $ecx, %subreg.sub_32bit - ; CHECK: MOV64mr %stack.1, 1, $noreg, 0, $noreg, killed $rcx :: (store 8 into %stack.1) + ; CHECK: renamable $rdx = SUBREG_TO_REG 0, killed renamable $ecx, %subreg.sub_32bit + ; CHECK: MOV64mr %stack.1, 1, $noreg, 0, $noreg, killed $rdx :: (store 8 into %stack.1) ; CHECK: JMP64r killed renamable $rax bb.0: liveins: $edi, $rsi diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index e233bf5be8cfa..e3051f669e18a 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -2510,11 +2510,11 @@ define <2 x i64> @test_mm_max_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %cmp = icmp sgt <8 x i16> %arg0, %arg1 - %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1 + %sel = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %arg0, <8 x i16> %arg1) %bc = bitcast <8 x i16> %sel to <2 x i64> ret <2 x i64> %bc } +declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>) define <2 x i64> @test_mm_max_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; SSE-LABEL: test_mm_max_epu8: @@ -2533,11 +2533,11 @@ define <2 x i64> @test_mm_max_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %cmp = icmp ugt <16 x i8> %arg0, %arg1 - %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1 + %sel = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %arg0, <16 x i8> %arg1) %bc = bitcast <16 x i8> %sel to <2 x i64> 
ret <2 x i64> %bc } +declare <16 x i8> @llvm.umax.v16i8(<16 x i8>, <16 x i8>) define <2 x double> @test_mm_max_pd(<2 x double> %a0, <2 x double> %a1) nounwind { ; SSE-LABEL: test_mm_max_pd: @@ -2606,11 +2606,11 @@ define <2 x i64> @test_mm_min_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %cmp = icmp slt <8 x i16> %arg0, %arg1 - %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1 + %sel = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %arg0, <8 x i16> %arg1) %bc = bitcast <8 x i16> %sel to <2 x i64> ret <2 x i64> %bc } +declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>) define <2 x i64> @test_mm_min_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; SSE-LABEL: test_mm_min_epu8: @@ -2629,11 +2629,11 @@ define <2 x i64> @test_mm_min_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %cmp = icmp ult <16 x i8> %arg0, %arg1 - %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1 + %sel = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %arg0, <16 x i8> %arg1) %bc = bitcast <16 x i8> %sel to <2 x i64> ret <2 x i64> %bc } +declare <16 x i8> @llvm.umin.v16i8(<16 x i8>, <16 x i8>) define <2 x double> @test_mm_min_pd(<2 x double> %a0, <2 x double> %a1) nounwind { ; SSE-LABEL: test_mm_min_pd: diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll index 9990ac00eb054..e4db7c09ef6d8 100644 --- a/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll @@ -662,11 +662,11 @@ define <2 x i64> @test_mm_max_epi8(<2 x i64> %a0, <2 x i64> %a1) { ; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %cmp = icmp sgt <16 x i8> %arg0, %arg1 - %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1 + %sel = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %arg0, <16 x i8> %arg1) %bc = bitcast <16 x i8> %sel to <2 x i64> ret <2 x i64> %bc } +declare <16 x i8> @llvm.smax.v16i8(<16 x i8>, <16 x i8>) define <2 x i64> @test_mm_max_epi32(<2 x i64> %a0, <2 x i64> %a1) { ; SSE-LABEL: test_mm_max_epi32: @@ -680,11 +680,11 @@ define <2 x i64> @test_mm_max_epi32(<2 x i64> %a0, <2 x i64> %a1) { ; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <4 x i32> %arg1 = bitcast <2 x i64> %a1 to <4 x i32> - %cmp = icmp sgt <4 x i32> %arg0, %arg1 - %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1 + %sel = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %arg0, <4 x i32> %arg1) %bc = bitcast <4 x i32> %sel to <2 x i64> ret <2 x i64> %bc } +declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>) define <2 x i64> @test_mm_max_epu16(<2 x i64> %a0, <2 x i64> %a1) { ; SSE-LABEL: test_mm_max_epu16: @@ -698,11 +698,11 @@ define <2 x i64> @test_mm_max_epu16(<2 x i64> %a0, <2 x i64> %a1) { ; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %cmp = icmp ugt <8 x i16> %arg0, %arg1 - %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1 + %sel = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %arg0, <8 x i16> %arg1) %bc = bitcast <8 x i16> %sel to <2 x i64> ret <2 x i64> %bc } +declare <8 x i16> @llvm.umax.v8i16(<8 x i16>, <8 x i16>) define <2 x i64> @test_mm_max_epu32(<2 x i64> %a0, <2 x i64> %a1) { ; SSE-LABEL: 
test_mm_max_epu32: @@ -716,11 +716,11 @@ define <2 x i64> @test_mm_max_epu32(<2 x i64> %a0, <2 x i64> %a1) { ; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <4 x i32> %arg1 = bitcast <2 x i64> %a1 to <4 x i32> - %cmp = icmp ugt <4 x i32> %arg0, %arg1 - %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1 + %sel = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %arg0, <4 x i32> %arg1) %bc = bitcast <4 x i32> %sel to <2 x i64> ret <2 x i64> %bc } +declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>) define <2 x i64> @test_mm_min_epi8(<2 x i64> %a0, <2 x i64> %a1) { ; SSE-LABEL: test_mm_min_epi8: @@ -734,11 +734,11 @@ define <2 x i64> @test_mm_min_epi8(<2 x i64> %a0, <2 x i64> %a1) { ; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %cmp = icmp slt <16 x i8> %arg0, %arg1 - %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1 + %sel = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %arg0, <16 x i8> %arg1) %bc = bitcast <16 x i8> %sel to <2 x i64> ret <2 x i64> %bc } +declare <16 x i8> @llvm.smin.v16i8(<16 x i8>, <16 x i8>) define <2 x i64> @test_mm_min_epi32(<2 x i64> %a0, <2 x i64> %a1) { ; SSE-LABEL: test_mm_min_epi32: @@ -752,11 +752,11 @@ define <2 x i64> @test_mm_min_epi32(<2 x i64> %a0, <2 x i64> %a1) { ; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <4 x i32> %arg1 = bitcast <2 x i64> %a1 to <4 x i32> - %cmp = icmp slt <4 x i32> %arg0, %arg1 - %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1 + %sel = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %arg0, <4 x i32> %arg1) %bc = bitcast <4 x i32> %sel to <2 x i64> ret <2 x i64> %bc } +declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>) define <2 x i64> @test_mm_min_epu16(<2 x i64> %a0, <2 x i64> %a1) { ; SSE-LABEL: test_mm_min_epu16: @@ -770,11 +770,11 @@ define <2 x i64> @test_mm_min_epu16(<2 x i64> %a0, <2 x i64> %a1) { ; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %cmp = icmp ult <8 x i16> %arg0, %arg1 - %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1 + %sel = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %arg0, <8 x i16> %arg1) %bc = bitcast <8 x i16> %sel to <2 x i64> ret <2 x i64> %bc } +declare <8 x i16> @llvm.umin.v8i16(<8 x i16>, <8 x i16>) define <2 x i64> @test_mm_min_epu32(<2 x i64> %a0, <2 x i64> %a1) { ; SSE-LABEL: test_mm_min_epu32: @@ -788,11 +788,11 @@ define <2 x i64> @test_mm_min_epu32(<2 x i64> %a0, <2 x i64> %a1) { ; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <4 x i32> %arg1 = bitcast <2 x i64> %a1 to <4 x i32> - %cmp = icmp ult <4 x i32> %arg0, %arg1 - %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1 + %sel = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %arg0, <4 x i32> %arg1) %bc = bitcast <4 x i32> %sel to <2 x i64> ret <2 x i64> %bc } +declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>) define <2 x i64> @test_mm_minpos_epu16(<2 x i64> %a0) { ; SSE-LABEL: test_mm_minpos_epu16: diff --git a/llvm/test/CodeGen/X86/stack-align2.ll b/llvm/test/CodeGen/X86/stack-align2.ll index 7239198000c99..095a9090ed08f 100644 --- a/llvm/test/CodeGen/X86/stack-align2.ll +++ b/llvm/test/CodeGen/X86/stack-align2.ll @@ -2,10 +2,12 @@ ; RUN: llc < %s -mcpu=generic -mtriple=i386-kfreebsd | FileCheck %s -check-prefix=KFREEBSD-I386 ; RUN: llc < %s -mcpu=generic -mtriple=i386-netbsd | FileCheck %s -check-prefix=NETBSD-I386 ; RUN: llc < %s -mcpu=generic -mtriple=i686-apple-darwin8 | FileCheck %s -check-prefix=DARWIN-I386 +; RUN: llc < 
%s -mcpu=generic -mtriple=i386-pc-solaris2.11 | FileCheck %s -check-prefix=SOLARIS-I386 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s -check-prefix=LINUX-X86_64 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-kfreebsd | FileCheck %s -check-prefix=KFREEBSD-X86_64 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-netbsd | FileCheck %s -check-prefix=NETBSD-X86_64 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-apple-darwin8 | FileCheck %s -check-prefix=DARWIN-X86_64 +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-pc-solaris2.11 | FileCheck %s -check-prefix=SOLARIS-X86_64 define i32 @test() nounwind { entry: @@ -15,7 +17,8 @@ entry: ; LINUX-I386: subl $12, %esp ; KFREEBSD-I386: subl $12, %esp ; DARWIN-I386: subl $12, %esp -; NETBSD-I386-NOT: subl {{.*}}, %esp +; NETBSD-I386-NOT: subl {{.*}}, %esp +; SOLARIS-I386-NOT: subl {{.*}}, %esp ; LINUX-X86_64: pushq %{{.*}} ; LINUX-X86_64-NOT: subq {{.*}}, %rsp @@ -23,6 +26,8 @@ entry: ; DARWIN-X86_64-NOT: subq {{.*}}, %rsp ; NETBSD-X86_64: pushq %{{.*}} ; NETBSD-X86_64-NOT: subq {{.*}}, %rsp +; SOLARIS-X86_64: pushq %{{.*}} +; SOLARIS-X86_64-NOT: subq {{.*}}, %rsp ; KFREEBSD-X86_64: pushq %{{.*}} ; KFREEBSD-X86_64-NOT: subq {{.*}}, %rsp } diff --git a/llvm/test/CodeGen/X86/statepoint-vreg.ll b/llvm/test/CodeGen/X86/statepoint-vreg.ll index b613a949c273d..6a65abed57541 100644 --- a/llvm/test/CodeGen/X86/statepoint-vreg.ll +++ b/llvm/test/CodeGen/X86/statepoint-vreg.ll @@ -8,8 +8,12 @@ declare i1 @return_i1() declare void @func() declare void @"some_call"(i64 addrspace(1)*) declare void @consume(i32 addrspace(1)*) +declare i32 @consume1(i32) gc "statepoint-example" declare void @consume2(i32 addrspace(1)*, i32 addrspace(1)*) +declare void @consume3(float) gc "statepoint-example" +declare float @consume4(i64) gc "statepoint-example" declare void @consume5(i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*) + declare void @use1(i32 addrspace(1)*, i8 addrspace(1)*) declare i32 @"personality_function"() @@ -47,6 +51,7 @@ entry: call void @consume(i32 addrspace(1)* %rel1) ret i1 %res1 } + ; test pointer variables intermixed with pointer constants define void @test_mixed(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) gc "statepoint-example" { ; CHECK-LABEL: test_mixed: @@ -567,6 +572,112 @@ exceptional_return.right: ret i64 addrspace(1)* %val.relocated3 } +; test ISEL for constant base pointer - must properly tie operands +define void @test_const_base(i32 addrspace(1)* %a) gc "statepoint-example" { +; CHECK-LABEL: test_const_base: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: callq func +; CHECK-NEXT: .Ltmp24: +; CHECK-NEXT: movq %rbx, %rdi +; CHECK-NEXT: callq consume +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %token1 = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @func, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 0, i32 1, i32 7, i32 addrspace(1)* null, i32 9), "gc-live" (i32 addrspace(1)* null, i32 addrspace(1)* %a)] + %rel = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %token1, i32 0, i32 1) + call void @consume(i32 addrspace(1)* %rel) + ret void +} + +; test multiple statepoints/relocates within single block. +; relocates must be properly scheduled w.r.t. 
statepoints +define void @test_sched(float %0, i32 %1, i8 addrspace(1)* %2) gc "statepoint-example" { +; CHECK-LABEL: test_sched: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsi, %rbx +; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movq %rsi, {{[0-9]+}}(%rsp) +; CHECK-NEXT: callq consume3 +; CHECK-NEXT: .Ltmp25: +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %ebp, %xmm0 +; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rbx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: nopl 8(%rax,%rax) +; CHECK-NEXT: .Ltmp26: +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss %xmm0, (%rsp) +; CHECK-NEXT: movq %rbx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: nopl 8(%rax,%rax) +; CHECK-NEXT: .Ltmp27: +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss %xmm0, (%rsp) +; CHECK-NEXT: movq %rbx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: nopl 8(%rax,%rax) +; CHECK-NEXT: .Ltmp28: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: xorpd %xmm0, %xmm0 +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero +; CHECK-NEXT: ucomisd %xmm0, %xmm1 +; CHECK-NEXT: movabsq $9223372036854775807, %rdi # imm = 0x7FFFFFFFFFFFFFFF +; CHECK-NEXT: cmovbeq %rax, %rdi +; CHECK-NEXT: movsd %xmm1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss %xmm0, (%rsp) +; CHECK-NEXT: nopl 8(%rax,%rax) +; CHECK-NEXT: .Ltmp29: +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %token0 = call token (i64, i32, void (float)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf32f(i64 2, i32 0, void (float)* nonnull @consume3, i32 1, i32 0, float %0, i32 0, i32 0) [ "gc-live"(i8 addrspace(1)* %2) ] + %reloc1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %token0, i32 0, i32 0) ; (%2, %2) + %tmp1 = sitofp i32 %1 to double + %to_max.i29 = fcmp ogt double %tmp1, 0.000000e+00 + %token1 = call token (i64, i32, i32 (i32)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32i32f(i64 2, i32 5, i32 (i32)* nonnull @consume1, i32 1, i32 0, i32 undef, i32 0, i32 0) [ "gc-live"(i8 addrspace(1)* %reloc1) ] + %reloc2 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %token1, i32 0, i32 0) ; (%reloc1, %reloc1) + %reloc3 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %token1, i32 0, i32 0) ; (%reloc1, %reloc1) + %token2 = call token (i64, i32, i32 (i32)*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_i32i32f(i64 2, i32 5, i32 (i32)* nonnull @consume1, i32 1, i32 0, i32 undef, i32 0, i32 0) [ "deopt"(float %0, double %tmp1), "gc-live"(i8 addrspace(1)* %reloc2, i8 addrspace(1)* %reloc3) ] + %reloc4 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %token2, i32 0, i32 0) ; (%reloc3, %reloc2) + %reloc5 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %token2, i32 1, i32 1) ; (%reloc3, %reloc3) + %token3 = call token (i64, i32, void (float)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf32f(i64 2, i32 5, void (float)* nonnull @consume3, i32 1, i32 0, float %0, i32 0, i32 0) [ "deopt"(float %0, double %tmp1), "gc-live"(i8 addrspace(1)* %reloc4, i8 addrspace(1)* %reloc5) ] + %reloc6 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %token3, i32 1, i32 0) ; (%reloc5, %reloc4) + %tmp5 = select i1 %to_max.i29, i64 9223372036854775807, i64 0 + %token4 = call token (i64, i32, float (i64)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_f32i64f(i64 2, i32 5, float (i64)* nonnull @consume4, i32 1, i32 0, i64 %tmp5, i32 0, i32 0) [ "deopt"(float %0, double %tmp1), "gc-live"() ] + ret void +} + +declare token @llvm.experimental.gc.statepoint.p0f_f32i64f(i64 immarg, i32 immarg, float (i64)*, i32 immarg, i32 immarg, ...) +declare token @llvm.experimental.gc.statepoint.p0f_i32i32f(i64 immarg, i32 immarg, i32 (i32)*, i32 immarg, i32 immarg, ...) +declare token @llvm.experimental.gc.statepoint.p0f_isVoidf32f(i64 immarg, i32 immarg, void (float)*, i32 immarg, i32 immarg, ...) declare token @llvm.experimental.gc.statepoint.p0f_i1f(i64, i32, i1 ()*, i32, i32, ...) declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...) declare token @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...)
diff --git a/llvm/test/CodeGen/X86/swift-return.ll b/llvm/test/CodeGen/X86/swift-return.ll index 4934419055acd..c62e92f2cac55 100644 --- a/llvm/test/CodeGen/X86/swift-return.ll +++ b/llvm/test/CodeGen/X86/swift-return.ll @@ -28,10 +28,11 @@ define i16 @test(i32 %key) { ; CHECK-O0-NEXT: movl %edi, {{[0-9]+}}(%rsp) ; CHECK-O0-NEXT: movl {{[0-9]+}}(%rsp), %edi ; CHECK-O0-NEXT: callq gen -; CHECK-O0-NEXT: cwtl -; CHECK-O0-NEXT: movsbl %dl, %ecx -; CHECK-O0-NEXT: addl %ecx, %eax -; CHECK-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-O0-NEXT: movswl %ax, %ecx +; CHECK-O0-NEXT: movsbl %dl, %esi +; CHECK-O0-NEXT: addl %esi, %ecx +; CHECK-O0-NEXT: # kill: def $cx killed $cx killed $ecx +; CHECK-O0-NEXT: movw %cx, %ax ; CHECK-O0-NEXT: popq %rcx ; CHECK-O0-NEXT: .cfi_def_cfa_offset 8 ; CHECK-O0-NEXT: retq @@ -79,16 +80,16 @@ define i32 @test2(i32 %key) #0 { ; CHECK-O0-NEXT: movl {{[0-9]+}}(%rsp), %edi ; CHECK-O0-NEXT: movq %rsp, %rax ; CHECK-O0-NEXT: callq gen2 -; CHECK-O0-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-O0-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; CHECK-O0-NEXT: movl {{[0-9]+}}(%rsp), %edx -; CHECK-O0-NEXT: movl (%rsp), %esi -; CHECK-O0-NEXT: movl {{[0-9]+}}(%rsp), %edi -; CHECK-O0-NEXT: addl %edi, %esi -; CHECK-O0-NEXT: addl %edx, %esi -; CHECK-O0-NEXT: addl %ecx, %esi -; CHECK-O0-NEXT: addl %eax, %esi -; CHECK-O0-NEXT: movl %esi, %eax +; CHECK-O0-NEXT: movl {{[0-9]+}}(%rsp), %esi +; CHECK-O0-NEXT: movl (%rsp), %edi +; CHECK-O0-NEXT: movl {{[0-9]+}}(%rsp), %r8d +; CHECK-O0-NEXT: addl %r8d, %edi +; CHECK-O0-NEXT: addl %esi, %edi +; CHECK-O0-NEXT: addl %edx, %edi +; CHECK-O0-NEXT: addl %ecx, %edi +; CHECK-O0-NEXT: movl %edi, %eax ; CHECK-O0-NEXT: addq $24, %rsp ; CHECK-O0-NEXT: .cfi_def_cfa_offset 8 ; CHECK-O0-NEXT: retq @@ -263,17 +264,17 @@ define void @consume_i1_ret() { ; CHECK-O0-NEXT: .cfi_def_cfa_offset 16 ; CHECK-O0-NEXT: callq produce_i1_ret ; CHECK-O0-NEXT: andb $1, %al -; CHECK-O0-NEXT: movzbl %al, %eax -; CHECK-O0-NEXT: movl %eax, var +; CHECK-O0-NEXT: movzbl %al, %esi +; CHECK-O0-NEXT: movl %esi, var ; CHECK-O0-NEXT: andb $1, %dl -; CHECK-O0-NEXT: movzbl %dl, %eax -; CHECK-O0-NEXT: movl %eax, var +; CHECK-O0-NEXT: movzbl %dl, %esi +; CHECK-O0-NEXT: movl %esi, var ; CHECK-O0-NEXT: andb $1, %cl -; CHECK-O0-NEXT: movzbl %cl, %eax -; CHECK-O0-NEXT: movl %eax, var +; CHECK-O0-NEXT: movzbl %cl, %esi +; CHECK-O0-NEXT: movl %esi, var ; CHECK-O0-NEXT: andb $1, %r8b -; CHECK-O0-NEXT: movzbl %r8b, %eax -; CHECK-O0-NEXT: movl %eax, var +; CHECK-O0-NEXT: movzbl %r8b, %esi +; CHECK-O0-NEXT: movl %esi, var ; CHECK-O0-NEXT: popq %rax ; CHECK-O0-NEXT: .cfi_def_cfa_offset 8 ; CHECK-O0-NEXT: retq diff --git a/llvm/test/CodeGen/X86/swifterror.ll b/llvm/test/CodeGen/X86/swifterror.ll index 1afae31b2b8d2..1388c61c18984 100644 --- a/llvm/test/CodeGen/X86/swifterror.ll +++ b/llvm/test/CodeGen/X86/swifterror.ll @@ -790,8 +790,8 @@ a: ; CHECK-O0-LABEL: testAssign4 ; CHECK-O0: callq _foo2 ; CHECK-O0: xorl %eax, %eax -; CHECK-O0: ## kill: def $rax killed $eax -; CHECK-O0: movq %rax, [[SLOT:[-a-z0-9\(\)\%]*]] +; CHECK-O0: movl %eax, %ecx +; CHECK-O0: movq %rcx, [[SLOT:[-a-z0-9\(\)\%]*]] ; CHECK-O0: movq [[SLOT]], %rax ; CHECK-O0: movq %rax, [[SLOT2:[-a-z0-9\(\)\%]*]] ; CHECK-O0: movq [[SLOT2]], %r12 diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll new file mode 100644 index 0000000000000..50b88c2c55f5c --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll @@ -0,0 +1,328 @@ +; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL + +; These tests are identical to corresponding tests in the 'nnan' versions +; of the files except that they use 'fast' FMF. If things are working as +; expected, the 'nnan' codegen should be the same as 'fast'. + +; +; vXf32 +; + +define float @test_v2f32(<2 x float> %a0) { +; SSE2-LABEL: test_v2f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2f32: +; AVX: # %bb.0: +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a0) + ret float %1 +} + +define float @test_v4f32(<4 x float> %a0) { +; SSE2-LABEL: test_v4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE2-NEXT: maxps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE41-NEXT: maxps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a0) + ret float %1 +} + +define float @test_v8f32(<8 x float> %a0) { +; SSE2-LABEL: test_v8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: minps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE2-NEXT: minps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[1,1],xmm0[1,1] +; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: minps %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE41-NEXT: minps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v8f32: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call fast float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %a0) + ret float %1 +} + +define float @test_v16f32(<16 x float> %a0) { +; SSE2-LABEL: test_v16f32: +; SSE2: # %bb.0: +; SSE2-NEXT: maxps %xmm3, %xmm1 +; SSE2-NEXT: maxps %xmm2, %xmm0 +; SSE2-NEXT: maxps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE2-NEXT: maxps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16f32: +; SSE41: # %bb.0: +; SSE41-NEXT: maxps %xmm3, %xmm1 +; SSE41-NEXT: maxps %xmm2, %xmm0 +; SSE41-NEXT: maxps %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE41-NEXT: maxps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v16f32: +; AVX: # %bb.0: +; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v16f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call fast float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a0) + ret float %1 +} + +; +; vXf64 +; + +define double @test_v2f64(<2 x double> %a0) { +; SSE-LABEL: test_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: minsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; 
AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = call fast double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a0) + ret double %1 +} + +define double @test_v4f64(<4 x double> %a0) { +; SSE-LABEL: test_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: maxpd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: maxsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4f64: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call fast double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %a0) + ret double %1 +} + +define double @test_v8f64(<8 x double> %a0) { +; SSE-LABEL: test_v8f64: +; SSE: # %bb.0: +; SSE-NEXT: minpd %xmm3, %xmm1 +; SSE-NEXT: minpd %xmm2, %xmm0 +; SSE-NEXT: minpd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: minsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v8f64: +; AVX: # %bb.0: +; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v8f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call fast double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %a0) + ret double %1 +} + +define double @test_v16f64(<16 x double> %a0) { +; SSE-LABEL: test_v16f64: +; SSE: # %bb.0: +; SSE-NEXT: maxpd %xmm6, %xmm2 +; SSE-NEXT: maxpd %xmm4, %xmm0 +; SSE-NEXT: maxpd %xmm2, %xmm0 +; SSE-NEXT: maxpd %xmm7, %xmm3 +; SSE-NEXT: maxpd %xmm5, %xmm1 +; SSE-NEXT: maxpd %xmm3, %xmm1 +; SSE-NEXT: maxpd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: maxsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v16f64: +; AVX: # %bb.0: +; AVX-NEXT: vmaxpd %ymm3, %ymm1, %ymm1 +; AVX-NEXT: vmaxpd %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v16f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call fast double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %a0) + ret double %1 +} + +declare 
float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>) +declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) +declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>) +declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>) + +declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>) +declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>) +declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>) +declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll index e2025be011343..dd3378411ecc8 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll @@ -13,27 +13,46 @@ define float @test_v2f32(<2 x float> %a0) { ; SSE2-LABEL: test_v2f32: ; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: maxss %xmm0, %xmm2 +; SSE41-NEXT: andnps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32: ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm0, %xmm2, %xmm1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %a0) ret float %1 @@ -43,35 +62,45 @@ define float @test_v4f32(<4 x float> %a0) { ; SSE2-LABEL: test_v4f32: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: maxps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; SSE2-NEXT: maxss %xmm3, %xmm0 +; SSE2-NEXT: maxss %xmm2, %xmm0 ; SSE2-NEXT: maxss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: 
movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: maxss %xmm3, %xmm0 +; SSE41-NEXT: maxss %xmm2, %xmm0 ; SSE41-NEXT: maxss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a0) @@ -82,43 +111,67 @@ define float @test_v8f32(<8 x float> %a0) { ; SSE2-LABEL: test_v8f32: ; SSE2: # %bb.0: ; SSE2-NEXT: maxps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: maxps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: maxss %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: maxss %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: maxss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f32: ; SSE41: # %bb.0: ; SSE41-NEXT: maxps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: maxss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm7, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm6, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm5, %xmm0, %xmm0 ; AVX-NEXT: 
vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm7, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm5, %xmm0, %xmm0 ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm4, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %a0) @@ -131,12 +184,16 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE2-NEXT: maxps %xmm3, %xmm1 ; SSE2-NEXT: maxps %xmm2, %xmm0 ; SSE2-NEXT: maxps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: maxps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: maxss %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: maxss %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: maxss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f32: @@ -144,35 +201,69 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE41-NEXT: maxps %xmm3, %xmm1 ; SSE41-NEXT: maxps %xmm2, %xmm0 ; SSE41-NEXT: maxps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: maxss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32: ; AVX: # %bb.0: ; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd 
{{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm9 = xmm1[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm10 = xmm1[1,1,3,3] +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm5 +; AVX512-NEXT: vpermilps {{.*#+}} xmm11 = xmm5[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm12 = xmm5[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm13 = xmm5[1,1,3,3] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX512-NEXT: vpermilps {{.*#+}} xmm14 = xmm3[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm15 = xmm3[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm3[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm4, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm7, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm15, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm14, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm5, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm13, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm12, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm11, %xmm0, %xmm0 ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm10, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm9, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm8, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a0) @@ -206,6 +297,76 @@ define double @test_v2f64(<2 x double> %a0) { ret double %1 } +define double @test_v3f64(<3 x double> %a0) { +; SSE2-LABEL: test_v3f64: +; SSE2: # %bb.0: +; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],mem[1] +; SSE2-NEXT: movapd %xmm2, %xmm1 +; SSE2-NEXT: maxpd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm1, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: maxsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v3f64: +; SSE41: # %bb.0: +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],mem[1] +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: maxpd %xmm0, %xmm1 +; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: andpd %xmm2, %xmm3 +; SSE41-NEXT: 
maxsd %xmm1, %xmm2 +; SSE41-NEXT: andnpd %xmm2, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v3f64: +; AVX: # %bb.0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v3f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm2, %xmm2, %k1 +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmaxsd %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v3f64(<3 x double> %a0) + ret double %1 +} + define double @test_v4f64(<4 x double> %a0) { ; SSE-LABEL: test_v4f64: ; SSE: # %bb.0: @@ -218,18 +379,22 @@ define double @test_v4f64(<4 x double> %a0) { ; AVX-LABEL: test_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %a0) @@ -250,21 +415,31 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX-LABEL: test_v8f64: ; AVX: # %bb.0: ; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm7, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm5, %xmm0, 
%xmm0 +; AVX512-NEXT: vmaxsd %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm4, %xmm0, %xmm0 ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> %a0) @@ -274,12 +449,12 @@ define double @test_v8f64(<8 x double> %a0) { define double @test_v16f64(<16 x double> %a0) { ; SSE-LABEL: test_v16f64: ; SSE: # %bb.0: -; SSE-NEXT: maxpd %xmm6, %xmm2 -; SSE-NEXT: maxpd %xmm4, %xmm0 -; SSE-NEXT: maxpd %xmm2, %xmm0 ; SSE-NEXT: maxpd %xmm7, %xmm3 ; SSE-NEXT: maxpd %xmm5, %xmm1 ; SSE-NEXT: maxpd %xmm3, %xmm1 +; SSE-NEXT: maxpd %xmm6, %xmm2 +; SSE-NEXT: maxpd %xmm4, %xmm0 +; SSE-NEXT: maxpd %xmm2, %xmm0 ; SSE-NEXT: maxpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] @@ -291,34 +466,110 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX-NEXT: vmaxpd %ymm3, %ymm1, %ymm1 ; AVX-NEXT: vmaxpd %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vmaxsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vmaxsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; AVX512-NEXT: vmaxsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vmaxsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %a0) ret double %1 } +define half @test_v2f16(<2 x half> %a0) nounwind { +; SSE-LABEL: test_v2f16: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $16, %rsp +; SSE-NEXT: movl %edi, %ebx +; SSE-NEXT: movzwl %si, %edi +; SSE-NEXT: callq __gnu_h2f_ieee +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movzwl %bx, %edi +; SSE-NEXT: callq __gnu_h2f_ieee +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: cmpunordss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: andps %xmm3, %xmm2 +; SSE-NEXT: maxss %xmm0, %xmm3 +; SSE-NEXT: andnps %xmm3, %xmm1 +; SSE-NEXT: orps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: callq __gnu_f2h_ieee +; SSE-NEXT: addq $16, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2f16: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbx +; AVX-NEXT: subq $16, %rsp +; AVX-NEXT: movl %esi, %ebx +; AVX-NEXT: movzwl %di, 
%edi +; AVX-NEXT: callq __gnu_h2f_ieee +; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: movzwl %bx, %edi +; AVX-NEXT: callq __gnu_h2f_ieee +; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; AVX-NEXT: # xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vmaxss %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vcmpunordss %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: callq __gnu_f2h_ieee +; AVX-NEXT: addq $16, %rsp +; AVX-NEXT: popq %rbx +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2f16: +; AVX512: # %bb.0: +; AVX512-NEXT: movzwl %di, %eax +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: movzwl %si, %eax +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call nnan half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half> %a0) + ret half %1 +} declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>) declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>) declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>) declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>) +declare double @llvm.experimental.vector.reduce.fmax.v3f64(<3 x double>) declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>) declare double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double>) declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>) + +declare half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll index d3b17d25ef096..c5e025be5423a 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll @@ -10,69 +10,225 @@ ; vXf32 ; +define float @test_v1f32(<1 x float> %a0) { +; ALL-LABEL: test_v1f32: +; ALL: # %bb.0: +; ALL-NEXT: retq + %1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> %a0) + ret float %1 +} + define float @test_v2f32(<2 x float> %a0) { ; SSE2-LABEL: test_v2f32: ; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: maxss %xmm0, %xmm2 +; SSE41-NEXT: andnps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32: ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmaxss %xmm1, %xmm0, 
%xmm0 +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm0, %xmm2, %xmm1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %a0) ret float %1 } +define float @test_v3f32(<3 x float> %a0) { +; SSE2-LABEL: test_v3f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v3f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: maxss %xmm0, %xmm2 +; SSE41-NEXT: andnps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: maxss %xmm1, %xmm2 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andnps %xmm2, %xmm3 +; SSE41-NEXT: andps %xmm0, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v3f32: +; AVX: # %bb.0: +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v3f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: retq + %1 = call float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a0) + ret float %1 +} + define float @test_v4f32(<4 x float> %a0) { ; SSE2-LABEL: test_v4f32: ; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: maxps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 
+; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andps %xmm3, %xmm4 +; SSE2-NEXT: maxss %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] +; SSE2-NEXT: andnps %xmm3, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm1, %xmm3 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andnps %xmm3, %xmm4 +; SSE2-NEXT: andps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: andps %xmm3, %xmm4 +; SSE41-NEXT: maxss %xmm0, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] +; SSE41-NEXT: andnps %xmm3, %xmm1 +; SSE41-NEXT: orps %xmm4, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: maxss %xmm1, %xmm3 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: andnps %xmm3, %xmm4 +; SSE41-NEXT: andps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm4, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: maxss %xmm1, %xmm2 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andnps %xmm2, %xmm3 +; SSE41-NEXT: andps %xmm0, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm0, %xmm3, %xmm4 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm3, %xmm4, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vmaxss %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vblendvps %xmm3, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm0, %xmm3, %xmm4 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vcmpunordss %xmm4, %xmm4, %k1 +; AVX512-NEXT: vmaxss %xmm4, %xmm2, %xmm0 +; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordss %xmm0, 
%xmm0, %k1 +; AVX512-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a0) ret float %1 @@ -81,46 +237,170 @@ define float @test_v4f32(<4 x float> %a0) { define float @test_v8f32(<8 x float> %a0) { ; SSE2-LABEL: test_v8f32: ; SSE2: # %bb.0: -; SSE2-NEXT: maxps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: maxps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: maxps %xmm0, %xmm2 +; SSE2-NEXT: cmpunordps %xmm0, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm1, %xmm3 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andnps %xmm3, %xmm4 +; SSE2-NEXT: andps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f32: ; SSE41: # %bb.0: -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: maxps %xmm0, %xmm2 +; SSE41-NEXT: cmpunordps %xmm0, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: cmpunordss %xmm2, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andps %xmm1, %xmm3 +; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: andnps %xmm1, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: maxss %xmm0, %xmm3 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm4 +; SSE41-NEXT: andnps %xmm3, %xmm4 +; SSE41-NEXT: andps %xmm1, %xmm0 +; SSE41-NEXT: orps %xmm4, %xmm0 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: maxss %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andnps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: 
vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm0, %xmm7, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm7, %xmm2, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vmaxss %xmm0, %xmm6, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm6, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vmaxss %xmm0, %xmm5, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm5, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vmaxss %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vblendvps %xmm1, %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vmaxss %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vblendvps %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vmaxss %xmm0, %xmm8, %xmm0 +; AVX-NEXT: vblendvps %xmm1, %xmm8, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v8f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v8f32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX512BW-NEXT: vmaxss %xmm0, %xmm7, %xmm1 +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmovss %xmm7, %xmm1, %xmm1 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1 +; AVX512BW-NEXT: vmaxss %xmm1, %xmm6, %xmm0 +; AVX512BW-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxss %xmm0, %xmm5, %xmm0 +; AVX512BW-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxss %xmm0, %xmm3, %xmm0 +; AVX512BW-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxss %xmm0, %xmm4, %xmm0 +; AVX512BW-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxss %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxss %xmm0, %xmm8, %xmm0 +; AVX512BW-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v8f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vmaxss %xmm0, %xmm7, %xmm2 +; 
AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512VL-NEXT: vmaxss %xmm2, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm5, %xmm0 +; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm4, %xmm0 +; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm8, %xmm0 +; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %a0) ret float %1 } @@ -128,53 +408,259 @@ define float @test_v8f32(<8 x float> %a0) { define float @test_v16f32(<16 x float> %a0) { ; SSE2-LABEL: test_v16f32: ; SSE2: # %bb.0: -; SSE2-NEXT: maxps %xmm3, %xmm1 -; SSE2-NEXT: maxps %xmm2, %xmm0 -; SSE2-NEXT: maxps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: maxps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm4 +; SSE2-NEXT: maxps %xmm0, %xmm4 +; SSE2-NEXT: cmpunordps %xmm0, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm4, %xmm0 +; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm3, %xmm2 +; SSE2-NEXT: maxps %xmm1, %xmm2 +; SSE2-NEXT: cmpunordps %xmm1, %xmm1 +; SSE2-NEXT: andps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: maxps %xmm0, %xmm2 +; SSE2-NEXT: cmpunordps %xmm0, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm1, %xmm3 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andnps %xmm3, %xmm4 +; SSE2-NEXT: andps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f32: ; SSE41: # %bb.0: -; SSE41-NEXT: maxps %xmm3, %xmm1 -; SSE41-NEXT: maxps %xmm2, %xmm0 -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm2, %xmm4 +; SSE41-NEXT: maxps %xmm0, %xmm4 +; SSE41-NEXT: 
cmpunordps %xmm0, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movaps %xmm3, %xmm2 +; SSE41-NEXT: maxps %xmm1, %xmm2 +; SSE41-NEXT: cmpunordps %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: maxps %xmm4, %xmm1 +; SSE41-NEXT: cmpunordps %xmm4, %xmm4 +; SSE41-NEXT: movaps %xmm4, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: cmpunordss %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: maxss %xmm1, %xmm2 +; SSE41-NEXT: andnps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: maxss %xmm0, %xmm3 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm4 +; SSE41-NEXT: andnps %xmm3, %xmm4 +; SSE41-NEXT: andps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm4, %xmm0 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: maxss %xmm0, %xmm2 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andnps %xmm2, %xmm3 +; SSE41-NEXT: andps %xmm1, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32: ; AVX: # %bb.0: -; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxps %ymm0, %ymm1, %ymm2 +; AVX-NEXT: vcmpunordps %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v16f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; 
AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v16f32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX512BW-NEXT: vmaxss %xmm0, %xmm2, %xmm3 +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm2 +; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vmaxss %xmm2, %xmm0, %xmm2 +; AVX512BW-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm2 +; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm2 +; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v16f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm9 = xmm3[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm10 = xmm3[1,1,3,3] +; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm11 = 
xmm6[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm12 = xmm6[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm6[1,1,3,3] +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm14 = xmm2[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm15 = xmm2[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm16 = xmm2[1,1,3,3] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vmaxss %xmm0, %xmm5, %xmm7 +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovss %xmm5, %xmm7, %xmm7 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm7, %xmm7, %k1 +; AVX512VL-NEXT: vmaxss %xmm7, %xmm4, %xmm0 +; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm2, %xmm0 +; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm16, %xmm0 +; AVX512VL-NEXT: vmovss %xmm16, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm15, %xmm0 +; AVX512VL-NEXT: vmovss %xmm15, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm14, %xmm0 +; AVX512VL-NEXT: vmovss %xmm14, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm13, %xmm0 +; AVX512VL-NEXT: vmovss %xmm13, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm12, %xmm0 +; AVX512VL-NEXT: vmovss %xmm12, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm11, %xmm0 +; AVX512VL-NEXT: vmovss %xmm11, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm10, %xmm0 +; AVX512VL-NEXT: vmovss %xmm10, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm9, %xmm0 +; AVX512VL-NEXT: vmovss %xmm9, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm8, %xmm0 +; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a0) ret float %1 } @@ -186,50 +672,106 @@ define float @test_v16f32(<16 x float> %a0) { define double @test_v2f64(<2 x double> %a0) { ; SSE-LABEL: test_v2f64: ; SSE: # %bb.0: +; SSE-NEXT: movapd %xmm0, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: maxsd %xmm1, %xmm0 +; SSE-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm3 +; SSE-NEXT: andpd %xmm2, %xmm3 +; SSE-NEXT: maxsd %xmm0, %xmm2 +; SSE-NEXT: andnpd %xmm2, %xmm1 +; SSE-NEXT: orpd %xmm3, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: 
vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm0, %xmm2, %xmm1 +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmovapd %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a0) ret double %1 } define double @test_v4f64(<4 x double> %a0) { -; SSE-LABEL: test_v4f64: -; SSE: # %bb.0: -; SSE-NEXT: maxpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: maxsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: maxpd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 +; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: maxsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: maxpd %xmm0, %xmm2 +; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: andpd %xmm1, %xmm3 +; SSE41-NEXT: maxsd %xmm2, %xmm1 +; SSE41-NEXT: andnpd %xmm1, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm0, %xmm3, %xmm4 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvpd %xmm0, %xmm3, %xmm4, %xmm0 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vmaxsd %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm0, %xmm3, %xmm4 +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm4, %xmm4, %k1 +; AVX512-NEXT: vmaxsd %xmm4, %xmm1, %xmm0 +; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmaxsd %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmovsd 
%xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %a0) @@ -237,83 +779,325 @@ define double @test_v4f64(<4 x double> %a0) { } define double @test_v8f64(<8 x double> %a0) { -; SSE-LABEL: test_v8f64: -; SSE: # %bb.0: -; SSE-NEXT: maxpd %xmm3, %xmm1 -; SSE-NEXT: maxpd %xmm2, %xmm0 -; SSE-NEXT: maxpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: maxsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v8f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm2, %xmm4 +; SSE2-NEXT: maxpd %xmm0, %xmm4 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm4, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: movapd %xmm3, %xmm2 +; SSE2-NEXT: maxpd %xmm1, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE2-NEXT: andpd %xmm1, %xmm3 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: maxpd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 +; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: maxsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: maxpd %xmm0, %xmm4 +; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movapd %xmm3, %xmm2 +; SSE41-NEXT: maxpd %xmm1, %xmm2 +; SSE41-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: maxpd %xmm4, %xmm1 +; SSE41-NEXT: cmpunordpd %xmm4, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: andpd %xmm2, %xmm3 +; SSE41-NEXT: maxsd %xmm1, %xmm2 +; SSE41-NEXT: andnpd %xmm2, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f64: ; AVX: # %bb.0: -; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxpd %ymm0, %ymm1, %ymm2 +; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v8f64: -; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 -; 
AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v8f64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm2 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm8 = xmm2[1,0] +; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm7, %xmm1 +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmovsd %xmm7, %xmm1, %xmm1 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512BW-NEXT: vmaxsd %xmm1, %xmm5, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm6, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm3, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm4, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm4, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm8, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v8f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm8 = xmm1[1,0] +; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm7, %xmm2 +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovsd %xmm7, %xmm2, %xmm2 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm2, %xmm2, %k1 +; AVX512VL-NEXT: vmaxsd %xmm2, %xmm5, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm4, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm4, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm8, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> %a0) ret double %1 } define double @test_v16f64(<16 x double> %a0) { -; SSE-LABEL: test_v16f64: -; SSE: # %bb.0: -; SSE-NEXT: maxpd %xmm6, %xmm2 -; SSE-NEXT: maxpd %xmm4, %xmm0 -; SSE-NEXT: maxpd %xmm2, %xmm0 -; SSE-NEXT: maxpd %xmm7, %xmm3 -; SSE-NEXT: maxpd %xmm5, %xmm1 -; SSE-NEXT: maxpd %xmm3, 
%xmm1 -; SSE-NEXT: maxpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: maxsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v16f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm4, %xmm8 +; SSE2-NEXT: maxpd %xmm0, %xmm8 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm4 +; SSE2-NEXT: andnpd %xmm8, %xmm0 +; SSE2-NEXT: orpd %xmm4, %xmm0 +; SSE2-NEXT: movapd %xmm6, %xmm4 +; SSE2-NEXT: maxpd %xmm2, %xmm4 +; SSE2-NEXT: cmpunordpd %xmm2, %xmm2 +; SSE2-NEXT: andpd %xmm2, %xmm6 +; SSE2-NEXT: andnpd %xmm4, %xmm2 +; SSE2-NEXT: orpd %xmm6, %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm4 +; SSE2-NEXT: maxpd %xmm0, %xmm4 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm4, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: movapd %xmm5, %xmm2 +; SSE2-NEXT: maxpd %xmm1, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE2-NEXT: andpd %xmm1, %xmm5 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm5, %xmm1 +; SSE2-NEXT: movapd %xmm7, %xmm2 +; SSE2-NEXT: maxpd %xmm3, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm3, %xmm3 +; SSE2-NEXT: andpd %xmm3, %xmm7 +; SSE2-NEXT: andnpd %xmm2, %xmm3 +; SSE2-NEXT: orpd %xmm7, %xmm3 +; SSE2-NEXT: movapd %xmm3, %xmm2 +; SSE2-NEXT: maxpd %xmm1, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE2-NEXT: andpd %xmm1, %xmm3 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: maxpd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 +; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: maxsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movapd %xmm3, %xmm8 +; SSE41-NEXT: movapd %xmm4, %xmm3 +; SSE41-NEXT: maxpd %xmm0, %xmm3 +; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movapd %xmm6, %xmm4 +; SSE41-NEXT: maxpd %xmm2, %xmm4 +; SSE41-NEXT: cmpunordpd %xmm2, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm2 +; SSE41-NEXT: maxpd %xmm3, %xmm2 +; SSE41-NEXT: cmpunordpd %xmm3, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: movapd %xmm5, %xmm3 +; SSE41-NEXT: maxpd %xmm1, %xmm3 +; SSE41-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 +; SSE41-NEXT: movapd %xmm7, %xmm1 +; SSE41-NEXT: maxpd %xmm8, %xmm1 +; SSE41-NEXT: cmpunordpd %xmm8, %xmm8 +; SSE41-NEXT: movapd %xmm8, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: maxpd %xmm3, %xmm4 +; SSE41-NEXT: cmpunordpd %xmm3, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm1 +; SSE41-NEXT: maxpd %xmm2, %xmm1 +; SSE41-NEXT: cmpunordpd %xmm2, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; 
SSE41-NEXT: andpd %xmm2, %xmm3 +; SSE41-NEXT: maxsd %xmm1, %xmm2 +; SSE41-NEXT: andnpd %xmm2, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f64: ; AVX: # %bb.0: -; AVX-NEXT: vmaxpd %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vmaxpd %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxpd %ymm0, %ymm2, %ymm4 +; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvpd %ymm0, %ymm2, %ymm4, %ymm0 +; AVX-NEXT: vmaxpd %ymm1, %ymm3, %ymm2 +; AVX-NEXT: vcmpunordpd %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vblendvpd %ymm1, %ymm3, %ymm2, %ymm1 +; AVX-NEXT: vmaxpd %ymm0, %ymm1, %ymm2 +; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxpd %zmm0, %zmm1, %zmm2 +; AVX512-NEXT: vcmpunordpd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vmovapd %zmm1, %zmm2 {%k1} +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm2[1,0] +; AVX512-NEXT: vmaxsd %xmm2, %xmm0, %xmm1 +; AVX512-NEXT: vcmpunordsd %xmm2, %xmm2, %k1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm0 +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm0 +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm1, %xmm2, %xmm0 +; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %a0) ret double %1 } +declare float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float>) declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>) +declare float 
@llvm.experimental.vector.reduce.fmax.v3f32(<3 x float>) declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>) declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll index f25852f0c6a85..4354463dfdc28 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll @@ -10,68 +10,176 @@ ; vXf32 ; +define float @test_v1f32(<1 x float> %a0) { +; ALL-LABEL: test_v1f32: +; ALL: # %bb.0: +; ALL-NEXT: retq + %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> %a0) + ret float %1 +} + define float @test_v2f32(<2 x float> %a0) { ; SSE2-LABEL: test_v2f32: ; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: minss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: minss %xmm0, %xmm2 +; SSE41-NEXT: andnps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32: ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX512-NEXT: vminss %xmm0, %xmm2, %xmm1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a0) ret float %1 } +define float @test_v3f32(<3 x float> %a0) { +; SSE2-LABEL: test_v3f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: minss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: minss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v3f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; 
SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: minss %xmm0, %xmm2 +; SSE41-NEXT: andnps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: minss %xmm1, %xmm2 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andnps %xmm2, %xmm3 +; SSE41-NEXT: andps %xmm0, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v3f32: +; AVX: # %bb.0: +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vminss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v3f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vminss %xmm0, %xmm1, %xmm2 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vminss %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: retq + %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float> %a0) + ret float %1 +} + define float @test_v4f32(<4 x float> %a0) { ; SSE2-LABEL: test_v4f32: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: minps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; SSE2-NEXT: minss %xmm3, %xmm0 +; SSE2-NEXT: minss %xmm2, %xmm0 ; SSE2-NEXT: minss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: minps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: minss %xmm3, %xmm0 +; SSE41-NEXT: minss %xmm2, %xmm0 ; SSE41-NEXT: minss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] 
+; AVX512-NEXT: vminss %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a0) @@ -82,43 +190,67 @@ define float @test_v8f32(<8 x float> %a0) { ; SSE2-LABEL: test_v8f32: ; SSE2: # %bb.0: ; SSE2-NEXT: minps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: minps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: minss %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: minss %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: minss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f32: ; SSE41: # %bb.0: ; SSE41-NEXT: minps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: minps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: minss %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: minss %xmm2, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: minss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm7, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm6, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm5, %xmm0, %xmm0 ; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX512-NEXT: vminss %xmm7, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm5, %xmm0, %xmm0 ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm4, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan float 
@llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %a0) @@ -131,12 +263,16 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE2-NEXT: minps %xmm3, %xmm1 ; SSE2-NEXT: minps %xmm2, %xmm0 ; SSE2-NEXT: minps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: minps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: minss %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: minss %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: minss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f32: @@ -144,35 +280,69 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE41-NEXT: minps %xmm3, %xmm1 ; SSE41-NEXT: minps %xmm2, %xmm0 ; SSE41-NEXT: minps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: minps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: minss %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: minss %xmm2, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: minss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32: ; AVX: # %bb.0: ; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vminss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX-NEXT: vminss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vminss %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vminss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vminss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vminps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm9 = xmm1[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm10 = xmm1[1,1,3,3] +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm5 +; AVX512-NEXT: vpermilps {{.*#+}} xmm11 = xmm5[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm12 = xmm5[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm13 = xmm5[1,1,3,3] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX512-NEXT: vpermilps {{.*#+}} xmm14 = xmm3[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm15 = xmm3[1,0] +; 
AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm3[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3] +; AVX512-NEXT: vminss %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm4, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm7, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm15, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm14, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm5, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm13, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm12, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm11, %xmm0, %xmm0 ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm10, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm9, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm8, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a0) @@ -218,18 +388,22 @@ define double @test_v4f64(<4 x double> %a0) { ; AVX-LABEL: test_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vminsd %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminsd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %a0) @@ -250,21 +424,31 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX-LABEL: test_v8f64: ; AVX: # %bb.0: ; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm7, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm5, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm4, %xmm0, %xmm0 ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm2, %xmm0, %xmm0 ; 
AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %a0) @@ -274,12 +458,12 @@ define double @test_v8f64(<8 x double> %a0) { define double @test_v16f64(<16 x double> %a0) { ; SSE-LABEL: test_v16f64: ; SSE: # %bb.0: -; SSE-NEXT: minpd %xmm6, %xmm2 -; SSE-NEXT: minpd %xmm4, %xmm0 -; SSE-NEXT: minpd %xmm2, %xmm0 ; SSE-NEXT: minpd %xmm7, %xmm3 ; SSE-NEXT: minpd %xmm5, %xmm1 ; SSE-NEXT: minpd %xmm3, %xmm1 +; SSE-NEXT: minpd %xmm6, %xmm2 +; SSE-NEXT: minpd %xmm4, %xmm0 +; SSE-NEXT: minpd %xmm2, %xmm0 ; SSE-NEXT: minpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] @@ -291,29 +475,105 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX-NEXT: vminpd %ymm3, %ymm1, %ymm1 ; AVX-NEXT: vminpd %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vminsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vminsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; AVX512-NEXT: vminsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vminsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm0, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> %a0) ret double %1 } +define half @test_v2f16(<2 x half> %a0) nounwind { +; SSE-LABEL: test_v2f16: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $16, %rsp +; SSE-NEXT: movl %edi, %ebx +; SSE-NEXT: movzwl %si, %edi +; SSE-NEXT: callq __gnu_h2f_ieee +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movzwl %bx, %edi +; SSE-NEXT: callq __gnu_h2f_ieee +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: cmpunordss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: andps %xmm3, %xmm2 +; SSE-NEXT: minss %xmm0, %xmm3 +; SSE-NEXT: andnps %xmm3, %xmm1 +; SSE-NEXT: orps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: callq __gnu_f2h_ieee +; SSE-NEXT: addq $16, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2f16: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbx +; AVX-NEXT: subq $16, %rsp +; AVX-NEXT: movl %esi, %ebx +; AVX-NEXT: movzwl %di, %edi +; AVX-NEXT: callq __gnu_h2f_ieee +; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: movzwl %bx, %edi +; AVX-NEXT: callq __gnu_h2f_ieee +; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 
+; AVX-NEXT: # xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vminss %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vcmpunordss %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: callq __gnu_f2h_ieee
+; AVX-NEXT: addq $16, %rsp
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v2f16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movzwl %di, %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: movzwl %si, %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vminss %xmm0, %xmm1, %xmm2
+; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT: retq
+ %1 = call nnan half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half> %a0)
+ ret half %1
+}
+
+declare float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float>)
declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>)
+declare float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float>)
declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>)
declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float>)
@@ -322,3 +582,5 @@ declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>)
declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>)
declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>)
declare double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double>)
+
+declare half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half>)
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
index d6c681f507522..1d7436eaa8a44 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
@@ -13,27 +13,46 @@ define float @test_v2f32(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32:
; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
-; SSE2-NEXT: minss %xmm1, %xmm0
+; SSE2-NEXT: cmpunordss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm3
+; SSE2-NEXT: andps %xmm2, %xmm3
+; SSE2-NEXT: minss %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm2, %xmm1
+; SSE2-NEXT: orps %xmm3, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: minss %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE41-NEXT: movaps %xmm0, %xmm1
+; SSE41-NEXT: cmpunordss %xmm0, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm3
+; SSE41-NEXT: andps %xmm2, %xmm3
+; SSE41-NEXT: minss %xmm0, %xmm2
+; SSE41-NEXT: andnps %xmm2, %xmm1
+; SSE41-NEXT: orps %xmm3, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vminss %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX512-NEXT: vminss %xmm0, %xmm2, %xmm1
+; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
 %1 = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a0)
 ret float %1
@@ -42,37 +61,95 @@ define float @test_v2f32(<2 x float> %a0) {
define float @test_v4f32(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32:
; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm0, %xmm3
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: minps %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
-; SSE2-NEXT: minss %xmm1, %xmm0
+; SSE2-NEXT: cmpunordss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm4
+; SSE2-NEXT: andps %xmm3, %xmm4
+; SSE2-NEXT: minss %xmm0, %xmm3
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; SSE2-NEXT: andnps %xmm3, %xmm1
+; SSE2-NEXT: orps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: minss %xmm1, %xmm3
+; SSE2-NEXT: cmpunordss %xmm1, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm4
+; SSE2-NEXT: andnps %xmm3, %xmm4
+; SSE2-NEXT: andps %xmm2, %xmm1
+; SSE2-NEXT: orps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: minss %xmm1, %xmm2
+; SSE2-NEXT: cmpunordss %xmm1, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm3
+; SSE2-NEXT: andnps %xmm2, %xmm3
+; SSE2-NEXT: andps %xmm0, %xmm1
+; SSE2-NEXT: orps %xmm3, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32:
; SSE41: # %bb.0:
+; SSE41-NEXT: movaps %xmm0, %xmm2
+; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT: minps %xmm1, %xmm0
-; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: minss %xmm1, %xmm0
+; SSE41-NEXT: cmpunordss %xmm0, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm4
+; SSE41-NEXT: andps %xmm3, %xmm4
+; SSE41-NEXT: minss %xmm0, %xmm3
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; SSE41-NEXT: andnps %xmm3, %xmm1
+; SSE41-NEXT: orps %xmm4, %xmm1
+; SSE41-NEXT: movaps %xmm2, %xmm3
+; SSE41-NEXT: minss %xmm1, %xmm3
+; SSE41-NEXT: cmpunordss %xmm1, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm4
+; SSE41-NEXT: andnps %xmm3, %xmm4
+; SSE41-NEXT: andps %xmm2, %xmm1
+; SSE41-NEXT: orps %xmm4, %xmm1
+; SSE41-NEXT: movaps %xmm0, %xmm2
+; SSE41-NEXT: minss %xmm1, %xmm2
+; SSE41-NEXT: cmpunordss %xmm1, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm3
+; SSE41-NEXT: andnps %xmm2, %xmm3
+; SSE41-NEXT: andps %xmm0, %xmm1
+; SSE41-NEXT: orps %xmm3, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX-NEXT: vminss %xmm0, %xmm3, %xmm4
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm3, %xmm4, %xmm0
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3
+; AVX-NEXT: vminss %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vblendvps %xmm3, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vminss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX512-NEXT: vminss %xmm0, %xmm3, %xmm4
+; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1}
+; AVX512-NEXT: vcmpunordss %xmm4, %xmm4, %k1
+; AVX512-NEXT: vminss %xmm4, %xmm2, %xmm0
+; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512-NEXT: vminss %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: retq
 %1 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a0)
 ret float %1
@@ -81,46 +158,170 @@ define float @test_v4f32(<4 x float> %a0) {
define float @test_v8f32(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32:
; SSE2: # %bb.0:
-; SSE2-NEXT: minps %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: minps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm1, %xmm2
+; SSE2-NEXT: minps %xmm0, %xmm2
+; SSE2-NEXT: cmpunordps %xmm0, %xmm0
+; SSE2-NEXT: andps %xmm0, %xmm1
+; SSE2-NEXT: andnps %xmm2, %xmm0
+; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
-; SSE2-NEXT: minss %xmm1, %xmm0
+; SSE2-NEXT: cmpunordss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm3
+; SSE2-NEXT: andps %xmm2, %xmm3
+; SSE2-NEXT: minss %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm2, %xmm1
+; SSE2-NEXT: orps %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: minss %xmm1, %xmm3
+; SSE2-NEXT: cmpunordss %xmm1, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm4
+; SSE2-NEXT: andnps %xmm3, %xmm4
+; SSE2-NEXT: andps %xmm2, %xmm1
+; SSE2-NEXT: orps %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: minss %xmm1, %xmm2
+; SSE2-NEXT: cmpunordss %xmm1, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm3
+; SSE2-NEXT: andnps %xmm2, %xmm3
+; SSE2-NEXT: andps %xmm0, %xmm1
+; SSE2-NEXT: orps %xmm3, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32:
; SSE41: # %bb.0:
-; SSE41-NEXT: minps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT: minps %xmm1, %xmm0
-; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: minss %xmm1, %xmm0
+; SSE41-NEXT: movaps %xmm1, %xmm2
+; SSE41-NEXT: minps %xmm0, %xmm2
+; SSE41-NEXT: cmpunordps %xmm0, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: cmpunordss %xmm2, %xmm0
+; SSE41-NEXT: movaps %xmm0, %xmm3
+; SSE41-NEXT: andps %xmm1, %xmm3
+; SSE41-NEXT: minss %xmm2, %xmm1
+; SSE41-NEXT: andnps %xmm1, %xmm0
+; SSE41-NEXT: orps %xmm3, %xmm0
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE41-NEXT: movaps %xmm1, %xmm3
+; SSE41-NEXT: minss %xmm0, %xmm3
+; SSE41-NEXT: cmpunordss %xmm0, %xmm0
+; SSE41-NEXT: movaps %xmm0, %xmm4
+; SSE41-NEXT: andnps %xmm3, %xmm4
+; SSE41-NEXT: andps %xmm1, %xmm0
+; SSE41-NEXT: orps %xmm4, %xmm0
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: minss %xmm0, %xmm1
+; SSE41-NEXT: cmpunordss %xmm0, %xmm0
+; SSE41-NEXT: movaps %xmm0, %xmm3
+; SSE41-NEXT: andnps %xmm1, %xmm3
+; SSE41-NEXT: andps %xmm2, %xmm0
+; SSE41-NEXT: orps %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
+; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; AVX-NEXT: vminss %xmm0, %xmm7, %xmm2
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm7, %xmm2, %xmm0
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vminss %xmm0, %xmm6, %xmm0
+; AVX-NEXT: vblendvps %xmm2, %xmm6, %xmm0, %xmm0
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vminss %xmm0, %xmm5, %xmm0
+; AVX-NEXT: vblendvps %xmm2, %xmm5, %xmm0, %xmm0
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vminss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1
+; AVX-NEXT: vminss %xmm0, %xmm4, %xmm0
+; AVX-NEXT: vblendvps %xmm1, %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1
+; AVX-NEXT: vminss %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vblendvps %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1
+; AVX-NEXT: vminss %xmm0, %xmm8, %xmm0
+; AVX-NEXT: vblendvps %xmm1, %xmm8, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
-; AVX512-LABEL: test_v8f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: test_v8f32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; AVX512BW-NEXT: vminss %xmm0, %xmm7, %xmm1
+; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512BW-NEXT: vmovss %xmm7, %xmm1, %xmm1 {%k1}
+; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1
+; AVX512BW-NEXT: vminss %xmm1, %xmm6, %xmm0
+; AVX512BW-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512BW-NEXT: vminss %xmm0, %xmm5, %xmm0
+; AVX512BW-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512BW-NEXT: vminss %xmm0, %xmm3, %xmm0
+; AVX512BW-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512BW-NEXT: vminss %xmm0, %xmm4, %xmm0
+; AVX512BW-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512BW-NEXT: vminss %xmm0, %xmm2, %xmm0
+; AVX512BW-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512BW-NEXT: vminss %xmm0, %xmm8, %xmm0
+; AVX512BW-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v8f32:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3]
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3]
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; AVX512VL-NEXT: vminss %xmm0, %xmm7, %xmm2
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm2, %xmm2, %k1
+; AVX512VL-NEXT: vminss %xmm2, %xmm6, %xmm0
+; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm5, %xmm0
+; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm4, %xmm0
+; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm8, %xmm0
+; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
 %1 = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %a0)
 ret float %1
}
@@ -128,53 +329,259 @@ define float @test_v8f32(<8 x float> %a0) {
define float @test_v16f32(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32:
; SSE2: # %bb.0:
-; SSE2-NEXT: minps %xmm3, %xmm1
-; SSE2-NEXT: minps %xmm2, %xmm0
-; SSE2-NEXT: minps %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: minps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm4
+; SSE2-NEXT: minps %xmm0, %xmm4
+; SSE2-NEXT: cmpunordps %xmm0, %xmm0
+; SSE2-NEXT: andps %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm4, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm3, %xmm2
+; SSE2-NEXT: minps %xmm1, %xmm2
+; SSE2-NEXT: cmpunordps %xmm1, %xmm1
+; SSE2-NEXT: andps %xmm1, %xmm3
+; SSE2-NEXT: andnps %xmm2, %xmm1
+; SSE2-NEXT: orps %xmm3, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm2
+; SSE2-NEXT: minps %xmm0, %xmm2
+; SSE2-NEXT: cmpunordps %xmm0, %xmm0
+; SSE2-NEXT: andps %xmm0, %xmm1
+; SSE2-NEXT: andnps %xmm2, %xmm0
+; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
-; SSE2-NEXT: minss %xmm1, %xmm0
+; SSE2-NEXT: cmpunordss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm3
+; SSE2-NEXT: andps %xmm2, %xmm3
+; SSE2-NEXT: minss %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm2, %xmm1
+; SSE2-NEXT: orps %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: minss %xmm1, %xmm3
+; SSE2-NEXT: cmpunordss %xmm1, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm4
+; SSE2-NEXT: andnps %xmm3, %xmm4
+; SSE2-NEXT: andps %xmm2, %xmm1
+; SSE2-NEXT: orps %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: minss %xmm1, %xmm2
+; SSE2-NEXT: cmpunordss %xmm1, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm3
+; SSE2-NEXT: andnps %xmm2, %xmm3
+; SSE2-NEXT: andps %xmm0, %xmm1
+; SSE2-NEXT: orps %xmm3, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32:
; SSE41: # %bb.0:
-; SSE41-NEXT: minps %xmm3, %xmm1
-; SSE41-NEXT: minps %xmm2, %xmm0
-; SSE41-NEXT: minps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT: minps %xmm1, %xmm0
-; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: minss %xmm1, %xmm0
+; SSE41-NEXT: movaps %xmm2, %xmm4
+; SSE41-NEXT: minps %xmm0, %xmm4
+; SSE41-NEXT: cmpunordps %xmm0, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm4
+; SSE41-NEXT: movaps %xmm3, %xmm2
+; SSE41-NEXT: minps %xmm1, %xmm2
+; SSE41-NEXT: cmpunordps %xmm1, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: minps %xmm4, %xmm1
+; SSE41-NEXT: cmpunordps %xmm4, %xmm4
+; SSE41-NEXT: movaps %xmm4, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: cmpunordss %xmm1, %xmm0
+; SSE41-NEXT: movaps %xmm0, %xmm3
+; SSE41-NEXT: andps %xmm2, %xmm3
+; SSE41-NEXT: minss %xmm1, %xmm2
+; SSE41-NEXT: andnps %xmm2, %xmm0
+; SSE41-NEXT: orps %xmm3, %xmm0
+; SSE41-NEXT: movaps %xmm1, %xmm2
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: movaps %xmm2, %xmm3
+; SSE41-NEXT: minss %xmm0, %xmm3
+; SSE41-NEXT: cmpunordss %xmm0, %xmm0
+; SSE41-NEXT: movaps %xmm0, %xmm4
+; SSE41-NEXT: andnps %xmm3, %xmm4
+; SSE41-NEXT: andps %xmm2, %xmm0
+; SSE41-NEXT: orps %xmm4, %xmm0
+; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; SSE41-NEXT: movaps %xmm1, %xmm2
+; SSE41-NEXT: minss %xmm0, %xmm2
+; SSE41-NEXT: cmpunordss %xmm0, %xmm0
+; SSE41-NEXT: movaps %xmm0, %xmm3
+; SSE41-NEXT: andnps %xmm2, %xmm3
+; SSE41-NEXT: andps %xmm1, %xmm0
+; SSE41-NEXT: orps %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32:
; AVX: # %bb.0:
-; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vminps %ymm0, %ymm1, %ymm2
+; AVX-NEXT: vcmpunordps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vminss %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3
+; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vminss %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vminss %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
-; AVX512-LABEL: test_v16f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vminps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: test_v16f32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX512BW-NEXT: vminss %xmm0, %xmm2, %xmm3
+; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1}
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
+; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3
+; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1}
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
+; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3
+; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1}
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
+; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3
+; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1}
+; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
+; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3
+; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1}
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
+; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3
+; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1}
+; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm2
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
+; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3
+; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1}
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
+; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3
+; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1}
+; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
+; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3
+; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1}
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
+; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3
+; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1}
+; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm0
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3]
+; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
+; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm2
+; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1
+; AVX512BW-NEXT: vminss %xmm2, %xmm0, %xmm2
+; AVX512BW-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1
+; AVX512BW-NEXT: vminss %xmm2, %xmm1, %xmm2
+; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1
+; AVX512BW-NEXT: vminss %xmm2, %xmm1, %xmm2
+; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1
+; AVX512BW-NEXT: vminss %xmm2, %xmm1, %xmm0
+; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v16f32:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3]
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm9 = xmm3[1,0]
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm10 = xmm3[1,1,3,3]
+; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm11 = xmm6[3,3,3,3]
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm12 = xmm6[1,0]
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm6[1,1,3,3]
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm14 = xmm2[3,3,3,3]
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm15 = xmm2[1,0]
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm16 = xmm2[1,1,3,3]
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; AVX512VL-NEXT: vminss %xmm0, %xmm5, %xmm7
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmovss %xmm5, %xmm7, %xmm7 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm7, %xmm7, %k1
+; AVX512VL-NEXT: vminss %xmm7, %xmm4, %xmm0
+; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm2, %xmm0
+; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm16, %xmm0
+; AVX512VL-NEXT: vmovss %xmm16, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm15, %xmm0
+; AVX512VL-NEXT: vmovss %xmm15, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm14, %xmm0
+; AVX512VL-NEXT: vmovss %xmm14, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm6, %xmm0
+; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm13, %xmm0
+; AVX512VL-NEXT: vmovss %xmm13, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm12, %xmm0
+; AVX512VL-NEXT: vmovss %xmm12, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm11, %xmm0
+; AVX512VL-NEXT: vmovss %xmm11, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm10, %xmm0
+; AVX512VL-NEXT: vmovss %xmm10, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm9, %xmm0
+; AVX512VL-NEXT: vmovss %xmm9, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm8, %xmm0
+; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
 %1 = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a0)
 ret float %1
}
@@ -186,50 +593,176 @@ define float @test_v16f32(<16 x float> %a0) {
define double @test_v2f64(<2 x double> %a0) {
; SSE-LABEL: test_v2f64:
; SSE: # %bb.0:
+; SSE-NEXT: movapd %xmm0, %xmm2
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: minsd %xmm1, %xmm0
+; SSE-NEXT: cmpunordsd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm3
+; SSE-NEXT: andpd %xmm2, %xmm3
+; SSE-NEXT: minsd %xmm0, %xmm2
+; SSE-NEXT: andnpd %xmm2, %xmm1
+; SSE-NEXT: orpd %xmm3, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512-NEXT: vminsd %xmm0, %xmm2, %xmm1
+; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vmovapd %xmm1, %xmm0
; AVX512-NEXT: retq
 %1 = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a0)
 ret double %1
}
+define double @test_v3f64(<3 x double> %a0) {
+; SSE2-LABEL: test_v3f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],mem[1]
+; SSE2-NEXT: movapd %xmm2, %xmm1
+; SSE2-NEXT: minpd %xmm0, %xmm1
+; SSE2-NEXT: cmpunordpd %xmm0, %xmm0
+; SSE2-NEXT: andpd %xmm0, %xmm2
+; SSE2-NEXT: andnpd %xmm1, %xmm0
+; SSE2-NEXT: orpd %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE2-NEXT: movapd %xmm0, %xmm1
+; SSE2-NEXT: cmpunordsd %xmm0, %xmm1
+; SSE2-NEXT: movapd %xmm1, %xmm3
+; SSE2-NEXT: andpd %xmm2, %xmm3
+; SSE2-NEXT: minsd %xmm0, %xmm2
+; SSE2-NEXT: andnpd %xmm2, %xmm1
+; SSE2-NEXT: orpd %xmm3, %xmm1
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v3f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],mem[1]
+; SSE41-NEXT: movapd %xmm2, %xmm1
+; SSE41-NEXT: minpd %xmm0, %xmm1
+; SSE41-NEXT: cmpunordpd %xmm0, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm2
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm1, %xmm0
+; SSE41-NEXT: movapd %xmm0, %xmm3
+; SSE41-NEXT: andpd %xmm2, %xmm3
+; SSE41-NEXT: minsd %xmm1, %xmm2
+; SSE41-NEXT: andnpd %xmm2, %xmm0
+; SSE41-NEXT: orpd %xmm3, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v3f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3
+; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v3f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vminsd %xmm0, %xmm1, %xmm2
+; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vcmpunordsd %xmm2, %xmm2, %k1
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vminsd %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = call double @llvm.experimental.vector.reduce.fmin.v3f64(<3 x double> %a0)
+ ret double %1
+}
+
define double @test_v4f64(<4 x double> %a0) {
-; SSE-LABEL: test_v4f64:
-; SSE: # %bb.0:
-; SSE-NEXT: minpd %xmm1, %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: minsd %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: test_v4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movapd %xmm1, %xmm2
+; SSE2-NEXT: minpd %xmm0, %xmm2
+; SSE2-NEXT: cmpunordpd %xmm0, %xmm0
+; SSE2-NEXT: andpd %xmm0, %xmm1
+; SSE2-NEXT: andnpd %xmm2, %xmm0
+; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE2-NEXT: movapd %xmm0, %xmm1
+; SSE2-NEXT: cmpunordsd %xmm0, %xmm1
+; SSE2-NEXT: movapd %xmm1, %xmm3
+; SSE2-NEXT: andpd %xmm2, %xmm3
+; SSE2-NEXT: minsd %xmm0, %xmm2
+; SSE2-NEXT: andnpd %xmm2, %xmm1
+; SSE2-NEXT: orpd %xmm3, %xmm1
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v4f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movapd %xmm1, %xmm2
+; SSE41-NEXT: minpd %xmm0, %xmm2
+; SSE41-NEXT: cmpunordpd %xmm0, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: movapd %xmm2, %xmm1
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm2, %xmm0
+; SSE41-NEXT: movapd %xmm0, %xmm3
+; SSE41-NEXT: andpd %xmm1, %xmm3
+; SSE41-NEXT: minsd %xmm2, %xmm1
+; SSE41-NEXT: andnpd %xmm1, %xmm0
+; SSE41-NEXT: orpd %xmm3, %xmm0
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f64:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX-NEXT: vminsd %xmm0, %xmm3, %xmm4
+; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm3, %xmm4, %xmm0
+; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3
+; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm1
+; AVX-NEXT: vminsd %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX512-NEXT: vminsd %xmm0, %xmm3, %xmm4
+; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1}
+; AVX512-NEXT: vcmpunordsd %xmm4, %xmm4, %k1
+; AVX512-NEXT: vminsd %xmm4, %xmm1, %xmm0
+; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512-NEXT: vminsd %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
 %1 = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %a0)
@@ -237,76 +770,316 @@ define double @test_v4f64(<4 x double> %a0) {
}
define double @test_v8f64(<8 x double> %a0) {
-; SSE-LABEL: test_v8f64:
-; SSE: # %bb.0:
-; SSE-NEXT: minpd %xmm3, %xmm1
-; SSE-NEXT: minpd %xmm2, %xmm0
-; SSE-NEXT: minpd %xmm1, %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: minsd %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: test_v8f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movapd %xmm2, %xmm4
+; SSE2-NEXT: minpd %xmm0, %xmm4
+; SSE2-NEXT: cmpunordpd %xmm0, %xmm0
+; SSE2-NEXT: andpd %xmm0, %xmm2
+; SSE2-NEXT: andnpd %xmm4, %xmm0
+; SSE2-NEXT: orpd %xmm2, %xmm0
+; SSE2-NEXT: movapd %xmm3, %xmm2
+; SSE2-NEXT: minpd %xmm1, %xmm2
+; SSE2-NEXT: cmpunordpd %xmm1, %xmm1
+; SSE2-NEXT: andpd %xmm1, %xmm3
+; SSE2-NEXT: andnpd %xmm2, %xmm1
+; SSE2-NEXT: orpd %xmm3, %xmm1
+; SSE2-NEXT: movapd %xmm1, %xmm2
+; SSE2-NEXT: minpd %xmm0, %xmm2
+; SSE2-NEXT: cmpunordpd %xmm0, %xmm0
+; SSE2-NEXT: andpd %xmm0, %xmm1
+; SSE2-NEXT: andnpd %xmm2, %xmm0
+; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE2-NEXT: movapd %xmm0, %xmm1
+; SSE2-NEXT: cmpunordsd %xmm0, %xmm1
+; SSE2-NEXT: movapd %xmm1, %xmm3
+; SSE2-NEXT: andpd %xmm2, %xmm3
+; SSE2-NEXT: minsd %xmm0, %xmm2
+; SSE2-NEXT: andnpd %xmm2, %xmm1
+; SSE2-NEXT: orpd %xmm3, %xmm1
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v8f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movapd %xmm2, %xmm4
+; SSE41-NEXT: minpd %xmm0, %xmm4
+; SSE41-NEXT: cmpunordpd %xmm0, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
+; SSE41-NEXT: movapd %xmm3, %xmm2
+; SSE41-NEXT: minpd %xmm1, %xmm2
+; SSE41-NEXT: cmpunordpd %xmm1, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movapd %xmm2, %xmm1
+; SSE41-NEXT: minpd %xmm4, %xmm1
+; SSE41-NEXT: cmpunordpd %xmm4, %xmm4
+; SSE41-NEXT: movapd %xmm4, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm2
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm1, %xmm0
+; SSE41-NEXT: movapd %xmm0, %xmm3
+; SSE41-NEXT: andpd %xmm2, %xmm3
+; SSE41-NEXT: minsd %xmm1, %xmm2
+; SSE41-NEXT: andnpd %xmm2, %xmm0
+; SSE41-NEXT: orpd %xmm3, %xmm0
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f64:
; AVX: # %bb.0:
-; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vminpd %ymm0, %ymm1, %ymm2
+; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3
+; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
+; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
-; AVX512-LABEL: test_v8f64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: test_v8f64:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm2
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm8 = xmm2[1,0]
+; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm3
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
+; AVX512BW-NEXT: vminsd %xmm0, %xmm7, %xmm1
+; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512BW-NEXT: vmovsd %xmm7, %xmm1, %xmm1 {%k1}
+; AVX512BW-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
+; AVX512BW-NEXT: vminsd %xmm1, %xmm5, %xmm0
+; AVX512BW-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512BW-NEXT: vminsd %xmm0, %xmm6, %xmm0
+; AVX512BW-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512BW-NEXT: vminsd %xmm0, %xmm3, %xmm0
+; AVX512BW-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512BW-NEXT: vminsd %xmm0, %xmm4, %xmm0
+; AVX512BW-NEXT: vmovsd %xmm4, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512BW-NEXT: vminsd %xmm0, %xmm2, %xmm0
+; AVX512BW-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512BW-NEXT: vminsd %xmm0, %xmm8, %xmm0
+; AVX512BW-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v8f64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm1
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm8 = xmm1[1,0]
+; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm3
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
+; AVX512VL-NEXT: vminsd %xmm0, %xmm7, %xmm2
+; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmovsd %xmm7, %xmm2, %xmm2 {%k1}
+; AVX512VL-NEXT: vcmpunordsd %xmm2, %xmm2, %k1
+; AVX512VL-NEXT: vminsd %xmm2, %xmm5, %xmm0
+; AVX512VL-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminsd %xmm0, %xmm6, %xmm0
+; AVX512VL-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminsd %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminsd %xmm0, %xmm4, %xmm0
+; AVX512VL-NEXT: vmovsd %xmm4, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminsd %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminsd %xmm0, %xmm8, %xmm0
+; AVX512VL-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
 %1 = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %a0)
 ret double %1
}
define double @test_v16f64(<16 x double> %a0) {
-; SSE-LABEL: test_v16f64:
-; SSE: # %bb.0:
-; SSE-NEXT: minpd %xmm6, %xmm2
-; SSE-NEXT: minpd %xmm4, %xmm0
-; SSE-NEXT: minpd %xmm2, %xmm0
-; SSE-NEXT: minpd %xmm7, %xmm3
-; SSE-NEXT: minpd %xmm5, %xmm1
-; SSE-NEXT: minpd %xmm3, %xmm1
-; SSE-NEXT: minpd %xmm1, %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: minsd %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: test_v16f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movapd %xmm4, %xmm8
+; SSE2-NEXT: minpd %xmm0, %xmm8
+; SSE2-NEXT: cmpunordpd %xmm0, %xmm0
+; SSE2-NEXT: andpd %xmm0, %xmm4
+; SSE2-NEXT: andnpd %xmm8, %xmm0
+; SSE2-NEXT: orpd %xmm4, %xmm0
+; SSE2-NEXT: movapd %xmm6, %xmm4
+; SSE2-NEXT: minpd %xmm2, %xmm4
+; SSE2-NEXT: cmpunordpd %xmm2, %xmm2
+; SSE2-NEXT: andpd %xmm2, %xmm6
+; SSE2-NEXT: andnpd %xmm4, %xmm2
+; SSE2-NEXT: orpd %xmm6, %xmm2
+; SSE2-NEXT: movapd %xmm2, %xmm4
+; SSE2-NEXT: minpd %xmm0, %xmm4
+; SSE2-NEXT: cmpunordpd %xmm0, %xmm0
+; SSE2-NEXT: andpd %xmm0, %xmm2
+; SSE2-NEXT: andnpd %xmm4, %xmm0
+; SSE2-NEXT: orpd %xmm2, %xmm0
+; SSE2-NEXT: movapd %xmm5, %xmm2
+; SSE2-NEXT: minpd %xmm1, %xmm2
+; SSE2-NEXT: cmpunordpd %xmm1, %xmm1
+; SSE2-NEXT: andpd %xmm1, %xmm5
+; SSE2-NEXT: andnpd %xmm2, %xmm1
+; SSE2-NEXT: orpd %xmm5, %xmm1
+; SSE2-NEXT: movapd %xmm7, %xmm2
+; SSE2-NEXT: minpd %xmm3, %xmm2
+; SSE2-NEXT: cmpunordpd %xmm3, %xmm3
+; SSE2-NEXT: andpd %xmm3, %xmm7
+; SSE2-NEXT: andnpd %xmm2, %xmm3
+; SSE2-NEXT: orpd %xmm7, %xmm3
+; SSE2-NEXT: movapd %xmm3, %xmm2
+; SSE2-NEXT: minpd %xmm1, %xmm2
+; SSE2-NEXT: cmpunordpd %xmm1, %xmm1
+; SSE2-NEXT: andpd %xmm1, %xmm3
+; SSE2-NEXT: andnpd %xmm2, %xmm1
+; SSE2-NEXT: orpd %xmm3, %xmm1
+; SSE2-NEXT: movapd %xmm1, %xmm2
+; SSE2-NEXT: minpd %xmm0, %xmm2
+; SSE2-NEXT: cmpunordpd %xmm0, %xmm0
+; SSE2-NEXT: andpd %xmm0, %xmm1
+; SSE2-NEXT: andnpd %xmm2, %xmm0
+; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE2-NEXT: movapd %xmm0, %xmm1
+; SSE2-NEXT: cmpunordsd %xmm0, %xmm1
+; SSE2-NEXT: movapd %xmm1, %xmm3
+; SSE2-NEXT: andpd %xmm2, %xmm3
+; SSE2-NEXT: minsd %xmm0, %xmm2
+; SSE2-NEXT: andnpd %xmm2, %xmm1
+; SSE2-NEXT: orpd %xmm3, %xmm1
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v16f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movapd %xmm3, %xmm8
+; SSE41-NEXT: movapd %xmm4, %xmm3
+; SSE41-NEXT: minpd %xmm0, %xmm3
+; SSE41-NEXT: cmpunordpd %xmm0, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movapd %xmm6, %xmm4
+; SSE41-NEXT: minpd %xmm2, %xmm4
+; SSE41-NEXT: cmpunordpd %xmm2, %xmm2
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4
+; SSE41-NEXT: movapd %xmm4, %xmm2
+; SSE41-NEXT: minpd %xmm3, %xmm2
+; SSE41-NEXT: cmpunordpd %xmm3, %xmm3
+; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; SSE41-NEXT: movapd %xmm5, %xmm3
+; SSE41-NEXT: minpd %xmm1, %xmm3
+; SSE41-NEXT: cmpunordpd %xmm1, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3
+; SSE41-NEXT: movapd %xmm7, %xmm1
+; SSE41-NEXT: minpd %xmm8, %xmm1
+; SSE41-NEXT: cmpunordpd %xmm8, %xmm8
+; SSE41-NEXT: movapd %xmm8, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm4
+; SSE41-NEXT: minpd %xmm3, %xmm4
+; SSE41-NEXT: cmpunordpd %xmm3, %xmm3
+; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
+; SSE41-NEXT: movapd %xmm4, %xmm1
+; SSE41-NEXT: minpd %xmm2, %xmm1
+; SSE41-NEXT: cmpunordpd %xmm2, %xmm2
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm2
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm1, %xmm0
+; SSE41-NEXT: movapd %xmm0, %xmm3
+; SSE41-NEXT: andpd %xmm2, %xmm3
+; SSE41-NEXT: minsd %xmm1, %xmm2
+; SSE41-NEXT: andnpd %xmm2, %xmm0
+; SSE41-NEXT: orpd %xmm3, %xmm0
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f64:
; AVX: # %bb.0:
-; AVX-NEXT: vminpd %ymm3, %ymm1, %ymm1
-; AVX-NEXT: vminpd %ymm2, %ymm0, %ymm0
-; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vminpd %ymm0, %ymm2, %ymm4
+; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vblendvpd %ymm0, %ymm2, %ymm4, %ymm0
+; AVX-NEXT: vminpd %ymm1, %ymm3, %ymm2
+; AVX-NEXT: vcmpunordpd %ymm1, %ymm1, %ymm1
+; AVX-NEXT: vblendvpd %ymm1, %ymm3, %ymm2, %ymm1
+; AVX-NEXT: vminpd %ymm0, %ymm1, %ymm2
+; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3
+; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
+; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vminpd %zmm0, %zmm1, %zmm2
+; AVX512-NEXT: vcmpunordpd %zmm0, %zmm0, %k1
+; AVX512-NEXT: vmovapd %zmm1, %zmm2 {%k1}
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm2[1,0]
+; AVX512-NEXT: vminsd %xmm2, %xmm0, %xmm1
+; AVX512-NEXT: vcmpunordsd %xmm2, %xmm2, %k1
+; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
+; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm0
+; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
+; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm0
+; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
+; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm0
+; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512-NEXT: vminsd %xmm1, %xmm2, %xmm0
+; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
 %1 = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> %a0)
@@ -319,6 +1092,7 @@ declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>)
declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float>)
declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>)
+declare double @llvm.experimental.vector.reduce.fmin.v3f64(<3 x double>)
declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>)
declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>)
declare double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double>)
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index fb019ffd99e9b..06a428c514a78 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -53,7 +53,7 @@ define i1 @trunc_v2i64_v2i1(<2 x i64>) {
; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: testb %al, %al
+; AVX512VL-NEXT: testb $3, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: retq
 %a = trunc <2 x i64> %0 to <2 x i1>
@@ -103,7 +103,7 @@ define i1 @trunc_v4i32_v4i1(<4 x i32>) {
; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: testb %al, %al
+; AVX512VL-NEXT: testb $15, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: retq
 %a = trunc <4 x i32> %0 to <4 x i1>
@@ -251,7 +251,7 @@ define i1 @trunc_v4i64_v4i1(<4 x i64>) {
; AVX512VL-NEXT: vpsllq $63, %ymm0, %ymm0
; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: testb %al, %al
+; AVX512VL-NEXT: testb $15, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -974,7 +974,7 @@ define i1 @icmp_v2i64_v2i1(<2 x i64>) {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: testb %al, %al
+; AVX512VL-NEXT: testb $3, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: retq
 %a = icmp eq <2 x i64> %0, zeroinitializer
@@ -1025,7 +1025,7 @@ define i1 @icmp_v4i32_v4i1(<4 x i32>) {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: testb %al, %al
+; AVX512VL-NEXT: testb $15, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: retq
 %a = icmp eq <4 x i32> %0, zeroinitializer
@@ -1214,7 +1214,7 @@ define i1 @icmp_v4i64_v4i1(<4 x i64>) {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: testb %al, %al
+; AVX512VL-NEXT: testb $15, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 19d9b159fd830..fb300a88b4120 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -289,31 +289,13 @@ define <16 x i8> @shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31(
}
define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
-; SSE2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
-; SSE41: # %bb.0:
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; SSE: # %bb.0:
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; AVX1: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index ec775e9155721..5eb4b1039bf9f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -2139,9 +2139,9 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_3
;
; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31,u,u]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,3,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,3,3]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
@@ -2161,9 +2161,9 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_3
;
; XOPAVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31]
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31,u,u]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,3,3]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,3,3]
+; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; XOPAVX2-NEXT: retq
 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> 
 ret <16 x i16> %shuffle
@@ -2181,9 +2181,9 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_2
;
; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23,u,u]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,3]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
@@ -2203,9 +2203,9 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_2
;
; XOPAVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23,u,u]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,3]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,3]
+; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; XOPAVX2-NEXT: retq
 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> 
 ret <16 x i16> %shuffle
@@ -5086,10 +5086,9 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_2
;
; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,14,15,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27:
@@ -5110,10 +5109,9 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_2
;
; XOPAVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,14,15,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -5181,10 +5179,10 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_3 ; ; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31: ; AVX2: # %bb.0: -; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,3,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,14,15,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31: @@ -5205,10 +5203,10 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_3 ; ; XOPAVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] +; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,3,3] -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,14,15,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -5283,21 +5281,19 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_3 ; AVX2-SLOW-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7],ymm1[8,9,10,11,12,13,14],ymm2[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,3,7,u,4,7,u,u> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,10,11,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,28,29,u,u,30,31,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,10,11,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: @@ -5320,12 +5316,10 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_3 ; XOPAVX2-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3] -; XOPAVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; XOPAVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14] -; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] -; XOPAVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; XOPAVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15] -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7],ymm1[8,9,10,11,12,13,14],ymm2[15] +; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,3,2,3,4,7,6,7] +; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] +; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -5350,19 +5344,18 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_2 ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7,8],ymm2[9],ymm1[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,0,1,u,u,2,3,u,u,24,25,u,u,26,27,u,u,16,17,u,u,18,19] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = 
ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,0,4,u,6,4,u,u> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,10,11,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,28,29,u,u,30,31,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,10,11,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: @@ -5386,10 +5379,9 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_2 ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3] ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7,8],ymm2[9],ymm1[10,11,12,13,14,15] -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,0,1,u,u,2,3,u,u,24,25,u,u,26,27,u,u,16,17,u,u,18,19] -; XOPAVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; XOPAVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15] -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] +; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -5469,10 +5461,9 @@ define <16 x i16> @shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_1 ; ; AVX2-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11: ; AVX2: # %bb.0: -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; 
AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11: @@ -5494,10 +5485,9 @@ define <16 x i16> @shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_1 ; ; XOPAVX2-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -5516,10 +5506,10 @@ define <16 x i16> @shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_1 ; ; AVX2-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15: ; AVX2: # %bb.0: -; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,3,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15: @@ -5541,10 +5531,10 @@ define <16 x i16> @shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_1 ; ; XOPAVX2-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] +; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] ; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,3,3] -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; XOPAVX2-NEXT: retq 
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index a7e65f10a3604..23bf91de6e7e8 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -2793,16 +2793,16 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_ ; ; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,3,3] +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX2-NEXT: retq ; ; AVX512VLBW-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31] -; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,3,3] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,3,3] +; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: @@ -2822,9 +2822,9 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_ ; ; XOPAVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31] -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,3,3] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,3,3] +; XOPAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -2842,16 +2842,16 @@ define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_ ; ; AVX2-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,3] +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX2-NEXT: retq ; ; AVX512VLBW-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero -; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23] -; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,3] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,3] +; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: @@ -2871,9 +2871,9 @@ define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_ ; ; XOPAVX2-LABEL: 
shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23] -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero -; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,3] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,3] +; XOPAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -3316,7 +3316,6 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; AVX512VLBW-FAST-NEXT: kmovd %eax, %k1 ; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[u,u,0,1,u,u,u,u,5,10,13,u,u,0,u,u,16,23,u,23,u,u,u,u,u,u,u,27,u,u,u,u] ; AVX512VLBW-FAST-NEXT: retq - ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: ; AVX512VLVBMI: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll index ac6701b383f25..2b76d668f5fe2 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -67,16 +67,16 @@ define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_1 ; KNL: ## %bb.0: ; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; KNL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,1,2,3] -; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,6,7,u,u,12,13,u,u,2,3,u,u,0,1,u,u,22,23,u,u,20,21,u,u,18,19,u,u,u,u] +; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[6,7,12,13,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,u,u,u,u,u,u,u,u,u,u] ; KNL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3] -; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,u,u,4,5,u,u,2,3,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17,u,u] -; KNL-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7],ymm0[8],ymm3[9],ymm0[10],ymm3[11],ymm0[12],ymm3[13],ymm0[14],ymm3[15] +; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,4,5,2,3,8,9,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u] +; KNL-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] ; KNL-NEXT: vextracti32x4 $3, %zmm1, %xmm1 ; KNL-NEXT: vpbroadcastw %xmm1, %ymm1 ; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7],ymm3[8,9,10,11,12,13,14],ymm1[15] ; KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,6,7,u,u,12,13,u,u,2,3,u,u,0,1,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17] -; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] 
+; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,12,13,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u] +; KNL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] ; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll index ccf1476e6a657..422f64d982bfb 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -596,6 +596,21 @@ define void @test_demandedelts_pshufb_v32i8_v16i8(<2 x i32>* %src, <8 x i32>* %d ret void } +define <32 x float> @PR47534(<8 x float> %tmp) { +; CHECK-LABEL: PR47534: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [7,25,26,27,7,29,30,31,7,25,26,27,7,29,30,31] +; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 +; CHECK-NEXT: ret{{[l|q]}} + %tmp1 = shufflevector <8 x float> %tmp, <8 x float> undef, <32 x i32> + %tmp2 = shufflevector <32 x float> , <32 x float> undef, <32 x i32> + %tmp18 = shufflevector <32 x float> %tmp2, <32 x float> %tmp1, <32 x i32> + ret <32 x float> %tmp18 +} + %union1= type { <16 x float> } @src1 = external dso_local local_unnamed_addr global %union1, align 64 diff --git a/llvm/test/CodeGen/X86/vmaskmov-offset.ll b/llvm/test/CodeGen/X86/vmaskmov-offset.ll index 03fead64bc29e..a67dcce037508 100644 --- a/llvm/test/CodeGen/X86/vmaskmov-offset.ll +++ b/llvm/test/CodeGen/X86/vmaskmov-offset.ll @@ -52,3 +52,31 @@ bb: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %masked_loaded_vec, <8 x double>* nonnull %stack_output_vec, i32 4, <8 x i1> %mask) ret void } + +define <2 x double> @mload_constmask_v2f64(<2 x double>* %addr, <2 x double> %dst) { + ; CHECK-LABEL: name: mload_constmask_v2f64 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK: liveins: $rdi, $xmm0 + ; CHECK: [[COPY:%[0-9]+]]:vr128 = COPY $xmm0 + ; CHECK: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi + ; CHECK: [[VMOVHPDrm:%[0-9]+]]:vr128 = VMOVHPDrm [[COPY]], [[COPY1]], 1, $noreg, 8, $noreg :: (load 8 from %ir.addr + 8, align 4) + ; CHECK: $xmm0 = COPY [[VMOVHPDrm]] + ; CHECK: RET 0, $xmm0 + %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> , <2 x double> %dst) + ret <2 x double> %res +} + +define void @one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) { + ; CHECK-LABEL: name: one_mask_bit_set2 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK: liveins: $rdi, $xmm0 + ; CHECK: [[COPY:%[0-9]+]]:vr128 = COPY $xmm0 + ; CHECK: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi + ; CHECK: VEXTRACTPSmr [[COPY1]], 1, $noreg, 8, $noreg, [[COPY]], 2 :: (store 4 into %ir.addr + 8) + ; CHECK: RET 0 + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1>) + ret void +} + +declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>) +declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) diff --git a/llvm/test/DebugInfo/MIR/Mips/last-inst-bundled.mir b/llvm/test/DebugInfo/MIR/Mips/last-inst-bundled.mir index 1187dd4331408..ed7360a68da49 100644 --- a/llvm/test/DebugInfo/MIR/Mips/last-inst-bundled.mir +++ b/llvm/test/DebugInfo/MIR/Mips/last-inst-bundled.mir @@ -21,7 +21,7 @@ # # Check that last bundled instruction of block gets 
recognized as end of basic block. # CHECK: bb.2.if.end -# CHECK-NEXT: DBG_VALUE $s0, $noreg, !12, !DIExpression(), debug-location !17 +# CHECK-NEXT: DBG_VALUE $s0, $noreg, !12, !DIExpression() --- | ; ModuleID = '' diff --git a/llvm/test/DebugInfo/MIR/X86/kill-after-spill.mir b/llvm/test/DebugInfo/MIR/X86/kill-after-spill.mir index d85be7f6d8048..fb5503d7e086e 100644 --- a/llvm/test/DebugInfo/MIR/X86/kill-after-spill.mir +++ b/llvm/test/DebugInfo/MIR/X86/kill-after-spill.mir @@ -14,8 +14,8 @@ # ... # # CHECK: bb.1.if.end: -# CHECK: DBG_VALUE $rbp, 0, !37, !DIExpression(DW_OP_constu, 44, DW_OP_minus), debug-location !58 -# CHECK-NOT: DBG_VALUE $rbp, 0, !36, !DIExpression(DW_OP_constu, 48, DW_OP_minus), debug-location !57 +# CHECK: DBG_VALUE $rbp, 0, !37, !DIExpression(DW_OP_constu, 44, DW_OP_minus) +# CHECK-NOT: DBG_VALUE $rbp, 0, !36, !DIExpression(DW_OP_constu, 48, DW_OP_minus) --- | ; ModuleID = '' @@ -283,7 +283,7 @@ body: | $r13 = MOV64rr $rax renamable $ecx = XOR32rr undef $ecx, undef $ecx, implicit-def dead $eflags renamable $r13 = AND64rr killed renamable $r13, renamable $r14, implicit-def $eflags - JCC_1 %bb.9, 4, implicit $eflags + JCC_1 %bb.9, 4, implicit $eflags, debug-location !57 bb.1.if.end: successors: %bb.2(0x30000000), %bb.3(0x50000000) @@ -301,7 +301,7 @@ body: | $r12 = MOV64rr $rax $r15 = MOV64rr $r12 renamable $r15 = AND64ri8 killed renamable $r15, -123, implicit-def $eflags - JCC_1 %bb.2, 4, implicit $eflags + JCC_1 %bb.2, 4, implicit $eflags, debug-location !57 bb.3.private.exit: successors: %bb.9(0x30000000), %bb.4(0x50000000) @@ -316,7 +316,7 @@ body: | CALL64pcrel32 @func4, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax renamable $ecx = MOV32ri 1 TEST32rr killed renamable $eax, renamable $eax, implicit-def $eflags - JCC_1 %bb.9, 4, implicit $eflags + JCC_1 %bb.9, 4, implicit $eflags, debug-location !57 bb.4.if.then8: successors: %bb.8(0x30000000), %bb.5(0x50000000) @@ -327,21 +327,21 @@ body: | CALL64pcrel32 @func5, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit $esi, implicit-def $rsp, implicit-def $ssp renamable $rax = MOV64rm killed renamable $r13, 1, $noreg, 8, $noreg :: (load 8 from %ir.13) TEST64rr renamable $rax, renamable $rax, implicit-def $eflags - JCC_1 %bb.8, 4, implicit $eflags + JCC_1 %bb.8, 4, implicit $eflags, debug-location !57 bb.5.land.lhs.true: successors: %bb.6(0x30000000), %bb.7(0x50000000) liveins: $rax, $r12, $r15 CMP32mi8 renamable $r15, 1, $noreg, 0, $noreg, 0, implicit-def $eflags :: (load 4 from %ir.tot_perf2, align 8) - JCC_1 %bb.7, 5, implicit $eflags + JCC_1 %bb.7, 5, implicit $eflags, debug-location !57 bb.6.lor.lhs.false: successors: %bb.8(0x30000000), %bb.7(0x50000000) liveins: $rax, $r12, $r15 CMP32mi8 killed renamable $r15, 1, $noreg, 4, $noreg, 0, implicit-def $eflags :: (load 4 from %ir.tot_bw) - JCC_1 %bb.8, 4, implicit $eflags + JCC_1 %bb.8, 4, implicit $eflags, debug-location !57 bb.7.if.then14: successors: %bb.8(0x80000000) @@ -350,13 +350,13 @@ body: | renamable $rdx = MOV64rm killed renamable $rax, 1, $noreg, 8, $noreg :: (load 8 from %ir.20) $rdi = MOV64rr killed $r12 $esi = MOV32rm $rbp, 1, $noreg, -44, $noreg :: (load 4 from %stack.1) - CALL64pcrel32 @func6, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit $esi, implicit $rdx, implicit-def $rsp, implicit-def $ssp + CALL64pcrel32 @func6, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit $esi, implicit $rdx, implicit-def $rsp, implicit-def $ssp, debug-location !57 
bb.8.cleanup: successors: %bb.9(0x80000000) renamable $ecx = MOV32ri 1 - JMP_1 %bb.9 + JMP_1 %bb.9, debug-location !57 bb.2.if.then3: successors: %bb.9(0x80000000) @@ -369,7 +369,7 @@ body: | $edx = MOV32ri 5 $r8d = MOV32rm $rbp, 1, $noreg, -48, $noreg :: (load 4 from %stack.0) CALL64pcrel32 @func3, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit $esi, implicit $edx, implicit $rcx, implicit $r8d, implicit-def $rsp, implicit-def $ssp - renamable $ecx = XOR32rr undef $ecx, undef $ecx, implicit-def dead $eflags + renamable $ecx = XOR32rr undef $ecx, undef $ecx, implicit-def dead $eflags, debug-location !57 bb.9.cleanup: liveins: $ecx @@ -382,6 +382,6 @@ body: | $r14 = POP64r implicit-def $rsp, implicit $rsp $r15 = POP64r implicit-def $rsp, implicit $rsp $rbp = POP64r implicit-def $rsp, implicit $rsp - RETQ $eax + RETQ $eax, debug-location !57 ... diff --git a/llvm/test/DebugInfo/MIR/X86/live-debug-values-3preds.mir b/llvm/test/DebugInfo/MIR/X86/live-debug-values-3preds.mir index c55269951aa50..bef0f4e4aa5ab 100644 --- a/llvm/test/DebugInfo/MIR/X86/live-debug-values-3preds.mir +++ b/llvm/test/DebugInfo/MIR/X86/live-debug-values-3preds.mir @@ -31,9 +31,9 @@ # DBG_VALUE for variables "x", "y" and "z" are extended into %bb.9 from its # predecessors %bb.0, %bb.2 and %bb.8. # CHECK: bb.9.for.end: -# CHECK-DAG: DBG_VALUE $edi, $noreg, ![[X_VAR]], !DIExpression(), debug-location !{{[0-9]+}} -# CHECK-DAG: DBG_VALUE $esi, $noreg, ![[Y_VAR]], !DIExpression(), debug-location !{{[0-9]+}} -# CHECK-DAG: DBG_VALUE $edx, $noreg, ![[Z_VAR]], !DIExpression(), debug-location !{{[0-9]+}} +# CHECK-DAG: DBG_VALUE $edi, $noreg, ![[X_VAR]], !DIExpression() +# CHECK-DAG: DBG_VALUE $esi, $noreg, ![[Y_VAR]], !DIExpression() +# CHECK-DAG: DBG_VALUE $edx, $noreg, ![[Z_VAR]], !DIExpression() # CHECK: RET --- | diff --git a/llvm/test/DebugInfo/MIR/X86/live-debug-values-bad-transfer.mir b/llvm/test/DebugInfo/MIR/X86/live-debug-values-bad-transfer.mir index 1d978b9c45532..97fad0755b80e 100644 --- a/llvm/test/DebugInfo/MIR/X86/live-debug-values-bad-transfer.mir +++ b/llvm/test/DebugInfo/MIR/X86/live-debug-values-bad-transfer.mir @@ -1,4 +1,5 @@ # RUN: llc %s -mtriple=x86_64-unknown-unknown -o - -run-pass=livedebugvalues | FileCheck %s --implicit-check-not=DBG_VALUE +# RUN: llc %s -mtriple=x86_64-unknown-unknown -o - -run-pass=livedebugvalues -experimental-debug-variable-locations | FileCheck %s -check-prefix=NEWLDV --implicit-check-not=DBG_VALUE # # Test that the DBG_VALUE of ecx below does not get propagated. It is considered # live-in on LiveDebugValues' first pass through the loop, but on the second it @@ -17,6 +18,13 @@ # CHECK-LABEL: bb.1.loop: # CHECK: $ebx = COPY killed $ecx # CHECK-NEXT: DBG_VALUE +# +# This doesn't occur under value-tracking LiveDebugValues though. 
+# +# NEWLDV-LABEL: name: foo +# NEWLDV-LABEL: bb.0.entry: +# NEWLDV: $ecx = MOV32ri 0 +# NEWLDV-NEXT: DBG_VALUE --- | source_filename = "live-debug-values-remove-range.ll" @@ -74,30 +82,30 @@ body: | CFI_INSTRUCTION def_cfa_offset 16 CFI_INSTRUCTION offset $rbx, -16 $ebx = MOV32rr $edi - $eax = MOV32ri 0 - $ecx = MOV32ri 0 + $eax = MOV32ri 0, debug-location !10 + $ecx = MOV32ri 0, debug-location !10 DBG_VALUE $ecx, $noreg, !9, !DIExpression(), debug-location !10 - $edi = MOV32ri 0 - $esi = MOV32ri 0 + $edi = MOV32ri 0, debug-location !10 + $esi = MOV32ri 0, debug-location !10 bb.1.loop: successors: %bb.1, %bb.2 liveins: $ebx, $eax, $ecx, $edi, $esi - $eax = COPY $ecx - $ebx = COPY killed $ecx - $ecx = COPY killed $edi - $edi = COPY killed $esi - $esi = MOV32ri 1 + $eax = COPY $ecx, debug-location !10 + $ebx = COPY killed $ecx, debug-location !10 + $ecx = COPY killed $edi, debug-location !10 + $edi = COPY killed $esi, debug-location !10 + $esi = MOV32ri 1, debug-location !10 TEST8ri killed renamable $al, 1, implicit-def $eflags - JCC_1 %bb.1, 5, implicit killed $eflags + JCC_1 %bb.1, 5, implicit killed $eflags, debug-location !10 bb.2.exit: liveins: $ebx - $eax = MOV32rr killed $ebx + $eax = MOV32rr killed $ebx, debug-location !10 $rbx = frame-destroy POP64r implicit-def $rsp, implicit $rsp CFI_INSTRUCTION def_cfa_offset 8 - RETQ $eax + RETQ $eax, debug-location !10 ... diff --git a/llvm/test/DebugInfo/MIR/X86/live-debug-values.mir b/llvm/test/DebugInfo/MIR/X86/live-debug-values.mir index 2cf52611bafd1..2731eac26ecdd 100644 --- a/llvm/test/DebugInfo/MIR/X86/live-debug-values.mir +++ b/llvm/test/DebugInfo/MIR/X86/live-debug-values.mir @@ -35,7 +35,7 @@ # CHECK: ![[N_VAR:[0-9]+]] = !DILocalVariable(name: "n",{{.*}}) # # CHECK: bb.5.if.end.7: -# CHECK: DBG_VALUE $ebx, $noreg, ![[N_VAR]], !DIExpression(), debug-location !{{[0-9]+}} +# CHECK: DBG_VALUE $ebx, $noreg, ![[N_VAR]], !DIExpression() --- | diff --git a/llvm/test/DebugInfo/MIR/X86/livedebugvalues-ignores-metaInstructions.mir b/llvm/test/DebugInfo/MIR/X86/livedebugvalues-ignores-metaInstructions.mir index e8c3a994e59d0..89c7d55d95c6e 100644 --- a/llvm/test/DebugInfo/MIR/X86/livedebugvalues-ignores-metaInstructions.mir +++ b/llvm/test/DebugInfo/MIR/X86/livedebugvalues-ignores-metaInstructions.mir @@ -6,11 +6,11 @@ ; CHECK-LABEL: bb.0.entry: ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 ; CHECK-LABEL: bb.1.bb1: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.2.bb2: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.3.bb3: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() define i32 @_Z8bb_to_bb() local_unnamed_addr !dbg !12 { entry: diff --git a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond.mir b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond.mir index 4004199ad0482..89b4ac63e08a1 100644 --- a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond.mir +++ b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond.mir @@ -5,13 +5,13 @@ ; a diamond that doesn't move or clobber their locations. 
; CHECK-LABEL: bb.0.entry: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.1.bb1: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.2.bb2: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.3.bb3: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() define i32 @_Z8bb_to_bb() local_unnamed_addr !dbg !12 { entry: diff --git a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond_match_clobber.mir b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond_match_clobber.mir index 063b7f450e08e..bd6dacc2fed1a 100644 --- a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond_match_clobber.mir +++ b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond_match_clobber.mir @@ -5,12 +5,12 @@ ; a diamond when the location is clobbered and not into the successor block. ; CHECK-LABEL: bb.0.entry: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.1.bb1: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-NEXT: $ebx = MOV32ri 0, debug-location !17 ; CHECK-LABEL: bb.2.bb2: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-NEXT: $ebx = MOV32ri 0, debug-location !17 define i32 @_Z8bb_to_bb() local_unnamed_addr !dbg !12 { diff --git a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond_match_move.mir b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond_match_move.mir index 8e530c89db621..05a1955532aaa 100644 --- a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond_match_move.mir +++ b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond_match_move.mir @@ -5,17 +5,17 @@ ; diamond CFG when the location is moved by another instruction. 
; CHECK-LABEL: bb.0.entry: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.1.bb1: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-NEXT: $eax = MOV32ri 0, debug-location !17 - ; CHECK-NEXT: DBG_VALUE $eax, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK-NEXT: DBG_VALUE $eax, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.2.bb2: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-NEXT: $eax = MOV32ri 0, debug-location !17 - ; CHECK-NEXT: DBG_VALUE $eax, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK-NEXT: DBG_VALUE $eax, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.3.bb3: - ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression() define i32 @_Z8bb_to_bb() local_unnamed_addr !dbg !12 { entry: diff --git a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond_one_clobber.mir b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond_one_clobber.mir index a89546800a217..ee843492c7b95 100644 --- a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond_one_clobber.mir +++ b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond_one_clobber.mir @@ -5,11 +5,11 @@ ; of a diamond CFG that clobbers its location. ; CHECK-LABEL: bb.0.entry: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.1.bb1: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.2.bb2: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() define i32 @_Z8bb_to_bb() local_unnamed_addr !dbg !12 { entry: diff --git a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond_one_move.mir b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond_one_move.mir index 4b9b70455407b..fe3924bf846ae 100644 --- a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond_one_move.mir +++ b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_diamond_one_move.mir @@ -5,13 +5,13 @@ ; of a diamond CFG that moves its location. ; CHECK-LABEL: bb.0.entry: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.1.bb1: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-NEXT: $eax = MOV32ri 0, debug-location !17 - ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.2.bb2: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() define i32 @_Z8bb_to_bb() local_unnamed_addr !dbg !12 { entry: diff --git a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_loop.mir b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_loop.mir index ba2d31ea0b462..d7eb4bd48ab3a 100644 --- a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_loop.mir +++ b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_basic_loop.mir @@ -5,13 +5,13 @@ ; loop that doesn't move or clobber its location. 
; CHECK-LABEL: bb.0.entry: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.1.bb1: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.2.bb2: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.3.bb3: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() define i32 @_Z8bb_to_bb() local_unnamed_addr !dbg !12 { entry: diff --git a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_bb_to_bb.mir b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_bb_to_bb.mir index 2801df4832e33..f48940a24861b 100644 --- a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_bb_to_bb.mir +++ b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_bb_to_bb.mir @@ -5,13 +5,13 @@ ; sequential CFG. ; CHECK-LABEL: bb.0.entry: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.1.bb1: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.2.bb2: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.3.bb3: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() define i32 @_Z8bb_to_bb() local_unnamed_addr !dbg !12 { entry: diff --git a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_bb_to_bb_clobbered.mir b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_bb_to_bb_clobbered.mir index d1cacff032e13..f969179b76a7d 100644 --- a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_bb_to_bb_clobbered.mir +++ b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_bb_to_bb_clobbered.mir @@ -5,9 +5,9 @@ ; control flow when its location is clobbered. ; CHECK-LABEL: bb.0.entry: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.1.bb1: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() define i32 @_Z8bb_to_bb() local_unnamed_addr !dbg !12 { entry: diff --git a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_bb_to_bb_move_to_clobber.mir b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_bb_to_bb_move_to_clobber.mir index c1cb8d5daa958..339d21380fa64 100644 --- a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_bb_to_bb_move_to_clobber.mir +++ b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_bb_to_bb_move_to_clobber.mir @@ -5,13 +5,13 @@ ; no control flow when a location is moved and then clobbered. 
; CHECK-LABEL: bb.0.entry: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.1.bb1: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-NEXT: $eax = MOV32ri 0, debug-location !17 - ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.2.bb2: - ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.3.bb3: define i32 @_Z8bb_to_bb() local_unnamed_addr !dbg !12 { diff --git a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_load_in_loop.mir b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_load_in_loop.mir new file mode 100644 index 0000000000000..97af3bf502196 --- /dev/null +++ b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_load_in_loop.mir @@ -0,0 +1,113 @@ +--- | + ; RUN: llc %s -march=x86-64 -run-pass=livedebugvalues -o - -experimental-debug-variable-locations -emulate-old-livedebugvalues=0 | FileCheck %s -implicit-check-not=DBG_VALUE + + ; Sometimes, variables can have multiple locations, and when control flow + ; merges LiveDebugValues has a hard time picking which one the variable lives + ; in. Test two of these scenarios that old LiveDebugValues can't handle: when + ; a value is in two registers, and when a value is both in a register and + ; on the stack. + + ; In a register: + + ; CHECK-LABEL: bb.0.entry: + ; CHECK: DBG_VALUE $rdi, $noreg, !16, !DIExpression() + ; CHECK-LABEL: bb.1.bb1: + ; CHECK: DBG_VALUE $rbp, $noreg, !16, !DIExpression() + ; CHECK: DBG_VALUE $rbp, $noreg, !16, !DIExpression() + ; CHECK-LABEL: bb.2.bb2: + ; CHECK: DBG_VALUE $rbp, $noreg, !16, !DIExpression() + ; CHECK-LABEL: bb.3.bb3: + ; CHECK: DBG_VALUE $rbp, $noreg, !16, !DIExpression() + + ; On the stack: we move from $rbp to a stack slot in bb4, but join back on + ; $rbp in bb6. 
+ + ; CHECK-LABEL: bb.4: + ; CHECK: DBG_VALUE $rbp, $noreg, !16, !DIExpression() + ; CHECK: DBG_VALUE $rsp, 0, !16, !DIExpression() + ; CHECK-LABEL: bb.5: + ; CHECK: DBG_VALUE $rbp, $noreg, !16, !DIExpression() + ; CHECK-LABEL: bb.6: + ; CHECK: DBG_VALUE $rbp, $noreg, !16, !DIExpression() + + declare i64 @bees(i64 %arg); + + define i32 @_Z8bb_to_bb(i64 %arg) local_unnamed_addr !dbg !12 { + entry: + br label %bb1, !dbg !17 + bb1: + br label %bb2, !dbg !17 + bb2: + br label %bb3, !dbg !17 + bb3: + ret i32 0, !dbg !17 + } + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!7, !8, !9, !10} + !llvm.ident = !{!11} + !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !3, debugInfoForProfiling: true, nameTableKind: None) + !1 = !DIFile(filename: "main.cpp", directory: "F:\") + !2 = !{} + !3 = !{!4} + !4 = !DIGlobalVariableExpression(var: !5, expr: !DIExpression()) + !5 = distinct !DIGlobalVariable(name: "start", scope: !0, file: !1, line: 4, type: !6, isLocal: false, isDefinition: true) + !6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !7 = !{i32 2, !"Dwarf Version", i32 4} + !8 = !{i32 2, !"Debug Info Version", i32 3} + !9 = !{i32 1, !"wchar_size", i32 2} + !10 = !{i32 7, !"PIC Level", i32 2} + !11 = !{!"clang version 10.0.0"} + !12 = distinct !DISubprogram(name: "bb_to_bb", linkageName: "bb_to_bb", scope: !1, file: !1, line: 6, type: !13, scopeLine: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !15) + !13 = !DISubroutineType(types: !14) + !14 = !{!6, !6} + !15 = !{!16} + !16 = !DILocalVariable(name: "myVar", scope: !12, file: !1, line: 7, type: !6) + !17 = !DILocation(line: 10, scope: !12) + +... +--- +name: _Z8bb_to_bb +tracksRegLiveness: true +liveins: + - { reg: '$rdi', virtual-reg: '' } +stack: + - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + bb.0.entry: + liveins: $rdi + successors: %bb.1, %bb.2 + DBG_VALUE $rdi, $noreg, !16, !DIExpression(), debug-location !17 + $rbp = MOV64rr $rdi, debug-location !17 + dead $rcx = MOV64ri 0, debug-location !17 + CALL64pcrel32 @bees, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $rax, debug-location !17 + CMP64ri8 renamable $rax, 1, implicit-def $eflags, debug-location !17 + JCC_1 %bb.2, 4, implicit killed $eflags, debug-location !17 + bb.1.bb1: + liveins: $rax, $rbp + successors: %bb.3 + $rbp = MOV64ri 0, debug-location !17 + DBG_VALUE $rbp, $noreg, !16, !DIExpression(), debug-location !17 + JMP_1 %bb.3 + bb.2.bb2: + liveins: $rax, $rbp + successors: %bb.3 + $rax = MOV64ri 0, debug-location !17 + bb.3.bb3: + liveins: $rax, $rbp + $rdi = MOV64rr $rbp, debug-location !17 + CALL64pcrel32 @bees, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $rax, debug-location !17 + CMP64ri8 renamable $rax, 1, implicit-def $eflags, debug-location !17 + JCC_1 %bb.5, 4, implicit killed $eflags, debug-location !17 + bb.4: + liveins: $rax, $rbp + MOV64mr $rsp, 1, $noreg, 8, $noreg, killed renamable $rbp :: (store 8 into %stack.0) + JMP_1 %bb.6 + bb.5: + liveins: $rax, $rbp + bb.6: + liveins: $rax, $rbp + RETQ $rax, debug-location !17 +... 
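The multi-location join that livedebugvalues_load_in_loop.mir pins down can be sketched outside of LLVM: at a control-flow merge, the only machine locations worth re-emitting a DBG_VALUE for are those every predecessor agrees still hold the variable's value. The spill in bb.4 adds a stack copy while the value also stays in $rbp, so $rbp remains common to both paths into bb.6. The C++ below is a minimal standalone sketch of that meet operation only; VarLocMap, meet, and the location strings are invented for illustration and are not LiveDebugValues' actual data structures.

#include <iostream>
#include <map>
#include <set>
#include <string>

// Locations (registers or spill slots) known to hold each variable's
// current value. All types and names here are illustrative only.
using VarLocMap = std::map<std::string, std::set<std::string>>;

// Meet of two predecessor states: a location survives the join only if
// both predecessors agree the variable's value lives there.
static VarLocMap meet(const VarLocMap &A, const VarLocMap &B) {
  VarLocMap Out;
  for (const auto &[Var, LocsA] : A) {
    auto It = B.find(Var);
    if (It == B.end())
      continue; // Variable unknown on one path: nothing can be emitted.
    std::set<std::string> Common;
    for (const std::string &Loc : LocsA)
      if (It->second.count(Loc))
        Common.insert(Loc);
    if (!Common.empty())
      Out[Var] = Common;
  }
  return Out;
}

int main() {
  // Path through bb.4: myVar was spilled, so it is on the stack and
  // still in $rbp.
  VarLocMap SpillPath{{"myVar", {"$rbp", "stack.0"}}};
  // Path through bb.5: myVar is only in $rbp.
  VarLocMap RegPath{{"myVar", {"$rbp"}}};
  // The join at bb.6 keeps $rbp, so a DBG_VALUE can be reissued there.
  for (const auto &[Var, Locs] : meet(SpillPath, RegPath))
    for (const std::string &Loc : Locs)
      std::cout << Var << " @ " << Loc << "\n";
}

This prints "myVar @ $rbp": the register survives the join while the stack slot, known on only one path, is dropped. A tracker that keeps a single location per variable sees the two predecessors disagree and has to drop the variable entirely, which is the old-LiveDebugValues limitation the test's comments describe.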
diff --git a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_break.mir b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_break.mir index 7860517adaf08..0d9cc1905134a 100644 --- a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_break.mir +++ b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_break.mir @@ -5,15 +5,15 @@ ; break. ; CHECK-LABEL: bb.0.entry: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.1.bb1: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.2.bb2: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.3.bb3: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.4.bb4: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() define i32 @_Z8bb_to_bb() local_unnamed_addr !dbg !12 { entry: diff --git a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_diamond.mir b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_diamond.mir index 9854e05e20dca..1e410054dc1cb 100644 --- a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_diamond.mir +++ b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_diamond.mir @@ -5,17 +5,17 @@ ; diamond pattern and beyond. ; CHECK-LABEL: bb.0.entry: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.1.bb1: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.2.bb2: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.3.bb3: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.4.bb4: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.5.bb5: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() define i32 @_Z8bb_to_bb() local_unnamed_addr !dbg !12 { entry: diff --git a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_diamond_move.mir b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_diamond_move.mir index ed7bdcffd881b..7861e7dfa9c62 100644 --- a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_diamond_move.mir +++ b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_diamond_move.mir @@ -5,17 +5,17 @@ ; diamond pattern but not beyond. 
; CHECK-LABEL: bb.0.entry: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.1.bb1: - ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.2.bb2: - ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.3.bb3: - ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.4.bb4: - ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.5.bb5: - ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression() define i32 @_Z8bb_to_bb() local_unnamed_addr !dbg !12 { entry: diff --git a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_two_backedge.mir b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_two_backedge.mir index 0989ee335b083..83f7235558947 100644 --- a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_two_backedge.mir +++ b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_two_backedge.mir @@ -5,15 +5,15 @@ ; backedges and beyond. ; CHECK-LABEL: bb.0.entry: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.1.bb1: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.2.bb2: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.3.bb3: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() ; CHECK-LABEL: bb.4.bb4: - ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17 + ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression() define i32 @_Z8bb_to_bb() local_unnamed_addr !dbg !12 { entry: diff --git a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_within_loop.mir b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_within_loop.mir index f15275ed60a90..7ff781a07fce6 100644 --- a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_within_loop.mir +++ b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_within_loop.mir @@ -4,17 +4,17 @@ ; Check that DBG_VALUE instructions are propagated into loops within loops. 
 ; CHECK-LABEL: bb.0.entry:
- ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17
+ ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression()
 ; CHECK-LABEL: bb.1.bb1:
- ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17
+ ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression()
 ; CHECK-LABEL: bb.2.bb2:
- ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17
+ ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression()
 ; CHECK-LABEL: bb.3.bb3:
- ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17
+ ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression()
 ; CHECK-LABEL: bb.4.bb4:
- ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17
+ ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression()
 ; CHECK-LABEL: bb.5.bb5:
- ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17
+ ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression()

 define i32 @_Z8bb_to_bb() local_unnamed_addr !dbg !12 {
 entry:
diff --git a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_within_loop_moved.mir b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_within_loop_moved.mir
index da624928c3aa8..fca7f83a14be4 100644
--- a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_within_loop_moved.mir
+++ b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_within_loop_moved.mir
@@ -5,9 +5,9 @@
 ; loops that move their locations.

 ; CHECK-LABEL: bb.0.entry:
- ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17
+ ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression()
 ; CHECK-LABEL: bb.3.bb3:
- ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression(), debug-location !17
+ ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression()

 define i32 @_Z8bb_to_bb() local_unnamed_addr !dbg !12 {
 entry:
diff --git a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_within_loop_outer_moved.mir b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_within_loop_outer_moved.mir
index 12f22df63b141..baade395c6ede 100644
--- a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_within_loop_outer_moved.mir
+++ b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_loop_within_loop_outer_moved.mir
@@ -5,11 +5,11 @@
 ; loops that move their locations.

 ; CHECK-LABEL: bb.0.entry:
- ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression(), debug-location !17
+ ; CHECK: DBG_VALUE $ebx, $noreg, !16, !DIExpression()
 ; CHECK-LABEL: bb.4.bb4:
- ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression(), debug-location !17
+ ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression()
 ; CHECK-LABEL: bb.5.bb5:
- ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression(), debug-location !17
+ ; CHECK: DBG_VALUE $eax, $noreg, !16, !DIExpression()

 define i32 @_Z8bb_to_bb() local_unnamed_addr !dbg !12 {
 entry:
diff --git a/llvm/test/DebugInfo/MIR/X86/livedebugvalues_many_loop_heads.mir b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_many_loop_heads.mir
new file mode 100644
index 0000000000000..f5332c29c837f
--- /dev/null
+++ b/llvm/test/DebugInfo/MIR/X86/livedebugvalues_many_loop_heads.mir
@@ -0,0 +1,196 @@
+--- |
+ ; RUN: llc %s -march=x86-64 -run-pass=livedebugvalues -o - -experimental-debug-variable-locations | FileCheck %s -implicit-check-not=DBG_VALUE
+
+ ; The MIR below represents a pathological case for value-tracking
+ ; LiveDebugValues. The code structure is eight nested loops, with loop heads
+ ; from bb.1 to bb.8, a central block bb.9 that does nothing, and loop ends
+ ; from bb.10 to bb.17. The CMPs and jumps might be broken; the only
+ ; important part is that it looks like nested loops to LiveDebugValues.
+ ;
+ ; The variable location is always $rsi, which enters the function live.
+ ; There's also a def of $rsi in bb.14, in a loop tail, halfway into the
+ ; loop nest.
+ ;
+ ; This presents a serious problem: the outer four loops each implicitly have
+ ; a PHI value for $rsi, because the block could be entered on a path straight
+ ; from entry, or from bb.14 where $rsi is def'd, while the innermost four
+ ; loops have a value of $rsi that is live-through each loop from bb.5
+ ; onwards.
+ ;
+ ; Value-tracking LiveDebugValues _must_ correctly identify each PHI value.
+ ; Observe the DBG_VALUE in bb.2: this variable location mustn't be propagated
+ ; any further, because there's a path to either successor that goes through
+ ; bb.14 where the value is overwritten. Value tracking needs to identify the
+ ; PHI value on entry to the block, and that each successor has a different
+ ; PHI value in that register.
+ ;
+ ; Likewise, we mustn't identify values as PHIs which aren't. Entering bb.5
+ ; has a PHI value (from bb.4) in $rsi. There are no paths to bb.5 that pass
+ ; through the clobbering bb.14 which don't also pass through bb.4: thus
+ ; that value is live-through the innermost four loops. If we
+ ; over-approximated where PHIs happened, we would lose variable location
+ ; coverage here, by not propagating the variable location through the inner
+ ; loops.
+ ;
+ ; Getting this right requires the lattice descent (described in the
+ ; implementation) to search loop head PHI values, until one is found that is
+ ; live-through a loop.

+ ; This location in bb.2 should not be propagated further.
+ ; CHECK-LABEL: bb.2:
+ ; CHECK: DBG_VALUE $rsi, $noreg

+ ; This location should be live through the inner loops, until bb.14.
+ ; CHECK-LABEL: bb.5:
+ ; CHECK: DBG_VALUE $rsi, $noreg
+ ; CHECK-LABEL: bb.6:
+ ; CHECK: DBG_VALUE $rsi, $noreg
+ ; CHECK-LABEL: bb.7:
+ ; CHECK: DBG_VALUE $rsi, $noreg
+ ; CHECK-LABEL: bb.8:
+ ; CHECK: DBG_VALUE $rsi, $noreg
+ ; CHECK-LABEL: bb.9:
+ ; CHECK: DBG_VALUE $rsi, $noreg
+ ; CHECK-LABEL: bb.10:
+ ; CHECK: DBG_VALUE $rsi, $noreg
+ ; CHECK-LABEL: bb.11:
+ ; CHECK: DBG_VALUE $rsi, $noreg
+ ; CHECK-LABEL: bb.12:
+ ; CHECK: DBG_VALUE $rsi, $noreg
+ ; CHECK-LABEL: bb.13:
+ ; CHECK: DBG_VALUE $rsi, $noreg

+ declare i64 @bees(i64 %arg);

+ define i32 @chiasm(i64 %arg) local_unnamed_addr !dbg !12 {
+ entry:
+ br label %bb1, !dbg !17
+ bb1:
+ br label %bb2, !dbg !17
+ bb2:
+ br label %bb3, !dbg !17
+ bb3:
+ ret i32 0, !dbg !17
+ }

+ !llvm.dbg.cu = !{!0}
+ !llvm.module.flags = !{!7, !8, !9, !10}
+ !llvm.ident = !{!11}
+ !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !3, debugInfoForProfiling: true, nameTableKind: None)
+ !1 = !DIFile(filename: "main.cpp", directory: "F:\")
+ !2 = !{}
+ !3 = !{!4}
+ !4 = !DIGlobalVariableExpression(var: !5, expr: !DIExpression())
+ !5 = distinct !DIGlobalVariable(name: "start", scope: !0, file: !1, line: 4, type: !6, isLocal: false, isDefinition: true)
+ !6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+ !7 = !{i32 2, !"Dwarf Version", i32 4}
+ !8 = !{i32 2, !"Debug Info Version", i32 3}
+ !9 = !{i32 1, !"wchar_size", i32 2}
+ !10 = !{i32 7, !"PIC Level", i32 2}
+ !11 = !{!"clang version 10.0.0"}
+ !12 = distinct !DISubprogram(name: "bb_to_bb", linkageName: "bb_to_bb", scope: !1, file: !1, line: 6, type: !13, scopeLine: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !15)
+ !13 = !DISubroutineType(types: !14)
+ !14 = !{!6, !6}
+ !15 = !{!16}
+ !16 = !DILocalVariable(name: "myVar", scope: !12, file: !1, line: 7, type: !6)
+ !17 = !DILocation(line: 10, scope: !12)
+
+...
+---
+name: chiasm
+tracksRegLiveness: true
+liveins:
+ - { reg: '$rdi', virtual-reg: '' }
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+body: |
+ bb.0.entry:
+ liveins: $rdi, $rsi
+
+ bb.1:
+ liveins: $rsi, $rdi
+ CMP64ri8 renamable $rdi, 1, implicit-def $eflags, debug-location !17
+ JCC_1 %bb.17, 4, implicit $eflags, debug-location !17
+
+ bb.2:
+ liveins: $rsi, $rdi
+ DBG_VALUE $rsi, $noreg, !16, !DIExpression(), debug-location !17
+ CMP64ri8 renamable $rdi, 2, implicit-def $eflags, debug-location !17
+ JCC_1 %bb.16, 4, implicit $eflags, debug-location !17
+
+ bb.3:
+ liveins: $rsi, $rdi
+ CMP64ri8 renamable $rdi, 3, implicit-def $eflags, debug-location !17
+ JCC_1 %bb.15, 4, implicit $eflags, debug-location !17
+
+ bb.4:
+ liveins: $rsi, $rdi
+ CMP64ri8 renamable $rdi, 4, implicit-def $eflags, debug-location !17
+ JCC_1 %bb.14, 4, implicit $eflags, debug-location !17
+
+ bb.5:
+ liveins: $rsi, $rdi
+ DBG_VALUE $rsi, $noreg, !16, !DIExpression(), debug-location !17
+ CMP64ri8 renamable $rdi, 4, implicit-def $eflags, debug-location !17
+ JCC_1 %bb.13, 4, implicit $eflags, debug-location !17
+
+ bb.6:
+ liveins: $rsi, $rdi
+ CMP64ri8 renamable $rdi, 4, implicit-def $eflags, debug-location !17
+ JCC_1 %bb.12, 4, implicit $eflags, debug-location !17
+
+ bb.7:
+ liveins: $rsi, $rdi
+ CMP64ri8 renamable $rdi, 4, implicit-def $eflags, debug-location !17
+ JCC_1 %bb.11, 4, implicit $eflags, debug-location !17
+
+ bb.8:
+ liveins: $rsi, $rdi
+ CMP64ri8 renamable $rdi, 4, implicit-def $eflags, debug-location !17
+ JCC_1 %bb.10, 4, implicit $eflags, debug-location !17
+
+ bb.9:
+ liveins: $rsi, $rdi, $eflags
+ ;$rsi = MOV64ri 0, debug-location !17
+ ;JMP_1 %bb.1, debug-location !17
+
+ bb.10:
+ liveins: $rsi, $rdi, $eflags
+ JCC_1 %bb.8, 4, implicit $eflags, debug-location !17
+
+ bb.11:
+ liveins: $rsi, $rdi, $eflags
+ JCC_1 %bb.7, 4, implicit $eflags, debug-location !17
+
+ bb.12:
+ liveins: $rsi, $rdi, $eflags
+ JCC_1 %bb.6, 4, implicit $eflags, debug-location !17
+
+ bb.13:
+ liveins: $rsi, $rdi, $eflags
+ JCC_1 %bb.5, 4, implicit $eflags, debug-location !17
+
+ bb.14:
+ liveins: $rsi, $rdi, $eflags
+ $rsi = MOV64ri 0, debug-location !17
+ JCC_1 %bb.4, 4, implicit $eflags, debug-location !17
+
+ bb.15:
+ liveins: $rsi, $rdi, $eflags
+ JCC_1 %bb.3, 4, implicit $eflags, debug-location !17
+
+ bb.16:
+ liveins: $rsi, $rdi, $eflags
+ JCC_1 %bb.2, 4, implicit $eflags, debug-location !17
+
+ bb.17:
+ liveins: $rsi, $rdi, $eflags
+ JCC_1 %bb.1, 4, implicit $eflags, debug-location !17
+
+ bb.18:
+ liveins: $rsi, $rdi, $eflags
+ RETQ
+
+...
diff --git a/llvm/test/DebugInfo/WebAssembly/fission-cu.ll b/llvm/test/DebugInfo/WebAssembly/fission-cu.ll new file mode 100644 index 0000000000000..8a04d48d4de73 --- /dev/null +++ b/llvm/test/DebugInfo/WebAssembly/fission-cu.ll @@ -0,0 +1,121 @@ +; RUN: llc -split-dwarf-file=baz.dwo -O0 %s -mtriple=wasm32-unknown-unknown -filetype=obj -o %t +; RUN: llvm-dwarfdump -v -all %t | FileCheck %s +; RUN: llvm-readobj --relocations %t | FileCheck --check-prefix=OBJ %s +; RUN: llvm-objdump -h %t | FileCheck --check-prefix=HDR %s + +; This test is derived from test/DebugInfo/X86/fission-cu.ll + +source_filename = "test/DebugInfo/WebAssembly/fission-cu.ll" + +@a = global i32 0, align 4, !dbg !0 + +!llvm.dbg.cu = !{!4} +!llvm.module.flags = !{!7} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = !DIGlobalVariable(name: "a", scope: null, file: !2, line: 1, type: !3, isLocal: false, isDefinition: true) +!2 = !DIFile(filename: "baz.c", directory: "/usr/local/google/home/echristo/tmp") +!3 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!4 = distinct !DICompileUnit(language: DW_LANG_C99, file: !2, producer: "clang version 3.3 (trunk 169021) (llvm/trunk 169020)", isOptimized: false, runtimeVersion: 0, splitDebugFilename: "baz.dwo", emissionKind: FullDebug, enums: !5, retainedTypes: !5, globals: !6, imports: !5) +!5 = !{} +; Check that the skeleton compile unit contains the proper attributes: +; This DIE has the following attributes: DW_AT_comp_dir, DW_AT_stmt_list, +; DW_AT_low_pc, DW_AT_high_pc, DW_AT_ranges, DW_AT_dwo_name, DW_AT_dwo_id, +; DW_AT_ranges_base, DW_AT_addr_base. + +; CHECK: .debug_abbrev contents: +; CHECK: Abbrev table for offset: 0x00000000 +; CHECK: [1] DW_TAG_compile_unit DW_CHILDREN_no +; CHECK: DW_AT_stmt_list DW_FORM_sec_offset +; CHECK: DW_AT_comp_dir DW_FORM_strp +; CHECK: DW_AT_GNU_dwo_name DW_FORM_strp +; CHECK: DW_AT_GNU_dwo_id DW_FORM_data8 + +; Check that we're using the right forms. +; CHECK: .debug_abbrev.dwo contents: +; CHECK: Abbrev table for offset: 0x00000000 +; CHECK: [1] DW_TAG_compile_unit DW_CHILDREN_yes +; CHECK: DW_AT_producer DW_FORM_GNU_str_index +; CHECK: DW_AT_language DW_FORM_data2 +; CHECK: DW_AT_name DW_FORM_GNU_str_index +; CHECK: DW_AT_GNU_dwo_name DW_FORM_GNU_str_index +; CHECK-NOT: DW_AT_low_pc +; CHECK-NOT: DW_AT_stmt_list +; CHECK-NOT: DW_AT_comp_dir +; CHECK: DW_AT_GNU_dwo_id DW_FORM_data8 + +; CHECK: [2] DW_TAG_variable DW_CHILDREN_no +; CHECK: DW_AT_name DW_FORM_GNU_str_index +; CHECK: DW_AT_type DW_FORM_ref4 +; CHECK: DW_AT_external DW_FORM_flag_present +; CHECK: DW_AT_decl_file DW_FORM_data1 +; CHECK: DW_AT_decl_line DW_FORM_data1 +; CHECK: DW_AT_location DW_FORM_exprloc + +; CHECK: [3] DW_TAG_base_type DW_CHILDREN_no +; CHECK: DW_AT_name DW_FORM_GNU_str_index +; CHECK: DW_AT_encoding DW_FORM_data1 +; CHECK: DW_AT_byte_size DW_FORM_data1 + +; CHECK: .debug_info contents: +; CHECK: DW_TAG_compile_unit +; CHECK-NEXT: DW_AT_stmt_list [DW_FORM_sec_offset] (0x00000000) +; CHECK-NEXT: DW_AT_comp_dir [DW_FORM_strp] ( .debug_str[0x00000000] = "/usr/local/google/home/echristo/tmp") +; CHECK-NEXT: DW_AT_GNU_dwo_name [DW_FORM_strp] ( .debug_str[0x00000024] = "baz.dwo") +; CHECK-NEXT: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x1f1f859683d49324) + +; Check that the rest of the compile units have information. 
+; CHECK: .debug_info.dwo contents: +; CHECK: DW_TAG_compile_unit +; CHECK: DW_AT_producer [DW_FORM_GNU_str_index] (indexed (00000002) string = "clang version 3.3 (trunk 169021) (llvm/trunk 169020)") +; CHECK: DW_AT_language [DW_FORM_data2] (DW_LANG_C99) +; CHECK: DW_AT_name [DW_FORM_GNU_str_index] (indexed (00000003) string = "baz.c") +; CHECK: DW_AT_GNU_dwo_name [DW_FORM_GNU_str_index] (indexed (00000004) string = "baz.dwo") +; CHECK-NOT: DW_AT_low_pc +; CHECK-NOT: DW_AT_stmt_list +; CHECK-NOT: DW_AT_comp_dir +; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x1f1f859683d49324) +; CHECK: DW_TAG_variable +; CHECK: DW_AT_name [DW_FORM_GNU_str_index] (indexed (00000000) string = "a") +; CHECK: DW_AT_type [DW_FORM_ref4] (cu + 0x{{[0-9a-f]*}} => {[[TYPE:0x[0-9a-f]*]]} +; CHECK: DW_AT_external [DW_FORM_flag_present] (true) +; CHECK: DW_AT_decl_file [DW_FORM_data1] (0x01) +; CHECK: DW_AT_decl_line [DW_FORM_data1] (1) +; CHECK: DW_AT_location [DW_FORM_exprloc] (DW_OP_GNU_addr_index 0x0) +; CHECK: [[TYPE]]: DW_TAG_base_type +; CHECK: DW_AT_name [DW_FORM_GNU_str_index] (indexed (00000001) string = "int") + +; CHECK: .debug_str contents: +; CHECK: 0x00000000: "/usr/local/google/home/echristo/tmp" +; CHECK: 0x00000024: "baz.dwo" + +; CHECK: .debug_str.dwo contents: +; CHECK: 0x00000000: "a" +; CHECK: 0x00000002: "int" +; CHECK: 0x00000006: "clang version 3.3 (trunk 169021) (llvm/trunk 169020)" +; CHECK: 0x0000003b: "baz.c" +; CHECK: 0x00000041: "baz.dwo" + +; CHECK: .debug_str_offsets.dwo contents: +; CHECK: 0x00000000: 00000000 +; CHECK: 0x00000004: 00000002 +; CHECK: 0x00000008: 00000006 +; CHECK: 0x0000000c: 0000003b +; CHECK: 0x00000010: 00000041 + +; Object file checks +; For wasm we should have this set of relocations for the debug info section +; +; OBJ: .debug_info +; OBJ-NEXT: R_WASM_SECTION_OFFSET_I32 .debug_abbrev 0 +; OBJ-NEXT: R_WASM_SECTION_OFFSET_I32 .debug_line 0 +; OBJ-NEXT: R_WASM_SECTION_OFFSET_I32 .debug_str 0 +; OBJ-NEXT: R_WASM_SECTION_OFFSET_I32 .debug_str 36 +; OBJ-NEXT: R_WASM_SECTION_OFFSET_I32 .debug_addr 0 +; OBJ-NEXT: } + +; HDR-NOT: .debug_aranges +; HDR-NOT: .rela.{{.*}}.dwo + +!6 = !{!0} +!7 = !{i32 1, !"Debug Info Version", i32 3} diff --git a/llvm/test/DebugInfo/WebAssembly/fission-sections.ll b/llvm/test/DebugInfo/WebAssembly/fission-sections.ll new file mode 100644 index 0000000000000..d7109127109a4 --- /dev/null +++ b/llvm/test/DebugInfo/WebAssembly/fission-sections.ll @@ -0,0 +1,48 @@ +; RUN: llc -split-dwarf-file=baz.dwo -split-dwarf-output=%t.dwo -O0 %s -mtriple=wasm32-unknown-unknown -filetype=obj -o %t +; RUN: llvm-objdump -h %t | FileCheck --check-prefix=OBJ %s +; RUN: llvm-objdump -h %t.dwo | FileCheck --check-prefix=DWO %s + + +; This test is derived from test/DebugInfo/X86/fission-cu.ll +; But it checks that the output objects have the expected sections + +source_filename = "test/DebugInfo/WebAssembly/fission-cu.ll" + +@a = global i32 0, align 4, !dbg !0 + +!llvm.dbg.cu = !{!4} +!llvm.module.flags = !{!7} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = !DIGlobalVariable(name: "a", scope: null, file: !2, line: 1, type: !3, isLocal: false, isDefinition: true) +!2 = !DIFile(filename: "baz.c", directory: "/usr/local/google/home/echristo/tmp") +!3 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!4 = distinct !DICompileUnit(language: DW_LANG_C99, file: !2, producer: "clang version 3.3 (trunk 169021) (llvm/trunk 169020)", isOptimized: false, runtimeVersion: 0, splitDebugFilename: "baz.dwo", emissionKind: FullDebug, 
enums: !5, retainedTypes: !5, globals: !6, imports: !5) +!5 = !{} +!6 = !{!0} +!7 = !{i32 1, !"Debug Info Version", i32 3} + +; CHECK-LABEL: Sections: + +; OBJ: Idx Name +; OBJ-NEXT: 0 IMPORT +; OBJ-NEXT: DATACOUNT +; OBJ-NEXT: DATA +; OBJ-NEXT: .debug_abbrev +; OBJ-NEXT: .debug_info +; OBJ-NEXT: .debug_str +; OBJ-NEXT: .debug_addr +; OBJ-NEXT: .debug_pubnames +; OBJ-NEXT: .debug_pubtypes +; OBJ-NEXT: .debug_line +; OBJ-NEXT: linking + + +; DWO: Idx Name +; DWO-NOT: IMPORT +; DWO-NOT: DATA +; DWO: 0 .debug_str.dwo +; DWO-NEXT: .debug_str_offsets.dwo +; DWO-NEXT: .debug_info.dwo +; DWO-NEXT: .debug_abbrev.dwo +; DWO-NEXT: producers diff --git a/llvm/test/DebugInfo/X86/DW_AT_location-reference.ll b/llvm/test/DebugInfo/X86/DW_AT_location-reference.ll index d516a4c5d0813..3fe6330d9ae9e 100644 --- a/llvm/test/DebugInfo/X86/DW_AT_location-reference.ll +++ b/llvm/test/DebugInfo/X86/DW_AT_location-reference.ll @@ -1,8 +1,17 @@ ; RUN: llc -O1 -filetype=obj -mtriple=x86_64-apple-darwin < %s > %t -; RUN: llvm-dwarfdump -v %t | FileCheck %s +; RUN: llvm-dwarfdump -v %t | FileCheck %s --check-prefixes=CHECK,DWARFv4 ; RUN: llvm-objdump -r %t | FileCheck --check-prefix=DARWIN %s + ; RUN: llc -O1 -filetype=obj -mtriple=x86_64-pc-linux-gnu < %s > %t -; RUN: llvm-dwarfdump -v %t | FileCheck %s +; RUN: llvm-dwarfdump -v %t | FileCheck %s --check-prefixes=CHECK,DWARFv4 +; RUN: llvm-objdump -r %t | FileCheck --check-prefix=LINUX %s + +; RUN: llc -dwarf-version=3 -O1 -filetype=obj -mtriple=x86_64-pc-linux-gnu < %s > %t +; RUN: llvm-dwarfdump -debug-info -v %t | FileCheck %s --check-prefixes=CHECK,DWARF32v3 +; RUN: llvm-objdump -r %t | FileCheck --check-prefix=LINUX %s + +; RUN: llc -dwarf64 -dwarf-version=3 -O1 -filetype=obj -mtriple=x86_64-pc-linux-gnu < %s > %t +; RUN: llvm-dwarfdump -debug-info -v %t | FileCheck %s --check-prefixes=CHECK,DWARF64v3 ; RUN: llvm-objdump -r %t | FileCheck --check-prefix=LINUX %s ; PR9493 @@ -31,7 +40,9 @@ ; // The 'x' variable and its symbol reference location ; CHECK: .debug_info contents: ; CHECK: DW_TAG_variable -; CHECK-NEXT: DW_AT_location [DW_FORM_sec_offset] (0x00000000 +; DWARF32v3-NEXT: DW_AT_location [DW_FORM_data4] (0x00000000 +; DWARF64v3-NEXT: DW_AT_location [DW_FORM_data8] (0x00000000 +; DWARFv4-NEXT: DW_AT_location [DW_FORM_sec_offset] (0x00000000 ; Check that the location contains only 4 ranges. 
; CHECK-NEXT: [0x{{[0-9a-f]*}}, 0x{{[0-9a-f]*}}) ; CHECK-NEXT: [0x{{[0-9a-f]*}}, 0x{{[0-9a-f]*}}) diff --git a/llvm/test/DebugInfo/X86/assumed_size_array.ll b/llvm/test/DebugInfo/X86/assumed_size_array.ll new file mode 100644 index 0000000000000..cad7afdd68b59 --- /dev/null +++ b/llvm/test/DebugInfo/X86/assumed_size_array.ll @@ -0,0 +1,122 @@ +;; Check whether fortran assumed size array is accepted +;; which has upperBound absent in DISubrange + +; RUN: llc -mtriple=x86_64-unknown-linux-gnu %s -filetype=obj -o %t.o +; RUN: llvm-dwarfdump %t.o | FileCheck %s + +; CHECK-LABEL: DW_TAG_formal_parameter +; CHECK: DW_AT_name ("array1") +; CHECK: DW_AT_type ([[type1:0x[0-9a-f]+]] +; CHECK-LABEL: DW_TAG_formal_parameter +; CHECK: DW_AT_name ("array2") +; CHECK: DW_AT_type ([[type2:0x[0-9a-f]+]] +; CHECK: [[type1]]: DW_TAG_array_type +; CHECK: DW_TAG_subrange_type +; CHECK: [[type2]]: DW_TAG_array_type +; CHECK: DW_TAG_subrange_type +; CHECK: DW_AT_lower_bound (4) +; CHECK: DW_AT_upper_bound (9) +; CHECK: DW_TAG_subrange_type +; CHECK: DW_AT_lower_bound (10) +; +; +;; original fortran program +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;subroutine sub (array1, array2) +;; integer :: array1 (*) +;; integer :: array2 (4:9, 10:*) +;; +;; array1(7:8) = 9 +;; array2(5, 10) = 10 +;;end subroutine +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; ModuleID = 'assumed_size_array.ll' +source_filename = "assumed_size_array.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@.C344_sub_ = internal constant i32 10 +@.C345_sub_ = internal constant i64 10 +@.C351_sub_ = internal constant i64 5 +@.C341_sub_ = internal constant i32 9 +@.C322_sub_ = internal constant i64 1 +@.C350_sub_ = internal constant i64 8 +@.C349_sub_ = internal constant i64 7 + +define void @sub_(i64* noalias %array1, i64* noalias %array2) #0 !dbg !5 { +L.entry: + %.dY0001_361 = alloca i64, align 8 + %"i$a_357" = alloca i64, align 8 + call void @llvm.dbg.declare(metadata i64* %array1, metadata !16, metadata !DIExpression()), !dbg !17 + call void @llvm.dbg.declare(metadata i64* %array2, metadata !18, metadata !DIExpression()), !dbg !17 + br label %L.LB1_364 + +L.LB1_364: ; preds = %L.entry + store i64 2, i64* %.dY0001_361, align 8, !dbg !19 + call void @llvm.dbg.declare(metadata i64* %"i$a_357", metadata !20, metadata !DIExpression()), !dbg !17 + store i64 7, i64* %"i$a_357", align 8, !dbg !19 + br label %L.LB1_359 + +L.LB1_359: ; preds = %L.LB1_359, %L.LB1_364 + %0 = load i64, i64* %"i$a_357", align 8, !dbg !19 + call void @llvm.dbg.value(metadata i64 %0, metadata !22, metadata !DIExpression()), !dbg !17 + %1 = bitcast i64* %array1 to i8*, !dbg !19 + %2 = getelementptr i8, i8* %1, i64 -4, !dbg !19 + %3 = bitcast i8* %2 to i32*, !dbg !19 + %4 = getelementptr i32, i32* %3, i64 %0, !dbg !19 + store i32 9, i32* %4, align 4, !dbg !19 + %5 = load i64, i64* %"i$a_357", align 8, !dbg !19 + call void @llvm.dbg.value(metadata i64 %5, metadata !23, metadata !DIExpression()), !dbg !17 + %6 = add nsw i64 %5, 1, !dbg !19 + store i64 %6, i64* %"i$a_357", align 8, !dbg !19 + %7 = load i64, i64* %.dY0001_361, align 8, !dbg !19 + %8 = sub nsw i64 %7, 1, !dbg !19 + store i64 %8, i64* %.dY0001_361, align 8, !dbg !19 + %9 = load i64, i64* %.dY0001_361, align 8, !dbg !19 + %10 = icmp sgt i64 %9, 0, !dbg !19 + br i1 %10, label %L.LB1_359, label %L.LB1_383, !dbg !19 + +L.LB1_383: ; preds = %L.LB1_359 + %11 = bitcast i64* %array2 to i8*, !dbg !24 + %12 = getelementptr i8, i8* %11, i64 
4, !dbg !24 + %13 = bitcast i8* %12 to i32*, !dbg !24 + store i32 10, i32* %13, align 4, !dbg !24 + ret void, !dbg !25 +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare void @llvm.dbg.declare(metadata, metadata, metadata) + +; Function Attrs: nounwind readnone speculatable willreturn +declare void @llvm.dbg.value(metadata, metadata, metadata) + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} + +!0 = !{i32 2, !"Dwarf Version", i32 4} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = distinct !DICompileUnit(language: DW_LANG_Fortran90, file: !3, producer: " F90 Flang - 1.5 2017-05-01", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4, globals: !4, imports: !4) +!3 = !DIFile(filename: "assumed_size_array.f90", directory: "/tmp") +!4 = !{} +!5 = distinct !DISubprogram(name: "sub", scope: !2, file: !3, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !2) +!6 = !DISubroutineType(types: !7) +!7 = !{null, !8, !12} +!8 = !DICompositeType(tag: DW_TAG_array_type, baseType: !9, align: 32, elements: !10) +!9 = !DIBasicType(name: "integer", size: 32, align: 32, encoding: DW_ATE_signed) +!10 = !{!11} +!11 = !DISubrange(lowerBound: 1) +!12 = !DICompositeType(tag: DW_TAG_array_type, baseType: !9, align: 32, elements: !13) +!13 = !{!14, !15} +!14 = !DISubrange(lowerBound: 4, upperBound: 9) +!15 = !DISubrange(lowerBound: 10) +!16 = !DILocalVariable(name: "array1", arg: 1, scope: !5, file: !3, line: 1, type: !8) +!17 = !DILocation(line: 0, scope: !5) +!18 = !DILocalVariable(name: "array2", arg: 2, scope: !5, file: !3, line: 1, type: !12) +!19 = !DILocation(line: 5, column: 1, scope: !5) +!20 = distinct !DILocalVariable(scope: !5, file: !3, type: !21, flags: DIFlagArtificial) +!21 = !DIBasicType(name: "integer*8", size: 64, align: 64, encoding: DW_ATE_signed) +!22 = distinct !DILocalVariable(scope: !5, file: !3, type: !21, flags: DIFlagArtificial) +!23 = distinct !DILocalVariable(scope: !5, file: !3, type: !21, flags: DIFlagArtificial) +!24 = !DILocation(line: 6, column: 1, scope: !5) +!25 = !DILocation(line: 7, column: 1, scope: !5) diff --git a/llvm/test/DebugInfo/X86/debug-addr-dwarf64.ll b/llvm/test/DebugInfo/X86/debug-addr-dwarf64.ll new file mode 100644 index 0000000000000..5c64d48568a3b --- /dev/null +++ b/llvm/test/DebugInfo/X86/debug-addr-dwarf64.ll @@ -0,0 +1,44 @@ +; This checks that .debug_addr can be generated in the DWARF64 format. 
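+
+; (On the offsets below: the DWARF64 .debug_addr header is 16 bytes, a 12-byte
+; unit_length plus a 2-byte version and one byte each of address size and
+; segment selector size, so DW_AT_addr_base points at offset 0x10; the length
+; 0x14 covers the version, the two size bytes, and the two 8-byte address
+; entries.)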
+ +; RUN: llc -mtriple=x86_64 -dwarf-version=5 -dwarf64 -filetype=obj %s -o %t +; RUN: llvm-dwarfdump -debug-info -debug-addr %t | FileCheck %s + +; CHECK: .debug_info contents: +; CHECK: DW_TAG_compile_unit +; CHECK: DW_AT_addr_base (0x0000000000000010) + +; CHECK: .debug_addr contents: +; CHECK-NEXT: Address table header: length = 0x0000000000000014, format = DWARF64, version = 0x0005, addr_size = 0x08, seg_size = 0x00 +; CHECK-NEXT: Addrs: [ +; CHECK-NEXT: 0x0000000000000000 +; CHECK-NEXT: 0x0000000000000004 +; CHECK-NEXT: ] + +; IR generated and reduced from: +; $ cat foo.c +; int foo; +; int bar; +; $ clang -g -S -emit-llvm foo.c -o foo.ll + +target triple = "x86_64-unknown-linux-gnu" + +@foo = dso_local global i32 0, align 4, !dbg !0 +@bar = dso_local global i32 0, align 4, !dbg !6 + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!9, !10, !11} +!llvm.ident = !{!12} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "foo", scope: !2, file: !3, line: 1, type: !8, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "foo.c", directory: "/tmp") +!4 = !{} +!5 = !{!0, !6} +!6 = !DIGlobalVariableExpression(var: !7, expr: !DIExpression()) +!7 = distinct !DIGlobalVariable(name: "bar", scope: !2, file: !3, line: 2, type: !8, isLocal: false, isDefinition: true) +!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!9 = !{i32 7, !"Dwarf Version", i32 4} +!10 = !{i32 2, !"Debug Info Version", i32 3} +!11 = !{i32 1, !"wchar_size", i32 4} +!12 = !{!"clang version 12.0.0"} diff --git a/llvm/test/DebugInfo/X86/debug-aranges-dwarf64.ll b/llvm/test/DebugInfo/X86/debug-aranges-dwarf64.ll new file mode 100644 index 0000000000000..7e037ac125009 --- /dev/null +++ b/llvm/test/DebugInfo/X86/debug-aranges-dwarf64.ll @@ -0,0 +1,39 @@ +; This checks that .debug_aranges can be generated in the DWARF64 format. 
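+
+; (On the length below: after the 12-byte DWARF64 unit_length come a 2-byte
+; version, an 8-byte cu_offset, and one byte each of addr_size and seg_size;
+; the header is then padded so the 16-byte address/length tuples are naturally
+; aligned, and one tuple plus the all-zero terminator tuple yields
+; length = 0x34.)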
+ +; RUN: llc -mtriple=x86_64 -dwarf64 -generate-arange-section -filetype=obj %s -o %t +; RUN: llvm-dwarfdump -debug-aranges %t | FileCheck %s + +; CHECK: .debug_aranges contents: +; CHECK-NEXT: Address Range Header: +; CHECK-SAME: length = 0x0000000000000034, +; CHECK-SAME: format = DWARF64, +; CHECK-SAME: version = 0x0002, +; CHECK-SAME: cu_offset = 0x0000000000000000, +; CHECK-SAME: addr_size = 0x08, +; CHECK-SAME: seg_size = 0x00 +; CHECK-NEXT: [0x0000000000000000, 0x0000000000000004) + +; IR generated and reduced from: +; $ cat foo.c +; int foo; +; $ clang -g -S -emit-llvm foo.c -o foo.ll + +target triple = "x86_64-unknown-linux-gnu" + +@foo = dso_local global i32 0, align 4, !dbg !0 + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!7, !8, !9} +!llvm.ident = !{!10} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "foo", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "foo.c", directory: "/tmp") +!4 = !{} +!5 = !{!0} +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!7 = !{i32 7, !"Dwarf Version", i32 4} +!8 = !{i32 2, !"Debug Info Version", i32 3} +!9 = !{i32 1, !"wchar_size", i32 4} +!10 = !{!"clang version 12.0.0"} diff --git a/llvm/test/DebugInfo/X86/debug-frame-dwarf64.ll b/llvm/test/DebugInfo/X86/debug-frame-dwarf64.ll new file mode 100644 index 0000000000000..8efb739a0d621 --- /dev/null +++ b/llvm/test/DebugInfo/X86/debug-frame-dwarf64.ll @@ -0,0 +1,37 @@ +; This checks that .debug_frame can be generated in the DWARF64 format. + +; RUN: llc -mtriple=x86_64 -dwarf64 -force-dwarf-frame-section -filetype=obj %s -o %t +; RUN: llvm-dwarfdump -debug-frame %t | FileCheck %s + +; CHECK: .debug_frame contents: +; CHECK: 00000000 {{.+}} ffffffffffffffff CIE +; CHECK-NEXT: Format: DWARF64 +; CHECK: {{.+}} 0000000000000000 FDE cie=00000000 pc= +; CHECK-NEXT: Format: DWARF64 + +; IR generated and reduced from: +; $ cat foo.c +; void foo() { } +; $ clang -g -S -emit-llvm foo.c -o foo.ll + +target triple = "x86_64-unknown-linux-gnu" + +define dso_local void @foo() #0 !dbg !7 { + ret void, !dbg !10 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "foo.c", directory: "/tmp") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 12.0.0"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!8 = !DISubroutineType(types: !9) +!9 = !{null} +!10 = !DILocation(line: 1, column: 14, scope: !7) diff --git a/llvm/test/DebugInfo/X86/debug-info-dwarf64.ll b/llvm/test/DebugInfo/X86/debug-info-dwarf64.ll new file mode 100644 index 0000000000000..7f988b43a9fd4 --- /dev/null +++ b/llvm/test/DebugInfo/X86/debug-info-dwarf64.ll @@ -0,0 +1,63 @@ +; This checks that .debug_info can be generated in the DWARF64 format. 
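+
+; (Why two prefixes: DW_FORM_sec_offset only exists from DWARFv4 onwards, so a
+; DWARFv3 producer has to encode section offsets such as DW_AT_stmt_list as
+; plain data, DW_FORM_data8 in the 64-bit format, while DWARFv4 uses
+; DW_FORM_sec_offset, whose size follows the unit's format.)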
+ +; RUN: llc -mtriple=x86_64 -dwarf-version=3 -dwarf64 -filetype=obj %s -o %t3 +; RUN: llvm-dwarfdump -debug-abbrev -debug-info -v %t3 | \ +; RUN: FileCheck %s --check-prefixes=CHECK,DWARFv3 + +; RUN: llc -mtriple=x86_64 -dwarf-version=4 -dwarf64 -filetype=obj %s -o %t4 +; RUN: llvm-dwarfdump -debug-abbrev -debug-info -v %t4 | \ +; RUN: FileCheck %s --check-prefixes=CHECK,DWARFv4 + +; CHECK: .debug_abbrev contents: +; CHECK: [1] DW_TAG_compile_unit DW_CHILDREN_yes +; CHECK-NEXT: DW_AT_producer DW_FORM_strp +; CHECK-NEXT: DW_AT_language DW_FORM_data2 +; CHECK-NEXT: DW_AT_name DW_FORM_strp +; DWARFv3-NEXT: DW_AT_stmt_list DW_FORM_data8 +; DWARFv4-NEXT: DW_AT_stmt_list DW_FORM_sec_offset +; CHECK-NEXT: DW_AT_comp_dir DW_FORM_strp +; CHECK: [2] DW_TAG_variable DW_CHILDREN_no +; CHECK-NEXT: DW_AT_name DW_FORM_strp +; CHECK-NEXT: DW_AT_type DW_FORM_ref4 +; CHECK: [3] DW_TAG_base_type DW_CHILDREN_no +; CHECK-NEXT: DW_AT_name DW_FORM_strp + +; CHECK: .debug_info contents: +; CHECK: Compile Unit: length = 0x{{([[:xdigit:]]{16})}}, format = DWARF64, +; CHECK: DW_TAG_compile_unit [1] * +; CHECK-NEXT: DW_AT_producer [DW_FORM_strp] ( .debug_str[0x{{([[:xdigit:]]{16})}}] = "clang version 12.0.0") +; CHECK-NEXT: DW_AT_language [DW_FORM_data2] (DW_LANG_C99) +; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[0x{{([[:xdigit:]]{16})}}] = "foo.c") +; DWARFv3-NEXT: DW_AT_stmt_list [DW_FORM_data8] (0x0000000000000000) +; DWARFv4-NEXT: DW_AT_stmt_list [DW_FORM_sec_offset] (0x0000000000000000) +; CHECK-NEXT: DW_AT_comp_dir [DW_FORM_strp] ( .debug_str[0x{{([[:xdigit:]]{16})}}] = "/tmp") +; CHECK: DW_TAG_variable [2] +; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[0x{{([[:xdigit:]]{16})}}] = "foo") +; CHECK-NEXT: DW_AT_type [DW_FORM_ref4] (cu + {{.+}} => {{.+}} "int") +; CHECK: DW_TAG_base_type [3] +; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[0x{{([[:xdigit:]]{16})}}] = "int") + +; IR generated and reduced from: +; $ cat foo.c +; int foo; +; $ clang -g -S -emit-llvm foo.c -o foo.ll + +target triple = "x86_64-unknown-linux-gnu" + +@foo = dso_local global i32 0, align 4, !dbg !0 + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!7, !8, !9} +!llvm.ident = !{!10} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "foo", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "foo.c", directory: "/tmp") +!4 = !{} +!5 = !{!0} +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!7 = !{i32 7, !"Dwarf Version", i32 4} +!8 = !{i32 2, !"Debug Info Version", i32 3} +!9 = !{i32 1, !"wchar_size", i32 4} +!10 = !{!"clang version 12.0.0"} diff --git a/llvm/test/DebugInfo/X86/debug-info-dwo-dwarf64.ll b/llvm/test/DebugInfo/X86/debug-info-dwo-dwarf64.ll new file mode 100644 index 0000000000000..acc2fded69129 --- /dev/null +++ b/llvm/test/DebugInfo/X86/debug-info-dwo-dwarf64.ll @@ -0,0 +1,32 @@ +; This checks that .debug_info.dwo can be generated in the DWARF64 format. 
+ +; RUN: llc -mtriple=x86_64 -dwarf-version=5 -dwarf64 -split-dwarf-file=foo.dwo -filetype=obj %s -o %t +; RUN: llvm-dwarfdump -debug-info %t | FileCheck %s + +; CHECK: .debug_info.dwo contents: +; CHECK-NEXT: Compile Unit: {{.+}}, format = DWARF64, version = 0x0005, unit_type = DW_UT_split_compile, abbr_offset = 0x0000, + +; IR generated and reduced from: +; $ cat foo.c +; int foo; +; $ clang -g -S -emit-llvm foo.c -o foo.ll + +target triple = "x86_64-unknown-linux-gnu" + +@foo = dso_local global i32 0, align 4, !dbg !0 + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!7, !8, !9} +!llvm.ident = !{!10} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "foo", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "foo.c", directory: "/tmp") +!4 = !{} +!5 = !{!0} +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!7 = !{i32 7, !"Dwarf Version", i32 4} +!8 = !{i32 2, !"Debug Info Version", i32 3} +!9 = !{i32 1, !"wchar_size", i32 4} +!10 = !{!"clang version 12.0.0"} diff --git a/llvm/test/DebugInfo/X86/debug-line-dwarf64.ll b/llvm/test/DebugInfo/X86/debug-line-dwarf64.ll new file mode 100644 index 0000000000000..e5045f1495063 --- /dev/null +++ b/llvm/test/DebugInfo/X86/debug-line-dwarf64.ll @@ -0,0 +1,35 @@ +; This checks that .debug_line can be generated in the DWARF64 format. + +; RUN: llc -mtriple=x86_64 -dwarf-version=3 -dwarf64 -filetype=obj %s -o %t3 +; RUN: llvm-dwarfdump -debug-line %t3 | FileCheck %s + +; CHECK: .debug_line contents: +; CHECK-NEXT: debug_line[0x00000000] +; CHECK-NEXT: Line table prologue: +; CHECK-NEXT: total_length: +; CHECK-NEXT: format: DWARF64 + +; IR generated and reduced from: +; $ cat foo.c +; int foo; +; $ clang -g -S -emit-llvm foo.c -o foo.ll + +target triple = "x86_64-unknown-linux-gnu" + +@foo = dso_local global i32 0, align 4, !dbg !0 + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!7, !8, !9} +!llvm.ident = !{!10} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "foo", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "foo.c", directory: "/tmp") +!4 = !{} +!5 = !{!0} +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!7 = !{i32 7, !"Dwarf Version", i32 4} +!8 = !{i32 2, !"Debug Info Version", i32 3} +!9 = !{i32 1, !"wchar_size", i32 4} +!10 = !{!"clang version 12.0.0"} diff --git a/llvm/test/DebugInfo/X86/debug-macro-dwarf64.ll b/llvm/test/DebugInfo/X86/debug-macro-dwarf64.ll new file mode 100644 index 0000000000000..8a41922cac12f --- /dev/null +++ b/llvm/test/DebugInfo/X86/debug-macro-dwarf64.ll @@ -0,0 +1,52 @@ +; This checks that .debug_macro[.dwo] can be generated in the DWARF64 format. 
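+
+; (On the header flags: 0x03 sets bit 0, the offset_size_flag, which selects
+; 64-bit offsets and hence format = DWARF64, and bit 1, the
+; debug_line_offset_flag, which announces the 8-byte debug_line_offset field
+; that follows; the DWARFv4 GNU extension and DWARFv5 share this header
+; layout.)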
+ +; RUN: llc -mtriple=x86_64 -dwarf-version=4 -dwarf64 -use-gnu-debug-macro -filetype=obj %s -o %t +; RUN: llvm-dwarfdump -debug-macro %t | FileCheck %s --check-prefix=DWARF4 + +; RUN: llc -mtriple=x86_64 -dwarf-version=5 -dwarf64 -filetype=obj %s -o %t +; RUN: llvm-dwarfdump -debug-macro %t | FileCheck %s --check-prefix=DWARF5 + +; RUN: llc -mtriple=x86_64 -dwarf-version=5 -dwarf64 -split-dwarf-file=foo.dwo -filetype=obj %s -o %t +; RUN: llvm-dwarfdump -debug-macro %t | FileCheck %s --check-prefixes=DWARF5,DWO + +; DWARF4: .debug_macro contents: +; DWARF4-NEXT: 0x00000000: +; DWARF4-NEXT: macro header: version = 0x0004, flags = 0x03, format = DWARF64, debug_line_offset = 0x0000000000000000 +; DWARF4-NEXT: DW_MACRO_GNU_start_file - lineno: 0 filenum: 1 +; DWARF4-NEXT: DW_MACRO_GNU_define_indirect - lineno: 1 macro: FOO 1 +; DWARF4-NEXT: DW_MACRO_GNU_undef_indirect - lineno: 2 macro: BAR +; DWARF4-NEXT: DW_MACRO_GNU_end_file + +; DWARF5: .debug_macro contents: +; DWO: .debug_macro.dwo contents: +; DWARF5-NEXT: 0x00000000: +; DWARF5-NEXT: macro header: version = 0x0005, flags = 0x03, format = DWARF64, debug_line_offset = 0x0000000000000000 +; DWARF5-NEXT: DW_MACRO_start_file - lineno: 0 filenum: 0 +; DWARF5-NEXT: DW_MACRO_define_strx - lineno: 1 macro: FOO 1 +; DWARF5-NEXT: DW_MACRO_undef_strx - lineno: 2 macro: BAR +; DWARF5-NEXT: DW_MACRO_end_file + +; IR generated and reduced from: +; $ cat foo.c +; #define FOO 1 +; #undef BAR +; $ clang -g -S -emit-llvm -fdebug-macro foo.c -o foo.ll + +target triple = "x86_64-unknown-linux-gnu" + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!348, !349, !350} +!llvm.ident = !{!351} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, macros: !3, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "foo.c", directory: "/tmp") +!2 = !{} +!3 = !{!4} +!4 = !DIMacroFile(file: !1, nodes: !5) +!5 = !{!6, !7} +!6 = !DIMacro(type: DW_MACINFO_define, line: 1, name: "FOO", value: "1") +!7 = !DIMacro(type: DW_MACINFO_undef, line: 2, name: "BAR") +!348 = !{i32 7, !"Dwarf Version", i32 4} +!349 = !{i32 2, !"Debug Info Version", i32 3} +!350 = !{i32 1, !"wchar_size", i32 4} +!351 = !{!"clang version 12.0.0"} diff --git a/llvm/test/DebugInfo/X86/debug-names-dwarf64.ll b/llvm/test/DebugInfo/X86/debug-names-dwarf64.ll new file mode 100644 index 0000000000000..3fc91ef85df1f --- /dev/null +++ b/llvm/test/DebugInfo/X86/debug-names-dwarf64.ll @@ -0,0 +1,87 @@ +; This checks that .debug_names can be generated in the DWARF64 format. 
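+
+; (Observation, an implementation detail rather than a DWARF5 requirement:
+; the abbreviation codes below reuse the numeric tag values, so the
+; DW_TAG_variable entry appears as abbreviation 0x34 and the DW_TAG_base_type
+; entry as abbreviation 0x24.)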
+ +; RUN: llc -mtriple=x86_64 -dwarf64 -accel-tables=Dwarf -dwarf-version=5 -filetype=obj %s -o %t +; RUN: llvm-dwarfdump -debug-info -debug-names %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-names -verify %t | FileCheck --check-prefix=VERIFY %s + +; CHECK: .debug_info contents: +; CHECK-NEXT: 0x00000000: Compile Unit: {{.+}}, format = DWARF64, +; CHECK: [[VARDIE:.+]]: DW_TAG_variable +; CHECK-NEXT: DW_AT_name ("foo") +; CHECK: [[TYPEDIE:.+]]: DW_TAG_base_type +; CHECK-NEXT: DW_AT_name ("int") + +; CHECK: .debug_names contents: +; CHECK-NEXT: Name Index @ 0x0 { +; CHECK-NEXT: Header { +; CHECK: Format: DWARF64 +; CHECK-NEXT: Version: 5 +; CHECK-NEXT: CU count: 1 +; CHECK-NEXT: Local TU count: 0 +; CHECK-NEXT: Foreign TU count: 0 +; CHECK-NEXT: Bucket count: 2 +; CHECK-NEXT: Name count: 2 +; CHECK: } +; CHECK-NEXT: Compilation Unit offsets [ +; CHECK-NEXT: CU[0]: 0x00000000 +; CHECK-NEXT: ] +; CHECK-NEXT: Abbreviations [ +; CHECK-NEXT: Abbreviation 0x34 { +; CHECK-NEXT: Tag: DW_TAG_variable +; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; CHECK-NEXT: } +; CHECK-NEXT: Abbreviation 0x24 { +; CHECK-NEXT: Tag: DW_TAG_base_type +; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; CHECK-NEXT: } +; CHECK-NEXT: ] +; CHECK-NEXT: Bucket 0 [ +; CHECK-NEXT: Name 1 { +; CHECK-NEXT: Hash: 0xB888030 +; CHECK-NEXT: String: {{.+}} "int" +; CHECK-NEXT: Entry @ {{.+}} { +; CHECK-NEXT: Abbrev: 0x24 +; CHECK-NEXT: Tag: DW_TAG_base_type +; CHECK-NEXT: DW_IDX_die_offset: [[TYPEDIE]] +; CHECK-NEXT: } +; CHECK-NEXT: } +; CHECK-NEXT: ] +; CHECK-NEXT: Bucket 1 [ +; CHECK-NEXT: Name 2 { +; CHECK-NEXT: Hash: 0xB887389 +; CHECK-NEXT: String: {{.+}} "foo" +; CHECK-NEXT: Entry @ {{.+}} { +; CHECK-NEXT: Abbrev: 0x34 +; CHECK-NEXT: Tag: DW_TAG_variable +; CHECK-NEXT: DW_IDX_die_offset: [[VARDIE]] +; CHECK-NEXT: } +; CHECK-NEXT: } +; CHECK-NEXT: ] +; CHECK-NEXT: } + +; VERIFY: No errors. + +; IR generated and reduced from: +; $ cat foo.c +; int foo; +; $ clang -g -gpubnames -S -emit-llvm foo.c -o foo.ll + +target triple = "x86_64-unknown-linux-gnu" + +@foo = dso_local global i32 0, align 4, !dbg !0 + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!7, !8, !9} +!llvm.ident = !{!10} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "foo", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, splitDebugInlining: false) +!3 = !DIFile(filename: "foo.c", directory: "/tmp") +!4 = !{} +!5 = !{!0} +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!7 = !{i32 7, !"Dwarf Version", i32 4} +!8 = !{i32 2, !"Debug Info Version", i32 3} +!9 = !{i32 1, !"wchar_size", i32 4} +!10 = !{!"clang version 12.0.0"} diff --git a/llvm/test/DebugInfo/X86/debug-pubtables-dwarf64.ll b/llvm/test/DebugInfo/X86/debug-pubtables-dwarf64.ll new file mode 100644 index 0000000000000..5ac3551e68d35 --- /dev/null +++ b/llvm/test/DebugInfo/X86/debug-pubtables-dwarf64.ll @@ -0,0 +1,54 @@ +; This checks that .debug_pubnames and .debug_pubtypes can be generated in the DWARF64 format. 
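+
+; (A note on the lengths, not separately checked: after the 12-byte DWARF64
+; unit_length, a set consists of a 2-byte version, an 8-byte unit offset and
+; an 8-byte unit size, then per entry an 8-byte DIE offset plus the
+; NUL-terminated name, closed by an 8-byte zero terminator. That gives
+; 2+8+8+(8+4)+8 = 0x26 for "foo" and 2+8+8+(8+4)+(8+4)+8 = 0x32 for "Foo" and
+; "int".)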
+ +; RUN: llc -mtriple=x86_64 -dwarf64 -filetype=obj %s -o %t +; RUN: llvm-dwarfdump -debug-info -debug-pubnames -debug-pubtypes %t | FileCheck %s + +; CHECK: .debug_info contents: +; CHECK: 0x[[VAR:.+]]: DW_TAG_variable +; CHECK-NEXT: DW_AT_name ("foo") +; CHECK: 0x[[STRUCT:.+]]: DW_TAG_structure_type +; CHECK-NEXT: DW_AT_name ("Foo") +; CHECK: 0x[[BASET:.+]]: DW_TAG_base_type +; CHECK-NEXT: DW_AT_name ("int") + +; CHECK: .debug_pubnames contents: +; CHECK-NEXT: length = 0x0000000000000026, format = DWARF64, version = 0x0002, unit_offset = +; CHECK-NEXT: Offset Name +; CHECK-NEXT: 0x00000000[[VAR]] "foo" + +; CHECK: .debug_pubtypes contents: +; CHECK-NEXT: length = 0x0000000000000032, format = DWARF64, version = 0x0002, unit_offset = +; CHECK-NEXT: Offset Name +; CHECK-NEXT: 0x00000000[[STRUCT]] "Foo" +; CHECK-NEXT: 0x00000000[[BASET]] "int" + +; IR generated and reduced from: +; $ cat foo.c +; struct Foo { int bar; }; +; struct Foo foo; +; $ clang -g -gpubnames -S -emit-llvm foo.c -o foo.ll + +target triple = "x86_64-unknown-linux-gnu" + +%struct.Foo = type { i32 } + +@foo = dso_local global %struct.Foo zeroinitializer, align 4, !dbg !0 + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!10, !11, !12} +!llvm.ident = !{!13} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "foo", scope: !2, file: !3, line: 2, type: !6, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, splitDebugInlining: false) +!3 = !DIFile(filename: "foo.c", directory: "/tmp") +!4 = !{} +!5 = !{!0} +!6 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Foo", file: !3, line: 1, size: 32, elements: !7) +!7 = !{!8} +!8 = !DIDerivedType(tag: DW_TAG_member, name: "bar", scope: !6, file: !3, line: 1, baseType: !9, size: 32) +!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!10 = !{i32 7, !"Dwarf Version", i32 4} +!11 = !{i32 2, !"Debug Info Version", i32 3} +!12 = !{i32 1, !"wchar_size", i32 4} +!13 = !{!"clang version 12.0.0"} diff --git a/llvm/test/DebugInfo/X86/debug-str-offsets-dwarf64.ll b/llvm/test/DebugInfo/X86/debug-str-offsets-dwarf64.ll new file mode 100644 index 0000000000000..043c72e9b3c48 --- /dev/null +++ b/llvm/test/DebugInfo/X86/debug-str-offsets-dwarf64.ll @@ -0,0 +1,57 @@ +; This checks that .debug_str_offsets can be generated in the DWARF64 format. 
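+
+; (On the numbers below: the contribution size of 44 is a 2-byte version,
+; 2 bytes of padding, and five 8-byte offsets; the entries start at 0x10, just
+; past the 12-byte DWARF64 unit_length and the 4-byte version/padding word.)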
+ +; RUN: llc -mtriple=x86_64 -dwarf-version=5 -dwarf64 -filetype=obj %s -o %t +; RUN: llvm-dwarfdump -debug-info -debug-str -debug-str-offsets -v %t | \ +; RUN: FileCheck %s + +; CHECK: .debug_info contents: +; CHECK-NEXT: Compile Unit: {{.*}}, format = DWARF64, +; CHECK: DW_TAG_compile_unit [1] * +; CHECK: DW_AT_producer [DW_FORM_strx1] (indexed (00000000) string = "clang version 12.0.0") +; CHECK: DW_AT_name [DW_FORM_strx1] (indexed (00000001) string = "foo.c") +; CHECK: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000000000000010) +; CHECK: DW_AT_comp_dir [DW_FORM_strx1] (indexed (00000002) string = "/tmp") +; CHECK: DW_TAG_variable [2] +; CHECK: DW_AT_name [DW_FORM_strx1] (indexed (00000003) string = "foo") +; CHECK: DW_TAG_base_type [3] +; CHECK: DW_AT_name [DW_FORM_strx1] (indexed (00000004) string = "int") + +; CHECK: .debug_str contents: +; CHECK-NEXT: 0x00000000: "clang version 12.0.0" +; CHECK-NEXT: 0x00000015: "foo.c" +; CHECK-NEXT: 0x0000001b: "/tmp" +; CHECK-NEXT: 0x00000020: "foo" +; CHECK-NEXT: 0x00000024: "int" + +; CHECK: .debug_str_offsets contents: +; CHECK-NEXT: 0x00000000: Contribution size = 44, Format = DWARF64, Version = 5 +; CHECK-NEXT: 0x00000010: 0000000000000000 "clang version 12.0.0" +; CHECK-NEXT: 0x00000018: 0000000000000015 "foo.c" +; CHECK-NEXT: 0x00000020: 000000000000001b "/tmp" +; CHECK-NEXT: 0x00000028: 0000000000000020 "foo" +; CHECK-NEXT: 0x00000030: 0000000000000024 "int" + +; IR generated and reduced from: +; $ cat foo.c +; int foo; +; $ clang -g -S -emit-llvm foo.c -o foo.ll + +target triple = "x86_64-unknown-linux-gnu" + +@foo = dso_local global i32 0, align 4, !dbg !0 + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!7, !8, !9} +!llvm.ident = !{!10} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "foo", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "foo.c", directory: "/tmp") +!4 = !{} +!5 = !{!0} +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!7 = !{i32 7, !"Dwarf Version", i32 4} +!8 = !{i32 2, !"Debug Info Version", i32 3} +!9 = !{i32 1, !"wchar_size", i32 4} +!10 = !{!"clang version 12.0.0"} diff --git a/llvm/test/DebugInfo/X86/debug-str-offsets-dwo-dwarf64.ll b/llvm/test/DebugInfo/X86/debug-str-offsets-dwo-dwarf64.ll new file mode 100644 index 0000000000000..1366c195f60be --- /dev/null +++ b/llvm/test/DebugInfo/X86/debug-str-offsets-dwo-dwarf64.ll @@ -0,0 +1,56 @@ +; This checks that .debug_str_offsets.dwo can be generated in the DWARF64 format. 
+ +; RUN: llc -mtriple=x86_64 -dwarf-version=5 -dwarf64 -split-dwarf-file=foo.dwo -filetype=obj %s -o %t +; RUN: llvm-dwarfdump -debug-info -debug-str -debug-str-offsets -v %t | \ +; RUN: FileCheck %s + +; CHECK: .debug_info.dwo contents: +; CHECK-NEXT: Compile Unit: {{.*}}, format = DWARF64, +; CHECK: DW_TAG_compile_unit [1] * +; CHECK: DW_AT_producer [DW_FORM_strx1] (indexed (00000002) string = "clang version 12.0.0") +; CHECK: DW_AT_name [DW_FORM_strx1] (indexed (00000003) string = "foo.c") +; CHECK: DW_AT_dwo_name [DW_FORM_strx1] (indexed (00000004) string = "foo.dwo") +; CHECK: DW_TAG_variable [2] +; CHECK: DW_AT_name [DW_FORM_strx1] (indexed (00000000) string = "foo") +; CHECK: DW_TAG_base_type [3] +; CHECK: DW_AT_name [DW_FORM_strx1] (indexed (00000001) string = "int") + +; CHECK: .debug_str.dwo contents: +; CHECK-NEXT: 0x00000000: "foo" +; CHECK-NEXT: 0x00000004: "int" +; CHECK-NEXT: 0x00000008: "clang version 12.0.0" +; CHECK-NEXT: 0x0000001d: "foo.c" +; CHECK-NEXT: 0x00000023: "foo.dwo" + +; CHECK: .debug_str_offsets.dwo contents: +; CHECK-NEXT: 0x00000000: Contribution size = 44, Format = DWARF64, Version = 5 +; CHECK-NEXT: 0x00000010: 0000000000000000 "foo" +; CHECK-NEXT: 0x00000018: 0000000000000004 "int" +; CHECK-NEXT: 0x00000020: 0000000000000008 "clang version 12.0.0" +; CHECK-NEXT: 0x00000028: 000000000000001d "foo.c" +; CHECK-NEXT: 0x00000030: 0000000000000023 "foo.dwo" + +; IR generated and reduced from: +; $ cat foo.c +; int foo; +; $ clang -g -S -emit-llvm foo.c -o foo.ll + +target triple = "x86_64-unknown-linux-gnu" + +@foo = dso_local global i32 0, align 4, !dbg !0 + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!7, !8, !9} +!llvm.ident = !{!10} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "foo", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "foo.c", directory: "/tmp") +!4 = !{} +!5 = !{!0} +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!7 = !{i32 7, !"Dwarf Version", i32 4} +!8 = !{i32 2, !"Debug Info Version", i32 3} +!9 = !{i32 1, !"wchar_size", i32 4} +!10 = !{!"clang version 12.0.0"} diff --git a/llvm/test/DebugInfo/X86/debug-types-dwarf64.ll b/llvm/test/DebugInfo/X86/debug-types-dwarf64.ll new file mode 100644 index 0000000000000..7e88d7ef6a3ba --- /dev/null +++ b/llvm/test/DebugInfo/X86/debug-types-dwarf64.ll @@ -0,0 +1,55 @@ +; This checks that .debug_types can be generated in the DWARF64 format. 
+ +; RUN: llc -mtriple=x86_64 -dwarf-version=4 -dwarf64 -generate-type-units -filetype=obj %s -o %t +; RUN: llvm-dwarfdump -debug-types -v %t | FileCheck %s + +; CHECK: .debug_types contents: +; CHECK-NEXT: Type Unit: {{.+}}, format = DWARF64, {{.+}}, type_offset = 0x[[OFF:.+]] (next unit at + +; CHECK: 0x00000027: DW_TAG_type_unit + +; CHECK: 0x0000[[OFF]]: DW_TAG_structure_type +; CHECK-NEXT: DW_AT_calling_convention +; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ({{.+}} = "Foo") + +; CHECK: 0x{{.+}}: DW_TAG_member +; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ({{.+}} = "bar") +; CHECK-NEXT: DW_AT_type [DW_FORM_ref4] (cu + 0x[[BTOFF:.+]] => {0x0000[[BTOFF]]} "int") + +; CHECK: 0x{{.+}}: NULL + +; CHECK: 0x0000[[BTOFF]]: DW_TAG_base_type [4] +; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ({{.+}} = "int") + +; CHECK: 0x{{.+}}: NULL + +; IR generated and reduced from: +; $ cat foo.cc +; struct Foo { int bar; }; +; Foo foo; +; $ clang -g -S -emit-llvm foo.cc -o foo.ll + +target triple = "x86_64-unknown-linux-gnu" + +%struct.Foo = type { i32 } + +@foo = dso_local global %struct.Foo zeroinitializer, align 4, !dbg !0 + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!10, !11, !12} +!llvm.ident = !{!13} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "foo", scope: !2, file: !3, line: 2, type: !6, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "foo.cc", directory: "/tmp") +!4 = !{} +!5 = !{!0} +!6 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Foo", file: !3, line: 1, size: 32, flags: DIFlagTypePassByValue, elements: !7, identifier: "_ZTS3Foo") +!7 = !{!8} +!8 = !DIDerivedType(tag: DW_TAG_member, name: "bar", scope: !6, file: !3, line: 1, baseType: !9, size: 32) +!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!10 = !{i32 7, !"Dwarf Version", i32 4} +!11 = !{i32 2, !"Debug Info Version", i32 3} +!12 = !{i32 1, !"wchar_size", i32 4} +!13 = !{!"clang version 12.0.0"} diff --git a/llvm/test/DebugInfo/X86/dwarf64-support.ll b/llvm/test/DebugInfo/X86/dwarf64-support.ll new file mode 100644 index 0000000000000..6790cafd551eb --- /dev/null +++ b/llvm/test/DebugInfo/X86/dwarf64-support.ll @@ -0,0 +1,59 @@ +; This checks cases when the 64-bit DWARF debug info should not be generated +; even if '-dwarf64' is specified. + +; The 64-bit DWARF format was introduced in DWARFv3, so the '-dwarf64' switch +; should be ignored for earlier versions. +; RUN: llc -mtriple=x86_64 -dwarf-version=2 -dwarf64 -filetype=obj %s -o - | \ +; RUN: llvm-dwarfdump -debug-line - | \ +; RUN: FileCheck %s --check-prefixes=ELF64,CHECK + +; DWARF64 requires 64-bit relocations, so it is not produced for 32-bit targets. +; RUN: llc -mtriple=i386 -dwarf-version=5 -dwarf64 -filetype=obj %s -o - | \ +; RUN: llvm-dwarfdump -debug-line - | \ +; RUN: FileCheck %s --check-prefixes=ELF32,CHECK + +; DWARF64 is enabled only for ELF targets. The switch should be ignored for COFF. +; RUN: llc -mtriple=x86_64-windows-gnu -dwarf-version=5 -dwarf64 -filetype=obj %s -o - | \ +; RUN: llvm-dwarfdump -debug-line - | \ +; RUN: FileCheck %s --check-prefixes=COFF,CHECK + +; DWARF64 is enabled only for ELF targets. The switch should be ignored for Mach-O. 
+; RUN: llc -mtriple=x86_64-apple-darwin -dwarf-version=5 -dwarf64 -filetype=obj %s -o - | \ +; RUN: llvm-dwarfdump -debug-line - | \ +; RUN: FileCheck %s --check-prefixes=MACHO,CHECK + +; ELF64: file format elf64-x86-64 +; ELF32: file format elf32-i386 +; COFF: file format COFF-x86-64 +; MACHO: file format Mach-O 64-bit x86-64 + +; CHECK: .debug_line contents: +; CHECK-NEXT: debug_line[0x00000000] +; CHECK-NEXT: Line table prologue: +; CHECK-NEXT: total_length: +; CHECK-NEXT: format: DWARF32 + +; IR generated and reduced from: +; $ cat foo.c +; int foo; +; $ clang -g -S -emit-llvm foo.c -o foo.ll + +target triple = "x86_64-unknown-linux-gnu" + +@foo = dso_local global i32 0, align 4, !dbg !0 + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!7, !8, !9} +!llvm.ident = !{!10} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "foo", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "foo.c", directory: "/tmp") +!4 = !{} +!5 = !{!0} +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!7 = !{i32 7, !"Dwarf Version", i32 4} +!8 = !{i32 2, !"Debug Info Version", i32 3} +!9 = !{i32 1, !"wchar_size", i32 4} +!10 = !{!"clang version 12.0.0"} diff --git a/llvm/test/DebugInfo/X86/op_deref.ll b/llvm/test/DebugInfo/X86/op_deref.ll index 1b49dc554f7ef..5de9976d6de2a 100644 --- a/llvm/test/DebugInfo/X86/op_deref.ll +++ b/llvm/test/DebugInfo/X86/op_deref.ll @@ -6,10 +6,10 @@ ; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=DWARF3 ; DWARF4: DW_AT_location [DW_FORM_sec_offset] (0x00000000 -; DWARF4-NEXT: {{.*}}: DW_OP_breg2 RCX+0, DW_OP_deref +; DWARF4-NEXT: {{.*}}: DW_OP_breg1 RDX+0, DW_OP_deref ; DWARF3: DW_AT_location [DW_FORM_data4] (0x00000000 -; DWARF3-NEXT: {{.*}}: DW_OP_breg2 RCX+0, DW_OP_deref +; DWARF3-NEXT: {{.*}}: DW_OP_breg1 RDX+0, DW_OP_deref ; CHECK-NOT: DW_TAG ; CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000067] = "vla") @@ -17,8 +17,8 @@ ; Check the DEBUG_VALUE comments for good measure. ; RUN: llc -O0 -mtriple=x86_64-apple-darwin %s -o - -filetype=asm | FileCheck %s -check-prefix=ASM-CHECK ; vla should have a register-indirect address at one point. 
-; ASM-CHECK: DEBUG_VALUE: vla <- [DW_OP_deref] [$rcx+0] -; ASM-CHECK: DW_OP_breg2 +; ASM-CHECK: DEBUG_VALUE: vla <- [DW_OP_deref] [$rdx+0] +; ASM-CHECK: DW_OP_breg1 ; RUN: llvm-as %s -o - | llvm-dis - | FileCheck %s --check-prefix=PRETTY-PRINT ; PRETTY-PRINT: DIExpression(DW_OP_deref) diff --git a/llvm/test/DebugInfo/X86/split-dwarf-v5-ranges.ll b/llvm/test/DebugInfo/X86/split-dwarf-v5-ranges.ll index 183787620b7d3..bf9b24387c15d 100644 --- a/llvm/test/DebugInfo/X86/split-dwarf-v5-ranges.ll +++ b/llvm/test/DebugInfo/X86/split-dwarf-v5-ranges.ll @@ -1,22 +1,29 @@ -; RUN: llc -split-dwarf-file=foo.dwo -mtriple=x86_64-unknown-linux-gnu -filetype=obj < %s \ -; RUN: | llvm-dwarfdump -v -debug-info -debug-rnglists - | FileCheck %s +; RUN: llc -split-dwarf-file=foo.dwo -mtriple=x86_64-unknown-linux-gnu -filetype=obj %s -o %t32 +; RUN: llvm-dwarfdump -v -debug-info -debug-rnglists %t32 | \ +; RUN: FileCheck %s --check-prefixes=CHECK,DWARF32 -; CHECK: .debug_info contents: -; CHECK: .debug_info.dwo contents: -; CHECK: DW_AT_ranges [DW_FORM_rnglistx] (indexed (0x0) rangelist = 0x00000010 -; CHECK: [0x0000000000000001, 0x000000000000000c) ".text" -; CHECK: [0x000000000000000e, 0x0000000000000013) ".text") +; RUN: llc -dwarf64 -split-dwarf-file=foo.dwo -mtriple=x86_64-unknown-linux-gnu -filetype=obj %s -o %t64 +; RUN: llvm-dwarfdump -v -debug-info -debug-rnglists %t64 | \ +; RUN: FileCheck %s --check-prefixes=CHECK,DWARF64 -; CHECK: .debug_rnglists.dwo contents: -; CHECK: 0x00000000: range list header: length = 0x00000015, format = DWARF32, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000001 -; CHECK: offsets: [ -; CHECK: 0x00000004 => 0x00000010 -; CHECK: ] -; CHECK: ranges: -; CHECK: 0x00000010: [DW_RLE_base_addressx]: 0x0000000000000000 -; CHECK: 0x00000012: [DW_RLE_offset_pair ]: 0x0000000000000001, 0x000000000000000c => [0x0000000000000001, 0x000000000000000c) -; CHECK: 0x00000015: [DW_RLE_offset_pair ]: 0x000000000000000e, 0x0000000000000013 => [0x000000000000000e, 0x0000000000000013) -; CHECK: 0x00000018: [DW_RLE_end_of_list ] +; CHECK: .debug_info contents: +; CHECK: .debug_info.dwo contents: +; CHECK: DW_AT_ranges [DW_FORM_rnglistx] (indexed (0x0) rangelist = 0x[[#%.8x,RNG_OFF:]] +; CHECK: [0x0000000000000001, 0x000000000000000c) ".text" +; CHECK: [0x000000000000000e, 0x0000000000000013) ".text") + +; CHECK: .debug_rnglists.dwo contents: +; DWARF32: 0x00000000: range list header: length = 0x00000015, format = DWARF32, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000001 +; DWARF64: 0x00000000: range list header: length = 0x0000000000000019, format = DWARF64, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000001 +; CHECK: offsets: [ +; DWARF32: 0x00000004 => 0x[[#RNG_OFF]] +; DWARF64: 0x0000000000000008 => 0x[[#RNG_OFF]] +; CHECK: ] +; CHECK: ranges: +; CHECK: 0x[[#RNG_OFF]]: [DW_RLE_base_addressx]: 0x0000000000000000 +; CHECK: 0x[[#RNG_OFF+2]]: [DW_RLE_offset_pair ]: 0x0000000000000001, 0x000000000000000c => [0x0000000000000001, 0x000000000000000c) +; CHECK: 0x[[#RNG_OFF+5]]: [DW_RLE_offset_pair ]: 0x000000000000000e, 0x0000000000000013 => [0x000000000000000e, 0x0000000000000013) +; CHECK: 0x[[#RNG_OFF+8]]: [DW_RLE_end_of_list ] ; Function Attrs: noinline optnone uwtable define dso_local void @_Z2f3v() !dbg !7 { diff --git a/llvm/test/Instrumentation/HeapProfiler/basic.ll b/llvm/test/Instrumentation/HeapProfiler/basic.ll index a26dae15f5090..cf6320414bd38 100644 --- 
a/llvm/test/Instrumentation/HeapProfiler/basic.ll +++ b/llvm/test/Instrumentation/HeapProfiler/basic.ll @@ -1,15 +1,15 @@ ; Test basic address sanitizer instrumentation. ; -; RUN: opt < %s -heapprof -heapprof-module -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s -; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-scale=5 -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s +; RUN: opt < %s -memprof -memprof-module -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s +; RUN: opt < %s -memprof -memprof-module -memprof-mapping-scale=5 -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s -; We need the requires since both heapprof and heapprof-module require reading module level metadata which is done once by the heapprof-globals-md analysis -; RUN: opt < %s -passes='function(heapprof),module(heapprof-module)' -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s -; RUN: opt < %s -passes='function(heapprof),module(heapprof-module)' -heapprof-mapping-scale=5 -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s +; We need the requires since both memprof and memprof-module require reading module level metadata which is done once by the memprof-globals-md analysis +; RUN: opt < %s -passes='function(memprof),module(memprof-module)' -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s +; RUN: opt < %s -passes='function(memprof),module(memprof-module)' -memprof-mapping-scale=5 -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-unknown-linux-gnu" -; CHECK: @llvm.global_ctors = {{.*}}@heapprof.module_ctor +; CHECK: @llvm.global_ctors = {{.*}}@memprof.module_ctor define i32 @test_load(i32* %a) { entry: @@ -17,7 +17,7 @@ entry: ret i32 %tmp1 } ; CHECK-LABEL: @test_load -; CHECK: %[[SHADOW_OFFSET:[^ ]*]] = load i64, i64* @__heapprof_shadow_memory_dynamic_address +; CHECK: %[[SHADOW_OFFSET:[^ ]*]] = load i64, i64* @__memprof_shadow_memory_dynamic_address ; CHECK-NEXT: %[[LOAD_ADDR:[^ ]*]] = ptrtoint i32* %a to i64 ; CHECK-NEXT: %[[MASKED_ADDR:[^ ]*]] = and i64 %[[LOAD_ADDR]], -64 ; CHECK-S3-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 3 @@ -37,7 +37,7 @@ entry: ret void } ; CHECK-LABEL: @test_store -; CHECK: %[[SHADOW_OFFSET:[^ ]*]] = load i64, i64* @__heapprof_shadow_memory_dynamic_address +; CHECK: %[[SHADOW_OFFSET:[^ ]*]] = load i64, i64* @__memprof_shadow_memory_dynamic_address ; CHECK-NEXT: %[[STORE_ADDR:[^ ]*]] = ptrtoint i32* %a to i64 ; CHECK-NEXT: %[[MASKED_ADDR:[^ ]*]] = and i64 %[[STORE_ADDR]], -64 ; CHECK-S3-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 3 @@ -127,14 +127,14 @@ define void @i80test(i80* %a, i80* %b) nounwind uwtable { ; CHECK: store i80 %t, i80* %b ; CHECK: ret void -; heapprof should not instrument functions with available_externally linkage. +; memprof should not instrument functions with available_externally linkage. 
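; (Rationale, as an aside: available_externally bodies exist only as
; optimizer hints and are discarded by codegen, so instrumenting them would
; add shadow checks to code that never reaches the final binary. The function
; below verifies the pass leaves them alone.)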
define available_externally i32 @f_available_externally(i32* %a) { entry: %tmp1 = load i32, i32* %a ret i32 %tmp1 } ; CHECK-LABEL: @f_available_externally -; CHECK-NOT: __heapprof_shadow_memory_dynamic_address +; CHECK-NOT: __memprof_shadow_memory_dynamic_address ; CHECK: ret i32 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind @@ -150,9 +150,9 @@ define void @memintr_test(i8* %a, i8* %b) nounwind uwtable { } ; CHECK-LABEL: memintr_test -; CHECK: __heapprof_memset -; CHECK: __heapprof_memmove -; CHECK: __heapprof_memcpy +; CHECK: __memprof_memset +; CHECK: __memprof_memmove +; CHECK: __memprof_memcpy ; CHECK: ret void declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture writeonly, i8, i64, i32) nounwind @@ -161,7 +161,7 @@ declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture w define void @memintr_element_atomic_test(i8* %a, i8* %b) nounwind uwtable { ; This is a canary test to make sure that these don't get lowered into calls that don't - ; have the element-atomic property. Eventually, heapprof will have to be enhanced to lower + ; have the element-atomic property. Eventually, memprof will have to be enhanced to lower ; these properly. ; CHECK-LABEL: memintr_element_atomic_test ; CHECK: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %a, i8 0, i64 100, i32 1) @@ -175,5 +175,5 @@ define void @memintr_element_atomic_test(i8* %a, i8* %b) nounwind uwtable { } -; CHECK: define internal void @heapprof.module_ctor() -; CHECK: call void @__heapprof_init() +; CHECK: define internal void @memprof.module_ctor() +; CHECK: call void @__memprof_init() diff --git a/llvm/test/Instrumentation/HeapProfiler/instrumentation-use-callbacks.ll b/llvm/test/Instrumentation/HeapProfiler/instrumentation-use-callbacks.ll index 9df3df47d3d0a..e97274347588e 100644 --- a/llvm/test/Instrumentation/HeapProfiler/instrumentation-use-callbacks.ll +++ b/llvm/test/Instrumentation/HeapProfiler/instrumentation-use-callbacks.ll @@ -1,31 +1,31 @@ -; Test heapprof internal compiler flags: -; -heapprof-use-callbacks -; -heapprof-memory-access-callback-prefix +; Test memprof internal compiler flags: +; -memprof-use-callbacks +; -memprof-memory-access-callback-prefix -; RUN: opt < %s -heapprof -heapprof-module -heapprof-use-callbacks -S | FileCheck %s --check-prefix=CHECK-CALL --check-prefix=CHECK-CALL-DEFAULT -; RUN: opt < %s -heapprof -heapprof-module -heapprof-use-callbacks -heapprof-memory-access-callback-prefix=__foo_ -S | FileCheck %s --check-prefix=CHECK-CALL --check-prefix=CHECK-CALL-CUSTOM -; RUN: opt < %s -heapprof -heapprof-module -heapprof-use-callbacks=false -S | FileCheck %s --check-prefix=CHECK-INLINE -; RUN: opt < %s -heapprof -heapprof-module -S | FileCheck %s --check-prefix=CHECK-INLINE +; RUN: opt < %s -memprof -memprof-module -memprof-use-callbacks -S | FileCheck %s --check-prefix=CHECK-CALL --check-prefix=CHECK-CALL-DEFAULT +; RUN: opt < %s -memprof -memprof-module -memprof-use-callbacks -memprof-memory-access-callback-prefix=__foo_ -S | FileCheck %s --check-prefix=CHECK-CALL --check-prefix=CHECK-CALL-CUSTOM +; RUN: opt < %s -memprof -memprof-module -memprof-use-callbacks=false -S | FileCheck %s --check-prefix=CHECK-INLINE +; RUN: opt < %s -memprof -memprof-module -S | FileCheck %s --check-prefix=CHECK-INLINE target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-unknown-linux-gnu" define 
void @test_load(i32* %a, i64* %b, i512* %c, i80* %d) { entry: ; CHECK-CALL: %[[LOAD_ADDR1:[^ ]*]] = ptrtoint i32* %a to i64 -; CHECK-CALL-DEFAULT: call void @__heapprof_load(i64 %[[LOAD_ADDR1]]) +; CHECK-CALL-DEFAULT: call void @__memprof_load(i64 %[[LOAD_ADDR1]]) ; CHECK-CALL-CUSTOM: call void @__foo_load(i64 %[[LOAD_ADDR1]]) ; CHECK-CALL: %[[LOAD_ADDR2:[^ ]*]] = ptrtoint i64* %b to i64 -; CHECK-CALL-DEFAULT: call void @__heapprof_load(i64 %[[LOAD_ADDR2]]) +; CHECK-CALL-DEFAULT: call void @__memprof_load(i64 %[[LOAD_ADDR2]]) ; CHECK-CALL-CUSTOM: call void @__foo_load(i64 %[[LOAD_ADDR2]]) ; CHECK-CALL: %[[LOAD_ADDR3:[^ ]*]] = ptrtoint i512* %c to i64 -; CHECK-CALL-DEFAULT: call void @__heapprof_load(i64 %[[LOAD_ADDR3]]) +; CHECK-CALL-DEFAULT: call void @__memprof_load(i64 %[[LOAD_ADDR3]]) ; CHECK-CALL-CUSTOM: call void @__foo_load(i64 %[[LOAD_ADDR3]]) ; CHECK-CALL: %[[LOAD_ADDR4:[^ ]*]] = ptrtoint i80* %d to i64 -; CHECK-CALL-DEFAULT: call void @__heapprof_load(i64 %[[LOAD_ADDR4]]) +; CHECK-CALL-DEFAULT: call void @__memprof_load(i64 %[[LOAD_ADDR4]]) ; CHECK-CALL-CUSTOM: call void @__foo_load(i64 %[[LOAD_ADDR4]]) -; CHECK-CALL-DEFAULT-NOT: call void @__heapprof_load +; CHECK-CALL-DEFAULT-NOT: call void @__memprof_load ; CHECK-CALL-CUSTOM-NOT: call void @__foo_load -; CHECK-INLINE-NOT: call void @__heapprof_load +; CHECK-INLINE-NOT: call void @__memprof_load %tmp1 = load i32, i32* %a, align 4 %tmp2 = load i64, i64* %b, align 8 %tmp3 = load i512, i512* %c, align 32 diff --git a/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll b/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll index fa493a454ef10..dfae33d717b89 100644 --- a/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll +++ b/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll @@ -1,12 +1,12 @@ -; RUN: opt < %s -heapprof -heapprof-use-callbacks -S \ +; RUN: opt < %s -memprof -memprof-use-callbacks -S \ ; RUN: | FileCheck %s -check-prefix=LOAD -check-prefix=STORE -check-prefix=ALL -; RUN: opt < %s -heapprof -heapprof-use-callbacks -heapprof-instrument-reads=0 -S \ +; RUN: opt < %s -memprof -memprof-use-callbacks -memprof-instrument-reads=0 -S \ ; RUN: | FileCheck %s -check-prefix=NOLOAD -check-prefix=STORE -check-prefix=ALL -; RUN: opt < %s -heapprof -heapprof-use-callbacks -heapprof-instrument-writes=0 -S \ +; RUN: opt < %s -memprof -memprof-use-callbacks -memprof-instrument-writes=0 -S \ ; RUN: | FileCheck %s -check-prefix=LOAD -check-prefix=NOSTORE -check-prefix=ALL -; RUN: opt < %s -heapprof -heapprof-use-callbacks -heapprof-instrument-reads=0 -heapprof-instrument-writes=0 -S \ +; RUN: opt < %s -memprof -memprof-use-callbacks -memprof-instrument-reads=0 -memprof-instrument-writes=0 -S \ ; RUN: | FileCheck %s -check-prefix=NOLOAD -check-prefix=NOSTORE -check-prefix=ALL -; Support heap profiling instrumentation for constant-mask llvm.masked.{load,store} +; Support memory profiling instrumentation for constant-mask llvm.masked.{load,store} target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -22,16 +22,16 @@ declare void @llvm.masked.store.v4p0i32.p0v4p0i32(<4 x i32*>, <4 x i32*>*, i32, define void @store.v4f32.1110(<4 x float> %arg) { ; ALL-LABEL: @store.v4f32.1110 %p = load <4 x float>*, <4 x float>** @v4f32, align 8 -; NOSTORE-NOT: call void @__heapprof_store +; NOSTORE-NOT: call void @__memprof_store ; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0 ; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64 -; STORE: call void 
@__heapprof_store(i64 [[PGEP0]]) +; STORE: call void @__memprof_store(i64 [[PGEP0]]) ; STORE: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 1 ; STORE: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64 -; STORE: call void @__heapprof_store(i64 [[PGEP1]]) +; STORE: call void @__memprof_store(i64 [[PGEP1]]) ; STORE: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2 ; STORE: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64 -; STORE: call void @__heapprof_store(i64 [[PGEP2]]) +; STORE: call void @__memprof_store(i64 [[PGEP2]]) ; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> ) tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> ) ret void @@ -40,19 +40,19 @@ define void @store.v4f32.1110(<4 x float> %arg) { define void @store.v8i32.10010110(<8 x i32> %arg) { ; ALL-LABEL: @store.v8i32.10010110 %p = load <8 x i32>*, <8 x i32>** @v8i32, align 8 -; NOSTORE-NOT: call void @__heapprof_store +; NOSTORE-NOT: call void @__memprof_store ; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 0 ; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP0]] to i64 -; STORE: call void @__heapprof_store(i64 [[PGEP0]]) +; STORE: call void @__memprof_store(i64 [[PGEP0]]) ; STORE: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 3 ; STORE: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP3]] to i64 -; STORE: call void @__heapprof_store(i64 [[PGEP3]]) +; STORE: call void @__memprof_store(i64 [[PGEP3]]) ; STORE: [[GEP5:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 5 ; STORE: [[PGEP5:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP5]] to i64 -; STORE: call void @__heapprof_store(i64 [[PGEP5]]) +; STORE: call void @__memprof_store(i64 [[PGEP5]]) ; STORE: [[GEP6:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 6 ; STORE: [[PGEP6:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP6]] to i64 -; STORE: call void @__heapprof_store(i64 [[PGEP6]]) +; STORE: call void @__memprof_store(i64 [[PGEP6]]) ; STORE: tail call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %arg, <8 x i32>* %p, i32 8, <8 x i1> ) tail call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %arg, <8 x i32>* %p, i32 8, <8 x i1> ) ret void @@ -61,10 +61,10 @@ define void @store.v8i32.10010110(<8 x i32> %arg) { define void @store.v4i64.0001(<4 x i32*> %arg) { ; ALL-LABEL: @store.v4i64.0001 %p = load <4 x i32*>*, <4 x i32*>** @v4i64, align 8 -; NOSTORE-NOT: call void @__heapprof_store +; NOSTORE-NOT: call void @__memprof_store ; STORE: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x i32*>, <4 x i32*>* %p, i64 0, i64 3 ; STORE: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32** [[GEP3]] to i64 -; STORE: call void @__heapprof_store(i64 [[PGEP3]]) +; STORE: call void @__memprof_store(i64 [[PGEP3]]) ; STORE: tail call void @llvm.masked.store.v4p0i32.p0v4p0i32(<4 x i32*> %arg, <4 x i32*>* %p, i32 8, <4 x i1> ) tail call void @llvm.masked.store.v4p0i32.p0v4p0i32(<4 x i32*> %arg, <4 x i32*>* %p, i32 8, <4 x i1> ) ret void @@ -78,7 +78,7 @@ define void @store.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) { ; STORE: [[THEN0]]: ; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0 ; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64 -; STORE: call void @__heapprof_store(i64 [[PGEP0]]) +; STORE: call void @__memprof_store(i64 [[PGEP0]]) ; STORE: br label %[[AFTER0]] ; STORE: 
[[AFTER0]]: @@ -87,7 +87,7 @@ define void @store.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) { ; STORE: [[THEN1]]: ; STORE: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 1 ; STORE: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64 -; STORE: call void @__heapprof_store(i64 [[PGEP1]]) +; STORE: call void @__memprof_store(i64 [[PGEP1]]) ; STORE: br label %[[AFTER1]] ; STORE: [[AFTER1]]: @@ -96,7 +96,7 @@ define void @store.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) { ; STORE: [[THEN2]]: ; STORE: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2 ; STORE: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64 -; STORE: call void @__heapprof_store(i64 [[PGEP2]]) +; STORE: call void @__memprof_store(i64 [[PGEP2]]) ; STORE: br label %[[AFTER2]] ; STORE: [[AFTER2]]: @@ -105,7 +105,7 @@ define void @store.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) { ; STORE: [[THEN3]]: ; STORE: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3 ; STORE: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64 -; STORE: call void @__heapprof_store(i64 [[PGEP3]]) +; STORE: call void @__memprof_store(i64 [[PGEP3]]) ; STORE: br label %[[AFTER3]] ; STORE: [[AFTER3]]: @@ -120,12 +120,12 @@ define void @store.v4f32.1010.split(<4 x float> %arg) { %p = load <4 x float>*, <4 x float>** @v4f32, align 8 ; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0 ; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64 -; STORE: call void @__heapprof_store(i64 [[PGEP0]]) +; STORE: call void @__memprof_store(i64 [[PGEP0]]) ; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> ) tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> ) ; STORE: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2 ; STORE: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64 -; STORE: call void @__heapprof_store(i64 [[PGEP1]]) +; STORE: call void @__memprof_store(i64 [[PGEP1]]) ; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> ) tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> ) ret void @@ -139,19 +139,19 @@ declare <4 x i32*> @llvm.masked.load.v4p0i32.p0v4p0i32(<4 x i32*>*, i32, <4 x i1 define <8 x i32> @load.v8i32.11100001(<8 x i32> %arg) { ; ALL-LABEL: @load.v8i32.11100001 %p = load <8 x i32>*, <8 x i32>** @v8i32, align 8 -; NOLOAD-NOT: call void @__heapprof_load +; NOLOAD-NOT: call void @__memprof_load ; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 0 ; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP0]] to i64 -; LOAD: call void @__heapprof_load(i64 [[PGEP0]]) +; LOAD: call void @__memprof_load(i64 [[PGEP0]]) ; LOAD: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 1 ; LOAD: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP1]] to i64 -; LOAD: call void @__heapprof_load(i64 [[PGEP1]]) +; LOAD: call void @__memprof_load(i64 [[PGEP1]]) ; LOAD: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 2 ; LOAD: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP2]] to i64 -; LOAD: call void @__heapprof_load(i64 [[PGEP2]]) +; LOAD: call void @__memprof_load(i64 [[PGEP2]]) ; LOAD: [[GEP7:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 7 ; LOAD: 
[[PGEP7:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP7]] to i64 -; LOAD: call void @__heapprof_load(i64 [[PGEP7]]) +; LOAD: call void @__memprof_load(i64 [[PGEP7]]) ; LOAD: tail call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %p, i32 8, <8 x i1> , <8 x i32> %arg) %res = tail call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %p, i32 8, <8 x i1> , <8 x i32> %arg) ret <8 x i32> %res @@ -160,13 +160,13 @@ define <8 x i32> @load.v8i32.11100001(<8 x i32> %arg) { define <4 x float> @load.v4f32.1001(<4 x float> %arg) { ; ALL-LABEL: @load.v4f32.1001 %p = load <4 x float>*, <4 x float>** @v4f32, align 8 -; NOLOAD-NOT: call void @__heapprof_load +; NOLOAD-NOT: call void @__memprof_load ; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0 ; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64 -; LOAD: call void @__heapprof_load(i64 [[PGEP0]]) +; LOAD: call void @__memprof_load(i64 [[PGEP0]]) ; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3 ; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64 -; LOAD: call void @__heapprof_load(i64 [[PGEP3]]) +; LOAD: call void @__memprof_load(i64 [[PGEP3]]) ; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> , <4 x float> %arg) %res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> , <4 x float> %arg) ret <4 x float> %res @@ -175,10 +175,10 @@ define <4 x float> @load.v4f32.1001(<4 x float> %arg) { define <4 x i32*> @load.v4i64.0001(<4 x i32*> %arg) { ; ALL-LABEL: @load.v4i64.0001 %p = load <4 x i32*>*, <4 x i32*>** @v4i64, align 8 -; NOLOAD-NOT: call void @__heapprof_load +; NOLOAD-NOT: call void @__memprof_load ; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x i32*>, <4 x i32*>* %p, i64 0, i64 3 ; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32** [[GEP3]] to i64 -; LOAD: call void @__heapprof_load(i64 [[PGEP3]]) +; LOAD: call void @__memprof_load(i64 [[PGEP3]]) ; LOAD: tail call <4 x i32*> @llvm.masked.load.v4p0i32.p0v4p0i32(<4 x i32*>* %p, i32 8, <4 x i1> , <4 x i32*> %arg) %res = tail call <4 x i32*> @llvm.masked.load.v4p0i32.p0v4p0i32(<4 x i32*>* %p, i32 8, <4 x i1> , <4 x i32*> %arg) ret <4 x i32*> %res @@ -192,7 +192,7 @@ define <4 x float> @load.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) { ; LOAD: [[THEN0]]: ; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0 ; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64 -; LOAD: call void @__heapprof_load(i64 [[PGEP0]]) +; LOAD: call void @__memprof_load(i64 [[PGEP0]]) ; LOAD: br label %[[AFTER0]] ; LOAD: [[AFTER0]]: @@ -201,7 +201,7 @@ define <4 x float> @load.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) { ; LOAD: [[THEN1]]: ; LOAD: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 1 ; LOAD: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64 -; LOAD: call void @__heapprof_load(i64 [[PGEP1]]) +; LOAD: call void @__memprof_load(i64 [[PGEP1]]) ; LOAD: br label %[[AFTER1]] ; LOAD: [[AFTER1]]: @@ -210,7 +210,7 @@ define <4 x float> @load.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) { ; LOAD: [[THEN2]]: ; LOAD: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2 ; LOAD: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64 -; LOAD: call void @__heapprof_load(i64 [[PGEP2]]) +; LOAD: call void @__memprof_load(i64 [[PGEP2]]) ; LOAD: br label %[[AFTER2]] ; LOAD: [[AFTER2]]: @@ -219,7 +219,7 @@ define <4 x 
float> @load.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) { ; LOAD: [[THEN3]]: ; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3 ; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64 -; LOAD: call void @__heapprof_load(i64 [[PGEP3]]) +; LOAD: call void @__memprof_load(i64 [[PGEP3]]) ; LOAD: br label %[[AFTER3]] ; LOAD: [[AFTER3]]: @@ -234,12 +234,12 @@ define <4 x float> @load.v4f32.1001.split(<4 x float> %arg) { %p = load <4 x float>*, <4 x float>** @v4f32, align 8 ; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0 ; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64 -; LOAD: call void @__heapprof_load(i64 [[PGEP0]]) +; LOAD: call void @__memprof_load(i64 [[PGEP0]]) ; LOAD: %res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> , <4 x float> %arg) %res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> , <4 x float> %arg) ; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3 ; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64 -; LOAD: call void @__heapprof_load(i64 [[PGEP3]]) +; LOAD: call void @__memprof_load(i64 [[PGEP3]]) ; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> , <4 x float> %res) %res2 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> , <4 x float> %res) ret <4 x float> %res2 diff --git a/llvm/test/Instrumentation/HeapProfiler/scale-granularity.ll b/llvm/test/Instrumentation/HeapProfiler/scale-granularity.ll index c8c3a6d605db3..ff68584ed7f02 100644 --- a/llvm/test/Instrumentation/HeapProfiler/scale-granularity.ll +++ b/llvm/test/Instrumentation/HeapProfiler/scale-granularity.ll @@ -1,8 +1,8 @@ -; Test that the scale (-heapprof-mapping-scale) and granularity (-heapprof-mapping-granularity) command-line options work as expected +; Test that the scale (-memprof-mapping-scale) and granularity (-memprof-mapping-granularity) command-line options work as expected ; -; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-granularity 32 -S | FileCheck --check-prefix=CHECK-GRAN %s -; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-scale 1 -S | FileCheck --check-prefix=CHECK-SCALE %s -; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-granularity 16 -heapprof-mapping-scale 0 -S | FileCheck --check-prefix=CHECK-BOTH %s +; RUN: opt < %s -memprof -memprof-module -memprof-mapping-granularity 32 -S | FileCheck --check-prefix=CHECK-GRAN %s +; RUN: opt < %s -memprof -memprof-module -memprof-mapping-scale 1 -S | FileCheck --check-prefix=CHECK-SCALE %s +; RUN: opt < %s -memprof -memprof-module -memprof-mapping-granularity 16 -memprof-mapping-scale 0 -S | FileCheck --check-prefix=CHECK-BOTH %s target triple = "x86_64-unknown-linux-gnu" define i32 @read(i32* %a) { diff --git a/llvm/test/Instrumentation/HeapProfiler/version-mismatch-check.ll b/llvm/test/Instrumentation/HeapProfiler/version-mismatch-check.ll index 84e039551d702..d53e23cff471b 100644 --- a/llvm/test/Instrumentation/HeapProfiler/version-mismatch-check.ll +++ b/llvm/test/Instrumentation/HeapProfiler/version-mismatch-check.ll @@ -1,12 +1,12 @@ -; Check that the HeapProf module constructor guards against compiler/runtime version +; Check that the MemProf module constructor guards against compiler/runtime version ; mismatch. 
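; (For reference, the guarded constructor emitted by the module pass looks
; roughly like the sketch below; the exact body is an assumption, with the
; names taken from the CHECK lines in these tests:
;
;   define internal void @memprof.module_ctor() {
;     call void @__memprof_init()
;     call void @__memprof_version_mismatch_check_v1()
;     ret void
;   }
;
; Encoding the version in the callee's name turns a compiler/runtime mismatch
; into a link-time undefined-symbol error rather than a silent runtime skew.)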
-; RUN: opt < %s -heapprof-module -S | FileCheck %s
-; RUN: opt < %s -heapprof-module -heapprof-guard-against-version-mismatch=0 -S | FileCheck %s --check-prefix=NOGUARD
+; RUN: opt < %s -memprof-module -S | FileCheck %s
+; RUN: opt < %s -memprof-module -memprof-guard-against-version-mismatch=0 -S | FileCheck %s --check-prefix=NOGUARD

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"

-; CHECK-LABEL: define internal void @heapprof.module_ctor()
-; CHECK: call void @__heapprof_version_mismatch_check_v1
-; NOGUARD-NOT: call void @__heapprof_version_mismatch_check_
+; CHECK-LABEL: define internal void @memprof.module_ctor()
+; CHECK: call void @__memprof_version_mismatch_check_v1
+; NOGUARD-NOT: call void @__memprof_version_mismatch_check_
diff --git a/llvm/test/Instrumentation/ThreadSanitizer/tsan_musttail.ll b/llvm/test/Instrumentation/ThreadSanitizer/tsan_musttail.ll
new file mode 100644
index 0000000000000..bb681f67e0ecd
--- /dev/null
+++ b/llvm/test/Instrumentation/ThreadSanitizer/tsan_musttail.ll
@@ -0,0 +1,30 @@
+; Test that __tsan_func_exit always happens before a musttail call and that no exception-handling code is inserted.
+; RUN: opt < %s -tsan -S | FileCheck %s
+
+define internal i32 @preallocated_musttail(i32* preallocated(i32) %p) sanitize_thread {
+  %rv = load i32, i32* %p
+  ret i32 %rv
+}
+
+define i32 @call_preallocated_musttail(i32* preallocated(i32) %a) sanitize_thread {
+  %r = musttail call i32 @preallocated_musttail(i32* preallocated(i32) %a)
+  ret i32 %r
+}
+
+; CHECK-LABEL: define i32 @call_preallocated_musttail(i32* preallocated(i32) %a)
+; CHECK: call void @__tsan_func_exit()
+; CHECK-NEXT: %r = musttail call i32 @preallocated_musttail(i32* preallocated(i32) %a)
+; CHECK-NEXT: ret i32 %r
+
+
+define i32 @call_preallocated_musttail_cast(i32* preallocated(i32) %a) sanitize_thread {
+  %r = musttail call i32 @preallocated_musttail(i32* preallocated(i32) %a)
+  %t = bitcast i32 %r to i32
+  ret i32 %t
+}
+
+; CHECK-LABEL: define i32 @call_preallocated_musttail_cast(i32* preallocated(i32) %a)
+; CHECK: call void @__tsan_func_exit()
+; CHECK-NEXT: %r = musttail call i32 @preallocated_musttail(i32* preallocated(i32) %a)
+; CHECK-NEXT: %t = bitcast i32 %r to i32
+; CHECK-NEXT: ret i32 %t
diff --git a/llvm/test/LTO/X86/Inputs/start-lib1.ll b/llvm/test/LTO/X86/Inputs/start-lib1.ll
index 9f42e6afff0f3..18b6ea25386f5 100644
--- a/llvm/test/LTO/X86/Inputs/start-lib1.ll
+++ b/llvm/test/LTO/X86/Inputs/start-lib1.ll
@@ -4,5 +4,6 @@ target triple = "x86_64-unknown-linux-gnu"
declare void @bar()

define void @foo() {
+  call void @bar()
  ret void
}
diff --git a/llvm/test/LTO/X86/embed-bitcode.ll b/llvm/test/LTO/X86/embed-bitcode.ll
index 151f27f55eefb..bdddd079d2265 100644
--- a/llvm/test/LTO/X86/embed-bitcode.ll
+++ b/llvm/test/LTO/X86/embed-bitcode.ll
@@ -5,19 +5,26 @@
; RUN: llvm-lto2 run -r %t1.o,_start,px -r %t2.o,foo,px -r %t3.o,bar,px -r %t2.o,bar,lx -o %t3 %t1.o %t2.o %t3.o
; RUN: llvm-readelf -S %t3.0 | FileCheck %s --implicit-check-not=.llvmbc

-; RUN: llvm-lto2 run -r %t1.o,_start,px -r %t2.o,foo,px -r %t3.o,bar,px -r %t2.o,bar,lx -lto-embed-bitcode=false -o %t3 %t1.o %t2.o %t3.o
+; RUN: llvm-lto2 run -r %t1.o,_start,px -r %t2.o,foo,px -r %t3.o,bar,px -r %t2.o,bar,lx -lto-embed-bitcode=none -o %t3 %t1.o %t2.o %t3.o
; RUN: llvm-readelf -S %t3.0 | FileCheck %s --implicit-check-not=.llvmbc

-; RUN: llvm-lto2 run -r %t1.o,_start,px -r %t2.o,foo,px
-r %t3.o,bar,px -r %t2.o,bar,lx -lto-embed-bitcode -o %t3 %t1.o %t2.o %t3.o +; RUN: llvm-lto2 run -r %t1.o,_start,px -r %t2.o,foo,px -r %t3.o,bar,px -r %t2.o,bar,lx -lto-embed-bitcode=optimized -o %t3 %t1.o %t2.o %t3.o ; RUN: llvm-readelf -S %t3.0 | FileCheck %s --check-prefix=CHECK-ELF ; RUN: llvm-objcopy --dump-section=.llvmbc=%t-embedded.bc %t3.0 /dev/null -; RUN: llvm-dis %t-embedded.bc -o - | FileCheck %s --check-prefix=CHECK-LL +; RUN: llvm-dis %t-embedded.bc -o - | FileCheck %s --check-prefixes=CHECK-LL,CHECK-OPT + +; RUN: llvm-lto2 run -r %t1.o,_start,px -r %t2.o,foo,px -r %t3.o,bar,px -r %t2.o,bar,lx -lto-embed-bitcode=post-merge-pre-opt -o %t3 %t1.o %t2.o %t3.o +; RUN: llvm-readelf -S %t3.0 | FileCheck %s --check-prefix=CHECK-ELF +; RUN: llvm-objcopy --dump-section=.llvmbc=%t-embedded.bc %t3.0 /dev/null +; RUN: llvm-dis %t-embedded.bc -o - | FileCheck %s --check-prefixes=CHECK-LL,CHECK-NOOPT ; CHECK-ELF: .text PROGBITS 0000000000000000 [[#%x,OFF:]] [[#%x,SIZE:]] 00 AX 0 ; CHECK-ELF-NEXT: .llvmbc PROGBITS 0000000000000000 [[#%x,OFF:]] [[#%x,SIZE:]] 00 0 ; CHECK-LL: @_start ; CHECK-LL: @foo +; CHECK-OPT-NEXT: ret void +; CHECK-NOOPT-NEXT: call void @bar ; CHECK-LL: @bar target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/MC/AArch64/seh-optimize.s b/llvm/test/MC/AArch64/seh-optimize.s new file mode 100644 index 0000000000000..0bf33af9cc75f --- /dev/null +++ b/llvm/test/MC/AArch64/seh-optimize.s @@ -0,0 +1,106 @@ +// This test checks that the unwinding opcodes are remapped to more +// efficient ones where possible. + +// RUN: llvm-mc -triple aarch64-pc-win32 -filetype=obj %s -o %t.o +// RUN: llvm-readobj -u %t.o | FileCheck %s + +// CHECK: UnwindInformation [ +// CHECK-NEXT: RuntimeFunction { +// CHECK-NEXT: Function: func +// CHECK-NEXT: ExceptionRecord: .xdata +// CHECK-NEXT: ExceptionData { +// CHECK: Prologue [ +// CHECK-NEXT: 0xd882 ; stp d10, d11, [sp, #16] +// CHECK-NEXT: 0xda07 ; stp d8, d9, [sp, #-64]! +// CHECK-NEXT: 0xe6 ; save next +// CHECK-NEXT: 0x28 ; stp x19, x20, [sp, #-64]! +// CHECK-NEXT: 0xca49 ; stp x28, x29, [sp, #72] +// CHECK-NEXT: 0xe6 ; save next +// CHECK-NEXT: 0xe6 ; save next +// CHECK-NEXT: 0xe6 ; save next +// CHECK-NEXT: 0xcc47 ; stp x20, x21, [sp, #-64]! +// CHECK-NEXT: 0x42 ; stp x29, x30, [sp, #16] +// CHECK-NEXT: 0xca02 ; stp x27, x28, [sp, #16] +// CHECK-NEXT: 0x83 ; stp x29, x30, [sp, #-32]! +// CHECK-NEXT: 0xce03 ; stp x27, x28, [sp, #-32]! +// CHECK-NEXT: 0xe1 ; mov fp, sp +// CHECK-NEXT: 0xe201 ; add fp, sp, #8 +// CHECK-NEXT: 0xe4 ; end +// CHECK-NEXT: ] +// CHECK-NEXT: EpilogueScopes [ +// CHECK-NEXT: EpilogueScope { +// CHECK: Opcodes [ +// CHECK-NEXT: 0xc904 ; ldp x23, x24, [sp, #32] +// CHECK-NEXT: 0xe6 ; restore next +// CHECK-NEXT: 0xcc83 ; ldp x21, x22, [sp], #32 +// CHECK-NEXT: 0x24 ; ldp x19, x20, [sp], #32 +// CHECK-NEXT: 0xcc1f ; ldp x19, x20, [sp], #256 +// CHECK-NEXT: 0xe4 ; end +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: ] + + + .text + .globl func + .seh_proc func +func: + add x29, sp, #8 + .seh_add_fp 8 + add x29, sp, #0 + .seh_add_fp 0 + + stp x27, x28, [sp, #-32]! + .seh_save_regp_x x27, 32 + stp x29, x30, [sp, #-32]! + .seh_save_regp_x x29, 32 + + stp x27, x28, [sp, #16] + .seh_save_regp x27, 16 + stp x29, x30, [sp, #16] + .seh_save_regp x29, 16 + + stp x20, x21, [sp, #-64]! 
+ .seh_save_regp_x x20, 64 + stp x22, x23, [sp, #16] + .seh_save_regp x22, 16 + stp x24, x25, [sp, #32] + .seh_save_next + stp x26, x27, [sp, #48] + .seh_save_regp x26, 48 + stp x28, x29, [sp, #72] + .seh_save_regp x28, 72 + + stp x19, x20, [sp, #-64]! + .seh_save_r19r20_x 64 + stp x21, x22, [sp, #16] + .seh_save_regp x21, 16 + + stp d8, d9, [sp, #-64]! + .seh_save_fregp_x d8, 64 + stp d10, d11, [sp, #16] + // This is intentionally not converted into a save_next, to avoid + // bugs in the windows unwinder. + .seh_save_fregp d10, 16 + + .seh_endprologue + + nop + + .seh_startepilogue + ldp x27, x28, [sp, #32] + .seh_save_regp x23, 32 + ldp x23, x24, [sp, #16] + .seh_save_regp x23, 16 + ldp x21, x22, [sp], #32 + .seh_save_regp_x x21, 32 + ldp x19, x20, [sp], #32 + .seh_save_regp_x x19, 32 + ldp x19, x20, [sp], #256 + .seh_save_regp_x x19, 256 + .seh_endepilogue + ret + .seh_endproc diff --git a/llvm/test/MC/AArch64/seh-packed-epilog.s b/llvm/test/MC/AArch64/seh-packed-epilog.s new file mode 100644 index 0000000000000..f9978ea7a1139 --- /dev/null +++ b/llvm/test/MC/AArch64/seh-packed-epilog.s @@ -0,0 +1,187 @@ +// This test checks that the epilogue is packed where possible. + +// RUN: llvm-mc -triple aarch64-pc-win32 -filetype=obj %s -o %t.o +// RUN: llvm-readobj -u %t.o | FileCheck %s + +// CHECK: UnwindInformation [ +// CHECK-NEXT: RuntimeFunction { +// CHECK-NEXT: Function: func +// CHECK-NEXT: ExceptionRecord: .xdata +// CHECK-NEXT: ExceptionData { +// CHECK-NEXT: FunctionLength: +// CHECK-NEXT: Version: +// CHECK-NEXT: ExceptionData: +// CHECK-NEXT: EpiloguePacked: Yes +// CHECK-NEXT: EpilogueOffset: 2 +// CHECK-NEXT: ByteCodeLength: +// CHECK-NEXT: Prologue [ +// CHECK-NEXT: 0xdc04 ; str d8, [sp, #32] +// CHECK-NEXT: 0xe1 ; mov fp, sp +// CHECK-NEXT: 0x42 ; stp x29, x30, [sp, #16] +// CHECK-NEXT: 0x85 ; stp x29, x30, [sp, #-48]! +// CHECK-NEXT: 0xe6 ; save next +// CHECK-NEXT: 0x24 ; stp x19, x20, [sp, #-32]! +// CHECK-NEXT: 0xc842 ; stp x20, x21, [sp, #16] +// CHECK-NEXT: 0x03 ; sub sp, #48 +// CHECK-NEXT: 0xe4 ; end +// CHECK-NEXT: ] +// CHECK-NEXT: Epilogue [ +// CHECK-NEXT: 0xe1 ; mov sp, fp +// CHECK-NEXT: 0x42 ; ldp x29, x30, [sp, #16] +// CHECK-NEXT: 0x85 ; ldp x29, x30, [sp], #48 +// CHECK-NEXT: 0xe6 ; restore next +// CHECK-NEXT: 0x24 ; ldp x19, x20, [sp], #32 +// CHECK-NEXT: 0xc842 ; ldp x20, x21, [sp, #16] +// CHECK-NEXT: 0x03 ; add sp, #48 +// CHECK-NEXT: 0xe4 ; end +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK: RuntimeFunction { +// CHECK-NEXT: Function: packed2 +// CHECK-NEXT: ExceptionRecord: +// CHECK-NEXT: ExceptionData { +// CHECK: ExceptionData: +// CHECK-NEXT: EpiloguePacked: Yes +// CHECK: RuntimeFunction { +// CHECK-NEXT: Function: nonpacked1 +// CHECK-NEXT: ExceptionRecord: +// CHECK-NEXT: ExceptionData { +// CHECK: ExceptionData: +// CHECK-NEXT: EpiloguePacked: No +// CHECK: RuntimeFunction { +// CHECK-NEXT: Function: nonpacked2 +// CHECK-NEXT: ExceptionRecord: +// CHECK-NEXT: ExceptionData { +// CHECK: ExceptionData: +// CHECK-NEXT: EpiloguePacked: No +// CHECK: RuntimeFunction { +// CHECK-NEXT: Function: nonpacked3 +// CHECK-NEXT: ExceptionRecord: +// CHECK-NEXT: ExceptionData { +// CHECK: ExceptionData: +// CHECK-NEXT: EpiloguePacked: No + + .text + .globl func + .seh_proc func +func: + sub sp, sp, #48 + .seh_stackalloc 48 + // Check that canonical opcode forms (r19r20_x, fplr, fplr_x, save_next, + // set_fp) are treated as a match even if one (in prologue or epilogue) + // was simplified from the more generic opcodes. 
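// (Background: prologue and epilogue share one unwind byte-code vocabulary,
// so when a function's single epilogue is described by a sequence that is
// byte-for-byte a trailing slice of the prologue's codes, the .xdata writer
// can set the packed-epilogue flag and record only a start offset into the
// prologue codes instead of a separate epilogue scope. The seh.s diff later
// in this patch shows the payoff: .xdata shrinks from 56 to 52 bytes. The
// canonical forms listed above must therefore compare equal across the two
// encodings.)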
+ stp x20, x21, [sp, #16] + .seh_save_regp x20, 16 + stp x19, x20, [sp, #-32]! + .seh_save_r19r20_x 32 + stp x21, x22, [sp, #16] + .seh_save_regp x21, 16 + stp x29, x30, [sp, #-48]! + .seh_save_regp_x x29, 48 + stp x29, x30, [sp, #16] + .seh_save_regp x29, 16 + add x29, sp, #0 + .seh_add_fp 0 + str d8, [sp, #32] + .seh_save_freg d8, 32 + .seh_endprologue + + nop + + .seh_startepilogue + mov sp, x29 + .seh_set_fp + ldp x29, x30, [sp, #16] + .seh_save_fplr 16 + ldp x29, x30, [sp, #-48]! + .seh_save_fplr_x 48 + ldp x21, x22, [sp, #16] + .seh_save_next + ldp x19, x20, [sp], #32 + .seh_save_regp_x x19, 32 + ldp x20, x21, [sp, #16] + .seh_save_regp x20, 16 + add sp, sp, #48 + .seh_stackalloc 48 + .seh_endepilogue + ret + .seh_endproc + + + // Test a perfectly matching epilog with no offset. + .seh_proc packed2 +packed2: + sub sp, sp, #48 + .seh_stackalloc 48 + stp x29, lr, [sp, #-32]! + .seh_save_fplr_x 32 + .seh_endprologue + nop + .seh_startepilogue + ldp x29, lr, [sp], #32 + .seh_save_fplr_x 32 + add sp, sp, #48 + .seh_stackalloc 48 + .seh_endepilogue + ret + .seh_endproc + + + .seh_proc nonpacked1 +nonpacked1: + sub sp, sp, #48 + .seh_stackalloc 48 + .seh_endprologue + + nop + .seh_startepilogue + add sp, sp, #48 + .seh_stackalloc 48 + .seh_endepilogue + // This epilogue isn't packed with the prologue, as it doesn't align with + // the end of the function (one extra nop before the ret). + nop + ret + .seh_endproc + + + .seh_proc nonpacked2 +nonpacked2: + sub sp, sp, #48 + .seh_stackalloc 48 + sub sp, sp, #32 + .seh_stackalloc 32 + .seh_endprologue + + nop + .seh_startepilogue + // Not packed; the epilogue mismatches at the second opcode. + add sp, sp, #16 + .seh_stackalloc 16 + add sp, sp, #48 + .seh_stackalloc 48 + .seh_endepilogue + ret + .seh_endproc + + .seh_proc nonpacked3 +nonpacked3: + sub sp, sp, #48 + .seh_stackalloc 48 + sub sp, sp, #32 + .seh_stackalloc 32 + .seh_endprologue + + nop + .seh_startepilogue + // Not packed; the epilogue is longer than the prologue. + mov sp, x29 + .seh_set_fp + add sp, sp, #32 + .seh_stackalloc 32 + add sp, sp, #48 + .seh_stackalloc 48 + .seh_endepilogue + ret + .seh_endproc diff --git a/llvm/test/MC/AArch64/seh.s b/llvm/test/MC/AArch64/seh.s index f7faa64b9309a..0da956cbf2f5d 100644 --- a/llvm/test/MC/AArch64/seh.s +++ b/llvm/test/MC/AArch64/seh.s @@ -20,7 +20,7 @@ // CHECK-NEXT: } // CHECK: Section { // CHECK: Name: .xdata -// CHECK: RawDataSize: 56 +// CHECK: RawDataSize: 52 // CHECK: RelocationCount: 1 // CHECK: Characteristics [ // CHECK-NEXT: ALIGN_4BYTES @@ -41,7 +41,7 @@ // CHECK-NEXT: Relocations [ // CHECK-NEXT: Section (4) .xdata { -// CHECK-NEXT: 0x2C IMAGE_REL_ARM64_ADDR32NB __C_specific_handler +// CHECK-NEXT: 0x28 IMAGE_REL_ARM64_ADDR32NB __C_specific_handler // CHECK-NEXT: } // CHECK-NEXT: Section (5) .pdata { // CHECK-NEXT: 0x0 IMAGE_REL_ARM64_ADDR32NB func @@ -64,8 +64,8 @@ // CHECK-NEXT: 0xe202 ; add fp, sp, #16 // CHECK-NEXT: 0xdd41 ; str d13, [sp, #8] // CHECK-NEXT: 0xde83 ; str d12, [sp, #-32]! -// CHECK-NEXT: 0xd882 ; stp d10, d11, [sp, #16] -// CHECK-NEXT: 0xda03 ; stp d8, d9, [sp, #-32]! +// CHECK-NEXT: 0xd884 ; stp d10, d11, [sp, #32] +// CHECK-NEXT: 0xda05 ; stp d8, d9, [sp, #-48]! // CHECK-NEXT: 0x83 ; stp x29, x30, [sp, #-32]! // CHECK-NEXT: 0x46 ; stp x29, x30, [sp, #48] // CHECK-NEXT: 0xd141 ; str x24, [sp, #8] @@ -74,21 +74,15 @@ // CHECK-NEXT: 0xc882 ; stp x21, x22, [sp, #16] // CHECK-NEXT: 0xd6c2 ; stp x25, lr, [sp, #16] // CHECK-NEXT: 0x24 ; stp x19, x20, [sp, #-32]! -// CHECK-NEXT: 0xcc03 ; stp x19, x20, [sp, #-32]! 
+// CHECK-NEXT: 0xcc83 ; stp x21, x22, [sp, #-32]! // CHECK-NEXT: 0x83 ; stp x29, x30, [sp, #-32]! // CHECK-NEXT: 0xe1 ; mov fp, sp // CHECK-NEXT: 0x01 ; sub sp, #16 // CHECK-NEXT: 0xe4 ; end // CHECK-NEXT: ] -// CHECK-NEXT: EpilogueScopes [ -// CHECK-NEXT: EpilogueScope { -// CHECK-NEXT: StartOffset: 23 -// CHECK-NEXT: EpilogueStartIndex: 33 -// CHECK-NEXT: Opcodes [ -// CHECK-NEXT: 0x01 ; add sp, #16 -// CHECK-NEXT: 0xe4 ; end -// CHECK-NEXT: ] -// CHECK-NEXT: } +// CHECK-NEXT: Epilogue [ +// CHECK-NEXT: 0x01 ; add sp, #16 +// CHECK-NEXT: 0xe4 ; end // CHECK-NEXT: ] // CHECK-NEXT: ExceptionHandler [ // CHECK-NEXT: Routine: __C_specific_handler (0x0) @@ -113,8 +107,8 @@ func: .seh_set_fp stp x29, x30, [sp, #-32]! .seh_save_fplr_x 32 - stp x19, x20, [sp, #-32]! - .seh_save_regp_x x19, 32 + stp x21, x22, [sp, #-32]! + .seh_save_regp_x x21, 32 stp x19, x20, [sp, #-32]! .seh_save_r19r20_x 32 stp x25, x30, [sp, #16] @@ -131,10 +125,10 @@ func: .seh_save_fplr 48 stp x29, x30, [sp, #-32]! .seh_save_fplr_x 32 - stp d8, d9, [sp, #-32]! - .seh_save_fregp_x d8, 32 - stp d10, d11, [sp, #16] - .seh_save_fregp d10, 16 + stp d8, d9, [sp, #-48]! + .seh_save_fregp_x d8, 48 + stp d10, d11, [sp, #32] + .seh_save_fregp d10, 32 str d12, [sp, #-32]! .seh_save_freg_x d12, 32 str d13, [sp, #8] diff --git a/llvm/test/MC/AMDGPU/expressions.s b/llvm/test/MC/AMDGPU/expressions.s index 57f47d8f0345d..0b7bdcdebb88f 100644 --- a/llvm/test/MC/AMDGPU/expressions.s +++ b/llvm/test/MC/AMDGPU/expressions.s @@ -327,8 +327,8 @@ v_sin_f32 v0, -[ttmp0] s1000=1 v_sin_f32 v0, -s1000 -// NOVI: error: not a valid operand. +// NOVI: error: register index is out of range xnack_mask_lo=1 v_sin_f32 v0, xnack_mask_lo -// NOVI: error: not a valid operand. +// NOVI: error: register not available on this GPU diff --git a/llvm/test/MC/AMDGPU/flat-scratch.s b/llvm/test/MC/AMDGPU/flat-scratch.s index eea2f0d07f3ea..9ff9ee3af7e51 100644 --- a/llvm/test/MC/AMDGPU/flat-scratch.s +++ b/llvm/test/MC/AMDGPU/flat-scratch.s @@ -5,32 +5,32 @@ // RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck -check-prefix=VI %s s_mov_b64 flat_scratch, -1 -// NOSI: error: not a valid operand. +// NOSI: error: register not available on this GPU // CI: s_mov_b64 flat_scratch, -1 ; encoding: [0xc1,0x04,0xe8,0xbe] // VI: s_mov_b64 flat_scratch, -1 ; encoding: [0xc1,0x01,0xe6,0xbe] s_mov_b32 flat_scratch_lo, -1 -// NOSI: error: not a valid operand. +// NOSI: error: register not available on this GPU // CI: s_mov_b32 flat_scratch_lo, -1 ; encoding: [0xc1,0x03,0xe8,0xbe] // VI: s_mov_b32 flat_scratch_lo, -1 ; encoding: [0xc1,0x00,0xe6,0xbe] s_mov_b32 flat_scratch_hi, -1 -// NOSI: error: not a valid operand. +// NOSI: error: register not available on this GPU // CI: s_mov_b32 flat_scratch_hi, -1 ; encoding: [0xc1,0x03,0xe9,0xbe] // VI: s_mov_b32 flat_scratch_hi, -1 ; encoding: [0xc1,0x00,0xe7,0xbe] s_mov_b64 flat_scratch_lo, -1 -// NOSI: error: not a valid operand. +// NOSI: error: register not available on this GPU // NOCI: error: invalid operand for instruction // NOVI: error: invalid operand for instruction s_mov_b64 flat_scratch_hi, -1 -// NOSI: error: not a valid operand. +// NOSI: error: register not available on this GPU // NOCI: error: invalid operand for instruction // NOVI: error: invalid operand for instruction s_mov_b32 flat_scratch, -1 -// NOSI: error: not a valid operand. 
+// NOSI: error: register not available on this GPU // NOCI: error: invalid operand for instruction // NOVI: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx1011_err.s b/llvm/test/MC/AMDGPU/gfx1011_err.s index 81c8c6254c037..4b5bc2e5887af 100644 --- a/llvm/test/MC/AMDGPU/gfx1011_err.s +++ b/llvm/test/MC/AMDGPU/gfx1011_err.s @@ -23,16 +23,16 @@ v_fma_legacy_f32 v0, v1, v2, v3 // GFX10: error: instruction not supported on this GPU image_bvh_intersect_ray v[4:7], v[9:24], s[4:7] -// GFX10: error: invalid instruction +// GFX10: error: instruction not supported on this GPU image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16 -// GFX10: error: invalid instruction +// GFX10: error: invalid operand image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] -// GFX10: error: invalid instruction +// GFX10: error: instruction not supported on this GPU image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16 -// GFX10: error: invalid instruction +// GFX10: error: invalid operand image_msaa_load v[1:4], v5, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D // GFX10: error: not a valid operand. diff --git a/llvm/test/MC/AMDGPU/gfx1030_new.s b/llvm/test/MC/AMDGPU/gfx1030_new.s index 1420f9a7c61eb..3f80bdf745b33 100644 --- a/llvm/test/MC/AMDGPU/gfx1030_new.s +++ b/llvm/test/MC/AMDGPU/gfx1030_new.s @@ -61,6 +61,30 @@ v_fma_legacy_f32 v0, v1, |v2|, -v3 v_fma_legacy_f32 v0, s1, 2.0, -v3 // GFX10: encoding: [0x00,0x00,0x40,0xd5,0x01,0xe8,0x0d,0x84] +image_bvh_intersect_ray v[4:7], v[9:24], s[4:7] +// GFX10: encoding: [0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x00] + +image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16 +// GFX10: encoding: [0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x40] + +image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] +// GFX10: encoding: [0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x00] + +image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16 +// GFX10: encoding: [0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x40] + +image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40], s[12:15] +// GFX10: encoding: [0x07,0x9f,0x98,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x00,0x00] + +image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20], s[12:15] a16 +// GFX10: encoding: [0x05,0x9f,0x98,0xf1,0x32,0x27,0x03,0x40,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x00] + +image_bvh64_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40, v42], s[12:15] +// GFX10: encoding: [0x07,0x9f,0x9c,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x2a,0x00] + +image_bvh64_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19], s[12:15] a16 +// GFX10: encoding: [0x05,0x9f,0x9c,0xf1,0x32,0x27,0x03,0x40,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13] + image_msaa_load v[1:4], v5, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D // GFX10: encoding: [0x01,0x0f,0x00,0xf0,0x05,0x01,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/literals.s b/llvm/test/MC/AMDGPU/literals.s index b666b7d1cb780..ce6893ed057b9 100644 --- a/llvm/test/MC/AMDGPU/literals.s +++ b/llvm/test/MC/AMDGPU/literals.s @@ -640,11 +640,11 @@ v_ceil_f32_sdwa v5, |execz| dst_sel:DWORD src0_sel:DWORD // named inline values: shared_base, shared_limit, private_base, etc //---------------------------------------------------------------------------// -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: buffer_atomic_add v0, off, s[0:3], src_shared_base offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xeb] 
buffer_atomic_add v0, off, s[0:3], src_shared_base offset:4095 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: s_add_i32 s0, src_shared_base, s0 ; encoding: [0xeb,0x00,0x00,0x81] s_add_i32 s0, src_shared_base, s0 @@ -654,119 +654,127 @@ s_add_i32 s0, src_shared_base, s0 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: s_add_i32 s0, src_shared_limit, s0 ; encoding: [0xec,0x00,0x00,0x81] s_add_i32 s0, src_shared_limit, s0 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: s_add_i32 s0, src_private_base, s0 ; encoding: [0xed,0x00,0x00,0x81] s_add_i32 s0, src_private_base, s0 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: s_add_i32 s0, src_private_limit, s0 ; encoding: [0xee,0x00,0x00,0x81] s_add_i32 s0, src_private_limit, s0 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: s_add_i32 s0, src_pops_exiting_wave_id, s0 ; encoding: [0xef,0x00,0x00,0x81] s_add_i32 s0, src_pops_exiting_wave_id, s0 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x86] s_and_b64 s[0:1], s[0:1], src_shared_base -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x86] s_and_b64 s[0:1], s[0:1], src_shared_limit -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x86] s_and_b64 s[0:1], s[0:1], src_private_base -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x86] s_and_b64 s[0:1], s[0:1], src_private_limit -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: s_and_b64 s[0:1], s[0:1], src_pops_exiting_wave_id ; encoding: [0x00,0xef,0x80,0x86] s_and_b64 s[0:1], s[0:1], src_pops_exiting_wave_id -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: v_add_u16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4c] v_add_u16 v0, src_shared_base, v0 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: v_add_u16_sdwa v0, src_shared_base, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x4c,0xeb,0x06,0x86,0x06] v_add_u16_sdwa v0, src_shared_base, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: v_add_u16_sdwa v0, v0, src_shared_base dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0xd6,0x01,0x4c,0x00,0x06,0x06,0x86] v_add_u16_sdwa v0, v0, src_shared_base dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: v_add_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x68] v_add_u32 v0, src_shared_base, v0 -// NOSICIVI: error: not a valid operand +// NOSICIVI: 
error: register not available on this GPU // GFX9: v_add_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x34,0xd1,0xeb,0x00,0x02,0x00] v_add_u32_e64 v0, src_shared_base, v0 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: v_cmp_eq_i64_e32 vcc, src_shared_base, v[0:1] ; encoding: [0xeb,0x00,0xc4,0x7d] v_cmp_eq_i64 vcc, src_shared_base, v[0:1] -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: v_max_f16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x5a] v_max_f16 v0, src_shared_base, v0 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: v_max_f32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x16] v_max_f32 v0, src_shared_base, v0 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: v_max_f64 v[0:1], src_shared_base, v[0:1] ; encoding: [0x00,0x00,0x83,0xd2,0xeb,0x00,0x02,0x00] v_max_f64 v[0:1], src_shared_base, v[0:1] -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: v_pk_add_f16 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x8f,0xd3,0xeb,0x00,0x02,0x18] v_pk_add_f16 v0, src_shared_base, v0 -// NOSICIVI: error: not a valid operand // GFX9: v_ceil_f16_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0x85,0xd1,0xeb,0x00,0x00,0x20] +// NOSICI: error: not a valid operand. +// NOVI: error: register not available on this GPU v_ceil_f16 v0, neg(src_shared_base) -// NOSICIVI: error: not a valid operand // GFX9: v_ceil_f16_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0x85,0xd1,0xeb,0x00,0x00,0x00] +// NOSICI: error: not a valid operand. +// NOVI: error: register not available on this GPU v_ceil_f16 v0, abs(src_shared_base) -// NOSICIVI: error: not a valid operand // GFX9: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x58,0xd1,0xeb,0x00,0x00,0x00] +// NOSI: error: not a valid operand. +// NOCIVI: error: register not available on this GPU +// NOVI: error: register not available on this GPU v_ceil_f64 v[5:6], |src_shared_base| -// NOSICIVI: error: not a valid operand // GFX9: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x58,0xd1,0xeb,0x00,0x00,0x20] +// NOSI: error: not a valid operand. +// NOCIVI: error: register not available on this GPU +// NOVI: error: register not available on this GPU v_ceil_f64 v[5:6], -src_shared_base -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: v_ceil_f32_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0x5d,0xd1,0xeb,0x00,0x00,0x20] v_ceil_f32 v0, -src_shared_base -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: v_ceil_f32_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0x5d,0xd1,0xeb,0x00,0x00,0x00] v_ceil_f32 v0, |src_shared_base| -// NOSICIVI: error: not a valid operand // GFX9: v_ceil_f16_sdwa v5, |src_shared_base| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xeb,0x16,0xa6,0x00] +// NOSICI: error: not a valid operand. 
+// NOVI: error: register not available on this GPU v_ceil_f16_sdwa v5, |src_shared_base| dst_sel:DWORD dst_unused:UNUSED_PRESERVE -// NOSICIVI: error: not a valid operand // GFX9: v_ceil_f16_sdwa v5, -src_shared_base dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xeb,0x16,0x96,0x00] +// NOSICI: error: not a valid operand. +// NOVI: error: register not available on this GPU v_ceil_f16_sdwa v5, -src_shared_base dst_sel:DWORD dst_unused:UNUSED_PRESERVE -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: v_ceil_f32_sdwa v5, src_shared_base dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xeb,0x16,0x86,0x00] v_ceil_f32_sdwa v5, src_shared_base dst_sel:DWORD src0_sel:DWORD -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: v_ceil_f32_sdwa v5, |src_shared_base| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xeb,0x16,0xa6,0x00] v_ceil_f32_sdwa v5, |src_shared_base| dst_sel:DWORD src0_sel:DWORD @@ -774,7 +782,7 @@ v_ceil_f32_sdwa v5, |src_shared_base| dst_sel:DWORD src0_sel:DWORD // named inline values compete with other scalars for constant bus access //---------------------------------------------------------------------------// -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // NOGFX9: error: invalid operand (violates constant bus restrictions) v_add_u32 v0, private_base, s0 @@ -783,17 +791,17 @@ v_add_u32 v0, private_base, s0 v_add_u32 v0, scc, s0 // v_div_fmas implicitly reads VCC -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // NOGFX9: error: invalid operand (violates constant bus restrictions) v_div_fmas_f32 v0, shared_base, v0, v1 // v_div_fmas implicitly reads VCC -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // NOGFX9: error: invalid operand (violates constant bus restrictions) v_div_fmas_f32 v0, v0, shared_limit, v1 // v_div_fmas implicitly reads VCC -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // NOGFX9: error: invalid operand (violates constant bus restrictions) v_div_fmas_f32 v0, v0, v1, private_limit @@ -810,29 +818,29 @@ v_div_fmas_f32 v0, v0, scc, v1 v_div_fmas_f32 v0, v0, v1, vccz // v_addc_co_u32 implicitly reads VCC (VOP2) -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // NOGFX9: error: invalid operand (violates constant bus restrictions) v_addc_co_u32 v0, vcc, shared_base, v0, vcc -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // NOGFX9: error: invalid operand (violates constant bus restrictions) v_madak_f32 v0, shared_base, v0, 0x11213141 // NOGCN: error: invalid operand (violates constant bus restrictions) v_madak_f32 v0, scc, v0, 0x11213141 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // NOGFX9: error: invalid operand (violates constant bus restrictions) v_cmp_eq_f32 s[0:1], private_base, private_limit -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // NOGFX9: error: invalid operand (violates constant bus restrictions) v_cmp_eq_f32 s[0:1], private_base, s0 // NOGCN: error: invalid operand (violates constant bus restrictions) v_cmp_eq_f32 s[0:1], 
execz, s0 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // NOGFX9: error: invalid operand (violates constant bus restrictions) v_pk_add_f16 v255, private_base, private_limit diff --git a/llvm/test/MC/AMDGPU/mtbuf.s b/llvm/test/MC/AMDGPU/mtbuf.s index 0653b591d69d7..a405a8824df4a 100644 --- a/llvm/test/MC/AMDGPU/mtbuf.s +++ b/llvm/test/MC/AMDGPU/mtbuf.s @@ -289,7 +289,7 @@ tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], format:[BUF_DATA_FORMAT_32] // Invalid soffset tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s[255] format:[BUF_NUM_FORMAT_FLOAT] -// GCN-ERR: error: not a valid operand. +// GCN-ERR: error: register index is out of range // Both legacy and symbolic formats are specified tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:1 s0 format:[BUF_NUM_FORMAT_FLOAT] diff --git a/llvm/test/MC/AMDGPU/out-of-range-registers.s b/llvm/test/MC/AMDGPU/out-of-range-registers.s index c7cd03470f9fc..e350fc5de5207 100644 --- a/llvm/test/MC/AMDGPU/out-of-range-registers.s +++ b/llvm/test/MC/AMDGPU/out-of-range-registers.s @@ -4,112 +4,108 @@ // RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck -check-prefixes=GCN-ERR,GFX10-ERR --implicit-check-not=error: %s // RUN: not llvm-mc -arch=amdgcn -mcpu=tahiti -show-encoding %s | FileCheck -check-prefix=SIVICI %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck -check-prefix=SIVICI %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s | FileCheck -check-prefix=GFX9 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck -check-prefixes=SIVICI,CIVI9 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s | FileCheck -check-prefixes=GFX9,CIVI9 %s // RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 -show-encoding %s | FileCheck -check-prefix=GFX10 %s s_add_i32 s106, s0, s1 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: register index is out of range s_add_i32 s104, s0, s1 -// SICIVI9-ERR: error: not a valid operand +// SICIVI9-ERR: error: register not available on this GPU // GFX10: s_add_i32 s104, s0, s1 ; encoding: s_add_i32 s105, s0, s1 -// SICIVI9-ERR: error: not a valid operand +// SICIVI9-ERR: error: register not available on this GPU // GFX10: s_add_i32 s105, s0, s1 ; encoding: v_add_i32 v256, v0, v1 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: register index is out of range v_add_i32 v257, v0, v1 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: register index is out of range s_mov_b64 s[0:17], -1 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: invalid or unsupported register size s_mov_b64 s[103:104], -1 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: invalid register alignment s_mov_b64 s[105:106], -1 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: invalid register alignment s_mov_b64 s[104:105], -1 -// SICIVI9-ERR: error: not a valid operand +// SICIVI9-ERR: error: register not available on this GPU // GFX10: s_mov_b64 s[104:105], -1 ; encoding: s_load_dwordx4 s[102:105], s[2:3], s4 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: invalid register alignment s_load_dwordx4 s[104:108], s[2:3], s4 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: register index is out of range s_load_dwordx4 s[108:112], s[2:3], s4 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: register index is out of range s_load_dwordx4 s[1:4], s[2:3], s4 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: invalid register 
alignment -s_load_dwordx4 s[1:4], s[2:3], s4 -// GCN-ERR: error: not a valid operand +s_load_dwordx4 s[2:5], s[2:3], s4 +// GCN-ERR: error: invalid register alignment s_load_dwordx8 s[104:111], s[2:3], s4 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: register index is out of range s_load_dwordx8 s[100:107], s[2:3], s4 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: register index is out of range s_load_dwordx8 s[108:115], s[2:3], s4 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: register index is out of range s_load_dwordx16 s[92:107], s[2:3], s4 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: register index is out of range s_load_dwordx16 s[96:111], s[2:3], s4 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: register index is out of range s_load_dwordx16 s[100:115], s[2:3], s4 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: register index is out of range s_load_dwordx16 s[104:119], s[2:3], s4 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: register index is out of range s_load_dwordx16 s[108:123], s[2:3], s4 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: register index is out of range s_mov_b32 ttmp16, 0 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: register index is out of range s_mov_b32 ttmp12, 0 -// SICIVI: error: not a valid operand // GFX9: s_mov_b32 ttmp12, 0 ; encoding: // GFX10: s_mov_b32 ttmp12, 0 ; encoding: -// SIVICI-ERR: error: not a valid operand. +// SIVICI-ERR: error: register not available on this GPU s_mov_b32 ttmp15, 0 -// SICIVI: error: not a valid operand // GFX9: s_mov_b32 ttmp15, 0 ; encoding: // GFX10: s_mov_b32 ttmp15, 0 ; encoding: -// SIVICI-ERR: error: not a valid operand. +// SIVICI-ERR: error: register not available on this GPU s_mov_b32 flat_scratch_lo, 0 -// SI-ERR: error: not a valid operand -// CIVI9: s_mov_b32 flat_scratch_lo, 0 ; encoding: -// GFX10-ERR: error: not a valid operand -// GFX9: s_mov_b32 flat_scratch_lo, 0 ; encoding: [0x80,0x00,0xe6,0xbe] +// SI-ERR: error: register not available on this GPU +// GFX10-ERR: error: register not available on this GPU +// CIVI9: s_mov_b32 flat_scratch_lo, 0 ; encoding: [0x80,0x00,0xe6,0xbe] s_mov_b32 flat_scratch_hi, 0 -// SI-ERR: error: not a valid operand -// CIVI9: s_mov_b32 flat_scratch_hi, 0 ; encoding: -// GFX10-ERR: error: not a valid operand -// GFX9: s_mov_b32 flat_scratch_hi, 0 ; encoding: [0x80,0x00,0xe7,0xbe] +// SI-ERR: error: register not available on this GPU +// GFX10-ERR: error: register not available on this GPU +// CIVI9: s_mov_b32 flat_scratch_hi, 0 ; encoding: [0x80,0x00,0xe7,0xbe] s_mov_b32 tma_lo, 0 // SIVICI: s_mov_b32 tma_lo, 0 ; encoding: -// GFX9-ERR: error: not a valid operand -// GFX10-ERR: error: not a valid operand +// GFX9-ERR: error: register not available on this GPU +// GFX10-ERR: error: register not available on this GPU s_mov_b32 tba_lo, 0 // SIVICI: s_mov_b32 tba_lo, 0 ; encoding: -// GFX9-ERR: error: not a valid operand -// GFX10-ERR: error: not a valid operand +// GFX9-ERR: error: register not available on this GPU +// GFX10-ERR: error: register not available on this GPU diff --git a/llvm/test/MC/AMDGPU/reg-syntax-err.s b/llvm/test/MC/AMDGPU/reg-syntax-err.s index dce9375a47111..8f2c3e79310ce 100644 --- a/llvm/test/MC/AMDGPU/reg-syntax-err.s +++ b/llvm/test/MC/AMDGPU/reg-syntax-err.s @@ -1,73 +1,151 @@ // RUN: not llvm-mc -arch=amdgcn -mcpu=tonga %s 2>&1 | FileCheck -check-prefix=NOVI --implicit-check-not=error: %s s_mov_b32 s1, s 1 -// NOVI: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// NOVI: error: invalid operand for instruction s_mov_b32 s1, s[0 1 -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// NOVI: error: expected a closing square bracket s_mov_b32 s1, s[0:0 1 -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// NOVI: error: expected a closing square bracket s_mov_b32 s1, [s[0 1 -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// NOVI: error: expected a closing square bracket s_mov_b32 s1, [s[0:1] 1 -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// NOVI: error: expected a single 32-bit register s_mov_b32 s1, [s0, 1 -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// NOVI: error: expected a register or a list of registers s_mov_b32 s1, s999 1 -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// NOVI: error: register index is out of range s_mov_b32 s1, s[1:2] 1 -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// NOVI: error: invalid register alignment s_mov_b32 s1, s[0:2] 1 -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// NOVI: error: invalid operand for instruction s_mov_b32 s1, xnack_mask_lo 1 -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// NOVI: error: register not available on this GPU s_mov_b32 s1, s s0 -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// NOVI: error: invalid operand for instruction s_mov_b32 s1, s[0 s0 -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// NOVI: error: expected a closing square bracket s_mov_b32 s1, s[0:0 s0 -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// NOVI: error: expected a closing square bracket s_mov_b32 s1, [s[0 s0 -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// NOVI: error: expected a closing square bracket s_mov_b32 s1, [s[0:1] s0 -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// NOVI: error: expected a single 32-bit register s_mov_b32 s1, [s0, s0 -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// NOVI: error: registers in a list must have consecutive indices s_mov_b32 s1, s999 s0 -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// NOVI: error: register index is out of range s_mov_b32 s1, s[1:2] s0 -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// NOVI: error: invalid register alignment s_mov_b32 s1, s[0:2] vcc_lo -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// NOVI: error: invalid operand for instruction s_mov_b32 s1, xnack_mask_lo s1 -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// NOVI: error: register not available on this GPU exp mrt0 v1, v2, v3, v4000 off -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// NOVI: error: register index is out of range v_add_f64 v[0:1], v[0:1], v[0xF00000001:0x2] -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// NOVI: error: invalid register index v_add_f64 v[0:1], v[0:1], v[0x1:0xF00000002] -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// NOVI: error: invalid register index s_mov_b32 s1, s[0:-1] -// NOVI: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+// NOVI: error: invalid register index + +s_mov_b64 s[10:11], [exec_lo,vcc_hi] +// NOVI: error: register does not fit in the list + +s_mov_b64 s[10:11], [exec_hi,exec_lo] +// NOVI: error: register does not fit in the list + +s_mov_b64 s[10:11], [exec_lo,exec_lo] +// NOVI: error: register does not fit in the list + +s_mov_b64 s[10:11], [exec,exec_lo] +// NOVI: error: register does not fit in the list + +s_mov_b64 s[10:11], [exec_lo,exec] +// NOVI: error: register does not fit in the list + +s_mov_b64 s[10:11], [exec_lo,s0] +// NOVI: error: registers in a list must be of the same kind + +s_mov_b64 s[10:11], [s0,exec_lo] +// NOVI: error: registers in a list must be of the same kind + +s_mov_b64 s[10:11], [s0,exec] +// NOVI: error: registers in a list must be of the same kind + +s_mov_b64 s[10:11], [s0,v1] +// NOVI: error: registers in a list must be of the same kind + +s_mov_b64 s[10:11], [v0,s1] +// NOVI: error: registers in a list must be of the same kind + +s_mov_b64 s[10:11], [s0,s0] +// NOVI: error: registers in a list must have consecutive indices + +s_mov_b64 s[10:11], [s0,s2] +// NOVI: error: registers in a list must have consecutive indices + +s_mov_b64 s[10:11], [s2,s1] +// NOVI: error: registers in a list must have consecutive indices + +s_mov_b64 s[10:11], [a0,a2] +// NOVI: error: registers in a list must have consecutive indices + +s_mov_b64 s[10:11], [a0,v1] +// NOVI: error: registers in a list must be of the same kind + +s_mov_b64 s[10:11], [s +// NOVI: error: missing register index + +s_mov_b64 s[10:11], s[1:0] +// NOVI: error: first register index should not exceed second index + +s_mov_b64 s[10:11], [x0,s1] +// NOVI: error: invalid register name + +s_mov_b64 s[10:11], [s,s1] +// NOVI: error: missing register index + +s_mov_b64 s[10:11], [s01,s1] +// NOVI: error: registers in a list must have consecutive indices + +s_mov_b64 s[10:11], [s0x] +// NOVI: error: invalid register index + +s_mov_b64 s[10:11], [s[0:1],s[2:3]] +// NOVI: error: expected a single 32-bit register + +s_mov_b64 s[10:11], [s0,s[2:3]] +// NOVI: error: expected a single 32-bit register + +s_mov_b64 s[10:11], [s0 +// NOVI: error: expected a comma or a closing square bracket + +s_mov_b64 s[10:11], [s0,s1 +// NOVI: error: expected a comma or a closing square bracket + +s_mov_b64 s[10:11], s[1:0] +// NOVI: error: first register index should not exceed second index diff --git a/llvm/test/MC/AMDGPU/reg-syntax-extra.s b/llvm/test/MC/AMDGPU/reg-syntax-extra.s index 528247f562399..1f887118ef8a2 100644 --- a/llvm/test/MC/AMDGPU/reg-syntax-extra.s +++ b/llvm/test/MC/AMDGPU/reg-syntax-extra.s @@ -38,9 +38,9 @@ s_mov_b64 [exec_lo,exec_hi], s[2:3] // GFX10: s_mov_b64 exec, s[2:3] ; encoding: [0x02,0x04,0xfe,0xbe] s_mov_b64 [flat_scratch_lo,flat_scratch_hi], s[2:3] -// NOSICI: error: not a valid operand. +// NOSICI: error: register not available on this GPU // VI: s_mov_b64 flat_scratch, s[2:3] ; encoding: [0x02,0x01,0xe6,0xbe] -// NOGFX10: error: not a valid operand. +// NOGFX10: error: register not available on this GPU s_mov_b64 [vcc_lo,vcc_hi], s[2:3] // SICI: s_mov_b64 vcc, s[2:3] ; encoding: [0x02,0x04,0xea,0xbe] @@ -50,12 +50,12 @@ s_mov_b64 [vcc_lo,vcc_hi], s[2:3] s_mov_b64 [tba_lo,tba_hi], s[2:3] // SICI: s_mov_b64 tba, s[2:3] ; encoding: [0x02,0x04,0xec,0xbe] // VI: s_mov_b64 tba, s[2:3] ; encoding: [0x02,0x01,0xec,0xbe] -// NOGFX10: error: not a valid operand. 
+// NOGFX10: error: register not available on this GPU s_mov_b64 [tma_lo,tma_hi], s[2:3] // SICI: s_mov_b64 tma, s[2:3] ; encoding: [0x02,0x04,0xee,0xbe] // VI: s_mov_b64 tma, s[2:3] ; encoding: [0x02,0x01,0xee,0xbe] -// NOGFX10: error: not a valid operand. +// NOGFX10: error: register not available on this GPU v_mov_b32_e32 [v1], [v2] // GCN: v_mov_b32_e32 v1, v2 ; encoding: [0x02,0x03,0x02,0x7e] @@ -151,21 +151,21 @@ flat_load_dwordx4 [v[8/2+4],v9,v[10],v[11/2+6]], v[2:3] // NOSICI: error: instruction not supported on this GPU v_mul_f32 v0, null, v2 -// NOSICIVI: error: not a valid operand. +// NOSICIVI: error: 'null' operand is not supported on this GPU // GFX10: v_mul_f32_e32 v0, null, v2 ; encoding: [0x7d,0x04,0x00,0x10] -// NOVI: error: not a valid operand. +// NOVI: error: 'null' operand is not supported on this GPU v_mul_f64 v[0:1], null, null -// NOSICIVI: error: not a valid operand. +// NOSICIVI: error: 'null' operand is not supported on this GPU // GFX10: v_mul_f64 v[0:1], null, null ; encoding: [0x00,0x00,0x65,0xd5,0x7d,0xfa,0x00,0x00] -// NOVI: error: not a valid operand. +// NOVI: error: 'null' operand is not supported on this GPU s_add_u32 null, null, null -// NOSICIVI: error: not a valid operand. +// NOSICIVI: error: 'null' operand is not supported on this GPU // GFX10: s_add_u32 null, null, null ; encoding: [0x7d,0x7d,0x7d,0x80] -// NOVI: error: not a valid operand. +// NOVI: error: 'null' operand is not supported on this GPU s_not_b64 s[2:3], null -// NOSICIVI: error: not a valid operand. +// NOSICIVI: error: 'null' operand is not supported on this GPU // GFX10: s_not_b64 s[2:3], null ; encoding: [0x7d,0x08,0x82,0xbe] -// NOVI: error: not a valid operand. +// NOVI: error: 'null' operand is not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/smem.s b/llvm/test/MC/AMDGPU/smem.s index 4d81929b415e0..5f00a820ee023 100644 --- a/llvm/test/MC/AMDGPU/smem.s +++ b/llvm/test/MC/AMDGPU/smem.s @@ -3,17 +3,19 @@ // RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s // RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s // RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1012 -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=GFX10 -check-prefix=GFX1012 %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=tahiti %s 2>&1 | FileCheck -check-prefix=NOSICI -check-prefix=NOSICIVI -check-prefix=NOSICIGFX10 -check-prefix=NOSICIVIGFX10 --implicit-check-not=error: %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire %s 2>&1 | FileCheck -check-prefix=NOSICI -check-prefix=NOSICIVI -check-prefix=NOSICIGFX10 -check-prefix=NOSICIVIGFX10 --implicit-check-not=error: %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=kaveri %s 2>&1 | FileCheck -check-prefix=NOSICI -check-prefix=NOSICIVI -check-prefix=NOSICIGFX10 -check-prefix=NOSICIVIGFX10 --implicit-check-not=error: %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga %s 2>&1 | FileCheck -check-prefix=NOSICIVI -check-prefix=NOVI -check-prefix=NOSICIVIGFX10 --implicit-check-not=error: %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 %s 2>&1 | FileCheck -check-prefix=NOGFX9 --implicit-check-not=error: %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1012 %s 2>&1 | FileCheck -check-prefix=NOSICIGFX10 -check-prefix=NOGFX9 --implicit-check-not=error: %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1030 -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=GFX10 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=tahiti 
%s 2>&1 | FileCheck -check-prefix=NOSICI -check-prefix=NOSICIVI -check-prefix=NOSICIGFX10 -check-prefix=NOSICIVIGFX10 -check-prefix=NOSICIGFX1030 -check-prefix=NOSICIVIGFX1030 --implicit-check-not=error: %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire %s 2>&1 | FileCheck -check-prefix=NOSICI -check-prefix=NOSICIVI -check-prefix=NOSICIGFX10 -check-prefix=NOSICIVIGFX10 -check-prefix=NOSICIGFX1030 -check-prefix=NOSICIVIGFX1030 --implicit-check-not=error: %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=kaveri %s 2>&1 | FileCheck -check-prefix=NOSICI -check-prefix=NOSICIVI -check-prefix=NOSICIGFX10 -check-prefix=NOSICIVIGFX10 -check-prefix=NOSICIGFX1030 -check-prefix=NOSICIVIGFX1030 --implicit-check-not=error: %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga %s 2>&1 | FileCheck -check-prefix=NOSICIVI -check-prefix=NOVI -check-prefix=NOSICIVIGFX10 -check-prefix=NOSICIVIGFX1030 --implicit-check-not=error: %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 %s 2>&1 | FileCheck -check-prefix=NOGFX9 -check-prefix=NOGFX9GFX1012 --implicit-check-not=error: %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1012 %s 2>&1 | FileCheck -check-prefix=NOSICIGFX10 -check-prefix=NOGFX9 -check-prefix=NOGFX9GFX1012 --implicit-check-not=error: %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1030 %s 2>&1 | FileCheck -check-prefix=NOSICIGFX1030 -check-prefix=NOSICIVIGFX10 -check-prefix=NOSICIVIGFX1030 -check-prefix=NOSICIGFX10 -check-prefix=NOGFX9 --implicit-check-not=error: %s s_dcache_wb // GFX89: s_dcache_wb ; encoding: [0x00,0x00,0x84,0xc0,0x00,0x00,0x00,0x00] // GFX1012: s_dcache_wb ; encoding: [0x00,0x00,0x84,0xf4,0x00,0x00,0x00,0x00] -// NOSICI: error: instruction not supported on this GPU +// NOSICIGFX1030: error: instruction not supported on this GPU s_dcache_wb_vol // GFX89: s_dcache_wb_vol ; encoding: [0x00,0x00,0x8c,0xc0,0x00,0x00,0x00,0x00] @@ -47,12 +49,12 @@ s_memrealtime s[4:5] s_memrealtime tba // VI: s_memrealtime tba ; encoding: [0x00,0x1b,0x94,0xc0,0x00,0x00,0x00,0x00] // NOSICI: error: instruction not supported on this GPU -// NOGFX9: error: not a valid operand. +// NOGFX9: error: register not available on this GPU s_memrealtime tma // VI: s_memrealtime tma ; encoding: [0x80,0x1b,0x94,0xc0,0x00,0x00,0x00,0x00] // NOSICI: error: instruction not supported on this GPU -// NOGFX9: error: not a valid operand. 
+// NOGFX9: error: register not available on this GPU s_memrealtime ttmp[0:1] // VI: s_memrealtime ttmp[0:1] ; encoding: [0x00,0x1c,0x94,0xc0,0x00,0x00,0x00,0x00] @@ -64,125 +66,123 @@ s_memrealtime ttmp[0:1] s_store_dword s1, s[2:3], 0xfc // GFX89: s_store_dword s1, s[2:3], 0xfc ; encoding: [0x41,0x00,0x42,0xc0,0xfc,0x00,0x00,0x00] // GFX1012: s_store_dword s1, s[2:3], 0xfc ; encoding: [0x41,0x00,0x40,0xf4,0xfc,0x00,0x00,0xfa] -// NOSICI: error: instruction not supported on this GPU +// NOSICIGFX1030: error: instruction not supported on this GPU s_store_dword s1, s[2:3], 0xfc glc // GFX89: s_store_dword s1, s[2:3], 0xfc glc ; encoding: [0x41,0x00,0x43,0xc0,0xfc,0x00,0x00,0x00] // GFX1012: s_store_dword s1, s[2:3], 0xfc glc ; encoding: [0x41,0x00,0x41,0xf4,0xfc,0x00,0x00,0xfa] -// NOSICI: error: invalid operand for instruction +// NOSICIGFX1030: error: invalid operand for instruction s_store_dword s1, s[2:3], s4 // GFX89: s_store_dword s1, s[2:3], s4 ; encoding: [0x41,0x00,0x40,0xc0,0x04,0x00,0x00,0x00] // GFX1012: s_store_dword s1, s[2:3], s4 ; encoding: [0x41,0x00,0x40,0xf4,0x00,0x00,0x00,0x08] -// NOSICI: error: instruction not supported on this GPU +// NOSICIGFX1030: error: instruction not supported on this GPU s_store_dword s1, s[2:3], s4 glc // GFX89: s_store_dword s1, s[2:3], s4 glc ; encoding: [0x41,0x00,0x41,0xc0,0x04,0x00,0x00,0x00] // GFX1012: s_store_dword s1, s[2:3], s4 glc ; encoding: [0x41,0x00,0x41,0xf4,0x00,0x00,0x00,0x08] -// NOSICI: error: invalid operand for instruction +// NOSICIGFX1030: error: invalid operand for instruction s_store_dword tba_lo, s[2:3], s4 // VI: s_store_dword tba_lo, s[2:3], s4 ; encoding: [0x01,0x1b,0x40,0xc0,0x04,0x00,0x00,0x00] // NOSICI: error: instruction not supported on this GPU -// NOGFX9: error: not a valid operand. +// NOGFX9: error: register not available on this GPU s_store_dword tba_hi, s[2:3], s4 // VI: s_store_dword tba_hi, s[2:3], s4 ; encoding: [0x41,0x1b,0x40,0xc0,0x04,0x00,0x00,0x00] // NOSICI: error: instruction not supported on this GPU -// NOGFX9: error: not a valid operand. +// NOGFX9: error: register not available on this GPU s_store_dword tma_lo, s[2:3], s4 // VI: s_store_dword tma_lo, s[2:3], s4 ; encoding: [0x81,0x1b,0x40,0xc0,0x04,0x00,0x00,0x00] // NOSICI: error: instruction not supported on this GPU -// NOGFX9: error: not a valid operand. +// NOGFX9: error: register not available on this GPU s_store_dword tma_hi, s[2:3], s4 // VI: s_store_dword tma_hi, s[2:3], s4 ; encoding: [0xc1,0x1b,0x40,0xc0,0x04,0x00,0x00,0x00] // NOSICI: error: instruction not supported on this GPU -// NOGFX9: error: not a valid operand. 
+// NOGFX9: error: register not available on this GPU // FIXME: Should error on SI instead of silently ignoring glc s_load_dword s1, s[2:3], 0xfc glc // GFX89: s_load_dword s1, s[2:3], 0xfc glc ; encoding: [0x41,0x00,0x03,0xc0,0xfc,0x00,0x00,0x00] // GFX10: s_load_dword s1, s[2:3], 0xfc glc ; encoding: [0x41,0x00,0x01,0xf4,0xfc,0x00,0x00,0xfa] -// SICI: s_load_dword s1, s[2:3], 0xfc glc ; encoding: [0xfc,0x83,0x00,0xc0] +// SICI: s_load_dword s1, s[2:3], 0xfc glc ; encoding: [0xfc,0x83,0x00,0xc0] s_load_dword s1, s[2:3], s4 glc // GFX89: s_load_dword s1, s[2:3], s4 glc ; encoding: [0x41,0x00,0x01,0xc0,0x04,0x00,0x00,0x00] // GFX10: s_load_dword s1, s[2:3], s4 glc ; encoding: [0x41,0x00,0x01,0xf4,0x00,0x00,0x00,0x08] -// SICI: s_load_dword s1, s[2:3], s4 glc ; encoding: [0x04,0x82,0x00,0xc0] s_buffer_store_dword s10, s[92:95], m0 // GFX89: s_buffer_store_dword s10, s[92:95], m0 ; encoding: [0xae,0x02,0x60,0xc0,0x7c,0x00,0x00,0x00] -// NOSICI: error: instruction not supported on this GPU -// GFX10: s_buffer_store_dword s10, s[92:95], m0 ; encoding: [0xae,0x02,0x60,0xf4,0x00,0x00,0x00,0xf8] +// NOSICIGFX1030: error: instruction not supported on this GPU +// GFX1012: s_buffer_store_dword s10, s[92:95], m0 ; encoding: [0xae,0x02,0x60,0xf4,0x00,0x00,0x00,0xf8] s_buffer_store_dword tba_lo, s[92:95], m0 // VI: s_buffer_store_dword tba_lo, s[92:95], m0 ; encoding: [0x2e,0x1b,0x60,0xc0,0x7c,0x00,0x00,0x00] // NOSICI: error: instruction not supported on this GPU -// NOGFX9: error: not a valid operand. +// NOGFX9: error: register not available on this GPU s_buffer_store_dword tba_hi, s[92:95], m0 // VI: s_buffer_store_dword tba_hi, s[92:95], m0 ; encoding: [0x6e,0x1b,0x60,0xc0,0x7c,0x00,0x00,0x00] // NOSICI: error: instruction not supported on this GPU -// NOGFX9: error: not a valid operand. +// NOGFX9: error: register not available on this GPU s_buffer_store_dword tma_lo, s[92:95], m0 // VI: s_buffer_store_dword tma_lo, s[92:95], m0 ; encoding: [0xae,0x1b,0x60,0xc0,0x7c,0x00,0x00,0x00] // NOSICI: error: instruction not supported on this GPU -// NOGFX9: error: not a valid operand. +// NOGFX9: error: register not available on this GPU s_buffer_store_dword tma_hi, s[92:95], m0 // VI: s_buffer_store_dword tma_hi, s[92:95], m0 ; encoding: [0xee,0x1b,0x60,0xc0,0x7c,0x00,0x00,0x00] // NOSICI: error: instruction not supported on this GPU -// NOGFX9: error: not a valid operand.
+// NOGFX9: error: register not available on this GPU s_buffer_store_dword ttmp0, s[92:95], m0 // VI: s_buffer_store_dword ttmp0, s[92:95], m0 ; encoding: [0x2e,0x1c,0x60,0xc0,0x7c,0x00,0x00,0x00] // GFX9: s_buffer_store_dword ttmp0, s[92:95], m0 ; encoding: [0x2e,0x1b,0x60,0xc0,0x7c,0x00,0x00,0x00] -// NOSICI: error: instruction not supported on this GPU -// GFX10: s_buffer_store_dword ttmp0, s[92:95], m0 ; encoding: [0x2e,0x1b,0x60,0xf4,0x00,0x00,0x00,0xf8] +// NOSICIGFX1030: error: instruction not supported on this GPU +// GFX1012: s_buffer_store_dword ttmp0, s[92:95], m0 ; encoding: [0x2e,0x1b,0x60,0xf4,0x00,0x00,0x00,0xf8] s_buffer_store_dwordx2 s[10:11], s[92:95], m0 // GFX89: s_buffer_store_dwordx2 s[10:11], s[92:95], m0 ; encoding: [0xae,0x02,0x64,0xc0,0x7c,0x00,0x00,0x00] -// NOSICI: error: instruction not supported on this GPU -// GFX10: s_buffer_store_dwordx2 s[10:11], s[92:95], m0 ; encoding: [0xae,0x02,0x64,0xf4,0x00,0x00,0x00,0xf8] +// NOSICIGFX1030: error: instruction not supported on this GPU +// GFX1012: s_buffer_store_dwordx2 s[10:11], s[92:95], m0 ; encoding: [0xae,0x02,0x64,0xf4,0x00,0x00,0x00,0xf8] s_buffer_store_dwordx4 s[8:11], s[92:95], m0 glc // GFX89: s_buffer_store_dwordx4 s[8:11], s[92:95], m0 glc ; encoding: [0x2e,0x02,0x69,0xc0,0x7c,0x00,0x00,0x00] -// NOSICI: error: invalid operand for instruction -// GFX10: s_buffer_store_dwordx4 s[8:11], s[92:95], m0 glc ; encoding: [0x2e,0x02,0x69,0xf4,0x00,0x00,0x00,0xf8] +// NOSICIGFX1030: error: invalid operand for instruction +// GFX1012: s_buffer_store_dwordx4 s[8:11], s[92:95], m0 glc ; encoding: [0x2e,0x02,0x69,0xf4,0x00,0x00,0x00,0xf8] s_buffer_store_dwordx2 tba, s[92:95], m0 glc // VI: s_buffer_store_dwordx2 tba, s[92:95], m0 glc ; encoding: [0x2e,0x1b,0x65,0xc0,0x7c,0x00,0x00,0x00] // NOSICI: error: invalid operand for instruction -// NOGFX9: error: not a valid operand. +// NOGFX9: error: register not available on this GPU s_buffer_load_dword s10, s[92:95], m0 // GFX89: s_buffer_load_dword s10, s[92:95], m0 ; encoding: [0xae,0x02,0x20,0xc0,0x7c,0x00,0x00,0x00] // SICI: s_buffer_load_dword s10, s[92:95], m0 ; encoding: [0x7c,0x5c,0x05,0xc2] // GFX10: s_buffer_load_dword s10, s[92:95], m0 ; encoding: [0xae,0x02,0x20,0xf4,0x00,0x00,0x00,0xf8] -// SICIGFX10: s_buffer_load_dword s10, s[92:95], m0 ; encoding: [0x7c,0x5c,0x05,0xc2] s_buffer_load_dword tba_lo, s[92:95], m0 // VI: s_buffer_load_dword tba_lo, s[92:95], m0 ; encoding: [0x2e,0x1b,0x20,0xc0,0x7c,0x00,0x00,0x00] // SICI: s_buffer_load_dword tba_lo, s[92:95], m0 ; encoding: [0x7c,0x5c,0x36,0xc2] -// NOGFX9: error: not a valid operand. +// NOGFX9: error: register not available on this GPU s_buffer_load_dword tba_hi, s[92:95], m0 // VI: s_buffer_load_dword tba_hi, s[92:95], m0 ; encoding: [0x6e,0x1b,0x20,0xc0,0x7c,0x00,0x00,0x00] // SICI: s_buffer_load_dword tba_hi, s[92:95], m0 ; encoding: [0x7c,0xdc,0x36,0xc2] -// NOGFX9: error: not a valid operand. +// NOGFX9: error: register not available on this GPU s_buffer_load_dword tma_lo, s[92:95], m0 // VI: s_buffer_load_dword tma_lo, s[92:95], m0 ; encoding: [0xae,0x1b,0x20,0xc0,0x7c,0x00,0x00,0x00] // SICI: s_buffer_load_dword tma_lo, s[92:95], m0 ; encoding: [0x7c,0x5c,0x37,0xc2] -// NOGFX9: error: not a valid operand. 
+// NOGFX9: error: register not available on this GPU s_buffer_load_dword tma_hi, s[92:95], m0 // VI: s_buffer_load_dword tma_hi, s[92:95], m0 ; encoding: [0xee,0x1b,0x20,0xc0,0x7c,0x00,0x00,0x00] // SICI: s_buffer_load_dword tma_hi, s[92:95], m0 ; encoding: [0x7c,0xdc,0x37,0xc2] -// NOGFX9: error: not a valid operand. +// NOGFX9: error: register not available on this GPU s_buffer_load_dword ttmp0, s[92:95], m0 // VI: s_buffer_load_dword ttmp0, s[92:95], m0 ; encoding: [0x2e,0x1c,0x20,0xc0,0x7c,0x00,0x00,0x00] @@ -198,12 +198,12 @@ s_buffer_load_dwordx2 s[10:11], s[92:95], m0 s_buffer_load_dwordx2 tba, s[92:95], m0 // VI: s_buffer_load_dwordx2 tba, s[92:95], m0 ; encoding: [0x2e,0x1b,0x24,0xc0,0x7c,0x00,0x00,0x00] // SICI: s_buffer_load_dwordx2 tba, s[92:95], m0 ; encoding: [0x7c,0x5c,0x76,0xc2] -// NOGFX9: error: not a valid operand. +// NOGFX9: error: register not available on this GPU s_buffer_load_dwordx2 tma, s[92:95], m0 // VI: s_buffer_load_dwordx2 tma, s[92:95], m0 ; encoding: [0xae,0x1b,0x24,0xc0,0x7c,0x00,0x00,0x00] // SICI: s_buffer_load_dwordx2 tma, s[92:95], m0 ; encoding: [0x7c,0x5c,0x77,0xc2] -// NOGFX9: error: not a valid operand. +// NOGFX9: error: register not available on this GPU s_buffer_load_dwordx2 ttmp[0:1], s[92:95], m0 // VI: s_buffer_load_dwordx2 ttmp[0:1], s[92:95], m0 ; encoding: [0x2e,0x1c,0x24,0xc0,0x7c,0x00,0x00,0x00] @@ -215,7 +215,6 @@ s_buffer_load_dwordx2 ttmp[0:1], s[92:95], m0 s_buffer_load_dwordx4 s[8:11], s[92:95], m0 glc // GFX89: s_buffer_load_dwordx4 s[8:11], s[92:95], m0 glc ; encoding: [0x2e,0x02,0x29,0xc0,0x7c,0x00,0x00,0x00] // GFX10: s_buffer_load_dwordx4 s[8:11], s[92:95], m0 glc ; encoding: [0x2e,0x02,0x29,0xf4,0x00,0x00,0x00,0xf8] -// SICI: s_buffer_load_dwordx4 s[8:11], s[92:95], m0 glc ; encoding: [0x7c,0x5c,0x84,0xc2] //===----------------------------------------------------------------------===// // s_scratch instructions @@ -224,47 +223,47 @@ s_buffer_load_dwordx4 s[8:11], s[92:95], m0 glc s_scratch_load_dword s5, s[2:3], s101 // GFX9: s_scratch_load_dword s5, s[2:3], s101 ; encoding: [0x41,0x01,0x14,0xc0,0x65,0x00,0x00,0x00] // GFX1012: s_scratch_load_dword s5, s[2:3], s101 ; encoding: [0x41,0x01,0x14,0xf4,0x00,0x00,0x00,0xca] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_scratch_load_dword s5, s[2:3], s0 glc // GFX9: s_scratch_load_dword s5, s[2:3], s0 glc ; encoding: [0x41,0x01,0x15,0xc0,0x00,0x00,0x00,0x00] // GFX1012: s_scratch_load_dword s5, s[2:3], s0 glc ; encoding: [0x41,0x01,0x15,0xf4,0x00,0x00,0x00,0x00] -// NOSICIVI: error: invalid operand for instruction +// NOSICIVIGFX1030: error: invalid operand for instruction s_scratch_load_dwordx2 s[100:101], s[2:3], s0 // GFX9: s_scratch_load_dwordx2 s[100:101], s[2:3], s0 ; encoding: [0x01,0x19,0x18,0xc0,0x00,0x00,0x00,0x00] // GFX1012: s_scratch_load_dwordx2 s[100:101], s[2:3], s0 ; encoding: [0x01,0x19,0x18,0xf4,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_scratch_load_dwordx2 s[10:11], s[2:3], 0x1 glc // GFX9: s_scratch_load_dwordx2 s[10:11], s[2:3], 0x1 glc ; encoding: [0x81,0x02,0x1b,0xc0,0x01,0x00,0x00,0x00] // GFX1012: s_scratch_load_dwordx2 s[10:11], s[2:3], 0x1 glc ; encoding: [0x81,0x02,0x19,0xf4,0x01,0x00,0x00,0xfa] -// NOSICIVI: error: invalid operand for instruction +// NOSICIVIGFX1030: error: invalid operand for instruction s_scratch_load_dwordx4 s[20:23], s[4:5], s0 // GFX9: 
s_scratch_load_dwordx4 s[20:23], s[4:5], s0 ; encoding: [0x02,0x05,0x1c,0xc0,0x00,0x00,0x00,0x00] // GFX1012: s_scratch_load_dwordx4 s[20:23], s[4:5], s0 ; encoding: [0x02,0x05,0x1c,0xf4,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_scratch_store_dword s101, s[4:5], s0 // GFX9: s_scratch_store_dword s101, s[4:5], s0 ; encoding: [0x42,0x19,0x54,0xc0,0x00,0x00,0x00,0x00] // GFX1012: s_scratch_store_dword s101, s[4:5], s0 ; encoding: [0x42,0x19,0x54,0xf4,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_scratch_store_dword s1, s[4:5], 0x123 glc // GFX9: s_scratch_store_dword s1, s[4:5], 0x123 glc ; encoding: [0x42,0x00,0x57,0xc0,0x23,0x01,0x00,0x00] // GFX1012: s_scratch_store_dword s1, s[4:5], 0x123 glc ; encoding: [0x42,0x00,0x55,0xf4,0x23,0x01,0x00,0xfa] -// NOSICIVI: error: invalid operand for instruction +// NOSICIVIGFX1030: error: invalid operand for instruction s_scratch_store_dwordx2 s[2:3], s[4:5], s101 glc // GFX9: s_scratch_store_dwordx2 s[2:3], s[4:5], s101 glc ; encoding: [0x82,0x00,0x59,0xc0,0x65,0x00,0x00,0x00] // GFX1012: s_scratch_store_dwordx2 s[2:3], s[4:5], s101 glc ; encoding: [0x82,0x00,0x59,0xf4,0x00,0x00,0x00,0xca] -// NOSICIVI: error: invalid operand for instruction +// NOSICIVIGFX1030: error: invalid operand for instruction s_scratch_store_dwordx4 s[4:7], s[4:5], s0 glc // GFX9: s_scratch_store_dwordx4 s[4:7], s[4:5], s0 glc ; encoding: [0x02,0x01,0x5d,0xc0,0x00,0x00,0x00,0x00] // GFX1012: s_scratch_store_dwordx4 s[4:7], s[4:5], s0 glc ; encoding: [0x02,0x01,0x5d,0xf4,0x00,0x00,0x00,0x00] -// NOSICIVI: error: invalid operand for instruction +// NOSICIVIGFX1030: error: invalid operand for instruction //===----------------------------------------------------------------------===// // s_dcache_discard instructions @@ -273,22 +272,22 @@ s_scratch_store_dwordx4 s[4:7], s[4:5], s0 glc s_dcache_discard s[2:3], s0 // GFX9: s_dcache_discard s[2:3], s0 ; encoding: [0x01,0x00,0xa0,0xc0,0x00,0x00,0x00,0x00] // GFX1012: s_dcache_discard s[2:3], s0 ; encoding: [0x01,0x00,0xa0,0xf4,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_dcache_discard s[2:3], 0x0 // GFX9: s_dcache_discard s[2:3], 0x0 ; encoding: [0x01,0x00,0xa2,0xc0,0x00,0x00,0x00,0x00] // GFX1012: s_dcache_discard s[2:3], 0x0 ; encoding: [0x01,0x00,0xa0,0xf4,0x00,0x00,0x00,0xfa] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_dcache_discard_x2 s[2:3], s101 // GFX9: s_dcache_discard_x2 s[2:3], s101 ; encoding: [0x01,0x00,0xa4,0xc0,0x65,0x00,0x00,0x00] // GFX1012: s_dcache_discard_x2 s[2:3], s101 ; encoding: [0x01,0x00,0xa4,0xf4,0x00,0x00,0x00,0xca] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_dcache_discard_x2 s[2:3], 0x0 // GFX9: s_dcache_discard_x2 s[2:3], 0x0 ; encoding: [0x01,0x00,0xa6,0xc0,0x00,0x00,0x00,0x00] // GFX1012: s_dcache_discard_x2 s[2:3], 0x0 ; encoding: [0x01,0x00,0xa4,0xf4,0x00,0x00,0x00,0xfa] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU //===----------------------------------------------------------------------===// // s_atomic instructions @@ -297,162 +296,162 @@ 
s_dcache_discard_x2 s[2:3], 0x0 s_atomic_add s5, s[2:3], s101 // GFX9: s_atomic_add s5, s[2:3], s101 ; encoding: [0x41,0x01,0x08,0xc2,0x65,0x00,0x00,0x00] // GFX1012: s_atomic_add s5, s[2:3], s101 ; encoding: [0x41,0x01,0x08,0xf6,0x00,0x00,0x00,0xca] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_add s5, s[2:3], 0x0 // GFX9: s_atomic_add s5, s[2:3], 0x0 ; encoding: [0x41,0x01,0x0a,0xc2,0x00,0x00,0x00,0x00] // GFX1012: s_atomic_add s5, s[2:3], 0x0 ; encoding: [0x41,0x01,0x08,0xf6,0x00,0x00,0x00,0xfa] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_add s5, s[2:3], s0 glc // GFX9: s_atomic_add s5, s[2:3], s0 glc ; encoding: [0x41,0x01,0x09,0xc2,0x00,0x00,0x00,0x00] // GFX1012: s_atomic_add s5, s[2:3], s0 glc ; encoding: [0x41,0x01,0x09,0xf6,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_add_x2 s[10:11], s[2:3], s101 // GFX9: s_atomic_add_x2 s[10:11], s[2:3], s101 ; encoding: [0x81,0x02,0x88,0xc2,0x65,0x00,0x00,0x00] // GFX1012: s_atomic_add_x2 s[10:11], s[2:3], s101 ; encoding: [0x81,0x02,0x88,0xf6,0x00,0x00,0x00,0xca] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_and s5, s[2:3], s101 // GFX9: s_atomic_and s5, s[2:3], s101 ; encoding: [0x41,0x01,0x20,0xc2,0x65,0x00,0x00,0x00] // GFX1012: s_atomic_and s5, s[2:3], s101 ; encoding: [0x41,0x01,0x20,0xf6,0x00,0x00,0x00,0xca] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_and_x2 s[10:11], s[2:3], 0x0 // GFX9: s_atomic_and_x2 s[10:11], s[2:3], 0x0 ; encoding: [0x81,0x02,0xa2,0xc2,0x00,0x00,0x00,0x00] // GFX1012: s_atomic_and_x2 s[10:11], s[2:3], 0x0 ; encoding: [0x81,0x02,0xa0,0xf6,0x00,0x00,0x00,0xfa] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_cmpswap s[10:11], s[2:3], s101 // GFX9: s_atomic_cmpswap s[10:11], s[2:3], s101 ; encoding: [0x81,0x02,0x04,0xc2,0x65,0x00,0x00,0x00] // GFX1012: s_atomic_cmpswap s[10:11], s[2:3], s101 ; encoding: [0x81,0x02,0x04,0xf6,0x00,0x00,0x00,0xca] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_cmpswap s[10:11], s[2:3], 0x0 // GFX9: s_atomic_cmpswap s[10:11], s[2:3], 0x0 ; encoding: [0x81,0x02,0x06,0xc2,0x00,0x00,0x00,0x00] // GFX1012: s_atomic_cmpswap s[10:11], s[2:3], 0x0 ; encoding: [0x81,0x02,0x04,0xf6,0x00,0x00,0x00,0xfa] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_cmpswap s[10:11], s[2:3], s0 glc // GFX9: s_atomic_cmpswap s[10:11], s[2:3], s0 glc ; encoding: [0x81,0x02,0x05,0xc2,0x00,0x00,0x00,0x00] // GFX1012: s_atomic_cmpswap s[10:11], s[2:3], s0 glc ; encoding: [0x81,0x02,0x05,0xf6,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_cmpswap_x2 s[20:23], s[2:3], s101 // GFX9: s_atomic_cmpswap_x2 s[20:23], s[2:3], s101 ; encoding: [0x01,0x05,0x84,0xc2,0x65,0x00,0x00,0x00] // GFX1012: s_atomic_cmpswap_x2 s[20:23], s[2:3], s101 ; encoding: [0x01,0x05,0x84,0xf6,0x00,0x00,0x00,0xca] -// NOSICIVI: error: 
instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_cmpswap_x2 s[20:23], s[2:3], 0x0 // GFX9: s_atomic_cmpswap_x2 s[20:23], s[2:3], 0x0 ; encoding: [0x01,0x05,0x86,0xc2,0x00,0x00,0x00,0x00] // GFX1012: s_atomic_cmpswap_x2 s[20:23], s[2:3], 0x0 ; encoding: [0x01,0x05,0x84,0xf6,0x00,0x00,0x00,0xfa] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_cmpswap_x2 s[20:23], s[2:3], s0 glc // GFX9: s_atomic_cmpswap_x2 s[20:23], s[2:3], s0 glc ; encoding: [0x01,0x05,0x85,0xc2,0x00,0x00,0x00,0x00] // GFX1012: s_atomic_cmpswap_x2 s[20:23], s[2:3], s0 glc ; encoding: [0x01,0x05,0x85,0xf6,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_dec s5, s[2:3], s0 glc // GFX9: s_atomic_dec s5, s[2:3], s0 glc ; encoding: [0x41,0x01,0x31,0xc2,0x00,0x00,0x00,0x00] // GFX1012: s_atomic_dec s5, s[2:3], s0 glc ; encoding: [0x41,0x01,0x31,0xf6,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_dec_x2 s[10:11], s[2:3], s101 // GFX9: s_atomic_dec_x2 s[10:11], s[2:3], s101 ; encoding: [0x81,0x02,0xb0,0xc2,0x65,0x00,0x00,0x00] // GFX1012: s_atomic_dec_x2 s[10:11], s[2:3], s101 ; encoding: [0x81,0x02,0xb0,0xf6,0x00,0x00,0x00,0xca] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_inc s5, s[2:3], s0 glc // GFX9: s_atomic_inc s5, s[2:3], s0 glc ; encoding: [0x41,0x01,0x2d,0xc2,0x00,0x00,0x00,0x00] // GFX1012: s_atomic_inc s5, s[2:3], s0 glc ; encoding: [0x41,0x01,0x2d,0xf6,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_inc_x2 s[10:11], s[2:3], s101 // GFX9: s_atomic_inc_x2 s[10:11], s[2:3], s101 ; encoding: [0x81,0x02,0xac,0xc2,0x65,0x00,0x00,0x00] // GFX1012: s_atomic_inc_x2 s[10:11], s[2:3], s101 ; encoding: [0x81,0x02,0xac,0xf6,0x00,0x00,0x00,0xca] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_or s5, s[2:3], 0x0 // GFX9: s_atomic_or s5, s[2:3], 0x0 ; encoding: [0x41,0x01,0x26,0xc2,0x00,0x00,0x00,0x00] // GFX1012: s_atomic_or s5, s[2:3], 0x0 ; encoding: [0x41,0x01,0x24,0xf6,0x00,0x00,0x00,0xfa] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_or_x2 s[10:11], s[2:3], s0 glc // GFX9: s_atomic_or_x2 s[10:11], s[2:3], s0 glc ; encoding: [0x81,0x02,0xa5,0xc2,0x00,0x00,0x00,0x00] // GFX1012: s_atomic_or_x2 s[10:11], s[2:3], s0 glc ; encoding: [0x81,0x02,0xa5,0xf6,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_smax s5, s[2:3], s101 // GFX9: s_atomic_smax s5, s[2:3], s101 ; encoding: [0x41,0x01,0x18,0xc2,0x65,0x00,0x00,0x00] // GFX1012: s_atomic_smax s5, s[2:3], s101 ; encoding: [0x41,0x01,0x18,0xf6,0x00,0x00,0x00,0xca] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_smax_x2 s[10:11], s[2:3], s0 glc // GFX9: s_atomic_smax_x2 s[10:11], s[2:3], s0 glc ; encoding: [0x81,0x02,0x99,0xc2,0x00,0x00,0x00,0x00] // GFX1012: s_atomic_smax_x2 
s[10:11], s[2:3], s0 glc ; encoding: [0x81,0x02,0x99,0xf6,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_smin s5, s[2:3], s101 // GFX9: s_atomic_smin s5, s[2:3], s101 ; encoding: [0x41,0x01,0x10,0xc2,0x65,0x00,0x00,0x00] // GFX1012: s_atomic_smin s5, s[2:3], s101 ; encoding: [0x41,0x01,0x10,0xf6,0x00,0x00,0x00,0xca] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_smin_x2 s[10:11], s[2:3], s0 glc // GFX9: s_atomic_smin_x2 s[10:11], s[2:3], s0 glc ; encoding: [0x81,0x02,0x91,0xc2,0x00,0x00,0x00,0x00] // GFX1012: s_atomic_smin_x2 s[10:11], s[2:3], s0 glc ; encoding: [0x81,0x02,0x91,0xf6,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_sub s5, s[2:3], s101 // GFX9: s_atomic_sub s5, s[2:3], s101 ; encoding: [0x41,0x01,0x0c,0xc2,0x65,0x00,0x00,0x00] // GFX1012: s_atomic_sub s5, s[2:3], s101 ; encoding: [0x41,0x01,0x0c,0xf6,0x00,0x00,0x00,0xca] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_sub_x2 s[10:11], s[2:3], s0 glc // GFX9: s_atomic_sub_x2 s[10:11], s[2:3], s0 glc ; encoding: [0x81,0x02,0x8d,0xc2,0x00,0x00,0x00,0x00] // GFX1012: s_atomic_sub_x2 s[10:11], s[2:3], s0 glc ; encoding: [0x81,0x02,0x8d,0xf6,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_swap s5, s[2:3], s101 // GFX9: s_atomic_swap s5, s[2:3], s101 ; encoding: [0x41,0x01,0x00,0xc2,0x65,0x00,0x00,0x00] // GFX1012: s_atomic_swap s5, s[2:3], s101 ; encoding: [0x41,0x01,0x00,0xf6,0x00,0x00,0x00,0xca] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_swap_x2 s[10:11], s[2:3], s0 glc // GFX9: s_atomic_swap_x2 s[10:11], s[2:3], s0 glc ; encoding: [0x81,0x02,0x81,0xc2,0x00,0x00,0x00,0x00] // GFX1012: s_atomic_swap_x2 s[10:11], s[2:3], s0 glc ; encoding: [0x81,0x02,0x81,0xf6,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_umax s5, s[2:3], s0 glc // GFX9: s_atomic_umax s5, s[2:3], s0 glc ; encoding: [0x41,0x01,0x1d,0xc2,0x00,0x00,0x00,0x00] // GFX1012: s_atomic_umax s5, s[2:3], s0 glc ; encoding: [0x41,0x01,0x1d,0xf6,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_umax_x2 s[10:11], s[2:3], s101 // GFX9: s_atomic_umax_x2 s[10:11], s[2:3], s101 ; encoding: [0x81,0x02,0x9c,0xc2,0x65,0x00,0x00,0x00] // GFX1012: s_atomic_umax_x2 s[10:11], s[2:3], s101 ; encoding: [0x81,0x02,0x9c,0xf6,0x00,0x00,0x00,0xca] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_umin s5, s[2:3], s101 // GFX9: s_atomic_umin s5, s[2:3], s101 ; encoding: [0x41,0x01,0x14,0xc2,0x65,0x00,0x00,0x00] // GFX1012: s_atomic_umin s5, s[2:3], s101 ; encoding: [0x41,0x01,0x14,0xf6,0x00,0x00,0x00,0xca] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_umin_x2 s[10:11], s[2:3], s0 glc // GFX9: s_atomic_umin_x2 s[10:11], s[2:3], s0 glc ; 
encoding: [0x81,0x02,0x95,0xc2,0x00,0x00,0x00,0x00] // GFX1012: s_atomic_umin_x2 s[10:11], s[2:3], s0 glc ; encoding: [0x81,0x02,0x95,0xf6,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_xor s5, s[2:3], s101 // GFX9: s_atomic_xor s5, s[2:3], s101 ; encoding: [0x41,0x01,0x28,0xc2,0x65,0x00,0x00,0x00] // GFX1012: s_atomic_xor s5, s[2:3], s101 ; encoding: [0x41,0x01,0x28,0xf6,0x00,0x00,0x00,0xca] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_atomic_xor_x2 s[10:11], s[2:3], s0 glc // GFX9: s_atomic_xor_x2 s[10:11], s[2:3], s0 glc ; encoding: [0x81,0x02,0xa9,0xc2,0x00,0x00,0x00,0x00] // GFX1012: s_atomic_xor_x2 s[10:11], s[2:3], s0 glc ; encoding: [0x81,0x02,0xa9,0xf6,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU //===----------------------------------------------------------------------===// // s_buffer_atomic instructions @@ -461,162 +460,162 @@ s_atomic_xor_x2 s[10:11], s[2:3], s0 glc s_buffer_atomic_add s5, s[4:7], s101 // GFX9: s_buffer_atomic_add s5, s[4:7], s101 ; encoding: [0x42,0x01,0x08,0xc1,0x65,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_add s5, s[4:7], s101 ; encoding: [0x42,0x01,0x08,0xf5,0x00,0x00,0x00,0xca] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_add s5, s[4:7], 0x0 // GFX9: s_buffer_atomic_add s5, s[4:7], 0x0 ; encoding: [0x42,0x01,0x0a,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_add s5, s[4:7], 0x0 ; encoding: [0x42,0x01,0x08,0xf5,0x00,0x00,0x00,0xfa] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_add s5, s[4:7], s0 glc // GFX9: s_buffer_atomic_add s5, s[4:7], s0 glc ; encoding: [0x42,0x01,0x09,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_add s5, s[4:7], s0 glc ; encoding: [0x42,0x01,0x09,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_add_x2 s[10:11], s[4:7], s0 // GFX9: s_buffer_atomic_add_x2 s[10:11], s[4:7], s0 ; encoding: [0x82,0x02,0x88,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_add_x2 s[10:11], s[4:7], s0 ; encoding: [0x82,0x02,0x88,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_and s101, s[4:7], s0 // GFX9: s_buffer_atomic_and s101, s[4:7], s0 ; encoding: [0x42,0x19,0x20,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_and s101, s[4:7], s0 ; encoding: [0x42,0x19,0x20,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_and_x2 s[10:11], s[8:11], s0 // GFX9: s_buffer_atomic_and_x2 s[10:11], s[8:11], s0 ; encoding: [0x84,0x02,0xa0,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_and_x2 s[10:11], s[8:11], s0 ; encoding: [0x84,0x02,0xa0,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_cmpswap s[10:11], s[4:7], s0 // GFX9: s_buffer_atomic_cmpswap s[10:11], s[4:7], s0 ; encoding: 
[0x82,0x02,0x04,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_cmpswap s[10:11], s[4:7], s0 ; encoding: [0x82,0x02,0x04,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_cmpswap s[10:11], s[4:7], 0x0 // GFX9: s_buffer_atomic_cmpswap s[10:11], s[4:7], 0x0 ; encoding: [0x82,0x02,0x06,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_cmpswap s[10:11], s[4:7], 0x0 ; encoding: [0x82,0x02,0x04,0xf5,0x00,0x00,0x00,0xfa] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_cmpswap s[10:11], s[4:7], s0 glc // GFX9: s_buffer_atomic_cmpswap s[10:11], s[4:7], s0 glc ; encoding: [0x82,0x02,0x05,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_cmpswap s[10:11], s[4:7], s0 glc ; encoding: [0x82,0x02,0x05,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_cmpswap_x2 s[20:23], s[4:7], s101 // GFX9: s_buffer_atomic_cmpswap_x2 s[20:23], s[4:7], s101 ; encoding: [0x02,0x05,0x84,0xc1,0x65,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_cmpswap_x2 s[20:23], s[4:7], s101 ; encoding: [0x02,0x05,0x84,0xf5,0x00,0x00,0x00,0xca] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_cmpswap_x2 s[20:23], s[4:7], 0x0 // GFX9: s_buffer_atomic_cmpswap_x2 s[20:23], s[4:7], 0x0 ; encoding: [0x02,0x05,0x86,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_cmpswap_x2 s[20:23], s[4:7], 0x0 ; encoding: [0x02,0x05,0x84,0xf5,0x00,0x00,0x00,0xfa] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_cmpswap_x2 s[20:23], s[4:7], s0 glc // GFX9: s_buffer_atomic_cmpswap_x2 s[20:23], s[4:7], s0 glc ; encoding: [0x02,0x05,0x85,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_cmpswap_x2 s[20:23], s[4:7], s0 glc ; encoding: [0x02,0x05,0x85,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_dec s5, s[4:7], s0 // GFX9: s_buffer_atomic_dec s5, s[4:7], s0 ; encoding: [0x42,0x01,0x30,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_dec s5, s[4:7], s0 ; encoding: [0x42,0x01,0x30,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_dec_x2 s[10:11], s[4:7], s0 glc // GFX9: s_buffer_atomic_dec_x2 s[10:11], s[4:7], s0 glc ; encoding: [0x82,0x02,0xb1,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_dec_x2 s[10:11], s[4:7], s0 glc ; encoding: [0x82,0x02,0xb1,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_inc s101, s[4:7], s0 // GFX9: s_buffer_atomic_inc s101, s[4:7], s0 ; encoding: [0x42,0x19,0x2c,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_inc s101, s[4:7], s0 ; encoding: [0x42,0x19,0x2c,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_inc_x2 s[10:11], s[4:7], 0x0 // GFX9: s_buffer_atomic_inc_x2 s[10:11], s[4:7], 0x0 ; encoding: 
[0x82,0x02,0xae,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_inc_x2 s[10:11], s[4:7], 0x0 ; encoding: [0x82,0x02,0xac,0xf5,0x00,0x00,0x00,0xfa] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_or s5, s[8:11], s0 // GFX9: s_buffer_atomic_or s5, s[8:11], s0 ; encoding: [0x44,0x01,0x24,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_or s5, s[8:11], s0 ; encoding: [0x44,0x01,0x24,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_or_x2 s[10:11], s[96:99], s0 // GFX9: s_buffer_atomic_or_x2 s[10:11], s[96:99], s0 ; encoding: [0xb0,0x02,0xa4,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_or_x2 s[10:11], s[96:99], s0 ; encoding: [0xb0,0x02,0xa4,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_smax s5, s[4:7], s101 // GFX9: s_buffer_atomic_smax s5, s[4:7], s101 ; encoding: [0x42,0x01,0x18,0xc1,0x65,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_smax s5, s[4:7], s101 ; encoding: [0x42,0x01,0x18,0xf5,0x00,0x00,0x00,0xca] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_smax_x2 s[100:101], s[4:7], s0 // GFX9: s_buffer_atomic_smax_x2 s[100:101], s[4:7], s0 ; encoding: [0x02,0x19,0x98,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_smax_x2 s[100:101], s[4:7], s0 ; encoding: [0x02,0x19,0x98,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_smin s5, s[4:7], 0x0 // GFX9: s_buffer_atomic_smin s5, s[4:7], 0x0 ; encoding: [0x42,0x01,0x12,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_smin s5, s[4:7], 0x0 ; encoding: [0x42,0x01,0x10,0xf5,0x00,0x00,0x00,0xfa] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_smin_x2 s[12:13], s[4:7], s0 // GFX9: s_buffer_atomic_smin_x2 s[12:13], s[4:7], s0 ; encoding: [0x02,0x03,0x90,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_smin_x2 s[12:13], s[4:7], s0 ; encoding: [0x02,0x03,0x90,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_sub s5, s[4:7], s0 glc // GFX9: s_buffer_atomic_sub s5, s[4:7], s0 glc ; encoding: [0x42,0x01,0x0d,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_sub s5, s[4:7], s0 glc ; encoding: [0x42,0x01,0x0d,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_sub_x2 s[10:11], s[4:7], s0 // GFX9: s_buffer_atomic_sub_x2 s[10:11], s[4:7], s0 ; encoding: [0x82,0x02,0x8c,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_sub_x2 s[10:11], s[4:7], s0 ; encoding: [0x82,0x02,0x8c,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_swap s5, s[4:7], s0 // GFX9: s_buffer_atomic_swap s5, s[4:7], s0 ; encoding: [0x42,0x01,0x00,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_swap s5, s[4:7], s0 ; encoding: [0x42,0x01,0x00,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: 
error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_swap_x2 s[10:11], s[4:7], s0 glc // GFX9: s_buffer_atomic_swap_x2 s[10:11], s[4:7], s0 glc ; encoding: [0x82,0x02,0x81,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_swap_x2 s[10:11], s[4:7], s0 glc ; encoding: [0x82,0x02,0x81,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_umax s5, s[4:7], s0 // GFX9: s_buffer_atomic_umax s5, s[4:7], s0 ; encoding: [0x42,0x01,0x1c,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_umax s5, s[4:7], s0 ; encoding: [0x42,0x01,0x1c,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_umax_x2 s[10:11], s[4:7], s0 glc // GFX9: s_buffer_atomic_umax_x2 s[10:11], s[4:7], s0 glc ; encoding: [0x82,0x02,0x9d,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_umax_x2 s[10:11], s[4:7], s0 glc ; encoding: [0x82,0x02,0x9d,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_umin s5, s[4:7], s0 // GFX9: s_buffer_atomic_umin s5, s[4:7], s0 ; encoding: [0x42,0x01,0x14,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_umin s5, s[4:7], s0 ; encoding: [0x42,0x01,0x14,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_umin_x2 s[10:11], s[4:7], s0 glc // GFX9: s_buffer_atomic_umin_x2 s[10:11], s[4:7], s0 glc ; encoding: [0x82,0x02,0x95,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_umin_x2 s[10:11], s[4:7], s0 glc ; encoding: [0x82,0x02,0x95,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_xor s5, s[4:7], s0 // GFX9: s_buffer_atomic_xor s5, s[4:7], s0 ; encoding: [0x42,0x01,0x28,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_xor s5, s[4:7], s0 ; encoding: [0x42,0x01,0x28,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_xor_x2 s[10:11], s[4:7], s0 glc // GFX9: s_buffer_atomic_xor_x2 s[10:11], s[4:7], s0 glc ; encoding: [0x82,0x02,0xa9,0xc1,0x00,0x00,0x00,0x00] // GFX1012: s_buffer_atomic_xor_x2 s[10:11], s[4:7], s0 glc ; encoding: [0x82,0x02,0xa9,0xf5,0x00,0x00,0x00,0x00] -// NOSICIVI: error: instruction not supported on this GPU +// NOSICIVIGFX1030: error: instruction not supported on this GPU //===----------------------------------------------------------------------===// // Unsigned 20-bit offsets (VI+) @@ -633,23 +632,23 @@ s_atc_probe_buffer 0x1, s[8:11], 0xFFFFF // GFX10: s_atc_probe_buffer 1, s[8:11], 0xfffff ; encoding: [0x44,0x00,0x9c,0xf4,0xff,0xff,0x0f,0xfa] s_store_dword s1, s[2:3], 0xFFFFF -// NOSICI: error: instruction not supported on this GPU +// NOSICIGFX1030: error: instruction not supported on this GPU // GFX89: s_store_dword s1, s[2:3], 0xfffff ; encoding: [0x41,0x00,0x42,0xc0,0xff,0xff,0x0f,0x00] -// GFX10: s_store_dword s1, s[2:3], 0xfffff ; encoding: [0x41,0x00,0x40,0xf4,0xff,0xff,0x0f,0xfa] +// GFX1012: s_store_dword s1, s[2:3], 0xfffff ; encoding: [0x41,0x00,0x40,0xf4,0xff,0xff,0x0f,0xfa] s_buffer_store_dword s10, 
s[92:95], 0xFFFFF -// NOSICI: error: instruction not supported on this GPU +// NOSICIGFX1030: error: instruction not supported on this GPU // GFX89: s_buffer_store_dword s10, s[92:95], 0xfffff ; encoding: [0xae,0x02,0x62,0xc0,0xff,0xff,0x0f,0x00] -// GFX10: s_buffer_store_dword s10, s[92:95], 0xfffff ; encoding: [0xae,0x02,0x60,0xf4,0xff,0xff,0x0f,0xfa] +// GFX1012: s_buffer_store_dword s10, s[92:95], 0xfffff ; encoding: [0xae,0x02,0x60,0xf4,0xff,0xff,0x0f,0xfa] s_atomic_swap s5, s[2:3], 0xFFFFF -// NOSICIVI: error: instruction not supported on this GPU -// GFX10: s_atomic_swap s5, s[2:3], 0xfffff ; encoding: [0x41,0x01,0x00,0xf6,0xff,0xff,0x0f,0xfa] +// NOSICIVIGFX1030: error: instruction not supported on this GPU +// GFX1012: s_atomic_swap s5, s[2:3], 0xfffff ; encoding: [0x41,0x01,0x00,0xf6,0xff,0xff,0x0f,0xfa] // GFX9: s_atomic_swap s5, s[2:3], 0xfffff ; encoding: [0x41,0x01,0x02,0xc2,0xff,0xff,0x0f,0x00] s_buffer_atomic_swap s5, s[4:7], 0xFFFFF -// NOSICIVI: error: instruction not supported on this GPU -// GFX10: s_buffer_atomic_swap s5, s[4:7], 0xfffff ; encoding: [0x42,0x01,0x00,0xf5,0xff,0xff,0x0f,0xfa] +// NOSICIVIGFX1030: error: instruction not supported on this GPU +// GFX1012: s_buffer_atomic_swap s5, s[4:7], 0xfffff ; encoding: [0x42,0x01,0x00,0xf5,0xff,0xff,0x0f,0xfa] // GFX9: s_buffer_atomic_swap s5, s[4:7], 0xfffff ; encoding: [0x42,0x01,0x02,0xc1,0xff,0xff,0x0f,0x00] s_atc_probe 0x7, s[4:5], 0x1FFFFF @@ -663,22 +662,22 @@ s_atc_probe_buffer 0x1, s[8:11], 0x1FFFFF // NOVI: error: expected a 20-bit unsigned offset s_store_dword s1, s[2:3], 0x1FFFFF -// NOSICI: error: instruction not supported on this GPU -// NOGFX9: error: expected a 21-bit signed offset +// NOSICIGFX1030: error: instruction not supported on this GPU +// NOGFX9GFX1012: error: expected a 21-bit signed offset // NOVI: error: expected a 20-bit unsigned offset s_buffer_store_dword s10, s[92:95], 0x1FFFFF -// NOSICI: error: instruction not supported on this GPU -// NOGFX9: error: expected a 20-bit unsigned offset +// NOSICIGFX1030: error: instruction not supported on this GPU +// NOGFX9GFX1012: error: expected a 20-bit unsigned offset // NOVI: error: expected a 20-bit unsigned offset s_atomic_swap s5, s[2:3], 0x1FFFFF -// NOSICIVI: error: instruction not supported on this GPU -// NOGFX9: error: expected a 21-bit signed offset +// NOSICIVIGFX1030: error: instruction not supported on this GPU +// NOGFX9GFX1012: error: expected a 21-bit signed offset s_buffer_atomic_swap s5, s[4:7], 0x1FFFFF -// NOSICIVI: error: instruction not supported on this GPU -// NOGFX9: error: expected a 20-bit unsigned offset +// NOSICIVIGFX1030: error: instruction not supported on this GPU +// NOGFX9GFX1012: error: expected a 20-bit unsigned offset //===----------------------------------------------------------------------===// // Signed offsets (gfx9+) @@ -698,13 +697,13 @@ s_atc_probe_buffer 0x1, s[8:11], -1 s_store_dword s1, s[2:3], -1 // NOVI: error: expected a 20-bit unsigned offset // GFX9: s_store_dword s1, s[2:3], -0x1 ; encoding: [0x41,0x00,0x42,0xc0,0xff,0xff,0x1f,0x00] -// GFX10: s_store_dword s1, s[2:3], -0x1 ; encoding: [0x41,0x00,0x40,0xf4,0xff,0xff,0x1f,0xfa] -// NOSICI: error: instruction not supported on this GPU +// GFX1012: s_store_dword s1, s[2:3], -0x1 ; encoding: [0x41,0x00,0x40,0xf4,0xff,0xff,0x1f,0xfa] +// NOSICIGFX1030: error: instruction not supported on this GPU s_buffer_store_dword s10, s[92:95], -1 // NOVI: error: expected a 20-bit unsigned offset -// NOSICI: error: instruction not supported on this GPU -// 
NOGFX9: error: expected a 20-bit unsigned offset +// NOSICIGFX1030: error: instruction not supported on this GPU +// NOGFX9GFX1012: error: expected a 20-bit unsigned offset s_load_dword s1, s[2:3], -1 // NOVI: error: expected a 20-bit unsigned offset @@ -720,13 +719,13 @@ s_buffer_load_dword s10, s[92:95], -1 s_atomic_swap s5, s[2:3], -1 // NOVI: error: instruction not supported on this GPU // GFX9: s_atomic_swap s5, s[2:3], -0x1 ; encoding: [0x41,0x01,0x02,0xc2,0xff,0xff,0x1f,0x00] -// GFX10: s_atomic_swap s5, s[2:3], -0x1 ; encoding: [0x41,0x01,0x00,0xf6,0xff,0xff,0x1f,0xfa] -// NOSICI: error: instruction not supported on this GPU +// GFX1012: s_atomic_swap s5, s[2:3], -0x1 ; encoding: [0x41,0x01,0x00,0xf6,0xff,0xff,0x1f,0xfa] +// NOSICIGFX1030: error: instruction not supported on this GPU s_buffer_atomic_swap s5, s[4:7], -1 // NOVI: error: instruction not supported on this GPU -// NOSICI: error: instruction not supported on this GPU -// NOGFX9: error: expected a 20-bit unsigned offset +// NOSICIGFX1030: error: instruction not supported on this GPU +// NOGFX9GFX1012: error: expected a 20-bit unsigned offset s_atc_probe 0x7, s[4:5], 0xFFFFFFFFFFF00000 // NOSICI: error: instruction not supported on this GPU @@ -740,14 +739,14 @@ s_atc_probe_buffer 0x1, s[8:11], 0xFFFFFFFFFFF00000 // NOVI: error: expected a 20-bit unsigned offset s_store_dword s1, s[2:3], 0xFFFFFFFFFFF00000 -// NOSICI: error: instruction not supported on this GPU -// GFX10: s_store_dword s1, s[2:3], -0x100000 ; encoding: [0x41,0x00,0x40,0xf4,0x00,0x00,0x10,0xfa] +// NOSICIGFX1030: error: instruction not supported on this GPU +// GFX1012: s_store_dword s1, s[2:3], -0x100000 ; encoding: [0x41,0x00,0x40,0xf4,0x00,0x00,0x10,0xfa] // GFX9: s_store_dword s1, s[2:3], -0x100000 ; encoding: [0x41,0x00,0x42,0xc0,0x00,0x00,0x10,0x00] // NOVI: error: expected a 20-bit unsigned offset s_buffer_store_dword s10, s[92:95], 0xFFFFFFFFFFF00000 -// NOSICI: error: instruction not supported on this GPU -// NOGFX9: error: expected a 20-bit unsigned offset +// NOSICIGFX1030: error: instruction not supported on this GPU +// NOGFX9GFX1012: error: expected a 20-bit unsigned offset // NOVI: error: expected a 20-bit unsigned offset s_load_dword s1, s[2:3], 0xFFFFFFFFFFF00000 @@ -762,10 +761,10 @@ s_buffer_load_dword s10, s[92:95], 0xFFFFFFFFFFF00000 // NOVI: error: expected a 20-bit unsigned offset s_atomic_swap s5, s[2:3], 0xFFFFFFFFFFF00000 -// NOSICIVI: error: instruction not supported on this GPU -// GFX10: s_atomic_swap s5, s[2:3], -0x100000 ; encoding: [0x41,0x01,0x00,0xf6,0x00,0x00,0x10,0xfa] +// NOSICIVIGFX1030: error: instruction not supported on this GPU +// GFX1012: s_atomic_swap s5, s[2:3], -0x100000 ; encoding: [0x41,0x01,0x00,0xf6,0x00,0x00,0x10,0xfa] // GFX9: s_atomic_swap s5, s[2:3], -0x100000 ; encoding: [0x41,0x01,0x02,0xc2,0x00,0x00,0x10,0x00] s_buffer_atomic_swap s5, s[4:7], 0xFFFFFFFFFFF00000 -// NOSICIVI: error: instruction not supported on this GPU -// NOGFX9: error: expected a 20-bit unsigned offset +// NOSICIVIGFX1030: error: instruction not supported on this GPU +// NOGFX9GFX1012: error: expected a 20-bit unsigned offset diff --git a/llvm/test/MC/AMDGPU/smrd-err.s b/llvm/test/MC/AMDGPU/smrd-err.s index 68f2ac6570c90..5017a1ac59e3a 100644 --- a/llvm/test/MC/AMDGPU/smrd-err.s +++ b/llvm/test/MC/AMDGPU/smrd-err.s @@ -1,14 +1,14 @@ -// RUN: llvm-mc -arch=amdgcn -mcpu=tahiti %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=NOVI 
--implicit-check-not=error: %s +// RUN: llvm-mc -arch=amdgcn -mcpu=tahiti %s | FileCheck -check-prefix=SI %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga %s 2>&1 | FileCheck -check-prefix=NOVI --implicit-check-not=error: %s s_load_dwordx4 s[100:103], s[2:3], s4 -// NOVI: error: not a valid operand +// NOVI: error: register not available on this GPU // SI: s_load_dwordx4 s[100:103], s[2:3], s4 s_load_dwordx8 s[96:103], s[2:3], s4 -// NOVI: error: not a valid operand +// NOVI: error: register not available on this GPU // SI: s_load_dwordx8 s[96:103], s[2:3], s4 s_load_dwordx16 s[88:103], s[2:3], s4 -// NOVI: error: not a valid operand +// NOVI: error: register not available on this GPU // SI: s_load_dwordx16 s[88:103], s[2:3], s4 diff --git a/llvm/test/MC/AMDGPU/smrd.s b/llvm/test/MC/AMDGPU/smrd.s index 30f01b2ced1c3..43819935afd02 100644 --- a/llvm/test/MC/AMDGPU/smrd.s +++ b/llvm/test/MC/AMDGPU/smrd.s @@ -105,7 +105,7 @@ s_load_dwordx4 ttmp[4:7], ttmp[2:3], ttmp4 s_load_dwordx4 s[100:103], s[2:3], s4 // GCN: s_load_dwordx4 s[100:103], s[2:3], s4 ; encoding: [0x04,0x02,0xb2,0xc0] -// NOVI: error: not a valid operand +// NOVI: error: register not available on this GPU s_load_dwordx8 s[8:15], s[2:3], 1 // GCN: s_load_dwordx8 s[8:15], s[2:3], 0x1 ; encoding: [0x01,0x03,0xc4,0xc0] @@ -117,7 +117,7 @@ s_load_dwordx8 s[8:15], s[2:3], s4 s_load_dwordx8 s[96:103], s[2:3], s4 // GCN: s_load_dwordx8 s[96:103], s[2:3], s4 ; encoding: [0x04,0x02,0xf0,0xc0] -// NOVI: error: not a valid operand +// NOVI: error: register not available on this GPU s_load_dwordx16 s[16:31], s[2:3], 1 // GCN: s_load_dwordx16 s[16:31], s[2:3], 0x1 ; encoding: [0x01,0x03,0x08,0xc1] @@ -129,7 +129,7 @@ s_load_dwordx16 s[16:31], s[2:3], s4 s_load_dwordx16 s[88:103], s[2:3], s4 // GCN: s_load_dwordx16 s[88:103], s[2:3], s4 ; encoding: [0x04,0x02,0x2c,0xc1] -// NOVI: error: not a valid operand +// NOVI: error: register not available on this GPU s_buffer_load_dword s1, s[4:7], 1 // GCN: s_buffer_load_dword s1, s[4:7], 0x1 ; encoding: [0x01,0x85,0x00,0xc2] @@ -189,7 +189,7 @@ s_buffer_load_dwordx4 ttmp[8:11], ttmp[4:7], ttmp4 s_buffer_load_dwordx4 s[100:103], s[4:7], s4 // GCN: s_buffer_load_dwordx4 s[100:103], s[4:7], s4 ; encoding: [0x04,0x04,0xb2,0xc2] -// NOVI: error: not a valid operand +// NOVI: error: register not available on this GPU s_buffer_load_dwordx8 s[8:15], s[4:7], 1 // GCN: s_buffer_load_dwordx8 s[8:15], s[4:7], 0x1 ; encoding: [0x01,0x05,0xc4,0xc2] @@ -201,7 +201,7 @@ s_buffer_load_dwordx8 s[8:15], s[4:7], s4 s_buffer_load_dwordx8 s[96:103], s[4:7], s4 // GCN: s_buffer_load_dwordx8 s[96:103], s[4:7], s4 ; encoding: [0x04,0x04,0xf0,0xc2] -// NOVI: error: not a valid operand +// NOVI: error: register not available on this GPU s_buffer_load_dwordx16 s[16:31], s[4:7], 1 // GCN: s_buffer_load_dwordx16 s[16:31], s[4:7], 0x1 ; encoding: [0x01,0x05,0x08,0xc3] @@ -213,7 +213,7 @@ s_buffer_load_dwordx16 s[16:31], s[4:7], s4 s_buffer_load_dwordx16 s[88:103], s[4:7], s4 // GCN: s_buffer_load_dwordx16 s[88:103], s[4:7], s4 ; encoding: [0x04,0x04,0x2c,0xc3] -// NOVI: error: not a valid operand +// NOVI: error: register not available on this GPU s_dcache_inv // GCN: s_dcache_inv ; encoding: [0x00,0x00,0xc0,0xc7] diff --git a/llvm/test/MC/AMDGPU/sop1-err.s b/llvm/test/MC/AMDGPU/sop1-err.s index 6322f5b098c35..fe2a02154106b 100644 --- a/llvm/test/MC/AMDGPU/sop1-err.s +++ b/llvm/test/MC/AMDGPU/sop1-err.s @@ -9,16 +9,16 @@ s_mov_b32 s1, v0 // GCN: error: invalid operand for instruction s_mov_b32 s[1:2], s0 -// GCN: error: not a 
valid operand +// GCN: error: invalid register alignment s_mov_b32 s0, s[1:2] -// GCN: error: not a valid operand +// GCN: error: invalid register alignment s_mov_b32 s220, s0 -// GCN: error: not a valid operand +// GCN: error: register index is out of range s_mov_b32 s0, s220 -// GCN: error: not a valid operand +// GCN: error: register index is out of range s_mov_b64 s1, s[0:1] // GCN: error: invalid operand for instruction @@ -32,13 +32,10 @@ s_mov_b32 s // Out of range register s_mov_b32 s102, 1 -// VI: error: not a valid operand -// SI: s_mov_b32 s102, 1 +// VI: error: register not available on this GPU s_mov_b32 s103, 1 -// VI: error: not a valid operand -// SI: s_mov_b32 s103, 1 +// VI: error: register not available on this GPU s_mov_b64 s[102:103], -1 -// VI: error: not a valid operand -// SI: s_mov_b64 s[102:103], -1 +// VI: error: register not available on this GPU diff --git a/llvm/test/MC/AMDGPU/sop1.s b/llvm/test/MC/AMDGPU/sop1.s index dafbf650b6715..3b0bafd4ae2c2 100644 --- a/llvm/test/MC/AMDGPU/sop1.s +++ b/llvm/test/MC/AMDGPU/sop1.s @@ -42,8 +42,8 @@ s_mov_b64 s[2:3], s[4:5] s_mov_b64 null, s[4:5] // GFX10: s_mov_b64 null, s[4:5] ; encoding: [0x04,0x04,0xfd,0xbe] -// NOSICIVI: error: not a valid operand. -// NOGFX9: error: not a valid operand. +// NOSICIVI: error: 'null' operand is not supported on this GPU +// NOGFX9: error: 'null' operand is not supported on this GPU s_mov_b64 s[2:3], 0xffffffffffffffff // SICI: s_mov_b64 s[2:3], -1 ; encoding: [0xc1,0x04,0x82,0xbe] @@ -62,7 +62,7 @@ s_mov_b64 s[0:1], 0x80000000 s_mov_b64 s[102:103], -1 // SICI: s_mov_b64 s[102:103], -1 ; encoding: [0xc1,0x04,0xe6,0xbe] -// NOGFX89: error: not a valid operand +// NOGFX89: error: register not available on this GPU // GFX10: s_mov_b64 s[102:103], -1 ; encoding: [0xc1,0x04,0xe6,0xbe] s_cmov_b32 s1, 200 diff --git a/llvm/test/MC/AMDGPU/sop2.s b/llvm/test/MC/AMDGPU/sop2.s index 89f41a7b3d512..94152bd98695d 100644 --- a/llvm/test/MC/AMDGPU/sop2.s +++ b/llvm/test/MC/AMDGPU/sop2.s @@ -65,8 +65,8 @@ s_and_b32 s2, 0xFFFF0000, -65536 s_and_b64 null, s[4:5], s[6:7] // GFX10: s_and_b64 null, s[4:5], s[6:7] ; encoding: [0x04,0x06,0xfd,0x87] -// NOSICIVI: error: not a valid operand. -// NOGFX9: error: not a valid operand. 
+// NOSICIVI: error: 'null' operand is not supported on this GPU +// NOGFX9: error: 'null' operand is not supported on this GPU s_and_b64 s[2:3], s[4:5], s[6:7] // SICI: s_and_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x87] @@ -235,7 +235,7 @@ s_absdiff_i32 s2, s4, s6 s_add_u32 s101, s102, s103 // SICI: s_add_u32 s101, s102, s103 ; encoding: [0x66,0x67,0x65,0x80] -// NOGFX89: error: not a valid operand +// NOGFX89: error: register not available on this GPU // GFX10: s_add_u32 s101, s102, s103 ; encoding: [0x66,0x67,0x65,0x80] s_lshl1_add_u32 s5, s1, s2 diff --git a/llvm/test/MC/AMDGPU/sopk.s b/llvm/test/MC/AMDGPU/sopk.s index e128df94c611f..14523dcec8567 100644 --- a/llvm/test/MC/AMDGPU/sopk.s +++ b/llvm/test/MC/AMDGPU/sopk.s @@ -19,74 +19,92 @@ s_movk_i32 s2, 0x6 s_cmovk_i32 s2, 0x6 // SICI: s_cmovk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb1] // VI9: s_cmovk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb0] +// GFX10: s_cmovk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb1] s_cmpk_eq_i32 s2, 0x6 // SICI: s_cmpk_eq_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb1] // VI9: s_cmpk_eq_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb1] +// GFX10: s_cmpk_eq_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb1] s_cmpk_lg_i32 s2, 0x6 // SICI: s_cmpk_lg_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb2] // VI9: s_cmpk_lg_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb1] +// GFX10: s_cmpk_lg_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb2] s_cmpk_gt_i32 s2, 0x6 // SICI: s_cmpk_gt_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb2] // VI9: s_cmpk_gt_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb2] +// GFX10: s_cmpk_gt_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb2] s_cmpk_ge_i32 s2, 0x6 // SICI: s_cmpk_ge_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb3] // VI9: s_cmpk_ge_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb2] +// GFX10: s_cmpk_ge_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb3] s_cmpk_lt_i32 s2, 0x6 // SICI: s_cmpk_lt_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb3] // VI9: s_cmpk_lt_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb3] +// GFX10: s_cmpk_lt_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb3] s_cmpk_le_i32 s2, 0x6 // SICI: s_cmpk_le_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb4] // VI9: s_cmpk_le_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb3] +// GFX10: s_cmpk_le_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb4] s_cmpk_eq_u32 s2, 0x6 // SICI: s_cmpk_eq_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb4] // VI9: s_cmpk_eq_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb4] +// GFX10: s_cmpk_eq_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb4] s_cmpk_lg_u32 s2, 0x6 // SICI: s_cmpk_lg_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb5] // VI9: s_cmpk_lg_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb4] +// GFX10: s_cmpk_lg_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb5] s_cmpk_gt_u32 s2, 0x6 // SICI: s_cmpk_gt_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb5] // VI9: s_cmpk_gt_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb5] +// GFX10: s_cmpk_gt_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb5] s_cmpk_ge_u32 s2, 0x6 // SICI: s_cmpk_ge_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb6] // VI9: s_cmpk_ge_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb5] +// GFX10: s_cmpk_ge_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb6] s_cmpk_lt_u32 s2, 0x6 // SICI: s_cmpk_lt_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb6] // VI9: s_cmpk_lt_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb6] +// GFX10: s_cmpk_lt_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb6] s_cmpk_le_u32 s2, 0x6 // SICI: s_cmpk_le_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb7] // VI9: s_cmpk_le_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb6] +// 
GFX10: s_cmpk_le_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb7] s_cmpk_le_u32 s2, 0xFFFF // SICI: s_cmpk_le_u32 s2, 0xffff ; encoding: [0xff,0xff,0x02,0xb7] // VI9: s_cmpk_le_u32 s2, 0xffff ; encoding: [0xff,0xff,0x82,0xb6] +// GFX10: s_cmpk_le_u32 s2, 0xffff ; encoding: [0xff,0xff,0x02,0xb7] s_addk_i32 s2, 0x6 // SICI: s_addk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb7] // VI9: s_addk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb7] +// GFX10: s_addk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb7] s_mulk_i32 s2, 0x6 // SICI: s_mulk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb8] // VI9: s_mulk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb7] +// GFX10: s_mulk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb8] s_mulk_i32 s2, -1 // SICI: s_mulk_i32 s2, 0xffff ; encoding: [0xff,0xff,0x02,0xb8] // VI9: s_mulk_i32 s2, 0xffff ; encoding: [0xff,0xff,0x82,0xb7] +// GFX10: s_mulk_i32 s2, 0xffff ; encoding: [0xff,0xff,0x02,0xb8] s_mulk_i32 s2, 0xFFFF // SICI: s_mulk_i32 s2, 0xffff ; encoding: [0xff,0xff,0x02,0xb8] // VI9: s_mulk_i32 s2, 0xffff ; encoding: [0xff,0xff,0x82,0xb7] +// GFX10: s_mulk_i32 s2, 0xffff ; encoding: [0xff,0xff,0x02,0xb8] s_cbranch_i_fork s[2:3], 0x6 // SICI: s_cbranch_i_fork s[2:3], 6 ; encoding: [0x06,0x00,0x82,0xb8] @@ -100,26 +118,31 @@ s_cbranch_i_fork s[2:3], 0x6 s_getreg_b32 s2, 0x6 // SICI: s_getreg_b32 s2, hwreg(HW_REG_LDS_ALLOC, 0, 1) ; encoding: [0x06,0x00,0x02,0xb9] // VI9: s_getreg_b32 s2, hwreg(HW_REG_LDS_ALLOC, 0, 1) ; encoding: [0x06,0x00,0x82,0xb8] +// GFX10: s_getreg_b32 s2, hwreg(HW_REG_LDS_ALLOC, 0, 1) ; encoding: [0x06,0x00,0x02,0xb9] // HW register identifier, non-default offset/width s_getreg_b32 s2, hwreg(HW_REG_GPR_ALLOC, 1, 31) // SICI: s_getreg_b32 s2, hwreg(HW_REG_GPR_ALLOC, 1, 31) ; encoding: [0x45,0xf0,0x02,0xb9] // VI9: s_getreg_b32 s2, hwreg(HW_REG_GPR_ALLOC, 1, 31) ; encoding: [0x45,0xf0,0x82,0xb8] +// GFX10: s_getreg_b32 s2, hwreg(HW_REG_GPR_ALLOC, 1, 31) ; encoding: [0x45,0xf0,0x02,0xb9] // HW register code of unknown HW register, non-default offset/width s_getreg_b32 s2, hwreg(51, 1, 31) // SICI: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9] // VI9: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8] +// GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9] // HW register code of unknown HW register, default offset/width s_getreg_b32 s2, hwreg(51) // SICI: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9] // VI9: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x82,0xb8] +// GFX10: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9] // HW register code of unknown HW register, valid symbolic name range but no name available s_getreg_b32 s2, hwreg(10) // SICI: s_getreg_b32 s2, hwreg(10) ; encoding: [0x0a,0xf8,0x02,0xb9] // VI9: s_getreg_b32 s2, hwreg(10) ; encoding: [0x0a,0xf8,0x82,0xb8] +// GFX10: s_getreg_b32 s2, hwreg(10) ; encoding: [0x0a,0xf8,0x02,0xb9] // HW_REG_SH_MEM_BASES valid starting from GFX9 s_getreg_b32 s2, hwreg(15) @@ -183,31 +206,37 @@ s_getreg_b32 s2, hwreg(25) s_setreg_b32 0x6, s2 // SICI: s_setreg_b32 hwreg(HW_REG_LDS_ALLOC, 0, 1), s2 ; encoding: [0x06,0x00,0x82,0xb9] // VI9: s_setreg_b32 hwreg(HW_REG_LDS_ALLOC, 0, 1), s2 ; encoding: [0x06,0x00,0x02,0xb9] +// GFX10: s_setreg_b32 hwreg(HW_REG_LDS_ALLOC, 0, 1), s2 ; encoding: [0x06,0x00,0x82,0xb9] // raw number mapped to unknown HW register s_setreg_b32 0x33, s2 // SICI: s_setreg_b32 hwreg(51, 0, 1), s2 ; encoding: [0x33,0x00,0x82,0xb9] // VI9: s_setreg_b32 hwreg(51, 0, 1), s2 ; encoding: [0x33,0x00,0x02,0xb9] 
+// GFX10: s_setreg_b32 hwreg(51, 0, 1), s2 ; encoding: [0x33,0x00,0x82,0xb9] // raw number mapped to known HW register, default offset/width s_setreg_b32 0xf803, s2 // SICI: s_setreg_b32 hwreg(HW_REG_TRAPSTS), s2 ; encoding: [0x03,0xf8,0x82,0xb9] // VI9: s_setreg_b32 hwreg(HW_REG_TRAPSTS), s2 ; encoding: [0x03,0xf8,0x02,0xb9] +// GFX10: s_setreg_b32 hwreg(HW_REG_TRAPSTS), s2 ; encoding: [0x03,0xf8,0x82,0xb9] // HW register identifier, default offset/width implied s_setreg_b32 hwreg(HW_REG_HW_ID), s2 // SICI: s_setreg_b32 hwreg(HW_REG_HW_ID), s2 ; encoding: [0x04,0xf8,0x82,0xb9] // VI9: s_setreg_b32 hwreg(HW_REG_HW_ID), s2 ; encoding: [0x04,0xf8,0x02,0xb9] +// GFX10: s_setreg_b32 hwreg(HW_REG_HW_ID), s2 ; encoding: [0x04,0xf8,0x82,0xb9] // HW register identifier, non-default offset/width s_setreg_b32 hwreg(HW_REG_GPR_ALLOC, 1, 31), s2 // SICI: s_setreg_b32 hwreg(HW_REG_GPR_ALLOC, 1, 31), s2 ; encoding: [0x45,0xf0,0x82,0xb9] // VI9: s_setreg_b32 hwreg(HW_REG_GPR_ALLOC, 1, 31), s2 ; encoding: [0x45,0xf0,0x02,0xb9] +// GFX10: s_setreg_b32 hwreg(HW_REG_GPR_ALLOC, 1, 31), s2 ; encoding: [0x45,0xf0,0x82,0xb9] // HW register code of unknown HW register, valid symbolic name range but no name available s_setreg_b32 hwreg(10), s2 // SICI: s_setreg_b32 hwreg(10), s2 ; encoding: [0x0a,0xf8,0x82,0xb9] // VI9: s_setreg_b32 hwreg(10), s2 ; encoding: [0x0a,0xf8,0x02,0xb9] +// GFX10: s_setreg_b32 hwreg(10), s2 ; encoding: [0x0a,0xf8,0x82,0xb9] // HW_REG_SH_MEM_BASES valid starting from GFX9 s_setreg_b32 hwreg(15), s2 @@ -271,16 +300,19 @@ s_setreg_b32 hwreg(25), s2 s_setreg_b32 hwreg(5, 1, 31), s2 // SICI: s_setreg_b32 hwreg(HW_REG_GPR_ALLOC, 1, 31), s2 ; encoding: [0x45,0xf0,0x82,0xb9] // VI9: s_setreg_b32 hwreg(HW_REG_GPR_ALLOC, 1, 31), s2 ; encoding: [0x45,0xf0,0x02,0xb9] +// GFX10: s_setreg_b32 hwreg(HW_REG_GPR_ALLOC, 1, 31), s2 ; encoding: [0x45,0xf0,0x82,0xb9] // raw number mapped to known HW register s_setreg_imm32_b32 0x6, 0xff // SICI: s_setreg_imm32_b32 hwreg(HW_REG_LDS_ALLOC, 0, 1), 0xff ; encoding: [0x06,0x00,0x80,0xba,0xff,0x00,0x00,0x00] // VI9: s_setreg_imm32_b32 hwreg(HW_REG_LDS_ALLOC, 0, 1), 0xff ; encoding: [0x06,0x00,0x00,0xba,0xff,0x00,0x00,0x00] +// GFX10: s_setreg_imm32_b32 hwreg(HW_REG_LDS_ALLOC, 0, 1), 0xff ; encoding: [0x06,0x00,0x80,0xba,0xff,0x00,0x00,0x00] // HW register identifier, non-default offset/width s_setreg_imm32_b32 hwreg(HW_REG_GPR_ALLOC, 1, 31), 0xff // SICI: s_setreg_imm32_b32 hwreg(HW_REG_GPR_ALLOC, 1, 31), 0xff ; encoding: [0x45,0xf0,0x80,0xba,0xff,0x00,0x00,0x00] // VI9: s_setreg_imm32_b32 hwreg(HW_REG_GPR_ALLOC, 1, 31), 0xff ; encoding: [0x45,0xf0,0x00,0xba,0xff,0x00,0x00,0x00] +// GFX10: s_setreg_imm32_b32 hwreg(HW_REG_GPR_ALLOC, 1, 31), 0xff ; encoding: [0x45,0xf0,0x80,0xba,0xff,0x00,0x00,0x00] //===----------------------------------------------------------------------===// // expressions and hwreg macro @@ -290,16 +322,19 @@ hwreg=6 s_getreg_b32 s2, hwreg // SICI: s_getreg_b32 s2, hwreg(HW_REG_LDS_ALLOC, 0, 1) ; encoding: [0x06,0x00,0x02,0xb9] // VI9: s_getreg_b32 s2, hwreg(HW_REG_LDS_ALLOC, 0, 1) ; encoding: [0x06,0x00,0x82,0xb8] +// GFX10: s_getreg_b32 s2, hwreg(HW_REG_LDS_ALLOC, 0, 1) ; encoding: [0x06,0x00,0x02,0xb9] x=5 s_getreg_b32 s2, x+1 // SICI: s_getreg_b32 s2, hwreg(HW_REG_LDS_ALLOC, 0, 1) ; encoding: [0x06,0x00,0x02,0xb9] // VI9: s_getreg_b32 s2, hwreg(HW_REG_LDS_ALLOC, 0, 1) ; encoding: [0x06,0x00,0x82,0xb8] +// GFX10: s_getreg_b32 s2, hwreg(HW_REG_LDS_ALLOC, 0, 1) ; encoding: [0x06,0x00,0x02,0xb9] x=5 s_getreg_b32 s2, 1+x // SICI: s_getreg_b32 
s2, hwreg(HW_REG_LDS_ALLOC, 0, 1) ; encoding: [0x06,0x00,0x02,0xb9] // VI9: s_getreg_b32 s2, hwreg(HW_REG_LDS_ALLOC, 0, 1) ; encoding: [0x06,0x00,0x82,0xb8] +// GFX10: s_getreg_b32 s2, hwreg(HW_REG_LDS_ALLOC, 0, 1) ; encoding: [0x06,0x00,0x02,0xb9] reg=50 offset=2 @@ -307,10 +342,12 @@ width=30 s_getreg_b32 s2, hwreg(reg + 1, offset - 1, width + 1) // SICI: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9] // VI9: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8] +// GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9] s_getreg_b32 s2, hwreg(1 + reg, -1 + offset, 1 + width) // SICI: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9] // VI9: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8] +// GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9] //===----------------------------------------------------------------------===// // Instructions @@ -319,30 +356,36 @@ s_getreg_b32 s2, hwreg(1 + reg, -1 + offset, 1 + width) s_endpgm_ordered_ps_done // GFX9: s_endpgm_ordered_ps_done ; encoding: [0x00,0x00,0x9e,0xbf] // NOSICIVI: error: instruction not supported on this GPU +// GFX10: s_endpgm_ordered_ps_done ; encoding: [0x00,0x00,0x9e,0xbf] s_call_b64 null, 12609 // GFX10: s_call_b64 null, 12609 ; encoding: [0x41,0x31,0x7d,0xbb] -// NOSICIVI: error: not a valid operand. -// NOGFX9: error: not a valid operand. +// NOSICIVI: error: 'null' operand is not supported on this GPU +// NOGFX9: error: 'null' operand is not supported on this GPU s_call_b64 s[12:13], 12609 // GFX9: s_call_b64 s[12:13], 12609 ; encoding: [0x41,0x31,0x8c,0xba] // NOSICIVI: error: instruction not supported on this GPU +// GFX10: s_call_b64 s[12:13], 12609 ; encoding: [0x41,0x31,0x0c,0xbb] s_call_b64 s[100:101], 12609 // GFX9: s_call_b64 s[100:101], 12609 ; encoding: [0x41,0x31,0xe4,0xba] // NOSICIVI: error: instruction not supported on this GPU +// GFX10: s_call_b64 s[100:101], 12609 ; encoding: [0x41,0x31,0x64,0xbb] s_call_b64 s[10:11], 49617 // GFX9: s_call_b64 s[10:11], 49617 ; encoding: [0xd1,0xc1,0x8a,0xba] // NOSICIVI: error: instruction not supported on this GPU +// GFX10: s_call_b64 s[10:11], 49617 ; encoding: [0xd1,0xc1,0x0a,0xbb] offset = 4 s_call_b64 s[0:1], offset + 4 // GFX9: s_call_b64 s[0:1], 8 ; encoding: [0x08,0x00,0x80,0xba] // NOSICIVI: error: instruction not supported on this GPU +// GFX10: s_call_b64 s[0:1], 8 ; encoding: [0x08,0x00,0x00,0xbb] offset = 4 s_call_b64 s[0:1], 4 + offset // GFX9: s_call_b64 s[0:1], 8 ; encoding: [0x08,0x00,0x80,0xba] // NOSICIVI: error: instruction not supported on this GPU +// GFX10: s_call_b64 s[0:1], 8 ; encoding: [0x08,0x00,0x00,0xbb] diff --git a/llvm/test/MC/AMDGPU/trap.s b/llvm/test/MC/AMDGPU/trap.s index 5d23c1f30d6ed..18296c859642f 100644 --- a/llvm/test/MC/AMDGPU/trap.s +++ b/llvm/test/MC/AMDGPU/trap.s @@ -20,124 +20,124 @@ s_add_u32 ttmp0, ttmp0, 4 s_add_u32 ttmp4, 8, ttmp4 // SICI: s_add_u32 ttmp4, 8, ttmp4 ; encoding: [0x88,0x74,0x74,0x80] // VI: s_add_u32 ttmp4, 8, ttmp4 ; encoding: [0x88,0x74,0x74,0x80] -// GXF9: s_add_u32 ttmp4, 8, ttmp4 ; encoding: [0x88,0x70,0x70,0x80] +// GFX9: s_add_u32 ttmp4, 8, ttmp4 ; encoding: [0x88,0x70,0x70,0x80] s_add_u32 ttmp4, ttmp4, 0x00000100 // SICI: s_add_u32 ttmp4, ttmp4, 0x100 ; encoding: [0x74,0xff,0x74,0x80,0x00,0x01,0x00,0x00] // VI: s_add_u32 ttmp4, ttmp4, 0x100 ; encoding: [0x74,0xff,0x74,0x80,0x00,0x01,0x00,0x00] -// GXF9: s_add_u32 ttmp4, ttmp4, 0x100 ; encoding: [0x70,0xff,0x70,0x80,0x00,0x01,0x00,0x00] 
+// GFX9: s_add_u32 ttmp4, ttmp4, 0x100 ; encoding: [0x70,0xff,0x70,0x80,0x00,0x01,0x00,0x00] s_add_u32 ttmp4, ttmp4, 4 // SICI: s_add_u32 ttmp4, ttmp4, 4 ; encoding: [0x74,0x84,0x74,0x80] // VI: s_add_u32 ttmp4, ttmp4, 4 ; encoding: [0x74,0x84,0x74,0x80] -// GXF9: s_add_u32 ttmp4, ttmp4, 4 ; encoding: [0x70,0x84,0x70,0x80] +// GFX9: s_add_u32 ttmp4, ttmp4, 4 ; encoding: [0x70,0x84,0x70,0x80] s_add_u32 ttmp4, ttmp8, ttmp4 // SICI: s_add_u32 ttmp4, ttmp8, ttmp4 ; encoding: [0x78,0x74,0x74,0x80] // VI: s_add_u32 ttmp4, ttmp8, ttmp4 ; encoding: [0x78,0x74,0x74,0x80] -// GXF9: s_add_u32 ttmp4, ttmp8, ttmp4 ; encoding: [0x74,0x70,0x70,0x80] +// GFX9: s_add_u32 ttmp4, ttmp8, ttmp4 ; encoding: [0x74,0x70,0x70,0x80] s_and_b32 ttmp10, ttmp8, 0x00000080 // SICI: s_and_b32 ttmp10, ttmp8, 0x80 ; encoding: [0x78,0xff,0x7a,0x87,0x80,0x00,0x00,0x00] // VI: s_and_b32 ttmp10, ttmp8, 0x80 ; encoding: [0x78,0xff,0x7a,0x86,0x80,0x00,0x00,0x00] -// GXF9: s_and_b32 ttmp10, ttmp8, 0x80 ; encoding: [0x74,0xff,0x74,0x86,0x80,0x00,0x00,0x00] +// GFX9: s_and_b32 ttmp10, ttmp8, 0x80 ; encoding: [0x74,0xff,0x76,0x86,0x80,0x00,0x00,0x00] s_and_b32 ttmp9, tma_hi, 0x0000ffff // SICI: s_and_b32 ttmp9, tma_hi, 0xffff ; encoding: [0x6f,0xff,0x79,0x87,0xff,0xff,0x00,0x00] // VI: s_and_b32 ttmp9, tma_hi, 0xffff ; encoding: [0x6f,0xff,0x79,0x86,0xff,0xff,0x00,0x00] -// NOGFX9: error: not a valid operand +// NOGFX9: error: register not available on this GPU s_and_b32 ttmp9, ttmp9, 0x000001ff // SICI: s_and_b32 ttmp9, ttmp9, 0x1ff ; encoding: [0x79,0xff,0x79,0x87,0xff,0x01,0x00,0x00] // VI: s_and_b32 ttmp9, ttmp9, 0x1ff ; encoding: [0x79,0xff,0x79,0x86,0xff,0x01,0x00,0x00] -// GXF9: s_and_b32 ttmp9, ttmp9, 0x1ff ; encoding: [0x75,0xff,0x75,0x86,0xff,0x01,0x00,0x00] +// GFX9: s_and_b32 ttmp9, ttmp9, 0x1ff ; encoding: [0x75,0xff,0x75,0x86,0xff,0x01,0x00,0x00] s_and_b32 ttmp9, tma_lo, 0xffff0000 // SICI: s_and_b32 ttmp9, tma_lo, 0xffff0000 ; encoding: [0x6e,0xff,0x79,0x87,0x00,0x00,0xff,0xff] // VI: s_and_b32 ttmp9, tma_lo, 0xffff0000 ; encoding: [0x6e,0xff,0x79,0x86,0x00,0x00,0xff,0xff] -// NOGFX9: error: not a valid operand +// NOGFX9: error: register not available on this GPU s_and_b32 ttmp9, ttmp9, ttmp8 // SICI: s_and_b32 ttmp9, ttmp9, ttmp8 ; encoding: [0x79,0x78,0x79,0x87] // VI: s_and_b32 ttmp9, ttmp9, ttmp8 ; encoding: [0x79,0x78,0x79,0x86] -// GXF9: s_and_b32 ttmp9, ttmp9, ttmp8 ; encoding: [0x75,0x78,0x75,0x86] +// GFX9: s_and_b32 ttmp9, ttmp9, ttmp8 ; encoding: [0x75,0x74,0x75,0x86] s_and_b32 ttmp8, ttmp1, 0x01000000 // SICI: s_and_b32 ttmp8, ttmp1, 0x1000000 ; encoding: [0x71,0xff,0x78,0x87,0x00,0x00,0x00,0x01] // VI: s_and_b32 ttmp8, ttmp1, 0x1000000 ; encoding: [0x71,0xff,0x78,0x86,0x00,0x00,0x00,0x01] -// GXF9: s_and_b32 ttmp8, ttmp1, 0x1000000 ; encoding: [0x6d,0xff,0x74,0x86,0x00,0x00,0x00,0x01] +// GFX9: s_and_b32 ttmp8, ttmp1, 0x1000000 ; encoding: [0x6d,0xff,0x74,0x86,0x00,0x00,0x00,0x01] s_cmp_eq_i32 ttmp8, 0 // SICI: s_cmp_eq_i32 ttmp8, 0 ; encoding: [0x78,0x80,0x00,0xbf] // VI: s_cmp_eq_i32 ttmp8, 0 ; encoding: [0x78,0x80,0x00,0xbf] -// GXF9: s_cmp_eq_i32 ttmp8, 0 ; encoding: [0x74,0x80,0x00,0xbf] +// GFX9: s_cmp_eq_i32 ttmp8, 0 ; encoding: [0x74,0x80,0x00,0xbf] s_cmp_eq_i32 ttmp8, 0x000000fe // SICI: s_cmp_eq_i32 ttmp8, 0xfe ; encoding: [0x78,0xff,0x00,0xbf,0xfe,0x00,0x00,0x00] // VI: s_cmp_eq_i32 ttmp8, 0xfe ; encoding: [0x78,0xff,0x00,0xbf,0xfe,0x00,0x00,0x00] -// GXF9: s_cmp_eq_i32 ttmp8, 0xfe ; encoding: [0x74,0xff,0x00,0xbf,0xfe,0x00,0x00,0x00] +// GFX9: s_cmp_eq_i32 ttmp8, 0xfe ; encoding: 
[0x74,0xff,0x00,0xbf,0xfe,0x00,0x00,0x00] s_lshr_b32 ttmp8, ttmp8, 12 // SICI: s_lshr_b32 ttmp8, ttmp8, 12 ; encoding: [0x78,0x8c,0x78,0x90] // VI: s_lshr_b32 ttmp8, ttmp8, 12 ; encoding: [0x78,0x8c,0x78,0x8f] -// GXF9: s_lshr_b32 ttmp8, ttmp8, 12 ; encoding: [0x74,0x8c,0x74,0x8f] +// GFX9: s_lshr_b32 ttmp8, ttmp8, 12 ; encoding: [0x74,0x8c,0x74,0x8f] v_mov_b32_e32 v1, ttmp8 // SICI: v_mov_b32_e32 v1, ttmp8 ; encoding: [0x78,0x02,0x02,0x7e] // VI: v_mov_b32_e32 v1, ttmp8 ; encoding: [0x78,0x02,0x02,0x7e] -// GXF9: v_mov_b32_e32 v1, ttmp8 ; encoding: [0x74,0x02,0x02,0x7e] +// GFX9: v_mov_b32_e32 v1, ttmp8 ; encoding: [0x74,0x02,0x02,0x7e] s_mov_b32 m0, ttmp8 // SICI: s_mov_b32 m0, ttmp8 ; encoding: [0x78,0x03,0xfc,0xbe] // VI: s_mov_b32 m0, ttmp8 ; encoding: [0x78,0x00,0xfc,0xbe] -// GXF9: s_mov_b32 m0, ttmp8 ; encoding: [0x74,0x00,0xfc,0xbe] +// GFX9: s_mov_b32 m0, ttmp8 ; encoding: [0x74,0x00,0xfc,0xbe] s_mov_b32 ttmp10, 0 // SICI: s_mov_b32 ttmp10, 0 ; encoding: [0x80,0x03,0xfa,0xbe] // VI: s_mov_b32 ttmp10, 0 ; encoding: [0x80,0x00,0xfa,0xbe] -// GXF9: s_mov_b32 ttmp10, 0 ; encoding: [0x80,0x00,0xf6,0xbe] +// GFX9: s_mov_b32 ttmp10, 0 ; encoding: [0x80,0x00,0xf6,0xbe] s_mov_b32 ttmp11, 0x01024fac // SICI: s_mov_b32 ttmp11, 0x1024fac ; encoding: [0xff,0x03,0xfb,0xbe,0xac,0x4f,0x02,0x01] // VI: s_mov_b32 ttmp11, 0x1024fac ; encoding: [0xff,0x00,0xfb,0xbe,0xac,0x4f,0x02,0x01] -// GXF9: s_mov_b32 ttmp11, 0x1024fac ; encoding: [0xff,0x00,0xf7,0xbe,0xac,0x4f,0x02,0x01] +// GFX9: s_mov_b32 ttmp11, 0x1024fac ; encoding: [0xff,0x00,0xf7,0xbe,0xac,0x4f,0x02,0x01] s_mov_b32 ttmp8, m0 // SICI: s_mov_b32 ttmp8, m0 ; encoding: [0x7c,0x03,0xf8,0xbe] // VI: s_mov_b32 ttmp8, m0 ; encoding: [0x7c,0x00,0xf8,0xbe] -// GXF9: s_mov_b32 ttmp8, m0 ; encoding: [0x7c,0x00,0xf4,0xbe] +// GFX9: s_mov_b32 ttmp8, m0 ; encoding: [0x7c,0x00,0xf4,0xbe] s_mov_b32 ttmp8, tma_lo // SICI: s_mov_b32 ttmp8, tma_lo ; encoding: [0x6e,0x03,0xf8,0xbe] // VI: s_mov_b32 ttmp8, tma_lo ; encoding: [0x6e,0x00,0xf8,0xbe] -// NOGFX9: error: not a valid operand +// NOGFX9: error: register not available on this GPU s_mul_i32 ttmp8, 0x00000324, ttmp8 // SICI: s_mul_i32 ttmp8, 0x324, ttmp8 ; encoding: [0xff,0x78,0x78,0x93,0x24,0x03,0x00,0x00] // VI: s_mul_i32 ttmp8, 0x324, ttmp8 ; encoding: [0xff,0x78,0x78,0x92,0x24,0x03,0x00,0x00] -// GXF9: s_mul_i32 ttmp8, 0x324, ttmp8 ; encoding: [0xff,0x74,0x74,0x92,0x24,0x03,0x00,0x00] +// GFX9: s_mul_i32 ttmp8, 0x324, ttmp8 ; encoding: [0xff,0x74,0x74,0x92,0x24,0x03,0x00,0x00] s_or_b32 ttmp9, ttmp9, 0x00280000 // SICI: s_or_b32 ttmp9, ttmp9, 0x280000 ; encoding: [0x79,0xff,0x79,0x88,0x00,0x00,0x28,0x00] // VI: s_or_b32 ttmp9, ttmp9, 0x280000 ; encoding: [0x79,0xff,0x79,0x87,0x00,0x00,0x28,0x00] -// GXF9: s_or_b32 ttmp9, ttmp9, 0x280000 ; encoding: [0x75,0xff,0x75,0x87,0x00,0x00,0x28,0x00] +// GFX9: s_or_b32 ttmp9, ttmp9, 0x280000 ; encoding: [0x75,0xff,0x75,0x87,0x00,0x00,0x28,0x00] // ttmp12..ttmp15 (GFX9 only) s_add_u32 ttmp0, ttmp12, 4 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: s_add_u32 ttmp0, ttmp12, 4 ; encoding: [0x78,0x84,0x6c,0x80] s_add_u32 ttmp0, ttmp13, 4 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: s_add_u32 ttmp0, ttmp13, 4 ; encoding: [0x79,0x84,0x6c,0x80] s_add_u32 ttmp0, ttmp14, 4 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: s_add_u32 ttmp0, ttmp14, 4 ; encoding: [0x7a,0x84,0x6c,0x80] s_add_u32 ttmp0, 
ttmp15, 4 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: s_add_u32 ttmp0, ttmp15, 4 ; encoding: [0x7b,0x84,0x6c,0x80] //===----------------------------------------------------------------------===// @@ -162,31 +162,31 @@ s_mov_b64 exec, [ttmp4,ttmp5] s_mov_b64 tba, ttmp[4:5] // SICI: s_mov_b64 tba, ttmp[4:5] ; encoding: [0x74,0x04,0xec,0xbe] // VI: s_mov_b64 tba, ttmp[4:5] ; encoding: [0x74,0x01,0xec,0xbe] -// NOGFX9: error: not a valid operand +// NOGFX9: error: register not available on this GPU s_mov_b64 ttmp[4:5], tba // SICI: s_mov_b64 ttmp[4:5], tba ; encoding: [0x6c,0x04,0xf4,0xbe] // VI: s_mov_b64 ttmp[4:5], tba ; encoding: [0x6c,0x01,0xf4,0xbe] -// NOGFX9: error: not a valid operand +// NOGFX9: error: register not available on this GPU s_mov_b64 tma, ttmp[4:5] // SICI: s_mov_b64 tma, ttmp[4:5] ; encoding: [0x74,0x04,0xee,0xbe] // VI: s_mov_b64 tma, ttmp[4:5] ; encoding: [0x74,0x01,0xee,0xbe] -// NOGFX9: error: not a valid operand +// NOGFX9: error: register not available on this GPU s_mov_b64 ttmp[4:5], tma // SICI: s_mov_b64 ttmp[4:5], tma ; encoding: [0x6e,0x04,0xf4,0xbe] // VI: s_mov_b64 ttmp[4:5], tma ; encoding: [0x6e,0x01,0xf4,0xbe] -// NOGFX9: error: not a valid operand +// NOGFX9: error: register not available on this GPU // ttmp12..ttmp15 (GFX9 only) s_mov_b64 ttmp[12:13], exec -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: s_mov_b64 ttmp[12:13], exec ; encoding: [0x7e,0x01,0xf8,0xbe] s_mov_b64 ttmp[14:15], exec -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: s_mov_b64 ttmp[14:15], exec ; encoding: [0x7e,0x01,0xfa,0xbe] //===----------------------------------------------------------------------===// @@ -197,25 +197,29 @@ s_mov_b64 ttmp[14:15], exec s_buffer_load_dwordx8 ttmp[0:7], s[0:3], s0 // VI: [0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00] // GFX9: [0x00,0x1b,0x2c,0xc0,0x00,0x00,0x00,0x00] +// SICI: s_buffer_load_dwordx8 ttmp[0:7], s[0:3], s0 ; encoding: [0x00,0x00,0xf8,0xc2] s_buffer_load_dwordx8 ttmp[4:11], s[0:3], s0 // VI: [0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00] // GFX9: [0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00] +// SICI: s_buffer_load_dwordx8 ttmp[4:11], s[0:3], s0 ; encoding: [0x00,0x00,0xfa,0xc2] s_buffer_load_dwordx8 ttmp[8:15], s[0:3], s0 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: [0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00] s_load_dwordx8 ttmp[0:7], s[0:1], s0 // VI: [0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00] // GFX9: [0x00,0x1b,0x0c,0xc0,0x00,0x00,0x00,0x00] +// SICI: s_load_dwordx8 ttmp[0:7], s[0:1], s0 ; encoding: [0x00,0x00,0xf8,0xc0] s_load_dwordx8 ttmp[4:11], s[0:1], s0 // VI: [0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00] // GFX9: [0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00] +// SICI: s_load_dwordx8 ttmp[4:11], s[0:1], s0 ; encoding: [0x00,0x00,0xfa,0xc0] s_load_dwordx8 ttmp[8:15], s[0:1], s0 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: [0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00] //===----------------------------------------------------------------------===// @@ -224,11 +228,11 @@ s_load_dwordx8 ttmp[8:15], s[0:1], s0 //===----------------------------------------------------------------------===// s_buffer_load_dwordx16 ttmp[0:15], s[0:3], s0 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: 
[0x00,0x1b,0x30,0xc0,0x00,0x00,0x00,0x00] s_load_dwordx16 ttmp[0:15], s[0:1], s0 -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: [0x00,0x1b,0x10,0xc0,0x00,0x00,0x00,0x00] //===----------------------------------------------------------------------===// @@ -253,5 +257,5 @@ buffer_atomic_inc v1, off, ttmp[8:11], 56 glc // ttmp12..ttmp15 (GFX9 only) buffer_atomic_inc v1, off, ttmp[12:15], 56 glc -// NOSICIVI: error: not a valid operand +// NOSICIVI: error: register not available on this GPU // GFX9: buffer_atomic_inc v1, off, ttmp[12:15], 56 glc ; encoding: [0x00,0x40,0x2c,0xe1,0x00,0x01,0x1e,0xb8] diff --git a/llvm/test/MC/AMDGPU/vop3.s b/llvm/test/MC/AMDGPU/vop3.s index e5ff3f030a6fc..2c083e7024e3c 100644 --- a/llvm/test/MC/AMDGPU/vop3.s +++ b/llvm/test/MC/AMDGPU/vop3.s @@ -289,17 +289,17 @@ v_mac_f32_e64 v0, -v1, |v2| v_mac_f16_e64 v0, 0.5, flat_scratch_lo // VI: v_mac_f16_e64 v0, 0.5, flat_scratch_lo ; encoding: [0x00,0x00,0x23,0xd1,0xf0,0xcc,0x00,0x00] // NOCI: error: instruction not supported on this GPU -// NOSI: error: not a valid operand. +// NOSI: error: register not available on this GPU v_mac_f16_e64 v0, -4.0, flat_scratch_lo // VI: v_mac_f16_e64 v0, -4.0, flat_scratch_lo ; encoding: [0x00,0x00,0x23,0xd1,0xf7,0xcc,0x00,0x00] // NOCI: error: instruction not supported on this GPU -// NOSI: error: not a valid operand. +// NOSI: error: register not available on this GPU v_mac_f16_e64 v0, flat_scratch_lo, -4.0 // VI: v_mac_f16_e64 v0, flat_scratch_lo, -4.0 ; encoding: [0x00,0x00,0x23,0xd1,0x66,0xee,0x01,0x00] // NOCI: error: instruction not supported on this GPU -// NOSI: error: not a valid operand. +// NOSI: error: register not available on this GPU v_add_u32 v84, vcc, v13, s31 clamp // NOSICI: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/vop_sdwa.s b/llvm/test/MC/AMDGPU/vop_sdwa.s index 88386e046917f..9a4283e73e384 100644 --- a/llvm/test/MC/AMDGPU/vop_sdwa.s +++ b/llvm/test/MC/AMDGPU/vop_sdwa.s @@ -717,8 +717,8 @@ v_mov_b32 v1, s2 dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD // GFX9: v_mov_b32_sdwa v1, exec_lo dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x7e,0x7e,0x10,0x86,0x00] v_mov_b32 v1, exec_lo dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD -// NOSICI: error: not a valid operand. -// NOVI: error: not a valid operand. +// NOSICI: error: register not available on this GPU +// NOVI: error: register not available on this GPU // GFX9: v_mov_b32_sdwa v1, ttmp12 dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x7e,0x78,0x10,0x86,0x00] v_mov_b32_sdwa v1, ttmp12 dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD @@ -735,19 +735,16 @@ v_add_f32 v0, v0, s22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_s // NOSICI: error: invalid operand for instruction // NOVI: error: invalid operand for instruction // NOGFX9: error: invalid operand for instruction -// NO: invalid operand (violates constant bus restrictions) v_add_f32 v0, exec_lo, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 // NOSICI: error: invalid operand for instruction // NOVI: error: invalid operand for instruction -// NOGFX9: error: not a valid operand. 
-// NO: error: not a valid operand +// NOGFX9: error: register not available on this GPU v_add_f32 v0, v1, tba_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 // NOSICI: error: invalid operand for instruction // NOVI: error: invalid operand for instruction -// NOGFX9: error: not a valid operand. -// NO: error: not a valid operand +// NOGFX9: error: register not available on this GPU v_add_f32 v0, v1, tma_hi dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 // NOSICI: error: invalid operand for instruction @@ -760,25 +757,23 @@ v_cmp_eq_f32_sdwa vcc, s1, v2 src0_sel:WORD_1 src1_sel:BYTE_2 // GFX9: v_cmp_eq_f32_sdwa vcc, v1, s22 src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x2c,0x84,0x7c,0x01,0x00,0x05,0x82] v_cmp_eq_f32_sdwa vcc, v1, s22 src0_sel:WORD_1 src1_sel:BYTE_2 -// NOSICI: error: not a valid operand. -// NOVI: error: not a valid operand. +// NOSICI: error: register not available on this GPU +// NOVI: error: register not available on this GPU // GFX9: v_cmp_eq_f32_sdwa ttmp[12:13], v1, v2 src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x04,0x84,0x7c,0x01,0xf8,0x05,0x02] v_cmp_eq_f32_sdwa ttmp[12:13], v1, v2 src0_sel:WORD_1 src1_sel:BYTE_2 // NOSICI: error: invalid operand for instruction // NOVI: error: instruction not supported on this GPU -// NOGFX9: error: not a valid operand. -// NO: error: not a valid operand +// NOGFX9: error: register not available on this GPU v_cmp_eq_f32_sdwa tba, v1, v2 src0_sel:WORD_1 src1_sel:BYTE_2 // NOSICI: error: invalid operand for instruction // NOVI: error: instruction not supported on this GPU -// NOGFX9: error: not a valid operand. -// NO: error: not a valid operand +// NOGFX9: error: register not available on this GPU v_cmp_eq_f32_sdwa tma, v1, v2 src0_sel:WORD_1 src1_sel:BYTE_2 -// NOSICI: error: not a valid operand. -// NOVI: error: not a valid operand. +// NOSICI: error: register not available on this GPU +// NOVI: error: register not available on this GPU // GFX9: v_cmp_eq_f32_sdwa vcc, v1, ttmp15 src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0xf6,0x84,0x7c,0x01,0x00,0x05,0x82] v_cmp_eq_f32_sdwa vcc, v1, ttmp15 src0_sel:WORD_1 src1_sel:BYTE_2 @@ -789,7 +784,7 @@ v_cmp_eq_f32_sdwa vcc, exec_lo, vcc_lo src0_sel:WORD_1 src1_sel:BYTE_2 // NOVI: error: invalid operand for instruction // GFX9: v_ceil_f16_sdwa v5, flat_scratch_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0x66,0x06,0x86,0x00] -// NOSI: error: not a valid operand. +// NOSI: error: register not available on this GPU // NOCI: error: not a valid operand. v_ceil_f16_sdwa v5, flat_scratch_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD diff --git a/llvm/test/MC/AMDGPU/xnack-mask.s b/llvm/test/MC/AMDGPU/xnack-mask.s index 0fa5242d37899..e6e310724d453 100644 --- a/llvm/test/MC/AMDGPU/xnack-mask.s +++ b/llvm/test/MC/AMDGPU/xnack-mask.s @@ -7,25 +7,25 @@ // RUN: not llvm-mc -arch=amdgcn -mcpu=stoney -show-encoding %s | FileCheck -check-prefix=XNACK %s s_mov_b64 xnack_mask, -1 -// NOSICIVI10: error: not a valid operand. +// NOSICIVI10: error: register not available on this GPU // XNACK: s_mov_b64 xnack_mask, -1 ; encoding: [0xc1,0x01,0xe8,0xbe] s_mov_b32 xnack_mask_lo, -1 -// NOSICIVI10: error: not a valid operand. +// NOSICIVI10: error: register not available on this GPU // XNACK: s_mov_b32 xnack_mask_lo, -1 ; encoding: [0xc1,0x00,0xe8,0xbe] s_mov_b32 xnack_mask_hi, -1 -// NOSICIVI10: error: not a valid operand. 
+// NOSICIVI10: error: register not available on this GPU // XNACK: s_mov_b32 xnack_mask_hi, -1 ; encoding: [0xc1,0x00,0xe9,0xbe] s_mov_b32 xnack_mask, -1 -// NOSICIVI10: error: not a valid operand. +// NOSICIVI10: error: register not available on this GPU // XNACKERR: error: invalid operand for instruction s_mov_b64 xnack_mask_lo, -1 -// NOSICIVI10: error: not a valid operand. +// NOSICIVI10: error: register not available on this GPU // XNACKERR: error: invalid operand for instruction s_mov_b64 xnack_mask_hi, -1 -// NOSICIVI10: error: not a valid operand. +// NOSICIVI10: error: register not available on this GPU // XNACKERR: error: invalid operand for instruction diff --git a/llvm/test/MC/ARM/directive-if-subtraction.s b/llvm/test/MC/ARM/directive-if-subtraction.s new file mode 100644 index 0000000000000..edb386593ba63 --- /dev/null +++ b/llvm/test/MC/ARM/directive-if-subtraction.s @@ -0,0 +1,52 @@ +// RUN: llvm-mc -triple armv7a-linux-gnueabihf %s -filetype=obj -o /dev/null 2>&1 | FileCheck --check-prefix=OBJ --allow-empty %s +// RUN: not llvm-mc -triple armv7a-linux-gnueabihf %s -o /dev/null 2>&1 | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc -triple armv7a-linux-gnueabihf %s -filetype=obj -o - | llvm-objdump -d - | FileCheck --check-prefix=DISASM %s + +nop +// Create a new MCDataFragment due to Subtarget change +.arch_extension sec +9997:nop +.if . - 9997b == 0 +// OBJ-NOT:[[@LINE-1]]:5: error: expected absolute expression +// ASM:[[@LINE-2]]:5: error: expected absolute expression +// DISASM: orr r1, r1, #2 +orr r1, r1, #1 +.else +orr r1, r1, #2 +.endif + + + +@ RUN: not llvm-mc -filetype=obj -triple arm-linux-gnueabihf --defsym=ERR=1 %s -o /dev/null 2>&1 | FileCheck --check-prefix=ARM-ERR %s +@ RUN: not llvm-mc -filetype=obj -triple thumbv7a-linux-gnueabihf --defsym=ERR=1 %s -o /dev/null 2>&1 | FileCheck --check-prefix=THUMB2-ERR %s + +.ifdef ERR +9997: nop + .align 4 + nop +.if . - 9997b == 4 +// ARM-ERR:[[@LINE-1]]:5: error: expected absolute expression +.endif + +9997: nop + .space 4 + nop +.if . - 9997b == 4 +// ARM-ERR:[[@LINE-1]]:5: error: expected absolute expression +.endif + +9997: + ldr r0,=0x12345678 + .ltorg + nop +.if . - 9997b == 4 +// ARM-ERR:[[@LINE-1]]:5: error: expected absolute expression +.endif + +9997: nop + b external + nop +.if . 
- 9997b == 4 +// THUMB2-ERR:[[@LINE-1]]:5: error: expected absolute expression +.endif +.endif diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt index 26c50ecc4cf0f..11e1f08be93f4 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt @@ -52,6 +52,30 @@ # GFX10: v_fma_legacy_f32 v0, s1, 2.0, -v3 0x00,0x00,0x40,0xd5,0x01,0xe8,0x0d,0x84 +# GFX10: image_bvh_intersect_ray v[4:7], v[9:24], s[4:7] +0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x00 + +# GFX10: image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16 +0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x40 + +# GFX10: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] +0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x00 + +# GFX10: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16 +0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x40 + +# GFX10: image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40], s[12:15] +0x07,0x9f,0x98,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x00,0x00 + +# GFX10: image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20], s[12:15] a16 +0x05,0x9f,0x98,0xf1,0x32,0x27,0x03,0x40,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x00 + +# GFX10: image_bvh64_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40, v42], s[12:15] +0x07,0x9f,0x9c,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x2a,0x00 + +# GFX10: image_bvh64_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19], s[12:15] a16 +0x05,0x9f,0x9c,0xf1,0x32,0x27,0x03,0x40,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13 + # GFX10: image_msaa_load v[1:4], v5, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D 0x01,0x0f,0x00,0xf0,0x05,0x01,0x02,0x00 diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt index a07f10d1bf6b6..a9c70b713538e 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt @@ -225,6 +225,18 @@ # CHECK: xxblendvd 6, 63, 21, 34 0x05 0x00 0x00 0x00 0x84 0xdf 0xa8 0xbc +# CHECK: setbc 21, 11 +0x7e 0xab 0x03 0x00 + +# CHECK: setbcr 21, 11 +0x7e 0xab 0x03 0x40 + +# CHECK: setnbc 21, 11 +0x7e 0xab 0x03 0x80 + +# CHECK: setnbcr 21, 11 +0x7e 0xab 0x03 0xc0 + # CHECK: vsldbi 2, 3, 4, 5 0x10 0x43 0x21 0x56 diff --git a/llvm/test/MC/ELF/org.s b/llvm/test/MC/ELF/org.s index ec6264f823c27..d8f52311420ee 100644 --- a/llvm/test/MC/ELF/org.s +++ b/llvm/test/MC/ELF/org.s @@ -1,15 +1,21 @@ -// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-readobj -S - | FileCheck %s +# RUN: llvm-mc -filetype=obj -triple x86_64 %s -o - | llvm-readobj -S - | FileCheck %s --strict-whitespace .zero 4 foo: .zero 4 .org foo+16 -// CHECK: Section { -// CHECK: Name: .text -// CHECK-NEXT: Type: -// CHECK-NEXT: Flags [ -// CHECK: ] -// CHECK-NEXT: Address: -// CHECK-NEXT: Offset: -// CHECK-NEXT: Size: 20 +.bss + .zero 1 +# .org is a zero initializer and can appear in a SHT_NOBITS section. 
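+# (Illustrative note, not part of the checked output: the single '.zero 1' byte above leaves .bss at offset 1, and the '.org .bss+5' below zero-fills it up to offset 5, which is why the .bss Size check later in this test expects 5.)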
+ .org .bss+5 + +# CHECK: Section { +# CHECK: Name: .text +# CHECK: Size: +# CHECK-SAME: {{ 20$}} + +# CHECK: Section { +# CHECK: Name: .bss +# CHECK: Size: +# CHECK-SAME: {{ 5$}} diff --git a/llvm/test/MC/MachO/reloc-diff.s b/llvm/test/MC/MachO/reloc-diff.s index 8b2e7606b3542..ba00e7bb1c9ff 100644 --- a/llvm/test/MC/MachO/reloc-diff.s +++ b/llvm/test/MC/MachO/reloc-diff.s @@ -22,9 +22,5 @@ Ltemp: // CHECK-NEXT: 0x0 0 2 n/a GENERIC_RELOC_PAIR 1 0x0 // CHECK-NEXT: 0x8 0 2 n/a GENERIC_RELOC_LOCAL_SECTDIFF 1 0x0 // CHECK-NEXT: 0x0 0 2 n/a GENERIC_RELOC_PAIR 1 0x0 -// CHECK-NEXT: 0x4 0 2 n/a GENERIC_RELOC_LOCAL_SECTDIFF 1 0x0 -// CHECK-NEXT: 0x0 0 2 n/a GENERIC_RELOC_PAIR 1 0x0 -// CHECK-NEXT: 0x0 0 2 n/a GENERIC_RELOC_SECTDIFF 1 0x0 -// CHECK-NEXT: 0x0 0 2 n/a GENERIC_RELOC_PAIR 1 0x0 // CHECK-NEXT: } // CHECK-NEXT: ] diff --git a/llvm/test/MC/PowerPC/pcrel-tls-local-exec-address-load-reloc.s b/llvm/test/MC/PowerPC/pcrel-tls-local-exec-address-load-reloc.s new file mode 100644 index 0000000000000..ae3eb8b886623 --- /dev/null +++ b/llvm/test/MC/PowerPC/pcrel-tls-local-exec-address-load-reloc.s @@ -0,0 +1,15 @@ +# RUN: llvm-mc -triple=powerpc64le-unknown-unknown -filetype=obj %s 2>&1 | \ +# RUN: FileCheck %s -check-prefix=MC +# RUN: llvm-mc -triple=powerpc64le-unknown-unknown -filetype=obj %s | \ +# RUN: llvm-readobj -r - | FileCheck %s -check-prefix=READOBJ + +# This test checks that on PowerPC we can correctly convert x@TPREL +# into R_PPC64_TPREL34 for local exec relocations with the address loaded. + +# MC-NOT: error: invalid variant + +# READOBJ: 0x0 R_PPC64_TPREL34 x 0x0 + +LocalExec: + paddi 3, 13, x@TPREL, 0 + blr diff --git a/llvm/test/MC/PowerPC/pcrel-tls-local-exec-value-load-reloc.s b/llvm/test/MC/PowerPC/pcrel-tls-local-exec-value-load-reloc.s new file mode 100644 index 0000000000000..6ebee2ff9cffb --- /dev/null +++ b/llvm/test/MC/PowerPC/pcrel-tls-local-exec-value-load-reloc.s @@ -0,0 +1,16 @@ +# RUN: llvm-mc -triple=powerpc64le-unknown-unknown -filetype=obj %s 2>&1 | \ +# RUN: FileCheck %s -check-prefix=MC +# RUN: llvm-mc -triple=powerpc64le-unknown-unknown -filetype=obj %s | \ +# RUN: llvm-readobj -r - | FileCheck %s -check-prefix=READOBJ + +# This test checks that on PowerPC we can correctly convert x@TPREL +# into R_PPC64_TPREL34 for local exec relocations with the value loaded.
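+# (Background, for reference: in the PPC64 ELF ABI, r13 is the thread pointer, so 'paddi 3, 13, x@TPREL, 0' adds the thread-pointer-relative offset of x to r13, and R_PPC64_TPREL34 patches that offset into the 34-bit immediate of the prefixed add.)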
+ +# MC-NOT: error: invalid variant + +# READOBJ: 0x0 R_PPC64_TPREL34 x 0x0 + +LocalExecLoad: + paddi 3, 13, x@TPREL, 0 + lwz 3, 0(3) + blr diff --git a/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s b/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s index 29e9a7a74bf6f..08cdcc74dc42f 100644 --- a/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s +++ b/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s @@ -351,6 +351,18 @@ # CHECK-LE: xxblendvd 6, 63, 21, 34 # encoding: [0x00,0x00,0x00,0x05, # CHECK-LE-SAME: 0xbc,0xa8,0xdf,0x84] xxblendvd 6, 63, 21, 34 +# CHECK-BE: setbc 21, 11 # encoding: [0x7e,0xab,0x03,0x00] +# CHECK-LE: setbc 21, 11 # encoding: [0x00,0x03,0xab,0x7e] + setbc 21, 11 +# CHECK-BE: setbcr 21, 11 # encoding: [0x7e,0xab,0x03,0x40] +# CHECK-LE: setbcr 21, 11 # encoding: [0x40,0x03,0xab,0x7e] + setbcr 21, 11 +# CHECK-BE: setnbc 21, 11 # encoding: [0x7e,0xab,0x03,0x80] +# CHECK-LE: setnbc 21, 11 # encoding: [0x80,0x03,0xab,0x7e] + setnbc 21, 11 +# CHECK-BE: setnbcr 21, 11 # encoding: [0x7e,0xab,0x03,0xc0] +# CHECK-LE: setnbcr 21, 11 # encoding: [0xc0,0x03,0xab,0x7e] + setnbcr 21, 11 # CHECK-BE: vsldbi 2, 3, 4, 5 # encoding: [0x10,0x43,0x21,0x56] # CHECK-LE: vsldbi 2, 3, 4, 5 # encoding: [0x56,0x21,0x43,0x10] vsldbi 2, 3, 4, 5 diff --git a/llvm/test/MC/WebAssembly/globals.s b/llvm/test/MC/WebAssembly/globals.s index 10d696b7090a7..717d28b2945c5 100644 --- a/llvm/test/MC/WebAssembly/globals.s +++ b/llvm/test/MC/WebAssembly/globals.s @@ -6,7 +6,7 @@ .globl read_global .globl write_global .globaltype foo_global, i32 -.globaltype global2, i64 +.globaltype global2, i64, immutable .globaltype global3, f32 .globaltype global4, f64 @@ -42,6 +42,12 @@ global4: # BIN-NEXT: InitExpr: # BIN-NEXT: Opcode: I32_CONST # BIN-NEXT: Value: 0 +# BIN-NEXT: - Index: 1 +# BIN-NEXT: Type: I64 +# BIN-NEXT: Mutable: false +# BIN-NEXT: InitExpr: +# BIN-NEXT: Opcode: I64_CONST +# BIN-NEXT: Value: 0 # BIN: - Type: CUSTOM # BIN-NEXT: Name: linking diff --git a/llvm/test/MC/X86/x86-16.s b/llvm/test/MC/X86/x86-16.s index f92164e57314a..f1b4428703f10 100644 --- a/llvm/test/MC/X86/x86-16.s +++ b/llvm/test/MC/X86/x86-16.s @@ -1056,3 +1056,8 @@ foo: // CHECK: encoding: [0x0f,0x84,A,A] // CHECK: fixup A - offset: 2, value: foo-2, kind: FK_PCRel_2 {disp32} je foo + +// CHECK: movl nearer, %ebx +// CHECK: encoding: [0x66,0x8b,0x1e,A,A] +// CHECK: fixup A - offset: 3, value: nearer, kind: FK_Data_2 +movl nearer, %ebx diff --git a/llvm/test/ObjectYAML/MachO/DWARF-debug_ranges.yaml b/llvm/test/ObjectYAML/MachO/DWARF-debug_ranges.yaml index 8948bf92b7d76..5aea820145cf7 100644 --- a/llvm/test/ObjectYAML/MachO/DWARF-debug_ranges.yaml +++ b/llvm/test/ObjectYAML/MachO/DWARF-debug_ranges.yaml @@ -239,3 +239,72 @@ DWARF: - AbbrCode: 0x00000000 Values: [] ... + +## Test generating and dumping an empty __debug_ranges section. + +# RUN: yaml2obj --docnum=2 %s | obj2yaml | FileCheck %s --check-prefix=EMPTY + +# EMPTY: DWARF: +# EMPTY-NEXT: debug_ranges: [] +# EMPTY-NEXT: ... 
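+## (Note: 'size: [[SIZE=0]]' and 'content: [[CONTENT=]]' in the document below use yaml2obj's [[KEY=default]] macro syntax, so the same document is reused by a later RUN line that overrides them with -DSIZE=3 -DCONTENT='010203'.)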
+ +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x01000007 + cpusubtype: 0x00000003 + filetype: 0x0000000A + ncmds: 1 + sizeofcmds: 232 + flags: 0x00000000 + reserved: 0x00000000 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __DWARF + vmaddr: 0x00 + vmsize: 0x00 + fileoff: 0x00 + filesize: 0x00 + maxprot: 0 + initprot: 0 + nsects: 1 + flags: 0 + Sections: + - sectname: __debug_ranges + segname: __DWARF + addr: 0x00 + size: [[SIZE=0]] + offset: 0x210 + align: 0 + reloff: 0x00000000 + nreloc: 0 + flags: 0x00000000 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + content: [[CONTENT=]] + +## Test generating and dumping a __debug_ranges section whose size isn't a +## multiple of the address size. This test case is to ensure that when the +## parser fails, the content of the __debug_ranges section will be dumped into +## the 'content' entry and the 'debug_ranges' entry will not exist. + +# RUN: yaml2obj --docnum=2 -DSIZE=3 -DCONTENT='010203' %s | obj2yaml | FileCheck %s --check-prefix=FAILS + +# FAILS-NOT: DWARF: +# FAILS: Sections: +# FAILS-NEXT: - sectname: __debug_ranges +# FAILS-NEXT: segname: __DWARF +# FAILS-NEXT: addr: 0x0000000000000000 +# FAILS-NEXT: size: 3 +# FAILS-NEXT: offset: 0x00000210 +# FAILS-NEXT: align: 0 +# FAILS-NEXT: reloff: 0x00000000 +# FAILS-NEXT: nreloc: 0 +# FAILS-NEXT: flags: 0x00000000 +# FAILS-NEXT: reserved1: 0x00000000 +# FAILS-NEXT: reserved2: 0x00000000 +# FAILS-NEXT: reserved3: 0x00000000 +# FAILS-NEXT: content: '010203' +# FAILS-NEXT: ... diff --git a/llvm/test/ObjectYAML/MachO/DWARF-debug_str.yaml b/llvm/test/ObjectYAML/MachO/DWARF-debug_str.yaml index 29247b334a1a9..9bb55ea350911 100644 --- a/llvm/test/ObjectYAML/MachO/DWARF-debug_str.yaml +++ b/llvm/test/ObjectYAML/MachO/DWARF-debug_str.yaml @@ -321,3 +321,61 @@ DWARF: # EMPTY-STRING-NEXT: debug_str: # EMPTY-STRING-NEXT: - '' # EMPTY-STRING-NEXT: ... + +## d) Test generating and dumping a __debug_str section which contains a string without a null terminator. + +# RUN: yaml2obj --docnum=3 %s | obj2yaml | FileCheck %s --check-prefix=NO-TERMINATOR + +# NO-TERMINATOR-NOT: DWARF: +# NO-TERMINATOR: Sections: +# NO-TERMINATOR-NEXT: - sectname: __debug_str +# NO-TERMINATOR-NEXT: segname: __DWARF +# NO-TERMINATOR-NEXT: addr: 0x0000000000000000 +# NO-TERMINATOR-NEXT: size: 7 +# NO-TERMINATOR-NEXT: offset: 0x00000210 +# NO-TERMINATOR-NEXT: align: 0 +# NO-TERMINATOR-NEXT: reloff: 0x00000000 +# NO-TERMINATOR-NEXT: nreloc: 0 +# NO-TERMINATOR-NEXT: flags: 0x00000000 +# NO-TERMINATOR-NEXT: reserved1: 0x00000000 +# NO-TERMINATOR-NEXT: reserved2: 0x00000000 +# NO-TERMINATOR-NEXT: reserved3: 0x00000000 +# NO-TERMINATOR-NEXT: content: '61626300616263' +# NO-TERMINATOR-NEXT: ... 
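+## (Note: '61626300616263' decodes to "abc\0abc"; the second "abc" has no terminating NUL, so obj2yaml cannot dump the section as a list of strings and falls back to the raw 'content' form verified by the NO-TERMINATOR checks above.)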
+ +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x01000007 + cpusubtype: 0x00000003 + filetype: 0x0000000A + ncmds: 1 + sizeofcmds: 232 + flags: 0x00000000 + reserved: 0x00000000 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __DWARF + vmaddr: 0x00 + vmsize: 0x00 + fileoff: 0x00 + filesize: 0x00 + maxprot: 0 + initprot: 0 + nsects: 1 + flags: 0 + Sections: + - sectname: __debug_str + segname: __DWARF + addr: 0x00 + size: 7 + offset: 0x210 + align: 0 + reloff: 0x00000000 + nreloc: 0 + flags: 0x00000000 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + content: '61626300616263' ## "abc\0abc" diff --git a/llvm/test/Other/constant-fold-gep.ll b/llvm/test/Other/constant-fold-gep.ll index 8028b4fff9870..8be214713d5ce 100644 --- a/llvm/test/Other/constant-fold-gep.ll +++ b/llvm/test/Other/constant-fold-gep.ll @@ -11,7 +11,8 @@ ; RUN: opt -S -o - -instcombine -globalopt -data-layout="e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64" < %s | FileCheck --check-prefix=TO %s ; "SCEV" - ScalarEvolution with default target layout -; RUN: opt -analyze -scalar-evolution < %s | FileCheck --check-prefix=SCEV %s +; RUN: opt -analyze -scalar-evolution < %s -enable-new-pm=0 | FileCheck --check-prefix=SCEV %s +; RUN: opt -passes='print' < %s -disable-output 2>&1 | FileCheck --check-prefix=SCEV %s ; The automatic constant folder in opt does not have targetdata access, so diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll index e606e7cfac171..58ed6b2a0820a 100644 --- a/llvm/test/Other/opt-O2-pipeline.ll +++ b/llvm/test/Other/opt-O2-pipeline.ll @@ -111,6 +111,8 @@ ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Rotate Loops ; CHECK-NEXT: Memory SSA +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Unswitch loops @@ -168,6 +170,8 @@ ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass ; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Post-Dominator Tree Construction @@ -270,10 +274,10 @@ ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass ; CHECK-NEXT: Scalar Evolution Analysis -; CHECK-NEXT: Loop Pass Manager -; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Lazy Branch Probability Analysis ; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Loop Pass Manager +; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Warn about non-applied transformations ; CHECK-NEXT: Alignment from assumptions diff --git a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll index aaee6f786bac9..493957e865d4f 100644 --- a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll +++ b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll @@ -116,6 +116,8 @@ ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Rotate Loops ; CHECK-NEXT: Memory SSA +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Unswitch loops @@ -173,6 +175,8 @@ ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass ; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block 
Frequency Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Post-Dominator Tree Construction @@ -282,10 +286,10 @@ ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass ; CHECK-NEXT: Scalar Evolution Analysis -; CHECK-NEXT: Loop Pass Manager -; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Lazy Branch Probability Analysis ; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Loop Pass Manager +; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Warn about non-applied transformations ; CHECK-NEXT: Alignment from assumptions diff --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll index b2d2f85ae21be..f674dabd52173 100644 --- a/llvm/test/Other/opt-O3-pipeline.ll +++ b/llvm/test/Other/opt-O3-pipeline.ll @@ -116,6 +116,8 @@ ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Rotate Loops ; CHECK-NEXT: Memory SSA +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Unswitch loops @@ -173,6 +175,8 @@ ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass ; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Post-Dominator Tree Construction @@ -275,10 +279,10 @@ ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass ; CHECK-NEXT: Scalar Evolution Analysis -; CHECK-NEXT: Loop Pass Manager -; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Lazy Branch Probability Analysis ; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Loop Pass Manager +; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Warn about non-applied transformations ; CHECK-NEXT: Alignment from assumptions diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll index cc91707c4b009..66df666a64c69 100644 --- a/llvm/test/Other/opt-Os-pipeline.ll +++ b/llvm/test/Other/opt-Os-pipeline.ll @@ -97,6 +97,8 @@ ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Rotate Loops ; CHECK-NEXT: Memory SSA +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Unswitch loops @@ -154,6 +156,8 @@ ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass ; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Post-Dominator Tree Construction @@ -256,10 +260,10 @@ ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass ; CHECK-NEXT: Scalar Evolution Analysis -; CHECK-NEXT: Loop Pass Manager -; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Lazy Branch Probability Analysis ; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Loop Pass Manager +; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Warn about non-applied transformations ; CHECK-NEXT: Alignment from assumptions diff --git a/llvm/test/Other/opt-hot-cold-split.ll b/llvm/test/Other/opt-hot-cold-split.ll index f43f3a3d893ce..cd01314f1f7e1 100644 --- a/llvm/test/Other/opt-hot-cold-split.ll +++ 
b/llvm/test/Other/opt-hot-cold-split.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=x86_64-- -Os -hot-cold-split=true -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=DEFAULT-Os +; RUN: opt -mtriple=x86_64-- -Os -hot-cold-split=true -debug-pass=Structure -enable-new-pm=0 < %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=DEFAULT-Os ; RUN: opt -mtriple=x86_64-- -hot-cold-split=true -passes='lto-pre-link' -debug-pass-manager < %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=LTO-PRELINK-Os ; RUN: opt -mtriple=x86_64-- -hot-cold-split=true -passes='thinlto-pre-link' -debug-pass-manager < %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=THINLTO-PRELINK-Os ; RUN: opt -mtriple=x86_64-- -hot-cold-split=true -passes='lto' -debug-pass-manager < %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=LTO-POSTLINK-Os diff --git a/llvm/test/TableGen/AllowDuplicateRegisterNames.td b/llvm/test/TableGen/AllowDuplicateRegisterNames.td index 2ba63c434ca5f..897a628fe64b8 100644 --- a/llvm/test/TableGen/AllowDuplicateRegisterNames.td +++ b/llvm/test/TableGen/AllowDuplicateRegisterNames.td @@ -27,7 +27,7 @@ class ArchReg alt, list altidx> def ABIRegAltName : RegAltNameIndex; -foreach i = 0-3 in { +foreach i = 0...3 in { def R#i#_32 : ArchReg<"r"#i, ["x"#i], [ABIRegAltName]>; def R#i#_64 : ArchReg<"r"#i, ["x"#i], [ABIRegAltName]>; } diff --git a/llvm/test/TableGen/BigEncoder.td b/llvm/test/TableGen/BigEncoder.td index 5c4bc016e269c..9b9d382433508 100644 --- a/llvm/test/TableGen/BigEncoder.td +++ b/llvm/test/TableGen/BigEncoder.td @@ -19,8 +19,8 @@ def foo : Instruction { let InOperandList = (ins i32imm:$factor); field bits<65> Inst; bits<32> factor; - let Inst{7-0} = 0xAA; - let Inst{14-8} = factor{6-0}; // no offset + let Inst{7...0} = 0xAA; + let Inst{14...8} = factor{6...0}; // no offset let AsmString = "foo $factor"; field bits<16> SoftFail = 0; } @@ -29,8 +29,8 @@ def bar : Instruction { let InOperandList = (ins i32imm:$factor); field bits<65> Inst; bits<32> factor; - let Inst{7-0} = 0xBB; - let Inst{15-8} = factor{10-3}; // offset by 3 + let Inst{7...0} = 0xBB; + let Inst{15...8} = factor{10...3}; // offset by 3 let AsmString = "bar $factor"; field bits<16> SoftFail = 0; } @@ -39,8 +39,8 @@ def biz : Instruction { let InOperandList = (ins i32imm:$factor); field bits<65> Inst; bits<32> factor; - let Inst{7-0} = 0xCC; - let Inst{11-8,15-12} = factor{10-3}; // offset by 3, multipart + let Inst{7...0} = 0xCC; + let Inst{11...8,15...12} = factor{10...3}; // offset by 3, multipart let AsmString = "biz $factor"; field bits<16> SoftFail = 0; } diff --git a/llvm/test/TableGen/BitOffsetDecoder.td b/llvm/test/TableGen/BitOffsetDecoder.td index a928664398f0f..f94e8d4f09789 100644 --- a/llvm/test/TableGen/BitOffsetDecoder.td +++ b/llvm/test/TableGen/BitOffsetDecoder.td @@ -19,8 +19,8 @@ def foo : Instruction { let InOperandList = (ins i32imm:$factor); field bits<16> Inst; bits<32> factor; - let Inst{7-0} = 0xAA; - let Inst{14-8} = factor{6-0}; // no offset + let Inst{7...0} = 0xAA; + let Inst{14...8} = factor{6...0}; // no offset let AsmString = "foo $factor"; field bits<16> SoftFail = 0; } @@ -29,8 +29,8 @@ def bar : Instruction { let InOperandList = (ins i32imm:$factor); field bits<16> Inst; bits<32> factor; - let Inst{7-0} = 0xBB; - let Inst{15-8} = factor{10-3}; // offset by 3 + let Inst{7...0} = 0xBB; + let Inst{15...8} = factor{10...3}; // offset by 3 let AsmString = "bar $factor"; field bits<16> SoftFail = 0; } @@ -39,8 +39,8 @@ def biz : Instruction { let InOperandList = (ins i32imm:$factor); 
field bits<16> Inst; bits<32> factor; - let Inst{7-0} = 0xCC; - let Inst{11-8,15-12} = factor{10-3}; // offset by 3, multipart + let Inst{7...0} = 0xCC; + let Inst{11...8,15...12} = factor{10...3}; // offset by 3, multipart let AsmString = "biz $factor"; field bits<16> SoftFail = 0; } @@ -49,8 +49,8 @@ def baz : Instruction { let InOperandList = (ins Myi32:$factor); field bits<16> Inst; bits<32> factor; - let Inst{7-0} = 0xDD; - let Inst{15-8} = factor{11-4}; // offset by 4 + custom decode + let Inst{7...0} = 0xDD; + let Inst{15...8} = factor{11...4}; // offset by 4 + custom decode let AsmString = "baz $factor"; field bits<16> SoftFail = 0; } diff --git a/llvm/test/TableGen/BitsInit.td b/llvm/test/TableGen/BitsInit.td index 16d2d07753ad7..c5527aebb9417 100644 --- a/llvm/test/TableGen/BitsInit.td +++ b/llvm/test/TableGen/BitsInit.td @@ -38,8 +38,8 @@ def { bits<2> D8 = { 0 }; // type mismatch. RHS doesn't have enough bits bits<8> E; - let E{7-0} = {0,0,1,?,?,?,?,?}; - let E{3-0} = 0b0010; + let E{7...0} = {0,0,1,?,?,?,?,?}; + let E{3...0} = 0b0010; bits<8> F1 = { 0, 1, 0b1001, 0, 0b0 }; // ok bits<7> F2 = { 0, 1, 0b1001, 0, 0b0 }; // LHS doesn't have enough bits @@ -50,7 +50,7 @@ def { bits<8> G3 = { 0, 1, { 0b1001 }, 0, 0b0 }; // ok bits<16> H; - let H{15-0} = { { 0b11001100 }, 0b00110011 }; + let H{15...0} = { { 0b11001100 }, 0b00110011 }; bits<16> I = { G1, G2 }; // Make sure we can initialise ints with bits<> values. diff --git a/llvm/test/TableGen/DAGDefaultOps.td b/llvm/test/TableGen/DAGDefaultOps.td index 1c98c4d8d07be..702a2232db305 100644 --- a/llvm/test/TableGen/DAGDefaultOps.td +++ b/llvm/test/TableGen/DAGDefaultOps.td @@ -16,10 +16,10 @@ class TestEncoding : Instruction { } class TestReg : Register<"R"#index, []> { - let HWEncoding{15-4} = 0; - let HWEncoding{3-0} = !cast>(index); + let HWEncoding{15...4} = 0; + let HWEncoding{3...0} = !cast>(index); } -foreach i = 0-15 in +foreach i = 0...15 in def "R"#i : TestReg; def Reg : RegisterClass<"TestTarget", [i32], 32, (sequence "R%d", 0, 15)>; @@ -36,11 +36,11 @@ class RRI Opcode> : TestEncoding { field bits<4> src1; field bits<4> src2; field bits<16> imm; - let Inst{31-28} = Opcode; - let Inst{27-24} = dest; - let Inst{23-20} = src1; - let Inst{19-16} = src2; - let Inst{15-0} = imm; + let Inst{31...28} = Opcode; + let Inst{27...24} = dest; + let Inst{23...20} = src1; + let Inst{19...16} = src2; + let Inst{15...0} = imm; } def AddRRI : RRI<"add", 0b0001>; diff --git a/llvm/test/TableGen/ForeachLoop.td b/llvm/test/TableGen/ForeachLoop.td index ce8d44c7526e7..173285b5e722f 100644 --- a/llvm/test/TableGen/ForeachLoop.td +++ b/llvm/test/TableGen/ForeachLoop.td @@ -7,7 +7,7 @@ class Register { // CHECK-NOT: !strconcat -foreach i = 0-3 in +foreach i = 0...3 in def Q#i : Register<"Q"#i, i>; // CHECK: def Q0 @@ -50,7 +50,7 @@ foreach i = [0, 1, 2, 3, 4, 5, 6, 7] in // CHECK: string Name = "R7"; // CHECK: int Index = 7; -foreach i = {0-3,9-7} in { +foreach i = {0...3,9...7} in { def S#i : Register<"Q"#i, i>; def : Register<"T"#i, i>; } diff --git a/llvm/test/TableGen/GlobalISelEmitter.td b/llvm/test/TableGen/GlobalISelEmitter.td index ed7bed3f711f0..acf5cf55320ee 100644 --- a/llvm/test/TableGen/GlobalISelEmitter.td +++ b/llvm/test/TableGen/GlobalISelEmitter.td @@ -78,7 +78,7 @@ def HasC : Predicate<"Subtarget->hasC()"> { let RecomputePerFunction = 1; } // CHECK-NEXT: bool testImmPredicate_APInt(unsigned PredicateID, const APInt &Imm) const override; // CHECK-NEXT: bool testImmPredicate_APFloat(unsigned PredicateID, const APFloat &Imm) const 
override; // CHECK-NEXT: const int64_t *getMatchTable() const override; -// CHECK-NEXT: bool testMIPredicate_MI(unsigned PredicateID, const MachineInstr &MI) const override; +// CHECK-NEXT: bool testMIPredicate_MI(unsigned PredicateID, const MachineInstr &MI, const std::array &Operands) const override; // CHECK-NEXT: #endif // ifdef GET_GLOBALISEL_TEMPORARIES_DECL // CHECK-LABEL: #ifdef GET_GLOBALISEL_TEMPORARIES_INIT @@ -255,7 +255,7 @@ def HasC : Predicate<"Subtarget->hasC()"> { let RecomputePerFunction = 1; } // R19N-NEXT: // MIs[0] src1 // R19N-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, // R19N-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID, -// R19N-NEXT: // MIs[0] Operand 2 +// R19N-NEXT: // MIs[0] complex_rr:src2a:src2b // R19N-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32, // // R19N-NEXT: GIM_CheckComplexPattern, /*MI*/0, /*Op*/2, /*Renderer*/0, GICP_gi_complex_rr, @@ -274,7 +274,7 @@ def HasC : Predicate<"Subtarget->hasC()"> { let RecomputePerFunction = 1; } // R19N-NEXT: // MIs[1] src4 // R19N-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32, // R19N-NEXT: GIM_CheckComplexPattern, /*MI*/1, /*Op*/2, /*Renderer*/1, GICP_gi_complex, -// R19N-NEXT: // MIs[1] Operand 3 +// R19N-NEXT: // MIs[1] complex:src5a:src5b // R19N-NEXT: GIM_CheckType, /*MI*/1, /*Op*/3, /*Type*/GILLT_s32, // R19N-NEXT: GIM_CheckComplexPattern, /*MI*/1, /*Op*/3, /*Renderer*/2, GICP_gi_complex, // R19O-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID, diff --git a/llvm/test/TableGen/GlobalISelEmitterCustomPredicate.td b/llvm/test/TableGen/GlobalISelEmitterCustomPredicate.td index d985ef5da9245..6f6320f6389d0 100644 --- a/llvm/test/TableGen/GlobalISelEmitterCustomPredicate.td +++ b/llvm/test/TableGen/GlobalISelEmitterCustomPredicate.td @@ -45,61 +45,67 @@ def and_or_pat : PatFrag< let GISelPredicateCode = [{ return doesComplexCheck(MI); }]; + let PredicateCodeUsesOperands = 1; } -// CHECK: GIM_Try, /*On fail goto*//*Label 0*/ {{[0-9]+}}, // Rule ID 1 // +// CHECK: GIM_Try, /*On fail goto*//*Label 0*/ 99, // Rule ID 2 // // CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3, // CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_AND, // CHECK-NEXT: // MIs[0] dst // CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32, // CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/Test::DRegsRegClassID, -// CHECK-NEXT: // MIs[0] Operand 1 +// CHECK-NEXT: // MIs[0] src2 // CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, -// CHECK-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] +// CHECK-NEXT: GIM_RecordNamedOperand, /*MI*/0, /*Op*/1, /*StoreIdx*/2, // Name : pred:2:z +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/Test::DRegsRegClassID, +// CHECK-NEXT: // MIs[0] Operand 2 +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/2, // MIs[1] // CHECK-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/3, // CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_OR, // CHECK-NEXT: // MIs[1] Operand 0 -// CHECK-NEXT:GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32, // CHECK-NEXT: // MIs[1] src0 // CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_RecordNamedOperand, /*MI*/1, /*Op*/1, /*StoreIdx*/0, // Name : pred:2:x // CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, 
/*RC*/Test::DRegsRegClassID, // CHECK-NEXT: // MIs[1] src1 // CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_RecordNamedOperand, /*MI*/1, /*Op*/2, /*StoreIdx*/1, // Name : pred:2:y // CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/2, /*RC*/Test::DRegsRegClassID, -// CHECK-NEXT: // MIs[0] src2 -// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32, -// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/2, /*RC*/Test::DRegsRegClassID, // CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GIPFP_MI_Predicate_and_or_pat, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, -// CHECK-NEXT: // (and:{ *:[i32] } (or:{ *:[i32] } DOP:{ *:[i32] }:$src0, DOP:{ *:[i32] }:$src1), DOP:{ *:[i32] }:$src2)<> => (AND_OR:{ *:[i32] } DOP:{ *:[i32] }:$src0, DOP:{ *:[i32] }:$src1, DOP:{ *:[i32] }:$src2) +// CHECK-NEXT: // (and:{ *:[i32] } DOP:{ *:[i32] }:$src2:$pred:2:z, (or:{ *:[i32] } DOP:{ *:[i32] }:$src0:$pred:2:x, DOP:{ *:[i32] }:$src1:$pred:2:y))<> => (AND_OR:{ *:[i32] } DOP:{ *:[i32] }:$src0, DOP:{ *:[i32] }:$src1, DOP:{ *:[i32] }:$src2) // CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::AND_OR, - -// CHECK: GIM_Try, /*On fail goto*//*Label 1*/ {{[0-9]+}}, // Rule ID 2 // +// CHECK: GIM_Try, /*On fail goto*//*Label 1*/ 198, // Rule ID 1 // // CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3, // CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_AND, // CHECK-NEXT: // MIs[0] dst // CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32, // CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/Test::DRegsRegClassID, -// CHECK-NEXT: // MIs[0] src2 +// CHECK-NEXT: // MIs[0] Operand 1 // CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, -// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/Test::DRegsRegClassID, -// CHECK-NEXT: // MIs[0] Operand 2 -// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32, -// CHECK-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/2, // MIs[1] +// CHECK-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] // CHECK-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/3, // CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_OR, // CHECK-NEXT: // MIs[1] Operand 0 // CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32, // CHECK-NEXT: // MIs[1] src0 // CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_RecordNamedOperand, /*MI*/1, /*Op*/1, /*StoreIdx*/0, // Name : pred:2:x // CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, /*RC*/Test::DRegsRegClassID, // CHECK-NEXT: // MIs[1] src1 // CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_RecordNamedOperand, /*MI*/1, /*Op*/2, /*StoreIdx*/1, // Name : pred:2:y // CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/2, /*RC*/Test::DRegsRegClassID, +// CHECK-NEXT: // MIs[0] src2 +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_RecordNamedOperand, /*MI*/0, /*Op*/2, /*StoreIdx*/2, // Name : pred:2:z +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/2, /*RC*/Test::DRegsRegClassID, // CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GIPFP_MI_Predicate_and_or_pat, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, -// CHECK-NEXT: // (and:{ *:[i32] } DOP:{ *:[i32] }:$src2, (or:{ *:[i32] } DOP:{ *:[i32] }:$src0, DOP:{ *:[i32] }:$src1))<> => (AND_OR:{ *:[i32] } DOP:{ *:[i32] }:$src0, DOP:{ *:[i32] }:$src1, DOP:{ *:[i32] }:$src2) +// CHECK-NEXT: // (and:{ *:[i32] } 
(or:{ *:[i32] } DOP:{ *:[i32] }:$src0:$pred:2:x, DOP:{ *:[i32] }:$src1:$pred:2:y), DOP:{ *:[i32] }:$src2:$pred:2:z)<> => (AND_OR:{ *:[i32] } DOP:{ *:[i32] }:$src0, DOP:{ *:[i32] }:$src1, DOP:{ *:[i32] }:$src2) // CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::AND_OR, // Test commutative, standalone pattern. @@ -115,9 +121,11 @@ def sub3_pat : PatFrag< let GISelPredicateCode = [{ return doesComplexCheck(MI); }]; + + let PredicateCodeUsesOperands = 1; } -// CHECK: GIM_Try, /*On fail goto*//*Label 2*/ {{[0-9]+}}, // Rule ID 0 // +// CHECK: GIM_Try, /*On fail goto*//*Label 2*/ 285, // Rule ID 0 // // CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3, // CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_SUB, // CHECK-NEXT: // MIs[0] dst @@ -132,13 +140,16 @@ def sub3_pat : PatFrag< // CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32, // CHECK-NEXT: // MIs[1] src0 // CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_RecordNamedOperand, /*MI*/1, /*Op*/1, /*StoreIdx*/0, // Name : pred:1:x // CHECK-NEXT: // MIs[1] src1 // CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_RecordNamedOperand, /*MI*/1, /*Op*/2, /*StoreIdx*/1, // Name : pred:1:y // CHECK-NEXT: // MIs[0] src2 // CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_RecordNamedOperand, /*MI*/0, /*Op*/2, /*StoreIdx*/2, // Name : pred:1:z // CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GIPFP_MI_Predicate_sub3_pat, // CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, -// CHECK-NEXT: // (sub:{ *:[i32] } (sub:{ *:[i32] } i32:{ *:[i32] }:$src0, i32:{ *:[i32] }:$src1), i32:{ *:[i32] }:$src2)<> => (SUB3:{ *:[i32] } i32:{ *:[i32] }:$src0, i32:{ *:[i32] }:$src1, i32:{ *:[i32] }:$src2) +// CHECK-NEXT: // (sub:{ *:[i32] } (sub:{ *:[i32] } i32:{ *:[i32] }:$src0:$pred:1:x, i32:{ *:[i32] }:$src1:$pred:1:y), i32:{ *:[i32] }:$src2:$pred:1:z)<> => (SUB3:{ *:[i32] } i32:{ *:[i32] }:$src0, i32:{ *:[i32] }:$src1, i32:{ *:[i32] }:$src2) // CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::SUB3, // Test a non-commutative pattern. diff --git a/llvm/test/TableGen/GlobalISelEmitterSkippedPatterns.td b/llvm/test/TableGen/GlobalISelEmitterSkippedPatterns.td index b9ba1a7d8c554..7c9df02ebd87c 100644 --- a/llvm/test/TableGen/GlobalISelEmitterSkippedPatterns.td +++ b/llvm/test/TableGen/GlobalISelEmitterSkippedPatterns.td @@ -23,7 +23,7 @@ def INSN : I<(outs GPR32:$dst), (ins GPR32:$src1, complex:$src2), []>; //===- Bail out when we define a variable twice wrt complex suboperands. -===// -// CHECK: warning: Skipped pattern: Complex suboperand referenced more than once (Operand: x) +// CHECK: warning: Skipped pattern: Error: Complex suboperand x referenced by different operands: complex_rr:x:y and complex_rr:x:z. 
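// Note: the pattern below binds $x inside two different complex_rr
// suboperands (complex_rr:x:y and complex_rr:x:z), which is exactly the
// double definition that the updated warning text above now reports by
// operand name instead of the old generic message.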
def : Pat<(add (complex_rr GPR32:$x, GPR32:$y), (complex_rr GPR32:$x, GPR32:$z)), (INSN GPR32:$z, complex:$y)>; diff --git a/llvm/test/TableGen/HwModeEncodeDecode.td b/llvm/test/TableGen/HwModeEncodeDecode.td index 1c9b86ff26a75..bac432271888b 100644 --- a/llvm/test/TableGen/HwModeEncodeDecode.td +++ b/llvm/test/TableGen/HwModeEncodeDecode.td @@ -22,9 +22,9 @@ def fooTypeEncA : InstructionEncoding { field bits<32> SoftFail = 0; bits<32> Inst; bits<8> factor; - let Inst{7-0} = factor; - let Inst{3-2} = 0b11; - let Inst{1-0} = 0b00; + let Inst{7...0} = factor; + let Inst{3...2} = 0b11; + let Inst{1...0} = 0b00; } def fooTypeEncB : InstructionEncoding { @@ -32,8 +32,8 @@ def fooTypeEncB : InstructionEncoding { field bits<32> SoftFail = 0; bits<32> Inst; bits<8> factor; - let Inst{15-8} = factor; - let Inst{1-0} = 0b11; + let Inst{15...8} = factor; + let Inst{1...0} = 0b11; } let OutOperandList = (outs) in { @@ -52,8 +52,8 @@ def bar: Instruction { bits<32> Inst; bits<32> SoftFail; bits<8> factor; - let Inst{31-24} = factor; - let Inst{1-0} = 0b10; + let Inst{31...24} = factor; + let Inst{1...0} = 0b10; let AsmString = "bar $factor"; } diff --git a/llvm/test/TableGen/JSON.td b/llvm/test/TableGen/JSON.td index 968c2577fa993..3fb2ec4014fbc 100644 --- a/llvm/test/TableGen/JSON.td +++ b/llvm/test/TableGen/JSON.td @@ -97,8 +97,8 @@ def VarObj : Variables { bits<2> undef_bits; bits<4> ref_bits; - let ref_bits{3-2} = 0b10; - let ref_bits{1-0} = undef_bits{1-0}; + let ref_bits{3...2} = 0b10; + let ref_bits{1...0} = undef_bits{1...0}; // CHECK: data['VarObj']['ref_bits'][3] == 1 // CHECK: data['VarObj']['ref_bits'][2] == 0 // CHECK: data['VarObj']['ref_bits'][1]['kind'] == 'varbit' diff --git a/llvm/test/TableGen/ListSlices.td b/llvm/test/TableGen/ListSlices.td index cbb2326a95c00..2f40334798b28 100644 --- a/llvm/test/TableGen/ListSlices.td +++ b/llvm/test/TableGen/ListSlices.td @@ -6,12 +6,12 @@ def A { } def B { - list X = [10, 20, 30, 4, 1, 1231, 20] [2-4,2,2,0-6]; + list X = [10, 20, 30, 4, 1, 1231, 20] [2...4,2,2,0...6]; list Y = X[4,5]; int Z = X[4]; - list C = A.B[1-4]; + list C = A.B[1...4]; list> AA = [X, Y]; diff --git a/llvm/test/TableGen/UnsetBitInit.td b/llvm/test/TableGen/UnsetBitInit.td index 694847358f66c..07e37e08efab3 100644 --- a/llvm/test/TableGen/UnsetBitInit.td +++ b/llvm/test/TableGen/UnsetBitInit.td @@ -21,7 +21,7 @@ def A { bit P; bit Q; - let Inst{7-2} = 0x3f; + let Inst{7...2} = 0x3f; let Inst{1} = P; let Inst{0} = Q; @@ -34,7 +34,7 @@ class x { } class y B> : x { - let A{21-20} = B; + let A{21...20} = B; } def z : y<{0,?}>; diff --git a/llvm/test/TableGen/cond-let.td b/llvm/test/TableGen/cond-let.td index 044878f2ab8e3..4e46445cc327a 100644 --- a/llvm/test/TableGen/cond-let.td +++ b/llvm/test/TableGen/cond-let.td @@ -11,13 +11,13 @@ class C x, bits<4> y, bit z> { y{1}: x{1}, y{0}: x{2}, {1} :?); - let n{10-9}= !cond(x{2}: y{3-2}, - x{1}: y{2-1}, - x{1}: y{1-0}, - {1} : ?); - let n{8-6} = !cond(x{2}: 0b010, 1 : 0b110); - let n{5-4} = !cond(x{1}: y{3-2}, 1 : {0, 1}); - let n{3-0} = !cond(x{0}: y{3-0}, 1 : {z, y{2}, y{1}, y{0}}); + let n{10...9}= !cond(x{2}: y{3...2}, + x{1}: y{2...1}, + x{1}: y{1...0}, + {1} : ?); + let n{8...6} = !cond(x{2}: 0b010, 1 : 0b110); + let n{5...4} = !cond(x{1}: y{3...2}, 1 : {0, 1}); + let n{3...0} = !cond(x{0}: y{3...0}, 1 : {z, y{2}, y{1}, y{0}}); } diff --git a/llvm/test/TableGen/dag-isel-regclass-emit-enum.td b/llvm/test/TableGen/dag-isel-regclass-emit-enum.td index 0002614fd5748..462bb3f2cd6da 100644 --- 
a/llvm/test/TableGen/dag-isel-regclass-emit-enum.td +++ b/llvm/test/TableGen/dag-isel-regclass-emit-enum.td @@ -12,7 +12,7 @@ let Namespace = "TestNamespace" in { def R0 : Register<"r0">; -foreach i = 0-127 in { +foreach i = 0...127 in { def GPR#i : RegisterClass<"TestTarget", [i32], 32, (add R0)>; } diff --git a/llvm/test/TableGen/defset.td b/llvm/test/TableGen/defset.td index 3c5fb68ea7ef0..ef9f54ba6e2db 100644 --- a/llvm/test/TableGen/defset.td +++ b/llvm/test/TableGen/defset.td @@ -40,7 +40,7 @@ multiclass C { defset list As = { def A0 : A<1>; - foreach i = 1-2 in { + foreach i = 1...2 in { def A#i : A; } defset list SubAs = { diff --git a/llvm/test/TableGen/foreach-variable-range.td b/llvm/test/TableGen/foreach-variable-range.td index 3ddb2c08ff20e..2a576d247a351 100644 --- a/llvm/test/TableGen/foreach-variable-range.td +++ b/llvm/test/TableGen/foreach-variable-range.td @@ -13,84 +13,84 @@ def Constants : ConstantsImpl; // CHECK-DAG: def var_bound_whitespaceA0 // CHECK-DAG: def var_bound_whitespaceA1 // CHECK-DAG: def var_bound_whitespaceA2 -foreach Index = Constants.Zero - Constants.Two in { +foreach Index = Constants.Zero ... Constants.Two in { def var_bound_whitespaceA#Index; } // CHECK-DAG: def var_bound_whitespaceB0 // CHECK-DAG: def var_bound_whitespaceB1 // CHECK-DAG: def var_bound_whitespaceB2 -foreach Index = Constants.Zero-Constants.Two in { +foreach Index = Constants.Zero...Constants.Two in { def var_bounds_whitespaceB#Index; } // CHECK-DAG: def var_bound_whitespaceC0 // CHECK-DAG: def var_bound_whitespaceC1 // CHECK-DAG: def var_bound_whitespaceC2 -foreach Index = Constants.Zero -Constants.Two in { +foreach Index = Constants.Zero ...Constants.Two in { def var_bounds_whitespaceC#Index; } // CHECK-DAG: def var_bound_whitespaceD0 // CHECK-DAG: def var_bound_whitespaceD1 // CHECK-DAG: def var_bound_whitespaceD2 -foreach Index = Constants.Zero- Constants.Two in { +foreach Index = Constants.Zero... Constants.Two in { def var_bounds_whitespaceD#Index; } // CHECK-DAG: def const_lower_whitespaceA0 // CHECK-DAG: def const_lower_whitespaceA1 // CHECK-DAG: def const_lower_whitespaceA2 -foreach Index = 0 - Constants.Two in { +foreach Index = 0 ... Constants.Two in { def const_lower_whitespaceA#Index; } // CHECK-DAG: def const_lower_whitespaceB0 // CHECK-DAG: def const_lower_whitespaceB1 // CHECK-DAG: def const_lower_whitespaceB2 -foreach Index = 0-Constants.Two in { +foreach Index = 0...Constants.Two in { def const_lower_whitespaceB#Index; } // CHECK-DAG: def const_lower_whitespaceC0 // CHECK-DAG: def const_lower_whitespaceC1 // CHECK-DAG: def const_lower_whitespaceC2 -foreach Index = 0 -Constants.Two in { +foreach Index = 0 ...Constants.Two in { def const_lower_whitespaceC#Index; } // CHECK-DAG: def const_lower_whitespaceD0 // CHECK-DAG: def const_lower_whitespaceD1 // CHECK-DAG: def const_lower_whitespaceD2 -foreach Index = 0- Constants.Two in { +foreach Index = 0... Constants.Two in { def const_lower_whitespaceD#Index; } // CHECK-DAG: def const_upper_whitespaceA0 // CHECK-DAG: def const_upper_whitespaceA1 // CHECK-DAG: def const_upper_whitespaceA2 -foreach Index = Constants.Zero - 2 in { +foreach Index = Constants.Zero ... 
2 in { def const_upper_whitespaceA#Index; } // CHECK-DAG: def const_upper_whitespaceB0 // CHECK-DAG: def const_upper_whitespaceB1 // CHECK-DAG: def const_upper_whitespaceB2 -foreach Index = Constants.Zero-2 in { +foreach Index = Constants.Zero...2 in { def const_upper_whitespaceB#Index; } // CHECK-DAG: def const_upper_whitespaceC0 // CHECK-DAG: def const_upper_whitespaceC1 // CHECK-DAG: def const_upper_whitespaceC2 -foreach Index = Constants.Zero -2 in { +foreach Index = Constants.Zero ...2 in { def const_upper_whitespaceC#Index; } // CHECK-DAG: def const_upper_whitespaceD0 // CHECK-DAG: def const_upper_whitespaceD1 // CHECK-DAG: def const_upper_whitespaceD2 -foreach Index = Constants.Zero- 2 in { +foreach Index = Constants.Zero... 2 in { def const_upper_whitespaceD#Index; } @@ -98,7 +98,7 @@ foreach Index = Constants.Zero- 2 in { // CHECK-DAG: def multi_rangeA1 // CHECK-DAG: def multi_rangeA2 // CHECK-DAG: def multi_rangeA3 -foreach Index = {Constants.Zero-Constants.One, Constants.Two-Constants.Three} in { +foreach Index = {Constants.Zero...Constants.One, Constants.Two...Constants.Three} in { def multi_rangeA#Index; } @@ -107,7 +107,7 @@ foreach Index = {Constants.Zero-Constants.One, Constants.Two-Constants.Three} in // CHECK-DAG: def multi_rangeB3 // CHECK-DAG: def multi_rangeB4 // CHECK-DAG: def multi_rangeB5 -foreach Index = {0-Constants.One, Constants.Three-Constants.Five} in { +foreach Index = {0...Constants.One, Constants.Three...Constants.Five} in { def multi_rangeB#Index; } @@ -115,7 +115,7 @@ foreach Index = {0-Constants.One, Constants.Three-Constants.Five} in { // CHECK-DAG: def multi_rangeC1 // CHECK-DAG: def multi_rangeC2 // CHECK-DAG: def multi_rangeC3 -foreach Index = {0-Constants.One, 2-Constants.Three} in { +foreach Index = {0...Constants.One, 2...Constants.Three} in { def multi_rangeC#Index; } @@ -123,6 +123,6 @@ foreach Index = {0-Constants.One, 2-Constants.Three} in { // CHECK-DAG: def multi_rangeD1 // CHECK-DAG: def multi_rangeD2 // CHECK-DAG: def multi_rangeD3 -foreach Index = {0-1, Constants.Two-3} in { +foreach Index = {0...1, Constants.Two...3} in { def multi_rangeD#Index; } diff --git a/llvm/test/TableGen/if.td b/llvm/test/TableGen/if.td index a6af59e72830d..1fbee6966ff38 100644 --- a/llvm/test/TableGen/if.td +++ b/llvm/test/TableGen/if.td @@ -11,12 +11,12 @@ class C x, bits<4> y, bit z> { !if(y{2}, x{0}, !if(y{1}, x{1}, !if(y{0}, x{2}, ?)))); - let n{10-9}= !if(x{2}, y{3-2}, - !if(x{1}, y{2-1}, - !if(x{0}, y{1-0}, ?))); - let n{8-6} = !if(x{2}, 0b010, 0b110); - let n{5-4} = !if(x{1}, y{3-2}, {0, 1}); - let n{3-0} = !if(x{0}, y{3-0}, {z, y{2}, y{1}, y{0}}); + let n{10...9}= !if(x{2}, y{3...2}, + !if(x{1}, y{2...1}, + !if(x{0}, y{1...0}, ?))); + let n{8...6} = !if(x{2}, 0b010, 0b110); + let n{5...4} = !if(x{1}, y{3...2}, {0, 1}); + let n{3...0} = !if(x{0}, y{3...0}, {z, y{2}, y{1}, y{0}}); } def C1 : C<{1, 0, 1}, {0, 1, 0, 1}, 0>; diff --git a/llvm/test/TableGen/ifstmt.td b/llvm/test/TableGen/ifstmt.td index 22354310e7baf..5c0093a9a9ea1 100644 --- a/llvm/test/TableGen/ifstmt.td +++ b/llvm/test/TableGen/ifstmt.td @@ -15,7 +15,7 @@ if 1 then def aYes; // CHECK: def bNotThree2 // CHECK: def bNotThree4 // CHECK: def bThree3 -foreach i = 1-4 in { +foreach i = 1...4 in { if !eq(i, 3) then { def "bThree" # i; } else { @@ -61,8 +61,8 @@ defm c3: Multi<3>; // CHECK-NOT: def dThenElse1 // CHECK-NOT: def dThenElse11 // CHECK: def dThenThen01 -foreach i = 0-1 in - foreach j = 0-1 in +foreach i = 0...1 in + foreach j = 0...1 in if !eq(i,0) then if !eq(j,1) then def 
"dThenThen"#i#j; diff --git a/llvm/test/TableGen/list-element-bitref.td b/llvm/test/TableGen/list-element-bitref.td index 0f59b537fa6d6..4aae62f329de1 100644 --- a/llvm/test/TableGen/list-element-bitref.td +++ b/llvm/test/TableGen/list-element-bitref.td @@ -2,8 +2,8 @@ // XFAIL: vg_leak class C> L> { - bits<2> V0 = L[0]{1-0}; - bits<2> V1 = L[1]{3-2}; + bits<2> V0 = L[0]{1...0}; + bits<2> V1 = L[1]{3...2}; string V2 = !if(L[0]{0}, "Odd", "Even"); } diff --git a/llvm/test/TableGen/range-lists.td b/llvm/test/TableGen/range-lists.td index 82f4338323e52..85e0939f2ec0e 100644 --- a/llvm/test/TableGen/range-lists.td +++ b/llvm/test/TableGen/range-lists.td @@ -1,7 +1,8 @@ // RUN: llvm-tblgen %s | FileCheck %s // XFAIL: vg_leak -// This file has tests for range lists and range pieces. +// This file has tests for range lists and range pieces. Some use the +// deprecated '-' range punctuation just to be sure it still works. // These are tests for bits ranges. diff --git a/llvm/test/TableGen/simplify-patfrag.td b/llvm/test/TableGen/simplify-patfrag.td index 693658317d5d0..904c29696a6e2 100644 --- a/llvm/test/TableGen/simplify-patfrag.td +++ b/llvm/test/TableGen/simplify-patfrag.td @@ -9,7 +9,7 @@ def Demo : Target { } // Some registers which can hold ints or floats -foreach i = 0-7 in +foreach i = 0...7 in def "R" # i: Register<"r" # i>; def GPR : RegisterClass<"Demo", [i32, f32], 32, (sequence "R%u", 0, 7)>; diff --git a/llvm/test/TableGen/trydecode-emission3.td b/llvm/test/TableGen/trydecode-emission3.td index 8fc5150a0d8ea..84ce4f9a749b1 100644 --- a/llvm/test/TableGen/trydecode-emission3.td +++ b/llvm/test/TableGen/trydecode-emission3.td @@ -28,8 +28,8 @@ def InstBOp : Operand { def InstB : TestInstruction { bits<2> op; - let Inst{7-2} = {0,0,0,0,0,0}; - let Inst{1-0} = op; + let Inst{7...2} = {0,0,0,0,0,0}; + let Inst{1...0} = op; let OutOperandList = (outs InstBOp:$op); let AsmString = "InstB"; } diff --git a/llvm/test/Transforms/AlignmentFromAssumptions/simple.ll b/llvm/test/Transforms/AlignmentFromAssumptions/simple.ll index 14e764f042c7a..610fd448c3b98 100644 --- a/llvm/test/Transforms/AlignmentFromAssumptions/simple.ll +++ b/llvm/test/Transforms/AlignmentFromAssumptions/simple.ll @@ -4,10 +4,7 @@ target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128" define i32 @foo(i32* nocapture %a) nounwind uwtable readonly { entry: - %ptrint = ptrtoint i32* %a to i64 - %maskedptr = and i64 %ptrint, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) + tail call void @llvm.assume(i1 true) ["align"(i32* %a, i32 32)] %0 = load i32, i32* %a, align 4 ret i32 %0 @@ -18,11 +15,7 @@ entry: define i32 @foo2(i32* nocapture %a) nounwind uwtable readonly { entry: - %ptrint = ptrtoint i32* %a to i64 - %offsetptr = add i64 %ptrint, 24 - %maskedptr = and i64 %offsetptr, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) + tail call void @llvm.assume(i1 true) ["align"(i32* %a, i32 32, i32 24)] %arrayidx = getelementptr inbounds i32, i32* %a, i64 2 %0 = load i32, i32* %arrayidx, align 4 ret i32 %0 @@ -34,11 +27,7 @@ entry: define i32 @foo2a(i32* nocapture %a) nounwind uwtable readonly { entry: - %ptrint = ptrtoint i32* %a to i64 - %offsetptr = add i64 %ptrint, 28 - %maskedptr = and i64 %offsetptr, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) + tail call void @llvm.assume(i1 true) ["align"(i32* %a, i32 32, i32 28)] %arrayidx = getelementptr inbounds i32, i32* %a, i64 -1 %0 = load i32, i32* %arrayidx, align 4 
ret i32 %0 @@ -50,10 +39,7 @@ entry: define i32 @goo(i32* nocapture %a) nounwind uwtable readonly { entry: - %ptrint = ptrtoint i32* %a to i64 - %maskedptr = and i64 %ptrint, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) + tail call void @llvm.assume(i1 true) ["align"(i32* %a, i32 32, i32 0)] %0 = load i32, i32* %a, align 4 ret i32 %0 @@ -64,10 +50,7 @@ entry: define i32 @hoo(i32* nocapture %a) nounwind uwtable readonly { entry: - %ptrint = ptrtoint i32* %a to i64 - %maskedptr = and i64 %ptrint, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) + tail call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32, i32 0)] br label %for.body for.body: ; preds = %entry, %for.body @@ -98,10 +81,7 @@ for.end: ; preds = %for.body ; load(a, i0+i1+i2+32) define void @hoo2(i32* nocapture %a, i64 %id, i64 %num) nounwind uwtable readonly { entry: - %ptrint = ptrtoint i32* %a to i64 - %maskedptr = and i64 %ptrint, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) + tail call void @llvm.assume(i1 true) ["align"(i32* %a, i8 32, i64 0)] %id.mul = shl nsw i64 %id, 6 %num.mul = shl nsw i64 %num, 6 br label %for0.body @@ -147,10 +127,7 @@ return: define i32 @joo(i32* nocapture %a) nounwind uwtable readonly { entry: - %ptrint = ptrtoint i32* %a to i64 - %maskedptr = and i64 %ptrint, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) + tail call void @llvm.assume(i1 true) ["align"(i32* %a, i8 32, i8 0)] br label %for.body for.body: ; preds = %entry, %for.body @@ -175,16 +152,13 @@ for.end: ; preds = %for.body define i32 @koo(i32* nocapture %a) nounwind uwtable readonly { entry: - %ptrint = ptrtoint i32* %a to i64 - %maskedptr = and i64 %ptrint, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) br label %for.body for.body: ; preds = %entry, %for.body %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] %r.06 = phi i32 [ 0, %entry ], [ %add, %for.body ] %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + tail call void @llvm.assume(i1 true) ["align"(i32* %a, i8 32, i8 0)] %0 = load i32, i32* %arrayidx, align 4 %add = add nsw i32 %0, %r.06 %indvars.iv.next = add i64 %indvars.iv, 4 @@ -203,10 +177,7 @@ for.end: ; preds = %for.body define i32 @koo2(i32* nocapture %a) nounwind uwtable readonly { entry: - %ptrint = ptrtoint i32* %a to i64 - %maskedptr = and i64 %ptrint, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) + tail call void @llvm.assume(i1 true) ["align"(i32* %a, i128 32, i128 0)] br label %for.body for.body: ; preds = %entry, %for.body @@ -231,10 +202,7 @@ for.end: ; preds = %for.body define i32 @moo(i32* nocapture %a) nounwind uwtable { entry: - %ptrint = ptrtoint i32* %a to i64 - %maskedptr = and i64 %ptrint, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) + tail call void @llvm.assume(i1 true) ["align"(i32* %a, i16 32)] %0 = bitcast i32* %a to i8* tail call void @llvm.memset.p0i8.i64(i8* align 4 %0, i8 0, i64 64, i1 false) ret i32 undef @@ -246,15 +214,9 @@ entry: define i32 @moo2(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { entry: - %ptrint = ptrtoint i32* %a to i64 - %maskedptr = and i64 %ptrint, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) - %ptrint1 = ptrtoint i32* %b to i64 - %maskedptr3 = and i64 %ptrint1, 127 - %maskcond4 = icmp eq i64 %maskedptr3, 0 - 
tail call void @llvm.assume(i1 %maskcond4) + tail call void @llvm.assume(i1 true) ["align"(i32* %b, i32 128)] %0 = bitcast i32* %a to i8* + tail call void @llvm.assume(i1 true) ["align"(i8* %0, i16 32)] %1 = bitcast i32* %b to i8* tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 64, i1 false) ret i32 undef @@ -264,6 +226,19 @@ entry: ; CHECK: ret i32 undef } +define i32 @moo3(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { +entry: + %0 = bitcast i32* %a to i8* + tail call void @llvm.assume(i1 true) ["align"(i8* %0, i16 32), "align"(i32* %b, i32 128)] + %1 = bitcast i32* %b to i8* + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 64, i1 false) + ret i32 undef + +; CHECK-LABEL: @moo3 +; CHECK: @llvm.memcpy.p0i8.p0i8.i64(i8* align 32 %0, i8* align 128 %1, i64 64, i1 false) +; CHECK: ret i32 undef +} + declare void @llvm.assume(i1) nounwind declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind diff --git a/llvm/test/Transforms/AlignmentFromAssumptions/simple32.ll b/llvm/test/Transforms/AlignmentFromAssumptions/simple32.ll index 3f0819e3641b3..453899c15c4fb 100644 --- a/llvm/test/Transforms/AlignmentFromAssumptions/simple32.ll +++ b/llvm/test/Transforms/AlignmentFromAssumptions/simple32.ll @@ -7,18 +7,12 @@ define i32 @foo(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-LABEL: define {{[^@]+}}@foo ; CHECK-SAME: (i32* nocapture [[A:%.*]]) #0 ; CHECK-NEXT: entry: -; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 -; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 -; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 32) ] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 32 ; CHECK-NEXT: ret i32 [[TMP0]] ; entry: - %ptrint = ptrtoint i32* %a to i64 - %maskedptr = and i64 %ptrint, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) + call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32)] %0 = load i32, i32* %a, align 4 ret i32 %0 @@ -28,21 +22,13 @@ define i32 @foo2(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-LABEL: define {{[^@]+}}@foo2 ; CHECK-SAME: (i32* nocapture [[A:%.*]]) #0 ; CHECK-NEXT: entry: -; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 -; CHECK-NEXT: [[OFFSETPTR:%.*]] = add i64 [[PTRINT]], 24 -; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[OFFSETPTR]], 31 -; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 32, i64 24) ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 16 ; CHECK-NEXT: ret i32 [[TMP0]] ; entry: - %ptrint = ptrtoint i32* %a to i64 - %offsetptr = add i64 %ptrint, 24 - %maskedptr = and i64 %offsetptr, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) + call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32, i64 24)] %arrayidx = getelementptr inbounds i32, i32* %a, i64 2 %0 = load i32, i32* %arrayidx, align 4 ret i32 %0 @@ -53,21 +39,13 @@ define i32 @foo2a(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-LABEL: define {{[^@]+}}@foo2a ; CHECK-SAME: (i32* nocapture [[A:%.*]]) #0 ; CHECK-NEXT: entry: -; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 -; CHECK-NEXT: 
[[OFFSETPTR:%.*]] = add i64 [[PTRINT]], 28 -; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[OFFSETPTR]], 31 -; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 32, i64 28) ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 -1 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 32 ; CHECK-NEXT: ret i32 [[TMP0]] ; entry: - %ptrint = ptrtoint i32* %a to i64 - %offsetptr = add i64 %ptrint, 28 - %maskedptr = and i64 %offsetptr, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) + call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32, i64 28)] %arrayidx = getelementptr inbounds i32, i32* %a, i64 -1 %0 = load i32, i32* %arrayidx, align 4 ret i32 %0 @@ -78,18 +56,12 @@ define i32 @goo(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-LABEL: define {{[^@]+}}@goo ; CHECK-SAME: (i32* nocapture [[A:%.*]]) #0 ; CHECK-NEXT: entry: -; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 -; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 -; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 32) ] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 32 ; CHECK-NEXT: ret i32 [[TMP0]] ; entry: - %ptrint = ptrtoint i32* %a to i64 - %maskedptr = and i64 %ptrint, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) + call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32)] %0 = load i32, i32* %a, align 4 ret i32 %0 @@ -99,10 +71,7 @@ define i32 @hoo(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-LABEL: define {{[^@]+}}@hoo ; CHECK-SAME: (i32* nocapture [[A:%.*]]) #0 ; CHECK-NEXT: entry: -; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 -; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 -; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 32) ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -119,10 +88,7 @@ define i32 @hoo(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-NEXT: ret i32 [[ADD_LCSSA]] ; entry: - %ptrint = ptrtoint i32* %a to i64 - %maskedptr = and i64 %ptrint, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) + call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32)] br label %for.body for.body: ; preds = %entry, %for.body @@ -146,10 +112,7 @@ define i32 @joo(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-LABEL: define {{[^@]+}}@joo ; CHECK-SAME: (i32* nocapture [[A:%.*]]) #0 ; CHECK-NEXT: entry: -; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 -; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 -; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 32) ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 4, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -166,10 +129,7 @@ define i32 @joo(i32* nocapture %a) nounwind uwtable readonly { 
; CHECK-NEXT: ret i32 [[ADD_LCSSA]] ; entry: - %ptrint = ptrtoint i32* %a to i64 - %maskedptr = and i64 %ptrint, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) + call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32)] br label %for.body for.body: ; preds = %entry, %for.body @@ -193,10 +153,7 @@ define i32 @koo(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-LABEL: define {{[^@]+}}@koo ; CHECK-SAME: (i32* nocapture [[A:%.*]]) #0 ; CHECK-NEXT: entry: -; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 -; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 -; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 32) ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -213,10 +170,7 @@ define i32 @koo(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-NEXT: ret i32 [[ADD_LCSSA]] ; entry: - %ptrint = ptrtoint i32* %a to i64 - %maskedptr = and i64 %ptrint, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) + call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32)] br label %for.body for.body: ; preds = %entry, %for.body @@ -240,10 +194,7 @@ define i32 @koo2(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-LABEL: define {{[^@]+}}@koo2 ; CHECK-SAME: (i32* nocapture [[A:%.*]]) #0 ; CHECK-NEXT: entry: -; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 -; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 -; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 32) ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ -4, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -260,10 +211,7 @@ define i32 @koo2(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-NEXT: ret i32 [[ADD_LCSSA]] ; entry: - %ptrint = ptrtoint i32* %a to i64 - %maskedptr = and i64 %ptrint, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) + call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32)] br label %for.body for.body: ; preds = %entry, %for.body @@ -287,19 +235,13 @@ define i32 @moo(i32* nocapture %a) nounwind uwtable { ; CHECK-LABEL: define {{[^@]+}}@moo ; CHECK-SAME: (i32* nocapture [[A:%.*]]) #1 ; CHECK-NEXT: entry: -; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 -; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 -; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 32) ] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to i8* ; CHECK-NEXT: tail call void @llvm.memset.p0i8.i64(i8* align 32 [[TMP0]], i8 0, i64 64, i1 false) ; CHECK-NEXT: ret i32 undef ; entry: - %ptrint = ptrtoint i32* %a to i64 - %maskedptr = and i64 %ptrint, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) + call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32)] %0 = bitcast i32* %a to i8* tail call void @llvm.memset.p0i8.i64(i8* align 4 %0, i8 0, i64 64, i1 false) ret i32 undef @@ -310,28 +252,16 @@ define i32 @moo2(i32* nocapture %a, i32* 
nocapture %b) nounwind uwtable { ; CHECK-LABEL: define {{[^@]+}}@moo2 ; CHECK-SAME: (i32* nocapture [[A:%.*]], i32* nocapture [[B:%.*]]) #1 ; CHECK-NEXT: entry: -; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 -; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 -; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) -; CHECK-NEXT: [[PTRINT1:%.*]] = ptrtoint i32* [[B]] to i64 -; CHECK-NEXT: [[MASKEDPTR3:%.*]] = and i64 [[PTRINT1]], 127 -; CHECK-NEXT: [[MASKCOND4:%.*]] = icmp eq i64 [[MASKEDPTR3]], 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]]) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 32) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[B]], i64 128) ] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to i8* ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[B]] to i8* ; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 32 [[TMP0]], i8* align 128 [[TMP1]], i64 64, i1 false) ; CHECK-NEXT: ret i32 undef ; entry: - %ptrint = ptrtoint i32* %a to i64 - %maskedptr = and i64 %ptrint, 31 - %maskcond = icmp eq i64 %maskedptr, 0 - tail call void @llvm.assume(i1 %maskcond) - %ptrint1 = ptrtoint i32* %b to i64 - %maskedptr3 = and i64 %ptrint1, 127 - %maskcond4 = icmp eq i64 %maskedptr3, 0 - tail call void @llvm.assume(i1 %maskcond4) + call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32)] + call void @llvm.assume(i1 true) ["align"(i32* %b, i64 128)] %0 = bitcast i32* %a to i8* %1 = bitcast i32* %b to i8* tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 64, i1 false) diff --git a/llvm/test/Transforms/ArgumentPromotion/profile.ll b/llvm/test/Transforms/ArgumentPromotion/profile.ll index f4bceb3eb913d..941eafad1af3e 100644 --- a/llvm/test/Transforms/ArgumentPromotion/profile.ll +++ b/llvm/test/Transforms/ArgumentPromotion/profile.ll @@ -15,9 +15,9 @@ define void @caller() #0 { ret void } -define internal void @promote_i32_ptr(i32* %xp) { +define internal void @promote_i32_ptr(i32* %xp) !prof !1 { ; CHECK-LABEL: define {{[^@]+}}@promote_i32_ptr -; CHECK-SAME: (i32 [[XP_VAL:%.*]]) +; CHECK-SAME: (i32 [[XP_VAL:%.*]]) !prof !1 ; CHECK-NEXT: call void @use_i32(i32 [[XP_VAL]]) ; CHECK-NEXT: ret void ; @@ -29,3 +29,4 @@ define internal void @promote_i32_ptr(i32* %xp) { declare void @use_i32(i32) !0 = !{!"branch_weights", i32 30} +!1 = !{!"function_entry_count", i64 100} diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/attributes.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/attributes.ll index 421ddc2bdd396..a50017ac73315 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/attributes.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/attributes.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes -; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=4 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM +; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=3 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM ; RUN: opt 
-aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=3 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll index 50d318198e149..310abfba58d55 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes -; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=4 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM +; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=3 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=3 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM @@ -44,7 +44,7 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>* ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8* -; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11:#.*]] +; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11:#.*]] ; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) 
[[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]]) [[ATTR12:#.*]] ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64 ; IS__TUNIT_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2 @@ -57,7 +57,7 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>* ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8* -; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11:#.*]] +; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11:#.*]] ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[TMP]], align 64 ; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) [[ATTR12:#.*]] ; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64 @@ -138,7 +138,7 @@ define void @avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>* ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8* -; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] +; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] ; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]]) [[ATTR12]] ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64 ; IS__TUNIT_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2 @@ -151,7 +151,7 @@ define void @avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>* ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8* -; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] +; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[TMP]], align 64 ; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) [[ATTR12]] ; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = 
load <8 x i64>, <8 x i64>* [[TMP2]], align 64 @@ -232,7 +232,7 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>* ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8* -; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] +; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] ; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]]) [[ATTR12]] ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64 ; IS__TUNIT_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2 @@ -245,7 +245,7 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>* ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8* -; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] +; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[TMP]], align 64 ; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) [[ATTR12]] ; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64 @@ -326,7 +326,7 @@ define void @avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>* ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8* -; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] +; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] ; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]]) [[ATTR12]] ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64 ; IS__TUNIT_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2 @@ -339,7 +339,7 @@ define void @avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>* ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = 
alloca <8 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8* -; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] +; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[TMP]], align 64 ; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) [[ATTR12]] ; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64 @@ -418,7 +418,7 @@ define void @avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>* ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8* -; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] +; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] ; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]]) [[ATTR12]] ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64 ; IS__TUNIT_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2 @@ -431,7 +431,7 @@ define void @avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>* ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8* -; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] +; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] ; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]]) [[ATTR12]] ; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64 ; IS__TUNIT_NPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2 @@ -508,7 +508,7 @@ define void @avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>* ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8* -; 
IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] +; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] ; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]]) [[ATTR12]] ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64 ; IS__TUNIT_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2 @@ -521,7 +521,7 @@ define void @avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>* ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8* -; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] +; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] ; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]]) [[ATTR12]] ; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64 ; IS__TUNIT_NPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2 @@ -600,7 +600,7 @@ define void @avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* %ar ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8* -; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] +; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] ; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]]) [[ATTR12]] ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64 ; IS__TUNIT_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2 @@ -613,7 +613,7 @@ define void @avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* %ar ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8* -; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 
dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] +; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[TMP]], align 64 ; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) [[ATTR12]] ; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64 @@ -694,7 +694,7 @@ define void @avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* %ar ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8* -; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] +; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] ; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]]) [[ATTR12]] ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64 ; IS__TUNIT_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2 @@ -707,7 +707,7 @@ define void @avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* %ar ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8* -; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] +; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) [[ATTR11]] ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[TMP]], align 64 ; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) [[ATTR12]] ; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64 diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/thiscall.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/thiscall.ll index 25729fb893335..29f6a1bf6d3f5 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/thiscall.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/thiscall.ll @@ -4,8 +4,8 @@ ; we don't do that anymore. It also verifies that the combination of ; globalopt and argpromotion is able to optimize the call safely. 
; -; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=7 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM -; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=7 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM +; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=3 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=3 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/dbg.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/dbg.ll index 5e40294cdb27b..64d5adaa75020 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/dbg.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/dbg.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes -; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=4 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM -; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=4 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM +; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=2 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=2 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: 
opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/profile.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/profile.ll index 3584172b242da..932f9197e9ce1 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/profile.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/profile.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes -; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=4 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM -; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=4 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM +; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=3 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=3 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/multiple_callbacks.ll b/llvm/test/Transforms/Attributor/IPConstantProp/multiple_callbacks.ll index ee411ec0c857e..91bf46ca2148f 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/multiple_callbacks.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/multiple_callbacks.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes -; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=6 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM -; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=6 -S < %s | FileCheck %s 
--check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM +; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=2 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=2 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM ; diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll b/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll index 4d8b20cb1cf3f..5afeb2071d192 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes -; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=7 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM -; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=7 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM +; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=1 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=1 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM ; diff --git a/llvm/test/Transforms/Attributor/callbacks.ll b/llvm/test/Transforms/Attributor/callbacks.ll index 03ca89fd1b08a..26e4ce2679ccc 100644 --- a/llvm/test/Transforms/Attributor/callbacks.ll +++ 
b/llvm/test/Transforms/Attributor/callbacks.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes -; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=3 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM -; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=3 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM +; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=2 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=2 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM @@ -115,6 +115,7 @@ declare !callback !0 void @t0_callback_broker(i32*, i32*, void (i32*, i32*, ...) ; we deduce and propagate noalias and others properly. define void @t1_caller(i32* noalias %a) { +; ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@t1_caller ; IS__TUNIT_OPM-SAME: (i32* noalias nocapture align 256 [[A:%.*]]) { ; IS__TUNIT_OPM-NEXT: entry: @@ -136,7 +137,7 @@ define void @t1_caller(i32* noalias %a) { ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__TUNIT_NPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__TUNIT_NPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t1_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t1_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) 
@t1_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t1_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__TUNIT_NPM-NEXT: ret void ; ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@t1_caller @@ -160,7 +161,7 @@ define void @t1_caller(i32* noalias %a) { ; IS__CGSCC_NPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__CGSCC_NPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__CGSCC_NPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__CGSCC_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t1_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t1_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 noundef 99, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__CGSCC_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t1_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t1_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 noundef 99, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__CGSCC_NPM-NEXT: ret void ; entry: @@ -190,7 +191,7 @@ define internal void @t1_callback_callee(i32* %is_not_null, i32* %ptr, i32* %a, ; ; IS________NPM: Function Attrs: nosync ; IS________NPM-LABEL: define {{[^@]+}}@t1_callback_callee -; IS________NPM-SAME: (i32* nocapture nonnull writeonly align 4 dereferenceable(4) [[IS_NOT_NULL:%.*]], i32* nocapture nonnull readonly align 8 dereferenceable(4) [[PTR:%.*]], i32* noalias nocapture align 256 [[A:%.*]], i64 [[B:%.*]], i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C:%.*]]) [[ATTR0:#.*]] { +; IS________NPM-SAME: (i32* nocapture nonnull writeonly align 4 dereferenceable(4) [[IS_NOT_NULL:%.*]], i32* nocapture nonnull readonly align 8 dereferenceable(4) [[PTR:%.*]], i32* nocapture align 256 [[A:%.*]], i64 [[B:%.*]], i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C:%.*]]) [[ATTR0:#.*]] { ; IS________NPM-NEXT: entry: ; IS________NPM-NEXT: [[PTR_VAL:%.*]] = load i32, i32* [[PTR]], align 8 ; IS________NPM-NEXT: store i32 [[PTR_VAL]], i32* [[IS_NOT_NULL]], align 4 @@ -236,7 +237,7 @@ define void @t2_caller(i32* noalias %a) { ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__TUNIT_NPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__TUNIT_NPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) 
@t2_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t2_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t2_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t2_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__TUNIT_NPM-NEXT: ret void ; ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@t2_caller @@ -260,7 +261,7 @@ define void @t2_caller(i32* noalias %a) { ; IS__CGSCC_NPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__CGSCC_NPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__CGSCC_NPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__CGSCC_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t2_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t2_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 noundef 99, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__CGSCC_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t2_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t2_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 noundef 99, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__CGSCC_NPM-NEXT: ret void ; entry: @@ -337,8 +338,8 @@ define void @t3_caller(i32* noalias %a) { ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__TUNIT_NPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__TUNIT_NPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) -; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) 
@t3_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__TUNIT_NPM-NEXT: ret void ; ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@t3_caller @@ -363,8 +364,8 @@ define void @t3_caller(i32* noalias %a) { ; IS__CGSCC_NPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__CGSCC_NPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__CGSCC_NPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__CGSCC_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 noundef 99, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) -; IS__CGSCC_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 noundef 99, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__CGSCC_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 noundef 99, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__CGSCC_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) 
@t3_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 noundef 99, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__CGSCC_NPM-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/Attributor/chain.ll b/llvm/test/Transforms/Attributor/chain.ll new file mode 100644 index 0000000000000..0306fe22c0b3c --- /dev/null +++ b/llvm/test/Transforms/Attributor/chain.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --scrub-attributes --check-attributes +; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -attributor-max-initialization-chain-length=1 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK_1 +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-annotate-decl-cs -attributor-max-initialization-chain-length=1 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK_1 +; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -attributor-max-initialization-chain-length=1024 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK_5 +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-annotate-decl-cs -attributor-max-initialization-chain-length=1024 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK_5 + +declare void @foo(i8* dereferenceable(8) %arg) + +define dso_local i32 @bar(i32* %arg) { +; CHECK_1-LABEL: define {{[^@]+}}@bar +; CHECK_1-SAME: (i32* dereferenceable_or_null(8) [[ARG:%.*]]) { +; CHECK_1-NEXT: entry: +; CHECK_1-NEXT: [[BC1:%.*]] = bitcast i32* [[ARG]] to i8* +; CHECK_1-NEXT: call void @foo(i8* dereferenceable_or_null(8) [[BC1]]) +; CHECK_1-NEXT: [[LD:%.*]] = load i32, i32* [[ARG]], align 4 +; CHECK_1-NEXT: ret i32 [[LD]] +; +; CHECK_5-LABEL: define {{[^@]+}}@bar +; CHECK_5-SAME: (i32* nonnull dereferenceable(8) [[ARG:%.*]]) { +; CHECK_5-NEXT: entry: +; CHECK_5-NEXT: [[BC1:%.*]] = bitcast i32* [[ARG]] to i8* +; CHECK_5-NEXT: call void @foo(i8* nonnull dereferenceable(8) [[BC1]]) +; CHECK_5-NEXT: [[LD:%.*]] = load i32, i32* [[ARG]], align 4 +; CHECK_5-NEXT: ret i32 [[LD]] +; +entry: + %bc1 = bitcast i32* %arg to i8* + call void @foo(i8* %bc1) + %ld = load i32, i32* %arg + ret i32 %ld +} diff --git a/llvm/test/Transforms/Attributor/depgraph.ll b/llvm/test/Transforms/Attributor/depgraph.ll index 791af581b22a0..d7dc9d42f49b2 100644 --- a/llvm/test/Transforms/Attributor/depgraph.ll +++ b/llvm/test/Transforms/Attributor/depgraph.ll @@ -51,88 +51,214 @@ define i32* @checkAndAdvance(i32* align 16 %0) { ; Check for graph ; -; GRAPH: [AANoUnwind] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nounwind -; GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: updates [AANoUnwind] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nounwind -; GRAPH: updates [AANoUnwind] for CtxI ' %6 = call i32* 
@checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nounwind -; GRAPH: updates [AANoUnwind] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nounwind -; GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: [AANoSync] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nosync -; GRAPH: updates [AANoSync] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nosync -; GRAPH: updates [AANoSync] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nosync -; GRAPH: [AANoFree] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nofree -; GRAPH: updates [AANoFree] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state nofree -; GRAPH: updates [AANoFree] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state nofree -; GRAPH: updates [AANoFree] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nofree -; GRAPH: updates [AANoFree] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nofree -; GRAPH: [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state readonly -; GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: updates [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state readonly -; GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: updates [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state readonly -; GRAPH: updates [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state readonly -; GRAPH: updates [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state readonly -; GRAPH: updates [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state readonly -; GRAPH: updates [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state readonly -; GRAPH: updates [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state readonly -; GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed 
not-captured-maybe-returned -; GRAPH: updates [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state readonly -; GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: [AAMemoryLocation] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state memory:argument -; GRAPH: updates [AAMemoryLocation] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state memory:argument -; GRAPH: updates [AAMemoryLocation] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state memory:argument -; GRAPH: [AAAlign] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state align<0-16> -; GRAPH: updates [AAAlign] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state align<1-16> -; GRAPH: updates [AAAlign] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state align<1-16> -; GRAPH: [AANonNull] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state nonnull -; GRAPH: updates [AANonNull] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state nonnull -; GRAPH: updates [AANonNull] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state nonnull -; GRAPH: [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: updates [AANoCapture] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: updates [AANoCapture] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state readonly -; GRAPH: updates [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state readonly -; GRAPH: updates [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state readonly -; GRAPH: [AANoFree] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state nofree -; GRAPH: updates [AANoFree] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state nofree -; GRAPH: [AANoUnwind] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nounwind -; GRAPH: updates [AAIsDead] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state assumed-live -; GRAPH: updates [AAIsDead] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state assumed-live -; GRAPH: updates [AANoUnwind] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nounwind -; GRAPH: [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position 
{cs: [@-1]} with state readonly -; GRAPH: updates [AAIsDead] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state assumed-live -; GRAPH: updates [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state readonly -; GRAPH: [AANoCapture] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned -; GRAPH: [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state readonly -; GRAPH: updates [AAMemoryLocation] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state memory:argument -; GRAPH: updates [AAMemoryLocation] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state memory:argument -; GRAPH: updates [AAMemoryLocation] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state memory:argument -; GRAPH: [AANonNull] for CtxI ' %5 = getelementptr inbounds i32, i32* %0, i64 4' at position {flt: [@-1]} with state nonnull -; GRAPH: updates [AANonNull] for CtxI ' %5 = getelementptr inbounds i32, i32* %0, i64 4' at position {flt: [@-1]} with state nonnull -; GRAPH: updates [AANonNull] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state nonnull -; GRAPH: updates [AANonNull] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state nonnull -; GRAPH: updates [AANonNull] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state nonnull -; GRAPH: [AANoSync] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nosync -; GRAPH: updates [AANoSync] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nosync -; GRAPH: [AANoFree] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nofree -; GRAPH: updates [AANoFree] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nofree -; GRAPH: [AAMemoryLocation] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state memory:argument -; GRAPH: updates [AAMemoryLocation] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state memory:argument -; GRAPH: [AAAlign] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state align<1-16> -; GRAPH: updates [AAAlign] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state align<0-16> -; GRAPH: [AANonNull] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state nonnull -; GRAPH: updates [AANonNull] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance 
[checkAndAdvance@-1]} with state nonnull +; GRAPH: [AAIsDead] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state Live[#BB 4/4][#TBEP 0][#KDE 1] +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAValueSimplify] for CtxI ' %3 = icmp eq i32 %2, 0' at position {flt: [@-1]} with state simplified +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAWillReturn] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state may-noreturn +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAUndefinedBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state undefined-behavior +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAValueSimplify] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state simplified +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoUndef] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state may-undef-or-poison +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAReturnedValues] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state returns(#3)[#UC: 1] +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoUnwind] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nounwind +; GRAPH-NEXT: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-NEXT: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-NEXT: updates [AANoUnwind] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nounwind +; GRAPH-NEXT: updates [AANoUnwind] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nounwind +; GRAPH-NEXT: updates [AANoUnwind] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nounwind +; GRAPH-NEXT: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-NEXT: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-NEXT: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-NEXT: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-NEXT: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoSync] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nosync +; GRAPH-NEXT: updates [AANoSync] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nosync +; GRAPH-NEXT: updates [AANoSync] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nosync +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAIsDead] for CtxI ' %2 = load i32, i32* %0, align 4' at position {flt: [@-1]} with state assumed-live +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAValueSimplify] for CtxI ' %2 = load i32, i32* %0, align 4' at position {flt: [@-1]} with state simplified +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAValueConstantRange] for CtxI 
' %2 = load i32, i32* %0, align 4' at position {flt: [@-1]} with state range(32) +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAPotentialValues] for CtxI ' %2 = load i32, i32* %0, align 4' at position {flt: [@-1]} with state set-state(< {full-set} >) +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAIsDead] for CtxI ' %3 = icmp eq i32 %2, 0' at position {flt: [@-1]} with state assumed-live +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAIsDead] for CtxI ' br i1 %3, label %4, label %7' at position {flt: [@-1]} with state assumed-live +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoFree] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nofree +; GRAPH-NEXT: updates [AANoFree] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state nofree +; GRAPH-NEXT: updates [AANoFree] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state nofree +; GRAPH-NEXT: updates [AANoFree] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nofree +; GRAPH-NEXT: updates [AANoFree] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nofree +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoReturn] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state may-return +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoRecurse] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state may-recurse +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state readonly +; GRAPH-NEXT: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-NEXT: updates [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state readonly +; GRAPH-NEXT: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-NEXT: updates [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state readonly +; GRAPH-NEXT: updates [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state readonly +; GRAPH-NEXT: updates [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state readonly +; GRAPH-NEXT: updates [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state readonly +; GRAPH-NEXT: updates [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state readonly +; GRAPH-NEXT: updates [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state readonly +; GRAPH-NEXT: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-NEXT: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-NEXT: updates [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state readonly +; GRAPH-NEXT: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-NEXT: updates [AANoCapture] for CtxI ' %2 = load i32, i32* 
%0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-NEXT: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAMemoryLocation] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state memory:argument +; GRAPH-NEXT: updates [AAMemoryLocation] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state memory:argument +; GRAPH-NEXT: updates [AAMemoryLocation] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state memory:argument +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAHeapToStack] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state [H2S] Mallocs: 0 +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAIsDead] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state assumed-live +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAValueSimplify] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state simplified +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAAlign] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state align<0-16> +; GRAPH-NEXT: updates [AAAlign] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state align<1-16> +; GRAPH-NEXT: updates [AAAlign] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state align<1-16> +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANonNull] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state nonnull +; GRAPH-NEXT: updates [AANonNull] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state nonnull +; GRAPH-NEXT: updates [AANonNull] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state nonnull +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoAlias] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state may-alias +; GRAPH-EMPTY: +; GRAPH-NEXT: [AADereferenceable] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state unknown-dereferenceable +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAIsDead] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed-live +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoUndef] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state may-undef-or-poison +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANonNull] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state nonnull +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoAlias] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state may-alias +; GRAPH-EMPTY: +; GRAPH-NEXT: [AADereferenceable] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state dereferenceable<4-4> +; GRAPH-EMPTY: +; GRAPH-NEXT: [AADereferenceable] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state unknown-dereferenceable +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANonNull] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state nonnull +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAAlign] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} 
with state align<16-16> +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAAlign] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state align<16-16> +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-NEXT: updates [AANoCapture] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-NEXT: updates [AANoCapture] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state readonly +; GRAPH-NEXT: updates [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state readonly +; GRAPH-NEXT: updates [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state readonly +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoFree] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state nofree +; GRAPH-NEXT: updates [AANoFree] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state nofree +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAPrivatizablePtr] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state [no-priv] +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAIsDead] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state assumed-live +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoUnwind] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nounwind +; GRAPH-NEXT: updates [AAIsDead] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state assumed-live +; GRAPH-NEXT: updates [AAIsDead] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state assumed-live +; GRAPH-NEXT: updates [AANoUnwind] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nounwind +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state readonly +; GRAPH-NEXT: updates [AAIsDead] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state assumed-live +; GRAPH-NEXT: updates [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state readonly +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAIsDead] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state assumed-live +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAValueSimplify] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state simplified +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoUndef] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state may-undef-or-poison +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoCapture] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-NEXT: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-NEXT: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed 
not-captured-maybe-returned +; GRAPH-NEXT: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoAlias] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state may-alias +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state readonly +; GRAPH-NEXT: updates [AAMemoryLocation] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state memory:argument +; GRAPH-NEXT: updates [AAMemoryLocation] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state memory:argument +; GRAPH-NEXT: updates [AAMemoryLocation] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state memory:argument +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoFree] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state nofree +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAValueConstantRange] for CtxI ' %3 = icmp eq i32 %2, 0' at position {flt: [@-1]} with state range(1) +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAValueConstantRange] for CtxI <> at position {flt: [@-1]} with state range(32)<[0,1) / [0,1)> +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAPotentialValues] for CtxI ' %3 = icmp eq i32 %2, 0' at position {flt: [@-1]} with state set-state(< {full-set} >) +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoReturn] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state may-return +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoAlias] for CtxI ' %5 = getelementptr inbounds i32, i32* %0, i64 4' at position {flt: [@-1]} with state may-alias +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAValueSimplify] for CtxI ' %5 = getelementptr inbounds i32, i32* %0, i64 4' at position {flt: [@-1]} with state simplified +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoUndef] for CtxI ' %5 = getelementptr inbounds i32, i32* %0, i64 4' at position {flt: [@-1]} with state may-undef-or-poison +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAValueSimplify] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state simplified +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAAlign] for CtxI ' %5 = getelementptr inbounds i32, i32* %0, i64 4' at position {flt: [@-1]} with state align<16-16> +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANonNull] for CtxI ' %5 = getelementptr inbounds i32, i32* %0, i64 4' at position {flt: [@-1]} with state nonnull +; GRAPH-NEXT: updates [AANonNull] for CtxI ' %5 = getelementptr inbounds i32, i32* %0, i64 4' at position {flt: [@-1]} with state nonnull +; GRAPH-NEXT: updates [AANonNull] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state nonnull +; GRAPH-NEXT: updates [AANonNull] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state nonnull +; GRAPH-NEXT: updates [AANonNull] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state nonnull +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAIsDead] for CtxI ' ret i32* %.0' at position {flt: [@-1]} with state assumed-live +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAIsDead] for CtxI ' br label %8' at position {flt: [@-1]} with state assumed-live +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAIsDead] for CtxI ' %5 = getelementptr inbounds i32, i32* %0, i64 4' at position {flt: [@-1]} with state assumed-live +; 
GRAPH-EMPTY: +; GRAPH-NEXT: [AAWillReturn] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state may-noreturn +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoRecurse] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state may-recurse +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoSync] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nosync +; GRAPH-NEXT: updates [AANoSync] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nosync +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANoFree] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nofree +; GRAPH-NEXT: updates [AANoFree] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nofree +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAMemoryLocation] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state memory:argument +; GRAPH-NEXT: updates [AAMemoryLocation] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state memory:argument +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAIsDead] for CtxI ' br label %8' at position {flt: [@-1]} with state assumed-live +; GRAPH-EMPTY: +; GRAPH-NEXT: [AAAlign] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state align<1-16> +; GRAPH-NEXT: updates [AAAlign] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state align<0-16> +; GRAPH-EMPTY: +; GRAPH-NEXT: [AADereferenceable] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state unknown-dereferenceable +; GRAPH-EMPTY: +; GRAPH-NEXT: [AANonNull] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state nonnull +; GRAPH-NEXT: updates [AANonNull] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state nonnull +; GRAPH-EMPTY: +; GRAPH-NEXT: [AADereferenceable] for CtxI ' %5 = getelementptr inbounds i32, i32* %0, i64 4' at position {flt: [@-1]} with state unknown-dereferenceable + ; GRAPH-NOT: update ; diff --git a/llvm/test/Transforms/Attributor/dereferenceable-2.ll b/llvm/test/Transforms/Attributor/dereferenceable-2.ll index aa3130e4a3190..816e5c47ef35b 100644 --- a/llvm/test/Transforms/Attributor/dereferenceable-2.ll +++ b/llvm/test/Transforms/Attributor/dereferenceable-2.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes -; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=2 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM -; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=2 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM +; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=1 -S < %s | FileCheck %s 
--check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=1 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM diff --git a/llvm/test/Transforms/Attributor/heap_to_stack.ll b/llvm/test/Transforms/Attributor/heap_to_stack.ll index 3c34419a960d4..27774c525c4e0 100644 --- a/llvm/test/Transforms/Attributor/heap_to_stack.ll +++ b/llvm/test/Transforms/Attributor/heap_to_stack.ll @@ -428,9 +428,8 @@ define void @test11() { ; IS________OPM-NEXT: ret void ; ; IS________NPM-LABEL: define {{[^@]+}}@test11() { -; IS________NPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 noundef 4) +; IS________NPM-NEXT: [[TMP1:%.*]] = alloca i8, i64 4, align 1 ; IS________NPM-NEXT: tail call void @sync_will_return(i8* [[TMP1]]) [[ATTR6]] -; IS________NPM-NEXT: tail call void @free(i8* nocapture [[TMP1]]) ; IS________NPM-NEXT: ret void ; %1 = tail call noalias i8* @malloc(i64 4) @@ -739,10 +738,9 @@ define void @test16c(i8 %v, i8** %P) { ; ; IS________NPM-LABEL: define {{[^@]+}}@test16c ; IS________NPM-SAME: (i8 [[V:%.*]], i8** nocapture writeonly [[P:%.*]]) { -; IS________NPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 noundef 4) +; IS________NPM-NEXT: [[TMP1:%.*]] = alloca i8, i64 4, align 1 ; IS________NPM-NEXT: store i8* [[TMP1]], i8** [[P]], align 8 ; IS________NPM-NEXT: tail call void @no_sync_func(i8* nocapture nofree [[TMP1]]) [[ATTR6]] -; IS________NPM-NEXT: tail call void @free(i8* nocapture [[TMP1]]) ; IS________NPM-NEXT: ret void ; %1 = tail call noalias i8* @malloc(i64 4) diff --git a/llvm/test/Transforms/Attributor/internalize.ll b/llvm/test/Transforms/Attributor/internalize.ll index 8a244b5c998c3..3e485382e9be0 100644 --- a/llvm/test/Transforms/Attributor/internalize.ll +++ b/llvm/test/Transforms/Attributor/internalize.ll @@ -12,16 +12,14 @@ ; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=8 -attributor-allow-deep-wrappers -disable-inlining -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM,CHECK_ENABLED,NOT_CGSCC_OPM_ENABLED,NOT_CGSCC_NPM_ENABLED,NOT_TUNIT_OPM_ENABLED,IS__TUNIT_____ENABLED,IS________NPM_ENABLED,IS__TUNIT_NPM_ENABLED ; RUN: opt -attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -attributor-allow-deep-wrappers -disable-inlining -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM,CHECK_ENABLED,NOT_TUNIT_NPM_ENABLED,NOT_TUNIT_OPM_ENABLED,NOT_CGSCC_NPM_ENABLED,IS__CGSCC_____ENABLED,IS________OPM_ENABLED,IS__CGSCC_OPM_ENABLED ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc 
-attributor-manifest-internal -attributor-annotate-decl-cs -attributor-allow-deep-wrappers -disable-inlining -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM,CHECK_ENABLED,NOT_TUNIT_NPM_ENABLED,NOT_TUNIT_OPM_ENABLED,NOT_CGSCC_OPM_ENABLED,IS__CGSCC_____ENABLED,IS________NPM_ENABLED,IS__CGSCC_NPM_ENABLED -; RUN: opt -attributor -attributor-cgscc -disable-inlining -attributor-allow-deep-wrappers -S < %s | FileCheck %s --check-prefix=DWRAPPER ; TEST 1: This function is of linkage `linkonce`, we cannot internalize this ; function and use information derived from it ; -; DWRAPPER-NOT: Function Attrs -; DWRAPPER-NOT: inner1.internalized +; CHECK-NOT: inner1.internalized define linkonce i32 @inner1(i32 %a, i32 %b) { ; CHECK-LABEL: define {{[^@]+}}@inner1 -; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] ; CHECK-NEXT: ret i32 [[C]] @@ -34,11 +32,10 @@ entry: ; TEST 2: This function is of linkage `weak`, we cannot internalize this function and ; use information derived from it ; -; DWRAPPER-NOT: Function Attrs -; DWRAPPER-NOT: inner2.internalized +; CHECK-NOT: inner2.internalized define weak i32 @inner2(i32 %a, i32 %b) { ; CHECK-LABEL: define {{[^@]+}}@inner2 -; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] ; CHECK-NEXT: ret i32 [[C]] @@ -51,17 +48,12 @@ entry: ; TEST 3: This function is of linkage `linkonce_odr`, which can be internalized using the ; deep wrapper, and the IP information derived from this function can be used ; -; DWRAPPER: Function Attrs: nofree norecurse nosync nounwind readnone willreturn -; DWRAPPER: define private i32 @inner3.internalized(i32 %a, i32 %b) -; DWRAPPER-NEXT: entry: -; DWRAPPER-NEXT: %c = add i32 %a, %b -; DWRAPPER-NEXT: ret i32 %c define linkonce_odr i32 @inner3(i32 %a, i32 %b) { -; CHECK-LABEL: define {{[^@]+}}@inner3 -; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) -; CHECK-NEXT: entry: -; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] -; CHECK-NEXT: ret i32 [[C]] +; CHECK_DISABLED-LABEL: define {{[^@]+}}@inner3 +; CHECK_DISABLED-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK_DISABLED-NEXT: entry: +; CHECK_DISABLED-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] +; CHECK_DISABLED-NEXT: ret i32 [[C]] ; entry: %c = add i32 %a, %b @@ -71,17 +63,12 @@ entry: ; TEST 4: This function is of linkage `weak_odr`, which can be internalized using the deep ; wrapper ; -; DWRAPPER: Function Attrs: nofree norecurse nosync nounwind readnone willreturn -; DWRAPPER: define private i32 @inner4.internalized(i32 %a, i32 %b) -; DWRAPPER-NEXT: entry: -; DWRAPPER-NEXT: %c = add i32 %a, %b -; DWRAPPER-NEXT: ret i32 %c define weak_odr i32 @inner4(i32 %a, i32 %b) { -; CHECK-LABEL: define {{[^@]+}}@inner4 -; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) -; CHECK-NEXT: entry: -; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] -; CHECK-NEXT: ret i32 [[C]] +; CHECK_DISABLED-LABEL: define {{[^@]+}}@inner4 +; CHECK_DISABLED-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK_DISABLED-NEXT: entry: +; CHECK_DISABLED-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] +; CHECK_DISABLED-NEXT: ret i32 [[C]] ; entry: %c = add i32 %a, %b @@ -91,10 +78,10 @@ entry: ; TEST 5: This function has linkage `linkonce_odr` but is never called (num of use = 0), so there ; is no need to internalize this ; -; DWRAPPER-NOT: inner5.internalized +; 
CHECK-NOT: inner5.internalized define linkonce_odr i32 @inner5(i32 %a, i32 %b) { ; CHECK-LABEL: define {{[^@]+}}@inner5 -; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] ; CHECK-NEXT: ret i32 [[C]] @@ -109,16 +96,8 @@ entry: ; Since the inner3 is internalized, the use of the original function should be replaced by the ; copied one ; -; DWRAPPER-NOT: call i32 @inner1.internalized -; DWRAPPER: call i32 @inner1 -; DWRAPPER-NOT: call i32 @inner2.internalized -; DWRAPPER: call i32 @inner2 -; DWRAPPER-NOT: call i32 @inner3 -; DWRAPPER: call i32 @inner3.internalized -; DWRAPPER-NOT: call i32 @inner4 -; DWRAPPER: call i32 @inner4.internalized define i32 @outer1() { -; CHECK_DISABLED-LABEL: define {{[^@]+}}@outer1() +; CHECK_DISABLED-LABEL: define {{[^@]+}}@outer1() { ; CHECK_DISABLED-NEXT: entry: ; CHECK_DISABLED-NEXT: [[RET1:%.*]] = call i32 @inner1(i32 noundef 1, i32 noundef 2) ; CHECK_DISABLED-NEXT: [[RET2:%.*]] = call i32 @inner2(i32 noundef 1, i32 noundef 2) @@ -126,7 +105,7 @@ define i32 @outer1() { ; CHECK_DISABLED-NEXT: [[RET4:%.*]] = call i32 @inner4(i32 [[RET3]], i32 [[RET3]]) ; CHECK_DISABLED-NEXT: ret i32 [[RET4]] ; -; CHECK_ENABLED-LABEL: define {{[^@]+}}@outer1() +; CHECK_ENABLED-LABEL: define {{[^@]+}}@outer1() { ; CHECK_ENABLED-NEXT: entry: ; CHECK_ENABLED-NEXT: [[RET1:%.*]] = call i32 @inner1(i32 noundef 1, i32 noundef 2) ; CHECK_ENABLED-NEXT: [[RET2:%.*]] = call i32 @inner2(i32 noundef 1, i32 noundef 2) @@ -145,29 +124,38 @@ entry: define linkonce_odr void @unused_arg(i8) { ; CHECK_DISABLED-LABEL: define {{[^@]+}}@unused_arg -; CHECK_DISABLED-SAME: (i8 [[TMP0:%.*]]) +; CHECK_DISABLED-SAME: (i8 [[TMP0:%.*]]) { ; CHECK_DISABLED-NEXT: unreachable ; unreachable } define void @unused_arg_caller() { -; CHECK_DISABLED-LABEL: define {{[^@]+}}@unused_arg_caller() +; CHECK_DISABLED-LABEL: define {{[^@]+}}@unused_arg_caller() { ; CHECK_DISABLED-NEXT: call void @unused_arg(i8 noundef 0) ; CHECK_DISABLED-NEXT: ret void ; ; IS__TUNIT_____ENABLED: Function Attrs: nofree noreturn nosync nounwind readnone willreturn -; IS__TUNIT_____ENABLED-LABEL: define {{[^@]+}}@unused_arg_caller() +; IS__TUNIT_____ENABLED-LABEL: define {{[^@]+}}@unused_arg_caller +; IS__TUNIT_____ENABLED-SAME: () [[ATTR1:#.*]] { ; IS__TUNIT_____ENABLED-NEXT: unreachable ; ; IS__CGSCC_____ENABLED: Function Attrs: nofree norecurse noreturn nosync nounwind readnone willreturn -; IS__CGSCC_____ENABLED-LABEL: define {{[^@]+}}@unused_arg_caller() +; IS__CGSCC_____ENABLED-LABEL: define {{[^@]+}}@unused_arg_caller +; IS__CGSCC_____ENABLED-SAME: () [[ATTR2:#.*]] { ; IS__CGSCC_____ENABLED-NEXT: unreachable -; -; DWRAPPER: Function Attrs: nofree norecurse noreturn nosync nounwind readnone willreturn -; DWRAPPER-LABEL: define {{[^@]+}}@unused_arg_caller() -; DWRAPPER-NEXT: unreachable ; call void @unused_arg(i8 0) ret void } + +; Don't crash on linkonce_odr hidden functions +define linkonce_odr hidden void @__clang_call_terminate() { +; CHECK_DISABLED-LABEL: define {{[^@]+}}@__clang_call_terminate() { +; CHECK_DISABLED-NEXT: call void @__clang_call_terminate() +; CHECK_DISABLED-NEXT: unreachable +; + call void @__clang_call_terminate() + unreachable +} + diff --git a/llvm/test/Transforms/Attributor/liveness.ll b/llvm/test/Transforms/Attributor/liveness.ll index ea36bb5f66e8c..8919cf66cbb9b 100644 --- a/llvm/test/Transforms/Attributor/liveness.ll +++ b/llvm/test/Transforms/Attributor/liveness.ll @@ -854,22 +854,22 @@ define 
internal void @middle() { ; NOT_CGSCC_NPM-NEXT: call void @non_dead_b3() [[ATTR11]] ; NOT_CGSCC_NPM-NEXT: br label [[BB1:%.*]] ; NOT_CGSCC_NPM: bb1: -; NOT_CGSCC_NPM-NEXT: call void @non_dead_b4() [[ATTR2:#.*]] -; NOT_CGSCC_NPM-NEXT: call void @non_dead_b5() [[ATTR2]] -; NOT_CGSCC_NPM-NEXT: call void @non_dead_b6() [[ATTR2]] -; NOT_CGSCC_NPM-NEXT: call void @non_dead_b7() [[ATTR2]] +; NOT_CGSCC_NPM-NEXT: call void @non_dead_b4() [[ATTR11]] +; NOT_CGSCC_NPM-NEXT: call void @non_dead_b5() [[ATTR11]] +; NOT_CGSCC_NPM-NEXT: call void @non_dead_b6() [[ATTR11]] +; NOT_CGSCC_NPM-NEXT: call void @non_dead_b7() [[ATTR11]] ; NOT_CGSCC_NPM-NEXT: br label [[BB2:%.*]] ; NOT_CGSCC_NPM: bb2: -; NOT_CGSCC_NPM-NEXT: call void @non_dead_b8() [[ATTR2]] -; NOT_CGSCC_NPM-NEXT: call void @non_dead_b9() [[ATTR2]] -; NOT_CGSCC_NPM-NEXT: call void @non_dead_b10() [[ATTR2]] -; NOT_CGSCC_NPM-NEXT: call void @non_dead_b11() [[ATTR2]] +; NOT_CGSCC_NPM-NEXT: call void @non_dead_b8() [[ATTR11]] +; NOT_CGSCC_NPM-NEXT: call void @non_dead_b9() [[ATTR11]] +; NOT_CGSCC_NPM-NEXT: call void @non_dead_b10() [[ATTR11]] +; NOT_CGSCC_NPM-NEXT: call void @non_dead_b11() [[ATTR11]] ; NOT_CGSCC_NPM-NEXT: br label [[BB3:%.*]] ; NOT_CGSCC_NPM: bb3: -; NOT_CGSCC_NPM-NEXT: call void @non_dead_b12() [[ATTR2]] -; NOT_CGSCC_NPM-NEXT: call void @non_dead_b13() [[ATTR2]] -; NOT_CGSCC_NPM-NEXT: call void @non_dead_b14() [[ATTR2]] -; NOT_CGSCC_NPM-NEXT: call void @non_dead_b15() [[ATTR2]] +; NOT_CGSCC_NPM-NEXT: call void @non_dead_b12() [[ATTR11]] +; NOT_CGSCC_NPM-NEXT: call void @non_dead_b13() [[ATTR11]] +; NOT_CGSCC_NPM-NEXT: call void @non_dead_b14() [[ATTR11]] +; NOT_CGSCC_NPM-NEXT: call void @non_dead_b15() [[ATTR11]] ; NOT_CGSCC_NPM-NEXT: br label [[BB4:%.*]] ; NOT_CGSCC_NPM: bb4: ; NOT_CGSCC_NPM-NEXT: call void @non_exact2() diff --git a/llvm/test/Transforms/Attributor/misc.ll b/llvm/test/Transforms/Attributor/misc.ll index 3fa65e07a5162..a5c4556ac0417 100644 --- a/llvm/test/Transforms/Attributor/misc.ll +++ b/llvm/test/Transforms/Attributor/misc.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes -; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=6 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM -; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=6 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM +; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=4 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=4 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s 
--check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM ; diff --git a/llvm/test/Transforms/Attributor/noalias.ll b/llvm/test/Transforms/Attributor/noalias.ll index e7e47d42f4566..18bb8e9719d52 100644 --- a/llvm/test/Transforms/Attributor/noalias.ll +++ b/llvm/test/Transforms/Attributor/noalias.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes -; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=9 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM -; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=9 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM -; TODO: The old pass manager cgscc run is disabled as it causes a crash on windows which is under investigation: http://lab.llvm.org:8011/builders/llvm-clang-x86_64-expensive-checks-win/builds/23151 +; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=7 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=7 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM +; TODO: The old pass manager cgscc run is disabled as it causes a crash on windows which is under investigation: http://lab.llvm.org:8011/builders/llvm-clang-x86_64-expensive-checks-win/builds/25479/steps/test-check-all/logs/FAIL%3A%20LLVM%3A%3Anoalias.ll ; opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM @@ -572,29 +572,17 @@ define internal i32 @ret(i32* %arg) { ; Function Attrs: nounwind optsize define internal fastcc double @strtox(i8* %s, i8** %p, i32 %prec) unnamed_addr { -; NOT_CGSCC_NPM-LABEL: define {{[^@]+}}@strtox -; NOT_CGSCC_NPM-SAME: (i8* [[S:%.*]]) unnamed_addr { -; NOT_CGSCC_NPM-NEXT: entry: -; NOT_CGSCC_NPM-NEXT: [[F:%.*]] = alloca [[STRUCT__IO_FILE:%.*]], align 8 -; NOT_CGSCC_NPM-NEXT: [[TMP0:%.*]] = bitcast %struct._IO_FILE* [[F]] to i8* -; NOT_CGSCC_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 144, i8* nocapture noundef nonnull align 8 dereferenceable(240) [[TMP0]]) [[ATTR10:#.*]] -; NOT_CGSCC_NPM-NEXT: [[CALL:%.*]] = call i32 bitcast (i32 (...)* 
@sh_fromstring to i32 (%struct._IO_FILE*, i8*)*)(%struct._IO_FILE* nonnull align 8 dereferenceable(240) [[F]], i8* [[S]]) -; NOT_CGSCC_NPM-NEXT: call void @__shlim(%struct._IO_FILE* noundef nonnull align 8 dereferenceable(240) [[F]], i64 noundef 0) -; NOT_CGSCC_NPM-NEXT: [[CALL1:%.*]] = call double @__floatscan(%struct._IO_FILE* noundef nonnull align 8 dereferenceable(240) [[F]], i32 noundef 1, i32 noundef 1) -; NOT_CGSCC_NPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 144, i8* nocapture noundef nonnull align 8 dereferenceable(240) [[TMP0]]) -; NOT_CGSCC_NPM-NEXT: ret double [[CALL1]] -; -; IS__CGSCC____-LABEL: define {{[^@]+}}@strtox -; IS__CGSCC____-SAME: (i8* noalias [[S:%.*]]) unnamed_addr { -; IS__CGSCC____-NEXT: entry: -; IS__CGSCC____-NEXT: [[F:%.*]] = alloca [[STRUCT__IO_FILE:%.*]], align 8 -; IS__CGSCC____-NEXT: [[TMP0:%.*]] = bitcast %struct._IO_FILE* [[F]] to i8* -; IS__CGSCC____-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 144, i8* nocapture noundef nonnull align 8 dereferenceable(240) [[TMP0]]) [[ATTR10]] -; IS__CGSCC____-NEXT: [[CALL:%.*]] = call i32 bitcast (i32 (...)* @sh_fromstring to i32 (%struct._IO_FILE*, i8*)*)(%struct._IO_FILE* nonnull align 8 dereferenceable(240) [[F]], i8* [[S]]) -; IS__CGSCC____-NEXT: call void @__shlim(%struct._IO_FILE* noundef nonnull align 8 dereferenceable(240) [[F]], i64 noundef 0) -; IS__CGSCC____-NEXT: [[CALL1:%.*]] = call double @__floatscan(%struct._IO_FILE* noundef nonnull align 8 dereferenceable(240) [[F]], i32 noundef 1, i32 noundef 1) -; IS__CGSCC____-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 144, i8* nocapture noundef nonnull align 8 dereferenceable(240) [[TMP0]]) -; IS__CGSCC____-NEXT: ret double [[CALL1]] +; CHECK-LABEL: define {{[^@]+}}@strtox +; CHECK-SAME: (i8* noalias [[S:%.*]]) unnamed_addr { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[F:%.*]] = alloca [[STRUCT__IO_FILE:%.*]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct._IO_FILE* [[F]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 144, i8* nocapture noundef nonnull align 8 dereferenceable(240) [[TMP0]]) [[ATTR10:#.*]] +; CHECK-NEXT: [[CALL:%.*]] = call i32 bitcast (i32 (...)* @sh_fromstring to i32 (%struct._IO_FILE*, i8*)*)(%struct._IO_FILE* nonnull align 8 dereferenceable(240) [[F]], i8* [[S]]) +; CHECK-NEXT: call void @__shlim(%struct._IO_FILE* noundef nonnull align 8 dereferenceable(240) [[F]], i64 noundef 0) +; CHECK-NEXT: [[CALL1:%.*]] = call double @__floatscan(%struct._IO_FILE* noundef nonnull align 8 dereferenceable(240) [[F]], i32 noundef 1, i32 noundef 1) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 144, i8* nocapture noundef nonnull align 8 dereferenceable(240) [[TMP0]]) +; CHECK-NEXT: ret double [[CALL1]] ; entry: %f = alloca %struct._IO_FILE, align 8 diff --git a/llvm/test/Transforms/Attributor/nofree.ll b/llvm/test/Transforms/Attributor/nofree.ll index 6cbaf71a01e39..b459527fe2eda 100644 --- a/llvm/test/Transforms/Attributor/nofree.ll +++ b/llvm/test/Transforms/Attributor/nofree.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes -; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=11 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM -; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal 
-attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=11 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM +; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=3 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=3 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM diff --git a/llvm/test/Transforms/Attributor/noundef.ll b/llvm/test/Transforms/Attributor/noundef.ll index 34142af9ef8cd..211338eefa0b9 100644 --- a/llvm/test/Transforms/Attributor/noundef.ll +++ b/llvm/test/Transforms/Attributor/noundef.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes -; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=3 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM -; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=3 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM +; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=1 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=1 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM diff --git a/llvm/test/Transforms/BDCE/intrinsics.ll b/llvm/test/Transforms/BDCE/intrinsics.ll index 5a186f01fd298..ea0a2289feb2d 100644 --- a/llvm/test/Transforms/BDCE/intrinsics.ll +++ 
b/llvm/test/Transforms/BDCE/intrinsics.ll @@ -8,8 +8,8 @@ declare i8 @llvm.smin.i8(i8, i8) define i8 @umax(i8 %x, i8 %y, i1 %a, i1 %b) { ; CHECK-LABEL: @umax( -; CHECK-NEXT: [[A2:%.*]] = zext i1 [[A:%.*]] to i8 -; CHECK-NEXT: [[B2:%.*]] = zext i1 [[B:%.*]] to i8 +; CHECK-NEXT: [[A2:%.*]] = zext i1 false to i8 +; CHECK-NEXT: [[B2:%.*]] = zext i1 false to i8 ; CHECK-NEXT: [[X2:%.*]] = or i8 [[X:%.*]], [[A2]] ; CHECK-NEXT: [[Y2:%.*]] = or i8 [[Y:%.*]], [[B2]] ; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.umax.i8(i8 [[X2]], i8 [[Y2]]) @@ -27,8 +27,8 @@ define i8 @umax(i8 %x, i8 %y, i1 %a, i1 %b) { define i8 @umin(i8 %x, i8 %y, i1 %a, i1 %b) { ; CHECK-LABEL: @umin( -; CHECK-NEXT: [[A2:%.*]] = zext i1 [[A:%.*]] to i8 -; CHECK-NEXT: [[B2:%.*]] = zext i1 [[B:%.*]] to i8 +; CHECK-NEXT: [[A2:%.*]] = zext i1 false to i8 +; CHECK-NEXT: [[B2:%.*]] = zext i1 false to i8 ; CHECK-NEXT: [[X2:%.*]] = or i8 [[X:%.*]], [[A2]] ; CHECK-NEXT: [[Y2:%.*]] = or i8 [[Y:%.*]], [[B2]] ; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.umin.i8(i8 [[X2]], i8 [[Y2]]) @@ -46,8 +46,8 @@ define i8 @umin(i8 %x, i8 %y, i1 %a, i1 %b) { define i8 @smax(i8 %x, i8 %y, i1 %a, i1 %b) { ; CHECK-LABEL: @smax( -; CHECK-NEXT: [[A2:%.*]] = zext i1 [[A:%.*]] to i8 -; CHECK-NEXT: [[B2:%.*]] = zext i1 [[B:%.*]] to i8 +; CHECK-NEXT: [[A2:%.*]] = zext i1 false to i8 +; CHECK-NEXT: [[B2:%.*]] = zext i1 false to i8 ; CHECK-NEXT: [[X2:%.*]] = or i8 [[X:%.*]], [[A2]] ; CHECK-NEXT: [[Y2:%.*]] = or i8 [[Y:%.*]], [[B2]] ; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.smax.i8(i8 [[X2]], i8 [[Y2]]) @@ -65,8 +65,8 @@ define i8 @smax(i8 %x, i8 %y, i1 %a, i1 %b) { define i8 @smin(i8 %x, i8 %y, i1 %a, i1 %b) { ; CHECK-LABEL: @smin( -; CHECK-NEXT: [[A2:%.*]] = zext i1 [[A:%.*]] to i8 -; CHECK-NEXT: [[B2:%.*]] = zext i1 [[B:%.*]] to i8 +; CHECK-NEXT: [[A2:%.*]] = zext i1 false to i8 +; CHECK-NEXT: [[B2:%.*]] = zext i1 false to i8 ; CHECK-NEXT: [[X2:%.*]] = or i8 [[X:%.*]], [[A2]] ; CHECK-NEXT: [[Y2:%.*]] = or i8 [[Y:%.*]], [[B2]] ; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.smin.i8(i8 [[X2]], i8 [[Y2]]) diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/recursively-delete-dead-instructions.ll b/llvm/test/Transforms/CodeGenPrepare/X86/recursively-delete-dead-instructions.ll new file mode 100644 index 0000000000000..0366b7d7e6d2e --- /dev/null +++ b/llvm/test/Transforms/CodeGenPrepare/X86/recursively-delete-dead-instructions.ll @@ -0,0 +1,27 @@ +; RUN: opt -codegenprepare -S -mtriple=x86_64-linux < %s | FileCheck %s + +declare void @llvm.assume(i1 noundef) nounwind willreturn + +; Recursively deleting dead operands of assume() may result in its next +; instruction being deleted and the iterator pointing to the next instruction +; being invalidated. 
This breaks the following simple loop in +; CodeGenPrepare::optimizeBlock() unless CurInstIterator is fixed up: +; +; CurInstIterator = BB.begin(); +; while (CurInstIterator != BB.end()) +; optimizeInst(&*CurInstIterator++, ModifiedDT); +; +define i32 @test_assume_in_loop(i1 %cond1, i1 %cond2) { +; CHECK-LABEL: @test_assume_in_loop( +; CHECK-NEXT: entry: +entry: + br label %loop + +; CHECK: loop: +; CHECK-NEXT: br label %loop +loop: + %cond3 = phi i1 [%cond1, %entry], [%cond4, %loop] + call void @llvm.assume(i1 %cond3) + %cond4 = icmp ult i1 %cond1, %cond2 + br label %loop +} diff --git a/llvm/test/Transforms/ConstraintElimination/dom.ll b/llvm/test/Transforms/ConstraintElimination/dom.ll new file mode 100644 index 0000000000000..8002697352448 --- /dev/null +++ b/llvm/test/Transforms/ConstraintElimination/dom.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -constraint-elimination -S %s | FileCheck %s + +; Test cases where both the true and false successors reach the same block, +; dominated by one of them. + +declare void @use(i1) + +define i32 @test1(i32 %x) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp ule i32 [[X:%.*]], 10 +; CHECK-NEXT: br i1 [[C_1]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[C_2:%.*]] = icmp ule i32 [[X]], 10 +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: br label [[BB2]] +; CHECK: bb2: +; CHECK-NEXT: [[C_3:%.*]] = icmp ugt i32 [[X]], 10 +; CHECK-NEXT: call void @use(i1 [[C_3]]) +; CHECK-NEXT: ret i32 20 +; +entry: + %c.1 = icmp ule i32 %x, 10 + br i1 %c.1, label %bb1, label %bb2 + +bb1: + %c.2 = icmp ule i32 %x, 10 + call void @use(i1 %c.2) + br label %bb2 + +bb2: + %c.3 = icmp ugt i32 %x, 10 + call void @use(i1 %c.3) + ret i32 20 +} + + +define i32 @test2(i32 %x) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp ule i32 [[X:%.*]], 10 +; CHECK-NEXT: br i1 [[C_1]], label [[BB2:%.*]], label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[C_2:%.*]] = icmp ugt i32 [[X]], 10 +; CHECK-NEXT: call void @use(i1 [[C_2]]) +; CHECK-NEXT: ret i32 20 +; CHECK: bb2: +; CHECK-NEXT: [[C_3:%.*]] = icmp ule i32 [[X]], 10 +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: br label [[BB1]] +; +entry: + %c.1 = icmp ule i32 %x, 10 + br i1 %c.1, label %bb2, label %bb1 + +bb1: + %c.2 = icmp ugt i32 %x, 10 + call void @use(i1 %c.2) + ret i32 20 + +bb2: + %c.3 = icmp ule i32 %x, 10 + call void @use(i1 %c.3) + br label %bb1 +} + + +; Test cases where the true/false successors are not dominated by the conditional branching block. 
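+; In @test3 and @test4 below, the conditional block %bb.cond does not +; dominate the common successor that is also reachable directly from %entry: +; in @test3, %bb1 is reached both from %entry and from the true edge of %c.1, +; so %c.1 cannot be assumed in %bb1 and %c.2 is left unsimplified, while %bb2 +; is only reached via the false edge of %c.1 and %c.3 folds to true. @test4 +; is the mirror case, folding %c.2 in %bb1 and leaving %c.3 in %bb2 alone. 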
+define i32 @test3(i32 %x, i1 %c) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB_COND:%.*]], label [[BB1:%.*]] +; CHECK: bb.cond: +; CHECK-NEXT: [[C_1:%.*]] = icmp ule i32 [[X:%.*]], 10 +; CHECK-NEXT: br i1 [[C_1]], label [[BB1]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[C_2:%.*]] = icmp ule i32 [[X]], 10 +; CHECK-NEXT: call void @use(i1 [[C_2]]) +; CHECK-NEXT: ret i32 10 +; CHECK: bb2: +; CHECK-NEXT: [[C_3:%.*]] = icmp ugt i32 [[X]], 10 +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: ret i32 20 +; +entry: + br i1 %c, label %bb.cond, label %bb1 + +bb.cond: + %c.1 = icmp ule i32 %x, 10 + br i1 %c.1, label %bb1, label %bb2 + +bb1: + %c.2 = icmp ule i32 %x, 10 + call void @use(i1 %c.2) + ret i32 10 + +bb2: + %c.3 = icmp ugt i32 %x, 10 + call void @use(i1 %c.3) + ret i32 20 +} + +define i32 @test4(i32 %x, i1 %c) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB_COND:%.*]], label [[BB2:%.*]] +; CHECK: bb.cond: +; CHECK-NEXT: [[C_1:%.*]] = icmp ule i32 [[X:%.*]], 10 +; CHECK-NEXT: br i1 [[C_1]], label [[BB1:%.*]], label [[BB2]] +; CHECK: bb1: +; CHECK-NEXT: [[C_2:%.*]] = icmp ule i32 [[X]], 10 +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: ret i32 10 +; CHECK: bb2: +; CHECK-NEXT: [[C_3:%.*]] = icmp ugt i32 [[X]], 10 +; CHECK-NEXT: call void @use(i1 [[C_3]]) +; CHECK-NEXT: ret i32 20 +; +entry: + br i1 %c, label %bb.cond, label %bb2 + +bb.cond: + %c.1 = icmp ule i32 %x, 10 + br i1 %c.1, label %bb1, label %bb2 + +bb1: + %c.2 = icmp ule i32 %x, 10 + call void @use(i1 %c.2) + ret i32 10 + +bb2: + %c.3 = icmp ugt i32 %x, 10 + call void @use(i1 %c.3) + ret i32 20 +} diff --git a/llvm/test/Transforms/ConstraintElimination/geps.2d.ll b/llvm/test/Transforms/ConstraintElimination/geps.2d.ll new file mode 100644 index 0000000000000..35ffadbd85ea1 --- /dev/null +++ b/llvm/test/Transforms/ConstraintElimination/geps.2d.ll @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -constraint-elimination -S %s | FileCheck %s + +define void @test.not.uge.ult([10 x i8]* %start, i8* %low, i8* %high) { +; CHECK-LABEL: @test.not.uge.ult( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[START:%.*]], i64 1, i64 3 +; CHECK-NEXT: [[C_1:%.*]] = icmp uge i8* [[ADD_PTR_I]], [[HIGH:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret void +; CHECK: if.end: +; CHECK-NEXT: [[START_0:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[START]], i64 10, i64 0 +; CHECK-NEXT: [[C_0:%.*]] = icmp ult i8* [[START_0]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 [[C_0]]) +; CHECK-NEXT: ret void +; +entry: + %add.ptr.i = getelementptr inbounds [10 x i8], [10 x i8]* %start, i64 1, i64 3 + %c.1 = icmp uge i8* %add.ptr.i, %high + br i1 %c.1, label %if.then, label %if.end + +if.then: ; preds = %entry + ret void + +if.end: ; preds = %entry + %start.0 = getelementptr inbounds [10 x i8], [10 x i8]* %start, i64 10, i64 0 + %c.0 = icmp ult i8* %start.0, %high + call void @use(i1 %c.0) + ret void +} + +define void @test.not.uge.ule([10 x i8]* %start, i8* %low, i8* %high) { +; CHECK-LABEL: @test.not.uge.ule( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[START:%.*]], i64 1, i64 3 +; CHECK-NEXT: [[C:%.*]] = icmp uge i8* [[ADD_PTR_I]], [[HIGH:%.*]] +; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; 
CHECK: if.then: +; CHECK-NEXT: ret void +; CHECK: if.end: +; CHECK-NEXT: [[START_0:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[START]], i64 10, i64 0 +; CHECK-NEXT: [[C_0:%.*]] = icmp ule i8* [[START_0]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 [[C_0]]) +; CHECK-NEXT: [[START_1:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[START]], i64 2, i64 1 +; CHECK-NEXT: [[C_1:%.*]] = icmp ule i8* [[START_1]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 [[C_1]]) +; CHECK-NEXT: ret void +; +entry: + %add.ptr.i = getelementptr inbounds [10 x i8], [10 x i8]* %start, i64 1, i64 3 + %c = icmp uge i8* %add.ptr.i, %high + br i1 %c, label %if.then, label %if.end + +if.then: ; preds = %entry + ret void + +if.end: ; preds = %entry + %start.0 = getelementptr inbounds [10 x i8], [10 x i8]* %start, i64 10, i64 0 + %c.0 = icmp ule i8* %start.0, %high + call void @use(i1 %c.0) + %start.1 = getelementptr inbounds [10 x i8], [10 x i8]* %start, i64 2, i64 1 + %c.1 = icmp ule i8* %start.1, %high + call void @use(i1 %c.1) + ret void +} + +define void @test.not.uge.ugt([10 x i8]* %start, i8* %low, i8* %high) { +; CHECK-LABEL: @test.not.uge.ugt( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[START:%.*]], i64 1, i64 3 +; CHECK-NEXT: [[C:%.*]] = icmp uge i8* [[ADD_PTR_I]], [[HIGH:%.*]] +; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret void +; CHECK: if.end: +; CHECK-NEXT: [[START_0:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[START]], i64 3, i64 0 +; CHECK-NEXT: [[C_0:%.*]] = icmp ugt i8* [[START_0]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 [[C_0]]) +; CHECK-NEXT: [[START_1:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[START]], i64 3, i64 1 +; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8* [[START_1]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 [[C_1]]) +; CHECK-NEXT: ret void +; +entry: + %add.ptr.i = getelementptr inbounds [10 x i8], [10 x i8]* %start, i64 1, i64 3 + %c = icmp uge i8* %add.ptr.i, %high + br i1 %c, label %if.then, label %if.end + +if.then: ; preds = %entry + ret void + +if.end: ; preds = %entry + %start.0 = getelementptr inbounds [10 x i8], [10 x i8]* %start, i64 3, i64 0 + %c.0 = icmp ugt i8* %start.0, %high + call void @use(i1 %c.0) + + %start.1 = getelementptr inbounds [10 x i8], [10 x i8]* %start, i64 3, i64 1 + %c.1 = icmp ugt i8* %start.1, %high + call void @use(i1 %c.1) + ret void +} + +define void @test.not.uge.uge([10 x i8]* %start, i8* %low, i8* %high) { +; CHECK-LABEL: @test.not.uge.uge( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[START:%.*]], i64 1, i64 3 +; CHECK-NEXT: [[C_1:%.*]] = icmp uge i8* [[ADD_PTR_I]], [[HIGH:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret void +; CHECK: if.end: +; CHECK-NEXT: [[START_0:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[START]], i64 3, i64 0 +; CHECK-NEXT: [[C_0:%.*]] = icmp uge i8* [[START_0]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 [[C_0]]) +; CHECK-NEXT: ret void +; +entry: + %add.ptr.i = getelementptr inbounds [10 x i8], [10 x i8]* %start, i64 1, i64 3 + %c.1 = icmp uge i8* %add.ptr.i, %high + br i1 %c.1, label %if.then, label %if.end + +if.then: ; preds = %entry + ret void + +if.end: ; preds = %entry + %start.0 = getelementptr inbounds [10 x i8], [10 x i8]* %start, i64 3, i64 0 + %c.0 = icmp uge i8* %start.0, %high + call void @use(i1 %c.0) + + ret void +} + +declare 
void @use(i1) diff --git a/llvm/test/Transforms/ConstraintElimination/geps.ll b/llvm/test/Transforms/ConstraintElimination/geps.ll new file mode 100644 index 0000000000000..46763c08b3820 --- /dev/null +++ b/llvm/test/Transforms/ConstraintElimination/geps.ll @@ -0,0 +1,332 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -constraint-elimination -S %s | FileCheck %s + +define i32 @test.ult(i32* readonly %src, i32* readnone %min, i32* readnone %max) { +; CHECK-LABEL: @test.ult( +; CHECK-NEXT: check.0.min: +; CHECK-NEXT: [[C_MIN_0:%.*]] = icmp ult i32* [[SRC:%.*]], [[MIN:%.*]] +; CHECK-NEXT: br i1 [[C_MIN_0]], label [[TRAP:%.*]], label [[CHECK_0_MAX:%.*]] +; CHECK: trap: +; CHECK-NEXT: ret i32 10 +; CHECK: check.0.max: +; CHECK-NEXT: [[C_MAX_0:%.*]] = icmp ult i32* [[SRC]], [[MAX:%.*]] +; CHECK-NEXT: br i1 [[C_MAX_0]], label [[CHECK_3_MIN:%.*]], label [[TRAP]] +; CHECK: check.3.min: +; CHECK-NEXT: [[L0:%.*]] = load i32, i32* [[SRC]], align 4 +; CHECK-NEXT: [[ADD_PTR_I36:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[C_3_MIN:%.*]] = icmp ult i32* [[ADD_PTR_I36]], [[MIN]] +; CHECK-NEXT: br i1 false, label [[TRAP]], label [[CHECK_3_MAX:%.*]] +; CHECK: check.3.max: +; CHECK-NEXT: [[C_3_MAX:%.*]] = icmp ult i32* [[ADD_PTR_I36]], [[MAX]] +; CHECK-NEXT: br i1 [[C_3_MAX]], label [[CHECK_1_MIN:%.*]], label [[TRAP]] +; CHECK: check.1.min: +; CHECK-NEXT: [[L1:%.*]] = load i32, i32* [[ADD_PTR_I36]], align 4 +; CHECK-NEXT: [[ADD_PTR_I29:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1 +; CHECK-NEXT: [[C_1_MIN:%.*]] = icmp ult i32* [[ADD_PTR_I29]], [[MIN]] +; CHECK-NEXT: br i1 false, label [[TRAP]], label [[CHECK_1_MAX:%.*]] +; CHECK: check.1.max: +; CHECK-NEXT: [[C_1_MAX:%.*]] = icmp ult i32* [[ADD_PTR_I29]], [[MAX]] +; CHECK-NEXT: br i1 true, label [[CHECK_2_MIN:%.*]], label [[TRAP]] +; CHECK: check.2.min: +; CHECK-NEXT: [[L2:%.*]] = load i32, i32* [[ADD_PTR_I29]], align 4 +; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 +; CHECK-NEXT: [[C_2_MIN:%.*]] = icmp ult i32* [[ADD_PTR_I]], [[MIN]] +; CHECK-NEXT: br i1 false, label [[TRAP]], label [[CHECK_2_MAX:%.*]] +; CHECK: check.2.max: +; CHECK-NEXT: [[C_2_MAX:%.*]] = icmp ult i32* [[ADD_PTR_I]], [[MAX]] +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[TRAP]] +; CHECK: exit: +; CHECK-NEXT: [[L3:%.*]] = load i32, i32* [[ADD_PTR_I]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[L1]], [[L0]] +; CHECK-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD]], [[L2]] +; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[ADD8]], [[L3]] +; CHECK-NEXT: ret i32 [[ADD9]] +; +check.0.min: + %c.min.0 = icmp ult i32* %src, %min + br i1 %c.min.0, label %trap, label %check.0.max + +trap: ; preds = %check.2.max, %check.2.min, %check.1.max, %check.1.min, %check.3.max, %check.3.min, %check.0.max, %check.0.min + ret i32 10 + +check.0.max: ; preds = %check.0.min + %c.max.0 = icmp ult i32* %src, %max + br i1 %c.max.0, label %check.3.min, label %trap + +check.3.min: ; preds = %check.0.max + %l0 = load i32, i32* %src, align 4 + %add.ptr.i36 = getelementptr inbounds i32, i32* %src, i64 3 + %c.3.min = icmp ult i32* %add.ptr.i36, %min + br i1 %c.3.min, label %trap, label %check.3.max + +check.3.max: ; preds = %check.3.min + %c.3.max = icmp ult i32* %add.ptr.i36, %max + br i1 %c.3.max, label %check.1.min, label %trap + +check.1.min: ; preds = %check.3.max + %l1 = load i32, i32* %add.ptr.i36, align 4 + %add.ptr.i29 = getelementptr inbounds i32, i32* %src, i64 1 + %c.1.min = icmp ult i32* 
%add.ptr.i29, %min + br i1 %c.1.min, label %trap, label %check.1.max + +check.1.max: ; preds = %check.1.min + %c.1.max = icmp ult i32* %add.ptr.i29, %max + br i1 %c.1.max, label %check.2.min, label %trap + +check.2.min: ; preds = %check.1.max + %l2 = load i32, i32* %add.ptr.i29, align 4 + %add.ptr.i = getelementptr inbounds i32, i32* %src, i64 2 + %c.2.min = icmp ult i32* %add.ptr.i, %min + br i1 %c.2.min, label %trap, label %check.2.max + +check.2.max: ; preds = %check.2.min + %c.2.max = icmp ult i32* %add.ptr.i, %max + br i1 %c.2.max, label %exit, label %trap + +exit: ; preds = %check.2.max + %l3 = load i32, i32* %add.ptr.i, align 4 + %add = add nsw i32 %l1, %l0 + %add8 = add nsw i32 %add, %l2 + %add9 = add nsw i32 %add8, %l3 + ret i32 %add9 +} + +define void @test.not.uge.ult(i8* %start, i8* %low, i8* %high) { +; CHECK-LABEL: @test.not.uge.ult( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds i8, i8* [[START:%.*]], i64 3 +; CHECK-NEXT: [[C_1:%.*]] = icmp uge i8* [[ADD_PTR_I]], [[HIGH:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret void +; CHECK: if.end: +; CHECK-NEXT: [[T_0:%.*]] = icmp ult i8* [[START]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[START_1:%.*]] = getelementptr inbounds i8, i8* [[START]], i64 1 +; CHECK-NEXT: [[T_1:%.*]] = icmp ult i8* [[START_1]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[START_2:%.*]] = getelementptr inbounds i8, i8* [[START]], i64 2 +; CHECK-NEXT: [[T_2:%.*]] = icmp ult i8* [[START_2]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[START_3:%.*]] = getelementptr inbounds i8, i8* [[START]], i64 3 +; CHECK-NEXT: [[T_3:%.*]] = icmp ult i8* [[START_3]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[START_4:%.*]] = getelementptr inbounds i8, i8* [[START]], i64 4 +; CHECK-NEXT: [[C_4:%.*]] = icmp ult i8* [[START_4]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 [[C_4]]) +; CHECK-NEXT: ret void +; +entry: + %add.ptr.i = getelementptr inbounds i8, i8* %start, i64 3 + %c.1 = icmp uge i8* %add.ptr.i, %high + br i1 %c.1, label %if.then, label %if.end + +if.then: ; preds = %entry + ret void + +if.end: ; preds = %entry + %t.0 = icmp ult i8* %start, %high + call void @use(i1 %t.0) + %start.1 = getelementptr inbounds i8, i8* %start, i64 1 + %t.1 = icmp ult i8* %start.1, %high + call void @use(i1 %t.1) + %start.2 = getelementptr inbounds i8, i8* %start, i64 2 + %t.2 = icmp ult i8* %start.2, %high + call void @use(i1 %t.2) + %start.3 = getelementptr inbounds i8, i8* %start, i64 3 + %t.3 = icmp ult i8* %start.3, %high + call void @use(i1 %t.3) + %start.4 = getelementptr inbounds i8, i8* %start, i64 4 + %c.4 = icmp ult i8* %start.4, %high + call void @use(i1 %c.4) + ret void +} + +define void @test.not.uge.ule(i8* %start, i8* %low, i8* %high) { +; CHECK-LABEL: @test.not.uge.ule( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds i8, i8* [[START:%.*]], i64 3 +; CHECK-NEXT: [[C_1:%.*]] = icmp uge i8* [[ADD_PTR_I]], [[HIGH:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret void +; CHECK: if.end: +; CHECK-NEXT: [[T_0:%.*]] = icmp ule i8* [[START]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[START_1:%.*]] = getelementptr inbounds i8, i8* [[START]], i64 1 +; CHECK-NEXT: [[T_1:%.*]] = icmp ule i8* [[START_1]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: 
[[START_2:%.*]] = getelementptr inbounds i8, i8* [[START]], i64 2 +; CHECK-NEXT: [[T_2:%.*]] = icmp ule i8* [[START_2]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[START_3:%.*]] = getelementptr inbounds i8, i8* [[START]], i64 3 +; CHECK-NEXT: [[T_3:%.*]] = icmp ule i8* [[START_3]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[START_4:%.*]] = getelementptr inbounds i8, i8* [[START]], i64 4 +; CHECK-NEXT: [[T_4:%.*]] = icmp ule i8* [[START_4]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[START_5:%.*]] = getelementptr inbounds i8, i8* [[START]], i64 5 +; CHECK-NEXT: [[C_5:%.*]] = icmp ule i8* [[START_5]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 [[C_5]]) +; CHECK-NEXT: ret void +; +entry: + %add.ptr.i = getelementptr inbounds i8, i8* %start, i64 3 + %c.1 = icmp uge i8* %add.ptr.i, %high + br i1 %c.1, label %if.then, label %if.end + +if.then: ; preds = %entry + ret void + +if.end: ; preds = %entry + %t.0 = icmp ule i8* %start, %high + call void @use(i1 %t.0) + %start.1 = getelementptr inbounds i8, i8* %start, i64 1 + %t.1 = icmp ule i8* %start.1, %high + call void @use(i1 %t.1) + %start.2 = getelementptr inbounds i8, i8* %start, i64 2 + %t.2 = icmp ule i8* %start.2, %high + call void @use(i1 %t.2) + %start.3 = getelementptr inbounds i8, i8* %start, i64 3 + %t.3 = icmp ule i8* %start.3, %high + call void @use(i1 %t.3) + %start.4 = getelementptr inbounds i8, i8* %start, i64 4 + %t.4 = icmp ule i8* %start.4, %high + call void @use(i1 %t.4) + + %start.5 = getelementptr inbounds i8, i8* %start, i64 5 + %c.5 = icmp ule i8* %start.5, %high + call void @use(i1 %c.5) + + ret void +} + +define void @test.not.uge.ugt(i8* %start, i8* %low, i8* %high) { +; CHECK-LABEL: @test.not.uge.ugt( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds i8, i8* [[START:%.*]], i64 3 +; CHECK-NEXT: [[C_1:%.*]] = icmp uge i8* [[ADD_PTR_I]], [[HIGH:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret void +; CHECK: if.end: +; CHECK-NEXT: [[F_0:%.*]] = icmp ugt i8* [[START]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: [[START_1:%.*]] = getelementptr inbounds i8, i8* [[START]], i64 1 +; CHECK-NEXT: [[F_1:%.*]] = icmp ugt i8* [[START_1]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: [[START_2:%.*]] = getelementptr inbounds i8, i8* [[START]], i64 2 +; CHECK-NEXT: [[F_2:%.*]] = icmp ugt i8* [[START_2]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: [[START_3:%.*]] = getelementptr inbounds i8, i8* [[START]], i64 3 +; CHECK-NEXT: [[F_3:%.*]] = icmp ugt i8* [[START_3]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: [[START_4:%.*]] = getelementptr inbounds i8, i8* [[START]], i64 4 +; CHECK-NEXT: [[F_4:%.*]] = icmp ugt i8* [[START_4]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: [[START_5:%.*]] = getelementptr inbounds i8, i8* [[START]], i64 5 +; CHECK-NEXT: [[C_5:%.*]] = icmp ugt i8* [[START_5]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 [[C_5]]) +; CHECK-NEXT: ret void +; +entry: + %add.ptr.i = getelementptr inbounds i8, i8* %start, i64 3 + %c.1 = icmp uge i8* %add.ptr.i, %high + br i1 %c.1, label %if.then, label %if.end + +if.then: ; preds = %entry + ret void + +if.end: ; preds = %entry + %f.0 = icmp ugt i8* %start, %high + call void @use(i1 %f.0) + + %start.1 = getelementptr inbounds i8, i8* %start, i64 1 + %f.1 = icmp ugt i8* %start.1, %high + call void @use(i1 %f.1) + + %start.2 = 
getelementptr inbounds i8, i8* %start, i64 2 + %f.2 = icmp ugt i8* %start.2, %high + call void @use(i1 %f.2) + + %start.3 = getelementptr inbounds i8, i8* %start, i64 3 + %f.3 = icmp ugt i8* %start.3, %high + call void @use(i1 %f.3) + + %start.4 = getelementptr inbounds i8, i8* %start, i64 4 + %f.4 = icmp ugt i8* %start.4, %high + call void @use(i1 %f.4) + + %start.5 = getelementptr inbounds i8, i8* %start, i64 5 + %c.5 = icmp ugt i8* %start.5, %high + call void @use(i1 %c.5) + + ret void +} + +define void @test.not.uge.uge(i8* %start, i8* %low, i8* %high) { +; CHECK-LABEL: @test.not.uge.uge( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds i8, i8* [[START:%.*]], i64 3 +; CHECK-NEXT: [[C_1:%.*]] = icmp uge i8* [[ADD_PTR_I]], [[HIGH:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret void +; CHECK: if.end: +; CHECK-NEXT: [[F_0:%.*]] = icmp ugt i8* [[START]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: [[START_1:%.*]] = getelementptr inbounds i8, i8* [[START]], i64 1 +; CHECK-NEXT: [[F_1:%.*]] = icmp uge i8* [[START_1]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: [[START_2:%.*]] = getelementptr inbounds i8, i8* [[START]], i64 2 +; CHECK-NEXT: [[F_2:%.*]] = icmp uge i8* [[START_2]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: [[START_3:%.*]] = getelementptr inbounds i8, i8* [[START]], i64 3 +; CHECK-NEXT: [[F_3:%.*]] = icmp uge i8* [[START_3]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: [[START_4:%.*]] = getelementptr inbounds i8, i8* [[START]], i64 4 +; CHECK-NEXT: [[C_4:%.*]] = icmp uge i8* [[START_4]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 [[C_4]]) +; CHECK-NEXT: [[START_5:%.*]] = getelementptr inbounds i8, i8* [[START]], i64 5 +; CHECK-NEXT: [[C_5:%.*]] = icmp uge i8* [[START_5]], [[HIGH]] +; CHECK-NEXT: call void @use(i1 [[C_5]]) +; CHECK-NEXT: ret void +; +entry: + %add.ptr.i = getelementptr inbounds i8, i8* %start, i64 3 + %c.1 = icmp uge i8* %add.ptr.i, %high + br i1 %c.1, label %if.then, label %if.end + +if.then: ; preds = %entry + ret void + +if.end: ; preds = %entry + %f.0 = icmp ugt i8* %start, %high + call void @use(i1 %f.0) + + %start.1 = getelementptr inbounds i8, i8* %start, i64 1 + %f.1 = icmp uge i8* %start.1, %high + call void @use(i1 %f.1) + + %start.2 = getelementptr inbounds i8, i8* %start, i64 2 + %f.2 = icmp uge i8* %start.2, %high + call void @use(i1 %f.2) + + %start.3 = getelementptr inbounds i8, i8* %start, i64 3 + %f.3 = icmp uge i8* %start.3, %high + call void @use(i1 %f.3) + + %start.4 = getelementptr inbounds i8, i8* %start, i64 4 + %c.4 = icmp uge i8* %start.4, %high + call void @use(i1 %c.4) + + %start.5 = getelementptr inbounds i8, i8* %start, i64 5 + %c.5 = icmp uge i8* %start.5, %high + call void @use(i1 %c.5) + + ret void +} + + +declare void @use(i1) +declare void @llvm.trap() diff --git a/llvm/test/Transforms/ConstraintElimination/i128.ll b/llvm/test/Transforms/ConstraintElimination/i128.ll new file mode 100644 index 0000000000000..d021db6aa907f --- /dev/null +++ b/llvm/test/Transforms/ConstraintElimination/i128.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -constraint-elimination -S %s | FileCheck %s + +declare void @use(i1) + +define void @test_unsigned_too_large(i128 %x) { +; CHECK-LABEL: @test_unsigned_too_large( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp ule i128 [[X:%.*]], 12345678901234123123123 
+; CHECK-NEXT: br i1 [[C_1]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[C_2:%.*]] = icmp ult i128 [[X]], -12345678901234123123123 +; CHECK-NEXT: call void @use(i1 [[C_2]]) +; CHECK-NEXT: [[C_3:%.*]] = icmp uge i128 [[X]], -12345678901234123123123 +; CHECK-NEXT: call void @use(i1 [[C_3]]) +; CHECK-NEXT: [[C_4:%.*]] = icmp uge i128 [[X]], -12345678901234123123123 +; CHECK-NEXT: call void @use(i1 [[C_4]]) +; CHECK-NEXT: ret void +; CHECK: bb2: +; CHECK-NEXT: ret void +; +entry: + %c.1 = icmp ule i128 %x, 12345678901234123123123 + br i1 %c.1, label %bb1, label %bb2 + +bb1: + %c.2 = icmp ult i128 %x, -12345678901234123123123 + call void @use(i1 %c.2) + %c.3 = icmp uge i128 %x, -12345678901234123123123 + call void @use(i1 %c.3) + %c.4 = icmp uge i128 %x, -12345678901234123123123 + call void @use(i1 %c.4) + ret void + +bb2: + ret void +} diff --git a/llvm/test/Transforms/ConstraintElimination/loops.ll b/llvm/test/Transforms/ConstraintElimination/loops.ll new file mode 100644 index 0000000000000..37373e1fbcaf9 --- /dev/null +++ b/llvm/test/Transforms/ConstraintElimination/loops.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -constraint-elimination -S %s | FileCheck %s + +; Make sure conditions in loops are not used to simplify themselves. + +define void @loop1(float* %T, float* %x, i32 %points, i32 %trigint) { +; CHECK-LABEL: @loop1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[POINTS:%.*]] to i64 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR1:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 -8 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[POINTS]], 1 +; CHECK-NEXT: [[IDX_EXT2:%.*]] = sext i32 [[SHR]] to i64 +; CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IDX_EXT2]] +; CHECK-NEXT: [[ADD_PTR4:%.*]] = getelementptr inbounds float, float* [[ADD_PTR3]], i64 -8 +; CHECK-NEXT: br label [[DO_BODY:%.*]] +; CHECK: do.body: +; CHECK-NEXT: [[X2_0:%.*]] = phi float* [ [[ADD_PTR4]], [[ENTRY:%.*]] ], [ [[ADD_PTR106:%.*]], [[DO_BODY]] ] +; CHECK-NEXT: [[X1_0:%.*]] = phi float* [ [[ADD_PTR1]], [[ENTRY]] ], [ [[ADD_PTR105:%.*]], [[DO_BODY]] ] +; CHECK-NEXT: [[ADD_PTR105]] = getelementptr inbounds float, float* [[X1_0]], i64 -8 +; CHECK-NEXT: [[ADD_PTR106]] = getelementptr inbounds float, float* [[X2_0]], i64 -8 +; CHECK-NEXT: [[CMP:%.*]] = icmp uge float* [[ADD_PTR106]], [[X]] +; CHECK-NEXT: br i1 [[CMP]], label [[DO_BODY]], label [[DO_END:%.*]] +; CHECK: do.end: +; CHECK-NEXT: ret void +; +entry: + %idx.ext = sext i32 %points to i64 + %add.ptr = getelementptr inbounds float, float* %x, i64 %idx.ext + %add.ptr1 = getelementptr inbounds float, float* %add.ptr, i64 -8 + %shr = ashr i32 %points, 1 + %idx.ext2 = sext i32 %shr to i64 + %add.ptr3 = getelementptr inbounds float, float* %x, i64 %idx.ext2 + %add.ptr4 = getelementptr inbounds float, float* %add.ptr3, i64 -8 + br label %do.body + +do.body: ; preds = %do.body, %entry + %x2.0 = phi float* [ %add.ptr4, %entry ], [ %add.ptr106, %do.body ] + %x1.0 = phi float* [ %add.ptr1, %entry ], [ %add.ptr105, %do.body ] + %add.ptr105 = getelementptr inbounds float, float* %x1.0, i64 -8 + %add.ptr106 = getelementptr inbounds float, float* %x2.0, i64 -8 + %cmp = icmp uge float* %add.ptr106, %x + br i1 %cmp, label %do.body, label %do.end + +do.end: ; preds = %do.body + ret void +} diff --git a/llvm/test/Transforms/ConstraintElimination/mixed.ll 
b/llvm/test/Transforms/ConstraintElimination/mixed.ll new file mode 100644 index 0000000000000..c0fb37883f71f --- /dev/null +++ b/llvm/test/Transforms/ConstraintElimination/mixed.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -constraint-elimination -S %s | FileCheck %s + +; Make sure we do not incorrectly add variables to the system. + +define i1 @test(i32* %p1, i32* %p2, i32 %num_rows, i32 %start_row, i1 %c) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[NUM_ROWS:%.*]], [[START_ROW:%.*]] +; CHECK-NEXT: [[L3:%.*]] = load i32, i32* [[P1:%.*]], align 4 +; CHECK-NEXT: [[CMP6:%.*]] = icmp ugt i32 [[L3]], [[START_ROW]] +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_END36:%.*]], label [[IF_END36]] +; CHECK: if.end36: +; CHECK-NEXT: [[L1:%.*]] = load i32, i32* [[P2:%.*]], align 4 +; CHECK-NEXT: [[CMP37:%.*]] = icmp ult i32 [[L1]], [[ADD]] +; CHECK-NEXT: br i1 [[CMP37]], label [[IF_THEN39:%.*]], label [[EXIT:%.*]] +; CHECK: if.then39: +; CHECK-NEXT: [[CMP41:%.*]] = icmp ult i32 [[L1]], [[START_ROW]] +; CHECK-NEXT: ret i1 [[CMP41]] +; CHECK: exit: +; CHECK-NEXT: ret i1 false +; +entry: + %add = add i32 %num_rows, %start_row + %l3 = load i32, i32* %p1, align 4 + %cmp6 = icmp ugt i32 %l3, %start_row + br i1 %c, label %if.end36, label %if.end36 + +if.end36: ; preds = %if.then11 + %l1 = load i32, i32* %p2, align 4 + %cmp37 = icmp ult i32 %l1, %add + br i1 %cmp37, label %if.then39, label %exit + +if.then39: ; preds = %if.end36 + %cmp41 = icmp ult i32 %l1, %start_row + ret i1 %cmp41 + +exit: ; preds = %if.end36 + ret i1 false +} diff --git a/llvm/test/Transforms/ConstraintElimination/uge.ll b/llvm/test/Transforms/ConstraintElimination/uge.ll new file mode 100644 index 0000000000000..bacb9a7f3d917 --- /dev/null +++ b/llvm/test/Transforms/ConstraintElimination/uge.ll @@ -0,0 +1,255 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -constraint-elimination -S %s | FileCheck %s + +declare void @use(i1) + +define void @test_1_variable_constraint(i32 %x, i32 %y, i32 %z) { +; CHECK-LABEL: @test_1_variable_constraint( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp uge i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[T_1:%.*]] = icmp uge i32 [[X]], [[Y]] +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[C_2:%.*]] = icmp uge i32 [[X]], 10 +; CHECK-NEXT: call void @use(i1 [[C_2]]) +; CHECK-NEXT: [[C_3:%.*]] = icmp uge i32 [[Y]], [[X]] +; CHECK-NEXT: call void @use(i1 [[C_3]]) +; CHECK-NEXT: [[C_4:%.*]] = icmp uge i32 10, [[X]] +; CHECK-NEXT: call void @use(i1 [[C_4]]) +; CHECK-NEXT: ret void +; CHECK: bb2: +; CHECK-NEXT: [[T_2:%.*]] = icmp uge i32 [[Y]], [[X]] +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[F_1:%.*]] = icmp uge i32 [[X]], [[Y]] +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: [[C_5:%.*]] = icmp uge i32 [[X]], 10 +; CHECK-NEXT: call void @use(i1 [[C_5]]) +; CHECK-NEXT: [[C_6:%.*]] = icmp uge i32 10, [[X]] +; CHECK-NEXT: call void @use(i1 [[C_6]]) +; CHECK-NEXT: ret void +; +entry: + %c.1 = icmp uge i32 %x, %y + br i1 %c.1, label %bb1, label %bb2 + +bb1: + %t.1 = icmp uge i32 %x, %y + call void @use(i1 %t.1) + %c.2 = icmp uge i32 %x, 10 + call void @use(i1 %c.2) + %c.3 = icmp uge i32 %y, %x + call void @use(i1 %c.3) + %c.4 = icmp uge i32 10, %x + call void @use(i1 %c.4) + ret void + +bb2: + %t.2 = icmp uge i32 %y, %x + call void @use(i1 %t.2) + %f.1 = icmp uge i32 
%x, %y + call void @use(i1 %f.1) + %c.5 = icmp uge i32 %x, 10 + call void @use(i1 %c.5) + %c.6 = icmp uge i32 10, %x + call void @use(i1 %c.6) + ret void +} + +define void @test_1_constant_constraint(i32 %x) { +; CHECK-LABEL: @test_1_constant_constraint( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp uge i32 [[X:%.*]], 10 +; CHECK-NEXT: br i1 [[C_1]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[T_1:%.*]] = icmp uge i32 [[X]], 10 +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[T_2:%.*]] = icmp uge i32 [[X]], 9 +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[C_2:%.*]] = icmp uge i32 [[X]], 11 +; CHECK-NEXT: call void @use(i1 [[C_2]]) +; CHECK-NEXT: [[C_4:%.*]] = icmp uge i32 10, [[X]] +; CHECK-NEXT: call void @use(i1 [[C_4]]) +; CHECK-NEXT: ret void +; CHECK: bb2: +; CHECK-NEXT: [[T_3:%.*]] = icmp uge i32 11, [[X]] +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[F_1:%.*]] = icmp uge i32 [[X]], 10 +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: [[F_1_1:%.*]] = icmp uge i32 [[X]], 10 +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: [[C_5:%.*]] = icmp uge i32 [[X]], 9 +; CHECK-NEXT: call void @use(i1 [[C_5]]) +; CHECK-NEXT: [[C_6:%.*]] = icmp uge i32 1, [[X]] +; CHECK-NEXT: call void @use(i1 [[C_6]]) +; CHECK-NEXT: ret void +; +entry: + %c.1 = icmp uge i32 %x, 10 + br i1 %c.1, label %bb1, label %bb2 + +bb1: + %t.1 = icmp uge i32 %x, 10 + call void @use(i1 %t.1) + %t.2 = icmp uge i32 %x, 9 + call void @use(i1 %t.2) + %c.2 = icmp uge i32 %x, 11 + call void @use(i1 %c.2) + %c.4 = icmp uge i32 10, %x + call void @use(i1 %c.4) + ret void + +bb2: + %t.3 = icmp uge i32 11, %x + call void @use(i1 %t.3) + %f.1 = icmp uge i32 %x, 10 + call void @use(i1 %f.1) + + + %f.1.1 = icmp uge i32 %x, 10 + call void @use(i1 %f.1.1) + %c.5 = icmp uge i32 %x, 9 + call void @use(i1 %c.5) + %c.6 = icmp uge i32 1, %x + call void @use(i1 %c.6) + ret void +} + +define i32 @test1(i32 %x, i32 %y, i32 %z) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp uge i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[BB1:%.*]], label [[EXIT:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[C_2:%.*]] = icmp uge i32 [[Y]], [[Z:%.*]] +; CHECK-NEXT: br i1 [[C_2]], label [[BB2:%.*]], label [[EXIT]] +; CHECK: bb2: +; CHECK-NEXT: [[C_3:%.*]] = icmp uge i32 [[X]], [[Z]] +; CHECK-NEXT: br i1 true, label [[BB3:%.*]], label [[EXIT]] +; CHECK: bb3: +; CHECK-NEXT: ret i32 10 +; CHECK: exit: +; CHECK-NEXT: ret i32 20 +; +entry: + %c.1 = icmp uge i32 %x, %y + br i1 %c.1, label %bb1, label %exit + +bb1: + %c.2 = icmp uge i32 %y, %z + br i1 %c.2, label %bb2, label %exit + +bb2: + %c.3 = icmp uge i32 %x, %z + br i1 %c.3, label %bb3, label %exit + +bb3: + ret i32 10 + +exit: + ret i32 20 +} + + +define i32 @test2(i32 %x, i32 %y, i32 %z, i32 %a) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp uge i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[BB1:%.*]], label [[EXIT:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[C_2:%.*]] = icmp uge i32 [[Y]], [[Z:%.*]] +; CHECK-NEXT: br i1 [[C_2]], label [[BB2:%.*]], label [[EXIT]] +; CHECK: bb2: +; CHECK-NEXT: [[C_3:%.*]] = icmp uge i32 [[X]], [[A:%.*]] +; CHECK-NEXT: br i1 [[C_3]], label [[BB3:%.*]], label [[EXIT]] +; CHECK: bb3: +; CHECK-NEXT: ret i32 10 +; CHECK: exit: +; CHECK-NEXT: ret i32 20 +; +entry: + %c.1 = icmp uge i32 %x, %y + br i1 %c.1, label %bb1, label %exit + +bb1: + %c.2 = icmp uge i32 %y, %z + br i1 %c.2, label %bb2, label %exit + +bb2: + %c.3 = icmp uge i32 
%x, %a + br i1 %c.3, label %bb3, label %exit + +bb3: + ret i32 10 + +exit: + ret i32 20 +} + + +define i32 @test3(i32 %x, i32 %y) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp uge i32 [[X:%.*]], 10 +; CHECK-NEXT: br i1 [[C_1]], label [[BB1:%.*]], label [[EXIT:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[C_2:%.*]] = icmp uge i32 [[Y:%.*]], 20 +; CHECK-NEXT: br i1 [[C_2]], label [[BB2:%.*]], label [[EXIT]] +; CHECK: bb2: +; CHECK-NEXT: ret i32 10 +; CHECK: exit: +; CHECK-NEXT: ret i32 20 +; +entry: + %c.1 = icmp uge i32 %x, 10 + br i1 %c.1, label %bb1, label %exit + +bb1: + %c.2 = icmp uge i32 %y, 20 + br i1 %c.2, label %bb2, label %exit + +bb2: + ret i32 10 + +exit: + ret i32 20 +} + +define i32 @test4(i32 %x, i32 %y, i32 %z) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp uge i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[BB1:%.*]], label [[EXIT:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[C_2:%.*]] = icmp uge i32 [[Y]], [[Z:%.*]] +; CHECK-NEXT: br i1 [[C_2]], label [[BB2:%.*]], label [[EXIT]] +; CHECK: bb2: +; CHECK-NEXT: [[T_1:%.*]] = icmp uge i32 [[X]], [[Z]] +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[U_1:%.*]] = icmp eq i32 [[X]], [[Z]] +; CHECK-NEXT: call void @use(i1 [[U_1]]) +; CHECK-NEXT: ret i32 10 +; CHECK: exit: +; CHECK-NEXT: ret i32 20 +; +entry: + %c.1 = icmp uge i32 %x, %y + br i1 %c.1, label %bb1, label %exit + +bb1: + %c.2 = icmp uge i32 %y, %z + br i1 %c.2, label %bb2, label %exit + +bb2: + %t.1 = icmp uge i32 %x, %z + call void @use(i1 %t.1) + %u.1 = icmp eq i32 %x, %z + call void @use(i1 %u.1) + ret i32 10 + + +exit: + ret i32 20 +} + + diff --git a/llvm/test/Transforms/ConstraintElimination/ugt-ule.ll b/llvm/test/Transforms/ConstraintElimination/ugt-ule.ll new file mode 100644 index 0000000000000..cc9eca9a6605f --- /dev/null +++ b/llvm/test/Transforms/ConstraintElimination/ugt-ule.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -constraint-elimination -S %s | FileCheck %s + +declare void @use(i1) + +define void @test(i8* %m, i8* %ptr) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP_1:%.*]] = icmp ult i8* [[M:%.*]], [[PTR:%.*]] +; CHECK-NEXT: br i1 [[CMP_1]], label [[BB_1:%.*]], label [[BB_2:%.*]] +; CHECK: bb.1: +; CHECK-NEXT: [[CMP_2:%.*]] = icmp uge i8* [[M]], [[PTR]] +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: ret void +; CHECK: bb.2: +; CHECK-NEXT: br label [[BB_2_NEXT:%.*]] +; CHECK: bb.2.next: +; CHECK-NEXT: [[CMP_3:%.*]] = icmp uge i8* [[M]], [[PTR]] +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: ret void +; +entry: + %cmp.1 = icmp ult i8* %m, %ptr + br i1 %cmp.1, label %bb.1, label %bb.2 + +bb.1: + %cmp.2 = icmp uge i8* %m, %ptr + call void @use(i1 %cmp.2) + ret void + +bb.2: + br label %bb.2.next + +bb.2.next: + %cmp.3 = icmp uge i8* %m, %ptr + call void @use(i1 %cmp.3) + ret void +} diff --git a/llvm/test/Transforms/ConstraintElimination/ule.ll b/llvm/test/Transforms/ConstraintElimination/ule.ll new file mode 100644 index 0000000000000..c5356550159e3 --- /dev/null +++ b/llvm/test/Transforms/ConstraintElimination/ule.ll @@ -0,0 +1,254 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -constraint-elimination -S %s | FileCheck %s + +declare void @use(i1) + +define void @test_1_variable_constraint(i32 %x, i32 %y, i32 %z) { +; CHECK-LABEL: @test_1_variable_constraint( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp ule i32 [[X:%.*]], 
[[Y:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[T_1:%.*]] = icmp ule i32 [[X]], [[Y]] +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[C_2:%.*]] = icmp ule i32 [[X]], 10 +; CHECK-NEXT: call void @use(i1 [[C_2]]) +; CHECK-NEXT: [[C_3:%.*]] = icmp ule i32 [[Y]], [[X]] +; CHECK-NEXT: call void @use(i1 [[C_3]]) +; CHECK-NEXT: [[C_4:%.*]] = icmp ule i32 10, [[X]] +; CHECK-NEXT: call void @use(i1 [[C_4]]) +; CHECK-NEXT: ret void +; CHECK: bb2: +; CHECK-NEXT: [[T_2:%.*]] = icmp ule i32 [[Y]], [[X]] +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[F_1:%.*]] = icmp ule i32 [[X]], [[Y]] +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: [[C_5:%.*]] = icmp ule i32 [[X]], 10 +; CHECK-NEXT: call void @use(i1 [[C_5]]) +; CHECK-NEXT: [[C_6:%.*]] = icmp ule i32 10, [[X]] +; CHECK-NEXT: call void @use(i1 [[C_6]]) +; CHECK-NEXT: ret void +; +entry: + %c.1 = icmp ule i32 %x, %y + br i1 %c.1, label %bb1, label %bb2 + +bb1: + %t.1 = icmp ule i32 %x, %y + call void @use(i1 %t.1) + %c.2 = icmp ule i32 %x, 10 + call void @use(i1 %c.2) + %c.3 = icmp ule i32 %y, %x + call void @use(i1 %c.3) + %c.4 = icmp ule i32 10, %x + call void @use(i1 %c.4) + ret void + +bb2: + %t.2 = icmp ule i32 %y, %x + call void @use(i1 %t.2) + %f.1 = icmp ule i32 %x, %y + call void @use(i1 %f.1) + %c.5 = icmp ule i32 %x, 10 + call void @use(i1 %c.5) + %c.6 = icmp ule i32 10, %x + call void @use(i1 %c.6) + ret void +} + +define void @test_1_constant_constraint(i32 %x) { +; CHECK-LABEL: @test_1_constant_constraint( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp ule i32 [[X:%.*]], 10 +; CHECK-NEXT: br i1 [[C_1]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[T_1:%.*]] = icmp ule i32 [[X]], 10 +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[T_2:%.*]] = icmp ule i32 [[X]], 11 +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[C_2:%.*]] = icmp ule i32 [[X]], 9 +; CHECK-NEXT: call void @use(i1 [[C_2]]) +; CHECK-NEXT: [[C_4:%.*]] = icmp ule i32 10, [[X]] +; CHECK-NEXT: call void @use(i1 [[C_4]]) +; CHECK-NEXT: ret void +; CHECK: bb2: +; CHECK-NEXT: [[T_3:%.*]] = icmp ule i32 10, [[X]] +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[F_1:%.*]] = icmp ule i32 [[X]], 9 +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: [[F_1_1:%.*]] = icmp ule i32 [[X]], 10 +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: [[C_5:%.*]] = icmp ule i32 [[X]], 11 +; CHECK-NEXT: call void @use(i1 [[C_5]]) +; CHECK-NEXT: [[C_6:%.*]] = icmp ule i32 12, [[X]] +; CHECK-NEXT: call void @use(i1 [[C_6]]) +; CHECK-NEXT: ret void +; +entry: + %c.1 = icmp ule i32 %x, 10 + br i1 %c.1, label %bb1, label %bb2 + +bb1: + %t.1 = icmp ule i32 %x, 10 + call void @use(i1 %t.1) + %t.2 = icmp ule i32 %x, 11 + call void @use(i1 %t.2) + %c.2 = icmp ule i32 %x, 9 + call void @use(i1 %c.2) + %c.4 = icmp ule i32 10, %x + call void @use(i1 %c.4) + ret void + +bb2: + %t.3 = icmp ule i32 10, %x + call void @use(i1 %t.3) + %f.1 = icmp ule i32 %x, 9 + call void @use(i1 %f.1) + + + %f.1.1 = icmp ule i32 %x, 10 + call void @use(i1 %f.1.1) + %c.5 = icmp ule i32 %x, 11 + call void @use(i1 %c.5) + %c.6 = icmp ule i32 12, %x + call void @use(i1 %c.6) + ret void +} + + +define i32 @test1(i32 %x, i32 %y, i32 %z) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp ule i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[BB1:%.*]], label [[EXIT:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[C_2:%.*]] = icmp ule i32 [[Y]], [[Z:%.*]] +; 
CHECK-NEXT: br i1 [[C_2]], label [[BB2:%.*]], label [[EXIT]] +; CHECK: bb2: +; CHECK-NEXT: [[C_3:%.*]] = icmp ule i32 [[X]], [[Z]] +; CHECK-NEXT: br i1 true, label [[BB3:%.*]], label [[EXIT]] +; CHECK: bb3: +; CHECK-NEXT: ret i32 10 +; CHECK: exit: +; CHECK-NEXT: ret i32 20 +; +entry: + %c.1 = icmp ule i32 %x, %y + br i1 %c.1, label %bb1, label %exit + +bb1: + %c.2 = icmp ule i32 %y, %z + br i1 %c.2, label %bb2, label %exit + +bb2: + %c.3 = icmp ule i32 %x, %z + br i1 %c.3, label %bb3, label %exit + +bb3: + ret i32 10 + +exit: + ret i32 20 +} + + +define i32 @test2(i32 %x, i32 %y, i32 %z, i32 %a) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp ule i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[BB1:%.*]], label [[EXIT:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[C_2:%.*]] = icmp ule i32 [[Y]], [[Z:%.*]] +; CHECK-NEXT: br i1 [[C_2]], label [[BB2:%.*]], label [[EXIT]] +; CHECK: bb2: +; CHECK-NEXT: [[C_3:%.*]] = icmp ule i32 [[X]], [[A:%.*]] +; CHECK-NEXT: br i1 [[C_3]], label [[BB3:%.*]], label [[EXIT]] +; CHECK: bb3: +; CHECK-NEXT: ret i32 10 +; CHECK: exit: +; CHECK-NEXT: ret i32 20 +; +entry: + %c.1 = icmp ule i32 %x, %y + br i1 %c.1, label %bb1, label %exit + +bb1: + %c.2 = icmp ule i32 %y, %z + br i1 %c.2, label %bb2, label %exit + +bb2: + %c.3 = icmp ule i32 %x, %a + br i1 %c.3, label %bb3, label %exit + +bb3: + ret i32 10 + +exit: + ret i32 20 +} + + +define i32 @test3(i32 %x, i32 %y) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp ule i32 [[X:%.*]], 10 +; CHECK-NEXT: br i1 [[C_1]], label [[BB1:%.*]], label [[EXIT:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[C_2:%.*]] = icmp ule i32 [[Y:%.*]], 20 +; CHECK-NEXT: br i1 [[C_2]], label [[BB2:%.*]], label [[EXIT]] +; CHECK: bb2: +; CHECK-NEXT: ret i32 10 +; CHECK: exit: +; CHECK-NEXT: ret i32 20 +; +entry: + %c.1 = icmp ule i32 %x, 10 + br i1 %c.1, label %bb1, label %exit + +bb1: + %c.2 = icmp ule i32 %y, 20 + br i1 %c.2, label %bb2, label %exit + +bb2: + ret i32 10 + +exit: + ret i32 20 +} + +define i32 @test4(i32 %x, i32 %y, i32 %z) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp ule i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[BB1:%.*]], label [[EXIT:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[C_2:%.*]] = icmp ule i32 [[Y]], [[Z:%.*]] +; CHECK-NEXT: br i1 [[C_2]], label [[BB2:%.*]], label [[EXIT]] +; CHECK: bb2: +; CHECK-NEXT: [[T_1:%.*]] = icmp ule i32 [[X]], [[Z]] +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[U_1:%.*]] = icmp eq i32 [[X]], [[Z]] +; CHECK-NEXT: call void @use(i1 [[U_1]]) +; CHECK-NEXT: ret i32 10 +; CHECK: exit: +; CHECK-NEXT: ret i32 20 +; +entry: + %c.1 = icmp ule i32 %x, %y + br i1 %c.1, label %bb1, label %exit + +bb1: + %c.2 = icmp ule i32 %y, %z + br i1 %c.2, label %bb2, label %exit + +bb2: + %t.1 = icmp ule i32 %x, %z + call void @use(i1 %t.1) + %u.1 = icmp eq i32 %x, %z + call void @use(i1 %u.1) + ret i32 10 + + +exit: + ret i32 20 +} diff --git a/llvm/test/Transforms/Coroutines/coro-param-copy.ll b/llvm/test/Transforms/Coroutines/coro-param-copy.ll index 5967a05226fdb..da08c4f15e156 100644 --- a/llvm/test/Transforms/Coroutines/coro-param-copy.ll +++ b/llvm/test/Transforms/Coroutines/coro-param-copy.ll @@ -5,22 +5,37 @@ define i8* @f() "coroutine.presplit"="1" { entry: + %a.addr = alloca i64 ; read-only before coro.begin + %a = load i64, i64* %a.addr ; cannot modify the value, don't need to copy + %x.addr = alloca i64 - call void @use(i64* %x.addr) ; might write to %x + call void @use(i64* 
%x.addr) ; uses %x.addr before coro.begin + + %y.addr = alloca i64 - %y = load i64, i64* %y.addr ; cannot modify the value, don't need to copy - call void @print(i64 %y) + %y.cast = bitcast i64* %y.addr to i8* ; alias created and used after coro.begin + + %z.addr = alloca i64 + %flag = call i1 @check() + br i1 %flag, label %flag_true, label %flag_merge + +flag_true: + call void @use(i64* %z.addr) ; conditionally uses %z.addr + br label %flag_merge +flag_merge: %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %size = call i32 @llvm.coro.size.i32() - %alloc = call i8* @myAlloc(i64 %y, i32 %size) + %alloc = call i8* @myAlloc(i32 %size) %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc) + call void @llvm.memset.p0i8.i32(i8* %y.cast, i8 1, i32 4, i1 false) %0 = call i8 @llvm.coro.suspend(token none, i1 false) switch i8 %0, label %suspend [i8 0, label %resume i8 1, label %cleanup] resume: + call void @use(i64* %a.addr) call void @use(i64* %x.addr) call void @use(i64* %y.addr) + call void @use(i64* %z.addr) br label %cleanup cleanup: @@ -33,26 +48,36 @@ suspend: } ; See that we added a, x, y and z to the frame. -; CHECK: %f.Frame = type { void (%f.Frame*)*, void (%f.Frame*)*, i64, i64, i1 } +; CHECK: %f.Frame = type { void (%f.Frame*)*, void (%f.Frame*)*, i64, i64, i64, i64, i1 } ; See that all of the uses prior to coro-begin stay put. ; CHECK-LABEL: define i8* @f() { ; CHECK-NEXT: entry: +; CHECK-NEXT: %a.addr = alloca i64 ; CHECK-NEXT: %x.addr = alloca i64 ; CHECK-NEXT: call void @use(i64* %x.addr) ; CHECK-NEXT: %y.addr = alloca i64 -; CHECK-NEXT: %y = load i64, i64* %y.addr -; CHECK-NEXT: call void @print(i64 %y) +; CHECK-NEXT: %z.addr = alloca i64 ; See that we only copy x and z, as a and y are not modified prior to coro.begin. -; CHECK: store void (%f.Frame*)* @f.destroy, void (%f.Frame*)** %destroy.addr -; CHECK-NEXT: %0 = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 2 -; CHECK-NEXT: %1 = load i64, i64* %x.addr -; CHECK-NEXT: store i64 %1, i64* %0 -; CHECK-NEXT: %index.addr1 = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 4 -; CHECK-NEXT: store i1 false, i1* %index.addr1 +; CHECK: store void (%f.Frame*)* @f.destroy, void (%f.Frame*)** %destroy.addr +; The next 3 instructions are to copy data in %x.addr from stack to frame. +; CHECK-NEXT: %0 = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 3 +; CHECK-NEXT: %1 = load i64, i64* %x.addr, align 4 +; CHECK-NEXT: store i64 %1, i64* %0, align 4 +; The next 2 instructions are to recreate %y.cast in the original IR. +; CHECK-NEXT: %2 = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 4 +; CHECK-NEXT: %3 = bitcast i64* %2 to i8* +; The next 3 instructions are to copy data in %z.addr from stack to frame. 
+; CHECK-NEXT: %4 = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 5 +; CHECK-NEXT: %5 = load i64, i64* %z.addr, align 4 +; CHECK-NEXT: store i64 %5, i64* %4, align 4 +; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %3, i8 1, i32 4, i1 false) +; CHECK-NEXT: %index.addr1 = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 6 +; CHECK-NEXT: store i1 false, i1* %index.addr1, align 1 ; CHECK-NEXT: ret i8* %hdl + declare i8* @llvm.coro.free(token, i8*) declare i32 @llvm.coro.size.i32() declare i8 @llvm.coro.suspend(token, i1) @@ -64,7 +89,9 @@ declare i1 @llvm.coro.alloc(token) declare i8* @llvm.coro.begin(token, i8*) declare i1 @llvm.coro.end(i8*, i1) -declare noalias i8* @myAlloc(i64, i32) -declare void @print(i64) +declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i1) + +declare noalias i8* @myAlloc(i32) declare void @use(i64*) declare void @free(i8*) +declare i1 @check() diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-frame.ll b/llvm/test/Transforms/Coroutines/coro-retcon-frame.ll index c7ca8e3a01370..a1b83eeaee774 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon-frame.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-frame.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -coro-split -S | FileCheck %s +; RUN: opt < %s -passes=coro-split -S | FileCheck %s target datalayout = "p:64:64:64" diff --git a/llvm/test/Transforms/Coroutines/coro-spill-defs-before-corobegin.ll b/llvm/test/Transforms/Coroutines/coro-spill-defs-before-corobegin.ll new file mode 100644 index 0000000000000..2521c902baf60 --- /dev/null +++ b/llvm/test/Transforms/Coroutines/coro-spill-defs-before-corobegin.ll @@ -0,0 +1,80 @@ +; Verifies that phi and invoke definitions before CoroBegin are spilled properly. +; RUN: opt < %s -coro-split -S | FileCheck %s +; RUN: opt < %s -passes=coro-split -S | FileCheck %s + +define i8* @f(i1 %n) "coroutine.presplit"="1" personality i32 0 { +entry: + %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) + %size = call i32 @llvm.coro.size.i32() + %alloc = call i8* @malloc(i32 %size) + %flag = call i1 @check(i8* %alloc) + br i1 %flag, label %flag_true, label %flag_false + +flag_true: + br label %merge + +flag_false: + br label %merge + +merge: + %value_phi = phi i32 [ 0, %flag_true ], [ 1, %flag_false ] + %value_invoke = invoke i32 @calc() to label %normal unwind label %lpad + +normal: + %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc) + call i32 @print(i32 %value_phi) + call i32 @print(i32 %value_invoke) + %sp1 = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %sp1, label %suspend [i8 0, label %resume + i8 1, label %cleanup] +resume: + call i32 @print(i32 %value_phi) + call i32 @print(i32 %value_invoke) + br label %cleanup + +cleanup: + %mem = call i8* @llvm.coro.free(token %id, i8* %hdl) + call void @free(i8* %mem) + br label %suspend +suspend: + call i1 @llvm.coro.end(i8* %hdl, i1 0) + ret i8* %hdl + +lpad: + %lpval = landingpad { i8*, i32 } + cleanup + + resume { i8*, i32 } %lpval +} + +; Verifies that both value_phi and value_invoke are stored correctly in the coroutine frame +; CHECK: %f.Frame = type { void (%f.Frame*)*, void (%f.Frame*)*, i32, i32, i1 } +; CHECK-LABEL: @f( +; CHECK: %alloc = call i8* @malloc(i32 32) +; CHECK-NEXT: %flag = call i1 @check(i8* %alloc) +; CHECK-NEXT: %value_phi = select i1 %flag, i32 0, i32 1 +; CHECK-NEXT: %value_invoke = call i32 @calc() +; CHECK-NEXT: %hdl = call noalias nonnull i8* @llvm.coro.begin(token %id, i8* %alloc) + +; CHECK: store void (%f.Frame*)* @f.destroy, void 
(%f.Frame*)** %destroy.addr +; CHECK-NEXT: %value_invoke.spill.addr = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 3 +; CHECK-NEXT: store i32 %value_invoke, i32* %value_invoke.spill.addr +; CHECK-NEXT: %value_phi.spill.addr = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 2 +; CHECK-NEXT: store i32 %value_phi, i32* %value_phi.spill.addr + +declare i8* @llvm.coro.free(token, i8*) +declare i32 @llvm.coro.size.i32() +declare i8 @llvm.coro.suspend(token, i1) +declare void @llvm.coro.resume(i8*) +declare void @llvm.coro.destroy(i8*) + +declare token @llvm.coro.id(i32, i8*, i8*, i8*) +declare i1 @llvm.coro.alloc(token) +declare i8* @llvm.coro.begin(token, i8*) +declare i1 @llvm.coro.end(i8*, i1) + +declare noalias i8* @malloc(i32) +declare i32 @print(i32) +declare i1 @check(i8*) +declare i32 @calc() +declare void @free(i8*) diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic-todo.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic-todo.ll index 04361e63e6d08..8dfb85719c309 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic-todo.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic-todo.ll @@ -21,14 +21,3 @@ define i32 @test9() { store i32 1, i32* @x ret i32 %x } - -; DSE across monotonic store (allowed as long as the eliminated store isUnordered) -define void @test10() { -; CHECK-LABEL: test10 -; CHECK-NOT: store i32 0 -; CHECK: store i32 1 - store i32 0, i32* @x - store atomic i32 42, i32* @y monotonic, align 4 - store i32 1, i32* @x - ret void -} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic.ll index 5a3ea376415c3..51129fe2bcadb 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic.ll @@ -88,6 +88,17 @@ define i32 @test8() { ret i32 %x } +; DSE across monotonic store (allowed as long as the eliminated store isUnordered) +define void @test10() { +; CHECK-LABEL: test10 +; CHECK-NOT: store i32 0 +; CHECK: store i32 1 + store i32 0, i32* @x + store atomic i32 42, i32* @y monotonic, align 4 + store i32 1, i32* @x + ret void +} + ; DSE across monotonic load (forbidden since the eliminated store is atomic) define i32 @test11() { ; CHECK-LABEL: @test11( diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-and-memcpy.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-and-memcpy.ll index 5aeda18309724..02fc8f22b6b40 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-and-memcpy.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-and-memcpy.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -S | FileCheck %s +; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa=false -S | FileCheck %s ; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -enable-dse-memoryssa -S | FileCheck %s target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" @@ -91,3 +92,21 @@ define void @test18_atomic(i8* %P, i8* %Q, i8* %R) nounwind ssp { tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %R, i64 12, i32 1) ret void } + +define void @test_memset_memcpy_inline(i8* noalias %P, i8* noalias %Q) { + tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i1 false) + tail call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 
%Q, i64 12, i1 false) + ret void +} + +define void @test_store_memcpy_inline(i8* noalias %P, i8* noalias %Q) { + store i8 0, i8* %P + %P.1 = getelementptr i8, i8* %P, i64 1 + store i8 1, i8* %P.1 + %P.4 = getelementptr i8, i8* %P, i64 4 + store i8 4, i8* %P.4 + tail call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 4, i1 false) + ret void +} + +declare void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64 immarg, i1 immarg) diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loop-carried-dependence.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loop-carried-dependence.ll new file mode 100644 index 0000000000000..b7a882a65bc15 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loop-carried-dependence.ll @@ -0,0 +1,212 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py + +; RUN: opt -dse -enable-dse-memoryssa -S %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512" + +declare void @use(i32) + +; Test cases with a loop carried dependence in %loop.2, where %l.2 reads the +; value stored by the previous iteration. Hence, the store in %loop.2 is not +; dead at the end of the function or after the call to lifetime.end(). + +define void @test.1() { +; CHECK-LABEL: @test.1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca [100 x i32], align 4 +; CHECK-NEXT: br label [[LOOP_1:%.*]] +; CHECK: loop.1: +; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[IV_1_NEXT:%.*]], [[LOOP_1]] ] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[A]], i64 0, i64 [[IV_1]] +; CHECK-NEXT: store i32 0, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[IV_1_NEXT]] = add nsw i64 [[IV_1]], 1 +; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV_1_NEXT]], 100 +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_1]], label [[LOOP_2_PH:%.*]] +; CHECK: loop.2.ph: +; CHECK-NEXT: br label [[LOOP_2:%.*]] +; CHECK: loop.2: +; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[IV_2_NEXT:%.*]], [[LOOP_2]] ], [ 0, [[LOOP_2_PH]] ] +; CHECK-NEXT: [[PTR_IV_2:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[A]], i64 0, i64 [[IV_2]] +; CHECK-NEXT: [[L_0:%.*]] = load i32, i32* [[PTR_IV_2]], align 4 +; CHECK-NEXT: call void @use(i32 [[L_0]]) +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[IV_2]], 1 +; CHECK-NEXT: [[PTR_IV_2_ADD_1:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[A]], i64 0, i64 [[ADD]] +; CHECK-NEXT: store i32 10, i32* [[PTR_IV_2_ADD_1]], align 4 +; CHECK-NEXT: [[L_1:%.*]] = load i32, i32* [[PTR_IV_2]], align 4 +; CHECK-NEXT: call void @use(i32 [[L_1]]) +; CHECK-NEXT: [[IV_2_NEXT]] = add nsw i64 [[IV_2]], 1 +; CHECK-NEXT: [[C_2:%.*]] = icmp slt i64 [[IV_2_NEXT]], 100 +; CHECK-NEXT: br i1 [[C_2]], label [[LOOP_2]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %A = alloca [100 x i32], align 4 + br label %loop.1 + +loop.1: + %iv.1 = phi i64 [ 1, %entry ], [ %iv.1.next, %loop.1 ] + %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 0, i64 %iv.1 + store i32 0, i32* %arrayidx1, align 4 + %iv.1.next = add nsw i64 %iv.1, 1 + %c.1 = icmp slt i64 %iv.1.next, 100 + br i1 %c.1, label %loop.1, label %loop.2.ph + +loop.2.ph: + br label %loop.2 + +loop.2: + %iv.2 = phi i64 [ %iv.2.next, %loop.2 ], [ 0, %loop.2.ph ] + %ptr.iv.2 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 0, i64 %iv.2 + %l.0 = load i32, i32* %ptr.iv.2, align 4 + call void 
@use(i32 %l.0) + %add = add nsw i64 %iv.2, 1 + %ptr.iv.2.add.1 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 0, i64 %add + store i32 10, i32* %ptr.iv.2.add.1, align 4 + %l.1 = load i32, i32* %ptr.iv.2, align 4 + call void @use(i32 %l.1) + %iv.2.next = add nsw i64 %iv.2, 1 + %c.2 = icmp slt i64 %iv.2.next, 100 + br i1 %c.2, label %loop.2, label %exit + +exit: + ret void +} + +define void @test.2() { +; CHECK-LABEL: @test.2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca [100 x i32], align 4 +; CHECK-NEXT: [[A_CAST:%.*]] = bitcast [100 x i32]* [[A]] to i8* +; CHECK-NEXT: br label [[LOOP_1:%.*]] +; CHECK: loop.1: +; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[IV_1_NEXT:%.*]], [[LOOP_1]] ] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[A]], i64 0, i64 [[IV_1]] +; CHECK-NEXT: store i32 0, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[IV_1_NEXT]] = add nsw i64 [[IV_1]], 1 +; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV_1_NEXT]], 100 +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_1]], label [[LOOP_2_PH:%.*]] +; CHECK: loop.2.ph: +; CHECK-NEXT: br label [[LOOP_2:%.*]] +; CHECK: loop.2: +; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[IV_2_NEXT:%.*]], [[LOOP_2]] ], [ 0, [[LOOP_2_PH]] ] +; CHECK-NEXT: [[PTR_IV_2:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[A]], i64 0, i64 [[IV_2]] +; CHECK-NEXT: [[L_0:%.*]] = load i32, i32* [[PTR_IV_2]], align 4 +; CHECK-NEXT: call void @use(i32 [[L_0]]) +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[IV_2]], 1 +; CHECK-NEXT: [[PTR_IV_2_ADD_1:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[A]], i64 0, i64 [[ADD]] +; CHECK-NEXT: store i32 10, i32* [[PTR_IV_2_ADD_1]], align 4 +; CHECK-NEXT: [[L_1:%.*]] = load i32, i32* [[PTR_IV_2]], align 4 +; CHECK-NEXT: call void @use(i32 [[L_1]]) +; CHECK-NEXT: [[IV_2_NEXT]] = add nsw i64 [[IV_2]], 1 +; CHECK-NEXT: [[C_2:%.*]] = icmp slt i64 [[IV_2_NEXT]], 100 +; CHECK-NEXT: br i1 [[C_2]], label [[LOOP_2]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 400, i8* nonnull [[A_CAST]]) +; CHECK-NEXT: ret void +; +entry: + %A = alloca [100 x i32], align 4 + %A.cast = bitcast [100 x i32]* %A to i8* + br label %loop.1 + +loop.1: + %iv.1 = phi i64 [ 1, %entry ], [ %iv.1.next, %loop.1 ] + %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 0, i64 %iv.1 + store i32 0, i32* %arrayidx1, align 4 + %iv.1.next = add nsw i64 %iv.1, 1 + %c.1 = icmp slt i64 %iv.1.next, 100 + br i1 %c.1, label %loop.1, label %loop.2.ph + +loop.2.ph: + br label %loop.2 + +loop.2: + %iv.2 = phi i64 [ %iv.2.next, %loop.2 ], [ 0, %loop.2.ph ] + %ptr.iv.2 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 0, i64 %iv.2 + %l.0 = load i32, i32* %ptr.iv.2, align 4 + call void @use(i32 %l.0) + %add = add nsw i64 %iv.2, 1 + %ptr.iv.2.add.1 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 0, i64 %add + store i32 10, i32* %ptr.iv.2.add.1, align 4 + %l.1 = load i32, i32* %ptr.iv.2, align 4 + call void @use(i32 %l.1) + %iv.2.next = add nsw i64 %iv.2, 1 + %c.2 = icmp slt i64 %iv.2.next, 100 + br i1 %c.2, label %loop.2, label %exit + +exit: + call void @llvm.lifetime.end.p0i8(i64 400, i8* nonnull %A.cast) #5 + ret void +} + +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) + +; Make sure `store i32 10, i32* %ptr.2` in %cond.store is not removed. The +; stored value may be read by `%use = load i32, i32* %ptr.1` in a future +; iteration. 
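+; For illustration only (this assumes the opaque @cond happens to steer the +; branches this way): starting from %depth.1 = 3, taking the %cond.store path at +; depths 3, 4 and 5 stores 10 to %nodeStack[3], %nodeStack[4] and %nodeStack[5]; +; once %depth.1 reaches 6, %cond.read computes %sub = 3 and %use reloads the 10 +; written to %nodeStack[3] three iterations earlier.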
+define void @test.3() { +; CHECK-LABEL: @test.3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[NODESTACK:%.*]] = alloca [12 x i32], align 4 +; CHECK-NEXT: [[NODESTACK_CAST:%.*]] = bitcast [12 x i32]* [[NODESTACK]] to i8* +; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond(i32 1) +; CHECK-NEXT: br i1 [[C_1]], label [[CLEANUP:%.*]], label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[DEPTH_1:%.*]] = phi i32 [ [[DEPTH_1_BE:%.*]], [[LOOP_LATCH:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[DEPTH_1]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_READ:%.*]], label [[COND_STORE:%.*]] +; CHECK: cond.read: +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[DEPTH_1]], -3 +; CHECK-NEXT: [[PTR_1:%.*]] = getelementptr inbounds [12 x i32], [12 x i32]* [[NODESTACK]], i32 0, i32 [[SUB]] +; CHECK-NEXT: [[USE:%.*]] = load i32, i32* [[PTR_1]], align 4 +; CHECK-NEXT: [[C_2:%.*]] = call i1 @cond(i32 [[USE]]) +; CHECK-NEXT: br i1 [[C_2]], label [[LOOP_LATCH]], label [[COND_STORE]] +; CHECK: cond.store: +; CHECK-NEXT: [[PTR_2:%.*]] = getelementptr inbounds [12 x i32], [12 x i32]* [[NODESTACK]], i32 0, i32 [[DEPTH_1]] +; CHECK-NEXT: store i32 10, i32* [[PTR_2]], align 4 +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[DEPTH_1]], 1 +; CHECK-NEXT: [[C_3:%.*]] = call i1 @cond(i32 20) +; CHECK-NEXT: br i1 [[C_3]], label [[CLEANUP]], label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[DEPTH_1_BE]] = phi i32 [ [[SUB]], [[COND_READ]] ], [ [[INC]], [[COND_STORE]] ] +; CHECK-NEXT: br label [[LOOP_HEADER]] +; CHECK: cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 48, i8* nonnull [[NODESTACK_CAST]]) +; CHECK-NEXT: ret void +; +entry: + %nodeStack = alloca [12 x i32], align 4 + %nodeStack.cast = bitcast [12 x i32]* %nodeStack to i8* + %c.1 = call i1 @cond(i32 1) + br i1 %c.1, label %cleanup, label %loop.header + +loop.header: ; preds = %entry, %while.cond.backedge + %depth.1 = phi i32 [ %depth.1.be, %loop.latch ], [ 3, %entry ] + %cmp = icmp sgt i32 %depth.1, 0 + br i1 %cmp, label %cond.read, label %cond.store + +cond.read: ; preds = %while.cond + %sub = add nsw i32 %depth.1, -3 + %ptr.1 = getelementptr inbounds [12 x i32], [12 x i32]* %nodeStack, i32 0, i32 %sub + %use = load i32, i32* %ptr.1, align 4 + %c.2 = call i1 @cond(i32 %use) + br i1 %c.2, label %loop.latch, label %cond.store + +cond.store: + %ptr.2 = getelementptr inbounds [12 x i32], [12 x i32]* %nodeStack, i32 0, i32 %depth.1 + store i32 10, i32* %ptr.2, align 4 + %inc = add nsw i32 %depth.1, 1 + %c.3 = call i1 @cond(i32 20) + br i1 %c.3, label %cleanup, label %loop.latch + +loop.latch: + %depth.1.be = phi i32 [ %sub, %cond.read ], [ %inc, %cond.store ] + br label %loop.header + +cleanup: ; preds = %while.body, %while.end, %entry + call void @llvm.lifetime.end.p0i8(i64 48, i8* nonnull %nodeStack.cast) #3 + ret void +} + +declare i1 @cond(i32) diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll index b213edbaf09e6..ba61b3250f5e7 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll @@ -9,7 +9,7 @@ define void @test13(i32* noalias %P) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR:%.*]] ; CHECK: for: -; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: store i32 0, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br i1 false, label [[FOR]], label [[END:%.*]] ; CHECK: end: ; CHECK-NEXT: ret void @@ -29,7 +29,7 @@ define void @test14(i32* 
noalias %P) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR:%.*]] ; CHECK: for: -; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: store i32 0, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br i1 false, label [[FOR]], label [[END:%.*]] ; CHECK: end: ; CHECK-NEXT: ret void @@ -48,12 +48,12 @@ define void @test18(i32* noalias %P) { ; CHECK-LABEL: @test18( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P:%.*]] to i8* -; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: store i32 0, i32* [[P]], align 4 ; CHECK-NEXT: br label [[FOR:%.*]] ; CHECK: for: -; CHECK-NEXT: store i8 1, i8* [[P2]] -; CHECK-NEXT: [[X:%.*]] = load i32, i32* [[P]] -; CHECK-NEXT: store i8 2, i8* [[P2]] +; CHECK-NEXT: store i8 1, i8* [[P2]], align 1 +; CHECK-NEXT: [[X:%.*]] = load i32, i32* [[P]], align 4 +; CHECK-NEXT: store i8 2, i8* [[P2]], align 1 ; CHECK-NEXT: br i1 false, label [[FOR]], label [[END:%.*]] ; CHECK: end: ; CHECK-NEXT: ret void @@ -111,6 +111,7 @@ define void @test_loop(i32 %N, i32* noalias nocapture readonly %A, i32* noalias ; CHECK: for.body4.lr.ph: ; CHECK-NEXT: [[I_028:%.*]] = phi i32 [ [[INC11:%.*]], [[FOR_COND_CLEANUP3:%.*]] ], [ 0, [[FOR_BODY4_LR_PH_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[I_028]] +; CHECK-NEXT: store i32 0, i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_028]], [[N]] ; CHECK-NEXT: br label [[FOR_BODY4:%.*]] ; CHECK: for.body4: @@ -183,7 +184,7 @@ define void @loop_multiple_def_uses(i32* noalias %P) { ; CHECK-NEXT: br i1 [[C1]], label [[FOR_BODY:%.*]], label [[END:%.*]] ; CHECK: for.body: ; CHECK-NEXT: store i32 1, i32* [[P]], align 4 -; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[P]] +; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[P]], align 4 ; CHECK-NEXT: br label [[FOR_HEADER]] ; CHECK: end: ; CHECK-NEXT: store i32 3, i32* [[P]], align 4 @@ -220,7 +221,7 @@ define void @loop_multiple_def_uses_partial_write(i32* noalias %p) { ; CHECK: for.body: ; CHECK-NEXT: [[C:%.*]] = bitcast i32* [[P]] to i8* ; CHECK-NEXT: store i8 1, i8* [[C]], align 4 -; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[P]] +; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[P]], align 4 ; CHECK-NEXT: br label [[FOR_HEADER]] ; CHECK: end: ; CHECK-NEXT: store i32 3, i32* [[P]], align 4 @@ -257,7 +258,7 @@ define void @loop_multiple_def_uses_mayalias_write(i32* %p, i32* %q) { ; CHECK-NEXT: br i1 [[C1]], label [[FOR_BODY:%.*]], label [[END:%.*]] ; CHECK: for.body: ; CHECK-NEXT: store i32 1, i32* [[Q:%.*]], align 4 -; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[P]] +; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[P]], align 4 ; CHECK-NEXT: br label [[FOR_HEADER]] ; CHECK: end: ; CHECK-NEXT: store i32 3, i32* [[P]], align 4 @@ -314,3 +315,44 @@ bb1: ; preds = %bb1, %bb } declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) + +@x = global [10 x i16] zeroinitializer, align 1 + +; Make sure we do not eliminate the store in %do.body, because it writes to +; multiple locations in the loop and the store in %if.end10 only stores to +; the last one. 
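+; Concretely: %i.0 runs from 0 to 4, so the store in %do.body writes 2 to @x[0] +; through @x[4], while the store in %if.end10 only overwrites the final element +; @x[4]. @x is an externally visible global, so the values in @x[0] through +; @x[3] remain observable, which is what keeps the %do.body store live.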
+define i16 @test_loop_carried_dep() { +; CHECK-LABEL: @test_loop_carried_dep( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[DO_BODY:%.*]] +; CHECK: do.body: +; CHECK-NEXT: [[I_0:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i16 [[I_0]] +; CHECK-NEXT: store i16 2, i16* [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i16 [[I_0]], 4 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[IF_END10:%.*]], label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[INC]] = add nuw nsw i16 [[I_0]], 1 +; CHECK-NEXT: br label [[DO_BODY]] +; CHECK: if.end10: +; CHECK-NEXT: store i16 1, i16* [[ARRAYIDX2]], align 1 +; CHECK-NEXT: ret i16 0 +; +entry: + br label %do.body + +do.body: ; preds = %if.end, %entry + %i.0 = phi i16 [ 0, %entry ], [ %inc, %if.end ] + %arrayidx2 = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i16 %i.0 + store i16 2, i16* %arrayidx2, align 1 + %exitcond = icmp eq i16 %i.0, 4 + br i1 %exitcond, label %if.end10, label %if.end + +if.end: ; preds = %do.body + %inc = add nuw nsw i16 %i.0, 1 + br label %do.body + +if.end10: ; preds = %do.body + store i16 1, i16* %arrayidx2, align 1 + ret i16 0 +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-malloc-free.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-malloc-free.ll index 763362dd3d479..5c14f92b8d74a 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-malloc-free.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-malloc-free.ll @@ -180,6 +180,7 @@ define void @test27() { ; CHECK-NEXT: br i1 true, label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: [[M:%.*]] = call noalias i8* @malloc(i64 10) +; CHECK-NEXT: store i8 1, i8* [[M]], align 1 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: [[R:%.*]] = phi i8* [ null, [[BB1:%.*]] ], [ [[M]], [[BB2]] ] diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memintrinsics.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memintrinsics.ll index d7945e888f4d0..df6113928fe53 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memintrinsics.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memintrinsics.ll @@ -123,10 +123,18 @@ bb3: define void @alloca_1(i1 %c) { ; CHECK-LABEL: @alloca_1( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[P_ALLOCA:%.*]] = alloca [32 x i32], align 4 +; CHECK-NEXT: [[P:%.*]] = bitcast [32 x i32]* [[P_ALLOCA]] to i32* +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i1 false) ; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX1]], align 4 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: ret void @@ -152,10 +160,20 @@ bb3: define void @alloca_2(i1 %c) { ; CHECK-LABEL: @alloca_2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[P_ALLOCA:%.*]] = alloca [32 x i32], align 4 +; CHECK-NEXT: [[P:%.*]] = bitcast [32 x i32]* [[P_ALLOCA]] to i32* +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; CHECK-NEXT: [[P3:%.*]] = 
bitcast i32* [[ARRAYIDX0]] to i8* +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i1 false) ; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX1]], align 4 ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX2]], align 4 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/read-clobber-after-overwrite.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/read-clobber-after-overwrite.ll new file mode 100644 index 0000000000000..4f704c35a90b1 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/read-clobber-after-overwrite.ll @@ -0,0 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -dse -enable-dse-memoryssa -S %s | FileCheck %s + +declare i1 @cond() readnone + +define i32 @test() { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[M0:%.*]] = alloca [4 x i32], align 16 +; CHECK-NEXT: br label [[LOOP_1:%.*]] +; CHECK: loop.1: +; CHECK-NEXT: br label [[LOOP_2:%.*]] +; CHECK: loop.2: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[LOOP_1]] ], [ [[IV_NEXT:%.*]], [[LOOP_2]] ] +; CHECK-NEXT: [[PTR_1:%.*]] = getelementptr inbounds [4 x i32], [4 x i32]* [[M0]], i64 3, i64 [[IV]] +; CHECK-NEXT: [[PTR_2:%.*]] = getelementptr inbounds [4 x i32], [4 x i32]* [[M0]], i64 0, i64 [[IV]] +; CHECK-NEXT: store i32 20, i32* [[PTR_2]], align 4 +; CHECK-NEXT: store i32 30, i32* [[PTR_1]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[C_3:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_3]], label [[LOOP_1_LATCH:%.*]], label [[LOOP_2]] +; CHECK: loop.1.latch: +; CHECK-NEXT: [[C_2:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_2]], label [[EXIT:%.*]], label [[LOOP_1]] +; CHECK: exit: +; CHECK-NEXT: [[PTR_3:%.*]] = getelementptr inbounds [4 x i32], [4 x i32]* [[M0]], i64 0, i64 1 +; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[PTR_3]], align 16 +; CHECK-NEXT: ret i32 [[LV]] +; +entry: + %M0 = alloca [4 x i32], align 16 + br label %loop.1 + +loop.1: + br label %loop.2 + +loop.2: + %iv = phi i64 [ 0, %loop.1 ], [ %iv.next, %loop.2 ] + %ptr.1 = getelementptr inbounds [4 x i32], [4 x i32]* %M0, i64 3, i64 %iv + store i32 10, i32* %ptr.1, align 4 + %ptr.2 = getelementptr inbounds [4 x i32], [4 x i32]* %M0, i64 0, i64 %iv + store i32 20, i32* %ptr.2, align 4 + store i32 30, i32* %ptr.1, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %c.3 = call i1 @cond() + br i1 %c.3, label %loop.1.latch, label %loop.2 + +loop.1.latch: + %c.2 = call i1 @cond() + br i1 %c.2, label %exit, label %loop.1 + +exit: + %ptr.3 = getelementptr inbounds [4 x i32], [4 x i32]* %M0, i64 0, i64 1 + %lv = load i32, i32* %ptr.3, align 16 + ret i32 %lv + + +} diff --git a/llvm/test/Transforms/DeadStoreElimination/masked-dead-store.ll b/llvm/test/Transforms/DeadStoreElimination/masked-dead-store.ll new file mode 100644 index 0000000000000..85673e9fe5431 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/masked-dead-store.ll @@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -tbaa -dse -enable-dse-memoryssa=false -S < %s | FileCheck %s +; RUN: 
opt -tbaa -dse -enable-dse-memoryssa=true -S < %s | FileCheck %s +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" + +define dllexport i32 @f0(i8** %a0, i8** %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) #0 { +; CHECK-LABEL: @f0( +; CHECK-NEXT: b0: +; CHECK-NEXT: [[V0:%.*]] = getelementptr inbounds i8*, i8** [[A0:%.*]], i32 [[A2:%.*]] +; CHECK-NEXT: [[V1:%.*]] = load i8*, i8** [[V0]], align 4, [[TBAA0:!tbaa !.*]] +; CHECK-NEXT: [[V2:%.*]] = getelementptr i8, i8* [[V1]], i32 [[A3:%.*]] +; CHECK-NEXT: [[V3:%.*]] = bitcast i8* [[V2]] to <128 x i8>* +; CHECK-NEXT: [[V6:%.*]] = getelementptr inbounds i8*, i8** [[A1:%.*]], i32 [[A4:%.*]] +; CHECK-NEXT: [[V7:%.*]] = load i8*, i8** [[V6]], align 4, [[TBAA3:!tbaa !.*]] +; CHECK-NEXT: [[V8:%.*]] = getelementptr i8, i8* [[V7]], i32 [[A5:%.*]] +; CHECK-NEXT: [[V9:%.*]] = bitcast i8* [[V8]] to <128 x i8>* +; CHECK-NEXT: [[V10:%.*]] = tail call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* [[V9]], i32 32, <128 x i1> , <128 x i8> undef), [[TBAA5:!tbaa !.*]] +; CHECK-NEXT: [[V11:%.*]] = shufflevector <128 x i8> [[V10]], <128 x i8> undef, <32 x i32> +; CHECK-NEXT: [[V14:%.*]] = shufflevector <32 x i8> [[V11]], <32 x i8> undef, <128 x i32> +; CHECK-NEXT: [[V16:%.*]] = shufflevector <128 x i8> [[V14]], <128 x i8> undef, <32 x i32> +; CHECK-NEXT: [[V17:%.*]] = getelementptr inbounds i8*, i8** [[A1]], i32 [[A6:%.*]] +; CHECK-NEXT: [[V18:%.*]] = load i8*, i8** [[V17]], align 4, [[TBAA3]] +; CHECK-NEXT: [[V19:%.*]] = getelementptr i8, i8* [[V18]], i32 [[A7:%.*]] +; CHECK-NEXT: [[V20:%.*]] = bitcast i8* [[V19]] to <128 x i8>* +; CHECK-NEXT: [[V21:%.*]] = tail call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* [[V20]], i32 32, <128 x i1> , <128 x i8> undef), [[TBAA5]] +; CHECK-NEXT: [[V22:%.*]] = shufflevector <128 x i8> [[V21]], <128 x i8> undef, <32 x i32> +; CHECK-NEXT: [[V23:%.*]] = icmp ugt <32 x i8> [[V16]], [[V22]] +; CHECK-NEXT: [[V24:%.*]] = select <32 x i1> [[V23]], <32 x i8> [[V16]], <32 x i8> [[V22]] +; CHECK-NEXT: [[V25:%.*]] = shufflevector <32 x i8> [[V24]], <32 x i8> undef, <128 x i32> +; CHECK-NEXT: tail call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> [[V25]], <128 x i8>* [[V3]], i32 32, <128 x i1> ), [[TBAA8:!tbaa !.*]] +; CHECK-NEXT: ret i32 0 +; +b0: + %v0 = getelementptr inbounds i8*, i8** %a0, i32 %a2 + %v1 = load i8*, i8** %v0, align 4, !tbaa !0 + %v2 = getelementptr i8, i8* %v1, i32 %a3 + %v3 = bitcast i8* %v2 to <128 x i8>* + tail call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> , <128 x i8>* %v3, i32 32, <128 x i1> ), !tbaa !3 + %v6 = getelementptr inbounds i8*, i8** %a1, i32 %a4 + %v7 = load i8*, i8** %v6, align 4, !tbaa !6 + %v8 = getelementptr i8, i8* %v7, i32 %a5 + %v9 = bitcast i8* %v8 to <128 x i8>* + %v10 = tail call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* %v9, i32 32, <128 x i1> , <128 x i8> undef), !tbaa !8 + %v11 = shufflevector <128 x i8> %v10, <128 x i8> undef, <32 x i32> + %v14 = shufflevector <32 x i8> %v11, <32 x i8> undef, <128 x i32> + tail call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> %v14, <128 x i8>* %v3, i32 32, <128 x i1> ), !tbaa !3 + %v16 = shufflevector <128 x i8> %v14, <128 x i8> undef, <32 x i32> + %v17 = getelementptr inbounds i8*, i8** %a1, i32 %a6 + %v18 = load i8*, i8** %v17, align 4, !tbaa !6 + %v19 = getelementptr i8, i8* %v18, i32 %a7 + %v20 = bitcast i8* %v19 to <128 x i8>* + %v21 = tail call <128 x i8> 
@llvm.masked.load.v128i8.p0v128i8(<128 x i8>* %v20, i32 32, <128 x i1> , <128 x i8> undef), !tbaa !8 + %v22 = shufflevector <128 x i8> %v21, <128 x i8> undef, <32 x i32> + %v23 = icmp ugt <32 x i8> %v16, %v22 + %v24 = select <32 x i1> %v23, <32 x i8> %v16, <32 x i8> %v22 + %v25 = shufflevector <32 x i8> %v24, <32 x i8> undef, <128 x i32> + tail call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> %v25, <128 x i8>* %v3, i32 32, <128 x i1> ), !tbaa !3 + ret i32 0 +} + +declare void @llvm.masked.store.v128i8.p0v128i8(<128 x i8>, <128 x i8>*, i32 immarg, <128 x i1>) #1 +declare <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>*, i32 immarg, <128 x i1>, <128 x i8>) #2 + +attributes #0 = { nounwind willreturn } +attributes #1 = { argmemonly nounwind willreturn } +attributes #2 = { argmemonly nounwind readonly willreturn } + +!0 = !{!1, !1, i64 0} +!1 = !{!"0x2cf74d0", !2, i64 0} +!2 = !{!"tvm-tbaa"} +!3 = !{!4, !4, i64 0} +!4 = !{!"i8", !5, i64 0} +!5 = !{!"0x2c6ebb0", !2, i64 0} +!6 = !{!7, !7, i64 0} +!7 = !{!"0x2cff870", !2, i64 0} +!8 = !{!9, !9, i64 0} +!9 = !{!"i8", !10, i64 0} +!10 = !{!"0x2c6c3c0", !2, i64 0} diff --git a/llvm/test/Transforms/EarlyCSE/commute.ll b/llvm/test/Transforms/EarlyCSE/commute.ll index 57c5a853a12ff..a172ba81c6527 100644 --- a/llvm/test/Transforms/EarlyCSE/commute.ll +++ b/llvm/test/Transforms/EarlyCSE/commute.ll @@ -684,6 +684,26 @@ define i32 @select_not_invert_pred_cond_wrong_select_op(i8 %x, i8 %y, i32 %t, i3 ret i32 %r } +; This test is a reproducer for a bug involving inverted min/max selects +; hashing differently but comparing as equal. It exhibits such a pair of +; values, and we run this test with -earlycse-debug-hash which would catch +; the disagreement and fail if it regressed. +; EarlyCSE should be able to detect the 2nd redundant `select` and eliminate +; it. +define i32 @inverted_max(i32 %i) { +; CHECK-LABEL: @inverted_max( +; CHECK-NEXT: [[CMP:%.*]] = icmp sle i32 0, [[I:%.*]] +; CHECK-NEXT: [[M1:%.*]] = select i1 [[CMP]], i32 [[I]], i32 0 +; CHECK-NEXT: [[CMPINV:%.*]] = icmp sgt i32 0, [[I:%.*]] +; CHECK-NEXT: [[R:%.*]] = add i32 [[M1]], [[M1]] +; CHECK-NEXT: ret i32 [[R]] + %cmp = icmp sle i32 0, %i + %m1 = select i1 %cmp, i32 %i, i32 0 + %cmpinv = icmp sgt i32 0, %i + %m2 = select i1 %cmpinv, i32 0, i32 %i + %r = add i32 %m1, %m2 + ret i32 %r +} ; This test is a reproducer for a bug involving inverted min/max selects ; hashing differently but comparing as equal. 
It exhibits such a pair of diff --git a/llvm/test/Transforms/EarlyCSE/masked-intrinsics.ll b/llvm/test/Transforms/EarlyCSE/masked-intrinsics.ll new file mode 100644 index 0000000000000..77183ab97a6b0 --- /dev/null +++ b/llvm/test/Transforms/EarlyCSE/masked-intrinsics.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -early-cse < %s | FileCheck %s + +define <128 x i8> @f0(<128 x i8>* %a0, <128 x i8> %a1, <128 x i8> %a2) { +; CHECK-LABEL: @f0( +; CHECK-NEXT: [[V0:%.*]] = icmp eq <128 x i8> [[A1:%.*]], [[A2:%.*]] +; CHECK-NEXT: call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> [[A1]], <128 x i8>* [[A0:%.*]], i32 4, <128 x i1> [[V0]]) +; CHECK-NEXT: [[V1:%.*]] = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* [[A0]], i32 4, <128 x i1> [[V0]], <128 x i8> undef) +; CHECK-NEXT: ret <128 x i8> [[V1]] +; + %v0 = icmp eq <128 x i8> %a1, %a2 + call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> %a1, <128 x i8>* %a0, i32 4, <128 x i1> %v0) + %v1 = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* %a0, i32 4, <128 x i1> %v0, <128 x i8> undef) + ret <128 x i8> %v1 +} + +define <128 x i8> @f1(<128 x i8>* %a0, <128 x i8> %a1, <128 x i8> %a2) { +; CHECK-LABEL: @f1( +; CHECK-NEXT: [[V0:%.*]] = icmp eq <128 x i8> [[A1:%.*]], [[A2:%.*]] +; CHECK-NEXT: [[V1:%.*]] = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* [[A0:%.*]], i32 4, <128 x i1> [[V0]], <128 x i8> undef) +; CHECK-NEXT: call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> [[V1]], <128 x i8>* [[A0]], i32 4, <128 x i1> [[V0]]) +; CHECK-NEXT: ret <128 x i8> [[V1]] +; + %v0 = icmp eq <128 x i8> %a1, %a2 + %v1 = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* %a0, i32 4, <128 x i1> %v0, <128 x i8> undef) + call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> %v1, <128 x i8>* %a0, i32 4, <128 x i1> %v0) + ret <128 x i8> %v1 +} + +define <128 x i8> @f2(<128 x i8>* %a0, <128 x i8> %a1, <128 x i8> %a2) { +; CHECK-LABEL: @f2( +; CHECK-NEXT: [[V0:%.*]] = icmp eq <128 x i8> [[A1:%.*]], [[A2:%.*]] +; CHECK-NEXT: [[V1:%.*]] = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* [[A0:%.*]], i32 4, <128 x i1> [[V0]], <128 x i8> undef) +; CHECK-NEXT: [[V3:%.*]] = add <128 x i8> [[V1]], [[V1]] +; CHECK-NEXT: ret <128 x i8> [[V3]] +; + %v0 = icmp eq <128 x i8> %a1, %a2 + %v1 = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* %a0, i32 4, <128 x i1> %v0, <128 x i8> undef) + %v2 = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* %a0, i32 4, <128 x i1> %v0, <128 x i8> undef) + %v3 = add <128 x i8> %v1, %v2 + ret <128 x i8> %v3 +} + +declare <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>*, i32, <128 x i1>, <128 x i8>) +declare void @llvm.masked.store.v128i8.p0v128i8(<128 x i8>, <128 x i8>*, i32, <128 x i1>) diff --git a/llvm/test/Transforms/EarlyCSE/reuse-preserved-memoryssa.ll b/llvm/test/Transforms/EarlyCSE/reuse-preserved-memoryssa.ll new file mode 100644 index 0000000000000..744389c24db28 --- /dev/null +++ b/llvm/test/Transforms/EarlyCSE/reuse-preserved-memoryssa.ll @@ -0,0 +1,7 @@ +; RUN: opt -memoryssa -gvn -early-cse-memssa %s -S | FileCheck %s + +; CHECK: define void @foo( + +define void @foo() { + ret void +} diff --git a/llvm/test/Transforms/GCOVProfiling/atomic-counter.ll b/llvm/test/Transforms/GCOVProfiling/atomic-counter.ll index 01843e26331fc..2c5ea41b6fd81 100644 --- a/llvm/test/Transforms/GCOVProfiling/atomic-counter.ll +++ b/llvm/test/Transforms/GCOVProfiling/atomic-counter.ll @@ 
-4,12 +4,7 @@ ; CHECK-LABEL: void @empty() ; CHECK-NEXT: entry: -; CHECK-NEXT: br label %0, !dbg [[DBG:![0-9]+]] -; CHECK: 0: -; CHECK-NEXT: %1 = phi i64* [ getelementptr inbounds ([2 x i64], [2 x i64]* @__llvm_gcov_ctr, i64 0, i64 0), %entry ], !dbg [[DBG]] -; CHECK-NEXT: %2 = atomicrmw add i64* %1, i64 1 monotonic, !dbg [[DBG]] -;; Counter for the exit. -; CHECK-NEXT: %3 = atomicrmw add i64* getelementptr inbounds ([2 x i64], [2 x i64]* @__llvm_gcov_ctr, i64 0, i64 1), i64 1 monotonic, !dbg [[DBG]] +; CHECK-NEXT: %0 = atomicrmw add i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__llvm_gcov_ctr, i64 0, i64 0), i64 1 monotonic, !dbg [[DBG:![0-9]+]] ; CHECK-NEXT: ret void, !dbg [[DBG]] define dso_local void @empty() !dbg !5 { diff --git a/llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll b/llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll new file mode 100644 index 0000000000000..4d4ffe4021fa1 --- /dev/null +++ b/llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll @@ -0,0 +1,61 @@ +; RUN: mkdir -p %t && cd %t +; RUN: opt < %s -passes=insert-gcov-profiling -S | FileCheck %s + +; CHECK: @__llvm_gcov_ctr = internal global [1 x i64] zeroinitializer + +;; If an indirectbr critical edge cannot be split, ignore it. +;; The edge will not be profiled. +; CHECK-LABEL: @cannot_split( +; CHECK: indirect.preheader: +; CHECK-NEXT: load {{.*}} @__llvm_gcov_ctr +; CHECK-NOT: load {{.*}} @__llvm_gcov_ctr + +define dso_local i32 @cannot_split(i8* nocapture readonly %p) #0 !dbg !7 { +entry: + %targets = alloca <2 x i8*>, align 16 + store <2 x i8*> , <2 x i8*>* %targets, align 16, !dbg !9 + br label %for.cond, !dbg !14 + +for.cond: ; preds = %for.cond, %entry + %p.addr.0 = phi i8* [ %p, %entry ], [ %incdec.ptr, %for.cond ] + %0 = load i8, i8* %p.addr.0, align 1, !dbg !15 + %cmp = icmp eq i8 %0, 7, !dbg !17 + %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i64 1, !dbg !18 + br i1 %cmp, label %indirect.preheader, label %for.cond, !dbg !15, !llvm.loop !19 + +indirect.preheader: ; preds = %for.cond + %1 = load i8, i8* %incdec.ptr, align 1, !dbg !21 + %idxprom = sext i8 %1 to i64, !dbg !21 + %arrayidx4 = getelementptr inbounds <2 x i8*>, <2 x i8*>* %targets, i64 0, i64 %idxprom, !dbg !21 + %2 = load i8*, i8** %arrayidx4, align 8, !dbg !21 + br label %indirect + +indirect: ; preds = %indirect.preheader, %indirect + indirectbr i8* %2, [label %indirect, label %end] + +end: ; preds = %indirect + ret i32 0, !dbg !22 +} + +attributes #0 = { norecurse nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "a.c", directory: "/tmp/c") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!7 = distinct !DISubprogram(name: 
"cannot_split", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!8 = !DISubroutineType(types: !2) +!9 = !DILocation(line: 3, column: 14, scope: !7) +!14 = !DILocation(line: 5, column: 3, scope: !7) +!15 = !DILocation(line: 6, column: 9, scope: !7) +!17 = !DILocation(line: 6, column: 12, scope: !7) +!18 = !DILocation(line: 5, column: 12, scope: !7) +!19 = distinct !{!19, !14, !20} +!20 = !DILocation(line: 9, column: 5, scope: !7) +!21 = !DILocation(line: 0, scope: !7) +!22 = !DILocation(line: 11, column: 3, scope: !7) diff --git a/llvm/test/Transforms/GVN/assume.ll b/llvm/test/Transforms/GVN/assume.ll new file mode 100644 index 0000000000000..ef2865791715c --- /dev/null +++ b/llvm/test/Transforms/GVN/assume.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -gvn -S | FileCheck %s + +declare void @llvm.assume(i1) +declare void @use(i1) + +define void @assume_arg(i1 %x) { +; CHECK-LABEL: @assume_arg( +; CHECK-NEXT: call void @llvm.assume(i1 [[X:%.*]]) +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 %x) + call void @use(i1 %x) + ret void +} + +define void @assume_not_arg(i1 %x) { +; CHECK-LABEL: @assume_not_arg( +; CHECK-NEXT: [[XOR:%.*]] = xor i1 [[X:%.*]], true +; CHECK-NEXT: call void @llvm.assume(i1 [[XOR]]) +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: ret void +; + %xor = xor i1 %x, true + call void @llvm.assume(i1 %xor) + call void @use(i1 %x) + ret void +} + +define void @pr47496(i8 %x) { +; CHECK-LABEL: @pr47496( +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0 +; CHECK-NEXT: [[XOR:%.*]] = xor i1 [[CMP]], true +; CHECK-NEXT: call void @llvm.assume(i1 [[XOR]]) +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: ret void +; + %cmp = icmp slt i8 %x, 0 + %xor = xor i1 %cmp, true + call void @llvm.assume(i1 %xor) + call void @use(i1 %cmp) + ret void +} diff --git a/llvm/test/Transforms/GVN/masked-load-store-vn-crash.ll b/llvm/test/Transforms/GVN/masked-load-store-vn-crash.ll new file mode 100644 index 0000000000000..ae8369cd19452 --- /dev/null +++ b/llvm/test/Transforms/GVN/masked-load-store-vn-crash.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -gvn -S < %s | FileCheck %s +@file_mask = external global [8 x i64], align 32 + +define fastcc void @test() { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[WIDE_MASKED_LOAD_1_I:%.*]] = tail call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* nonnull bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @file_mask, i64 0, i64 7) to <4 x i64>*), i32 8, <4 x i1> , <4 x i64> undef) +; CHECK-NEXT: unreachable +; +entry: + %wide.masked.load.1.i = tail call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* nonnull bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @file_mask, i64 0, i64 7) to <4 x i64>*), i32 8, <4 x i1> , <4 x i64> undef) #2 + %.pre392.i = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @file_mask, i64 0, i64 7), align 8 + %or156.4.i = or i64 %.pre392.i, undef + %wide.masked.load614.1.i = tail call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* nonnull bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @file_mask, i64 0, i64 7) to <4 x i64>*), i32 8, <4 x i1> , <4 x i64> undef) #2 + unreachable +} + +; Function Attrs: argmemonly nounwind readonly 
willreturn +declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32 immarg, <4 x i1>, <4 x i64>) diff --git a/llvm/test/Transforms/GVN/masked-load-store.ll b/llvm/test/Transforms/GVN/masked-load-store.ll new file mode 100644 index 0000000000000..0b71a10a067db --- /dev/null +++ b/llvm/test/Transforms/GVN/masked-load-store.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -gvn -S < %s | FileCheck %s + +; Check that in both cases the second load is recognized as redundant +; and is removed. + +define <128 x i8> @f0(<128 x i8>* %a0, <128 x i8> %a1, <128 x i8> %a2) { +; CHECK-LABEL: @f0( +; CHECK-NEXT: [[V0:%.*]] = icmp eq <128 x i8> [[A1:%.*]], [[A2:%.*]] +; CHECK-NEXT: [[V1:%.*]] = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* [[A0:%.*]], i32 4, <128 x i1> [[V0]], <128 x i8> undef) +; CHECK-NEXT: [[V3:%.*]] = add <128 x i8> [[V1]], [[V1]] +; CHECK-NEXT: ret <128 x i8> [[V3]] +; + %v0 = icmp eq <128 x i8> %a1, %a2 + %v1 = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* %a0, i32 4, <128 x i1> %v0, <128 x i8> undef) + %v2 = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* %a0, i32 4, <128 x i1> %v0, <128 x i8> undef) + %v3 = add <128 x i8> %v1, %v2 + ret <128 x i8> %v3 +} + +define <128 x i8> @f1(<128 x i8>* %a0, <128 x i8> %a1, <128 x i8> %a2) { +; CHECK-LABEL: @f1( +; CHECK-NEXT: [[V0:%.*]] = icmp eq <128 x i8> [[A1:%.*]], [[A2:%.*]] +; CHECK-NEXT: [[V1:%.*]] = getelementptr <128 x i8>, <128 x i8>* [[A0:%.*]], i32 1 +; CHECK-NEXT: [[V2:%.*]] = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* [[A0]], i32 4, <128 x i1> [[V0]], <128 x i8> undef) +; CHECK-NEXT: call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> [[A2]], <128 x i8>* [[V1]], i32 4, <128 x i1> [[V0]]) +; CHECK-NEXT: [[V4:%.*]] = add <128 x i8> [[V2]], [[V2]] +; CHECK-NEXT: ret <128 x i8> [[V4]] +; + %v0 = icmp eq <128 x i8> %a1, %a2 + %v1 = getelementptr <128 x i8>, <128 x i8>* %a0, i32 1 + %v2 = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* %a0, i32 4, <128 x i1> %v0, <128 x i8> undef) + call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> %a2, <128 x i8>* %v1, i32 4, <128 x i1> %v0) + %v3 = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* %a0, i32 4, <128 x i1> %v0, <128 x i8> undef) + %v4 = add <128 x i8> %v2, %v3 + ret <128 x i8> %v4 +} + +declare <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>*, i32, <128 x i1>, <128 x i8>) +declare void @llvm.masked.store.v128i8.p0v128i8(<128 x i8>, <128 x i8>*, i32, <128 x i1>) + diff --git a/llvm/test/Transforms/HelloNew/helloworld.ll b/llvm/test/Transforms/HelloNew/helloworld.ll new file mode 100644 index 0000000000000..48817c24801ae --- /dev/null +++ b/llvm/test/Transforms/HelloNew/helloworld.ll @@ -0,0 +1,12 @@ +; RUN: opt -disable-output -passes=helloworld %s 2>&1 | FileCheck %s + +; CHECK: {{^}}foo{{$}} +define i32 @foo() { + %a = add i32 2, 3 + ret i32 %a +} + +; CHECK-NEXT: {{^}}bar{{$}} +define void @bar() { + ret void +} diff --git a/llvm/test/Transforms/IndVarSimplify/ARM/indvar-unroll-imm-cost.ll b/llvm/test/Transforms/IndVarSimplify/ARM/indvar-unroll-imm-cost.ll index 36749a03553ea..16f967be12c21 100644 --- a/llvm/test/Transforms/IndVarSimplify/ARM/indvar-unroll-imm-cost.ll +++ b/llvm/test/Transforms/IndVarSimplify/ARM/indvar-unroll-imm-cost.ll @@ -18,344 +18,92 @@ define dso_local arm_aapcscc void @test(i32* nocapture %pDest, i16* nocapture re ; CHECK-NEXT: [[PSRCA_ADDR_090:%.*]] = phi i16* [ 
[[PSRCA_ADDR_2_LCSSA:%.*]], [[FOR_END40]] ], [ [[PSRCA:%.*]], [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[PSRCB_ADDR_089:%.*]] = phi i16* [ [[PSRCB_ADDR_2_LCSSA:%.*]], [[FOR_END40]] ], [ [[PSRCB:%.*]], [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[TMP0:%.*]] = lshr i32 [[I_092]], 2 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[I_092]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 3 -; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 2147483644 -; CHECK-NEXT: [[CMP272:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[TMP0]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 2147483644 +; CHECK-NEXT: [[CMP272:%.*]] = icmp eq i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[CMP272]], label [[FOR_END:%.*]], label [[FOR_BODY3_PREHEADER:%.*]] ; CHECK: for.body3.preheader: -; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[TMP3]], 3 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP2]], 3 -; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY3_PREHEADER_NEW:%.*]] -; CHECK: for.body3.preheader.new: -; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[TMP3]], [[XTRAITER]] ; CHECK-NEXT: br label [[FOR_BODY3:%.*]] ; CHECK: for.body3: -; CHECK-NEXT: [[J_076:%.*]] = phi i32 [ 0, [[FOR_BODY3_PREHEADER_NEW]] ], [ [[ADD24_3:%.*]], [[FOR_BODY3]] ] -; CHECK-NEXT: [[PDEST_ADDR_175:%.*]] = phi i32* [ [[PDEST_ADDR_091]], [[FOR_BODY3_PREHEADER_NEW]] ], [ [[INCDEC_PTR_3:%.*]], [[FOR_BODY3]] ] -; CHECK-NEXT: [[PSRCA_ADDR_174:%.*]] = phi i16* [ [[PSRCA_ADDR_090]], [[FOR_BODY3_PREHEADER_NEW]] ], [ [[ADD_PTR_3:%.*]], [[FOR_BODY3]] ] -; CHECK-NEXT: [[PSRCB_ADDR_173:%.*]] = phi i16* [ [[PSRCB_ADDR_089]], [[FOR_BODY3_PREHEADER_NEW]] ], [ [[ADD_PTR23_3:%.*]], [[FOR_BODY3]] ] -; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ [[UNROLL_ITER]], [[FOR_BODY3_PREHEADER_NEW]] ], [ [[NITER_NSUB_3:%.*]], [[FOR_BODY3]] ] -; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[PSRCA_ADDR_174]], align 2 -; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP8]] to i32 -; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[PSRCB_ADDR_173]], align 2 -; CHECK-NEXT: [[CONV5:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[J_076:%.*]] = phi i32 [ [[ADD24:%.*]], [[FOR_BODY3]] ], [ 0, [[FOR_BODY3_PREHEADER]] ] +; CHECK-NEXT: [[PDEST_ADDR_175:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[FOR_BODY3]] ], [ [[PDEST_ADDR_091]], [[FOR_BODY3_PREHEADER]] ] +; CHECK-NEXT: [[PSRCA_ADDR_174:%.*]] = phi i16* [ [[ADD_PTR:%.*]], [[FOR_BODY3]] ], [ [[PSRCA_ADDR_090]], [[FOR_BODY3_PREHEADER]] ] +; CHECK-NEXT: [[PSRCB_ADDR_173:%.*]] = phi i16* [ [[ADD_PTR23:%.*]], [[FOR_BODY3]] ], [ [[PSRCB_ADDR_089]], [[FOR_BODY3_PREHEADER]] ] +; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[PSRCA_ADDR_174]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, i16* [[PSRCB_ADDR_173]], align 2 +; CHECK-NEXT: [[CONV5:%.*]] = sext i16 [[TMP4]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV5]], [[CONV]] ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = load i16, i16* [[ARRAYIDX6]], align 2 -; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP10]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[ARRAYIDX6]], align 2 +; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP5]] to i32 ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_173]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = load 
i16, i16* [[ARRAYIDX8]], align 2 -; CHECK-NEXT: [[CONV9:%.*]] = sext i16 [[TMP11]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = load i16, i16* [[ARRAYIDX8]], align 2 +; CHECK-NEXT: [[CONV9:%.*]] = sext i16 [[TMP6]] to i32 ; CHECK-NEXT: [[MUL10:%.*]] = mul nsw i32 [[CONV9]], [[CONV7]] ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174]], i32 2 -; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX11]], align 2 -; CHECK-NEXT: [[CONV12:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[ARRAYIDX11]], align 2 +; CHECK-NEXT: [[CONV12:%.*]] = sext i16 [[TMP7]] to i32 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_173]], i32 3 -; CHECK-NEXT: [[TMP13:%.*]] = load i16, i16* [[ARRAYIDX13]], align 2 -; CHECK-NEXT: [[CONV14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX13]], align 2 +; CHECK-NEXT: [[CONV14:%.*]] = sext i16 [[TMP8]] to i32 ; CHECK-NEXT: [[MUL15:%.*]] = mul nsw i32 [[CONV14]], [[CONV12]] ; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174]], i32 3 -; CHECK-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX17]], align 2 -; CHECK-NEXT: [[CONV18:%.*]] = sext i16 [[TMP14]] to i32 +; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX17]], align 2 +; CHECK-NEXT: [[CONV18:%.*]] = sext i16 [[TMP9]] to i32 ; CHECK-NEXT: [[ADD21:%.*]] = add i32 [[MUL10]], [[MUL]] ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[ADD21]], [[CONV14]] ; CHECK-NEXT: [[ADD16:%.*]] = add i32 [[ADD]], [[MUL15]] ; CHECK-NEXT: [[ADD22:%.*]] = add i32 [[ADD16]], [[CONV18]] ; CHECK-NEXT: store i32 [[ADD22]], i32* [[PDEST_ADDR_175]], align 4 -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174]], i32 4 -; CHECK-NEXT: [[ADD_PTR23:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_173]], i32 4 -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[PDEST_ADDR_175]], i32 1 -; CHECK-NEXT: [[ADD24:%.*]] = add nuw nsw i32 [[J_076]], 4 -; CHECK-NEXT: [[NITER_NSUB:%.*]] = sub i32 [[NITER]], 1 -; CHECK-NEXT: [[TMP15:%.*]] = load i16, i16* [[ADD_PTR]], align 2 -; CHECK-NEXT: [[CONV_1:%.*]] = sext i16 [[TMP15]] to i32 -; CHECK-NEXT: [[TMP16:%.*]] = load i16, i16* [[ADD_PTR23]], align 2 -; CHECK-NEXT: [[CONV5_1:%.*]] = sext i16 [[TMP16]] to i32 -; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[CONV5_1]], [[CONV_1]] -; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR]], i32 1 -; CHECK-NEXT: [[TMP17:%.*]] = load i16, i16* [[ARRAYIDX6_1]], align 2 -; CHECK-NEXT: [[CONV7_1:%.*]] = sext i16 [[TMP17]] to i32 -; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23]], i32 1 -; CHECK-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX8_1]], align 2 -; CHECK-NEXT: [[CONV9_1:%.*]] = sext i16 [[TMP18]] to i32 -; CHECK-NEXT: [[MUL10_1:%.*]] = mul nsw i32 [[CONV9_1]], [[CONV7_1]] -; CHECK-NEXT: [[ARRAYIDX11_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR]], i32 2 -; CHECK-NEXT: [[TMP19:%.*]] = load i16, i16* [[ARRAYIDX11_1]], align 2 -; CHECK-NEXT: [[CONV12_1:%.*]] = sext i16 [[TMP19]] to i32 -; CHECK-NEXT: [[ARRAYIDX13_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23]], i32 3 -; CHECK-NEXT: [[TMP20:%.*]] = load i16, i16* [[ARRAYIDX13_1]], align 2 -; CHECK-NEXT: [[CONV14_1:%.*]] = sext i16 [[TMP20]] to i32 -; CHECK-NEXT: [[MUL15_1:%.*]] = mul nsw i32 [[CONV14_1]], [[CONV12_1]] -; CHECK-NEXT: [[ARRAYIDX17_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR]], i32 3 -; CHECK-NEXT: [[TMP21:%.*]] = 
load i16, i16* [[ARRAYIDX17_1]], align 2 -; CHECK-NEXT: [[CONV18_1:%.*]] = sext i16 [[TMP21]] to i32 -; CHECK-NEXT: [[ADD21_1:%.*]] = add i32 [[MUL10_1]], [[MUL_1]] -; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD21_1]], [[CONV14_1]] -; CHECK-NEXT: [[ADD16_1:%.*]] = add i32 [[ADD_1]], [[MUL15_1]] -; CHECK-NEXT: [[ADD22_1:%.*]] = add i32 [[ADD16_1]], [[CONV18_1]] -; CHECK-NEXT: store i32 [[ADD22_1]], i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR]], i32 4 -; CHECK-NEXT: [[ADD_PTR23_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23]], i32 4 -; CHECK-NEXT: [[INCDEC_PTR_1:%.*]] = getelementptr inbounds i32, i32* [[INCDEC_PTR]], i32 1 -; CHECK-NEXT: [[ADD24_1:%.*]] = add nuw nsw i32 [[ADD24]], 4 -; CHECK-NEXT: [[NITER_NSUB_1:%.*]] = sub i32 [[NITER_NSUB]], 1 -; CHECK-NEXT: [[TMP22:%.*]] = load i16, i16* [[ADD_PTR_1]], align 2 -; CHECK-NEXT: [[CONV_2:%.*]] = sext i16 [[TMP22]] to i32 -; CHECK-NEXT: [[TMP23:%.*]] = load i16, i16* [[ADD_PTR23_1]], align 2 -; CHECK-NEXT: [[CONV5_2:%.*]] = sext i16 [[TMP23]] to i32 -; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[CONV5_2]], [[CONV_2]] -; CHECK-NEXT: [[ARRAYIDX6_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_1]], i32 1 -; CHECK-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX6_2]], align 2 -; CHECK-NEXT: [[CONV7_2:%.*]] = sext i16 [[TMP24]] to i32 -; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_1]], i32 1 -; CHECK-NEXT: [[TMP25:%.*]] = load i16, i16* [[ARRAYIDX8_2]], align 2 -; CHECK-NEXT: [[CONV9_2:%.*]] = sext i16 [[TMP25]] to i32 -; CHECK-NEXT: [[MUL10_2:%.*]] = mul nsw i32 [[CONV9_2]], [[CONV7_2]] -; CHECK-NEXT: [[ARRAYIDX11_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_1]], i32 2 -; CHECK-NEXT: [[TMP26:%.*]] = load i16, i16* [[ARRAYIDX11_2]], align 2 -; CHECK-NEXT: [[CONV12_2:%.*]] = sext i16 [[TMP26]] to i32 -; CHECK-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_1]], i32 3 -; CHECK-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX13_2]], align 2 -; CHECK-NEXT: [[CONV14_2:%.*]] = sext i16 [[TMP27]] to i32 -; CHECK-NEXT: [[MUL15_2:%.*]] = mul nsw i32 [[CONV14_2]], [[CONV12_2]] -; CHECK-NEXT: [[ARRAYIDX17_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_1]], i32 3 -; CHECK-NEXT: [[TMP28:%.*]] = load i16, i16* [[ARRAYIDX17_2]], align 2 -; CHECK-NEXT: [[CONV18_2:%.*]] = sext i16 [[TMP28]] to i32 -; CHECK-NEXT: [[ADD21_2:%.*]] = add i32 [[MUL10_2]], [[MUL_2]] -; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD21_2]], [[CONV14_2]] -; CHECK-NEXT: [[ADD16_2:%.*]] = add i32 [[ADD_2]], [[MUL15_2]] -; CHECK-NEXT: [[ADD22_2:%.*]] = add i32 [[ADD16_2]], [[CONV18_2]] -; CHECK-NEXT: store i32 [[ADD22_2]], i32* [[INCDEC_PTR_1]], align 4 -; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_1]], i32 4 -; CHECK-NEXT: [[ADD_PTR23_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_1]], i32 4 -; CHECK-NEXT: [[INCDEC_PTR_2:%.*]] = getelementptr inbounds i32, i32* [[INCDEC_PTR_1]], i32 1 -; CHECK-NEXT: [[ADD24_2:%.*]] = add nuw nsw i32 [[ADD24_1]], 4 -; CHECK-NEXT: [[NITER_NSUB_2:%.*]] = sub i32 [[NITER_NSUB_1]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = load i16, i16* [[ADD_PTR_2]], align 2 -; CHECK-NEXT: [[CONV_3:%.*]] = sext i16 [[TMP29]] to i32 -; CHECK-NEXT: [[TMP30:%.*]] = load i16, i16* [[ADD_PTR23_2]], align 2 -; CHECK-NEXT: [[CONV5_3:%.*]] = sext i16 [[TMP30]] to i32 -; CHECK-NEXT: [[MUL_3:%.*]] = mul nsw i32 [[CONV5_3]], [[CONV_3]] -; CHECK-NEXT: [[ARRAYIDX6_3:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_2]], 
i32 1 -; CHECK-NEXT: [[TMP31:%.*]] = load i16, i16* [[ARRAYIDX6_3]], align 2 -; CHECK-NEXT: [[CONV7_3:%.*]] = sext i16 [[TMP31]] to i32 -; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_2]], i32 1 -; CHECK-NEXT: [[TMP32:%.*]] = load i16, i16* [[ARRAYIDX8_3]], align 2 -; CHECK-NEXT: [[CONV9_3:%.*]] = sext i16 [[TMP32]] to i32 -; CHECK-NEXT: [[MUL10_3:%.*]] = mul nsw i32 [[CONV9_3]], [[CONV7_3]] -; CHECK-NEXT: [[ARRAYIDX11_3:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_2]], i32 2 -; CHECK-NEXT: [[TMP33:%.*]] = load i16, i16* [[ARRAYIDX11_3]], align 2 -; CHECK-NEXT: [[CONV12_3:%.*]] = sext i16 [[TMP33]] to i32 -; CHECK-NEXT: [[ARRAYIDX13_3:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_2]], i32 3 -; CHECK-NEXT: [[TMP34:%.*]] = load i16, i16* [[ARRAYIDX13_3]], align 2 -; CHECK-NEXT: [[CONV14_3:%.*]] = sext i16 [[TMP34]] to i32 -; CHECK-NEXT: [[MUL15_3:%.*]] = mul nsw i32 [[CONV14_3]], [[CONV12_3]] -; CHECK-NEXT: [[ARRAYIDX17_3:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_2]], i32 3 -; CHECK-NEXT: [[TMP35:%.*]] = load i16, i16* [[ARRAYIDX17_3]], align 2 -; CHECK-NEXT: [[CONV18_3:%.*]] = sext i16 [[TMP35]] to i32 -; CHECK-NEXT: [[ADD21_3:%.*]] = add i32 [[MUL10_3]], [[MUL_3]] -; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[ADD21_3]], [[CONV14_3]] -; CHECK-NEXT: [[ADD16_3:%.*]] = add i32 [[ADD_3]], [[MUL15_3]] -; CHECK-NEXT: [[ADD22_3:%.*]] = add i32 [[ADD16_3]], [[CONV18_3]] -; CHECK-NEXT: store i32 [[ADD22_3]], i32* [[INCDEC_PTR_2]], align 4 -; CHECK-NEXT: [[ADD_PTR_3]] = getelementptr inbounds i16, i16* [[ADD_PTR_2]], i32 4 -; CHECK-NEXT: [[ADD_PTR23_3]] = getelementptr inbounds i16, i16* [[ADD_PTR23_2]], i32 4 -; CHECK-NEXT: [[INCDEC_PTR_3]] = getelementptr inbounds i32, i32* [[INCDEC_PTR_2]], i32 1 -; CHECK-NEXT: [[ADD24_3]] = add nuw nsw i32 [[ADD24_2]], 4 -; CHECK-NEXT: [[NITER_NSUB_3]] = sub i32 [[NITER_NSUB_2]], 1 -; CHECK-NEXT: [[NITER_NCMP_3:%.*]] = icmp ne i32 [[NITER_NSUB_3]], 0 -; CHECK-NEXT: br i1 [[NITER_NCMP_3]], label [[FOR_BODY3]], label [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT:%.*]] -; CHECK: for.end.loopexit.unr-lcssa.loopexit: -; CHECK-NEXT: [[ADD_PTR_LCSSA_PH_PH:%.*]] = phi i16* [ [[ADD_PTR_3]], [[FOR_BODY3]] ] -; CHECK-NEXT: [[ADD_PTR23_LCSSA_PH_PH:%.*]] = phi i16* [ [[ADD_PTR23_3]], [[FOR_BODY3]] ] -; CHECK-NEXT: [[INCDEC_PTR_LCSSA_PH_PH:%.*]] = phi i32* [ [[INCDEC_PTR_3]], [[FOR_BODY3]] ] -; CHECK-NEXT: [[J_076_UNR_PH:%.*]] = phi i32 [ [[ADD24_3]], [[FOR_BODY3]] ] -; CHECK-NEXT: [[PDEST_ADDR_175_UNR_PH:%.*]] = phi i32* [ [[INCDEC_PTR_3]], [[FOR_BODY3]] ] -; CHECK-NEXT: [[PSRCA_ADDR_174_UNR_PH:%.*]] = phi i16* [ [[ADD_PTR_3]], [[FOR_BODY3]] ] -; CHECK-NEXT: [[PSRCB_ADDR_173_UNR_PH:%.*]] = phi i16* [ [[ADD_PTR23_3]], [[FOR_BODY3]] ] -; CHECK-NEXT: br label [[FOR_END_LOOPEXIT_UNR_LCSSA]] -; CHECK: for.end.loopexit.unr-lcssa: -; CHECK-NEXT: [[ADD_PTR_LCSSA_PH:%.*]] = phi i16* [ undef, [[FOR_BODY3_PREHEADER]] ], [ [[ADD_PTR_LCSSA_PH_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] -; CHECK-NEXT: [[ADD_PTR23_LCSSA_PH:%.*]] = phi i16* [ undef, [[FOR_BODY3_PREHEADER]] ], [ [[ADD_PTR23_LCSSA_PH_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] -; CHECK-NEXT: [[INCDEC_PTR_LCSSA_PH:%.*]] = phi i32* [ undef, [[FOR_BODY3_PREHEADER]] ], [ [[INCDEC_PTR_LCSSA_PH_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] -; CHECK-NEXT: [[J_076_UNR:%.*]] = phi i32 [ 0, [[FOR_BODY3_PREHEADER]] ], [ [[J_076_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] -; CHECK-NEXT: [[PDEST_ADDR_175_UNR:%.*]] = phi i32* [ [[PDEST_ADDR_091]], [[FOR_BODY3_PREHEADER]] ], [ 
[[PDEST_ADDR_175_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] -; CHECK-NEXT: [[PSRCA_ADDR_174_UNR:%.*]] = phi i16* [ [[PSRCA_ADDR_090]], [[FOR_BODY3_PREHEADER]] ], [ [[PSRCA_ADDR_174_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] -; CHECK-NEXT: [[PSRCB_ADDR_173_UNR:%.*]] = phi i16* [ [[PSRCB_ADDR_089]], [[FOR_BODY3_PREHEADER]] ], [ [[PSRCB_ADDR_173_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] -; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0 -; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[FOR_BODY3_EPIL_PREHEADER:%.*]], label [[FOR_END_LOOPEXIT:%.*]] -; CHECK: for.body3.epil.preheader: -; CHECK-NEXT: br label [[FOR_BODY3_EPIL:%.*]] -; CHECK: for.body3.epil: -; CHECK-NEXT: [[TMP36:%.*]] = load i16, i16* [[PSRCA_ADDR_174_UNR]], align 2 -; CHECK-NEXT: [[CONV_EPIL:%.*]] = sext i16 [[TMP36]] to i32 -; CHECK-NEXT: [[TMP37:%.*]] = load i16, i16* [[PSRCB_ADDR_173_UNR]], align 2 -; CHECK-NEXT: [[CONV5_EPIL:%.*]] = sext i16 [[TMP37]] to i32 -; CHECK-NEXT: [[MUL_EPIL:%.*]] = mul nsw i32 [[CONV5_EPIL]], [[CONV_EPIL]] -; CHECK-NEXT: [[ARRAYIDX6_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174_UNR]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = load i16, i16* [[ARRAYIDX6_EPIL]], align 2 -; CHECK-NEXT: [[CONV7_EPIL:%.*]] = sext i16 [[TMP38]] to i32 -; CHECK-NEXT: [[ARRAYIDX8_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_173_UNR]], i32 1 -; CHECK-NEXT: [[TMP39:%.*]] = load i16, i16* [[ARRAYIDX8_EPIL]], align 2 -; CHECK-NEXT: [[CONV9_EPIL:%.*]] = sext i16 [[TMP39]] to i32 -; CHECK-NEXT: [[MUL10_EPIL:%.*]] = mul nsw i32 [[CONV9_EPIL]], [[CONV7_EPIL]] -; CHECK-NEXT: [[ARRAYIDX11_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174_UNR]], i32 2 -; CHECK-NEXT: [[TMP40:%.*]] = load i16, i16* [[ARRAYIDX11_EPIL]], align 2 -; CHECK-NEXT: [[CONV12_EPIL:%.*]] = sext i16 [[TMP40]] to i32 -; CHECK-NEXT: [[ARRAYIDX13_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_173_UNR]], i32 3 -; CHECK-NEXT: [[TMP41:%.*]] = load i16, i16* [[ARRAYIDX13_EPIL]], align 2 -; CHECK-NEXT: [[CONV14_EPIL:%.*]] = sext i16 [[TMP41]] to i32 -; CHECK-NEXT: [[MUL15_EPIL:%.*]] = mul nsw i32 [[CONV14_EPIL]], [[CONV12_EPIL]] -; CHECK-NEXT: [[ARRAYIDX17_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174_UNR]], i32 3 -; CHECK-NEXT: [[TMP42:%.*]] = load i16, i16* [[ARRAYIDX17_EPIL]], align 2 -; CHECK-NEXT: [[CONV18_EPIL:%.*]] = sext i16 [[TMP42]] to i32 -; CHECK-NEXT: [[ADD21_EPIL:%.*]] = add i32 [[MUL10_EPIL]], [[MUL_EPIL]] -; CHECK-NEXT: [[ADD_EPIL:%.*]] = add i32 [[ADD21_EPIL]], [[CONV14_EPIL]] -; CHECK-NEXT: [[ADD16_EPIL:%.*]] = add i32 [[ADD_EPIL]], [[MUL15_EPIL]] -; CHECK-NEXT: [[ADD22_EPIL:%.*]] = add i32 [[ADD16_EPIL]], [[CONV18_EPIL]] -; CHECK-NEXT: store i32 [[ADD22_EPIL]], i32* [[PDEST_ADDR_175_UNR]], align 4 -; CHECK-NEXT: [[ADD_PTR_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174_UNR]], i32 4 -; CHECK-NEXT: [[ADD_PTR23_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_173_UNR]], i32 4 -; CHECK-NEXT: [[INCDEC_PTR_EPIL:%.*]] = getelementptr inbounds i32, i32* [[PDEST_ADDR_175_UNR]], i32 1 -; CHECK-NEXT: [[ADD24_EPIL:%.*]] = add nuw nsw i32 [[J_076_UNR]], 4 -; CHECK-NEXT: [[EPIL_ITER_SUB:%.*]] = sub i32 [[XTRAITER]], 1 -; CHECK-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i32 [[EPIL_ITER_SUB]], 0 -; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label [[FOR_BODY3_EPIL_1:%.*]], label [[FOR_END_LOOPEXIT_EPILOG_LCSSA:%.*]] -; CHECK: for.end.loopexit.epilog-lcssa: -; CHECK-NEXT: [[ADD_PTR_LCSSA_PH1:%.*]] = phi i16* [ [[ADD_PTR_EPIL]], [[FOR_BODY3_EPIL]] ], [ 
[[ADD_PTR_EPIL_1:%.*]], [[FOR_BODY3_EPIL_1]] ], [ [[ADD_PTR_EPIL_2:%.*]], [[FOR_BODY3_EPIL_2:%.*]] ] -; CHECK-NEXT: [[ADD_PTR23_LCSSA_PH2:%.*]] = phi i16* [ [[ADD_PTR23_EPIL]], [[FOR_BODY3_EPIL]] ], [ [[ADD_PTR23_EPIL_1:%.*]], [[FOR_BODY3_EPIL_1]] ], [ [[ADD_PTR23_EPIL_2:%.*]], [[FOR_BODY3_EPIL_2]] ] -; CHECK-NEXT: [[INCDEC_PTR_LCSSA_PH3:%.*]] = phi i32* [ [[INCDEC_PTR_EPIL]], [[FOR_BODY3_EPIL]] ], [ [[INCDEC_PTR_EPIL_1:%.*]], [[FOR_BODY3_EPIL_1]] ], [ [[INCDEC_PTR_EPIL_2:%.*]], [[FOR_BODY3_EPIL_2]] ] -; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] +; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174]], i32 4 +; CHECK-NEXT: [[ADD_PTR23]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_173]], i32 4 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[PDEST_ADDR_175]], i32 1 +; CHECK-NEXT: [[ADD24]] = add nuw nsw i32 [[J_076]], 4 +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[ADD24]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END_LOOPEXIT:%.*]] ; CHECK: for.end.loopexit: -; CHECK-NEXT: [[ADD_PTR_LCSSA:%.*]] = phi i16* [ [[ADD_PTR_LCSSA_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA]] ], [ [[ADD_PTR_LCSSA_PH1]], [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] ] -; CHECK-NEXT: [[ADD_PTR23_LCSSA:%.*]] = phi i16* [ [[ADD_PTR23_LCSSA_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA]] ], [ [[ADD_PTR23_LCSSA_PH2]], [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] ] -; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi i32* [ [[INCDEC_PTR_LCSSA_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA]] ], [ [[INCDEC_PTR_LCSSA_PH3]], [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] ] +; CHECK-NEXT: [[ADD_PTR_LCSSA:%.*]] = phi i16* [ [[ADD_PTR]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[ADD_PTR23_LCSSA:%.*]] = phi i16* [ [[ADD_PTR23]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi i32* [ [[INCDEC_PTR]], [[FOR_BODY3]] ] ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: [[PSRCB_ADDR_1_LCSSA:%.*]] = phi i16* [ [[PSRCB_ADDR_089]], [[FOR_BODY]] ], [ [[ADD_PTR23_LCSSA]], [[FOR_END_LOOPEXIT]] ] ; CHECK-NEXT: [[PSRCA_ADDR_1_LCSSA:%.*]] = phi i16* [ [[PSRCA_ADDR_090]], [[FOR_BODY]] ], [ [[ADD_PTR_LCSSA]], [[FOR_END_LOOPEXIT]] ] ; CHECK-NEXT: [[PDEST_ADDR_1_LCSSA:%.*]] = phi i32* [ [[PDEST_ADDR_091]], [[FOR_BODY]] ], [ [[INCDEC_PTR_LCSSA]], [[FOR_END_LOOPEXIT]] ] -; CHECK-NEXT: [[J_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP6]], [[FOR_END_LOOPEXIT]] ] -; CHECK-NEXT: [[REM:%.*]] = and i32 [[TMP4]], 3 +; CHECK-NEXT: [[J_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP2]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[REM:%.*]] = and i32 [[TMP0]], 3 ; CHECK-NEXT: [[ADD25:%.*]] = or i32 [[J_0_LCSSA]], [[REM]] ; CHECK-NEXT: [[CMP2780:%.*]] = icmp ugt i32 [[ADD25]], [[J_0_LCSSA]] ; CHECK-NEXT: br i1 [[CMP2780]], label [[FOR_BODY29_PREHEADER:%.*]], label [[FOR_END40]] ; CHECK: for.body29.preheader: -; CHECK-NEXT: [[TMP43:%.*]] = sub nsw i32 [[ADD25]], [[J_0_LCSSA]] -; CHECK-NEXT: [[TMP44:%.*]] = sub i32 [[ADD25]], [[J_0_LCSSA]] -; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[ADD25]], -1 -; CHECK-NEXT: [[TMP46:%.*]] = sub i32 [[TMP45]], [[J_0_LCSSA]] -; CHECK-NEXT: [[XTRAITER4:%.*]] = and i32 [[TMP44]], 3 -; CHECK-NEXT: [[LCMP_MOD5:%.*]] = icmp ne i32 [[XTRAITER4]], 0 -; CHECK-NEXT: br i1 [[LCMP_MOD5]], label [[FOR_BODY29_PROL_PREHEADER:%.*]], label [[FOR_BODY29_PROL_LOOPEXIT:%.*]] -; CHECK: for.body29.prol.preheader: -; CHECK-NEXT: br label [[FOR_BODY29_PROL:%.*]] -; CHECK: for.body29.prol: -; CHECK-NEXT: [[ARRAYIDX30_PROL:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_1_LCSSA]], i32 [[J_0_LCSSA]] -; CHECK-NEXT: 
[[TMP47:%.*]] = load i16, i16* [[ARRAYIDX30_PROL]], align 2 -; CHECK-NEXT: [[CONV31_PROL:%.*]] = sext i16 [[TMP47]] to i32 -; CHECK-NEXT: [[ARRAYIDX32_PROL:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_1_LCSSA]], i32 [[J_0_LCSSA]] -; CHECK-NEXT: [[TMP48:%.*]] = load i16, i16* [[ARRAYIDX32_PROL]], align 2 -; CHECK-NEXT: [[CONV33_PROL:%.*]] = sext i16 [[TMP48]] to i32 -; CHECK-NEXT: [[MUL34_PROL:%.*]] = mul nsw i32 [[CONV33_PROL]], [[CONV31_PROL]] -; CHECK-NEXT: [[TMP49:%.*]] = load i32, i32* [[PDEST_ADDR_1_LCSSA]], align 4 -; CHECK-NEXT: [[ADD35_PROL:%.*]] = add nsw i32 [[MUL34_PROL]], [[TMP49]] -; CHECK-NEXT: store i32 [[ADD35_PROL]], i32* [[PDEST_ADDR_1_LCSSA]], align 4 -; CHECK-NEXT: [[INCDEC_PTR36_PROL:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_1_LCSSA]], i32 1 -; CHECK-NEXT: [[INCDEC_PTR37_PROL:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_1_LCSSA]], i32 1 -; CHECK-NEXT: [[INCDEC_PTR38_PROL:%.*]] = getelementptr inbounds i32, i32* [[PDEST_ADDR_1_LCSSA]], i32 1 -; CHECK-NEXT: [[INC_PROL:%.*]] = add nuw i32 [[J_0_LCSSA]], 1 -; CHECK-NEXT: [[PROL_ITER_SUB:%.*]] = sub i32 [[XTRAITER4]], 1 -; CHECK-NEXT: [[PROL_ITER_CMP:%.*]] = icmp ne i32 [[PROL_ITER_SUB]], 0 -; CHECK-NEXT: br i1 [[PROL_ITER_CMP]], label [[FOR_BODY29_PROL_1:%.*]], label [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA:%.*]] -; CHECK: for.body29.prol.loopexit.unr-lcssa: -; CHECK-NEXT: [[J_184_UNR_PH:%.*]] = phi i32 [ [[INC_PROL]], [[FOR_BODY29_PROL]] ], [ [[INC_PROL_1:%.*]], [[FOR_BODY29_PROL_1]] ], [ [[INC_PROL_2:%.*]], [[FOR_BODY29_PROL_2:%.*]] ] -; CHECK-NEXT: [[PDEST_ADDR_283_UNR_PH:%.*]] = phi i32* [ [[INCDEC_PTR38_PROL]], [[FOR_BODY29_PROL]] ], [ [[INCDEC_PTR38_PROL_1:%.*]], [[FOR_BODY29_PROL_1]] ], [ [[INCDEC_PTR38_PROL_2:%.*]], [[FOR_BODY29_PROL_2]] ] -; CHECK-NEXT: [[PSRCA_ADDR_282_UNR_PH:%.*]] = phi i16* [ [[INCDEC_PTR36_PROL]], [[FOR_BODY29_PROL]] ], [ [[INCDEC_PTR36_PROL_1:%.*]], [[FOR_BODY29_PROL_1]] ], [ [[INCDEC_PTR36_PROL_2:%.*]], [[FOR_BODY29_PROL_2]] ] -; CHECK-NEXT: [[PSRCB_ADDR_281_UNR_PH:%.*]] = phi i16* [ [[INCDEC_PTR37_PROL]], [[FOR_BODY29_PROL]] ], [ [[INCDEC_PTR37_PROL_1:%.*]], [[FOR_BODY29_PROL_1]] ], [ [[INCDEC_PTR37_PROL_2:%.*]], [[FOR_BODY29_PROL_2]] ] -; CHECK-NEXT: br label [[FOR_BODY29_PROL_LOOPEXIT]] -; CHECK: for.body29.prol.loopexit: -; CHECK-NEXT: [[J_184_UNR:%.*]] = phi i32 [ [[J_0_LCSSA]], [[FOR_BODY29_PREHEADER]] ], [ [[J_184_UNR_PH]], [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA]] ] -; CHECK-NEXT: [[PDEST_ADDR_283_UNR:%.*]] = phi i32* [ [[PDEST_ADDR_1_LCSSA]], [[FOR_BODY29_PREHEADER]] ], [ [[PDEST_ADDR_283_UNR_PH]], [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA]] ] -; CHECK-NEXT: [[PSRCA_ADDR_282_UNR:%.*]] = phi i16* [ [[PSRCA_ADDR_1_LCSSA]], [[FOR_BODY29_PREHEADER]] ], [ [[PSRCA_ADDR_282_UNR_PH]], [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA]] ] -; CHECK-NEXT: [[PSRCB_ADDR_281_UNR:%.*]] = phi i16* [ [[PSRCB_ADDR_1_LCSSA]], [[FOR_BODY29_PREHEADER]] ], [ [[PSRCB_ADDR_281_UNR_PH]], [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA]] ] -; CHECK-NEXT: [[TMP50:%.*]] = icmp ult i32 [[TMP46]], 3 -; CHECK-NEXT: br i1 [[TMP50]], label [[FOR_END40_LOOPEXIT:%.*]], label [[FOR_BODY29_PREHEADER_NEW:%.*]] -; CHECK: for.body29.preheader.new: +; CHECK-NEXT: [[TMP10:%.*]] = sub nsw i32 [[ADD25]], [[J_0_LCSSA]] ; CHECK-NEXT: br label [[FOR_BODY29:%.*]] ; CHECK: for.body29: -; CHECK-NEXT: [[J_184:%.*]] = phi i32 [ [[J_184_UNR]], [[FOR_BODY29_PREHEADER_NEW]] ], [ [[INC_3:%.*]], [[FOR_BODY29]] ] -; CHECK-NEXT: [[PDEST_ADDR_283:%.*]] = phi i32* [ [[PDEST_ADDR_283_UNR]], [[FOR_BODY29_PREHEADER_NEW]] ], [ [[INCDEC_PTR38_3:%.*]], 
[[FOR_BODY29]] ] -; CHECK-NEXT: [[PSRCA_ADDR_282:%.*]] = phi i16* [ [[PSRCA_ADDR_282_UNR]], [[FOR_BODY29_PREHEADER_NEW]] ], [ [[INCDEC_PTR36_3:%.*]], [[FOR_BODY29]] ] -; CHECK-NEXT: [[PSRCB_ADDR_281:%.*]] = phi i16* [ [[PSRCB_ADDR_281_UNR]], [[FOR_BODY29_PREHEADER_NEW]] ], [ [[INCDEC_PTR37_3:%.*]], [[FOR_BODY29]] ] +; CHECK-NEXT: [[J_184:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY29]] ], [ [[J_0_LCSSA]], [[FOR_BODY29_PREHEADER]] ] +; CHECK-NEXT: [[PDEST_ADDR_283:%.*]] = phi i32* [ [[INCDEC_PTR38:%.*]], [[FOR_BODY29]] ], [ [[PDEST_ADDR_1_LCSSA]], [[FOR_BODY29_PREHEADER]] ] +; CHECK-NEXT: [[PSRCA_ADDR_282:%.*]] = phi i16* [ [[INCDEC_PTR36:%.*]], [[FOR_BODY29]] ], [ [[PSRCA_ADDR_1_LCSSA]], [[FOR_BODY29_PREHEADER]] ] +; CHECK-NEXT: [[PSRCB_ADDR_281:%.*]] = phi i16* [ [[INCDEC_PTR37:%.*]], [[FOR_BODY29]] ], [ [[PSRCB_ADDR_1_LCSSA]], [[FOR_BODY29_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_282]], i32 [[J_184]] -; CHECK-NEXT: [[TMP51:%.*]] = load i16, i16* [[ARRAYIDX30]], align 2 -; CHECK-NEXT: [[CONV31:%.*]] = sext i16 [[TMP51]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX30]], align 2 +; CHECK-NEXT: [[CONV31:%.*]] = sext i16 [[TMP11]] to i32 ; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_281]], i32 [[J_184]] -; CHECK-NEXT: [[TMP52:%.*]] = load i16, i16* [[ARRAYIDX32]], align 2 -; CHECK-NEXT: [[CONV33:%.*]] = sext i16 [[TMP52]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX32]], align 2 +; CHECK-NEXT: [[CONV33:%.*]] = sext i16 [[TMP12]] to i32 ; CHECK-NEXT: [[MUL34:%.*]] = mul nsw i32 [[CONV33]], [[CONV31]] -; CHECK-NEXT: [[TMP53:%.*]] = load i32, i32* [[PDEST_ADDR_283]], align 4 -; CHECK-NEXT: [[ADD35:%.*]] = add nsw i32 [[MUL34]], [[TMP53]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[PDEST_ADDR_283]], align 4 +; CHECK-NEXT: [[ADD35:%.*]] = add nsw i32 [[MUL34]], [[TMP13]] ; CHECK-NEXT: store i32 [[ADD35]], i32* [[PDEST_ADDR_283]], align 4 -; CHECK-NEXT: [[INCDEC_PTR36:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_282]], i32 1 -; CHECK-NEXT: [[INCDEC_PTR37:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_281]], i32 1 -; CHECK-NEXT: [[INCDEC_PTR38:%.*]] = getelementptr inbounds i32, i32* [[PDEST_ADDR_283]], i32 1 -; CHECK-NEXT: [[INC:%.*]] = add nuw i32 [[J_184]], 1 -; CHECK-NEXT: [[ARRAYIDX30_1:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36]], i32 [[INC]] -; CHECK-NEXT: [[TMP54:%.*]] = load i16, i16* [[ARRAYIDX30_1]], align 2 -; CHECK-NEXT: [[CONV31_1:%.*]] = sext i16 [[TMP54]] to i32 -; CHECK-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37]], i32 [[INC]] -; CHECK-NEXT: [[TMP55:%.*]] = load i16, i16* [[ARRAYIDX32_1]], align 2 -; CHECK-NEXT: [[CONV33_1:%.*]] = sext i16 [[TMP55]] to i32 -; CHECK-NEXT: [[MUL34_1:%.*]] = mul nsw i32 [[CONV33_1]], [[CONV31_1]] -; CHECK-NEXT: [[TMP56:%.*]] = load i32, i32* [[INCDEC_PTR38]], align 4 -; CHECK-NEXT: [[ADD35_1:%.*]] = add nsw i32 [[MUL34_1]], [[TMP56]] -; CHECK-NEXT: store i32 [[ADD35_1]], i32* [[INCDEC_PTR38]], align 4 -; CHECK-NEXT: [[INCDEC_PTR36_1:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36]], i32 1 -; CHECK-NEXT: [[INCDEC_PTR37_1:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37]], i32 1 -; CHECK-NEXT: [[INCDEC_PTR38_1:%.*]] = getelementptr inbounds i32, i32* [[INCDEC_PTR38]], i32 1 -; CHECK-NEXT: [[INC_1:%.*]] = add nuw i32 [[INC]], 1 -; CHECK-NEXT: [[ARRAYIDX30_2:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_1]], i32 [[INC_1]] -; CHECK-NEXT: 
[[TMP57:%.*]] = load i16, i16* [[ARRAYIDX30_2]], align 2 -; CHECK-NEXT: [[CONV31_2:%.*]] = sext i16 [[TMP57]] to i32 -; CHECK-NEXT: [[ARRAYIDX32_2:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_1]], i32 [[INC_1]] -; CHECK-NEXT: [[TMP58:%.*]] = load i16, i16* [[ARRAYIDX32_2]], align 2 -; CHECK-NEXT: [[CONV33_2:%.*]] = sext i16 [[TMP58]] to i32 -; CHECK-NEXT: [[MUL34_2:%.*]] = mul nsw i32 [[CONV33_2]], [[CONV31_2]] -; CHECK-NEXT: [[TMP59:%.*]] = load i32, i32* [[INCDEC_PTR38_1]], align 4 -; CHECK-NEXT: [[ADD35_2:%.*]] = add nsw i32 [[MUL34_2]], [[TMP59]] -; CHECK-NEXT: store i32 [[ADD35_2]], i32* [[INCDEC_PTR38_1]], align 4 -; CHECK-NEXT: [[INCDEC_PTR36_2:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_1]], i32 1 -; CHECK-NEXT: [[INCDEC_PTR37_2:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_1]], i32 1 -; CHECK-NEXT: [[INCDEC_PTR38_2:%.*]] = getelementptr inbounds i32, i32* [[INCDEC_PTR38_1]], i32 1 -; CHECK-NEXT: [[INC_2:%.*]] = add nuw i32 [[INC_1]], 1 -; CHECK-NEXT: [[ARRAYIDX30_3:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_2]], i32 [[INC_2]] -; CHECK-NEXT: [[TMP60:%.*]] = load i16, i16* [[ARRAYIDX30_3]], align 2 -; CHECK-NEXT: [[CONV31_3:%.*]] = sext i16 [[TMP60]] to i32 -; CHECK-NEXT: [[ARRAYIDX32_3:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_2]], i32 [[INC_2]] -; CHECK-NEXT: [[TMP61:%.*]] = load i16, i16* [[ARRAYIDX32_3]], align 2 -; CHECK-NEXT: [[CONV33_3:%.*]] = sext i16 [[TMP61]] to i32 -; CHECK-NEXT: [[MUL34_3:%.*]] = mul nsw i32 [[CONV33_3]], [[CONV31_3]] -; CHECK-NEXT: [[TMP62:%.*]] = load i32, i32* [[INCDEC_PTR38_2]], align 4 -; CHECK-NEXT: [[ADD35_3:%.*]] = add nsw i32 [[MUL34_3]], [[TMP62]] -; CHECK-NEXT: store i32 [[ADD35_3]], i32* [[INCDEC_PTR38_2]], align 4 -; CHECK-NEXT: [[INCDEC_PTR36_3]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_2]], i32 1 -; CHECK-NEXT: [[INCDEC_PTR37_3]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_2]], i32 1 -; CHECK-NEXT: [[INCDEC_PTR38_3]] = getelementptr inbounds i32, i32* [[INCDEC_PTR38_2]], i32 1 -; CHECK-NEXT: [[INC_3]] = add nuw i32 [[INC_2]], 1 -; CHECK-NEXT: [[EXITCOND_3:%.*]] = icmp eq i32 [[INC_3]], [[ADD25]] -; CHECK-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END40_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY29]] -; CHECK: for.end40.loopexit.unr-lcssa: -; CHECK-NEXT: br label [[FOR_END40_LOOPEXIT]] +; CHECK-NEXT: [[INCDEC_PTR36]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_282]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR37]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_281]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR38]] = getelementptr inbounds i32, i32* [[PDEST_ADDR_283]], i32 1 +; CHECK-NEXT: [[INC]] = add nuw i32 [[J_184]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ADD25]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END40_LOOPEXIT:%.*]], label [[FOR_BODY29]] ; CHECK: for.end40.loopexit: -; CHECK-NEXT: [[SCEVGEP93:%.*]] = getelementptr i16, i16* [[PSRCB_ADDR_1_LCSSA]], i32 [[TMP43]] -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i16, i16* [[PSRCA_ADDR_1_LCSSA]], i32 [[TMP43]] -; CHECK-NEXT: [[SCEVGEP94:%.*]] = getelementptr i32, i32* [[PDEST_ADDR_1_LCSSA]], i32 [[TMP43]] +; CHECK-NEXT: [[SCEVGEP93:%.*]] = getelementptr i16, i16* [[PSRCB_ADDR_1_LCSSA]], i32 [[TMP10]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i16, i16* [[PSRCA_ADDR_1_LCSSA]], i32 [[TMP10]] +; CHECK-NEXT: [[SCEVGEP94:%.*]] = getelementptr i32, i32* [[PDEST_ADDR_1_LCSSA]], i32 [[TMP10]] ; CHECK-NEXT: br label [[FOR_END40]] ; CHECK: for.end40: ; CHECK-NEXT: [[PSRCB_ADDR_2_LCSSA]] = phi i16* [ 
[[PSRCB_ADDR_1_LCSSA]], [[FOR_END]] ], [ [[SCEVGEP93]], [[FOR_END40_LOOPEXIT]] ] @@ -364,110 +112,6 @@ define dso_local arm_aapcscc void @test(i32* nocapture %pDest, i16* nocapture re ; CHECK-NEXT: [[INC42]] = add nuw i32 [[I_092]], 1 ; CHECK-NEXT: [[EXITCOND95:%.*]] = icmp eq i32 [[INC42]], [[BLKCNT]] ; CHECK-NEXT: br i1 [[EXITCOND95]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]] -; CHECK: for.body3.epil.1: -; CHECK-NEXT: [[TMP63:%.*]] = load i16, i16* [[ADD_PTR_EPIL]], align 2 -; CHECK-NEXT: [[CONV_EPIL_1:%.*]] = sext i16 [[TMP63]] to i32 -; CHECK-NEXT: [[TMP64:%.*]] = load i16, i16* [[ADD_PTR23_EPIL]], align 2 -; CHECK-NEXT: [[CONV5_EPIL_1:%.*]] = sext i16 [[TMP64]] to i32 -; CHECK-NEXT: [[MUL_EPIL_1:%.*]] = mul nsw i32 [[CONV5_EPIL_1]], [[CONV_EPIL_1]] -; CHECK-NEXT: [[ARRAYIDX6_EPIL_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL]], i32 1 -; CHECK-NEXT: [[TMP65:%.*]] = load i16, i16* [[ARRAYIDX6_EPIL_1]], align 2 -; CHECK-NEXT: [[CONV7_EPIL_1:%.*]] = sext i16 [[TMP65]] to i32 -; CHECK-NEXT: [[ARRAYIDX8_EPIL_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_EPIL]], i32 1 -; CHECK-NEXT: [[TMP66:%.*]] = load i16, i16* [[ARRAYIDX8_EPIL_1]], align 2 -; CHECK-NEXT: [[CONV9_EPIL_1:%.*]] = sext i16 [[TMP66]] to i32 -; CHECK-NEXT: [[MUL10_EPIL_1:%.*]] = mul nsw i32 [[CONV9_EPIL_1]], [[CONV7_EPIL_1]] -; CHECK-NEXT: [[ARRAYIDX11_EPIL_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL]], i32 2 -; CHECK-NEXT: [[TMP67:%.*]] = load i16, i16* [[ARRAYIDX11_EPIL_1]], align 2 -; CHECK-NEXT: [[CONV12_EPIL_1:%.*]] = sext i16 [[TMP67]] to i32 -; CHECK-NEXT: [[ARRAYIDX13_EPIL_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_EPIL]], i32 3 -; CHECK-NEXT: [[TMP68:%.*]] = load i16, i16* [[ARRAYIDX13_EPIL_1]], align 2 -; CHECK-NEXT: [[CONV14_EPIL_1:%.*]] = sext i16 [[TMP68]] to i32 -; CHECK-NEXT: [[MUL15_EPIL_1:%.*]] = mul nsw i32 [[CONV14_EPIL_1]], [[CONV12_EPIL_1]] -; CHECK-NEXT: [[ARRAYIDX17_EPIL_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL]], i32 3 -; CHECK-NEXT: [[TMP69:%.*]] = load i16, i16* [[ARRAYIDX17_EPIL_1]], align 2 -; CHECK-NEXT: [[CONV18_EPIL_1:%.*]] = sext i16 [[TMP69]] to i32 -; CHECK-NEXT: [[ADD21_EPIL_1:%.*]] = add i32 [[MUL10_EPIL_1]], [[MUL_EPIL_1]] -; CHECK-NEXT: [[ADD_EPIL_1:%.*]] = add i32 [[ADD21_EPIL_1]], [[CONV14_EPIL_1]] -; CHECK-NEXT: [[ADD16_EPIL_1:%.*]] = add i32 [[ADD_EPIL_1]], [[MUL15_EPIL_1]] -; CHECK-NEXT: [[ADD22_EPIL_1:%.*]] = add i32 [[ADD16_EPIL_1]], [[CONV18_EPIL_1]] -; CHECK-NEXT: store i32 [[ADD22_EPIL_1]], i32* [[INCDEC_PTR_EPIL]], align 4 -; CHECK-NEXT: [[ADD_PTR_EPIL_1]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL]], i32 4 -; CHECK-NEXT: [[ADD_PTR23_EPIL_1]] = getelementptr inbounds i16, i16* [[ADD_PTR23_EPIL]], i32 4 -; CHECK-NEXT: [[INCDEC_PTR_EPIL_1]] = getelementptr inbounds i32, i32* [[INCDEC_PTR_EPIL]], i32 1 -; CHECK-NEXT: [[ADD24_EPIL_1:%.*]] = add nuw nsw i32 [[ADD24_EPIL]], 4 -; CHECK-NEXT: [[EPIL_ITER_SUB_1:%.*]] = sub i32 [[EPIL_ITER_SUB]], 1 -; CHECK-NEXT: [[EPIL_ITER_CMP_1:%.*]] = icmp ne i32 [[EPIL_ITER_SUB_1]], 0 -; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_1]], label [[FOR_BODY3_EPIL_2]], label [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] -; CHECK: for.body3.epil.2: -; CHECK-NEXT: [[TMP70:%.*]] = load i16, i16* [[ADD_PTR_EPIL_1]], align 2 -; CHECK-NEXT: [[CONV_EPIL_2:%.*]] = sext i16 [[TMP70]] to i32 -; CHECK-NEXT: [[TMP71:%.*]] = load i16, i16* [[ADD_PTR23_EPIL_1]], align 2 -; CHECK-NEXT: [[CONV5_EPIL_2:%.*]] = sext i16 [[TMP71]] to i32 -; CHECK-NEXT: [[MUL_EPIL_2:%.*]] = mul nsw i32 [[CONV5_EPIL_2]], 
[[CONV_EPIL_2]] -; CHECK-NEXT: [[ARRAYIDX6_EPIL_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL_1]], i32 1 -; CHECK-NEXT: [[TMP72:%.*]] = load i16, i16* [[ARRAYIDX6_EPIL_2]], align 2 -; CHECK-NEXT: [[CONV7_EPIL_2:%.*]] = sext i16 [[TMP72]] to i32 -; CHECK-NEXT: [[ARRAYIDX8_EPIL_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_EPIL_1]], i32 1 -; CHECK-NEXT: [[TMP73:%.*]] = load i16, i16* [[ARRAYIDX8_EPIL_2]], align 2 -; CHECK-NEXT: [[CONV9_EPIL_2:%.*]] = sext i16 [[TMP73]] to i32 -; CHECK-NEXT: [[MUL10_EPIL_2:%.*]] = mul nsw i32 [[CONV9_EPIL_2]], [[CONV7_EPIL_2]] -; CHECK-NEXT: [[ARRAYIDX11_EPIL_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL_1]], i32 2 -; CHECK-NEXT: [[TMP74:%.*]] = load i16, i16* [[ARRAYIDX11_EPIL_2]], align 2 -; CHECK-NEXT: [[CONV12_EPIL_2:%.*]] = sext i16 [[TMP74]] to i32 -; CHECK-NEXT: [[ARRAYIDX13_EPIL_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_EPIL_1]], i32 3 -; CHECK-NEXT: [[TMP75:%.*]] = load i16, i16* [[ARRAYIDX13_EPIL_2]], align 2 -; CHECK-NEXT: [[CONV14_EPIL_2:%.*]] = sext i16 [[TMP75]] to i32 -; CHECK-NEXT: [[MUL15_EPIL_2:%.*]] = mul nsw i32 [[CONV14_EPIL_2]], [[CONV12_EPIL_2]] -; CHECK-NEXT: [[ARRAYIDX17_EPIL_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL_1]], i32 3 -; CHECK-NEXT: [[TMP76:%.*]] = load i16, i16* [[ARRAYIDX17_EPIL_2]], align 2 -; CHECK-NEXT: [[CONV18_EPIL_2:%.*]] = sext i16 [[TMP76]] to i32 -; CHECK-NEXT: [[ADD21_EPIL_2:%.*]] = add i32 [[MUL10_EPIL_2]], [[MUL_EPIL_2]] -; CHECK-NEXT: [[ADD_EPIL_2:%.*]] = add i32 [[ADD21_EPIL_2]], [[CONV14_EPIL_2]] -; CHECK-NEXT: [[ADD16_EPIL_2:%.*]] = add i32 [[ADD_EPIL_2]], [[MUL15_EPIL_2]] -; CHECK-NEXT: [[ADD22_EPIL_2:%.*]] = add i32 [[ADD16_EPIL_2]], [[CONV18_EPIL_2]] -; CHECK-NEXT: store i32 [[ADD22_EPIL_2]], i32* [[INCDEC_PTR_EPIL_1]], align 4 -; CHECK-NEXT: [[ADD_PTR_EPIL_2]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL_1]], i32 4 -; CHECK-NEXT: [[ADD_PTR23_EPIL_2]] = getelementptr inbounds i16, i16* [[ADD_PTR23_EPIL_1]], i32 4 -; CHECK-NEXT: [[INCDEC_PTR_EPIL_2]] = getelementptr inbounds i32, i32* [[INCDEC_PTR_EPIL_1]], i32 1 -; CHECK-NEXT: [[ADD24_EPIL_2:%.*]] = add nuw nsw i32 [[ADD24_EPIL_1]], 4 -; CHECK-NEXT: [[EPIL_ITER_SUB_2:%.*]] = sub i32 [[EPIL_ITER_SUB_1]], 1 -; CHECK-NEXT: br label [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] -; CHECK: for.body29.prol.1: -; CHECK-NEXT: [[ARRAYIDX30_PROL_1:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_PROL]], i32 [[INC_PROL]] -; CHECK-NEXT: [[TMP77:%.*]] = load i16, i16* [[ARRAYIDX30_PROL_1]], align 2 -; CHECK-NEXT: [[CONV31_PROL_1:%.*]] = sext i16 [[TMP77]] to i32 -; CHECK-NEXT: [[ARRAYIDX32_PROL_1:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_PROL]], i32 [[INC_PROL]] -; CHECK-NEXT: [[TMP78:%.*]] = load i16, i16* [[ARRAYIDX32_PROL_1]], align 2 -; CHECK-NEXT: [[CONV33_PROL_1:%.*]] = sext i16 [[TMP78]] to i32 -; CHECK-NEXT: [[MUL34_PROL_1:%.*]] = mul nsw i32 [[CONV33_PROL_1]], [[CONV31_PROL_1]] -; CHECK-NEXT: [[TMP79:%.*]] = load i32, i32* [[INCDEC_PTR38_PROL]], align 4 -; CHECK-NEXT: [[ADD35_PROL_1:%.*]] = add nsw i32 [[MUL34_PROL_1]], [[TMP79]] -; CHECK-NEXT: store i32 [[ADD35_PROL_1]], i32* [[INCDEC_PTR38_PROL]], align 4 -; CHECK-NEXT: [[INCDEC_PTR36_PROL_1]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_PROL]], i32 1 -; CHECK-NEXT: [[INCDEC_PTR37_PROL_1]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_PROL]], i32 1 -; CHECK-NEXT: [[INCDEC_PTR38_PROL_1]] = getelementptr inbounds i32, i32* [[INCDEC_PTR38_PROL]], i32 1 -; CHECK-NEXT: [[INC_PROL_1]] = add nuw i32 [[INC_PROL]], 1 -; CHECK-NEXT: 
[[PROL_ITER_SUB_1:%.*]] = sub i32 [[PROL_ITER_SUB]], 1 -; CHECK-NEXT: [[PROL_ITER_CMP_1:%.*]] = icmp ne i32 [[PROL_ITER_SUB_1]], 0 -; CHECK-NEXT: br i1 [[PROL_ITER_CMP_1]], label [[FOR_BODY29_PROL_2]], label [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA]] -; CHECK: for.body29.prol.2: -; CHECK-NEXT: [[ARRAYIDX30_PROL_2:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_PROL_1]], i32 [[INC_PROL_1]] -; CHECK-NEXT: [[TMP80:%.*]] = load i16, i16* [[ARRAYIDX30_PROL_2]], align 2 -; CHECK-NEXT: [[CONV31_PROL_2:%.*]] = sext i16 [[TMP80]] to i32 -; CHECK-NEXT: [[ARRAYIDX32_PROL_2:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_PROL_1]], i32 [[INC_PROL_1]] -; CHECK-NEXT: [[TMP81:%.*]] = load i16, i16* [[ARRAYIDX32_PROL_2]], align 2 -; CHECK-NEXT: [[CONV33_PROL_2:%.*]] = sext i16 [[TMP81]] to i32 -; CHECK-NEXT: [[MUL34_PROL_2:%.*]] = mul nsw i32 [[CONV33_PROL_2]], [[CONV31_PROL_2]] -; CHECK-NEXT: [[TMP82:%.*]] = load i32, i32* [[INCDEC_PTR38_PROL_1]], align 4 -; CHECK-NEXT: [[ADD35_PROL_2:%.*]] = add nsw i32 [[MUL34_PROL_2]], [[TMP82]] -; CHECK-NEXT: store i32 [[ADD35_PROL_2]], i32* [[INCDEC_PTR38_PROL_1]], align 4 -; CHECK-NEXT: [[INCDEC_PTR36_PROL_2]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_PROL_1]], i32 1 -; CHECK-NEXT: [[INCDEC_PTR37_PROL_2]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_PROL_1]], i32 1 -; CHECK-NEXT: [[INCDEC_PTR38_PROL_2]] = getelementptr inbounds i32, i32* [[INCDEC_PTR38_PROL_1]], i32 1 -; CHECK-NEXT: [[INC_PROL_2]] = add nuw i32 [[INC_PROL_1]], 1 -; CHECK-NEXT: [[PROL_ITER_SUB_2:%.*]] = sub i32 [[PROL_ITER_SUB_1]], 1 -; CHECK-NEXT: br label [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA]] ; entry: %cmp88 = icmp eq i32 %blkCnt, 0 @@ -576,3 +220,5 @@ for.end40: ; preds = %for.end40.loopexit, %exitcond95 = icmp eq i32 %inc42, %blkCnt br i1 %exitcond95, label %for.cond.cleanup, label %for.body } + +attributes #0 = { minsize optsize } diff --git a/llvm/test/Transforms/IndVarSimplify/canonicalize-cmp.ll b/llvm/test/Transforms/IndVarSimplify/canonicalize-cmp.ll index 2b939767284a4..7c4bad11a5ea5 100644 --- a/llvm/test/Transforms/IndVarSimplify/canonicalize-cmp.ll +++ b/llvm/test/Transforms/IndVarSimplify/canonicalize-cmp.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -indvars < %s | FileCheck %s ; Check that we replace signed comparisons between non-negative values with @@ -6,13 +7,35 @@ target datalayout = "n8:16:32:64" define i32 @test_01(i32 %a, i32 %b, i32* %p) { - ; CHECK-LABEL: @test_01( -; CHECK-NOT: icmp slt -; CHECK: %cmp1 = icmp ult i32 %iv, 100 -; CHECK: %cmp2 = icmp ult i32 %iv, 100 -; CHECK-NOT: %cmp3 -; CHECK: %exitcond = icmp ne i32 %iv.next, 1000 +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP_ENTRY:%.*]] +; CHECK: loop.entry: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_BE:%.*]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i32 [[IV]], 100 +; CHECK-NEXT: br i1 [[CMP1]], label [[B1:%.*]], label [[B2:%.*]] +; CHECK: b1: +; CHECK-NEXT: store i32 [[IV]], i32* [[P:%.*]], align 4 +; CHECK-NEXT: br label [[MERGE:%.*]] +; CHECK: b2: +; CHECK-NEXT: store i32 [[A:%.*]], i32* [[P]], align 4 +; CHECK-NEXT: br label [[MERGE]] +; CHECK: merge: +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[IV]], 100 +; CHECK-NEXT: br i1 [[CMP2]], label [[B3:%.*]], label [[B4:%.*]] +; CHECK: b3: +; CHECK-NEXT: store i32 [[IV]], i32* [[P]], align 4 +; CHECK-NEXT: br label [[LOOP_BE]] +; CHECK: b4: +; CHECK-NEXT: store i32 [[B:%.*]], i32* [[P]], align 4 +; CHECK-NEXT: br label [[LOOP_BE]] +; 
CHECK: loop.be: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_ENTRY]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret i32 999 +; entry: br label %loop.entry @@ -52,13 +75,35 @@ exit: } define i32 @test_02(i32 %a, i32 %b, i32* %p) { - ; CHECK-LABEL: @test_02( -; CHECK-NOT: icmp sgt -; CHECK: %cmp1 = icmp ugt i32 100, %iv -; CHECK: %cmp2 = icmp ugt i32 100, %iv -; CHECK-NOT: %cmp3 -; CHECK: %exitcond = icmp ne i32 %iv.next, 1000 +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP_ENTRY:%.*]] +; CHECK: loop.entry: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_BE:%.*]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ugt i32 100, [[IV]] +; CHECK-NEXT: br i1 [[CMP1]], label [[B1:%.*]], label [[B2:%.*]] +; CHECK: b1: +; CHECK-NEXT: store i32 [[IV]], i32* [[P:%.*]], align 4 +; CHECK-NEXT: br label [[MERGE:%.*]] +; CHECK: b2: +; CHECK-NEXT: store i32 [[A:%.*]], i32* [[P]], align 4 +; CHECK-NEXT: br label [[MERGE]] +; CHECK: merge: +; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt i32 100, [[IV]] +; CHECK-NEXT: br i1 [[CMP2]], label [[B3:%.*]], label [[B4:%.*]] +; CHECK: b3: +; CHECK-NEXT: store i32 [[IV]], i32* [[P]], align 4 +; CHECK-NEXT: br label [[LOOP_BE]] +; CHECK: b4: +; CHECK-NEXT: store i32 [[B:%.*]], i32* [[P]], align 4 +; CHECK-NEXT: br label [[LOOP_BE]] +; CHECK: loop.be: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_ENTRY]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret i32 999 +; entry: br label %loop.entry diff --git a/llvm/test/Transforms/IndVarSimplify/lftr-multi-exit.ll b/llvm/test/Transforms/IndVarSimplify/lftr-multi-exit.ll index 66951eda7a575..7dfd4ebc00158 100644 --- a/llvm/test/Transforms/IndVarSimplify/lftr-multi-exit.ll +++ b/llvm/test/Transforms/IndVarSimplify/lftr-multi-exit.ll @@ -19,7 +19,7 @@ define void @analyzeable_early_exit(i32 %n) { ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LATCH]], label [[EXIT:%.*]] ; CHECK: latch: ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 -; CHECK-NEXT: store i32 [[IV]], i32* @A +; CHECK-NEXT: store i32 [[IV]], i32* @A, align 4 ; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp ne i32 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[EXITCOND1]], label [[LOOP]], label [[EXIT]] ; CHECK: exit: @@ -49,12 +49,12 @@ define void @unanalyzeable_early_exit() { ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[VOL:%.*]] = load volatile i32, i32* @A +; CHECK-NEXT: [[VOL:%.*]] = load volatile i32, i32* @A, align 4 ; CHECK-NEXT: [[EARLYCND:%.*]] = icmp ne i32 [[VOL]], 0 ; CHECK-NEXT: br i1 [[EARLYCND]], label [[LATCH]], label [[EXIT:%.*]] ; CHECK: latch: ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 -; CHECK-NEXT: store i32 [[IV]], i32* @A +; CHECK-NEXT: store i32 [[IV]], i32* @A, align 4 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT]] ; CHECK: exit: @@ -89,12 +89,12 @@ define void @multiple_early_exits(i32 %n, i32 %m) { ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[IV]], [[N:%.*]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[CONTINUE:%.*]], label [[EXIT:%.*]] ; CHECK: continue: -; CHECK-NEXT: store volatile i32 [[IV]], i32* @A +; CHECK-NEXT: store volatile i32 [[IV]], i32* @A, align 4 ; CHECK-NEXT: 
[[EXITCOND1:%.*]] = icmp ne i32 [[IV]], [[M:%.*]] ; CHECK-NEXT: br i1 [[EXITCOND1]], label [[LATCH]], label [[EXIT]] ; CHECK: latch: ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 -; CHECK-NEXT: store volatile i32 [[IV]], i32* @A +; CHECK-NEXT: store volatile i32 [[IV]], i32* @A, align 4 ; CHECK-NEXT: [[EXITCOND2:%.*]] = icmp ne i32 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[EXITCOND2]], label [[LOOP]], label [[EXIT]] ; CHECK: exit: @@ -137,7 +137,7 @@ define void @compound_early_exit(i32 %n, i32 %m) { ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LATCH]], label [[EXIT:%.*]] ; CHECK: latch: ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 -; CHECK-NEXT: store volatile i32 [[IV]], i32* @A +; CHECK-NEXT: store volatile i32 [[IV]], i32* @A, align 4 ; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp ne i32 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[EXITCOND1]], label [[LOOP]], label [[EXIT]] ; CHECK: exit: @@ -174,8 +174,8 @@ define void @unanalyzeable_latch(i32 %n) { ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LATCH]], label [[EXIT:%.*]] ; CHECK: latch: ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: store i32 [[IV]], i32* @A -; CHECK-NEXT: [[VOL:%.*]] = load volatile i32, i32* @A +; CHECK-NEXT: store i32 [[IV]], i32* @A, align 4 +; CHECK-NEXT: [[VOL:%.*]] = load volatile i32, i32* @A, align 4 ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[VOL]], 1000 ; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT]] ; CHECK: exit: @@ -210,7 +210,7 @@ define void @single_exit_no_latch(i32 %n) { ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LATCH]], label [[EXIT:%.*]] ; CHECK: latch: ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: store i32 [[IV]], i32* @A +; CHECK-NEXT: store i32 [[IV]], i32* @A, align 4 ; CHECK-NEXT: br label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: ret void @@ -243,11 +243,11 @@ define void @no_latch_exit(i32 %n, i32 %m) { ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[IV]], [[N:%.*]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[CONTINUE:%.*]], label [[EXIT:%.*]] ; CHECK: continue: -; CHECK-NEXT: store volatile i32 [[IV]], i32* @A +; CHECK-NEXT: store volatile i32 [[IV]], i32* @A, align 4 ; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp ne i32 [[IV]], [[M:%.*]] ; CHECK-NEXT: br i1 [[EXITCOND1]], label [[LATCH]], label [[EXIT]] ; CHECK: latch: -; CHECK-NEXT: store volatile i32 [[IV]], i32* @A +; CHECK-NEXT: store volatile i32 [[IV]], i32* @A, align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: br label [[LOOP]] ; CHECK: exit: @@ -287,7 +287,7 @@ define void @combine_ivs(i32 %n) { ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LATCH]], label [[EXIT:%.*]] ; CHECK: latch: ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 -; CHECK-NEXT: store volatile i32 [[IV]], i32* @A +; CHECK-NEXT: store volatile i32 [[IV]], i32* @A, align 4 ; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp ne i32 [[IV_NEXT]], 999 ; CHECK-NEXT: br i1 [[EXITCOND1]], label [[LOOP]], label [[EXIT]] ; CHECK: exit: @@ -324,7 +324,7 @@ define void @combine_ivs2(i32 %n) { ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LATCH]], label [[EXIT:%.*]] ; CHECK: latch: ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 -; CHECK-NEXT: store volatile i32 [[IV]], i32* @A +; CHECK-NEXT: store volatile i32 [[IV]], i32* @A, align 4 ; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp ne i32 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[EXITCOND1]], label [[LOOP]], label [[EXIT]] ; CHECK: exit: @@ -362,7 +362,7 @@ define void @simplify_exit_test(i32 %n) { ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LATCH]], label [[EXIT:%.*]] ; CHECK: latch: ; CHECK-NEXT: [[IV_NEXT]] = 
add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT: store volatile i32 [[IV]], i32* @A
+; CHECK-NEXT: store volatile i32 [[IV]], i32* @A, align 4
; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp ne i32 [[IV_NEXT]], 65
; CHECK-NEXT: br i1 [[EXITCOND1]], label [[LOOP]], label [[EXIT]]
; CHECK: exit:
@@ -396,13 +396,13 @@ define void @simplify_exit_test2(i32 %n) {
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
-; CHECK-NEXT: [[VOL:%.*]] = load volatile i32, i32* @A
+; CHECK-NEXT: [[VOL:%.*]] = load volatile i32, i32* @A, align 4
; CHECK-NEXT: [[EARLYCND:%.*]] = icmp ne i32 [[VOL]], 0
; CHECK-NEXT: br i1 [[EARLYCND]], label [[LATCH]], label [[EXIT:%.*]]
; CHECK: latch:
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[FX:%.*]] = udiv i32 [[IV]], 4
-; CHECK-NEXT: store volatile i32 [[IV]], i32* @A
+; CHECK-NEXT: store volatile i32 [[IV]], i32* @A, align 4
; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[FX]], 1024
; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT]]
; CHECK: exit:
@@ -442,12 +442,12 @@ define void @nested(i32 %n) {
; CHECK-NEXT: br label [[OUTER:%.*]]
; CHECK: outer:
; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV1_NEXT:%.*]], [[OUTER_LATCH:%.*]] ]
-; CHECK-NEXT: store volatile i32 [[IV1]], i32* @A
+; CHECK-NEXT: store volatile i32 [[IV1]], i32* @A, align 4
; CHECK-NEXT: [[IV1_NEXT]] = add nuw nsw i32 [[IV1]], 1
; CHECK-NEXT: br label [[INNER:%.*]]
; CHECK: inner:
; CHECK-NEXT: [[IV2:%.*]] = phi i32 [ 0, [[OUTER]] ], [ [[IV2_NEXT:%.*]], [[INNER_LATCH:%.*]] ]
-; CHECK-NEXT: store volatile i32 [[IV2]], i32* @A
+; CHECK-NEXT: store volatile i32 [[IV2]], i32* @A, align 4
; CHECK-NEXT: [[IV2_NEXT]] = add nuw nsw i32 [[IV2]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[IV2]], 20
; CHECK-NEXT: br i1 [[EXITCOND]], label [[INNER_LATCH]], label [[EXIT_LOOPEXIT:%.*]]
diff --git a/llvm/test/Transforms/IndVarSimplify/monotonic_checks.ll b/llvm/test/Transforms/IndVarSimplify/monotonic_checks.ll
index 988b3923263f6..048254427c5fa 100644
--- a/llvm/test/Transforms/IndVarSimplify/monotonic_checks.ll
+++ b/llvm/test/Transforms/IndVarSimplify/monotonic_checks.ll
@@ -83,8 +83,8 @@ exit:
  ret i32 0
}

-; Monotonic incrementing iv. we should be able to prove that %iv.next <s len
-; basing on its nsw and the fact that its starting value <s len.
+; Monotonic incrementing iv. we should be able to prove that %iv.next >s len
+; basing on its nsw and the fact that its starting value >s len.
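; A sketch of the reasoning above, under the assumptions the comment states:
; the iv's starting value is already >s %len, and every step is a +1 marked
; nsw, so the iv can never wrap back down and inductively stays >s %len:
;   %iv.next = add nsw i32 %iv, 1      ; %iv.next >s %iv, no signed wrap
;   %rc = icmp sgt i32 %iv.next, %len  ; provably true on every iteration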
define i32 @test_02(i32* %p) { ; CHECK-LABEL: @test_02( ; CHECK-NEXT: entry: @@ -164,6 +164,84 @@ exit: ret i32 0 } +define i32 @test_03(i32* %p) { +; CHECK-LABEL: @test_03( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LEN:%.*]] = load i32, i32* [[P:%.*]], align 4, [[RNG2:!range !.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[RC:%.*]] = icmp ugt i32 [[IV_NEXT]], [[LEN]] +; CHECK-NEXT: br i1 [[RC]], label [[BACKEDGE]], label [[FAIL:%.*]] +; CHECK: backedge: +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ne i32 [[IV]], 1000 +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: fail: +; CHECK-NEXT: ret i32 -1 +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %len = load i32, i32* %p, !range !2 + br label %loop + +loop: + %iv = phi i32 [%len, %entry], [%iv.next, %backedge] + %iv.next = add i32 %iv, 1 + %rc = icmp sgt i32 %iv.next, %len + br i1 %rc, label %backedge, label %fail + +backedge: + %loop.cond = icmp ne i32 %iv, 1000 + br i1 %loop.cond, label %loop, label %exit + +fail: + ret i32 -1 + +exit: + ret i32 0 +} + +define i32 @test_04(i32* %p) { +; CHECK-LABEL: @test_04( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LEN:%.*]] = load i32, i32* [[P:%.*]], align 4, [[RNG2]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], -1 +; CHECK-NEXT: [[RC:%.*]] = icmp slt i32 [[IV_NEXT]], [[LEN]] +; CHECK-NEXT: br i1 [[RC]], label [[BACKEDGE]], label [[FAIL:%.*]] +; CHECK: backedge: +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ne i32 [[IV]], 0 +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: fail: +; CHECK-NEXT: ret i32 -1 +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %len = load i32, i32* %p, !range !2 + br label %loop + +loop: + %iv = phi i32 [%len, %entry], [%iv.next, %backedge] + %iv.next = add i32 %iv, -1 + %rc = icmp slt i32 %iv.next, %len + br i1 %rc, label %backedge, label %fail + +backedge: + %loop.cond = icmp ne i32 %iv, 0 + br i1 %loop.cond, label %loop, label %exit + +fail: + ret i32 -1 + +exit: + ret i32 0 +} !0 = !{i32 0, i32 2147483647} !1 = !{i32 -2147483648, i32 0} +!2 = !{i32 0, i32 1000} diff --git a/llvm/test/Transforms/IndVarSimplify/pr18223.ll b/llvm/test/Transforms/IndVarSimplify/pr18223.ll index f922aa424a17e..da620c8062198 100644 --- a/llvm/test/Transforms/IndVarSimplify/pr18223.ll +++ b/llvm/test/Transforms/IndVarSimplify/pr18223.ll @@ -1,12 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -indvars -S < %s | FileCheck %s ; indvars should transform the phi node pair from the for-loop -; CHECK-LABEL: @main( -; CHECK: ret = phi i32 [ 0, %entry ], [ 0, {{.*}} ] @c = common global i32 0, align 4 define i32 @main() #0 { +; CHECK-LABEL: @main( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @c, align 4 +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[TOBOOL]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: br label [[FOR_INC:%.*]] +; CHECK: for.inc: +; CHECK-NEXT: br i1 false, label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; 
CHECK-NEXT: [[RET:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 0, [[EXIT_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[RET]] +; entry: %0 = load i32, i32* @c, align 4 %tobool = icmp eq i32 %0, 0 diff --git a/llvm/test/Transforms/IndVarSimplify/predicated_ranges.ll b/llvm/test/Transforms/IndVarSimplify/predicated_ranges.ll index 7956735922fea..159caf014e3ce 100644 --- a/llvm/test/Transforms/IndVarSimplify/predicated_ranges.ll +++ b/llvm/test/Transforms/IndVarSimplify/predicated_ranges.ll @@ -10,8 +10,8 @@ ; 1 <= iv <= len [3]; ; 4. iv.next = iv - 1 and [3], therefore ; 0 <= iv.next < len. -define void @test_predicated_simple(i32* %p, i32* %arr) { -; CHECK-LABEL: @test_predicated_simple( +define void @test_predicated_simple_unsigned(i32* %p, i32* %arr) { +; CHECK-LABEL: @test_predicated_simple_unsigned( ; CHECK-NEXT: preheader: ; CHECK-NEXT: [[LEN:%.*]] = load i32, i32* [[P:%.*]], align 4, [[RNG0:!range !.*]] ; CHECK-NEXT: br label [[LOOP:%.*]] @@ -60,4 +60,412 @@ fail: unreachable } +define void @test_predicated_simple_signed(i32* %p, i32* %arr) { +; CHECK-LABEL: @test_predicated_simple_signed( +; CHECK-NEXT: preheader: +; CHECK-NEXT: [[LEN:%.*]] = load i32, i32* [[P:%.*]], align 4, [[RNG0]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[LEN]], [[PREHEADER:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[ZERO_COND:%.*]] = icmp eq i32 [[IV]], 0 +; CHECK-NEXT: br i1 [[ZERO_COND]], label [[EXIT:%.*]], label [[RANGE_CHECK_BLOCK:%.*]] +; CHECK: range_check_block: +; CHECK-NEXT: [[IV_NEXT]] = sub i32 [[IV]], 1 +; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp slt i32 [[IV_NEXT]], [[LEN]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[FAIL:%.*]] +; CHECK: backedge: +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, i32* [[P]], i32 [[IV]] +; CHECK-NEXT: [[EL:%.*]] = load i32, i32* [[EL_PTR]], align 4 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i32 [[EL]], 0 +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: fail: +; CHECK-NEXT: unreachable +; +preheader: + %len = load i32, i32* %p, !range !0 + br label %loop + +loop: + %iv = phi i32 [%len, %preheader], [%iv.next, %backedge] + %zero_cond = icmp eq i32 %iv, 0 + br i1 %zero_cond, label %exit, label %range_check_block + +range_check_block: + %iv.next = sub i32 %iv, 1 + %range_check = icmp slt i32 %iv.next, %len + br i1 %range_check, label %backedge, label %fail + +backedge: + %el.ptr = getelementptr i32, i32* %p, i32 %iv + %el = load i32, i32* %el.ptr + %loop.cond = icmp eq i32 %el, 0 + br i1 %loop.cond, label %loop, label %exit + +exit: + ret void + +fail: + unreachable +} + +; Cannot remove checks because the range check fails on the last iteration. 
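; As a concrete instance of the comment above: with %arg = 3 the outer loop
; below runs %i = 0, 1, 2, so %sub3 = %arg - %i - 1 takes the values 2, 1, 0
; and the range check %cmp2 = icmp slt i32 0, %sub3 evaluates to true, true,
; false. It really does fail on the final outer iteration, so it cannot be
; folded to a constant.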
+define void @predicated_outside_loop_signed_neg(i32 %arg) nounwind #0 { +; CHECK-LABEL: @predicated_outside_loop_signed_neg( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[ARG:%.*]], 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 0, [[SUB1]] +; CHECK-NEXT: br i1 [[CMP1]], label [[OUTER_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: outer.preheader: +; CHECK-NEXT: br label [[OUTER:%.*]] +; CHECK: outer: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INC:%.*]], [[OUTER_INC:%.*]] ], [ 0, [[OUTER_PREHEADER]] ] +; CHECK-NEXT: [[SUB2:%.*]] = sub nsw i32 [[ARG]], [[I]] +; CHECK-NEXT: [[SUB3:%.*]] = sub nsw i32 [[SUB2]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 0, [[SUB3]] +; CHECK-NEXT: br i1 [[CMP2]], label [[INNER_PH:%.*]], label [[OUTER_INC]] +; CHECK: inner.ph: +; CHECK-NEXT: br label [[INNER:%.*]] +; CHECK: inner: +; CHECK-NEXT: br i1 false, label [[INNER]], label [[OUTER_INC_LOOPEXIT:%.*]] +; CHECK: outer.inc.loopexit: +; CHECK-NEXT: br label [[OUTER_INC]] +; CHECK: outer.inc: +; CHECK-NEXT: [[I_INC]] = add nuw nsw i32 [[I]], 1 +; CHECK-NEXT: br i1 false, label [[OUTER]], label [[EXIT_LOOPEXIT:%.*]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %sub1 = sub nsw i32 %arg, 1 + %cmp1 = icmp slt i32 0, %sub1 + br i1 %cmp1, label %outer, label %exit + +outer: + %i = phi i32 [ 0, %entry ], [ %i.inc, %outer.inc ] + %sub2 = sub nsw i32 %arg, %i + %sub3 = sub nsw i32 %sub2, 1 + %cmp2 = icmp slt i32 0, %sub3 + br i1 %cmp2, label %inner.ph, label %outer.inc + +inner.ph: + br label %inner + +inner: + %j = phi i32 [ 0, %inner.ph ], [ %j.inc, %inner ] + %j.inc = add nsw i32 %j, 1 + %cmp3 = icmp slt i32 %j.inc, %sub3 + br i1 %cmp3, label %inner, label %outer.inc + +outer.inc: + %i.inc = add nsw i32 %i, 1 + %cmp4 = icmp slt i32 %i.inc, %arg + br i1 %cmp4, label %outer, label %exit + +exit: + ret void +} + +; Range check can be removed. 
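; Unlike the _neg case above, the outer bound below is %sub1 = %arg - 1 rather
; than %arg, so %i stays in [0, %arg - 2] and %sub3 = %arg - %i - 1 stays in
; [1, %arg - 1]. The check %cmp2 = icmp slt i32 0, %sub3 is therefore
; invariantly true (e.g. %arg = 3 gives %sub3 = 2 then 1, never 0), which is
; why it is removable.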
+define void @predicated_outside_loop_signed_pos(i32 %arg) nounwind #0 { +; CHECK-LABEL: @predicated_outside_loop_signed_pos( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[ARG:%.*]], 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 0, [[SUB1]] +; CHECK-NEXT: br i1 [[CMP1]], label [[OUTER_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: outer.preheader: +; CHECK-NEXT: br label [[OUTER:%.*]] +; CHECK: outer: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INC:%.*]], [[OUTER_INC:%.*]] ], [ 0, [[OUTER_PREHEADER]] ] +; CHECK-NEXT: [[SUB2:%.*]] = sub nsw i32 [[ARG]], [[I]] +; CHECK-NEXT: [[SUB3:%.*]] = sub nsw i32 [[SUB2]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 0, [[SUB3]] +; CHECK-NEXT: br i1 [[CMP2]], label [[INNER_PH:%.*]], label [[OUTER_INC]] +; CHECK: inner.ph: +; CHECK-NEXT: br label [[INNER:%.*]] +; CHECK: inner: +; CHECK-NEXT: br i1 false, label [[INNER]], label [[OUTER_INC_LOOPEXIT:%.*]] +; CHECK: outer.inc.loopexit: +; CHECK-NEXT: br label [[OUTER_INC]] +; CHECK: outer.inc: +; CHECK-NEXT: [[I_INC]] = add nuw nsw i32 [[I]], 1 +; CHECK-NEXT: br i1 false, label [[OUTER]], label [[EXIT_LOOPEXIT:%.*]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %sub1 = sub nsw i32 %arg, 1 + %cmp1 = icmp slt i32 0, %sub1 + br i1 %cmp1, label %outer, label %exit + +outer: + %i = phi i32 [ 0, %entry ], [ %i.inc, %outer.inc ] + %sub2 = sub nsw i32 %arg, %i + %sub3 = sub nsw i32 %sub2, 1 + %cmp2 = icmp slt i32 0, %sub3 + br i1 %cmp2, label %inner.ph, label %outer.inc + +inner.ph: + br label %inner + +inner: + %j = phi i32 [ 0, %inner.ph ], [ %j.inc, %inner ] + %j.inc = add nsw i32 %j, 1 + %cmp3 = icmp slt i32 %j.inc, %sub3 + br i1 %cmp3, label %inner, label %outer.inc + +outer.inc: + %i.inc = add nsw i32 %i, 1 + %cmp4 = icmp slt i32 %i.inc, %sub1 + br i1 %cmp4, label %outer, label %exit + +exit: + ret void +} + +define void @predicated_outside_loop_unsigned(i32 %arg) nounwind #0 { +; CHECK-LABEL: @predicated_outside_loop_unsigned( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[ARG:%.*]], 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 0, [[SUB1]] +; CHECK-NEXT: br i1 [[CMP1]], label [[OUTER_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: outer.preheader: +; CHECK-NEXT: br label [[OUTER:%.*]] +; CHECK: outer: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INC:%.*]], [[OUTER_INC:%.*]] ], [ 0, [[OUTER_PREHEADER]] ] +; CHECK-NEXT: [[SUB2:%.*]] = sub nsw i32 [[ARG]], [[I]] +; CHECK-NEXT: [[SUB3:%.*]] = sub nsw i32 [[SUB2]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 0, [[SUB3]] +; CHECK-NEXT: br i1 [[CMP2]], label [[INNER_PH:%.*]], label [[OUTER_INC]] +; CHECK: inner.ph: +; CHECK-NEXT: br label [[INNER:%.*]] +; CHECK: inner: +; CHECK-NEXT: br i1 false, label [[INNER]], label [[OUTER_INC_LOOPEXIT:%.*]] +; CHECK: outer.inc.loopexit: +; CHECK-NEXT: br label [[OUTER_INC]] +; CHECK: outer.inc: +; CHECK-NEXT: [[I_INC]] = add nuw nsw i32 [[I]], 1 +; CHECK-NEXT: br i1 false, label [[OUTER]], label [[EXIT_LOOPEXIT:%.*]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %sub1 = sub nsw i32 %arg, 1 + %cmp1 = icmp slt i32 0, %sub1 + br i1 %cmp1, label %outer, label %exit + +outer: + %i = phi i32 [ 0, %entry ], [ %i.inc, %outer.inc ] + %sub2 = sub nsw i32 %arg, %i + %sub3 = sub nsw i32 %sub2, 1 + %cmp2 = icmp ult i32 0, %sub3 + br i1 %cmp2, label %inner.ph, label %outer.inc + +inner.ph: + br label %inner + +inner: + %j = phi i32 [ 0, %inner.ph ], [ %j.inc, %inner ] + %j.inc = add 
nsw i32 %j, 1 + %cmp3 = icmp slt i32 %j.inc, %sub3 + br i1 %cmp3, label %inner, label %outer.inc + +outer.inc: + %i.inc = add nsw i32 %i, 1 + %cmp4 = icmp slt i32 %i.inc, %arg + br i1 %cmp4, label %outer, label %exit + +exit: + ret void +} + +; Cannot remove checks because the range check fails on the last iteration. +define void @predicated_inside_loop_signed_neg(i32 %arg) nounwind #0 { +; CHECK-LABEL: @predicated_inside_loop_signed_neg( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[OUTER:%.*]] +; CHECK: outer: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_INC:%.*]], [[OUTER_INC:%.*]] ] +; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[ARG:%.*]], 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 0, [[SUB1]] +; CHECK-NEXT: br i1 [[CMP1]], label [[GUARDED:%.*]], label [[EXIT:%.*]] +; CHECK: guarded: +; CHECK-NEXT: [[SUB2:%.*]] = sub nsw i32 [[ARG]], [[I]] +; CHECK-NEXT: [[SUB3:%.*]] = sub nsw i32 [[SUB2]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 0, [[SUB3]] +; CHECK-NEXT: br i1 [[CMP2]], label [[INNER_PH:%.*]], label [[OUTER_INC]] +; CHECK: inner.ph: +; CHECK-NEXT: br label [[INNER:%.*]] +; CHECK: inner: +; CHECK-NEXT: br i1 false, label [[INNER]], label [[OUTER_INC_LOOPEXIT:%.*]] +; CHECK: outer.inc.loopexit: +; CHECK-NEXT: br label [[OUTER_INC]] +; CHECK: outer.inc: +; CHECK-NEXT: [[I_INC]] = add nuw nsw i32 [[I]], 1 +; CHECK-NEXT: [[CMP4:%.*]] = icmp slt i32 [[I_INC]], [[ARG]] +; CHECK-NEXT: br i1 [[CMP4]], label [[OUTER]], label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %outer + +outer: + %i = phi i32 [ 0, %entry ], [ %i.inc, %outer.inc ] + %sub1 = sub nsw i32 %arg, 1 + %cmp1 = icmp slt i32 0, %sub1 + br i1 %cmp1, label %guarded, label %exit + +guarded: + %sub2 = sub nsw i32 %arg, %i + %sub3 = sub nsw i32 %sub2, 1 + %cmp2 = icmp slt i32 0, %sub3 + br i1 %cmp2, label %inner.ph, label %outer.inc + +inner.ph: + br label %inner + +inner: + %j = phi i32 [ 0, %inner.ph ], [ %j.inc, %inner ] + %j.inc = add nsw i32 %j, 1 + %cmp3 = icmp slt i32 %j.inc, %sub3 + br i1 %cmp3, label %inner, label %outer.inc + +outer.inc: + %i.inc = add nsw i32 %i, 1 + %cmp4 = icmp slt i32 %i.inc, %arg + br i1 %cmp4, label %outer, label %exit + +exit: + ret void +} + +; Range check can be trivially removed. 
+define void @predicated_inside_loop_signed_pos(i32 %arg) nounwind #0 { +; CHECK-LABEL: @predicated_inside_loop_signed_pos( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[OUTER:%.*]] +; CHECK: outer: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_INC:%.*]], [[OUTER_INC:%.*]] ] +; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[ARG:%.*]], 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 0, [[SUB1]] +; CHECK-NEXT: br i1 [[CMP1]], label [[GUARDED:%.*]], label [[EXIT:%.*]] +; CHECK: guarded: +; CHECK-NEXT: [[SUB2:%.*]] = sub nsw i32 [[ARG]], [[I]] +; CHECK-NEXT: [[SUB3:%.*]] = sub nsw i32 [[SUB2]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 0, [[SUB3]] +; CHECK-NEXT: br i1 [[CMP2]], label [[INNER_PH:%.*]], label [[OUTER_INC]] +; CHECK: inner.ph: +; CHECK-NEXT: br label [[INNER:%.*]] +; CHECK: inner: +; CHECK-NEXT: br i1 false, label [[INNER]], label [[OUTER_INC_LOOPEXIT:%.*]] +; CHECK: outer.inc.loopexit: +; CHECK-NEXT: br label [[OUTER_INC]] +; CHECK: outer.inc: +; CHECK-NEXT: [[I_INC]] = add nuw nsw i32 [[I]], 1 +; CHECK-NEXT: [[CMP4:%.*]] = icmp slt i32 [[I_INC]], [[SUB1]] +; CHECK-NEXT: br i1 [[CMP4]], label [[OUTER]], label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %outer + +outer: + %i = phi i32 [ 0, %entry ], [ %i.inc, %outer.inc ] + %sub1 = sub nsw i32 %arg, 1 + %cmp1 = icmp slt i32 0, %sub1 + br i1 %cmp1, label %guarded, label %exit + +guarded: + %sub2 = sub nsw i32 %arg, %i + %sub3 = sub nsw i32 %sub2, 1 + %cmp2 = icmp slt i32 0, %sub3 + br i1 %cmp2, label %inner.ph, label %outer.inc + +inner.ph: + br label %inner + +inner: + %j = phi i32 [ 0, %inner.ph ], [ %j.inc, %inner ] + %j.inc = add nsw i32 %j, 1 + %cmp3 = icmp slt i32 %j.inc, %sub3 + br i1 %cmp3, label %inner, label %outer.inc + +outer.inc: + %i.inc = add nsw i32 %i, 1 + %cmp4 = icmp slt i32 %i.inc, %sub1 + br i1 %cmp4, label %outer, label %exit + +exit: + ret void +} + +define void @predicated_inside_loop_unsigned(i32 %arg) nounwind #0 { +; CHECK-LABEL: @predicated_inside_loop_unsigned( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[OUTER:%.*]] +; CHECK: outer: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_INC:%.*]], [[OUTER_INC:%.*]] ] +; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[ARG:%.*]], 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 0, [[SUB1]] +; CHECK-NEXT: br i1 [[CMP1]], label [[GUARDED:%.*]], label [[EXIT:%.*]] +; CHECK: guarded: +; CHECK-NEXT: [[SUB2:%.*]] = sub nsw i32 [[ARG]], [[I]] +; CHECK-NEXT: [[SUB3:%.*]] = sub nsw i32 [[SUB2]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 0, [[SUB3]] +; CHECK-NEXT: br i1 [[CMP2]], label [[INNER_PH:%.*]], label [[OUTER_INC]] +; CHECK: inner.ph: +; CHECK-NEXT: br label [[INNER:%.*]] +; CHECK: inner: +; CHECK-NEXT: br i1 false, label [[INNER]], label [[OUTER_INC_LOOPEXIT:%.*]] +; CHECK: outer.inc.loopexit: +; CHECK-NEXT: br label [[OUTER_INC]] +; CHECK: outer.inc: +; CHECK-NEXT: [[I_INC]] = add nuw nsw i32 [[I]], 1 +; CHECK-NEXT: [[CMP4:%.*]] = icmp slt i32 [[I_INC]], [[ARG]] +; CHECK-NEXT: br i1 [[CMP4]], label [[OUTER]], label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %outer + +outer: + %i = phi i32 [ 0, %entry ], [ %i.inc, %outer.inc ] + %sub1 = sub nsw i32 %arg, 1 + %cmp1 = icmp slt i32 0, %sub1 + br i1 %cmp1, label %guarded, label %exit + +guarded: + %sub2 = sub nsw i32 %arg, %i + %sub3 = sub nsw i32 %sub2, 1 + %cmp2 = icmp ult i32 0, %sub3 + br i1 %cmp2, label %inner.ph, label %outer.inc + +inner.ph: + br label %inner + +inner: + %j = phi i32 [ 0, %inner.ph ], [ %j.inc, %inner ] + %j.inc = 
add nsw i32 %j, 1 + %cmp3 = icmp slt i32 %j.inc, %sub3 + br i1 %cmp3, label %inner, label %outer.inc + +outer.inc: + %i.inc = add nsw i32 %i, 1 + %cmp4 = icmp slt i32 %i.inc, %arg + br i1 %cmp4, label %outer, label %exit + +exit: + ret void +} + !0 = !{i32 0, i32 2147483647} diff --git a/llvm/test/Transforms/IndVarSimplify/trivial-checks.ll b/llvm/test/Transforms/IndVarSimplify/trivial-checks.ll new file mode 100644 index 0000000000000..a6fe59a6b2230 --- /dev/null +++ b/llvm/test/Transforms/IndVarSimplify/trivial-checks.ll @@ -0,0 +1,186 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -indvars -S < %s | FileCheck %s +; RUN: opt -passes=indvars -S < %s | FileCheck %s + +; FIXME: In all cases, x is from [0; 1000) and we cannot prove that x + 1 > x. + +define void @test_sgt(i32 %x) { +; CHECK-LABEL: @test_sgt( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[PRECONDITION:%.*]] = icmp ult i32 [[X:%.*]], 1000 +; CHECK-NEXT: br i1 [[PRECONDITION]], label [[LOOP_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[GUARDED:%.*]] ], [ [[X]], [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[TMP:%.*]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[GUARD:%.*]] = icmp sgt i32 [[TMP]], [[IV]] +; CHECK-NEXT: br i1 [[GUARD]], label [[GUARDED]], label [[FAIL:%.*]] +; CHECK: guarded: +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], -1 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT_LOOPEXIT:%.*]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: fail: +; CHECK-NEXT: unreachable +; +entry: + %precondition = icmp ult i32 %x, 1000 + br i1 %precondition, label %loop, label %exit + +loop: + %iv = phi i32 [%x, %entry], [%iv.next, %guarded] + %tmp = add i32 %iv, 1 + %guard = icmp sgt i32 %tmp, %iv + br i1 %guard, label %guarded, label %fail + +guarded: + %iv.next = add i32 %iv, -1 + %cond = icmp eq i32 %iv, 0 + br i1 %cond, label %loop, label %exit + +exit: + ret void + +fail: + unreachable +} + +define void @test_sge(i32 %x) { +; CHECK-LABEL: @test_sge( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[PRECONDITION:%.*]] = icmp ult i32 [[X:%.*]], 1000 +; CHECK-NEXT: br i1 [[PRECONDITION]], label [[LOOP_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[GUARDED:%.*]] ], [ [[X]], [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[TMP:%.*]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[GUARD:%.*]] = icmp sge i32 [[TMP]], [[IV]] +; CHECK-NEXT: br i1 [[GUARD]], label [[GUARDED]], label [[FAIL:%.*]] +; CHECK: guarded: +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], -1 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT_LOOPEXIT:%.*]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: fail: +; CHECK-NEXT: unreachable +; +entry: + %precondition = icmp ult i32 %x, 1000 + br i1 %precondition, label %loop, label %exit + +loop: + %iv = phi i32 [%x, %entry], [%iv.next, %guarded] + %tmp = add i32 %iv, 1 + %guard = icmp sge i32 %tmp, %iv + br i1 %guard, label %guarded, label %fail + +guarded: + %iv.next = add i32 %iv, -1 + %cond = icmp eq i32 %iv, 0 + br i1 %cond, label %loop, label %exit + +exit: + ret void + +fail: + unreachable +} + +define void @test_ugt(i32 %x) { +; CHECK-LABEL: 
@test_ugt( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[PRECONDITION:%.*]] = icmp ult i32 [[X:%.*]], 1000 +; CHECK-NEXT: br i1 [[PRECONDITION]], label [[LOOP_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[GUARDED:%.*]] ], [ [[X]], [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[TMP:%.*]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[GUARD:%.*]] = icmp ugt i32 [[TMP]], [[IV]] +; CHECK-NEXT: br i1 [[GUARD]], label [[GUARDED]], label [[FAIL:%.*]] +; CHECK: guarded: +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], -1 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT_LOOPEXIT:%.*]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: fail: +; CHECK-NEXT: unreachable +; +entry: + %precondition = icmp ult i32 %x, 1000 + br i1 %precondition, label %loop, label %exit + +loop: + %iv = phi i32 [%x, %entry], [%iv.next, %guarded] + %tmp = add i32 %iv, 1 + %guard = icmp ugt i32 %tmp, %iv + br i1 %guard, label %guarded, label %fail + +guarded: + %iv.next = add i32 %iv, -1 + %cond = icmp eq i32 %iv, 0 + br i1 %cond, label %loop, label %exit + +exit: + ret void + +fail: + unreachable +} + + +define void @test_uge(i32 %x) { +; CHECK-LABEL: @test_uge( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[PRECONDITION:%.*]] = icmp ult i32 [[X:%.*]], 1000 +; CHECK-NEXT: br i1 [[PRECONDITION]], label [[LOOP_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[GUARDED:%.*]] ], [ [[X]], [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[TMP:%.*]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[GUARD:%.*]] = icmp uge i32 [[TMP]], [[IV]] +; CHECK-NEXT: br i1 [[GUARD]], label [[GUARDED]], label [[FAIL:%.*]] +; CHECK: guarded: +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], -1 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT_LOOPEXIT:%.*]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: fail: +; CHECK-NEXT: unreachable +; +entry: + %precondition = icmp ult i32 %x, 1000 + br i1 %precondition, label %loop, label %exit + +loop: + %iv = phi i32 [%x, %entry], [%iv.next, %guarded] + %tmp = add i32 %iv, 1 + %guard = icmp uge i32 %tmp, %iv + br i1 %guard, label %guarded, label %fail + +guarded: + %iv.next = add i32 %iv, -1 + %cond = icmp eq i32 %iv, 0 + br i1 %cond, label %loop, label %exit + +exit: + ret void + +fail: + unreachable +} diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/self-phi.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/self-phi.ll new file mode 100644 index 0000000000000..2f6496ab19944 --- /dev/null +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/self-phi.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -S -infer-address-spaces %s | FileCheck %s + +define amdgpu_kernel void @phi_self(i8 addrspace(1)* %arg) { +; CHECK-LABEL: @phi_self( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[I:%.*]] = phi i8 addrspace(1)* [ [[I]], [[LOOP]] ], [ [[ARG:%.*]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[I1:%.*]] = load i8, i8 addrspace(1)* [[I]], align 1 +; CHECK-NEXT: [[I2:%.*]] = icmp eq i8 [[I1]], 0 +; CHECK-NEXT: br i1 [[I2]], label [[LOOP]], label [[RET:%.*]] 
+; CHECK: ret: +; CHECK-NEXT: ret void +; +entry: + %cast = addrspacecast i8 addrspace(1)* %arg to i8* + br label %loop + +loop: + %i = phi i8* [%i, %loop], [%cast, %entry] + %i1 = load i8, i8* %i, align 1 + %i2 = icmp eq i8 %i1, 0 + br i1 %i2, label %loop, label %ret + +ret: + ret void +} diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/unreachable-code-assert.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/unreachable-code-assert.ll new file mode 100644 index 0000000000000..73001b53634c0 --- /dev/null +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/unreachable-code-assert.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -S -infer-address-spaces %s | FileCheck %s + +define amdgpu_kernel void @subclass_data_assert() { +; CHECK-LABEL: @subclass_data_assert( +; CHECK-NEXT: entry: +; CHECK-NEXT: unreachable +; CHECK: strlen.while11: +; CHECK-NEXT: [[I:%.*]] = getelementptr i8, i8* [[I]], i64 1 +; CHECK-NEXT: [[I1:%.*]] = load i8, i8* [[I]], align 1 +; CHECK-NEXT: [[I2:%.*]] = icmp eq i8 [[I1]], 0 +; CHECK-NEXT: br i1 [[I2]], label [[STRLEN_WHILE_DONE12:%.*]], label [[STRLEN_WHILE11:%.*]] +; CHECK: strlen.while.done12: +; CHECK-NEXT: ret void +; +entry: + unreachable + +strlen.while11: ; preds = %strlen.while11 + %i = getelementptr i8, i8* %i, i64 1 + %i1 = load i8, i8* %i, align 1 + %i2 = icmp eq i8 %i1, 0 + br i1 %i2, label %strlen.while.done12, label %strlen.while11 + +strlen.while.done12: ; preds = %strlen.while11 + ret void +} diff --git a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll index 85c6e35266b71..7f52bf771769b 100644 --- a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll +++ b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll @@ -11,7 +11,7 @@ declare i8* @_Znwm(i64) ; CHECK: declare noalias nonnull i8* @_Znwm(i64) [[G0]] declare i32 @__nvvm_reflect(i8*) -; CHECK-NVPTX: declare i32 @__nvvm_reflect(i8*) [[G0:#[0-9]+]] +; CHECK-NVPTX: declare noundef i32 @__nvvm_reflect(i8* noundef) [[G0:#[0-9]+]] ; CHECK-NVPTX: attributes [[G0]] = { nofree nounwind readnone } @@ -163,7 +163,7 @@ declare float @__sinpif(float) ; CHECK: declare i32 @abs(i32) [[G0]] declare i32 @abs(i32) -; CHECK: declare i32 @access(i8* nocapture readonly, i32) [[G1:#[0-9]+]] +; CHECK: declare noundef i32 @access(i8* nocapture noundef readonly, i32 noundef) [[G1:#[0-9]+]] declare i32 @access(i8*, i32) ; CHECK: declare double @acos(double) [[G0]] @@ -274,16 +274,16 @@ declare float @ceilf(float) ; CHECK: declare x86_fp80 @ceill(x86_fp80) [[G0]] declare x86_fp80 @ceill(x86_fp80) -; CHECK: declare i32 @chmod(i8* nocapture readonly, i16 zeroext) [[G1]] +; CHECK: declare noundef i32 @chmod(i8* nocapture noundef readonly, i16 noundef zeroext) [[G1]] declare i32 @chmod(i8*, i16 zeroext) -; CHECK: declare i32 @chown(i8* nocapture readonly, i32, i32) [[G1]] +; CHECK: declare noundef i32 @chown(i8* nocapture noundef readonly, i32 noundef, i32 noundef) [[G1]] declare i32 @chown(i8*, i32, i32) -; CHECK: declare void @clearerr(%opaque* nocapture) [[G1]] +; CHECK: declare void @clearerr(%opaque* nocapture noundef) [[G1]] declare void @clearerr(%opaque*) -; CHECK: declare i32 @closedir(%opaque* nocapture) [[G1]] +; CHECK: declare noundef i32 @closedir(%opaque* nocapture noundef) [[G1]] declare i32 @closedir(%opaque*) ; CHECK: declare double @copysign(double, double) [[G0]] @@ -313,7 +313,7 @@ declare x86_fp80 @coshl(x86_fp80) ; CHECK: declare x86_fp80 
@cosl(x86_fp80) [[G0]] declare x86_fp80 @cosl(x86_fp80) -; CHECK: declare i8* @ctermid(i8* nocapture) [[G1]] +; CHECK: declare noundef i8* @ctermid(i8* nocapture noundef) [[G1]] declare i8* @ctermid(i8*) ; CHECK: declare double @exp(double) [[G0]] @@ -520,22 +520,22 @@ declare i32 @getchar() ; CHECK: declare noundef i32 @getchar_unlocked() [[G1]] declare i32 @getchar_unlocked() -; CHECK: declare i8* @getenv(i8* nocapture) [[G2]] +; CHECK: declare noundef i8* @getenv(i8* nocapture noundef) [[G2]] declare i8* @getenv(i8*) -; CHECK: declare i32 @getitimer(i32, %opaque* nocapture) [[G1]] +; CHECK: declare noundef i32 @getitimer(i32 noundef, %opaque* nocapture noundef) [[G1]] declare i32 @getitimer(i32, %opaque*) -; CHECK: declare i32 @getlogin_r(i8* nocapture, i64) [[G1]] +; CHECK: declare noundef i32 @getlogin_r(i8* nocapture noundef, i64 noundef) [[G1]] declare i32 @getlogin_r(i8*, i64) -; CHECK: declare %opaque* @getpwnam(i8* nocapture readonly) [[G1]] +; CHECK: declare noundef %opaque* @getpwnam(i8* nocapture noundef readonly) [[G1]] declare %opaque* @getpwnam(i8*) ; CHECK: declare noundef i8* @gets(i8* noundef) [[G1]] declare i8* @gets(i8*) -; CHECK: declare i32 @gettimeofday(%opaque* nocapture, i8* nocapture) [[G1]] +; CHECK: declare noundef i32 @gettimeofday(%opaque* nocapture noundef, i8* nocapture noundef) [[G1]] declare i32 @gettimeofday(%opaque*, i8*) ; CHECK: declare i32 @isascii(i32) [[G0]] @@ -547,7 +547,7 @@ declare i32 @isdigit(i32) ; CHECK: declare i64 @labs(i64) [[G0]] declare i64 @labs(i64) -; CHECK: declare i32 @lchown(i8* nocapture readonly, i32, i32) [[G1]] +; CHECK: declare noundef i32 @lchown(i8* nocapture noundef readonly, i32 noundef, i32 noundef) [[G1]] declare i32 @lchown(i8*, i32, i32) ; CHECK: declare double @ldexp(double, i32) [[G0]] @@ -607,10 +607,10 @@ declare float @logf(float) ; CHECK: declare x86_fp80 @logl(x86_fp80) [[G0]] declare x86_fp80 @logl(x86_fp80) -; CHECK: declare i32 @lstat(i8* nocapture readonly, %opaque* nocapture) [[G1]] +; CHECK: declare noundef i32 @lstat(i8* nocapture noundef readonly, %opaque* nocapture noundef) [[G1]] declare i32 @lstat(i8*, %opaque*) -; CHECK-LINUX: declare i32 @lstat64(i8* nocapture readonly, %opaque* nocapture) [[G1]] +; CHECK-LINUX: declare noundef i32 @lstat64(i8* nocapture noundef readonly, %opaque* nocapture noundef) [[G1]] declare i32 @lstat64(i8*, %opaque*) ; CHECK: declare noalias i8* @malloc(i64) [[G1]] @@ -642,10 +642,10 @@ declare i8* @memmove(i8*, i8*, i64) ; CHECK: declare i8* @memset(i8*, i32, i64) [[G0]] declare i8* @memset(i8*, i32, i64) -; CHECK: declare i32 @mkdir(i8* nocapture readonly, i16 zeroext) [[G1]] +; CHECK: declare noundef i32 @mkdir(i8* nocapture noundef readonly, i16 noundef zeroext) [[G1]] declare i32 @mkdir(i8*, i16 zeroext) -; CHECK: declare i64 @mktime(%opaque* nocapture) [[G1]] +; CHECK: declare noundef i64 @mktime(%opaque* nocapture noundef) [[G1]] declare i64 @mktime(%opaque*) ; CHECK: declare double @modf(double, double* nocapture) [[G1]] @@ -672,16 +672,16 @@ declare i32 @open(i8*, i32, ...) ; CHECK-LINUX: declare noundef i32 @open64(i8* nocapture noundef readonly, i32 noundef, ...) [[G0]] declare i32 @open64(i8*, i32, ...) 
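; The pattern repeated throughout this file: attribute inference now adds
; noundef to library-call results and to arguments that must carry a
; well-defined value. As a purely hypothetical sketch (not a declaration from
; this test), a recognized prototype such as
;   declare i32 @frobnicate(i8*)
; would come out annotated along the lines of
;   declare noundef i32 @frobnicate(i8* nocapture noundef readonly)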
-; CHECK: declare noalias %opaque* @opendir(i8* nocapture readonly) [[G1]] +; CHECK: declare noalias noundef %opaque* @opendir(i8* nocapture noundef readonly) [[G1]] declare %opaque* @opendir(i8*) -; CHECK: declare i32 @pclose(%opaque* nocapture) [[G1]] +; CHECK: declare noundef i32 @pclose(%opaque* nocapture noundef) [[G1]] declare i32 @pclose(%opaque*) ; CHECK: declare void @perror(i8* nocapture noundef readonly) [[G1]] declare void @perror(i8*) -; CHECK: declare noalias %opaque* @popen(i8* nocapture readonly, i8* nocapture readonly) [[G1]] +; CHECK: declare noalias noundef %opaque* @popen(i8* nocapture noundef readonly, i8* nocapture noundef readonly) [[G1]] declare %opaque* @popen(i8*, i8*) ; CHECK: declare i32 @posix_memalign(i8**, i64, i64) [[G0]] @@ -717,13 +717,13 @@ declare i32 @puts(i8*) ; CHECK: declare noundef i64 @pwrite(i32 noundef, i8* nocapture noundef readonly, i64 noundef, i64 noundef) [[G0]] declare i64 @pwrite(i32, i8*, i64, i64) -; CHECK: declare void @qsort(i8*, i64, i64, i32 (i8*, i8*)* nocapture) [[G0]] +; CHECK: declare void @qsort(i8* noundef, i64 noundef, i64 noundef, i32 (i8*, i8*)* nocapture noundef) [[G0]] declare void @qsort(i8*, i64, i64, i32 (i8*, i8*)*) ; CHECK: declare noundef i64 @read(i32 noundef, i8* nocapture noundef, i64 noundef) [[G0]] declare i64 @read(i32, i8*, i64) -; CHECK: declare i64 @readlink(i8* nocapture readonly, i8* nocapture, i64) [[G1]] +; CHECK: declare noundef i64 @readlink(i8* nocapture noundef readonly, i8* nocapture noundef, i64 noundef) [[G1]] declare i64 @readlink(i8*, i8*, i64) ; CHECK: declare noalias i8* @realloc(i8* nocapture, i64) [[G3]] @@ -732,13 +732,13 @@ declare i8* @realloc(i8*, i64) ; CHECK: declare i8* @reallocf(i8*, i64) declare i8* @reallocf(i8*, i64) -; CHECK: declare i8* @realpath(i8* nocapture readonly, i8*) [[G1]] +; CHECK: declare noundef i8* @realpath(i8* nocapture noundef readonly, i8* noundef) [[G1]] declare i8* @realpath(i8*, i8*) -; CHECK: declare i32 @remove(i8* nocapture readonly) [[G1]] +; CHECK: declare noundef i32 @remove(i8* nocapture noundef readonly) [[G1]] declare i32 @remove(i8*) -; CHECK: declare i32 @rename(i8* nocapture readonly, i8* nocapture readonly) [[G1]] +; CHECK: declare noundef i32 @rename(i8* nocapture noundef readonly, i8* nocapture noundef readonly) [[G1]] declare i32 @rename(i8*, i8*) ; CHECK: declare void @rewind(%opaque* nocapture noundef) [[G1]] @@ -753,7 +753,7 @@ declare float @rintf(float) ; CHECK: declare x86_fp80 @rintl(x86_fp80) [[G0]] declare x86_fp80 @rintl(x86_fp80) -; CHECK: declare i32 @rmdir(i8* nocapture readonly) [[G1]] +; CHECK: declare noundef i32 @rmdir(i8* nocapture noundef readonly) [[G1]] declare i32 @rmdir(i8*) ; CHECK: declare double @round(double) [[G0]] @@ -768,13 +768,13 @@ declare x86_fp80 @roundl(x86_fp80) ; CHECK: declare noundef i32 @scanf(i8* nocapture noundef readonly, ...) [[G1]] declare i32 @scanf(i8*, ...) 
-; CHECK: declare void @setbuf(%opaque* nocapture, i8*) [[G1]] +; CHECK: declare void @setbuf(%opaque* nocapture noundef, i8* noundef) [[G1]] declare void @setbuf(%opaque*, i8*) -; CHECK: declare i32 @setitimer(i32, %opaque* nocapture readonly, %opaque* nocapture) [[G1]] +; CHECK: declare noundef i32 @setitimer(i32 noundef, %opaque* nocapture noundef readonly, %opaque* nocapture noundef) [[G1]] declare i32 @setitimer(i32, %opaque*, %opaque*) -; CHECK: declare i32 @setvbuf(%opaque* nocapture, i8*, i32, i64) [[G1]] +; CHECK: declare noundef i32 @setvbuf(%opaque* nocapture noundef, i8* noundef, i32 noundef, i64 noundef) [[G1]] declare i32 @setvbuf(%opaque*, i8*, i32, i64) ; CHECK: declare double @sin(double) [[G0]] @@ -813,16 +813,16 @@ declare x86_fp80 @sqrtl(x86_fp80) ; CHECK: declare noundef i32 @sscanf(i8* nocapture noundef readonly, i8* nocapture noundef readonly, ...) [[G1]] declare i32 @sscanf(i8*, i8*, ...) -; CHECK: declare i32 @stat(i8* nocapture readonly, %opaque* nocapture) [[G1]] +; CHECK: declare noundef i32 @stat(i8* nocapture noundef readonly, %opaque* nocapture noundef) [[G1]] declare i32 @stat(i8*, %opaque*) -; CHECK-LINUX: declare i32 @stat64(i8* nocapture readonly, %opaque* nocapture) [[G1]] +; CHECK-LINUX: declare noundef i32 @stat64(i8* nocapture noundef readonly, %opaque* nocapture noundef) [[G1]] declare i32 @stat64(i8*, %opaque*) -; CHECK: declare i32 @statvfs(i8* nocapture readonly, %opaque* nocapture) [[G1]] +; CHECK: declare noundef i32 @statvfs(i8* nocapture noundef readonly, %opaque* nocapture noundef) [[G1]] declare i32 @statvfs(i8*, %opaque*) -; CHECK-LINUX: declare i32 @statvfs64(i8* nocapture readonly, %opaque* nocapture) [[G1]] +; CHECK-LINUX: declare noundef i32 @statvfs64(i8* nocapture noundef readonly, %opaque* nocapture noundef) [[G1]] declare i32 @statvfs64(i8*, %opaque*) ; CHECK: declare i8* @stpcpy(i8*, i8* nocapture readonly) [[G1]] @@ -918,7 +918,7 @@ declare i64 @strtoull(i8*, i8**, i32) ; CHECK: declare i64 @strxfrm(i8* nocapture, i8* nocapture readonly, i64) [[G1]] declare i64 @strxfrm(i8*, i8*, i64) -; CHECK: declare i32 @system(i8* nocapture readonly) [[G0]] +; CHECK: declare noundef i32 @system(i8* nocapture noundef readonly) [[G0]] declare i32 @system(i8*) ; CHECK: declare double @tan(double) [[G0]] @@ -939,13 +939,13 @@ declare x86_fp80 @tanhl(x86_fp80) ; CHECK: declare x86_fp80 @tanl(x86_fp80) [[G0]] declare x86_fp80 @tanl(x86_fp80) -; CHECK: declare i64 @times(%opaque* nocapture) [[G1]] +; CHECK: declare noundef i64 @times(%opaque* nocapture noundef) [[G1]] declare i64 @times(%opaque*) -; CHECK: declare noalias %opaque* @tmpfile() [[G1]] +; CHECK: declare noalias noundef %opaque* @tmpfile() [[G1]] declare %opaque* @tmpfile() -; CHECK-LINUX: declare noalias %opaque* @tmpfile64() [[G1]] +; CHECK-LINUX: declare noalias noundef %opaque* @tmpfile64() [[G1]] declare %opaque* @tmpfile64() ; CHECK: declare i32 @toascii(i32) [[G0]] @@ -960,22 +960,22 @@ declare float @truncf(float) ; CHECK: declare x86_fp80 @truncl(x86_fp80) [[G0]] declare x86_fp80 @truncl(x86_fp80) -; CHECK: declare i32 @uname(%opaque* nocapture) [[G1]] +; CHECK: declare noundef i32 @uname(%opaque* nocapture noundef) [[G1]] declare i32 @uname(%opaque*) ; CHECK: declare noundef i32 @ungetc(i32 noundef, %opaque* nocapture noundef) [[G1]] declare i32 @ungetc(i32, %opaque*) -; CHECK: declare i32 @unlink(i8* nocapture readonly) [[G1]] +; CHECK: declare noundef i32 @unlink(i8* nocapture noundef readonly) [[G1]] declare i32 @unlink(i8*) -; CHECK: declare i32 @unsetenv(i8* nocapture 
readonly) [[G1]]
+; CHECK: declare noundef i32 @unsetenv(i8* nocapture noundef readonly) [[G1]]
declare i32 @unsetenv(i8*)
-; CHECK: declare i32 @utime(i8* nocapture readonly, %opaque* nocapture readonly) [[G1]]
+; CHECK: declare noundef i32 @utime(i8* nocapture noundef readonly, %opaque* nocapture noundef readonly) [[G1]]
declare i32 @utime(i8*, %opaque*)
-; CHECK: declare i32 @utimes(i8* nocapture readonly, %opaque* nocapture readonly) [[G1]]
+; CHECK: declare noundef i32 @utimes(i8* nocapture noundef readonly, %opaque* nocapture noundef readonly) [[G1]]
declare i32 @utimes(i8*, %opaque*)
; CHECK: declare noalias i8* @valloc(i64) [[G1]]
diff --git a/llvm/test/Transforms/Inline/AArch64/sve-alloca-merge.ll b/llvm/test/Transforms/Inline/AArch64/sve-alloca-merge.ll
new file mode 100644
index 0000000000000..c355388ed836f
--- /dev/null
+++ b/llvm/test/Transforms/Inline/AArch64/sve-alloca-merge.ll
@@ -0,0 +1,29 @@
+; RUN: opt -mtriple=aarch64--linux-gnu -mattr=+sve < %s -inline -S | FileCheck %s
+
+define void @bar(<vscale x 2 x i64>* %a) {
+entry:
+  %b = alloca <vscale x 2 x i64>, align 16
+  store <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64>* %b, align 16
+  %c = load <vscale x 2 x i64>, <vscale x 2 x i64>* %a, align 16
+  %d = load <vscale x 2 x i64>, <vscale x 2 x i64>* %b, align 16
+  %e = add <vscale x 2 x i64> %c, %d
+  %f = add <vscale x 2 x i64> %e, %c
+  store <vscale x 2 x i64> %f, <vscale x 2 x i64>* %a, align 16
+  ret void
+}
+
+define i64 @foo() {
+; CHECK-LABEL: @foo(
+; CHECK: %0 = bitcast <vscale x 2 x i64>* %{{.*}} to i8*
+; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* %0)
+; CHECK: %1 = bitcast <vscale x 2 x i64>* %{{.*}} to i8*
+; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* %1)
+entry:
+  %a = alloca <vscale x 2 x i64>, align 16
+  store <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64>* %a, align 16
+  %a1 = bitcast <vscale x 2 x i64>* %a to i64*
+  store i64 1, i64* %a1, align 8
+  call void @bar(<vscale x 2 x i64>* %a)
+  %el = load i64, i64* %a1
+  ret i64 %el
+}
diff --git a/llvm/test/Transforms/Inline/align.ll b/llvm/test/Transforms/Inline/align.ll
index ede6c3fa7bcf4..f3a5184564850 100644
--- a/llvm/test/Transforms/Inline/align.ll
+++ b/llvm/test/Transforms/Inline/align.ll
@@ -23,10 +23,7 @@ define void @foo(float* nocapture %a, float* nocapture readonly %c) #0 {
; CHECK-LABEL: define {{[^@]+}}@foo
; CHECK-SAME: (float* nocapture [[A:%.*]], float* nocapture readonly [[C:%.*]]) #0
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint float* [[A]] to i64
-; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 127
-; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-; CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(float* [[A]], i64 128) ]
; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[C]], align 4
; CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds float, float* [[A]], i64 5
; CHECK-NEXT: store float [[TMP0]], float* [[ARRAYIDX_I]], align 4
@@ -87,14 +84,8 @@ define void @foo2(float* nocapture %a, float* nocapture %b, float* nocapture rea
; CHECK-LABEL: define {{[^@]+}}@foo2
; CHECK-SAME: (float* nocapture [[A:%.*]], float* nocapture [[B:%.*]], float* nocapture readonly [[C:%.*]]) #0
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint float* [[A]] to i64
-; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 127
-; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-; CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT: [[PTRINT1:%.*]] = ptrtoint float* [[B]] to i64
-; CHECK-NEXT: [[MASKEDPTR2:%.*]] = and i64 [[PTRINT1]], 127
-; CHECK-NEXT: [[MASKCOND3:%.*]] = icmp eq i64 [[MASKEDPTR2]], 0
-; CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND3]])
+; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(float* [[A]], i64 128) ]
+; CHECK-NEXT: call void
@llvm.assume(i1 true) [ "align"(float* [[B]], i64 128) ] ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[C]], align 4 ; CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds float, float* [[A]], i64 5 ; CHECK-NEXT: store float [[TMP0]], float* [[ARRAYIDX_I]], align 4 diff --git a/llvm/test/Transforms/Inline/byref-align.ll b/llvm/test/Transforms/Inline/byref-align.ll index fb70db2af449d..4a94bd8bfe13a 100644 --- a/llvm/test/Transforms/Inline/byref-align.ll +++ b/llvm/test/Transforms/Inline/byref-align.ll @@ -8,7 +8,7 @@ target triple = "x86_64-unknown-linux-gnu" ; should be inserted. define void @byref_callee(float* align(128) byref(float) nocapture %a, float* %b) #0 { ; CHECK-LABEL: define {{[^@]+}}@byref_callee -; CHECK-SAME: (float* nocapture byref(float) align 128 [[A:%.*]], float* [[B:%.*]]) #0 +; CHECK-SAME: (float* nocapture byref(float) align 128 [[A:%.*]], float* [[B:%.*]]) [[ATTR0:#.*]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LOAD:%.*]] = load float, float* [[A]], align 4 ; CHECK-NEXT: [[B_IDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 8 @@ -26,12 +26,9 @@ entry: define void @byref_caller(float* nocapture align 64 %a, float* %b) #0 { ; CHECK-LABEL: define {{[^@]+}}@byref_caller -; CHECK-SAME: (float* nocapture align 64 [[A:%.*]], float* [[B:%.*]]) #0 +; CHECK-SAME: (float* nocapture align 64 [[A:%.*]], float* [[B:%.*]]) [[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint float* [[A]] to i64 -; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 127 -; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 -; CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(float* [[A]], i64 128) ] ; CHECK-NEXT: [[LOAD_I:%.*]] = load float, float* [[A]], align 4 ; CHECK-NEXT: [[B_IDX_I:%.*]] = getelementptr inbounds float, float* [[B]], i64 8 ; CHECK-NEXT: [[ADD_I:%.*]] = fadd float [[LOAD_I]], 2.000000e+00 diff --git a/llvm/test/Transforms/InstCombine/2010-03-03-ExtElim.ll b/llvm/test/Transforms/InstCombine/2010-03-03-ExtElim.ll index ad0fe5a21783d..da9d0469e5e2c 100644 --- a/llvm/test/Transforms/InstCombine/2010-03-03-ExtElim.ll +++ b/llvm/test/Transforms/InstCombine/2010-03-03-ExtElim.ll @@ -16,8 +16,8 @@ define i1 @PR6486() nounwind { ; CHECK: ret i1 true } -@d = common global i32 0, align 4 -@a = common global [1 x i32] zeroinitializer, align 4 +@d = global i32 0, align 4 +@a = global [1 x i32] zeroinitializer, align 4 define i1 @PR16462_1() nounwind { ; CHECK-LABEL: @PR16462_1( diff --git a/llvm/test/Transforms/InstCombine/AArch64/VectorUtils_heuristics.ll b/llvm/test/Transforms/InstCombine/AArch64/VectorUtils_heuristics.ll new file mode 100644 index 0000000000000..b3a166d10b696 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/VectorUtils_heuristics.ll @@ -0,0 +1,21 @@ +; RUN: opt -S -instcombine < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +; This test checks that instcombine does not crash while invoking +; maskIsAllOneOrUndef, maskIsAllZeroOrUndef, or possiblyDemandedEltsInMask. 
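; (Illustrative aside, not part of the patch.) A scalable "all-true" mask has no
; compile-time lane count, so these helpers cannot simply enumerate mask elements
; the way they can for a fixed <16 x i1> constant. The splat idiom they must
; recognize or conservatively reject is the one the test below builds:
;   shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> undef, i1 true, i32 0), <vscale x 16 x i1> undef, <vscale x 16 x i32> zeroinitializer)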
+ +; CHECK-LABEL: novel_algorithm +; CHECK: unreachable +define void @novel_algorithm() { +entry: + %a = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* undef, i32 1, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> undef, i1 true, i32 0), <vscale x 16 x i1> undef, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i8> undef) + %b = add <vscale x 16 x i8> undef, %a + call void @llvm.masked.store.nxv16i8.p0nxv16i8(<vscale x 16 x i8> %b, <vscale x 16 x i8>* undef, i32 1, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> undef, i1 true, i32 0), <vscale x 16 x i1> undef, <vscale x 16 x i32> zeroinitializer)) + unreachable +} + +declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>*, i32 immarg, <vscale x 16 x i1>, <vscale x 16 x i8>) + +declare void @llvm.masked.store.nxv16i8.p0nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>*, i32 immarg, <vscale x 16 x i1>) diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll index 1969056311f8c..f8e7789d5f021 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll @@ -2161,10 +2161,9 @@ define amdgpu_ps half @extract_elt3_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc ret half %elt1 } -; FIXME: Enable load shortening when full support for v3f16 has been added (should expect call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16). ; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f16( -; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <4 x half> %data, i32 2 +; CHECK-NEXT: %data = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) +; CHECK-NEXT: %elt1 = extractelement <3 x half> %data, i32 2 ; CHECK-NEXT: ret half %elt1 define amdgpu_ps half @extract_elt2_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) @@ -2992,10 +2991,9 @@ define amdgpu_ps half @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(float %d ret half %elt0 } -; FIXME: Enable load shortening when full support for v3f16 has been added (should expect call <3 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v3f16.f32.f32). 
; CHECK-LABEL: @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32( -; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> +; CHECK-NEXT: %data = call <3 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v3f16.f32.f32(i32 7, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) +; CHECK-NEXT: %res = shufflevector <3 x half> %data, <3 x half> undef, <4 x i32> ; CHECK-NEXT: ret <4 x half> %res define amdgpu_ps <4 x half> @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) diff --git a/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll b/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll index d845dcb5cac4d..ff4c05164d000 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll @@ -12,7 +12,21 @@ define <4 x float> @mload(i8* %f, <4 x i32> %mask) { ; %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> %mask) ret <4 x float> %ld +} + +; If the mask comes from a comparison, convert to an LLVM intrinsic. The backend should optimize further. +define <4 x float> @mload_v4f32_cmp(i8* %f, <4 x i32> %src) { +; CHECK-LABEL: @mload_v4f32_cmp( +; CHECK-NEXT: [[ICMP:%.*]] = icmp ne <4 x i32> [[SRC:%.*]], zeroinitializer +; CHECK-NEXT: [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[CASTVEC]], i32 1, <4 x i1> [[ICMP]], <4 x float> zeroinitializer) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %icmp = icmp ne <4 x i32> %src, zeroinitializer + %mask = sext <4 x i1> %icmp to <4 x i32> + %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> %mask) + ret <4 x float> %ld } ; Zero mask returns a zero vector. @@ -23,7 +37,6 @@ define <4 x float> @mload_zeros(i8* %f) { ; %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> zeroinitializer) ret <4 x float> %ld - } ; Only the sign bit matters. @@ -34,7 +47,6 @@ define <4 x float> @mload_fake_ones(i8* %f) { ; %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> ) ret <4 x float> %ld - } ; All mask bits are set, so this is just a vector load. @@ -47,7 +59,6 @@ define <4 x float> @mload_real_ones(i8* %f) { ; %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> ) ret <4 x float> %ld - } ; It's a constant mask, so convert to an LLVM intrinsic. The backend should optimize further. @@ -60,7 +71,6 @@ define <4 x float> @mload_one_one(i8* %f) { ; %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> ) ret <4 x float> %ld - } ; Try doubles. @@ -73,7 +83,6 @@ define <2 x double> @mload_one_one_double(i8* %f) { ; %ld = tail call <2 x double> @llvm.x86.avx.maskload.pd(i8* %f, <2 x i64> ) ret <2 x double> %ld - } ; Try 256-bit FP ops. 
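(Illustrative sketch, not part of the patch.) The new *_cmp load tests above pin down the following rewrite: when the AVX mask operand is a sign-extended vector compare, instcombine drops the sext and feeds the <4 x i1> condition straight into the generic masked-load intrinsic. The function name @mload_cmp_sketch is hypothetical:

declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>)

define <4 x float> @mload_cmp_sketch(i8* %f, <4 x i32> %src) {
  ; Before: the target intrinsic consumes a sign-extended compare as its mask.
  %icmp = icmp ne <4 x i32> %src, zeroinitializer
  %mask = sext <4 x i1> %icmp to <4 x i32>
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> %mask)
  ; After instcombine (per the mload_v4f32_cmp checks above), roughly:
  ;   %castvec = bitcast i8* %f to <4 x float>*
  ;   %ld = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %castvec, i32 1, <4 x i1> %icmp, <4 x float> zeroinitializer)
  ret <4 x float> %ld
}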
@@ -86,7 +95,24 @@ define <8 x float> @mload_v8f32(i8* %f) { ; %ld = tail call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %f, <8 x i32> ) ret <8 x float> %ld +} +define <8 x float> @mload_v8f32_cmp(i8* %f, <8 x float> %src0, <8 x float> %src1) { +; CHECK-LABEL: @mload_v8f32_cmp( +; CHECK-NEXT: [[ICMP0:%.*]] = fcmp one <8 x float> [[SRC0:%.*]], zeroinitializer +; CHECK-NEXT: [[ICMP1:%.*]] = fcmp one <8 x float> [[SRC1:%.*]], zeroinitializer +; CHECK-NEXT: [[MASK1:%.*]] = and <8 x i1> [[ICMP0]], [[ICMP1]] +; CHECK-NEXT: [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <8 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[CASTVEC]], i32 1, <8 x i1> [[MASK1]], <8 x float> zeroinitializer) +; CHECK-NEXT: ret <8 x float> [[TMP1]] +; + %icmp0 = fcmp one <8 x float> %src0, zeroinitializer + %icmp1 = fcmp one <8 x float> %src1, zeroinitializer + %ext0 = sext <8 x i1> %icmp0 to <8 x i32> + %ext1 = sext <8 x i1> %icmp1 to <8 x i32> + %mask = and <8 x i32> %ext0, %ext1 + %ld = tail call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %f, <8 x i32> %mask) + ret <8 x float> %ld } define <4 x double> @mload_v4f64(i8* %f) { @@ -97,7 +123,6 @@ define <4 x double> @mload_v4f64(i8* %f) { ; %ld = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %f, <4 x i64> ) ret <4 x double> %ld - } ; Try the AVX2 variants. @@ -110,7 +135,6 @@ define <4 x i32> @mload_v4i32(i8* %f) { ; %ld = tail call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %f, <4 x i32> ) ret <4 x i32> %ld - } define <2 x i64> @mload_v2i64(i8* %f) { @@ -121,7 +145,6 @@ define <2 x i64> @mload_v2i64(i8* %f) { ; %ld = tail call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %f, <2 x i64> ) ret <2 x i64> %ld - } define <8 x i32> @mload_v8i32(i8* %f) { @@ -132,7 +155,6 @@ define <8 x i32> @mload_v8i32(i8* %f) { ; %ld = tail call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %f, <8 x i32> ) ret <8 x i32> %ld - } define <4 x i64> @mload_v4i64(i8* %f) { @@ -143,9 +165,20 @@ define <4 x i64> @mload_v4i64(i8* %f) { ; %ld = tail call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %f, <4 x i64> ) ret <4 x i64> %ld - } +define <4 x i64> @mload_v4i64_cmp(i8* %f, <4 x i64> %src) { +; CHECK-LABEL: @mload_v4i64_cmp( +; CHECK-NEXT: [[SRC_LOBIT:%.*]] = ashr <4 x i64> [[SRC:%.*]], +; CHECK-NEXT: [[SRC_LOBIT_NOT:%.*]] = xor <4 x i64> [[SRC_LOBIT]], +; CHECK-NEXT: [[LD:%.*]] = tail call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* [[F:%.*]], <4 x i64> [[SRC_LOBIT_NOT]]) +; CHECK-NEXT: ret <4 x i64> [[LD]] +; + %icmp = icmp sge <4 x i64> %src, zeroinitializer + %mask = sext <4 x i1> %icmp to <4 x i64> + %ld = tail call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %f, <4 x i64> %mask) + ret <4 x i64> %ld +} ;; MASKED STORES @@ -158,7 +191,21 @@ define void @mstore(i8* %f, <4 x i32> %mask, <4 x float> %v) { ; tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> %mask, <4 x float> %v) ret void +} +; If the mask comes from a comparison, convert to an LLVM intrinsic. The backend should optimize further. 
+ +define void @mstore_v4f32_cmp(i8* %f, <4 x i32> %src, <4 x float> %v) { +; CHECK-LABEL: @mstore_v4f32_cmp( +; CHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[SRC:%.*]], zeroinitializer +; CHECK-NEXT: [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x float>* +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> [[V:%.*]], <4 x float>* [[CASTVEC]], i32 1, <4 x i1> [[ICMP]]) +; CHECK-NEXT: ret void +; + %icmp = icmp eq <4 x i32> %src, zeroinitializer + %mask = sext <4 x i1> %icmp to <4 x i32> + tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> %mask, <4 x float> %v) + ret void } ; Zero mask is a nop. @@ -169,7 +216,6 @@ define void @mstore_zeros(i8* %f, <4 x float> %v) { ; tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> zeroinitializer, <4 x float> %v) ret void - } ; Only the sign bit matters. @@ -180,7 +226,6 @@ define void @mstore_fake_ones(i8* %f, <4 x float> %v) { ; tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> , <4 x float> %v) ret void - } ; All mask bits are set, so this is just a vector store. @@ -193,7 +238,6 @@ define void @mstore_real_ones(i8* %f, <4 x float> %v) { ; tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> , <4 x float> %v) ret void - } ; It's a constant mask, so convert to an LLVM intrinsic. The backend should optimize further. @@ -206,7 +250,6 @@ define void @mstore_one_one(i8* %f, <4 x float> %v) { ; tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> , <4 x float> %v) ret void - } ; Try doubles. @@ -219,7 +262,6 @@ define void @mstore_one_one_double(i8* %f, <2 x double> %v) { ; tail call void @llvm.x86.avx.maskstore.pd(i8* %f, <2 x i64> , <2 x double> %v) ret void - } ; Try 256-bit FP ops. @@ -232,7 +274,6 @@ define void @mstore_v8f32(i8* %f, <8 x float> %v) { ; tail call void @llvm.x86.avx.maskstore.ps.256(i8* %f, <8 x i32> , <8 x float> %v) ret void - } define void @mstore_v4f64(i8* %f, <4 x double> %v) { @@ -243,7 +284,20 @@ define void @mstore_v4f64(i8* %f, <4 x double> %v) { ; tail call void @llvm.x86.avx.maskstore.pd.256(i8* %f, <4 x i64> , <4 x double> %v) ret void +} +define void @mstore_v4f64_cmp(i8* %f, <4 x i32> %src, <4 x double> %v) { +; CHECK-LABEL: @mstore_v4f64_cmp( +; CHECK-NEXT: [[SRC_LOBIT:%.*]] = ashr <4 x i32> [[SRC:%.*]], +; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i32> [[SRC_LOBIT]], +; CHECK-NEXT: [[DOTNOT:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64> +; CHECK-NEXT: tail call void @llvm.x86.avx.maskstore.pd.256(i8* [[F:%.*]], <4 x i64> [[DOTNOT]], <4 x double> [[V:%.*]]) +; CHECK-NEXT: ret void +; + %icmp = icmp sge <4 x i32> %src, zeroinitializer + %mask = sext <4 x i1> %icmp to <4 x i64> + tail call void @llvm.x86.avx.maskstore.pd.256(i8* %f, <4 x i64> %mask, <4 x double> %v) + ret void } ; Try the AVX2 variants. 
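(Illustrative sketch, not part of the patch.) The store-side tests make the same point, with one twist shown by the mstore_v4f64_cmp checks above: a sign-test mask (icmp sge) is not converted to the generic intrinsic; instead the sge+sext pair folds into an ashr that broadcasts the sign bit plus an xor that inverts it, and the target intrinsic stays. The function name @mstore_cmp_sketch is hypothetical:

declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>)

define void @mstore_cmp_sketch(i8* %f, <4 x i32> %src, <4 x double> %v) {
  ; %icmp is true where %src is non-negative, i.e. where the sign bit is clear.
  %icmp = icmp sge <4 x i32> %src, zeroinitializer
  %mask = sext <4 x i1> %icmp to <4 x i64>
  ; Only the sign bit of each mask element matters to the AVX intrinsic, so
  ; instcombine rewrites the compare as sign-bit arithmetic rather than
  ; converting the call itself.
  tail call void @llvm.x86.avx.maskstore.pd.256(i8* %f, <4 x i64> %mask, <4 x double> %v)
  ret void
}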
@@ -256,7 +310,6 @@ define void @mstore_v4i32(i8* %f, <4 x i32> %v) { ; tail call void @llvm.x86.avx2.maskstore.d(i8* %f, <4 x i32> , <4 x i32> %v) ret void - } define void @mstore_v2i64(i8* %f, <2 x i64> %v) { @@ -278,7 +331,6 @@ define void @mstore_v8i32(i8* %f, <8 x i32> %v) { ; tail call void @llvm.x86.avx2.maskstore.d.256(i8* %f, <8 x i32> , <8 x i32> %v) ret void - } define void @mstore_v4i64(i8* %f, <4 x i64> %v) { @@ -289,7 +341,24 @@ define void @mstore_v4i64(i8* %f, <4 x i64> %v) { ; tail call void @llvm.x86.avx2.maskstore.q.256(i8* %f, <4 x i64> , <4 x i64> %v) ret void +} +define void @mstore_v4i64_cmp(i8* %f, <4 x i64> %src0, <4 x i64> %src1, <4 x i64> %v) { +; CHECK-LABEL: @mstore_v4i64_cmp( +; CHECK-NEXT: [[ICMP0:%.*]] = icmp eq <4 x i64> [[SRC0:%.*]], zeroinitializer +; CHECK-NEXT: [[ICMP1:%.*]] = icmp ne <4 x i64> [[SRC1:%.*]], zeroinitializer +; CHECK-NEXT: [[MASK1:%.*]] = and <4 x i1> [[ICMP0]], [[ICMP1]] +; CHECK-NEXT: [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x i64>* +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> [[V:%.*]], <4 x i64>* [[CASTVEC]], i32 1, <4 x i1> [[MASK1]]) +; CHECK-NEXT: ret void +; + %icmp0 = icmp eq <4 x i64> %src0, zeroinitializer + %icmp1 = icmp ne <4 x i64> %src1, zeroinitializer + %ext0 = sext <4 x i1> %icmp0 to <4 x i64> + %ext1 = sext <4 x i1> %icmp1 to <4 x i64> + %mask = and <4 x i64> %ext0, %ext1 + tail call void @llvm.x86.avx2.maskstore.q.256(i8* %f, <4 x i64> %mask, <4 x i64> %v) + ret void } ; The original SSE2 masked store variant. @@ -300,10 +369,8 @@ define void @mstore_v16i8_sse2_zeros(<16 x i8> %d, i8* %p) { ; tail call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %d, <16 x i8> zeroinitializer, i8* %p) ret void - } - declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>) declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>) declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>) diff --git a/llvm/test/Transforms/InstCombine/abs-1.ll b/llvm/test/Transforms/InstCombine/abs-1.ll index 08cab94e3dfc2..fbc0fc1a835c3 100644 --- a/llvm/test/Transforms/InstCombine/abs-1.ll +++ b/llvm/test/Transforms/InstCombine/abs-1.ll @@ -12,10 +12,8 @@ declare i64 @llabs(i64) define i32 @test_abs(i32 %x) { ; CHECK-LABEL: @test_abs( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[X:%.*]], 0 -; CHECK-NEXT: [[NEG:%.*]] = sub nsw i32 0, [[X]] -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 [[NEG]], i32 [[X]] -; CHECK-NEXT: ret i32 [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i32 [[TMP1]] ; %ret = call i32 @abs(i32 %x) ret i32 %ret @@ -23,10 +21,8 @@ define i32 @test_abs(i32 %x) { define i64 @test_labs(i64 %x) { ; CHECK-LABEL: @test_labs( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i64 [[X:%.*]], 0 -; CHECK-NEXT: [[NEG:%.*]] = sub nsw i64 0, [[X]] -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[NEG]], i64 [[X]] -; CHECK-NEXT: ret i64 [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i64 [[TMP1]] ; %ret = call i64 @labs(i64 %x) ret i64 %ret @@ -34,10 +30,8 @@ define i64 @test_labs(i64 %x) { define i64 @test_llabs(i64 %x) { ; CHECK-LABEL: @test_llabs( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i64 [[X:%.*]], 0 -; CHECK-NEXT: [[NEG:%.*]] = sub nsw i64 0, [[X]] -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[NEG]], i64 [[X]] -; CHECK-NEXT: ret i64 [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i64 [[TMP1]] ; %ret = call i64 @llabs(i64 
%x) ret i64 %ret @@ -47,10 +41,8 @@ define i64 @test_llabs(i64 %x) { define i8 @abs_canonical_1(i8 %x) { ; CHECK-LABEL: @abs_canonical_1( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0 -; CHECK-NEXT: [[NEG:%.*]] = sub i8 0, [[X]] -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[CMP]], i8 [[NEG]], i8 [[X]] -; CHECK-NEXT: ret i8 [[ABS]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X:%.*]], i1 false) +; CHECK-NEXT: ret i8 [[TMP1]] ; %cmp = icmp sgt i8 %x, 0 %neg = sub i8 0, %x @@ -62,10 +54,8 @@ define i8 @abs_canonical_1(i8 %x) { define <2 x i8> @abs_canonical_2(<2 x i8> %x) { ; CHECK-LABEL: @abs_canonical_2( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[X:%.*]], zeroinitializer -; CHECK-NEXT: [[NEG:%.*]] = sub <2 x i8> zeroinitializer, [[X]] -; CHECK-NEXT: [[ABS:%.*]] = select <2 x i1> [[CMP]], <2 x i8> [[NEG]], <2 x i8> [[X]] -; CHECK-NEXT: ret <2 x i8> [[ABS]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[X:%.*]], i1 false) +; CHECK-NEXT: ret <2 x i8> [[TMP1]] ; %cmp = icmp sgt <2 x i8> %x, %neg = sub <2 x i8> zeroinitializer, %x @@ -77,10 +67,8 @@ define <2 x i8> @abs_canonical_2(<2 x i8> %x) { define <2 x i8> @abs_canonical_2_vec_undef_elts(<2 x i8> %x) { ; CHECK-LABEL: @abs_canonical_2_vec_undef_elts( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[X:%.*]], zeroinitializer -; CHECK-NEXT: [[NEG:%.*]] = sub <2 x i8> zeroinitializer, [[X]] -; CHECK-NEXT: [[ABS:%.*]] = select <2 x i1> [[CMP]], <2 x i8> [[NEG]], <2 x i8> [[X]] -; CHECK-NEXT: ret <2 x i8> [[ABS]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[X:%.*]], i1 false) +; CHECK-NEXT: ret <2 x i8> [[TMP1]] ; %cmp = icmp sgt <2 x i8> %x, %neg = sub <2 x i8> zeroinitializer, %x @@ -92,10 +80,8 @@ define <2 x i8> @abs_canonical_2_vec_undef_elts(<2 x i8> %x) { define i8 @abs_canonical_3(i8 %x) { ; CHECK-LABEL: @abs_canonical_3( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0 -; CHECK-NEXT: [[NEG:%.*]] = sub nsw i8 0, [[X]] -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[CMP]], i8 [[NEG]], i8 [[X]] -; CHECK-NEXT: ret i8 [[ABS]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i8 [[TMP1]] ; %cmp = icmp slt i8 %x, 0 %neg = sub nsw i8 0, %x @@ -105,10 +91,8 @@ define i8 @abs_canonical_3(i8 %x) { define i8 @abs_canonical_4(i8 %x) { ; CHECK-LABEL: @abs_canonical_4( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0 -; CHECK-NEXT: [[NEG:%.*]] = sub i8 0, [[X]] -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[CMP]], i8 [[NEG]], i8 [[X]] -; CHECK-NEXT: ret i8 [[ABS]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X:%.*]], i1 false) +; CHECK-NEXT: ret i8 [[TMP1]] ; %cmp = icmp slt i8 %x, 1 %neg = sub i8 0, %x @@ -118,11 +102,9 @@ define i8 @abs_canonical_4(i8 %x) { define i32 @abs_canonical_5(i8 %x) { ; CHECK-LABEL: @abs_canonical_5( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0 -; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[X]] to i32 -; CHECK-NEXT: [[NEG:%.*]] = sub nsw i32 0, [[CONV]] -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEG]], i32 [[CONV]] -; CHECK-NEXT: ret i32 [[ABS]] +; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[CONV]], i1 true) +; CHECK-NEXT: ret i32 [[TMP1]] ; %cmp = icmp sgt i8 %x, 0 %conv = sext i8 %x to i32 @@ -134,10 +116,8 @@ define i32 @abs_canonical_5(i8 %x) { define i32 @abs_canonical_6(i32 %a, i32 %b) { ; CHECK-LABEL: @abs_canonical_6( ; CHECK-NEXT: [[T1:%.*]] = sub i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[T1]], 
0 -; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[T1]] -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[T1]] -; CHECK-NEXT: ret i32 [[ABS]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[T1]], i1 false) +; CHECK-NEXT: ret i32 [[TMP1]] ; %t1 = sub i32 %a, %b %cmp = icmp sgt i32 %t1, -1 @@ -149,10 +129,8 @@ define i32 @abs_canonical_6(i32 %a, i32 %b) { define <2 x i8> @abs_canonical_7(<2 x i8> %a, <2 x i8 > %b) { ; CHECK-LABEL: @abs_canonical_7( ; CHECK-NEXT: [[T1:%.*]] = sub <2 x i8> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[T1]], zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = sub <2 x i8> zeroinitializer, [[T1]] -; CHECK-NEXT: [[ABS:%.*]] = select <2 x i1> [[CMP]], <2 x i8> [[TMP1]], <2 x i8> [[T1]] -; CHECK-NEXT: ret <2 x i8> [[ABS]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[T1]], i1 false) +; CHECK-NEXT: ret <2 x i8> [[TMP1]] ; %t1 = sub <2 x i8> %a, %b @@ -164,10 +142,8 @@ define <2 x i8> @abs_canonical_7(<2 x i8> %a, <2 x i8 > %b) { define i32 @abs_canonical_8(i32 %a) { ; CHECK-LABEL: @abs_canonical_8( -; CHECK-NEXT: [[T:%.*]] = sub i32 0, [[A:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], 0 -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[CMP]], i32 [[T]], i32 [[A]] -; CHECK-NEXT: ret i32 [[ABS]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[A:%.*]], i1 false) +; CHECK-NEXT: ret i32 [[TMP1]] ; %t = sub i32 0, %a %cmp = icmp slt i32 %t, 0 @@ -178,10 +154,9 @@ define i32 @abs_canonical_8(i32 %a) { define i32 @abs_canonical_9(i32 %a, i32 %b) { ; CHECK-LABEL: @abs_canonical_9( ; CHECK-NEXT: [[T1:%.*]] = sub i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[T1]], -1 ; CHECK-NEXT: [[T2:%.*]] = sub i32 [[B]], [[A]] -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[CMP]], i32 [[T1]], i32 [[T2]] -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[ABS]], [[T2]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[T1]], i1 false) +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP1]], [[T2]] ; CHECK-NEXT: ret i32 [[ADD]] ; %t1 = sub i32 %a, %b @@ -195,10 +170,8 @@ define i32 @abs_canonical_9(i32 %a, i32 %b) { define i32 @abs_canonical_10(i32 %a, i32 %b) { ; CHECK-LABEL: @abs_canonical_10( ; CHECK-NEXT: [[T1:%.*]] = sub i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[T1]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[T1]] -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[T1]] -; CHECK-NEXT: ret i32 [[ABS]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[T1]], i1 false) +; CHECK-NEXT: ret i32 [[TMP1]] ; %t2 = sub i32 %b, %a %t1 = sub i32 %a, %b @@ -211,9 +184,8 @@ define i32 @abs_canonical_10(i32 %a, i32 %b) { define i8 @nabs_canonical_1(i8 %x) { ; CHECK-LABEL: @nabs_canonical_1( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0 -; CHECK-NEXT: [[NEG:%.*]] = sub i8 0, [[X]] -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[CMP]], i8 [[X]], i8 [[NEG]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X:%.*]], i1 false) +; CHECK-NEXT: [[ABS:%.*]] = sub i8 0, [[TMP1]] ; CHECK-NEXT: ret i8 [[ABS]] ; %cmp = icmp sgt i8 %x, 0 @@ -226,9 +198,8 @@ define i8 @nabs_canonical_1(i8 %x) { define <2 x i8> @nabs_canonical_2(<2 x i8> %x) { ; CHECK-LABEL: @nabs_canonical_2( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[X:%.*]], zeroinitializer -; CHECK-NEXT: [[NEG:%.*]] = sub <2 x i8> zeroinitializer, [[X]] -; CHECK-NEXT: [[ABS:%.*]] = select <2 x i1> [[CMP]], <2 x i8> [[X]], <2 x i8> [[NEG]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[X:%.*]], i1 
false) +; CHECK-NEXT: [[ABS:%.*]] = sub <2 x i8> zeroinitializer, [[TMP1]] ; CHECK-NEXT: ret <2 x i8> [[ABS]] ; %cmp = icmp sgt <2 x i8> %x, @@ -241,9 +212,8 @@ define <2 x i8> @nabs_canonical_2(<2 x i8> %x) { define <2 x i8> @nabs_canonical_2_vec_undef_elts(<2 x i8> %x) { ; CHECK-LABEL: @nabs_canonical_2_vec_undef_elts( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[X:%.*]], zeroinitializer -; CHECK-NEXT: [[NEG:%.*]] = sub <2 x i8> zeroinitializer, [[X]] -; CHECK-NEXT: [[ABS:%.*]] = select <2 x i1> [[CMP]], <2 x i8> [[X]], <2 x i8> [[NEG]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[X:%.*]], i1 false) +; CHECK-NEXT: [[ABS:%.*]] = sub <2 x i8> zeroinitializer, [[TMP1]] ; CHECK-NEXT: ret <2 x i8> [[ABS]] ; %cmp = icmp sgt <2 x i8> %x, @@ -256,9 +226,8 @@ define <2 x i8> @nabs_canonical_2_vec_undef_elts(<2 x i8> %x) { define i8 @nabs_canonical_3(i8 %x) { ; CHECK-LABEL: @nabs_canonical_3( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0 -; CHECK-NEXT: [[NEG:%.*]] = sub nsw i8 0, [[X]] -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[CMP]], i8 [[X]], i8 [[NEG]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X:%.*]], i1 true) +; CHECK-NEXT: [[ABS:%.*]] = sub nsw i8 0, [[TMP1]] ; CHECK-NEXT: ret i8 [[ABS]] ; %cmp = icmp slt i8 %x, 0 @@ -269,9 +238,8 @@ define i8 @nabs_canonical_3(i8 %x) { define i8 @nabs_canonical_4(i8 %x) { ; CHECK-LABEL: @nabs_canonical_4( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0 -; CHECK-NEXT: [[NEG:%.*]] = sub i8 0, [[X]] -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[CMP]], i8 [[X]], i8 [[NEG]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X:%.*]], i1 false) +; CHECK-NEXT: [[ABS:%.*]] = sub i8 0, [[TMP1]] ; CHECK-NEXT: ret i8 [[ABS]] ; %cmp = icmp slt i8 %x, 1 @@ -282,10 +250,9 @@ define i8 @nabs_canonical_4(i8 %x) { define i32 @nabs_canonical_5(i8 %x) { ; CHECK-LABEL: @nabs_canonical_5( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0 -; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[X]] to i32 -; CHECK-NEXT: [[NEG:%.*]] = sub nsw i32 0, [[CONV]] -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[CMP]], i32 [[CONV]], i32 [[NEG]] +; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[CONV]], i1 true) +; CHECK-NEXT: [[ABS:%.*]] = sub nsw i32 0, [[TMP1]] ; CHECK-NEXT: ret i32 [[ABS]] ; %cmp = icmp sgt i8 %x, 0 @@ -298,9 +265,8 @@ define i32 @nabs_canonical_5(i8 %x) { define i32 @nabs_canonical_6(i32 %a, i32 %b) { ; CHECK-LABEL: @nabs_canonical_6( ; CHECK-NEXT: [[T1:%.*]] = sub i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[T1]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[T1]] -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[CMP]], i32 [[T1]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[T1]], i1 false) +; CHECK-NEXT: [[ABS:%.*]] = sub i32 0, [[TMP1]] ; CHECK-NEXT: ret i32 [[ABS]] ; %t1 = sub i32 %a, %b @@ -313,9 +279,8 @@ define i32 @nabs_canonical_6(i32 %a, i32 %b) { define <2 x i8> @nabs_canonical_7(<2 x i8> %a, <2 x i8 > %b) { ; CHECK-LABEL: @nabs_canonical_7( ; CHECK-NEXT: [[T1:%.*]] = sub <2 x i8> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[T1]], zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = sub <2 x i8> zeroinitializer, [[T1]] -; CHECK-NEXT: [[ABS:%.*]] = select <2 x i1> [[CMP]], <2 x i8> [[T1]], <2 x i8> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[T1]], i1 false) +; CHECK-NEXT: [[ABS:%.*]] = sub <2 x i8> zeroinitializer, [[TMP1]] ; CHECK-NEXT: ret <2 x i8> [[ABS]] ; %t1 = 
sub <2 x i8> %a, %b @@ -327,9 +292,8 @@ define <2 x i8> @nabs_canonical_7(<2 x i8> %a, <2 x i8 > %b) { define i32 @nabs_canonical_8(i32 %a) { ; CHECK-LABEL: @nabs_canonical_8( -; CHECK-NEXT: [[T:%.*]] = sub i32 0, [[A:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], 0 -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[CMP]], i32 [[A]], i32 [[T]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[A:%.*]], i1 false) +; CHECK-NEXT: [[ABS:%.*]] = sub i32 0, [[TMP1]] ; CHECK-NEXT: ret i32 [[ABS]] ; %t = sub i32 0, %a @@ -341,10 +305,9 @@ define i32 @nabs_canonical_8(i32 %a) { define i32 @nabs_canonical_9(i32 %a, i32 %b) { ; CHECK-LABEL: @nabs_canonical_9( ; CHECK-NEXT: [[T1:%.*]] = sub i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[T1]], -1 ; CHECK-NEXT: [[T2:%.*]] = sub i32 [[B]], [[A]] -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[CMP]], i32 [[T2]], i32 [[T1]] -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[T2]], [[ABS]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[T1]], i1 false) +; CHECK-NEXT: [[ADD:%.*]] = sub i32 [[T2]], [[TMP1]] ; CHECK-NEXT: ret i32 [[ADD]] ; %t1 = sub i32 %a, %b @@ -358,9 +321,8 @@ define i32 @nabs_canonical_9(i32 %a, i32 %b) { define i32 @nabs_canonical_10(i32 %a, i32 %b) { ; CHECK-LABEL: @nabs_canonical_10( ; CHECK-NEXT: [[T1:%.*]] = sub i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[T1]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[T1]] -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[CMP]], i32 [[T1]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[T1]], i1 false) +; CHECK-NEXT: [[ABS:%.*]] = sub i32 0, [[TMP1]] ; CHECK-NEXT: ret i32 [[ABS]] ; %t2 = sub i32 %b, %a @@ -376,10 +338,8 @@ define i32 @nabs_canonical_10(i32 %a, i32 %b) { define i8 @shifty_abs_commute0(i8 %x) { ; CHECK-LABEL: @shifty_abs_commute0( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i8 [[X:%.*]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = sub i8 0, [[X]] -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[TMP1]], i8 [[TMP2]], i8 [[X]] -; CHECK-NEXT: ret i8 [[ABS]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X:%.*]], i1 false) +; CHECK-NEXT: ret i8 [[TMP1]] ; %signbit = ashr i8 %x, 7 %add = add i8 %signbit, %x @@ -389,10 +349,8 @@ define i8 @shifty_abs_commute0(i8 %x) { define i8 @shifty_abs_commute0_nsw(i8 %x) { ; CHECK-LABEL: @shifty_abs_commute0_nsw( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i8 [[X:%.*]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = sub nsw i8 0, [[X]] -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[TMP1]], i8 [[TMP2]], i8 [[X]] -; CHECK-NEXT: ret i8 [[ABS]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i8 [[TMP1]] ; %signbit = ashr i8 %x, 7 %add = add nsw i8 %signbit, %x @@ -417,10 +375,8 @@ define i8 @shifty_abs_commute0_nuw(i8 %x) { define <2 x i8> @shifty_abs_commute1(<2 x i8> %x) { ; CHECK-LABEL: @shifty_abs_commute1( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <2 x i8> [[X:%.*]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i8> zeroinitializer, [[X]] -; CHECK-NEXT: [[ABS:%.*]] = select <2 x i1> [[TMP1]], <2 x i8> [[TMP2]], <2 x i8> [[X]] -; CHECK-NEXT: ret <2 x i8> [[ABS]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[X:%.*]], i1 false) +; CHECK-NEXT: ret <2 x i8> [[TMP1]] ; %signbit = ashr <2 x i8> %x, %add = add <2 x i8> %signbit, %x @@ -431,10 +387,8 @@ define <2 x i8> @shifty_abs_commute1(<2 x i8> %x) { define <2 x i8> @shifty_abs_commute2(<2 x i8> %x) { ; CHECK-LABEL: @shifty_abs_commute2( ; CHECK-NEXT: [[Y:%.*]] = mul <2 x i8> [[X:%.*]], -; CHECK-NEXT: [[TMP1:%.*]] 
= icmp slt <2 x i8> [[Y]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i8> zeroinitializer, [[Y]] -; CHECK-NEXT: [[ABS:%.*]] = select <2 x i1> [[TMP1]], <2 x i8> [[TMP2]], <2 x i8> [[Y]] -; CHECK-NEXT: ret <2 x i8> [[ABS]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[Y]], i1 false) +; CHECK-NEXT: ret <2 x i8> [[TMP1]] ; %y = mul <2 x i8> %x, ; extra op to thwart complexity-based canonicalization %signbit = ashr <2 x i8> %y, @@ -446,10 +400,8 @@ define <2 x i8> @shifty_abs_commute2(<2 x i8> %x) { define i8 @shifty_abs_commute3(i8 %x) { ; CHECK-LABEL: @shifty_abs_commute3( ; CHECK-NEXT: [[Y:%.*]] = mul i8 [[X:%.*]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i8 [[Y]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = sub i8 0, [[Y]] -; CHECK-NEXT: [[ABS:%.*]] = select i1 [[TMP1]], i8 [[TMP2]], i8 [[Y]] -; CHECK-NEXT: ret i8 [[ABS]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[Y]], i1 false) +; CHECK-NEXT: ret i8 [[TMP1]] ; %y = mul i8 %x, 3 ; extra op to thwart complexity-based canonicalization %signbit = ashr i8 %y, 7 @@ -461,6 +413,7 @@ define i8 @shifty_abs_commute3(i8 %x) { ; Negative test - don't transform if it would increase instruction count. declare void @extra_use(i8) +declare void @extra_use_i1(i1) define i8 @shifty_abs_too_many_uses(i8 %x) { ; CHECK-LABEL: @shifty_abs_too_many_uses( @@ -482,10 +435,8 @@ define i8 @shifty_abs_too_many_uses(i8 %x) { define i8 @shifty_sub(i8 %x) { ; CHECK-LABEL: @shifty_sub( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i8 [[X:%.*]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = sub i8 0, [[X]] -; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP1]], i8 [[TMP2]], i8 [[X]] -; CHECK-NEXT: ret i8 [[R]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X:%.*]], i1 false) +; CHECK-NEXT: ret i8 [[TMP1]] ; %sh = ashr i8 %x, 7 %xor = xor i8 %x, %sh @@ -495,10 +446,8 @@ define i8 @shifty_sub(i8 %x) { define i8 @shifty_sub_nsw_commute(i8 %x) { ; CHECK-LABEL: @shifty_sub_nsw_commute( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i8 [[X:%.*]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = sub nsw i8 0, [[X]] -; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP1]], i8 [[TMP2]], i8 [[X]] -; CHECK-NEXT: ret i8 [[R]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i8 [[TMP1]] ; %sh = ashr i8 %x, 7 %xor = xor i8 %sh, %x @@ -532,10 +481,9 @@ define i12 @shifty_sub_nsw_nuw(i12 %x) { define i8 @negate_abs(i8 %x) { ; CHECK-LABEL: @negate_abs( -; CHECK-NEXT: [[N:%.*]] = sub i8 0, [[X:%.*]] -; CHECK-NEXT: [[C:%.*]] = icmp slt i8 [[X]], 0 -; CHECK-NEXT: [[S:%.*]] = select i1 [[C]], i8 [[X]], i8 [[N]] -; CHECK-NEXT: ret i8 [[S]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X:%.*]], i1 false) +; CHECK-NEXT: [[R:%.*]] = sub i8 0, [[TMP1]] +; CHECK-NEXT: ret i8 [[R]] ; %n = sub i8 0, %x %c = icmp slt i8 %x, 0 @@ -546,10 +494,8 @@ define i8 @negate_abs(i8 %x) { define <2 x i8> @negate_nabs(<2 x i8> %x) { ; CHECK-LABEL: @negate_nabs( -; CHECK-NEXT: [[N:%.*]] = sub <2 x i8> zeroinitializer, [[X:%.*]] -; CHECK-NEXT: [[C:%.*]] = icmp slt <2 x i8> [[X]], zeroinitializer -; CHECK-NEXT: [[S:%.*]] = select <2 x i1> [[C]], <2 x i8> [[N]], <2 x i8> [[X]] -; CHECK-NEXT: ret <2 x i8> [[S]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[X:%.*]], i1 false) +; CHECK-NEXT: ret <2 x i8> [[TMP1]] ; %n = sub <2 x i8> zeroinitializer, %x %c = icmp slt <2 x i8> %x, zeroinitializer @@ -573,9 +519,8 @@ define i8 @abs_swapped(i8 %a) { ; CHECK-LABEL: @abs_swapped( ; CHECK-NEXT: [[NEG:%.*]] = sub i8 0, [[A:%.*]] ; CHECK-NEXT: call void @extra_use(i8 
[[NEG]]) -; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i8 [[A]], 0 -; CHECK-NEXT: [[M1:%.*]] = select i1 [[CMP1]], i8 [[NEG]], i8 [[A]] -; CHECK-NEXT: ret i8 [[M1]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[A]], i1 false) +; CHECK-NEXT: ret i8 [[TMP1]] ; %neg = sub i8 0, %a call void @extra_use(i8 %neg) @@ -588,8 +533,8 @@ define i8 @nabs_swapped(i8 %a) { ; CHECK-LABEL: @nabs_swapped( ; CHECK-NEXT: [[NEG:%.*]] = sub i8 0, [[A:%.*]] ; CHECK-NEXT: call void @extra_use(i8 [[NEG]]) -; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i8 [[A]], 0 -; CHECK-NEXT: [[M2:%.*]] = select i1 [[CMP2]], i8 [[A]], i8 [[NEG]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[A]], i1 false) +; CHECK-NEXT: [[M2:%.*]] = sub i8 0, [[TMP1]] ; CHECK-NEXT: ret i8 [[M2]] ; %neg = sub i8 0, %a @@ -603,9 +548,8 @@ define i8 @abs_different_constants(i8 %a) { ; CHECK-LABEL: @abs_different_constants( ; CHECK-NEXT: [[NEG:%.*]] = sub i8 0, [[A:%.*]] ; CHECK-NEXT: call void @extra_use(i8 [[NEG]]) -; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i8 [[A]], 0 -; CHECK-NEXT: [[M1:%.*]] = select i1 [[CMP1]], i8 [[NEG]], i8 [[A]] -; CHECK-NEXT: ret i8 [[M1]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[A]], i1 false) +; CHECK-NEXT: ret i8 [[TMP1]] ; %neg = sub i8 0, %a call void @extra_use(i8 %neg) @@ -618,8 +562,8 @@ define i8 @nabs_different_constants(i8 %a) { ; CHECK-LABEL: @nabs_different_constants( ; CHECK-NEXT: [[NEG:%.*]] = sub i8 0, [[A:%.*]] ; CHECK-NEXT: call void @extra_use(i8 [[NEG]]) -; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i8 [[A]], 0 -; CHECK-NEXT: [[M2:%.*]] = select i1 [[CMP2]], i8 [[A]], i8 [[NEG]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[A]], i1 false) +; CHECK-NEXT: [[M2:%.*]] = sub i8 0, [[TMP1]] ; CHECK-NEXT: ret i8 [[M2]] ; %neg = sub i8 0, %a @@ -636,10 +580,8 @@ define i8 @nabs_different_constants(i8 %a) { define i64 @infinite_loop_constant_expression_abs(i64 %arg) { ; CHECK-LABEL: @infinite_loop_constant_expression_abs( ; CHECK-NEXT: [[T:%.*]] = sub i64 ptrtoint (i64* @g to i64), [[ARG:%.*]] -; CHECK-NEXT: [[T1:%.*]] = icmp slt i64 [[T]], 0 -; CHECK-NEXT: [[T2:%.*]] = sub nsw i64 0, [[T]] -; CHECK-NEXT: [[T3:%.*]] = select i1 [[T1]], i64 [[T2]], i64 [[T]] -; CHECK-NEXT: ret i64 [[T3]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.abs.i64(i64 [[T]], i1 true) +; CHECK-NEXT: ret i64 [[TMP1]] ; %t = sub i64 ptrtoint (i64* @g to i64), %arg %t1 = icmp slt i64 %t, 0 @@ -647,3 +589,96 @@ define i64 @infinite_loop_constant_expression_abs(i64 %arg) { %t3 = select i1 %t1, i64 %t2, i64 %t ret i64 %t3 } + +define i8 @abs_extra_use_icmp(i8 %x) { +; CHECK-LABEL: @abs_extra_use_icmp( +; CHECK-NEXT: [[C:%.*]] = icmp slt i8 [[X:%.*]], 0 +; CHECK-NEXT: call void @extra_use_i1(i1 [[C]]) +; CHECK-NEXT: [[N:%.*]] = sub i8 0, [[X]] +; CHECK-NEXT: [[S:%.*]] = select i1 [[C]], i8 [[N]], i8 [[X]] +; CHECK-NEXT: ret i8 [[S]] +; + %c = icmp slt i8 %x, 0 + call void @extra_use_i1(i1 %c) + %n = sub i8 0, %x + %s = select i1 %c, i8 %n, i8 %x + ret i8 %s +} + +define i8 @abs_extra_use_sub(i8 %x) { +; CHECK-LABEL: @abs_extra_use_sub( +; CHECK-NEXT: [[N:%.*]] = sub i8 0, [[X:%.*]] +; CHECK-NEXT: call void @extra_use(i8 [[N]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X]], i1 false) +; CHECK-NEXT: ret i8 [[TMP1]] +; + %c = icmp slt i8 %x, 0 + %n = sub i8 0, %x + call void @extra_use(i8 %n) + %s = select i1 %c, i8 %n, i8 %x + ret i8 %s +} + +define i8 @abs_extra_use_icmp_sub(i8 %x) { +; CHECK-LABEL: @abs_extra_use_icmp_sub( +; CHECK-NEXT: [[C:%.*]] = icmp slt i8 [[X:%.*]], 0 +; CHECK-NEXT: call void 
@extra_use_i1(i1 [[C]]) +; CHECK-NEXT: [[N:%.*]] = sub i8 0, [[X]] +; CHECK-NEXT: call void @extra_use(i8 [[N]]) +; CHECK-NEXT: [[S:%.*]] = select i1 [[C]], i8 [[N]], i8 [[X]] +; CHECK-NEXT: ret i8 [[S]] +; + %c = icmp slt i8 %x, 0 + call void @extra_use_i1(i1 %c) + %n = sub i8 0, %x + call void @extra_use(i8 %n) + %s = select i1 %c, i8 %n, i8 %x + ret i8 %s +} + +define i8 @nabs_extra_use_icmp(i8 %x) { +; CHECK-LABEL: @nabs_extra_use_icmp( +; CHECK-NEXT: [[C:%.*]] = icmp slt i8 [[X:%.*]], 0 +; CHECK-NEXT: call void @extra_use_i1(i1 [[C]]) +; CHECK-NEXT: [[N:%.*]] = sub i8 0, [[X]] +; CHECK-NEXT: [[S:%.*]] = select i1 [[C]], i8 [[X]], i8 [[N]] +; CHECK-NEXT: ret i8 [[S]] +; + %c = icmp slt i8 %x, 0 + call void @extra_use_i1(i1 %c) + %n = sub i8 0, %x + %s = select i1 %c, i8 %x, i8 %n + ret i8 %s +} + +define i8 @nabs_extra_use_sub(i8 %x) { +; CHECK-LABEL: @nabs_extra_use_sub( +; CHECK-NEXT: [[N:%.*]] = sub i8 0, [[X:%.*]] +; CHECK-NEXT: call void @extra_use(i8 [[N]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X]], i1 false) +; CHECK-NEXT: [[S:%.*]] = sub i8 0, [[TMP1]] +; CHECK-NEXT: ret i8 [[S]] +; + %c = icmp slt i8 %x, 0 + %n = sub i8 0, %x + call void @extra_use(i8 %n) + %s = select i1 %c, i8 %x, i8 %n + ret i8 %s +} + +define i8 @nabs_extra_use_icmp_sub(i8 %x) { +; CHECK-LABEL: @nabs_extra_use_icmp_sub( +; CHECK-NEXT: [[C:%.*]] = icmp slt i8 [[X:%.*]], 0 +; CHECK-NEXT: call void @extra_use_i1(i1 [[C]]) +; CHECK-NEXT: [[N:%.*]] = sub i8 0, [[X]] +; CHECK-NEXT: call void @extra_use(i8 [[N]]) +; CHECK-NEXT: [[S:%.*]] = select i1 [[C]], i8 [[X]], i8 [[N]] +; CHECK-NEXT: ret i8 [[S]] +; + %c = icmp slt i8 %x, 0 + call void @extra_use_i1(i1 %c) + %n = sub i8 0, %x + call void @extra_use(i8 %n) + %s = select i1 %c, i8 %x, i8 %n + ret i8 %s +} diff --git a/llvm/test/Transforms/InstCombine/abs-intrinsic.ll b/llvm/test/Transforms/InstCombine/abs-intrinsic.ll index b00681d44d26c..30e5a9ddab3c6 100644 --- a/llvm/test/Transforms/InstCombine/abs-intrinsic.ll +++ b/llvm/test/Transforms/InstCombine/abs-intrinsic.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s +declare i8 @llvm.abs.i8(i8, i1) declare i32 @llvm.abs.i32(i32, i1) declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1) declare <3 x i82> @llvm.abs.v3i82(<3 x i82>, i1) @@ -233,7 +234,7 @@ define i32 @abs_assume_neg(i32 %x) { ; CHECK-LABEL: @abs_assume_neg( ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) -; CHECK-NEXT: [[ABS:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false) +; CHECK-NEXT: [[ABS:%.*]] = sub i32 0, [[X]] ; CHECK-NEXT: ret i32 [[ABS]] ; %cmp = icmp slt i32 %x, 0 @@ -245,12 +246,49 @@ define i32 @abs_assume_neg(i32 %x) { define i32 @abs_known_neg(i16 %x) { ; CHECK-LABEL: @abs_known_neg( ; CHECK-NEXT: [[EXT:%.*]] = zext i16 [[X:%.*]] to i32 -; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[EXT]], -1 -; CHECK-NEXT: [[ABS:%.*]] = call i32 @llvm.abs.i32(i32 [[NEG]], i1 false) -; CHECK-NEXT: ret i32 [[ABS]] +; CHECK-NEXT: [[NEG_NEG:%.*]] = add nuw nsw i32 [[EXT]], 1 +; CHECK-NEXT: ret i32 [[NEG_NEG]] ; %ext = zext i16 %x to i32 %neg = sub nsw i32 -1, %ext %abs = call i32 @llvm.abs.i32(i32 %neg, i1 false) ret i32 %abs } + +define i1 @abs_eq_int_min_poison(i8 %x) { +; CHECK-LABEL: @abs_eq_int_min_poison( +; CHECK-NEXT: ret i1 false +; + %abs = call i8 @llvm.abs.i8(i8 %x, i1 true) + %cmp = icmp eq i8 %abs, -128 + ret i1 %cmp +} + +define i1 @abs_ne_int_min_poison(i8 %x) { +; CHECK-LABEL: 
@abs_ne_int_min_poison( +; CHECK-NEXT: ret i1 true +; + %abs = call i8 @llvm.abs.i8(i8 %x, i1 true) + %cmp = icmp ne i8 %abs, -128 + ret i1 %cmp +} + +define i1 @abs_eq_int_min_nopoison(i8 %x) { +; CHECK-LABEL: @abs_eq_int_min_nopoison( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X:%.*]], -128 +; CHECK-NEXT: ret i1 [[CMP]] +; + %abs = call i8 @llvm.abs.i8(i8 %x, i1 false) + %cmp = icmp eq i8 %abs, -128 + ret i1 %cmp +} + +define i1 @abs_ne_int_min_nopoison(i8 %x) { +; CHECK-LABEL: @abs_ne_int_min_nopoison( +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 [[X:%.*]], -128 +; CHECK-NEXT: ret i1 [[CMP]] +; + %abs = call i8 @llvm.abs.i8(i8 %x, i1 false) + %cmp = icmp ne i8 %abs, -128 + ret i1 %cmp +} diff --git a/llvm/test/Transforms/InstCombine/abs_abs.ll b/llvm/test/Transforms/InstCombine/abs_abs.ll index 207ceb5215a7e..f2faf35a25155 100644 --- a/llvm/test/Transforms/InstCombine/abs_abs.ll +++ b/llvm/test/Transforms/InstCombine/abs_abs.ll @@ -3,10 +3,8 @@ define i32 @abs_abs_x01(i32 %x) { ; CHECK-LABEL: @abs_abs_x01( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[X]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i32 [[TMP1]] ; %cmp = icmp sgt i32 %x, -1 %sub = sub nsw i32 0, %x @@ -19,10 +17,8 @@ define i32 @abs_abs_x01(i32 %x) { define <2 x i32> @abs_abs_x01_vec(<2 x i32> %x) { ; CHECK-LABEL: @abs_abs_x01_vec( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i32> [[X:%.*]], zeroinitializer -; CHECK-NEXT: [[SUB:%.*]] = sub nsw <2 x i32> zeroinitializer, [[X]] -; CHECK-NEXT: [[COND:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[SUB]], <2 x i32> [[X]] -; CHECK-NEXT: ret <2 x i32> [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[X:%.*]], i1 true) +; CHECK-NEXT: ret <2 x i32> [[TMP1]] ; %cmp = icmp sgt <2 x i32> %x, %sub = sub nsw <2 x i32> zeroinitializer, %x @@ -35,10 +31,8 @@ define <2 x i32> @abs_abs_x01_vec(<2 x i32> %x) { define i32 @abs_abs_x02(i32 %x) { ; CHECK-LABEL: @abs_abs_x02( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[X]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i32 [[TMP1]] ; %cmp = icmp sgt i32 %x, 0 %sub = sub nsw i32 0, %x @@ -51,10 +45,8 @@ define i32 @abs_abs_x02(i32 %x) { define i32 @abs_abs_x03(i32 %x) { ; CHECK-LABEL: @abs_abs_x03( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[X]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i32 [[TMP1]] ; %cmp = icmp slt i32 %x, 0 %sub = sub nsw i32 0, %x @@ -67,10 +59,8 @@ define i32 @abs_abs_x03(i32 %x) { define i32 @abs_abs_x04(i32 %x) { ; CHECK-LABEL: @abs_abs_x04( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[X]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i32 [[TMP1]] ; %cmp = icmp slt i32 %x, 1 %sub = sub nsw i32 0, %x @@ -83,10 +73,8 @@ define i32 @abs_abs_x04(i32 %x) { define <2 x i32> 
@abs_abs_x04_vec(<2 x i32> %x) { ; CHECK-LABEL: @abs_abs_x04_vec( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i32> [[X:%.*]], zeroinitializer -; CHECK-NEXT: [[SUB:%.*]] = sub nsw <2 x i32> zeroinitializer, [[X]] -; CHECK-NEXT: [[COND:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[SUB]], <2 x i32> [[X]] -; CHECK-NEXT: ret <2 x i32> [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[X:%.*]], i1 true) +; CHECK-NEXT: ret <2 x i32> [[TMP1]] ; %cmp = icmp slt <2 x i32> %x, %sub = sub nsw <2 x i32> zeroinitializer, %x @@ -99,10 +87,8 @@ define <2 x i32> @abs_abs_x04_vec(<2 x i32> %x) { define i32 @abs_abs_x05(i32 %x) { ; CHECK-LABEL: @abs_abs_x05( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[X]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i32 [[TMP1]] ; %cmp = icmp sgt i32 %x, -1 %sub = sub nsw i32 0, %x @@ -115,10 +101,8 @@ define i32 @abs_abs_x05(i32 %x) { define i32 @abs_abs_x06(i32 %x) { ; CHECK-LABEL: @abs_abs_x06( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[X]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i32 [[TMP1]] ; %cmp = icmp sgt i32 %x, 0 %sub = sub nsw i32 0, %x @@ -131,10 +115,8 @@ define i32 @abs_abs_x06(i32 %x) { define i32 @abs_abs_x07(i32 %x) { ; CHECK-LABEL: @abs_abs_x07( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[X]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i32 [[TMP1]] ; %cmp = icmp slt i32 %x, 0 %sub = sub nsw i32 0, %x @@ -147,10 +129,8 @@ define i32 @abs_abs_x07(i32 %x) { define i32 @abs_abs_x08(i32 %x) { ; CHECK-LABEL: @abs_abs_x08( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[X]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i32 [[TMP1]] ; %cmp = icmp slt i32 %x, 1 %sub = sub nsw i32 0, %x @@ -163,10 +143,8 @@ define i32 @abs_abs_x08(i32 %x) { define i32 @abs_abs_x09(i32 %x) { ; CHECK-LABEL: @abs_abs_x09( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[X]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i32 [[TMP1]] ; %cmp = icmp sgt i32 %x, -1 %sub = sub nsw i32 0, %x @@ -179,10 +157,8 @@ define i32 @abs_abs_x09(i32 %x) { define i32 @abs_abs_x10(i32 %x) { ; CHECK-LABEL: @abs_abs_x10( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[X]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i32 [[TMP1]] ; %cmp = icmp sgt i32 %x, 0 %sub = sub nsw i32 0, %x @@ -195,10 +171,8 @@ define i32 @abs_abs_x10(i32 %x) { 
define i32 @abs_abs_x11(i32 %x) { ; CHECK-LABEL: @abs_abs_x11( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[X]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i32 [[TMP1]] ; %cmp = icmp slt i32 %x, 0 %sub = sub nsw i32 0, %x @@ -211,10 +185,8 @@ define i32 @abs_abs_x11(i32 %x) { define i32 @abs_abs_x12(i32 %x) { ; CHECK-LABEL: @abs_abs_x12( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[X]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i32 [[TMP1]] ; %cmp = icmp slt i32 %x, 1 %sub = sub nsw i32 0, %x @@ -227,10 +199,8 @@ define i32 @abs_abs_x12(i32 %x) { define i32 @abs_abs_x13(i32 %x) { ; CHECK-LABEL: @abs_abs_x13( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[X]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i32 [[TMP1]] ; %cmp = icmp sgt i32 %x, -1 %sub = sub nsw i32 0, %x @@ -243,10 +213,8 @@ define i32 @abs_abs_x13(i32 %x) { define i32 @abs_abs_x14(i32 %x) { ; CHECK-LABEL: @abs_abs_x14( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[X]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i32 [[TMP1]] ; %cmp = icmp sgt i32 %x, 0 %sub = sub nsw i32 0, %x @@ -259,10 +227,8 @@ define i32 @abs_abs_x14(i32 %x) { define i32 @abs_abs_x15(i32 %x) { ; CHECK-LABEL: @abs_abs_x15( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[X]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i32 [[TMP1]] ; %cmp = icmp slt i32 %x, 0 %sub = sub nsw i32 0, %x @@ -275,10 +241,8 @@ define i32 @abs_abs_x15(i32 %x) { define i32 @abs_abs_x16(i32 %x) { ; CHECK-LABEL: @abs_abs_x16( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[X]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i32 [[TMP1]] ; %cmp = icmp slt i32 %x, 1 %sub = sub nsw i32 0, %x @@ -292,10 +256,8 @@ define i32 @abs_abs_x16(i32 %x) { ; abs(abs(-x)) -> abs(-x) -> abs(x) define i32 @abs_abs_x17(i32 %x) { ; CHECK-LABEL: @abs_abs_x17( -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[X:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X]], 0 -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: ret i32 [[TMP1]] ; %sub = sub nsw i32 0, %x %cmp = icmp sgt i32 %sub, -1 @@ -310,10 +272,8 @@ define i32 @abs_abs_x17(i32 %x) { define i32 @abs_abs_x18(i32 %x, i32 %y) { ; CHECK-LABEL: @abs_abs_x18( ; CHECK-NEXT: 
[[A:%.*]] = sub nsw i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[A]], 0
-; CHECK-NEXT:    [[NEGA:%.*]] = sub i32 0, [[A]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[NEGA]], i32 [[A]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[A]], i1 false)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %a = sub nsw i32 %x, %y
   %b = sub nsw i32 %y, %x
@@ -328,10 +288,8 @@ define i32 @abs_abs_x18(i32 %x, i32 %y) {
 
 ; abs(abs(-x)) -> abs(-x) -> abs(x)
 define <2 x i32> @abs_abs_x02_vec(<2 x i32> %x) {
 ; CHECK-LABEL: @abs_abs_x02_vec(
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw <2 x i32> zeroinitializer, [[X:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i32> [[X]], zeroinitializer
-; CHECK-NEXT:    [[COND:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[SUB]], <2 x i32> [[X]]
-; CHECK-NEXT:    ret <2 x i32> [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret <2 x i32> [[TMP1]]
 ;
   %sub = sub nsw <2 x i32> zeroinitializer, %x
   %cmp = icmp sgt <2 x i32> %sub, <i32 -1, i32 -1>
@@ -346,10 +304,8 @@ define <2 x i32> @abs_abs_x02_vec(<2 x i32> %x) {
 
 define <2 x i32> @abs_abs_x03_vec(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @abs_abs_x03_vec(
 ; CHECK-NEXT:    [[A:%.*]] = sub nsw <2 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i32> [[A]], zeroinitializer
-; CHECK-NEXT:    [[NEGA:%.*]] = sub <2 x i32> zeroinitializer, [[A]]
-; CHECK-NEXT:    [[COND:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[NEGA]], <2 x i32> [[A]]
-; CHECK-NEXT:    ret <2 x i32> [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[A]], i1 false)
+; CHECK-NEXT:    ret <2 x i32> [[TMP1]]
 ;
   %a = sub nsw <2 x i32> %x, %y
   %b = sub nsw <2 x i32> %y, %x
@@ -363,9 +319,8 @@ define <2 x i32> @abs_abs_x03_vec(<2 x i32> %x, <2 x i32> %y) {
 
 define i32 @nabs_nabs_x01(i32 %x) {
 ; CHECK-LABEL: @nabs_nabs_x01(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[COND:%.*]] = sub nsw i32 0, [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[COND]]
 ;
   %cmp = icmp sgt i32 %x, -1
@@ -379,9 +334,8 @@ define i32 @nabs_nabs_x01(i32 %x) {
 
 define i32 @nabs_nabs_x02(i32 %x) {
 ; CHECK-LABEL: @nabs_nabs_x02(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[COND:%.*]] = sub nsw i32 0, [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[COND]]
 ;
   %cmp = icmp sgt i32 %x, 0
@@ -395,9 +349,8 @@ define i32 @nabs_nabs_x02(i32 %x) {
 
 define i32 @nabs_nabs_x03(i32 %x) {
 ; CHECK-LABEL: @nabs_nabs_x03(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[COND:%.*]] = sub nsw i32 0, [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[COND]]
 ;
   %cmp = icmp slt i32 %x, 0
@@ -411,9 +364,8 @@ define i32 @nabs_nabs_x03(i32 %x) {
 
 define i32 @nabs_nabs_x04(i32 %x) {
 ; CHECK-LABEL: @nabs_nabs_x04(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[COND:%.*]] = sub nsw i32 0, [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[COND]]
 ;
   %cmp = icmp slt i32 %x, 1
@@ -427,9 +379,8 @@ define i32 @nabs_nabs_x04(i32 %x) {
 
 define i32 @nabs_nabs_x05(i32 %x) {
 ; CHECK-LABEL: @nabs_nabs_x05(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[COND:%.*]] = sub nsw i32 0, [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[COND]]
 ;
   %cmp = icmp sgt i32 %x, -1
@@ -443,9 +394,8 @@ define i32 @nabs_nabs_x05(i32 %x) {
 
 define i32 @nabs_nabs_x06(i32 %x) {
 ; CHECK-LABEL: @nabs_nabs_x06(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[COND:%.*]] = sub nsw i32 0, [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[COND]]
 ;
   %cmp = icmp sgt i32 %x, 0
@@ -459,9 +409,8 @@ define i32 @nabs_nabs_x06(i32 %x) {
 
 define i32 @nabs_nabs_x07(i32 %x) {
 ; CHECK-LABEL: @nabs_nabs_x07(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[COND:%.*]] = sub nsw i32 0, [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[COND]]
 ;
   %cmp = icmp slt i32 %x, 0
@@ -475,9 +424,8 @@ define i32 @nabs_nabs_x07(i32 %x) {
 
 define i32 @nabs_nabs_x08(i32 %x) {
 ; CHECK-LABEL: @nabs_nabs_x08(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[COND:%.*]] = sub nsw i32 0, [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[COND]]
 ;
   %cmp = icmp slt i32 %x, 1
@@ -491,9 +439,8 @@ define i32 @nabs_nabs_x08(i32 %x) {
 
 define i32 @nabs_nabs_x09(i32 %x) {
 ; CHECK-LABEL: @nabs_nabs_x09(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[COND:%.*]] = sub nsw i32 0, [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[COND]]
 ;
   %cmp = icmp sgt i32 %x, -1
@@ -507,9 +454,8 @@ define i32 @nabs_nabs_x09(i32 %x) {
 
 define i32 @nabs_nabs_x10(i32 %x) {
 ; CHECK-LABEL: @nabs_nabs_x10(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[COND:%.*]] = sub nsw i32 0, [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[COND]]
 ;
   %cmp = icmp sgt i32 %x, 0
@@ -523,9 +469,8 @@ define i32 @nabs_nabs_x10(i32 %x) {
 
 define i32 @nabs_nabs_x11(i32 %x) {
 ; CHECK-LABEL: @nabs_nabs_x11(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[COND:%.*]] = sub nsw i32 0, [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[COND]]
 ;
   %cmp = icmp slt i32 %x, 0
@@ -539,9 +484,8 @@ define i32 @nabs_nabs_x11(i32 %x) {
 
 define i32 @nabs_nabs_x12(i32 %x) {
 ; CHECK-LABEL: @nabs_nabs_x12(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[COND:%.*]] = sub nsw i32 0, [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[COND]]
 ;
   %cmp = icmp slt i32 %x, 1
@@ -555,9 +499,8 @@ define i32 @nabs_nabs_x12(i32 %x) {
 
 define i32 @nabs_nabs_x13(i32 %x) {
 ; CHECK-LABEL: @nabs_nabs_x13(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[COND:%.*]] = sub nsw i32 0, [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[COND]]
 ;
   %cmp = icmp sgt i32 %x, -1
@@ -571,9 +514,8 @@ define i32 @nabs_nabs_x13(i32 %x) {
 
 define i32 @nabs_nabs_x14(i32 %x) {
 ; CHECK-LABEL: @nabs_nabs_x14(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[COND:%.*]] = sub nsw i32 0, [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[COND]]
 ;
   %cmp = icmp sgt i32 %x, 0
@@ -587,9 +529,8 @@ define i32 @nabs_nabs_x14(i32 %x) {
 
 define i32 @nabs_nabs_x15(i32 %x) {
 ; CHECK-LABEL: @nabs_nabs_x15(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[COND:%.*]] = sub nsw i32 0, [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[COND]]
 ;
   %cmp = icmp slt i32 %x, 0
@@ -603,9 +544,8 @@ define i32 @nabs_nabs_x15(i32 %x) {
 
 define i32 @nabs_nabs_x16(i32 %x) {
 ; CHECK-LABEL: @nabs_nabs_x16(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[COND:%.*]] = sub nsw i32 0, [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[COND]]
 ;
   %cmp = icmp slt i32 %x, 1
@@ -620,9 +560,8 @@ define i32 @nabs_nabs_x16(i32 %x) {
 
 ; nabs(nabs(-x)) -> nabs(-x) -> nabs(x)
 define i32 @nabs_nabs_x17(i32 %x) {
 ; CHECK-LABEL: @nabs_nabs_x17(
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X]], 0
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[COND:%.*]] = sub nsw i32 0, [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[COND]]
 ;
   %sub = sub nsw i32 0, %x
@@ -638,10 +577,9 @@ define i32 @nabs_nabs_x17(i32 %x) {
 
 define i32 @nabs_nabs_x18(i32 %x, i32 %y) {
 ; CHECK-LABEL: @nabs_nabs_x18(
 ; CHECK-NEXT:    [[A:%.*]] = sub nsw i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[A]], 0
-; CHECK-NEXT:    [[NEGA:%.*]] = sub i32 0, [[A]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[A]], i32 [[NEGA]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[A]], i1 false)
+; CHECK-NEXT:    [[COND18:%.*]] = sub i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[COND18]]
 ;
   %a = sub nsw i32 %x, %y
   %b = sub nsw i32 %y, %x
@@ -656,9 +594,8 @@ define i32 @nabs_nabs_x18(i32 %x, i32 %y) {
 
 ; nabs(nabs(-x)) -> nabs(-x) -> nabs(x)
 define <2 x i32> @nabs_nabs_x01_vec(<2 x i32> %x) {
 ; CHECK-LABEL: @nabs_nabs_x01_vec(
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw <2 x i32> zeroinitializer, [[X:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i32> [[X]], zeroinitializer
-; CHECK-NEXT:    [[COND:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[X]], <2 x i32> [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[COND:%.*]] = sub nsw <2 x i32> zeroinitializer, [[TMP1]]
 ; CHECK-NEXT:    ret <2 x i32> [[COND]]
 ;
   %sub = sub nsw <2 x i32> zeroinitializer, %x
@@ -674,10 +611,9 @@ define <2 x i32> @nabs_nabs_x01_vec(<2 x i32> %x) {
 
 define <2 x i32> @nabs_nabs_x02_vec(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @nabs_nabs_x02_vec(
 ; CHECK-NEXT:    [[A:%.*]] = sub nsw <2 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i32> [[A]], zeroinitializer
-; CHECK-NEXT:    [[NEGA:%.*]] = sub <2 x i32> zeroinitializer, [[A]]
-; CHECK-NEXT:    [[COND:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[A]], <2 x i32> [[NEGA]]
-; CHECK-NEXT:    ret <2 x i32> [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[A]], i1 false)
+; CHECK-NEXT:    [[COND18:%.*]] = sub <2 x i32> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    ret <2 x i32> [[COND18]]
 ;
   %a = sub nsw <2 x i32> %x, %y
   %b = sub nsw <2 x i32> %y, %x
@@ -691,10 +627,8 @@ define <2 x i32> @nabs_nabs_x02_vec(<2 x i32> %x, <2 x i32> %y) {
 
 define i32 @abs_nabs_x01(i32 %x) {
 ; CHECK-LABEL: @abs_nabs_x01(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %cmp = icmp sgt i32 %x, -1
   %sub = sub nsw i32 0, %x
@@ -707,10 +641,8 @@ define i32 @abs_nabs_x01(i32 %x) {
 
 define i32 @abs_nabs_x02(i32 %x) {
 ; CHECK-LABEL: @abs_nabs_x02(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %cmp = icmp sgt i32 %x, 0
   %sub = sub nsw i32 0, %x
@@ -723,10 +655,8 @@ define i32 @abs_nabs_x02(i32 %x) {
 
 define i32 @abs_nabs_x03(i32 %x) {
 ; CHECK-LABEL: @abs_nabs_x03(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %cmp = icmp slt i32 %x, 0
   %sub = sub nsw i32 0, %x
@@ -739,10 +669,8 @@ define i32 @abs_nabs_x03(i32 %x) {
 
 define i32 @abs_nabs_x04(i32 %x) {
 ; CHECK-LABEL: @abs_nabs_x04(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %cmp = icmp slt i32 %x, 1
   %sub = sub nsw i32 0, %x
@@ -755,10 +683,8 @@ define i32 @abs_nabs_x04(i32 %x) {
 
 define i32 @abs_nabs_x05(i32 %x) {
 ; CHECK-LABEL: @abs_nabs_x05(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %cmp = icmp sgt i32 %x, -1
   %sub = sub nsw i32 0, %x
@@ -771,10 +697,8 @@ define i32 @abs_nabs_x05(i32 %x) {
 
 define i32 @abs_nabs_x06(i32 %x) {
 ; CHECK-LABEL: @abs_nabs_x06(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %cmp = icmp sgt i32 %x, 0
   %sub = sub nsw i32 0, %x
@@ -787,10 +711,8 @@ define i32 @abs_nabs_x06(i32 %x) {
 
 define i32 @abs_nabs_x07(i32 %x) {
 ; CHECK-LABEL: @abs_nabs_x07(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %cmp = icmp slt i32 %x, 0
   %sub = sub nsw i32 0, %x
@@ -803,10 +725,8 @@ define i32 @abs_nabs_x07(i32 %x) {
 
 define i32 @abs_nabs_x08(i32 %x) {
 ; CHECK-LABEL: @abs_nabs_x08(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %cmp = icmp slt i32 %x, 1
   %sub = sub nsw i32 0, %x
@@ -819,10 +739,8 @@ define i32 @abs_nabs_x08(i32 %x) {
 
 define i32 @abs_nabs_x09(i32 %x) {
 ; CHECK-LABEL: @abs_nabs_x09(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %cmp = icmp sgt i32 %x, -1
   %sub = sub nsw i32 0, %x
@@ -835,10 +753,8 @@ define i32 @abs_nabs_x09(i32 %x) {
 
 define i32 @abs_nabs_x10(i32 %x) {
 ; CHECK-LABEL: @abs_nabs_x10(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %cmp = icmp sgt i32 %x, 0
   %sub = sub nsw i32 0, %x
@@ -851,10 +767,8 @@ define i32 @abs_nabs_x10(i32 %x) {
 
 define i32 @abs_nabs_x11(i32 %x) {
 ; CHECK-LABEL: @abs_nabs_x11(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %cmp = icmp slt i32 %x, 0
   %sub = sub nsw i32 0, %x
@@ -867,10 +781,8 @@ define i32 @abs_nabs_x11(i32 %x) {
 
 define i32 @abs_nabs_x12(i32 %x) {
 ; CHECK-LABEL: @abs_nabs_x12(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %cmp = icmp slt i32 %x, 1
   %sub = sub nsw i32 0, %x
@@ -883,10 +795,8 @@ define i32 @abs_nabs_x12(i32 %x) {
 
 define i32 @abs_nabs_x13(i32 %x) {
 ; CHECK-LABEL: @abs_nabs_x13(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %cmp = icmp sgt i32 %x, -1
   %sub = sub nsw i32 0, %x
@@ -899,10 +809,8 @@ define i32 @abs_nabs_x13(i32 %x) {
 
 define i32 @abs_nabs_x14(i32 %x) {
 ; CHECK-LABEL: @abs_nabs_x14(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %cmp = icmp sgt i32 %x, 0
   %sub = sub nsw i32 0, %x
@@ -915,10 +823,8 @@ define i32 @abs_nabs_x14(i32 %x) {
 
 define i32 @abs_nabs_x15(i32 %x) {
 ; CHECK-LABEL: @abs_nabs_x15(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %cmp = icmp slt i32 %x, 0
   %sub = sub nsw i32 0, %x
@@ -931,10 +837,8 @@ define i32 @abs_nabs_x15(i32 %x) {
 
 define i32 @abs_nabs_x16(i32 %x) {
 ; CHECK-LABEL: @abs_nabs_x16(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %cmp = icmp slt i32 %x, 1
   %sub = sub nsw i32 0, %x
@@ -948,10 +852,8 @@ define i32 @abs_nabs_x16(i32 %x) {
 
 ; abs(nabs(-x)) -> abs(-x) -> abs(x)
 define i32 @abs_nabs_x17(i32 %x) {
 ; CHECK-LABEL: @abs_nabs_x17(
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X]], 0
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[X]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %sub = sub nsw i32 0, %x
   %cmp = icmp sgt i32 %sub, -1
@@ -966,10 +868,8 @@ define i32 @abs_nabs_x17(i32 %x) {
 
 define i32 @abs_nabs_x18(i32 %x, i32 %y) {
 ; CHECK-LABEL: @abs_nabs_x18(
 ; CHECK-NEXT:    [[A:%.*]] = sub nsw i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[A]], 0
-; CHECK-NEXT:    [[NEGA:%.*]] = sub i32 0, [[A]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[NEGA]], i32 [[A]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[A]], i1 false)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %a = sub nsw i32 %x, %y
   %b = sub nsw i32 %y, %x
@@ -984,10 +884,8 @@ define i32 @abs_nabs_x18(i32 %x, i32 %y) {
 
 ; abs(nabs(-x)) -> abs(-x) -> abs(x)
 define <2 x i32> @abs_nabs_x01_vec(<2 x i32> %x) {
 ; CHECK-LABEL: @abs_nabs_x01_vec(
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw <2 x i32> zeroinitializer, [[X:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i32> [[X]], zeroinitializer
-; CHECK-NEXT:    [[COND:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[SUB]], <2 x i32> [[X]]
-; CHECK-NEXT:    ret <2 x i32> [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret <2 x i32> [[TMP1]]
 ;
   %sub = sub nsw <2 x i32> zeroinitializer, %x
   %cmp = icmp sgt <2 x i32> %sub, <i32 -1, i32 -1>
@@ -1002,10 +900,8 @@ define <2 x i32> @abs_nabs_x01_vec(<2 x i32> %x) {
 
 define <2 x i32> @abs_nabs_x02_vec(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @abs_nabs_x02_vec(
 ; CHECK-NEXT:    [[A:%.*]] = sub nsw <2 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i32> [[A]], zeroinitializer
-; CHECK-NEXT:    [[NEGA:%.*]] = sub <2 x i32> zeroinitializer, [[A]]
-; CHECK-NEXT:    [[COND:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[NEGA]], <2 x i32> [[A]]
-; CHECK-NEXT:    ret <2 x i32> [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[A]], i1 false)
+; CHECK-NEXT:    ret <2 x i32> [[TMP1]]
 ;
   %a = sub nsw <2 x i32> %x, %y
   %b = sub nsw <2 x i32> %y, %x
@@ -1019,10 +915,9 @@ define <2 x i32> @abs_nabs_x02_vec(<2 x i32> %x, <2 x i32> %y) {
 
 define i32 @nabs_abs_x01(i32 %x) {
 ; CHECK-LABEL: @nabs_abs_x01(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SUB9:%.*]] = sub nsw i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[SUB9]]
 ;
   %cmp = icmp sgt i32 %x, -1
   %sub = sub nsw i32 0, %x
@@ -1035,10 +930,9 @@ define i32 @nabs_abs_x01(i32 %x) {
 
 define i32 @nabs_abs_x02(i32 %x) {
 ; CHECK-LABEL: @nabs_abs_x02(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SUB9:%.*]] = sub nsw i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[SUB9]]
 ;
   %cmp = icmp sgt i32 %x, 0
   %sub = sub nsw i32 0, %x
@@ -1051,10 +945,9 @@ define i32 @nabs_abs_x02(i32 %x) {
 
 define i32 @nabs_abs_x03(i32 %x) {
 ; CHECK-LABEL: @nabs_abs_x03(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SUB9:%.*]] = sub nsw i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[SUB9]]
 ;
   %cmp = icmp slt i32 %x, 0
   %sub = sub nsw i32 0, %x
@@ -1067,10 +960,9 @@ define i32 @nabs_abs_x03(i32 %x) {
 
 define i32 @nabs_abs_x04(i32 %x) {
 ; CHECK-LABEL: @nabs_abs_x04(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SUB9:%.*]] = sub nsw i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[SUB9]]
 ;
   %cmp = icmp slt i32 %x, 1
   %sub = sub nsw i32 0, %x
@@ -1083,10 +975,9 @@ define i32 @nabs_abs_x04(i32 %x) {
 
 define i32 @nabs_abs_x05(i32 %x) {
 ; CHECK-LABEL: @nabs_abs_x05(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SUB9:%.*]] = sub nsw i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[SUB9]]
 ;
   %cmp = icmp sgt i32 %x, -1
   %sub = sub nsw i32 0, %x
@@ -1099,10 +990,9 @@ define i32 @nabs_abs_x05(i32 %x) {
 
 define i32 @nabs_abs_x06(i32 %x) {
 ; CHECK-LABEL: @nabs_abs_x06(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SUB9:%.*]] = sub nsw i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[SUB9]]
 ;
   %cmp = icmp sgt i32 %x, 0
   %sub = sub nsw i32 0, %x
@@ -1115,10 +1005,9 @@ define i32 @nabs_abs_x06(i32 %x) {
 
 define i32 @nabs_abs_x07(i32 %x) {
 ; CHECK-LABEL: @nabs_abs_x07(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SUB9:%.*]] = sub nsw i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[SUB9]]
 ;
   %cmp = icmp slt i32 %x, 0
   %sub = sub nsw i32 0, %x
@@ -1131,10 +1020,9 @@ define i32 @nabs_abs_x07(i32 %x) {
 
 define i32 @nabs_abs_x08(i32 %x) {
 ; CHECK-LABEL: @nabs_abs_x08(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SUB9:%.*]] = sub nsw i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[SUB9]]
 ;
   %cmp = icmp slt i32 %x, 1
   %sub = sub nsw i32 0, %x
@@ -1147,10 +1035,9 @@ define i32 @nabs_abs_x08(i32 %x) {
 
 define i32 @nabs_abs_x09(i32 %x) {
 ; CHECK-LABEL: @nabs_abs_x09(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SUB16:%.*]] = sub nsw i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[SUB16]]
 ;
   %cmp = icmp sgt i32 %x, -1
   %sub = sub nsw i32 0, %x
@@ -1163,10 +1050,9 @@ define i32 @nabs_abs_x09(i32 %x) {
 
 define i32 @nabs_abs_x10(i32 %x) {
 ; CHECK-LABEL: @nabs_abs_x10(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SUB16:%.*]] = sub nsw i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[SUB16]]
 ;
   %cmp = icmp sgt i32 %x, 0
   %sub = sub nsw i32 0, %x
@@ -1179,10 +1065,9 @@ define i32 @nabs_abs_x10(i32 %x) {
 
 define i32 @nabs_abs_x11(i32 %x) {
 ; CHECK-LABEL: @nabs_abs_x11(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SUB16:%.*]] = sub nsw i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[SUB16]]
 ;
   %cmp = icmp slt i32 %x, 0
   %sub = sub nsw i32 0, %x
@@ -1195,10 +1080,9 @@ define i32 @nabs_abs_x11(i32 %x) {
 
 define i32 @nabs_abs_x12(i32 %x) {
 ; CHECK-LABEL: @nabs_abs_x12(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SUB16:%.*]] = sub nsw i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[SUB16]]
 ;
   %cmp = icmp slt i32 %x, 1
   %sub = sub nsw i32 0, %x
@@ -1211,10 +1095,9 @@ define i32 @nabs_abs_x12(i32 %x) {
 
 define i32 @nabs_abs_x13(i32 %x) {
 ; CHECK-LABEL: @nabs_abs_x13(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SUB16:%.*]] = sub nsw i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[SUB16]]
 ;
   %cmp = icmp sgt i32 %x, -1
   %sub = sub nsw i32 0, %x
@@ -1227,10 +1110,9 @@ define i32 @nabs_abs_x13(i32 %x) {
 
 define i32 @nabs_abs_x14(i32 %x) {
 ; CHECK-LABEL: @nabs_abs_x14(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SUB16:%.*]] = sub nsw i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[SUB16]]
 ;
   %cmp = icmp sgt i32 %x, 0
   %sub = sub nsw i32 0, %x
@@ -1243,10 +1125,9 @@ define i32 @nabs_abs_x14(i32 %x) {
 
 define i32 @nabs_abs_x15(i32 %x) {
 ; CHECK-LABEL: @nabs_abs_x15(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SUB16:%.*]] = sub nsw i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[SUB16]]
 ;
   %cmp = icmp slt i32 %x, 0
   %sub = sub nsw i32 0, %x
@@ -1259,10 +1140,9 @@ define i32 @nabs_abs_x15(i32 %x) {
 
 define i32 @nabs_abs_x16(i32 %x) {
 ; CHECK-LABEL: @nabs_abs_x16(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SUB16:%.*]] = sub nsw i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[SUB16]]
 ;
   %cmp = icmp slt i32 %x, 1
   %sub = sub nsw i32 0, %x
@@ -1276,10 +1156,9 @@ define i32 @nabs_abs_x16(i32 %x) {
 
 ; nabs(abs(-x)) -> nabs(-x) -> nabs(x)
 define i32 @nabs_abs_x17(i32 %x) {
 ; CHECK-LABEL: @nabs_abs_x17(
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[X:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X]], 0
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[SUB]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SUB16:%.*]] = sub nsw i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[SUB16]]
 ;
   %sub = sub nsw i32 0, %x
   %cmp = icmp sgt i32 %sub, -1
@@ -1294,10 +1173,9 @@ define i32 @nabs_abs_x17(i32 %x) {
 
 define i32 @nabs_abs_x18(i32 %x, i32 %y) {
 ; CHECK-LABEL: @nabs_abs_x18(
 ; CHECK-NEXT:    [[A:%.*]] = sub nsw i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[A]], 0
-; CHECK-NEXT:    [[NEGA:%.*]] = sub i32 0, [[A]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[A]], i32 [[NEGA]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[A]], i1 false)
+; CHECK-NEXT:    [[COND18:%.*]] = sub nsw i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[COND18]]
 ;
   %a = sub nsw i32 %x, %y
   %b = sub nsw i32 %y, %x
@@ -1312,10 +1190,9 @@ define i32 @nabs_abs_x18(i32 %x, i32 %y) {
 
 ; nabs(abs(-x)) -> nabs(-x) -> nabs(x)
 define <2 x i32> @nabs_abs_x01_vec(<2 x i32> %x) {
 ; CHECK-LABEL: @nabs_abs_x01_vec(
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw <2 x i32> zeroinitializer, [[X:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i32> [[X]], zeroinitializer
-; CHECK-NEXT:    [[COND:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[X]], <2 x i32> [[SUB]]
-; CHECK-NEXT:    ret <2 x i32> [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SUB16:%.*]] = sub nsw <2 x i32> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    ret <2 x i32> [[SUB16]]
 ;
   %sub = sub nsw <2 x i32> zeroinitializer, %x
   %cmp = icmp sgt <2 x i32> %sub, <i32 -1, i32 -1>
@@ -1330,10 +1207,9 @@ define <2 x i32> @nabs_abs_x01_vec(<2 x i32> %x) {
 
 define <2 x i32> @nabs_abs_x02_vec(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @nabs_abs_x02_vec(
 ; CHECK-NEXT:    [[A:%.*]] = sub nsw <2 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i32> [[A]], zeroinitializer
-; CHECK-NEXT:    [[NEGA:%.*]] = sub <2 x i32> zeroinitializer, [[A]]
-; CHECK-NEXT:    [[COND:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[A]], <2 x i32> [[NEGA]]
-; CHECK-NEXT:    ret <2 x i32> [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[A]], i1 false)
+; CHECK-NEXT:    [[COND18:%.*]] = sub nsw <2 x i32> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    ret <2 x i32> [[COND18]]
 ;
   %a = sub nsw <2 x i32> %x, %y
   %b = sub nsw <2 x i32> %y, %x
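
All of the abs/nabs updates above follow a single canonicalization: the compare-negate-select idiom becomes one call to the llvm.abs intrinsic. A minimal standalone sketch that can be fed to `opt -instcombine` (hypothetical file, not part of this patch):

define i32 @abs_idiom(i32 %x) {
  %neg = sub nsw i32 0, %x                 ; nsw: negating INT_MIN would wrap
  %cmp = icmp slt i32 %x, 0
  %abs = select i1 %cmp, i32 %neg, i32 %x  ; classic abs(x) select
  ret i32 %abs
  ; expected with this patch applied:
  ;   %abs = call i32 @llvm.abs.i32(i32 %x, i1 true)
}

The i1 immediate is the is-INT_MIN-poison flag: the checks above expect `i1 true` only when the negation in the source pattern carried nsw, and `i1 false` otherwise (for example the x18 tests, whose inner negation is a plain sub).
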
diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll
index 8ca24caa2aa1b..a988eea894450 100644
--- a/llvm/test/Transforms/InstCombine/assume.ll
+++ b/llvm/test/Transforms/InstCombine/assume.ll
@@ -346,6 +346,7 @@ define i32 @assumption_conflicts_with_known_bits(i32 %a, i32 %b) {
 
 define void @debug_interference(i8 %x) {
 ; CHECK-LABEL: @debug_interference(
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i8 [[X:%.*]], 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 false)
 ; CHECK-NEXT:    tail call void @llvm.dbg.value(metadata i32 5, [[META7:metadata !.*]], metadata !DIExpression()), [[DBG9:!dbg !.*]]
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 false)
 ; CHECK-NEXT:    tail call void @llvm.dbg.value(metadata i32 5, [[META7]], metadata !DIExpression()), [[DBG9]]
diff --git a/llvm/test/Transforms/InstCombine/call-callconv.ll b/llvm/test/Transforms/InstCombine/call-callconv.ll
index 0cb2c55f9fda5..58a0cf21b24ee 100644
--- a/llvm/test/Transforms/InstCombine/call-callconv.ll
+++ b/llvm/test/Transforms/InstCombine/call-callconv.ll
@@ -6,10 +6,8 @@
 define arm_aapcscc i32 @_abs(i32 %i) nounwind readnone {
 ; CHECK-LABEL: @_abs(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt i32 [[I:%.*]], 0
-; CHECK-NEXT:    [[NEG:%.*]] = sub nsw i32 0, [[I]]
-; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 [[NEG]], i32 [[I]]
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[I:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %call = tail call arm_aapcscc i32 @abs(i32 %i) nounwind readnone
   ret i32 %call
@@ -19,10 +17,8 @@ declare arm_aapcscc i32 @abs(i32) nounwind readnone
 
 define arm_aapcscc i32 @_labs(i32 %i) nounwind readnone {
 ; CHECK-LABEL: @_labs(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt i32 [[I:%.*]], 0
-; CHECK-NEXT:    [[NEG:%.*]] = sub nsw i32 0, [[I]]
-; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 [[NEG]], i32 [[I]]
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[I:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %call = tail call arm_aapcscc i32 @labs(i32 %i) nounwind readnone
   ret i32 %call
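
The call-callconv.ll hunks show the libcall route reaching the same canonical form: a direct call to abs()/labs() is folded to llvm.abs even under a non-default calling convention. A hedged sketch of the shape being exercised (function name invented):

declare i32 @abs(i32)

define i32 @fold_abs_libcall(i32 %i) {
  ; abs() has undefined behavior for INT_MIN, so the poison flag may be true
  %r = tail call i32 @abs(i32 %i)
  ret i32 %r
  ; expected: %r = call i32 @llvm.abs.i32(i32 %i, i1 true)
}
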
diff --git a/llvm/test/Transforms/InstCombine/cttz-abs.ll b/llvm/test/Transforms/InstCombine/cttz-abs.ll
index b89a55c8f5b87..ea536f22f14b7 100644
--- a/llvm/test/Transforms/InstCombine/cttz-abs.ll
+++ b/llvm/test/Transforms/InstCombine/cttz-abs.ll
@@ -105,10 +105,8 @@ define i64 @cttz_abs_64(i64 %x) {
 
 define i32 @cttz_abs_multiuse(i32 %x) {
 ; CHECK-LABEL: @cttz_abs_multiuse(
-; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[S:%.*]] = sub i32 0, [[X]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[C]], i32 [[S]], i32 [[X]]
-; CHECK-NEXT:    call void @use_abs(i32 [[D]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 false)
+; CHECK-NEXT:    call void @use_abs(i32 [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[X]], i1 true), [[RNG0]]
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
@@ -122,9 +120,8 @@ define i32 @cttz_abs_multiuse(i32 %x) {
 
 define i32 @cttz_nabs_multiuse(i32 %x) {
 ; CHECK-LABEL: @cttz_nabs_multiuse(
-; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[S:%.*]] = sub i32 0, [[X]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[C]], i32 [[X]], i32 [[S]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 false)
+; CHECK-NEXT:    [[D:%.*]] = sub i32 0, [[TMP1]]
 ; CHECK-NEXT:    call void @use_abs(i32 [[D]])
 ; CHECK-NEXT:    [[R:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[X]], i1 true), [[RNG0]]
 ; CHECK-NEXT:    ret i32 [[R]]
diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll
index 683518121789c..da2161a0bc9f7 100644
--- a/llvm/test/Transforms/InstCombine/icmp.ll
+++ b/llvm/test/Transforms/InstCombine/icmp.ll
@@ -2996,10 +2996,8 @@ define i32 @f5(i8 %a, i8 %b) {
 ; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[A:%.*]] to i32
 ; CHECK-NEXT:    [[CONV3:%.*]] = zext i8 [[B:%.*]] to i32
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV3]]
-; CHECK-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[SUB]], 0
-; CHECK-NEXT:    [[SUB7:%.*]] = sub nsw i32 0, [[SUB]]
-; CHECK-NEXT:    [[SUB7_SUB:%.*]] = select i1 [[CMP4]], i32 [[SUB7]], i32 [[SUB]]
-; CHECK-NEXT:    ret i32 [[SUB7_SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[SUB]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %conv = zext i8 %a to i32
   %conv3 = zext i8 %b to i32
@@ -3593,10 +3591,8 @@ define i1 @knownbits8(i8 %a, i8 %b) {
 
 define i32 @abs_preserve(i32 %x) {
 ; CHECK-LABEL: @abs_preserve(
 ; CHECK-NEXT:    [[A:%.*]] = shl nsw i32 [[X:%.*]], 1
-; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[A]], 0
-; CHECK-NEXT:    [[NEGA:%.*]] = sub i32 0, [[A]]
-; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[C]], i32 [[NEGA]], i32 [[A]]
-; CHECK-NEXT:    ret i32 [[ABS]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[A]], i1 false)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %a = mul nsw i32 %x, 2
   %c = icmp sge i32 %a, 0
@@ -3634,10 +3630,8 @@ define <2 x i1> @PR36583(<2 x i8*>) {
 
 ; fold (icmp pred (sub (0, X)) C1) for vec type
 define <2 x i32> @Op1Negated_Vec(<2 x i32> %x) {
 ; CHECK-LABEL: @Op1Negated_Vec(
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw <2 x i32> zeroinitializer, [[X:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i32> [[X]], zeroinitializer
-; CHECK-NEXT:    [[COND:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[SUB]], <2 x i32> [[X]]
-; CHECK-NEXT:    ret <2 x i32> [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret <2 x i32> [[TMP1]]
 ;
   %sub = sub nsw <2 x i32> zeroinitializer, %x
   %cmp = icmp sgt <2 x i32> %sub, <i32 -1, i32 -1>
diff --git a/llvm/test/Transforms/InstCombine/max-of-nots.ll b/llvm/test/Transforms/InstCombine/max-of-nots.ll
index e6649d70946b7..1b551f9f9b510 100644
--- a/llvm/test/Transforms/InstCombine/max-of-nots.ll
+++ b/llvm/test/Transforms/InstCombine/max-of-nots.ll
@@ -240,12 +240,10 @@ define i32 @abs_of_min_of_not(i32 %x, i32 %y) {
 ; CHECK-LABEL: @abs_of_min_of_not(
 ; CHECK-NEXT:    [[XORD:%.*]] = xor i32 [[X:%.*]], -1
 ; CHECK-NEXT:    [[YADD:%.*]] = add i32 [[Y:%.*]], 2
-; CHECK-NEXT:    [[COND_I:%.*]] = icmp slt i32 [[YADD]], [[XORD]]
-; CHECK-NEXT:    [[MIN:%.*]] = select i1 [[COND_I]], i32 [[YADD]], i32 [[XORD]]
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[MIN]], 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 0, [[MIN]]
-; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP2]], i32 [[SUB]], i32 [[MIN]]
-; CHECK-NEXT:    ret i32 [[ABS]]
+; CHECK-NEXT:    [[COND_I_NOT:%.*]] = icmp slt i32 [[YADD]], [[XORD]]
+; CHECK-NEXT:    [[MIN:%.*]] = select i1 [[COND_I_NOT]], i32 [[YADD]], i32 [[XORD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[MIN]], i1 false)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %xord = xor i32 %x, -1
diff --git a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll
new file mode 100644
index 0000000000000..797f85d944474
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+declare i8 @llvm.umin.i8(i8, i8)
+declare i8 @llvm.umax.i8(i8, i8)
+declare i8 @llvm.smin.i8(i8, i8)
+declare i8 @llvm.smax.i8(i8, i8)
+
+define i8 @umin_known_bits(i8 %x, i8 %y) {
+; CHECK-LABEL: @umin_known_bits(
+; CHECK-NEXT:    ret i8 0
+;
+  %x2 = and i8 %x, 127
+  %m = call i8 @llvm.umin.i8(i8 %x2, i8 %y)
+  %r = and i8 %m, -128
+  ret i8 %r
+}
+
+define i8 @umax_known_bits(i8 %x, i8 %y) {
+; CHECK-LABEL: @umax_known_bits(
+; CHECK-NEXT:    ret i8 -128
+;
+  %x2 = or i8 %x, -128
+  %m = call i8 @llvm.umax.i8(i8 %x2, i8 %y)
+  %r = and i8 %m, -128
+  ret i8 %r
+}
+
+define i8 @smin_known_bits(i8 %x, i8 %y) {
+; CHECK-LABEL: @smin_known_bits(
+; CHECK-NEXT:    ret i8 -128
+;
+  %x2 = or i8 %x, -128
+  %m = call i8 @llvm.smin.i8(i8 %x2, i8 %y)
+  %r = and i8 %m, -128
+  ret i8 %r
+}
+
+define i8 @smax_known_bits(i8 %x, i8 %y) {
+; CHECK-LABEL: @smax_known_bits(
+; CHECK-NEXT:    ret i8 0
+;
+  %x2 = and i8 %x, 127
+  %m = call i8 @llvm.smax.i8(i8 %x2, i8 %y)
+  %r = and i8 %m, -128
+  ret i8 %r
+}
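
The new minmax-intrinsics.ll file leans on basic known-bits facts: umin never exceeds either operand, so a cleared high bit in one operand clears it in the result; umax never drops below either operand, so a forced high bit propagates; smin/smax play the same game with the sign bit. A worked instance mirroring the umin case (illustrative, not from the patch):

declare i8 @llvm.umin.i8(i8, i8)

define i8 @umin_top_bit_clear(i8 %x, i8 %y) {
  %x2 = and i8 %x, 127                       ; high bit of %x2 is known zero
  %m = call i8 @llvm.umin.i8(i8 %x2, i8 %y)  ; umin(%x2, %y) <= %x2, so its high bit is zero too
  %r = and i8 %m, -128                       ; keeps only the high bit
  ret i8 %r                                  ; constant-folds to 0
}
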
diff --git a/llvm/test/Transforms/InstCombine/pow-1.ll b/llvm/test/Transforms/InstCombine/pow-1.ll
index 724f004e6ca99..dfb62f6d0af0e 100644
--- a/llvm/test/Transforms/InstCombine/pow-1.ll
+++ b/llvm/test/Transforms/InstCombine/pow-1.ll
@@ -247,8 +247,8 @@ define <2 x double> @test_simplify6v(<2 x double> %x) {
 
 ; Check pow(x, 0.5) -> fabs(sqrt(x)), where x != -infinity.
-define float @test_simplify7(float %x) {
-; CHECK-LABEL: @test_simplify7(
+define float @powf_libcall_to_select_sqrt(float %x) {
+; CHECK-LABEL: @powf_libcall_to_select_sqrt(
 ; ANY-NEXT:    [[SQRTF:%.*]] = call float @sqrtf(float [[X:%.*]])
 ; ANY-NEXT:    [[ABS:%.*]] = call float @llvm.fabs.f32(float [[SQRTF]])
 ; ANY-NEXT:    [[ISINF:%.*]] = fcmp oeq float [[X]], 0xFFF0000000000000
@@ -275,8 +275,8 @@ define float @test_simplify7(float %x) {
   ret float %retval
 }
 
-define double @test_simplify8(double %x) {
-; CHECK-LABEL: @test_simplify8(
+define double @pow_libcall_to_select_sqrt(double %x) {
+; CHECK-LABEL: @pow_libcall_to_select_sqrt(
 ; LIB-NEXT:    [[SQRT:%.*]] = call double @sqrt(double [[X:%.*]])
 ; LIB-NEXT:    [[ABS:%.*]] = call double @llvm.fabs.f64(double [[SQRT]])
 ; LIB-NEXT:    [[ISINF:%.*]] = fcmp oeq double [[X]], 0xFFF0000000000000
diff --git a/llvm/test/Transforms/InstCombine/pow-4.ll b/llvm/test/Transforms/InstCombine/pow-4.ll
index 4aac27fe72f0c..23cc2d801a160 100644
--- a/llvm/test/Transforms/InstCombine/pow-4.ll
+++ b/llvm/test/Transforms/InstCombine/pow-4.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -instcombine -S < %s | FileCheck %s
+; RUN: opt -instcombine -S < %s | FileCheck %s --check-prefixes=CHECK,SQRT
+; RUN: opt -instcombine -S < %s -disable-builtin sqrt | FileCheck %s --check-prefixes=CHECK,NOSQRT
 
 declare double @llvm.pow.f64(double, double)
 declare float @llvm.pow.f32(float, float)
@@ -151,31 +152,41 @@ define double @test_simplify_neg_16_5(double %x) {
 }
 
 ; pow(x, 16.5) with double
+
 define double @test_simplify_16_5_libcall(double %x) {
-; CHECK-LABEL: @test_simplify_16_5_libcall(
-; CHECK-NEXT:    [[SQRT:%.*]] = call fast double @sqrt(double [[X:%.*]])
-; CHECK-NEXT:    [[SQUARE:%.*]] = fmul fast double [[X]], [[X]]
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast double [[SQUARE]], [[SQUARE]]
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast double [[TMP1]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast double [[TMP2]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast double [[TMP3]], [[SQRT]]
-; CHECK-NEXT:    ret double [[TMP4]]
+; SQRT-LABEL: @test_simplify_16_5_libcall(
+; SQRT-NEXT:    [[SQRT:%.*]] = call fast double @sqrt(double [[X:%.*]])
+; SQRT-NEXT:    [[SQUARE:%.*]] = fmul fast double [[X]], [[X]]
+; SQRT-NEXT:    [[TMP1:%.*]] = fmul fast double [[SQUARE]], [[SQUARE]]
+; SQRT-NEXT:    [[TMP2:%.*]] = fmul fast double [[TMP1]], [[TMP1]]
+; SQRT-NEXT:    [[TMP3:%.*]] = fmul fast double [[TMP2]], [[TMP2]]
+; SQRT-NEXT:    [[TMP4:%.*]] = fmul fast double [[TMP3]], [[SQRT]]
+; SQRT-NEXT:    ret double [[TMP4]]
+;
+; NOSQRT-LABEL: @test_simplify_16_5_libcall(
+; NOSQRT-NEXT:    [[TMP1:%.*]] = call fast double @pow(double [[X:%.*]], double 1.650000e+01)
+; NOSQRT-NEXT:    ret double [[TMP1]]
 ;
   %1 = call fast double @pow(double %x, double 1.650000e+01)
   ret double %1
 }
 
 ; pow(x, -16.5) with double
+
 define double @test_simplify_neg_16_5_libcall(double %x) {
-; CHECK-LABEL: @test_simplify_neg_16_5_libcall(
-; CHECK-NEXT:    [[SQRT:%.*]] = call fast double @sqrt(double [[X:%.*]])
-; CHECK-NEXT:    [[SQUARE:%.*]] = fmul fast double [[X]], [[X]]
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast double [[SQUARE]], [[SQUARE]]
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast double [[TMP1]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast double [[TMP2]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast double [[TMP3]], [[SQRT]]
-; CHECK-NEXT:    [[RECIPROCAL:%.*]] = fdiv fast double 1.000000e+00, [[TMP4]]
-; CHECK-NEXT:    ret double [[RECIPROCAL]]
+; SQRT-LABEL: @test_simplify_neg_16_5_libcall(
+; SQRT-NEXT:    [[SQRT:%.*]] = call fast double @sqrt(double [[X:%.*]])
+; SQRT-NEXT:    [[SQUARE:%.*]] = fmul fast double [[X]], [[X]]
+; SQRT-NEXT:    [[TMP1:%.*]] = fmul fast double [[SQUARE]], [[SQUARE]]
+; SQRT-NEXT:    [[TMP2:%.*]] = fmul fast double [[TMP1]], [[TMP1]]
+; SQRT-NEXT:    [[TMP3:%.*]] = fmul fast double [[TMP2]], [[TMP2]]
+; SQRT-NEXT:    [[TMP4:%.*]] = fmul fast double [[TMP3]], [[SQRT]]
+; SQRT-NEXT:    [[RECIPROCAL:%.*]] = fdiv fast double 1.000000e+00, [[TMP4]]
+; SQRT-NEXT:    ret double [[RECIPROCAL]]
+;
+; NOSQRT-LABEL: @test_simplify_neg_16_5_libcall(
+; NOSQRT-NEXT:    [[TMP1:%.*]] = call fast double @pow(double [[X:%.*]], double -1.650000e+01)
+; NOSQRT-NEXT:    ret double [[TMP1]]
 ;
   %1 = call fast double @pow(double %x, double -1.650000e+01)
   ret double %1
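
The pow-4.ll split exists because the x^16.5 expansion emits a sqrt libcall: x^16.5 = x^16 * x^0.5, where x^16 comes from four repeated squarings and x^0.5 from sqrt, valid under fast-math. When sqrt is unavailable (-disable-builtin sqrt), the pow call has to survive, hence the NOSQRT prefix. A sketch of the expansion itself (names invented):

declare double @sqrt(double)

define double @pow_x_16_5(double %x) {
  %s   = call fast double @sqrt(double %x)  ; x^0.5
  %x2  = fmul fast double %x, %x            ; x^2
  %x4  = fmul fast double %x2, %x2          ; x^4
  %x8  = fmul fast double %x4, %x4          ; x^8
  %x16 = fmul fast double %x8, %x8          ; x^16
  %r   = fmul fast double %x16, %s          ; x^16.5
  ret double %r
}
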
diff --git a/llvm/test/Transforms/InstCombine/rem.ll b/llvm/test/Transforms/InstCombine/rem.ll
index c833acc16853f..37d81f2ebf6a0 100644
--- a/llvm/test/Transforms/InstCombine/rem.ll
+++ b/llvm/test/Transforms/InstCombine/rem.ll
@@ -49,9 +49,8 @@ define i8 @big_divisor(i8 %x) {
 
 define i5 @biggest_divisor(i5 %x) {
 ; CHECK-LABEL: @biggest_divisor(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i5 [[X:%.*]], -1
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i1 [[TMP1]] to i5
-; CHECK-NEXT:    [[REM:%.*]] = add i5 [[TMP2]], [[X]]
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i5 [[X:%.*]], -1
+; CHECK-NEXT:    [[REM:%.*]] = select i1 [[DOTNOT]], i5 0, i5 [[X]]
 ; CHECK-NEXT:    ret i5 [[REM]]
 ;
   %rem = urem i5 %x, -1
@@ -128,8 +127,8 @@ define i8 @urem2(i8 %x, i8 %y) {
 
 define i8 @urem3(i8 %x) {
 ; CHECK-LABEL: @urem3(
 ; CHECK-NEXT:    [[TMP1:%.*]] = urem i8 [[X:%.*]], 3
-; CHECK-NEXT:    [[B1:%.*]] = sub i8 [[X]], [[TMP1]]
-; CHECK-NEXT:    [[C:%.*]] = add i8 [[B1]], [[X]]
+; CHECK-NEXT:    [[B_NEG:%.*]] = sub i8 [[X]], [[TMP1]]
+; CHECK-NEXT:    [[C:%.*]] = add i8 [[B_NEG]], [[X]]
 ; CHECK-NEXT:    ret i8 [[C]]
 ;
   %A = udiv i8 %x, 3
@@ -377,10 +376,10 @@ define i32 @test17(i32 %X) {
 
 define i32 @test18(i16 %x, i32 %y) {
 ; CHECK-LABEL: @test18(
 ; CHECK-NEXT:    [[TMP1:%.*]] = and i16 [[X:%.*]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i16 [[TMP1]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i32 63, i32 31
-; CHECK-NEXT:    [[TMP4:%.*]] = and i32 [[TMP3]], [[Y:%.*]]
-; CHECK-NEXT:    ret i32 [[TMP4]]
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[DOTNOT]], i32 63, i32 31
+; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[TMP2]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
 ;
   %1 = and i16 %x, 4
   %2 = icmp ne i16 %1, 0
@@ -477,10 +476,10 @@ define i32 @test21(i1 %c0, i32* %p) {
 ; CHECK-NEXT:    br i1 [[C0:%.*]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[V:%.*]] = load volatile i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT:    [[PHITMP:%.*]] = srem i32 [[V]], 5
+; CHECK-NEXT:    [[PHI_BO:%.*]] = srem i32 [[V]], 5
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
-; CHECK-NEXT:    [[LHS:%.*]] = phi i32 [ [[PHITMP]], [[IF_THEN]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[LHS:%.*]] = phi i32 [ [[PHI_BO]], [[IF_THEN]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    ret i32 [[LHS]]
 ;
 entry:
@@ -606,10 +605,10 @@ define i32 @pr27968_3(i1 %c0, i1 %always_false, i32* %p) {
 ; CHECK-NEXT:    br i1 [[C0:%.*]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[V:%.*]] = load volatile i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT:    [[PHITMP:%.*]] = and i32 [[V]], 2147483647
+; CHECK-NEXT:    [[PHI_BO:%.*]] = and i32 [[V]], 2147483647
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
-; CHECK-NEXT:    [[LHS:%.*]] = phi i32 [ [[PHITMP]], [[IF_THEN]] ], [ 5, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[LHS:%.*]] = phi i32 [ [[PHI_BO]], [[IF_THEN]] ], [ 5, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br i1 [[ALWAYS_FALSE:%.*]], label [[REM_IS_SAFE:%.*]], label [[REM_IS_UNSAFE:%.*]]
 ; CHECK:       rem.is.safe:
 ; CHECK-NEXT:    ret i32 [[LHS]]
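
The biggest_divisor change in rem.ll encodes a small value-range fact: for i5, the divisor -1 is the maximum unsigned value 31, so urem %x, -1 can only produce 0 (when %x equals the divisor) or %x itself, and a select states that directly. A sketch of the equivalence (i5 chosen to match the test):

define i5 @urem_by_all_ones(i5 %x) {
  ; urem i5 %x, -1: the divisor is 31, the largest unsigned i5 value,
  ; so the result is 0 when %x == 31 and %x otherwise
  %is_max = icmp eq i5 %x, -1
  %rem = select i1 %is_max, i5 0, i5 %x
  ret i5 %rem
}
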
diff --git a/llvm/test/Transforms/InstCombine/select-binop-cmp.ll b/llvm/test/Transforms/InstCombine/select-binop-cmp.ll
index a473acd730493..aa450f8af8b7e 100644
--- a/llvm/test/Transforms/InstCombine/select-binop-cmp.ll
+++ b/llvm/test/Transforms/InstCombine/select-binop-cmp.ll
@@ -18,8 +18,8 @@ define i32 @select_xor_icmp(i32 %x, i32 %y, i32 %z) {
 
 define i32 @select_xor_icmp2(i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: @select_xor_icmp2(
-; CHECK-NEXT:    [[A:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], i32 [[Z:%.*]], i32 [[Y:%.*]]
+; CHECK-NEXT:    [[A_NOT:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A_NOT]], i32 [[Z:%.*]], i32 [[Y:%.*]]
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
   %A = icmp ne i32 %x, 0
@@ -527,9 +527,9 @@ define i32 @select_xor_fcmp_bad_4(i32 %x, i32 %y, i32 %z, float %k) {
 
 define i32 @select_xor_icmp_bad_5(i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: @select_xor_icmp_bad_5(
-; CHECK-NEXT:    [[A:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[A_NOT:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT:    [[B:%.*]] = xor i32 [[X]], [[Z:%.*]]
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], i32 [[Y:%.*]], i32 [[B]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A_NOT]], i32 [[Y:%.*]], i32 [[B]]
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
   %A = icmp ne i32 %x, 0
@@ -540,9 +540,9 @@ define i32 @select_xor_icmp_bad_5(i32 %x, i32 %y, i32 %z) {
 
 define i32 @select_xor_icmp_bad_6(i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: @select_xor_icmp_bad_6(
-; CHECK-NEXT:    [[A:%.*]] = icmp eq i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[A_NOT:%.*]] = icmp eq i32 [[X:%.*]], 1
 ; CHECK-NEXT:    [[B:%.*]] = xor i32 [[X]], [[Z:%.*]]
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], i32 [[B]], i32 [[Y:%.*]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A_NOT]], i32 [[B]], i32 [[Y:%.*]]
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
   %A = icmp ne i32 %x, 1
@@ -564,12 +564,10 @@ define <2 x i8> @select_xor_icmp_vec_bad(<2 x i8> %x, <2 x i8> %y, <2 x i8> %z) {
   ret <2 x i8> %C
 }
 
-; TODO: support for undefs, check for an identity constant does not handle them yet
-define <2 x i8> @select_xor_icmp_vec_bad_2(<2 x i8> %x, <2 x i8> %y, <2 x i8> %z) {
-; CHECK-LABEL: @select_xor_icmp_vec_bad_2(
+define <2 x i8> @select_xor_icmp_vec_undef(<2 x i8> %x, <2 x i8> %y, <2 x i8> %z) {
+; CHECK-LABEL: @select_xor_icmp_vec_undef(
 ; CHECK-NEXT:    [[A:%.*]] = icmp eq <2 x i8> [[X:%.*]], <i8 0, i8 undef>
-; CHECK-NEXT:    [[B:%.*]] = xor <2 x i8> [[X]], [[Z:%.*]]
-; CHECK-NEXT:    [[C:%.*]] = select <2 x i1> [[A]], <2 x i8> [[B]], <2 x i8> [[Y:%.*]]
+; CHECK-NEXT:    [[C:%.*]] = select <2 x i1> [[A]], <2 x i8> [[Z:%.*]], <2 x i8> [[Y:%.*]]
 ; CHECK-NEXT:    ret <2 x i8> [[C]]
 ;
   %A = icmp eq <2 x i8> %x, <i8 0, i8 undef>
@@ -604,11 +602,10 @@ define i32 @select_add_icmp_bad(i32 %x, i32 %y, i32 %z) {
   ret i32 %C
 }
 
-define i32 @select_and_icmp_bad(i32 %x, i32 %y, i32 %z) {
-; CHECK-LABEL: @select_and_icmp_bad(
+define i32 @select_and_icmp_zero(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @select_and_icmp_zero(
 ; CHECK-NEXT:    [[A:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[B:%.*]] = and i32 [[X]], [[Z:%.*]]
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], i32 [[B]], i32 [[Y:%.*]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], i32 0, i32 [[Y:%.*]]
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
   %A = icmp eq i32 %x, 0
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index 0ac9c699b1ddb..b7c4cb5c6420b 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -1924,8 +1924,8 @@ define i32 @select_dominance_chain(i1 %cond, i32 %x, i32 %y) {
 ; CHECK:       if.false.3:
 ; CHECK-NEXT:    br label [[MERGE_3]]
 ; CHECK:       merge.3:
-; CHECK-NEXT:    [[S_3:%.*]] = phi i32 [ [[Y:%.*]], [[IF_FALSE_3]] ], [ [[X:%.*]], [[IF_TRUE_3]] ]
-; CHECK-NEXT:    [[SUM_2:%.*]] = mul i32 [[S_3]], 3
+; CHECK-NEXT:    [[S_1:%.*]] = phi i32 [ [[Y:%.*]], [[IF_FALSE_3]] ], [ [[X:%.*]], [[IF_TRUE_3]] ]
+; CHECK-NEXT:    [[SUM_2:%.*]] = mul i32 [[S_1]], 3
 ; CHECK-NEXT:    ret i32 [[SUM_2]]
 ;
 entry:
@@ -2587,3 +2587,111 @@ define void @select_freeze_icmp_multuses(i32 %x, i32 %y) {
   call void @use_i1_i32(i1 %c.fr, i32 %v)
   ret void
 }
+
+define i32 @pr47322_more_poisonous_replacement(i32 %arg) {
+; CHECK-LABEL: @pr47322_more_poisonous_replacement(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[ARG:%.*]], 0
+; CHECK-NEXT:    [[TRAILING:%.*]] = call i32 @llvm.cttz.i32(i32 [[ARG]], i1 immarg true), [[RNG0:!range !.*]]
+; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[ARG]], [[TRAILING]]
+; CHECK-NEXT:    [[R1_SROA_0_1:%.*]] = select i1 [[CMP]], i32 0, i32 [[SHIFTED]]
+; CHECK-NEXT:    ret i32 [[R1_SROA_0_1]]
+;
+  %cmp = icmp eq i32 %arg, 0
+  %trailing = call i32 @llvm.cttz.i32(i32 %arg, i1 immarg true)
+  %shifted = lshr i32 %arg, %trailing
+  %r1.sroa.0.1 = select i1 %cmp, i32 0, i32 %shifted
+  ret i32 %r1.sroa.0.1
+}
+
+define i8 @select_replacement_add_eq(i8 %x, i8 %y) {
+; CHECK-LABEL: @select_replacement_add_eq(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[X:%.*]], 1
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i8 2, i8 [[Y:%.*]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %cmp = icmp eq i8 %x, 1
+  %add = add i8 %x, 1
+  %sel = select i1 %cmp, i8 %add, i8 %y
+  ret i8 %sel
+}
+
+define i8 @select_replacement_add_ne(i8 %x, i8 %y) {
+; CHECK-LABEL: @select_replacement_add_ne(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i8 [[X:%.*]], 1
+; CHECK-NEXT:    call void @use(i1 [[CMP]])
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i8 [[Y:%.*]], i8 2
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %cmp = icmp ne i8 %x, 1
+  call void @use(i1 %cmp)
+  %add = add i8 %x, 1
+  %sel = select i1 %cmp, i8 %y, i8 %add
+  ret i8 %sel
+}
+
+define i8 @select_replacement_add_nuw(i8 %x, i8 %y) {
+; CHECK-LABEL: @select_replacement_add_nuw(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[X:%.*]], 1
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i8 2, i8 [[Y:%.*]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %cmp = icmp eq i8 %x, 1
+  %add = add nuw i8 %x, 1
+  %sel = select i1 %cmp, i8 %add, i8 %y
+  ret i8 %sel
+}
+
+define i8 @select_replacement_sub(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @select_replacement_sub(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i8 0, i8 [[Z:%.*]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %cmp = icmp eq i8 %x, %y
+  %sub = sub i8 %x, %y
+  %sel = select i1 %cmp, i8 %sub, i8 %z
+  ret i8 %sel
+}
+
+define i8 @select_replacement_shift(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @select_replacement_shift(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr exact i8 [[X:%.*]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[SHR]], [[Y:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i8 [[X]], i8 [[Z:%.*]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %shr = lshr exact i8 %x, 1
+  %cmp = icmp eq i8 %shr, %y
+  %shl = shl i8 %y, 1
+  %sel = select i1 %cmp, i8 %shl, i8 %z
+  ret i8 %sel
+}
+
+define i8 @select_replacement_loop(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @select_replacement_loop(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i8 [[X]], i8 [[Z:%.*]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %cmp = icmp eq i8 %x, %y
+  %sel = select i1 %cmp, i8 %x, i8 %z
+  ret i8 %sel
+}
+
+define i32 @select_replacement_loop2(i32 %arg, i32 %arg2) {
+; CHECK-LABEL: @select_replacement_loop2(
+; CHECK-NEXT:    [[DIV:%.*]] = udiv i32 [[ARG:%.*]], [[ARG2:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], [[ARG2]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[MUL]], [[ARG]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 [[DIV]], i32 undef
+; CHECK-NEXT:    ret i32 [[SEL]]
+;
+  %div = udiv i32 %arg, %arg2
+  %mul = mul nsw i32 %div, %arg2
+  %cmp = icmp eq i32 %mul, %arg
+  %sel = select i1 %cmp, i32 %div, i32 undef
+  ret i32 %sel
+}
+
+declare void @use(i1)
+declare i32 @llvm.cttz.i32(i32, i1 immarg)
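
The new pr47322_more_poisonous_replacement test documents a correctness limit on select value replacement: when %arg is 0, cttz(%arg, true) is poison and the lshr inherits it, so the select's constant-0 arm is the only well-defined result; folding the select away to %shifted would make the function more poisonous. A reduced illustration of the hazard (hypothetical function, same shape as the test):

declare i32 @llvm.cttz.i32(i32, i1 immarg)

define i32 @shield_poison(i32 %v) {
  %is_zero = icmp eq i32 %v, 0
  %tz = call i32 @llvm.cttz.i32(i32 %v, i1 true)  ; poison when %v == 0
  %odd = lshr i32 %v, %tz                         ; poison propagates here
  %r = select i1 %is_zero, i32 0, i32 %odd        ; select shields the poison arm
  ret i32 %r
}
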
diff --git a/llvm/test/Transforms/InstCombine/select_meta.ll b/llvm/test/Transforms/InstCombine/select_meta.ll
index 67dd246c04082..8d44774cbe49e 100644
--- a/llvm/test/Transforms/InstCombine/select_meta.ll
+++ b/llvm/test/Transforms/InstCombine/select_meta.ll
@@ -104,10 +104,8 @@ define i16 @t7(i32 %a) {
 
 define i32 @abs_nabs_x01(i32 %x) {
 ; CHECK-LABEL: @abs_nabs_x01(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 %x, 0
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, %x
-; CHECK-NEXT:    [[COND1:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 %x, !prof ![[$MD3:[0-9]+]]
-; CHECK-NEXT:    ret i32 [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %cmp = icmp sgt i32 %x, -1
   %sub = sub nsw i32 0, %x
@@ -122,10 +120,8 @@ define i32 @abs_nabs_x01(i32 %x) {
 
 define <2 x i32> @abs_nabs_x01_vec(<2 x i32> %x) {
 ; CHECK-LABEL: @abs_nabs_x01_vec(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i32> %x, zeroinitializer
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw <2 x i32> zeroinitializer, %x
-; CHECK-NEXT:    [[COND1:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[SUB]], <2 x i32> %x, !prof ![[$MD3]]
-; CHECK-NEXT:    ret <2 x i32> [[COND1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[X:%.*]], i1 true)
+; CHECK-NEXT:    ret <2 x i32> [[TMP1]]
 ;
   %cmp = icmp sgt <2 x i32> %x, <i32 -1, i32 -1>
   %sub = sub nsw <2 x i32> zeroinitializer, %x
@@ -154,7 +150,7 @@ define i32 @test30(i32 %x, i32 %y) {
 
 define i32 @test70(i32 %x) {
 ; CHECK-LABEL: @test70(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 %x, 75
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[TMP1]], i32 %x, i32 75, !prof ![[$MD3]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[TMP1]], i32 %x, i32 75, !prof ![[$MD3:[0-9]+]]
 ; CHECK-NEXT:    ret i32 [[COND]]
 ;
   %cmp = icmp slt i32 %x, 75
diff --git a/llvm/test/Transforms/InstCombine/shl-factor.ll b/llvm/test/Transforms/InstCombine/shl-factor.ll
new file mode 100644
index 0000000000000..274d6e3a5e6b2
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/shl-factor.ll
@@ -0,0 +1,281 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare void @use8(i8)
+
+define i6 @add_shl_same_amount(i6 %x, i6 %y, i6 %z) {
+; CHECK-LABEL: @add_shl_same_amount(
+; CHECK-NEXT:    [[XS:%.*]] = shl i6 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[YS:%.*]] = shl i6 [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    [[DIFF:%.*]] = add i6 [[XS]], [[YS]]
+; CHECK-NEXT:    ret i6 [[DIFF]]
+;
+  %xs = shl i6 %x, %z
+  %ys = shl i6 %y, %z
+  %diff = add i6 %xs, %ys
+  ret i6 %diff
+}
+
+define <2 x i4> @add_shl_same_amount_nsw(<2 x i4> %x, <2 x i4> %y, <2 x i4> %z) {
+; CHECK-LABEL: @add_shl_same_amount_nsw(
+; CHECK-NEXT:    [[XS:%.*]] = shl nsw <2 x i4> [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[YS:%.*]] = shl nsw <2 x i4> [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    [[DIFF:%.*]] = add nsw <2 x i4> [[XS]], [[YS]]
+; CHECK-NEXT:    ret <2 x i4> [[DIFF]]
+;
+  %xs = shl nsw <2 x i4> %x, %z
+  %ys = shl nsw <2 x i4> %y, %z
+  %diff = add nsw <2 x i4> %xs, %ys
+  ret <2 x i4> %diff
+}
+
+define i64 @add_shl_same_amount_nuw(i64 %x, i64 %y, i64 %z) {
+; CHECK-LABEL: @add_shl_same_amount_nuw(
+; CHECK-NEXT:    [[XS:%.*]] = shl nuw i64 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[YS:%.*]] = shl nuw i64 [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    [[DIFF:%.*]] = add nuw i64 [[XS]], [[YS]]
+; CHECK-NEXT:    ret i64 [[DIFF]]
+;
+  %xs = shl nuw i64 %x, %z
+  %ys = shl nuw i64 %y, %z
+  %diff = add nuw i64 %xs, %ys
+  ret i64 %diff
+}
+
+define i8 @add_shl_same_amount_nsw_extra_use1(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @add_shl_same_amount_nsw_extra_use1(
+; CHECK-NEXT:    [[XS:%.*]] = shl nuw nsw i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[XS]])
+; CHECK-NEXT:    [[YS:%.*]] = shl nuw nsw i8 [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    [[DIFF:%.*]] = add nsw i8 [[XS]], [[YS]]
+; CHECK-NEXT:    ret i8 [[DIFF]]
+;
+  %xs = shl nsw nuw i8 %x, %z
+  call void @use8(i8 %xs)
+  %ys = shl nsw nuw i8 %y, %z
+  %diff = add nsw i8 %xs, %ys
+  ret i8 %diff
+}
+
+define i8 @add_shl_same_amount_nuw_extra_use2(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @add_shl_same_amount_nuw_extra_use2(
+; CHECK-NEXT:    [[XS:%.*]] = shl nuw i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[YS:%.*]] = shl nuw nsw i8 [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    call void @use8(i8 [[YS]])
+; CHECK-NEXT:    [[DIFF:%.*]] = add nuw nsw i8 [[XS]], [[YS]]
+; CHECK-NEXT:    ret i8 [[DIFF]]
+;
+  %xs = shl nuw i8 %x, %z
+  %ys = shl nsw nuw i8 %y, %z
+  call void @use8(i8 %ys)
+  %diff = add nsw nuw i8 %xs, %ys
+  ret i8 %diff
+}
+
+define i8 @add_shl_same_amount_nsw_nuw_extra_use3(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @add_shl_same_amount_nsw_nuw_extra_use3(
+; CHECK-NEXT:    [[XS:%.*]] = shl nuw nsw i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[XS]])
+; CHECK-NEXT:    [[YS:%.*]] = shl nuw nsw i8 [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    call void @use8(i8 [[YS]])
+; CHECK-NEXT:    [[DIFF:%.*]] = add nuw nsw i8 [[XS]], [[YS]]
+; CHECK-NEXT:    ret i8 [[DIFF]]
+;
+  %xs = shl nsw nuw i8 %x, %z
+  call void @use8(i8 %xs)
+  %ys = shl nsw nuw i8 %y, %z
+  call void @use8(i8 %ys)
+  %diff = add nsw nuw i8 %xs, %ys
+  ret i8 %diff
+}
+
+define i6 @add_shl_same_amount_partial_nsw1(i6 %x, i6 %y, i6 %z) {
+; CHECK-LABEL: @add_shl_same_amount_partial_nsw1(
+; CHECK-NEXT:    [[XS:%.*]] = shl nsw i6 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[YS:%.*]] = shl nsw i6 [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    [[DIFF:%.*]] = add i6 [[XS]], [[YS]]
+; CHECK-NEXT:    ret i6 [[DIFF]]
+;
+  %xs = shl nsw i6 %x, %z
+  %ys = shl nsw i6 %y, %z
+  %diff = add i6 %xs, %ys
+  ret i6 %diff
+}
+
+define i6 @add_shl_same_amount_partial_nsw2(i6 %x, i6 %y, i6 %z) {
+; CHECK-LABEL: @add_shl_same_amount_partial_nsw2(
+; CHECK-NEXT:    [[XS:%.*]] = shl i6 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[YS:%.*]] = shl nsw i6 [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    [[DIFF:%.*]] = add nsw i6 [[XS]], [[YS]]
+; CHECK-NEXT:    ret i6 [[DIFF]]
+;
+  %xs = shl i6 %x, %z
+  %ys = shl nsw i6 %y, %z
+  %diff = add nsw i6 %xs, %ys
+  ret i6 %diff
+}
+
+define i6 @add_shl_same_amount_partial_nuw1(i6 %x, i6 %y, i6 %z) {
+; CHECK-LABEL: @add_shl_same_amount_partial_nuw1(
+; CHECK-NEXT:    [[XS:%.*]] = shl nuw i6 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[YS:%.*]] = shl nuw i6 [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    [[DIFF:%.*]] = add i6 [[XS]], [[YS]]
+; CHECK-NEXT:    ret i6 [[DIFF]]
+;
+  %xs = shl nuw i6 %x, %z
+  %ys = shl nuw i6 %y, %z
+  %diff = add i6 %xs, %ys
+  ret i6 %diff
+}
+
+define i6 @add_shl_same_amount_partial_nuw2(i6 %x, i6 %y, i6 %z) {
+; CHECK-LABEL: @add_shl_same_amount_partial_nuw2(
+; CHECK-NEXT:    [[XS:%.*]] = shl nuw i6 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[YS:%.*]] = shl i6 [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    [[DIFF:%.*]] = add nuw i6 [[XS]], [[YS]]
+; CHECK-NEXT:    ret i6 [[DIFF]]
+;
+  %xs = shl nuw i6 %x, %z
+  %ys = shl i6 %y, %z
+  %diff = add nuw i6 %xs, %ys
+  ret i6 %diff
+}
+
+define i6 @sub_shl_same_amount(i6 %x, i6 %y, i6 %z) {
+; CHECK-LABEL: @sub_shl_same_amount(
+; CHECK-NEXT:    [[XS:%.*]] = shl i6 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[YS:%.*]] = shl i6 [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    [[DIFF:%.*]] = sub i6 [[XS]], [[YS]]
+; CHECK-NEXT:    ret i6 [[DIFF]]
+;
+  %xs = shl i6 %x, %z
+  %ys = shl i6 %y, %z
+  %diff = sub i6 %xs, %ys
+  ret i6 %diff
+}
+
+define <2 x i4> @sub_shl_same_amount_nsw(<2 x i4> %x, <2 x i4> %y, <2 x i4> %z) {
+; CHECK-LABEL: @sub_shl_same_amount_nsw(
+; CHECK-NEXT:    [[XS:%.*]] = shl nsw <2 x i4> [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[YS:%.*]] = shl nsw <2 x i4> [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    [[DIFF:%.*]] = sub nsw <2 x i4> [[XS]], [[YS]]
+; CHECK-NEXT:    ret <2 x i4> [[DIFF]]
+;
+  %xs = shl nsw <2 x i4> %x, %z
+  %ys = shl nsw <2 x i4> %y, %z
+  %diff = sub nsw <2 x i4> %xs, %ys
+  ret <2 x i4> %diff
+}
+
+define i64 @sub_shl_same_amount_nuw(i64 %x, i64 %y, i64 %z) {
+; CHECK-LABEL: @sub_shl_same_amount_nuw(
+; CHECK-NEXT:    [[XS:%.*]] = shl nuw i64 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[YS:%.*]] = shl nuw i64 [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    [[DIFF:%.*]] = sub nuw i64 [[XS]], [[YS]]
+; CHECK-NEXT:    ret i64 [[DIFF]]
+;
+  %xs = shl nuw i64 %x, %z
+  %ys = shl nuw i64 %y, %z
+  %diff = sub nuw i64 %xs, %ys
+  ret i64 %diff
+}
+
+define i8 @sub_shl_same_amount_nsw_extra_use1(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @sub_shl_same_amount_nsw_extra_use1(
+; CHECK-NEXT:    [[XS:%.*]] = shl nuw nsw i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[XS]])
+; CHECK-NEXT:    [[YS:%.*]] = shl nuw nsw i8 [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    [[DIFF:%.*]] = sub nsw i8 [[XS]], [[YS]]
+; CHECK-NEXT:    ret i8 [[DIFF]]
+;
+  %xs = shl nsw nuw i8 %x, %z
+  call void @use8(i8 %xs)
+  %ys = shl nsw nuw i8 %y, %z
+  %diff = sub nsw i8 %xs, %ys
+  ret i8 %diff
+}
+
+define i8 @sub_shl_same_amount_nuw_extra_use2(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @sub_shl_same_amount_nuw_extra_use2(
+; CHECK-NEXT:    [[XS:%.*]] = shl nuw i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[YS:%.*]] = shl nuw nsw i8 [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    call void @use8(i8 [[YS]])
+; CHECK-NEXT:    [[DIFF:%.*]] = sub nuw nsw i8 [[XS]], [[YS]]
+; CHECK-NEXT:    ret i8 [[DIFF]]
+;
+  %xs = shl nuw i8 %x, %z
+  %ys = shl nsw nuw i8 %y, %z
+  call void @use8(i8 %ys)
+  %diff = sub nsw nuw i8 %xs, %ys
+  ret i8 %diff
+}
+
+define i8 @sub_shl_same_amount_nsw_nuw_extra_use3(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @sub_shl_same_amount_nsw_nuw_extra_use3(
+; CHECK-NEXT:    [[XS:%.*]] = shl nuw nsw i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[XS]])
+; CHECK-NEXT:    [[YS:%.*]] = shl nuw nsw i8 [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    call void @use8(i8 [[YS]])
+; CHECK-NEXT:    [[DIFF:%.*]] = sub nuw nsw i8 [[XS]], [[YS]]
+; CHECK-NEXT:    ret i8 [[DIFF]]
+;
+  %xs = shl nsw nuw i8 %x, %z
+  call void @use8(i8 %xs)
+  %ys = shl nsw nuw i8 %y, %z
+  call void @use8(i8 %ys)
+  %diff = sub nsw nuw i8 %xs, %ys
+  ret i8 %diff
+}
+
+define i6 @sub_shl_same_amount_partial_nsw1(i6 %x, i6 %y, i6 %z) {
+; CHECK-LABEL: @sub_shl_same_amount_partial_nsw1(
+; CHECK-NEXT:    [[XS:%.*]] = shl nsw i6 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[YS:%.*]] = shl nsw i6 [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    [[DIFF:%.*]] = sub i6 [[XS]], [[YS]]
+; CHECK-NEXT:    ret i6 [[DIFF]]
+;
+  %xs = shl nsw i6 %x, %z
+  %ys = shl nsw i6 %y, %z
+  %diff = sub i6 %xs, %ys
+  ret i6 %diff
+}
+
+define i6 @sub_shl_same_amount_partial_nsw2(i6 %x, i6 %y, i6 %z) {
+; CHECK-LABEL: @sub_shl_same_amount_partial_nsw2(
+; CHECK-NEXT:    [[XS:%.*]] = shl i6 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[YS:%.*]] = shl nsw i6 [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    [[DIFF:%.*]] = sub nsw i6 [[XS]], [[YS]]
+; CHECK-NEXT:    ret i6 [[DIFF]]
+;
+  %xs = shl i6 %x, %z
+  %ys = shl nsw i6 %y, %z
+  %diff = sub nsw i6 %xs, %ys
+  ret i6 %diff
+}
+
+define i6 @sub_shl_same_amount_partial_nuw1(i6 %x, i6 %y, i6 %z) {
+; CHECK-LABEL: @sub_shl_same_amount_partial_nuw1(
+; CHECK-NEXT:    [[XS:%.*]] = shl nuw i6 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[YS:%.*]] = shl nuw i6 [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    [[DIFF:%.*]] = sub i6 [[XS]], [[YS]]
+; CHECK-NEXT:    ret i6 [[DIFF]]
+;
+  %xs = shl nuw i6 %x, %z
+  %ys = shl nuw i6 %y, %z
+  %diff = sub i6 %xs, %ys
+  ret i6 %diff
+}
+
+define i6 @sub_shl_same_amount_partial_nuw2(i6 %x, i6 %y, i6 %z) {
+; CHECK-LABEL: @sub_shl_same_amount_partial_nuw2(
+; CHECK-NEXT:    [[XS:%.*]] = shl nuw i6 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[YS:%.*]] = shl i6 [[Y:%.*]], [[Z]]
+; CHECK-NEXT:    [[DIFF:%.*]] = sub nuw i6 [[XS]], [[YS]]
+; CHECK-NEXT:    ret i6 [[DIFF]]
+;
+  %xs = shl nuw i6 %x, %z
+  %ys = shl i6 %y, %z
+  %diff = sub nuw i6 %xs, %ys
+  ret i6 %diff
+}
+
@sub_shl_same_amount_partial_nsw1( +; CHECK-NEXT: [[XS:%.*]] = shl nsw i6 [[X:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[YS:%.*]] = shl nsw i6 [[Y:%.*]], [[Z]] +; CHECK-NEXT: [[DIFF:%.*]] = sub i6 [[XS]], [[YS]] +; CHECK-NEXT: ret i6 [[DIFF]] +; + %xs = shl nsw i6 %x, %z + %ys = shl nsw i6 %y, %z + %diff = sub i6 %xs, %ys + ret i6 %diff +} + +define i6 @sub_shl_same_amount_partial_nsw2(i6 %x, i6 %y, i6 %z) { +; CHECK-LABEL: @sub_shl_same_amount_partial_nsw2( +; CHECK-NEXT: [[XS:%.*]] = shl i6 [[X:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[YS:%.*]] = shl nsw i6 [[Y:%.*]], [[Z]] +; CHECK-NEXT: [[DIFF:%.*]] = sub nsw i6 [[XS]], [[YS]] +; CHECK-NEXT: ret i6 [[DIFF]] +; + %xs = shl i6 %x, %z + %ys = shl nsw i6 %y, %z + %diff = sub nsw i6 %xs, %ys + ret i6 %diff +} + +define i6 @sub_shl_same_amount_partial_nuw1(i6 %x, i6 %y, i6 %z) { +; CHECK-LABEL: @sub_shl_same_amount_partial_nuw1( +; CHECK-NEXT: [[XS:%.*]] = shl nuw i6 [[X:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[YS:%.*]] = shl nuw i6 [[Y:%.*]], [[Z]] +; CHECK-NEXT: [[DIFF:%.*]] = sub i6 [[XS]], [[YS]] +; CHECK-NEXT: ret i6 [[DIFF]] +; + %xs = shl nuw i6 %x, %z + %ys = shl nuw i6 %y, %z + %diff = sub i6 %xs, %ys + ret i6 %diff +} + +define i6 @sub_shl_same_amount_partial_nuw2(i6 %x, i6 %y, i6 %z) { +; CHECK-LABEL: @sub_shl_same_amount_partial_nuw2( +; CHECK-NEXT: [[XS:%.*]] = shl nuw i6 [[X:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[YS:%.*]] = shl i6 [[Y:%.*]], [[Z]] +; CHECK-NEXT: [[DIFF:%.*]] = sub nuw i6 [[XS]], [[YS]] +; CHECK-NEXT: ret i6 [[DIFF]] +; + %xs = shl nuw i6 %x, %z + %ys = shl i6 %y, %z + %diff = sub nuw i6 %xs, %ys + ret i6 %diff +} + diff --git a/llvm/test/Transforms/InstCombine/statepoint-cleanup.ll b/llvm/test/Transforms/InstCombine/statepoint-cleanup.ll new file mode 100644 index 0000000000000..003f25b4ff7a9 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/statepoint-cleanup.ll @@ -0,0 +1,90 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -instcombine-max-iterations=1 -S | FileCheck %s +; These tests check the optimizations specific to +; pointers being relocated at a statepoint. + + +declare void @func() + +define void @test(i32 addrspace(1)* %b) gc "statepoint-example" { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[D:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i64 16 +; CHECK-NEXT: [[SAFEPOINT_TOKEN:%.*]] = tail call token (i64, i32, void ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* nonnull @func, i32 0, i32 0, i32 0, i32 0) [ "gc-live"(i32 addrspace(1)* [[B]], i32 addrspace(1)* [[D]]) ] +; CHECK-NEXT: [[B_NEW_1:%.*]] = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token [[SAFEPOINT_TOKEN]], i32 0, i32 0) +; CHECK-NEXT: [[B_NEW_2:%.*]] = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token [[SAFEPOINT_TOKEN]], i32 0, i32 0) +; CHECK-NEXT: [[D_NEW_1:%.*]] = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token [[SAFEPOINT_TOKEN]], i32 0, i32 1) +; CHECK-NEXT: [[D_NEW_2:%.*]] = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token [[SAFEPOINT_TOKEN]], i32 0, i32 1) +; CHECK-NEXT: [[D_NEW_3:%.*]] = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token [[SAFEPOINT_TOKEN]], i32 0, i32 1) +; CHECK-NEXT: [[D_NEW_4:%.*]] = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token [[SAFEPOINT_TOKEN]], i32 0, i32 1) +; CHECK-NEXT: store i32 1, i32 addrspace(1)* [[B_NEW_1]], align 4 +; CHECK-NEXT: store i32 1, i32 addrspace(1)* [[B_NEW_2]], align 4 +; CHECK-NEXT: store i32 1, i32 addrspace(1)* [[D_NEW_1]], align 4 +; CHECK-NEXT: store i32 1, i32 addrspace(1)* [[D_NEW_2]], align 4 +; CHECK-NEXT: store i32 1, i32 addrspace(1)* [[D_NEW_3]], align 4 +; CHECK-NEXT: store i32 1, i32 addrspace(1)* [[D_NEW_4]], align 4 +; CHECK-NEXT: ret void +; +entry: + %d = getelementptr i32, i32 addrspace(1)* %b, i64 16 + %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @func, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* %b, i32 addrspace(1)* %b, i32 addrspace(1)* %d, i32 addrspace(1)* %d)] + %b.new.1 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 0) + %b.new.2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 1) + %d.new.1 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 2) + %d.new.2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 3) + %d.new.3 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 1, i32 2) + %d.new.4 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 1, i32 3) + store i32 1, i32 addrspace(1)* %b.new.1 + store i32 1, i32 addrspace(1)* %b.new.2 + store i32 1, i32 addrspace(1)* %d.new.1 + store i32 1, i32 addrspace(1)* %d.new.2 + store i32 1, i32 addrspace(1)* %d.new.3 + store i32 1, i32 addrspace(1)* %d.new.4 + ret void +} + +define void @test_no_derived_use(i32 addrspace(1)* %b) gc "statepoint-example" { +; CHECK-LABEL: @test_no_derived_use( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SAFEPOINT_TOKEN:%.*]] = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* nonnull @func, i32 0, i32 0, i32 0, i32 0) [ "gc-live"(i32 addrspace(1)* [[B:%.*]]) ] +; CHECK-NEXT: [[B_NEW_1:%.*]] = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token [[SAFEPOINT_TOKEN]], i32 0, i32 0) +; CHECK-NEXT: store i32 1, i32 addrspace(1)* [[B_NEW_1]], align 4 +; CHECK-NEXT: ret void +; +entry: + %d = getelementptr i32, i32 addrspace(1)* %b, i64 16 + %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @func, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* %b, i32 addrspace(1)* %b, i32 addrspace(1)* %d, i32 addrspace(1)* %d)] + %b.new.1 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 0) + %b.new.2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 1) + %d.new.1 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 2) + %d.new.2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 3) + %d.new.3 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 1, i32 2) + %d.new.4 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 1, i32 3) + store i32 1, i32 addrspace(1)* %b.new.1 + ret void +} + +define void @test_no_base_use(i32 addrspace(1)* %b) gc "statepoint-example" { +; CHECK-LABEL: @test_no_base_use( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[D:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i64 16 +; CHECK-NEXT: [[SAFEPOINT_TOKEN:%.*]] = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* nonnull @func, i32 0, i32 0, i32 0, i32 0) [ "gc-live"(i32 addrspace(1)* [[B]], i32 addrspace(1)* [[D]]) ] +; CHECK-NEXT: [[D_NEW_1:%.*]] = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token [[SAFEPOINT_TOKEN]], i32 0, i32 1) +; CHECK-NEXT: store i32 1, i32 addrspace(1)* [[D_NEW_1]], align 4 +; CHECK-NEXT: ret void +; +entry: + %d = getelementptr i32, i32 addrspace(1)* %b, i64 16 + %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @func, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* %b, i32 addrspace(1)* %b, i32 addrspace(1)* %d, i32 addrspace(1)* %d)] + %b.new.1 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 0) + %b.new.2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 1) + %d.new.1 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 2) + %d.new.2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 3) + %d.new.3 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 1, i32 2) + %d.new.4 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 1, i32 3) + store i32 1, i32 addrspace(1)* %d.new.1 + ret void +} + +declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...) 
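+
+; In summary, the rewrite exercised by @test above (a sketch read off the
+; CHECK lines, not an extra FileCheck assertion): duplicate pointers in the
+; "gc-live" bundle are merged, every gc.relocate is re-pointed at the
+; surviving (base, derived) index pair, and relocates whose results are
+; unused disappear entirely (see @test_no_derived_use and @test_no_base_use):
+;
+;   [ "gc-live"(%b, %b, %d, %d) ]  relocates (0,0) (0,1) (0,2) (0,3) (1,2) (1,3)
+;     ==>
+;   [ "gc-live"(%b, %d) ]          relocates (0,0) for %b, (0,1) for %d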
+declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32) diff --git a/llvm/test/Transforms/InstCombine/sub-gep.ll b/llvm/test/Transforms/InstCombine/sub-gep.ll index f31eeb46d8823..ce9657433bb78 100644 --- a/llvm/test/Transforms/InstCombine/sub-gep.ll +++ b/llvm/test/Transforms/InstCombine/sub-gep.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -instcombine < %s | FileCheck %s +target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" + define i64 @test_inbounds([0 x i32]* %base, i64 %idx) { ; CHECK-LABEL: @test_inbounds( ; CHECK-NEXT: [[P2_IDX:%.*]] = shl nsw i64 [[IDX:%.*]], 2 @@ -151,3 +153,187 @@ define i64 @test_inbounds_nuw_multi_index([0 x [2 x i32]]* %base, i64 %idx, i64 %d = sub nuw i64 %i2, %i1 ret i64 %d } + +; rdar://7362831 +define i32 @test23(i8* %P, i64 %A){ +; CHECK-LABEL: @test23( +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[A:%.*]] to i32 +; CHECK-NEXT: ret i32 [[TMP1]] +; + %B = getelementptr inbounds i8, i8* %P, i64 %A + %C = ptrtoint i8* %B to i64 + %D = trunc i64 %C to i32 + %E = ptrtoint i8* %P to i64 + %F = trunc i64 %E to i32 + %G = sub i32 %D, %F + ret i32 %G +} + +define i8 @test23_as1(i8 addrspace(1)* %P, i16 %A) { +; CHECK-LABEL: @test23_as1( +; CHECK-NEXT: [[TMP1:%.*]] = trunc i16 [[A:%.*]] to i8 +; CHECK-NEXT: ret i8 [[TMP1]] +; + %B = getelementptr inbounds i8, i8 addrspace(1)* %P, i16 %A + %C = ptrtoint i8 addrspace(1)* %B to i16 + %D = trunc i16 %C to i8 + %E = ptrtoint i8 addrspace(1)* %P to i16 + %F = trunc i16 %E to i8 + %G = sub i8 %D, %F + ret i8 %G +} + +define i64 @test24(i8* %P, i64 %A){ +; CHECK-LABEL: @test24( +; CHECK-NEXT: ret i64 [[A:%.*]] +; + %B = getelementptr inbounds i8, i8* %P, i64 %A + %C = ptrtoint i8* %B to i64 + %E = ptrtoint i8* %P to i64 + %G = sub i64 %C, %E + ret i64 %G +} + +define i16 @test24_as1(i8 addrspace(1)* %P, i16 %A) { +; CHECK-LABEL: @test24_as1( +; CHECK-NEXT: ret i16 [[A:%.*]] +; + %B = getelementptr inbounds i8, i8 addrspace(1)* %P, i16 %A + %C = ptrtoint i8 addrspace(1)* %B to i16 + %E = ptrtoint i8 addrspace(1)* %P to i16 + %G = sub i16 %C, %E + ret i16 %G +} + +define i64 @test24a(i8* %P, i64 %A){ +; CHECK-LABEL: @test24a( +; CHECK-NEXT: [[DIFF_NEG:%.*]] = sub i64 0, [[A:%.*]] +; CHECK-NEXT: ret i64 [[DIFF_NEG]] +; + %B = getelementptr inbounds i8, i8* %P, i64 %A + %C = ptrtoint i8* %B to i64 + %E = ptrtoint i8* %P to i64 + %G = sub i64 %E, %C + ret i64 %G +} + +define i16 @test24a_as1(i8 addrspace(1)* %P, i16 %A) { +; CHECK-LABEL: @test24a_as1( +; CHECK-NEXT: [[DIFF_NEG:%.*]] = sub i16 0, [[A:%.*]] +; CHECK-NEXT: ret i16 [[DIFF_NEG]] +; + %B = getelementptr inbounds i8, i8 addrspace(1)* %P, i16 %A + %C = ptrtoint i8 addrspace(1)* %B to i16 + %E = ptrtoint i8 addrspace(1)* %P to i16 + %G = sub i16 %E, %C + ret i16 %G +} + +@Arr = external global [42 x i16] + +define i64 @test24b(i8* %P, i64 %A){ +; CHECK-LABEL: @test24b( +; CHECK-NEXT: [[B_IDX:%.*]] = shl nsw i64 [[A:%.*]], 1 +; CHECK-NEXT: ret i64 [[B_IDX]] +; + %B = getelementptr inbounds [42 x i16], [42 x i16]* @Arr, i64 0, i64 %A + %C = ptrtoint i16* %B to i64 + %G = sub i64 %C, ptrtoint ([42 x i16]* @Arr to i64) + ret i64 %G +} + +define i64 @test25(i8* %P, i64 %A){ +; CHECK-LABEL: @test25( +; CHECK-NEXT: [[B_IDX:%.*]] = shl nsw i64 [[A:%.*]], 1 +; CHECK-NEXT: [[GEPDIFF:%.*]] = add i64 [[B_IDX]], -84 +; CHECK-NEXT: ret i64 [[GEPDIFF]] +; + %B = getelementptr inbounds [42 x 
i16], [42 x i16]* @Arr, i64 0, i64 %A + %C = ptrtoint i16* %B to i64 + %G = sub i64 %C, ptrtoint (i16* getelementptr ([42 x i16], [42 x i16]* @Arr, i64 1, i64 0) to i64) + ret i64 %G +} + +@Arr_as1 = external addrspace(1) global [42 x i16] + +define i16 @test25_as1(i8 addrspace(1)* %P, i64 %A) { +; CHECK-LABEL: @test25_as1( +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[A:%.*]] to i16 +; CHECK-NEXT: [[B_IDX:%.*]] = shl nsw i16 [[TMP1]], 1 +; CHECK-NEXT: [[GEPDIFF:%.*]] = add i16 [[B_IDX]], -84 +; CHECK-NEXT: ret i16 [[GEPDIFF]] +; + %B = getelementptr inbounds [42 x i16], [42 x i16] addrspace(1)* @Arr_as1, i64 0, i64 %A + %C = ptrtoint i16 addrspace(1)* %B to i16 + %G = sub i16 %C, ptrtoint (i16 addrspace(1)* getelementptr ([42 x i16], [42 x i16] addrspace(1)* @Arr_as1, i64 1, i64 0) to i16) + ret i16 %G +} + +define i64 @test30(i8* %foo, i64 %i, i64 %j) { +; CHECK-LABEL: @test30( +; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i64 [[I:%.*]], 2 +; CHECK-NEXT: [[GEPDIFF:%.*]] = sub i64 [[GEP1_IDX]], [[J:%.*]] +; CHECK-NEXT: ret i64 [[GEPDIFF]] +; + %bit = bitcast i8* %foo to i32* + %gep1 = getelementptr inbounds i32, i32* %bit, i64 %i + %gep2 = getelementptr inbounds i8, i8* %foo, i64 %j + %cast1 = ptrtoint i32* %gep1 to i64 + %cast2 = ptrtoint i8* %gep2 to i64 + %sub = sub i64 %cast1, %cast2 + ret i64 %sub +} + +define i16 @test30_as1(i8 addrspace(1)* %foo, i16 %i, i16 %j) { +; CHECK-LABEL: @test30_as1( +; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i16 [[I:%.*]], 2 +; CHECK-NEXT: [[GEPDIFF:%.*]] = sub i16 [[GEP1_IDX]], [[J:%.*]] +; CHECK-NEXT: ret i16 [[GEPDIFF]] +; + %bit = bitcast i8 addrspace(1)* %foo to i32 addrspace(1)* + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %bit, i16 %i + %gep2 = getelementptr inbounds i8, i8 addrspace(1)* %foo, i16 %j + %cast1 = ptrtoint i32 addrspace(1)* %gep1 to i16 + %cast2 = ptrtoint i8 addrspace(1)* %gep2 to i16 + %sub = sub i16 %cast1, %cast2 + ret i16 %sub +} + +define i64 @gep_diff_both_inbounds(i8* %foo, i64 %i, i64 %j) { +; CHECK-LABEL: @gep_diff_both_inbounds( +; CHECK-NEXT: [[GEPDIFF:%.*]] = sub i64 [[I:%.*]], [[J:%.*]] +; CHECK-NEXT: ret i64 [[GEPDIFF]] +; + %gep1 = getelementptr inbounds i8, i8* %foo, i64 %i + %gep2 = getelementptr inbounds i8, i8* %foo, i64 %j + %cast1 = ptrtoint i8* %gep1 to i64 + %cast2 = ptrtoint i8* %gep2 to i64 + %sub = sub i64 %cast1, %cast2 + ret i64 %sub +} + +define i64 @gep_diff_first_inbounds(i8* %foo, i64 %i, i64 %j) { +; CHECK-LABEL: @gep_diff_first_inbounds( +; CHECK-NEXT: [[GEPDIFF:%.*]] = sub i64 [[I:%.*]], [[J:%.*]] +; CHECK-NEXT: ret i64 [[GEPDIFF]] +; + %gep1 = getelementptr inbounds i8, i8* %foo, i64 %i + %gep2 = getelementptr i8, i8* %foo, i64 %j + %cast1 = ptrtoint i8* %gep1 to i64 + %cast2 = ptrtoint i8* %gep2 to i64 + %sub = sub i64 %cast1, %cast2 + ret i64 %sub +} + +define i64 @gep_diff_second_inbounds(i8* %foo, i64 %i, i64 %j) { +; CHECK-LABEL: @gep_diff_second_inbounds( +; CHECK-NEXT: [[GEPDIFF:%.*]] = sub i64 [[I:%.*]], [[J:%.*]] +; CHECK-NEXT: ret i64 [[GEPDIFF]] +; + %gep1 = getelementptr i8, i8* %foo, i64 %i + %gep2 = getelementptr inbounds i8, i8* %foo, i64 %j + %cast1 = ptrtoint i8* %gep1 to i64 + %cast2 = ptrtoint i8* %gep2 to i64 + %sub = sub i64 %cast1, %cast2 + ret i64 %sub +} diff --git a/llvm/test/Transforms/InstCombine/sub-of-negatible.ll b/llvm/test/Transforms/InstCombine/sub-of-negatible.ll index 0755ebfff1621..f14ae09e93bf7 100644 --- a/llvm/test/Transforms/InstCombine/sub-of-negatible.ll +++ b/llvm/test/Transforms/InstCombine/sub-of-negatible.ll @@ -1155,9 +1155,8 @@ define i8 
@negate_abs(i8 %x, i8 %y) { ; CHECK-LABEL: @negate_abs( ; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[X:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[T1:%.*]] = icmp slt i8 [[X]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[T1]], i8 [[X]], i8 [[T0]], !prof !0 -; CHECK-NEXT: [[T3:%.*]] = add i8 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X]], i1 false) +; CHECK-NEXT: [[T3:%.*]] = sub i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i8 [[T3]] ; %t0 = sub i8 0, %x @@ -1171,8 +1170,7 @@ define i8 @negate_nabs(i8 %x, i8 %y) { ; CHECK-LABEL: @negate_nabs( ; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[X:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[T1:%.*]] = icmp slt i8 [[X]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[T1]], i8 [[T0]], i8 [[X]], !prof !0 +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X]], i1 false) ; CHECK-NEXT: [[T3:%.*]] = add i8 [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: ret i8 [[T3]] ; diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll index 437d8f8c5c023..98d8a9e6b5ca6 100644 --- a/llvm/test/Transforms/InstCombine/sub.ll +++ b/llvm/test/Transforms/InstCombine/sub.ll @@ -414,122 +414,6 @@ define zeroext i1 @test22(i32 %a, i32 %b) nounwind { ret i1 %i5 } -; rdar://7362831 -define i32 @test23(i8* %P, i64 %A){ -; CHECK-LABEL: @test23( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[A:%.*]] to i32 -; CHECK-NEXT: ret i32 [[TMP1]] -; - %B = getelementptr inbounds i8, i8* %P, i64 %A - %C = ptrtoint i8* %B to i64 - %D = trunc i64 %C to i32 - %E = ptrtoint i8* %P to i64 - %F = trunc i64 %E to i32 - %G = sub i32 %D, %F - ret i32 %G -} - -define i8 @test23_as1(i8 addrspace(1)* %P, i16 %A) { -; CHECK-LABEL: @test23_as1( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i16 [[A:%.*]] to i8 -; CHECK-NEXT: ret i8 [[TMP1]] -; - %B = getelementptr inbounds i8, i8 addrspace(1)* %P, i16 %A - %C = ptrtoint i8 addrspace(1)* %B to i16 - %D = trunc i16 %C to i8 - %E = ptrtoint i8 addrspace(1)* %P to i16 - %F = trunc i16 %E to i8 - %G = sub i8 %D, %F - ret i8 %G -} - -define i64 @test24(i8* %P, i64 %A){ -; CHECK-LABEL: @test24( -; CHECK-NEXT: ret i64 [[A:%.*]] -; - %B = getelementptr inbounds i8, i8* %P, i64 %A - %C = ptrtoint i8* %B to i64 - %E = ptrtoint i8* %P to i64 - %G = sub i64 %C, %E - ret i64 %G -} - -define i16 @test24_as1(i8 addrspace(1)* %P, i16 %A) { -; CHECK-LABEL: @test24_as1( -; CHECK-NEXT: ret i16 [[A:%.*]] -; - %B = getelementptr inbounds i8, i8 addrspace(1)* %P, i16 %A - %C = ptrtoint i8 addrspace(1)* %B to i16 - %E = ptrtoint i8 addrspace(1)* %P to i16 - %G = sub i16 %C, %E - ret i16 %G -} - -define i64 @test24a(i8* %P, i64 %A){ -; CHECK-LABEL: @test24a( -; CHECK-NEXT: [[DIFF_NEG:%.*]] = sub i64 0, [[A:%.*]] -; CHECK-NEXT: ret i64 [[DIFF_NEG]] -; - %B = getelementptr inbounds i8, i8* %P, i64 %A - %C = ptrtoint i8* %B to i64 - %E = ptrtoint i8* %P to i64 - %G = sub i64 %E, %C - ret i64 %G -} - -define i16 @test24a_as1(i8 addrspace(1)* %P, i16 %A) { -; CHECK-LABEL: @test24a_as1( -; CHECK-NEXT: [[DIFF_NEG:%.*]] = sub i16 0, [[A:%.*]] -; CHECK-NEXT: ret i16 [[DIFF_NEG]] -; - %B = getelementptr inbounds i8, i8 addrspace(1)* %P, i16 %A - %C = ptrtoint i8 addrspace(1)* %B to i16 - %E = ptrtoint i8 addrspace(1)* %P to i16 - %G = sub i16 %E, %C - ret i16 %G -} - - -@Arr = external global [42 x i16] - -define i64 @test24b(i8* %P, i64 %A){ -; CHECK-LABEL: @test24b( -; CHECK-NEXT: [[B_IDX:%.*]] = shl nsw i64 [[A:%.*]], 1 -; CHECK-NEXT: ret i64 [[B_IDX]] -; - %B = getelementptr inbounds [42 x i16], [42 x i16]* @Arr, i64 
0, i64 %A - %C = ptrtoint i16* %B to i64 - %G = sub i64 %C, ptrtoint ([42 x i16]* @Arr to i64) - ret i64 %G -} - -define i64 @test25(i8* %P, i64 %A){ -; CHECK-LABEL: @test25( -; CHECK-NEXT: [[B_IDX:%.*]] = shl nsw i64 [[A:%.*]], 1 -; CHECK-NEXT: [[GEPDIFF:%.*]] = add i64 [[B_IDX]], -84 -; CHECK-NEXT: ret i64 [[GEPDIFF]] -; - %B = getelementptr inbounds [42 x i16], [42 x i16]* @Arr, i64 0, i64 %A - %C = ptrtoint i16* %B to i64 - %G = sub i64 %C, ptrtoint (i16* getelementptr ([42 x i16], [42 x i16]* @Arr, i64 1, i64 0) to i64) - ret i64 %G -} - -@Arr_as1 = external addrspace(1) global [42 x i16] - -define i16 @test25_as1(i8 addrspace(1)* %P, i64 %A) { -; CHECK-LABEL: @test25_as1( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[A:%.*]] to i16 -; CHECK-NEXT: [[B_IDX:%.*]] = shl nsw i16 [[TMP1]], 1 -; CHECK-NEXT: [[GEPDIFF:%.*]] = add i16 [[B_IDX]], -84 -; CHECK-NEXT: ret i16 [[GEPDIFF]] -; - %B = getelementptr inbounds [42 x i16], [42 x i16] addrspace(1)* @Arr_as1, i64 0, i64 %A - %C = ptrtoint i16 addrspace(1)* %B to i16 - %G = sub i16 %C, ptrtoint (i16 addrspace(1)* getelementptr ([42 x i16], [42 x i16] addrspace(1)* @Arr_as1, i64 1, i64 0) to i16) - ret i16 %G -} - define i32 @test26(i32 %x) { ; CHECK-LABEL: @test26( ; CHECK-NEXT: [[SHL_NEG:%.*]] = shl i32 -3, [[X:%.*]] @@ -823,49 +707,6 @@ define i32 @test28commuted(i32 %x, i32 %y, i32 %z) { ret i32 %sub } -define i64 @test29(i8* %foo, i64 %i, i64 %j) { -; CHECK-LABEL: @test29( -; CHECK-NEXT: [[GEPDIFF:%.*]] = sub i64 [[I:%.*]], [[J:%.*]] -; CHECK-NEXT: ret i64 [[GEPDIFF]] -; - %gep1 = getelementptr inbounds i8, i8* %foo, i64 %i - %gep2 = getelementptr inbounds i8, i8* %foo, i64 %j - %cast1 = ptrtoint i8* %gep1 to i64 - %cast2 = ptrtoint i8* %gep2 to i64 - %sub = sub i64 %cast1, %cast2 - ret i64 %sub -} - -define i64 @test30(i8* %foo, i64 %i, i64 %j) { -; CHECK-LABEL: @test30( -; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i64 [[I:%.*]], 2 -; CHECK-NEXT: [[GEPDIFF:%.*]] = sub i64 [[GEP1_IDX]], [[J:%.*]] -; CHECK-NEXT: ret i64 [[GEPDIFF]] -; - %bit = bitcast i8* %foo to i32* - %gep1 = getelementptr inbounds i32, i32* %bit, i64 %i - %gep2 = getelementptr inbounds i8, i8* %foo, i64 %j - %cast1 = ptrtoint i32* %gep1 to i64 - %cast2 = ptrtoint i8* %gep2 to i64 - %sub = sub i64 %cast1, %cast2 - ret i64 %sub -} - -define i16 @test30_as1(i8 addrspace(1)* %foo, i16 %i, i16 %j) { -; CHECK-LABEL: @test30_as1( -; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i16 [[I:%.*]], 2 -; CHECK-NEXT: [[GEPDIFF:%.*]] = sub i16 [[GEP1_IDX]], [[J:%.*]] -; CHECK-NEXT: ret i16 [[GEPDIFF]] -; - %bit = bitcast i8 addrspace(1)* %foo to i32 addrspace(1)* - %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %bit, i16 %i - %gep2 = getelementptr inbounds i8, i8 addrspace(1)* %foo, i16 %j - %cast1 = ptrtoint i32 addrspace(1)* %gep1 to i16 - %cast2 = ptrtoint i8 addrspace(1)* %gep2 to i16 - %sub = sub i16 %cast1, %cast2 - ret i16 %sub -} - define <2 x i64> @test31(<2 x i64> %A) { ; CHECK-LABEL: @test31( ; CHECK-NEXT: [[SUB:%.*]] = add <2 x i64> [[A:%.*]], diff --git a/llvm/test/Transforms/InstCombine/volatile_store.ll b/llvm/test/Transforms/InstCombine/volatile_store.ll index c2f63d6659f07..105ec83056d61 100644 --- a/llvm/test/Transforms/InstCombine/volatile_store.ll +++ b/llvm/test/Transforms/InstCombine/volatile_store.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s @x = weak global i32 0 @@ -8,7 +8,7 @@ define void 
@self_assign_1() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP:%.*]] = load volatile i32, i32* @x, align 4 ; CHECK-NEXT: store volatile i32 [[TMP]], i32* @x, align 4 -; CHECK-NEXT: br label %return +; CHECK-NEXT: br label [[RETURN:%.*]] ; CHECK: return: ; CHECK-NEXT: ret void ; @@ -20,3 +20,22 @@ entry: return: ret void } + +define void @volatile_store_before_unreachable(i1 %c, i8* %p) { +; CHECK-LABEL: @volatile_store_before_unreachable( +; CHECK-NEXT: br i1 [[C:%.*]], label [[TRUE:%.*]], label [[FALSE:%.*]] +; CHECK: true: +; CHECK-NEXT: store volatile i8 0, i8* [[P:%.*]], align 1 +; CHECK-NEXT: unreachable +; CHECK: false: +; CHECK-NEXT: ret void +; + br i1 %c, label %true, label %false + +true: + store volatile i8 0, i8* %p + unreachable + +false: + ret void +} diff --git a/llvm/test/Transforms/InstCombine/xor.ll b/llvm/test/Transforms/InstCombine/xor.ll index 312b0125f626f..ba275a6066419 100644 --- a/llvm/test/Transforms/InstCombine/xor.ll +++ b/llvm/test/Transforms/InstCombine/xor.ll @@ -1171,3 +1171,77 @@ define i8 @not_ashr_wrong_const(i8 %x) { %r = xor i8 %a, -2 ret i8 %r } + +; (~A & B) ^ A --> (A | B) +; The division ops are here to thwart complexity-based canonicalization: all ops are binops. + +define i32 @test52(i32 %p1, i32 %p2) { +; CHECK-LABEL: @test52( +; CHECK-NEXT: [[A:%.*]] = udiv i32 42, [[P1:%.*]] +; CHECK-NEXT: [[B:%.*]] = udiv i32 42, [[P2:%.*]] +; CHECK-NEXT: [[O:%.*]] = xor i32 [[A]], -1 +; CHECK-NEXT: [[R:%.*]] = and i32 [[B]], [[O]] +; CHECK-NEXT: [[Z:%.*]] = xor i32 [[R]], [[A]] +; CHECK-NEXT: ret i32 [[Z]] +; + %a = udiv i32 42, %p1 + %b = udiv i32 42, %p2 + %o = xor i32 %a, -1 + %r = and i32 %o, %b + %z = xor i32 %r, %a + ret i32 %z +} + +; (~B & A) ^ B --> (A | B) +; The division ops are here to thwart complexity-based canonicalization: all ops are binops. 
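+;
+; A worked bit-level example of why these are equivalent (illustrative
+; comment only, using 4-bit constants): take B = 0b1100 and A = 0b1010.
+;   ~B     = 0b0011
+;   ~B & A = 0b0010    ; exactly the bits set in A but not in B
+;   ^ B    = 0b1110    ; which equals A | B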
+ +define i32 @test53(i32 %p1, i32 %p2) { +; CHECK-LABEL: @test53( +; CHECK-NEXT: [[A:%.*]] = udiv i32 42, [[P1:%.*]] +; CHECK-NEXT: [[B:%.*]] = udiv i32 42, [[P2:%.*]] +; CHECK-NEXT: [[O:%.*]] = xor i32 [[B]], -1 +; CHECK-NEXT: [[R:%.*]] = and i32 [[A]], [[O]] +; CHECK-NEXT: [[Z:%.*]] = xor i32 [[R]], [[B]] +; CHECK-NEXT: ret i32 [[Z]] +; + %a = udiv i32 42, %p1 + %b = udiv i32 42, %p2 + %o = xor i32 %b, -1 + %r = and i32 %o, %a + %z = xor i32 %r, %b + ret i32 %z +} + +define i32 @test54(i32 %p1, i32 %p2) { +; CHECK-LABEL: @test54( +; CHECK-NEXT: [[A:%.*]] = udiv i32 42, [[P1:%.*]] +; CHECK-NEXT: [[B:%.*]] = udiv i32 42, [[P2:%.*]] +; CHECK-NEXT: [[O:%.*]] = xor i32 [[A]], -1 +; CHECK-NEXT: [[R:%.*]] = and i32 [[B]], [[O]] +; CHECK-NEXT: [[Z:%.*]] = xor i32 [[R]], [[A]] +; CHECK-NEXT: ret i32 [[Z]] +; + %a = udiv i32 42, %p1 + %b = udiv i32 42, %p2 + %o = xor i32 %a, -1 + %r = and i32 %b, %o + %z = xor i32 %r, %a + ret i32 %z +} + +define i32 @test55(i32 %p1, i32 %p2) { +; CHECK-LABEL: @test55( +; CHECK-NEXT: [[A:%.*]] = udiv i32 42, [[P1:%.*]] +; CHECK-NEXT: [[B:%.*]] = udiv i32 42, [[P2:%.*]] +; CHECK-NEXT: [[O:%.*]] = xor i32 [[A]], -1 +; CHECK-NEXT: [[R:%.*]] = and i32 [[B]], [[O]] +; CHECK-NEXT: [[Z:%.*]] = xor i32 [[A]], [[R]] +; CHECK-NEXT: ret i32 [[Z]] +; + %a = udiv i32 42, %p1 + %b = udiv i32 42, %p2 + %o = xor i32 %a, -1 + %r = and i32 %o, %b + %z = xor i32 %a, %r + ret i32 %z +} diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/cast.ll b/llvm/test/Transforms/InstSimplify/ConstProp/cast.ll index 170e2d55421c8..b1b879da1fbd3 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/cast.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/cast.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instsimplify -S | FileCheck %s ; Overflow on a float to int or int to float conversion is undefined (PR21130). 
@@ -38,3 +39,20 @@ define float @overflow_sitofp() {
   ret float %i
 }
 
+; https://llvm.org/PR43907
+
+define float @nan_f64_trunc() {
+; CHECK-LABEL: @nan_f64_trunc(
+; CHECK-NEXT: ret float 0x7FF0000000000000
+;
+  %f = fptrunc double 0x7FF0000000000001 to float
+  ret float %f
+}
+
+define <2 x half> @nan_v2f32_trunc() {
+; CHECK-LABEL: @nan_v2f32_trunc(
+; CHECK-NEXT: ret <2 x half>
+;
+  %f = fptrunc <2 x float> to <2 x half>
+  ret <2 x half> %f
+}
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll
index d590c565316e7..1da77358ede7e 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll
@@ -41,6 +41,14 @@ define <vscale x 4 x i32> @sub() {
   ret <vscale x 4 x i32> %r
 }
 
+define <vscale x 4 x i32> @sub_splat() {
+; CHECK-LABEL: @sub_splat(
+; CHECK-NEXT: ret <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 -16, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+;
+  %r = sub <vscale x 4 x i32> zeroinitializer, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 16, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+  ret <vscale x 4 x i32> %r
+}
+
 define <vscale x 4 x float> @fsub() {
 ; CHECK-LABEL: @fsub(
 ; CHECK-NEXT: ret <vscale x 4 x float> undef
@@ -73,6 +81,14 @@ define <vscale x 4 x i32> @udiv() {
   ret <vscale x 4 x i32> %r
 }
 
+define <vscale x 4 x i32> @udiv_splat_zero() {
+; CHECK-LABEL: @udiv_splat_zero(
+; CHECK-NEXT: ret <vscale x 4 x i32> undef
+;
+  %r = udiv <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x i32> %r
+}
+
 define <vscale x 4 x i32> @sdiv() {
 ; CHECK-LABEL: @sdiv(
 ; CHECK-NEXT: ret <vscale x 4 x i32> undef
diff --git a/llvm/test/Transforms/InstSimplify/abs_intrinsic.ll b/llvm/test/Transforms/InstSimplify/abs_intrinsic.ll
index 70b50da9f0415..4598c5732e121 100644
--- a/llvm/test/Transforms/InstSimplify/abs_intrinsic.ll
+++ b/llvm/test/Transforms/InstSimplify/abs_intrinsic.ll
@@ -47,11 +47,14 @@ define i32 @test_abs_abs_3(i32 %x) {
 }
 
 ; If the sign bit is known zero, the abs is not needed.
+; These cases are only folded by InstCombine, to avoid computing known bits
+; twice, for the non-negative and the negative case.
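+;
+; For instance, in @zext_abs below, a zext from i31 can never set the i32
+; sign bit, so the abs is a no-op:
+;   %zext = zext i31 %x to i32
+;   %abs  = call i32 @llvm.abs.i32(i32 %zext, i1 false)  ; == %zext
+; InstCombine replaces such calls with their operand; InstSimplify now
+; deliberately keeps them, which is why the CHECK lines retain the calls.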
define i32 @zext_abs(i31 %x) { ; CHECK-LABEL: @zext_abs( ; CHECK-NEXT: [[ZEXT:%.*]] = zext i31 [[X:%.*]] to i32 -; CHECK-NEXT: ret i32 [[ZEXT]] +; CHECK-NEXT: [[ABS:%.*]] = call i32 @llvm.abs.i32(i32 [[ZEXT]], i1 false) +; CHECK-NEXT: ret i32 [[ABS]] ; %zext = zext i31 %x to i32 %abs = call i32 @llvm.abs.i32(i32 %zext, i1 false) @@ -61,7 +64,8 @@ define i32 @zext_abs(i31 %x) { define <3 x i82> @lshr_abs(<3 x i82> %x) { ; CHECK-LABEL: @lshr_abs( ; CHECK-NEXT: [[LSHR:%.*]] = lshr <3 x i82> [[X:%.*]], -; CHECK-NEXT: ret <3 x i82> [[LSHR]] +; CHECK-NEXT: [[ABS:%.*]] = call <3 x i82> @llvm.abs.v3i82(<3 x i82> [[LSHR]], i1 true) +; CHECK-NEXT: ret <3 x i82> [[ABS]] ; %lshr = lshr <3 x i82> %x, %abs = call <3 x i82> @llvm.abs.v3i82(<3 x i82> %lshr, i1 true) @@ -71,7 +75,8 @@ define <3 x i82> @lshr_abs(<3 x i82> %x) { define i32 @and_abs(i32 %x) { ; CHECK-LABEL: @and_abs( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 2147483644 -; CHECK-NEXT: ret i32 [[AND]] +; CHECK-NEXT: [[ABS:%.*]] = call i32 @llvm.abs.i32(i32 [[AND]], i1 true) +; CHECK-NEXT: ret i32 [[ABS]] ; %and = and i32 %x, 2147483644 %abs = call i32 @llvm.abs.i32(i32 %and, i1 true) @@ -81,7 +86,8 @@ define i32 @and_abs(i32 %x) { define <3 x i82> @select_abs(<3 x i1> %cond) { ; CHECK-LABEL: @select_abs( ; CHECK-NEXT: [[SEL:%.*]] = select <3 x i1> [[COND:%.*]], <3 x i82> zeroinitializer, <3 x i82> -; CHECK-NEXT: ret <3 x i82> [[SEL]] +; CHECK-NEXT: [[ABS:%.*]] = call <3 x i82> @llvm.abs.v3i82(<3 x i82> [[SEL]], i1 false) +; CHECK-NEXT: ret <3 x i82> [[ABS]] ; %sel = select <3 x i1> %cond, <3 x i82> zeroinitializer, <3 x i82> %abs = call <3 x i82> @llvm.abs.v3i82(<3 x i82> %sel, i1 false) @@ -94,7 +100,8 @@ define i32 @assume_abs(i32 %x) { ; CHECK-LABEL: @assume_abs( ; CHECK-NEXT: [[ASSUME:%.*]] = icmp sge i32 [[X:%.*]], 0 ; CHECK-NEXT: call void @llvm.assume(i1 [[ASSUME]]) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: [[ABS:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 true) +; CHECK-NEXT: ret i32 [[ABS]] ; %assume = icmp sge i32 %x, 0 call void @llvm.assume(i1 %assume) diff --git a/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll b/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll index 8b606dca2e21f..b1dd69c19f813 100644 --- a/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll +++ b/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll @@ -223,6 +223,7 @@ define float @PR22688(float %x) { declare float @llvm.fabs.f32(float) declare <2 x float> @llvm.fabs.v2f32(<2 x float>) declare float @llvm.sqrt.f32(float) +declare float @llvm.maxnum.f32(float, float) define float @fabs_select_positive_constants(i32 %c) { ; CHECK-LABEL: @fabs_select_positive_constants( @@ -529,649 +530,6 @@ define float @fabs_select_positive_constants_vector_extract(i32 %c) { ret float %fabs } -declare float @llvm.minnum.f32(float, float) -declare float @llvm.maxnum.f32(float, float) -declare double @llvm.minnum.f64(double, double) -declare double @llvm.maxnum.f64(double, double) -declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) -declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) - -; From the LangRef for minnum/maxnum: -; "If either operand is a NaN, returns the other non-NaN operand." 
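;
; (Recap of the quiet-NaN semantics the removed tests below rely on: a NaN
; argument is simply dropped, e.g. maxnum(NaN, %x) == %x. This is also why
; maxnum(%x, +inf) folds to +inf and minnum(%x, -inf) folds to -inf with no
; fast-math flags: the result is the same whether or not %x is NaN.)
;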
- -define double @maxnum_nan_op0(double %x) { -; CHECK-LABEL: @maxnum_nan_op0( -; CHECK-NEXT: ret double [[X:%.*]] -; - %r = call double @llvm.maxnum.f64(double 0x7ff8000000000000, double %x) - ret double %r -} - -define double @maxnum_nan_op1(double %x) { -; CHECK-LABEL: @maxnum_nan_op1( -; CHECK-NEXT: ret double [[X:%.*]] -; - %r = call double @llvm.maxnum.f64(double %x, double 0x7ff800000000dead) - ret double %r -} - -define double @minnum_nan_op0(double %x) { -; CHECK-LABEL: @minnum_nan_op0( -; CHECK-NEXT: ret double [[X:%.*]] -; - %r = call double @llvm.minnum.f64(double 0x7ff8000dead00000, double %x) - ret double %r -} - -define double @minnum_nan_op1(double %x) { -; CHECK-LABEL: @minnum_nan_op1( -; CHECK-NEXT: ret double [[X:%.*]] -; - %r = call double @llvm.minnum.f64(double %x, double 0x7ff800dead00dead) - ret double %r -} - -define <2 x double> @maxnum_nan_op0_vec(<2 x double> %x) { -; CHECK-LABEL: @maxnum_nan_op0_vec( -; CHECK-NEXT: ret <2 x double> [[X:%.*]] -; - %r = call <2 x double> @llvm.maxnum.v2f64(<2 x double> , <2 x double> %x) - ret <2 x double> %r -} - -define <2 x double> @maxnum_nan_op1_vec(<2 x double> %x) { -; CHECK-LABEL: @maxnum_nan_op1_vec( -; CHECK-NEXT: ret <2 x double> [[X:%.*]] -; - %r = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %x, <2 x double> ) - ret <2 x double> %r -} - -define <2 x double> @minnum_nan_op0_vec(<2 x double> %x) { -; CHECK-LABEL: @minnum_nan_op0_vec( -; CHECK-NEXT: ret <2 x double> [[X:%.*]] -; - %r = call <2 x double> @llvm.minnum.v2f64(<2 x double> , <2 x double> %x) - ret <2 x double> %r -} - -define <2 x double> @minnum_nan_op1_vec(<2 x double> %x) { -; CHECK-LABEL: @minnum_nan_op1_vec( -; CHECK-NEXT: ret <2 x double> [[X:%.*]] -; - %r = call <2 x double> @llvm.minnum.v2f64(<2 x double> %x, <2 x double> ) - ret <2 x double> %r -} - -define float @maxnum_undef_op1(float %x) { -; CHECK-LABEL: @maxnum_undef_op1( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.maxnum.f32(float %x, float undef) - ret float %val -} - -define float @maxnum_undef_op0(float %x) { -; CHECK-LABEL: @maxnum_undef_op0( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.maxnum.f32(float undef, float %x) - ret float %val -} - -define float @minnum_undef_op1(float %x) { -; CHECK-LABEL: @minnum_undef_op1( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.minnum.f32(float %x, float undef) - ret float %val -} - -define float @minnum_undef_op0(float %x) { -; CHECK-LABEL: @minnum_undef_op0( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.minnum.f32(float undef, float %x) - ret float %val -} - -define float @minnum_undef_undef(float %x) { -; CHECK-LABEL: @minnum_undef_undef( -; CHECK-NEXT: ret float undef -; - %val = call float @llvm.minnum.f32(float undef, float undef) - ret float %val -} - -define float @maxnum_undef_undef(float %x) { -; CHECK-LABEL: @maxnum_undef_undef( -; CHECK-NEXT: ret float undef -; - %val = call float @llvm.maxnum.f32(float undef, float undef) - ret float %val -} - -define float @minnum_same_args(float %x) { -; CHECK-LABEL: @minnum_same_args( -; CHECK-NEXT: ret float [[X:%.*]] -; - %y = call float @llvm.minnum.f32(float %x, float %x) - ret float %y -} - -define float @maxnum_same_args(float %x) { -; CHECK-LABEL: @maxnum_same_args( -; CHECK-NEXT: ret float [[X:%.*]] -; - %y = call float @llvm.maxnum.f32(float %x, float %x) - ret float %y -} - -define float @minnum_x_minnum_x_y(float %x, float %y) { -; CHECK-LABEL: @minnum_x_minnum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float 
@llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.minnum.f32(float %x, float %y) - %b = call float @llvm.minnum.f32(float %x, float %a) - ret float %b -} - -define float @minnum_y_minnum_x_y(float %x, float %y) { -; CHECK-LABEL: @minnum_y_minnum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.minnum.f32(float %x, float %y) - %b = call float @llvm.minnum.f32(float %y, float %a) - ret float %b -} - -define float @minnum_x_y_minnum_x(float %x, float %y) { -; CHECK-LABEL: @minnum_x_y_minnum_x( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.minnum.f32(float %x, float %y) - %b = call float @llvm.minnum.f32(float %a, float %x) - ret float %b -} - -define float @minnum_x_y_minnum_y(float %x, float %y) { -; CHECK-LABEL: @minnum_x_y_minnum_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.minnum.f32(float %x, float %y) - %b = call float @llvm.minnum.f32(float %a, float %y) - ret float %b -} - -; negative test - -define float @minnum_z_minnum_x_y(float %x, float %y, float %z) { -; CHECK-LABEL: @minnum_z_minnum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: [[B:%.*]] = call float @llvm.minnum.f32(float [[Z:%.*]], float [[A]]) -; CHECK-NEXT: ret float [[B]] -; - %a = call float @llvm.minnum.f32(float %x, float %y) - %b = call float @llvm.minnum.f32(float %z, float %a) - ret float %b -} - -; negative test - -define float @minnum_x_y_minnum_z(float %x, float %y, float %z) { -; CHECK-LABEL: @minnum_x_y_minnum_z( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: [[B:%.*]] = call float @llvm.minnum.f32(float [[A]], float [[Z:%.*]]) -; CHECK-NEXT: ret float [[B]] -; - %a = call float @llvm.minnum.f32(float %x, float %y) - %b = call float @llvm.minnum.f32(float %a, float %z) - ret float %b -} - -; minnum(X, -INF) --> -INF - -define float @minnum_neginf(float %x) { -; CHECK-LABEL: @minnum_neginf( -; CHECK-NEXT: ret float 0xFFF0000000000000 -; - %val = call float @llvm.minnum.f32(float %x, float 0xFFF0000000000000) - ret float %val -} - -define <2 x double> @minnum_neginf_commute_vec(<2 x double> %x) { -; CHECK-LABEL: @minnum_neginf_commute_vec( -; CHECK-NEXT: ret <2 x double> -; - %r = call <2 x double> @llvm.minnum.v2f64(<2 x double> , <2 x double> %x) - ret <2 x double> %r -} - -; negative test - -define float @minnum_inf(float %x) { -; CHECK-LABEL: @minnum_inf( -; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.minnum.f32(float 0x7FF0000000000000, float [[X:%.*]]) -; CHECK-NEXT: ret float [[VAL]] -; - %val = call float @llvm.minnum.f32(float 0x7FF0000000000000, float %x) - ret float %val -} -define float @maxnum_x_maxnum_x_y(float %x, float %y) { -; CHECK-LABEL: @maxnum_x_maxnum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.maxnum.f32(float %x, float %y) - %b = call float @llvm.maxnum.f32(float %x, float %a) - ret float %b -} - -define float @maxnum_y_maxnum_x_y(float %x, float %y) { -; CHECK-LABEL: @maxnum_y_maxnum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] 
-; - %a = call float @llvm.maxnum.f32(float %x, float %y) - %b = call float @llvm.maxnum.f32(float %y, float %a) - ret float %b -} - -define float @maxnum_x_y_maxnum_x(float %x, float %y) { -; CHECK-LABEL: @maxnum_x_y_maxnum_x( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.maxnum.f32(float %x, float %y) - %b = call float @llvm.maxnum.f32(float %a, float %x) - ret float %b -} - -define float @maxnum_x_y_maxnum_y(float %x, float %y) { -; CHECK-LABEL: @maxnum_x_y_maxnum_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.maxnum.f32(float %x, float %y) - %b = call float @llvm.maxnum.f32(float %a, float %y) - ret float %b -} - -; negative test - -define float @maxnum_z_maxnum_x_y(float %x, float %y, float %z) { -; CHECK-LABEL: @maxnum_z_maxnum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: [[B:%.*]] = call float @llvm.maxnum.f32(float [[Z:%.*]], float [[A]]) -; CHECK-NEXT: ret float [[B]] -; - %a = call float @llvm.maxnum.f32(float %x, float %y) - %b = call float @llvm.maxnum.f32(float %z, float %a) - ret float %b -} - -; negative test - -define float @maxnum_x_y_maxnum_z(float %x, float %y, float %z) { -; CHECK-LABEL: @maxnum_x_y_maxnum_z( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: [[B:%.*]] = call float @llvm.maxnum.f32(float [[A]], float [[Z:%.*]]) -; CHECK-NEXT: ret float [[B]] -; - %a = call float @llvm.maxnum.f32(float %x, float %y) - %b = call float @llvm.maxnum.f32(float %a, float %z) - ret float %b -} - -; maxnum(X, INF) --> INF - -define <2 x double> @maxnum_inf(<2 x double> %x) { -; CHECK-LABEL: @maxnum_inf( -; CHECK-NEXT: ret <2 x double> -; - %val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %x, <2 x double>) - ret <2 x double> %val -} - -define float @maxnum_inf_commute(float %x) { -; CHECK-LABEL: @maxnum_inf_commute( -; CHECK-NEXT: ret float 0x7FF0000000000000 -; - %val = call float @llvm.maxnum.f32(float 0x7FF0000000000000, float %x) - ret float %val -} - -; negative test - -define float @maxnum_neginf(float %x) { -; CHECK-LABEL: @maxnum_neginf( -; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.maxnum.f32(float 0xFFF0000000000000, float [[X:%.*]]) -; CHECK-NEXT: ret float [[VAL]] -; - %val = call float @llvm.maxnum.f32(float 0xFFF0000000000000, float %x) - ret float %val -} - -declare float @llvm.minimum.f32(float, float) -declare float @llvm.maximum.f32(float, float) -declare double @llvm.minimum.f64(double, double) -declare double @llvm.maximum.f64(double, double) -declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>) -declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>) - -; From the LangRef for minimum/maximum: -; "If either operand is a NaN, returns NaN." 
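;
; (Recap of the NaN-propagating semantics below: any NaN argument wins, e.g.
; maximum(NaN, %x) == NaN. Hence maximum(%x, +inf) may only fold to +inf
; under nnan, while minimum(%x, +inf) still folds to %x unconditionally,
; because a NaN %x makes both sides NaN; compare the fminmax-folds.ll tests
; added further down.)
;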
- -define double @maximum_nan_op0(double %x) { -; CHECK-LABEL: @maximum_nan_op0( -; CHECK-NEXT: ret double 0x7FF8000000000000 -; - %r = call double @llvm.maximum.f64(double 0x7ff8000000000000, double %x) - ret double %r -} - -define double @maximum_nan_op1(double %x) { -; CHECK-LABEL: @maximum_nan_op1( -; CHECK-NEXT: ret double 0x7FF800000000DEAD -; - %r = call double @llvm.maximum.f64(double %x, double 0x7ff800000000dead) - ret double %r -} - -define double @minimum_nan_op0(double %x) { -; CHECK-LABEL: @minimum_nan_op0( -; CHECK-NEXT: ret double 0x7FF8000DEAD00000 -; - %r = call double @llvm.minimum.f64(double 0x7ff8000dead00000, double %x) - ret double %r -} - -define double @minimum_nan_op1(double %x) { -; CHECK-LABEL: @minimum_nan_op1( -; CHECK-NEXT: ret double 0x7FF800DEAD00DEAD -; - %r = call double @llvm.minimum.f64(double %x, double 0x7ff800dead00dead) - ret double %r -} - -define <2 x double> @maximum_nan_op0_vec(<2 x double> %x) { -; CHECK-LABEL: @maximum_nan_op0_vec( -; CHECK-NEXT: ret <2 x double> -; - %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> , <2 x double> %x) - ret <2 x double> %r -} - -define <2 x double> @maximum_nan_op1_vec(<2 x double> %x) { -; CHECK-LABEL: @maximum_nan_op1_vec( -; CHECK-NEXT: ret <2 x double> -; - %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> ) - ret <2 x double> %r -} - -define <2 x double> @minimum_nan_op0_vec(<2 x double> %x) { -; CHECK-LABEL: @minimum_nan_op0_vec( -; CHECK-NEXT: ret <2 x double> -; - %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> , <2 x double> %x) - ret <2 x double> %r -} - -define <2 x double> @minimum_nan_op1_vec(<2 x double> %x) { -; CHECK-LABEL: @minimum_nan_op1_vec( -; CHECK-NEXT: ret <2 x double> -; - %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> ) - ret <2 x double> %r -} - -define float @maximum_undef_op1(float %x) { -; CHECK-LABEL: @maximum_undef_op1( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.maximum.f32(float %x, float undef) - ret float %val -} - -define float @maximum_undef_op0(float %x) { -; CHECK-LABEL: @maximum_undef_op0( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.maximum.f32(float undef, float %x) - ret float %val -} - -define float @minimum_undef_op1(float %x) { -; CHECK-LABEL: @minimum_undef_op1( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.minimum.f32(float %x, float undef) - ret float %val -} - -define float @minimum_undef_op0(float %x) { -; CHECK-LABEL: @minimum_undef_op0( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.minimum.f32(float undef, float %x) - ret float %val -} - -define float @minimum_undef_undef(float %x) { -; CHECK-LABEL: @minimum_undef_undef( -; CHECK-NEXT: ret float undef -; - %val = call float @llvm.minimum.f32(float undef, float undef) - ret float %val -} - -define float @maximum_undef_undef(float %x) { -; CHECK-LABEL: @maximum_undef_undef( -; CHECK-NEXT: ret float undef -; - %val = call float @llvm.maximum.f32(float undef, float undef) - ret float %val -} - -define float @minimum_same_args(float %x) { -; CHECK-LABEL: @minimum_same_args( -; CHECK-NEXT: ret float [[X:%.*]] -; - %y = call float @llvm.minimum.f32(float %x, float %x) - ret float %y -} - -define float @maximum_same_args(float %x) { -; CHECK-LABEL: @maximum_same_args( -; CHECK-NEXT: ret float [[X:%.*]] -; - %y = call float @llvm.maximum.f32(float %x, float %x) - ret float %y -} - -define float @minimum_x_minimum_x_y(float %x, float %y) { -; CHECK-LABEL: @minimum_x_minimum_x_y( 
-; CHECK-NEXT: [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.minimum.f32(float %x, float %y) - %b = call float @llvm.minimum.f32(float %x, float %a) - ret float %b -} - -define float @minimum_y_minimum_x_y(float %x, float %y) { -; CHECK-LABEL: @minimum_y_minimum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.minimum.f32(float %x, float %y) - %b = call float @llvm.minimum.f32(float %y, float %a) - ret float %b -} - -define float @minimum_x_y_minimum_x(float %x, float %y) { -; CHECK-LABEL: @minimum_x_y_minimum_x( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.minimum.f32(float %x, float %y) - %b = call float @llvm.minimum.f32(float %a, float %x) - ret float %b -} - -define float @minimum_x_y_minimum_y(float %x, float %y) { -; CHECK-LABEL: @minimum_x_y_minimum_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.minimum.f32(float %x, float %y) - %b = call float @llvm.minimum.f32(float %a, float %y) - ret float %b -} - -; negative test - -define float @minimum_z_minimum_x_y(float %x, float %y, float %z) { -; CHECK-LABEL: @minimum_z_minimum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: [[B:%.*]] = call float @llvm.minimum.f32(float [[Z:%.*]], float [[A]]) -; CHECK-NEXT: ret float [[B]] -; - %a = call float @llvm.minimum.f32(float %x, float %y) - %b = call float @llvm.minimum.f32(float %z, float %a) - ret float %b -} - -; negative test - -define float @minimum_x_y_minimum_z(float %x, float %y, float %z) { -; CHECK-LABEL: @minimum_x_y_minimum_z( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: [[B:%.*]] = call float @llvm.minimum.f32(float [[A]], float [[Z:%.*]]) -; CHECK-NEXT: ret float [[B]] -; - %a = call float @llvm.minimum.f32(float %x, float %y) - %b = call float @llvm.minimum.f32(float %a, float %z) - ret float %b -} - -; minimum(X, -INF) --> -INF - -define float @minimum_neginf(float %x) { -; CHECK-LABEL: @minimum_neginf( -; CHECK-NEXT: ret float 0xFFF0000000000000 -; - %val = call float @llvm.minimum.f32(float %x, float 0xFFF0000000000000) - ret float %val -} - -define <2 x double> @minimum_neginf_commute_vec(<2 x double> %x) { -; CHECK-LABEL: @minimum_neginf_commute_vec( -; CHECK-NEXT: ret <2 x double> -; - %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> , <2 x double> %x) - ret <2 x double> %r -} - -; negative test - -define float @minimum_inf(float %x) { -; CHECK-LABEL: @minimum_inf( -; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.minimum.f32(float 0x7FF0000000000000, float [[X:%.*]]) -; CHECK-NEXT: ret float [[VAL]] -; - %val = call float @llvm.minimum.f32(float 0x7FF0000000000000, float %x) - ret float %val -} -define float @maximum_x_maximum_x_y(float %x, float %y) { -; CHECK-LABEL: @maximum_x_maximum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.maximum.f32(float %x, float %y) - %b = call float @llvm.maximum.f32(float %x, float %a) - ret float %b -} - -define float @maximum_y_maximum_x_y(float %x, float %y) { -; CHECK-LABEL: @maximum_y_maximum_x_y( -; CHECK-NEXT: 
[[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.maximum.f32(float %x, float %y) - %b = call float @llvm.maximum.f32(float %y, float %a) - ret float %b -} - -define float @maximum_x_y_maximum_x(float %x, float %y) { -; CHECK-LABEL: @maximum_x_y_maximum_x( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.maximum.f32(float %x, float %y) - %b = call float @llvm.maximum.f32(float %a, float %x) - ret float %b -} - -define float @maximum_x_y_maximum_y(float %x, float %y) { -; CHECK-LABEL: @maximum_x_y_maximum_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.maximum.f32(float %x, float %y) - %b = call float @llvm.maximum.f32(float %a, float %y) - ret float %b -} - -; negative test - -define float @maximum_z_maximum_x_y(float %x, float %y, float %z) { -; CHECK-LABEL: @maximum_z_maximum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: [[B:%.*]] = call float @llvm.maximum.f32(float [[Z:%.*]], float [[A]]) -; CHECK-NEXT: ret float [[B]] -; - %a = call float @llvm.maximum.f32(float %x, float %y) - %b = call float @llvm.maximum.f32(float %z, float %a) - ret float %b -} - -; negative test - -define float @maximum_x_y_maximum_z(float %x, float %y, float %z) { -; CHECK-LABEL: @maximum_x_y_maximum_z( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: [[B:%.*]] = call float @llvm.maximum.f32(float [[A]], float [[Z:%.*]]) -; CHECK-NEXT: ret float [[B]] -; - %a = call float @llvm.maximum.f32(float %x, float %y) - %b = call float @llvm.maximum.f32(float %a, float %z) - ret float %b -} - -; maximum(X, INF) --> INF - -define <2 x double> @maximum_inf(<2 x double> %x) { -; CHECK-LABEL: @maximum_inf( -; CHECK-NEXT: ret <2 x double> -; - %val = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double>) - ret <2 x double> %val -} - -define float @maximum_inf_commute(float %x) { -; CHECK-LABEL: @maximum_inf_commute( -; CHECK-NEXT: ret float 0x7FF0000000000000 -; - %val = call float @llvm.maximum.f32(float 0x7FF0000000000000, float %x) - ret float %val -} - ; Y - (Y - X) --> X define float @fsub_fsub_common_op(float %x, float %y) { diff --git a/llvm/test/Transforms/InstSimplify/fminmax-folds.ll b/llvm/test/Transforms/InstSimplify/fminmax-folds.ll new file mode 100644 index 0000000000000..c62f76c87faef --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/fminmax-folds.ll @@ -0,0 +1,1089 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instsimplify -S | FileCheck %s + +declare float @llvm.minnum.f32(float, float) +declare float @llvm.maxnum.f32(float, float) +declare float @llvm.minimum.f32(float, float) +declare float @llvm.maximum.f32(float, float) +declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) +declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) +declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>) +declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>) + +declare double @llvm.minnum.f64(double, double) +declare double @llvm.maxnum.f64(double, double) +declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) +declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) +declare double 
@llvm.minimum.f64(double, double) +declare double @llvm.maximum.f64(double, double) +declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>) +declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>) + +define float @test_minnum_const_nan(float %x) { +; CHECK-LABEL: @test_minnum_const_nan( +; CHECK-NEXT: ret float [[X:%.*]] +; + %r = call float @llvm.minnum.f32(float %x, float 0x7fff000000000000) + ret float %r +} + +define float @test_maxnum_const_nan(float %x) { +; CHECK-LABEL: @test_maxnum_const_nan( +; CHECK-NEXT: ret float [[X:%.*]] +; + %r = call float @llvm.maxnum.f32(float %x, float 0x7fff000000000000) + ret float %r +} + +define float @test_maximum_const_nan(float %x) { +; CHECK-LABEL: @test_maximum_const_nan( +; CHECK-NEXT: ret float 0x7FFF000000000000 +; + %r = call float @llvm.maximum.f32(float %x, float 0x7fff000000000000) + ret float %r +} + +define float @test_minimum_const_nan(float %x) { +; CHECK-LABEL: @test_minimum_const_nan( +; CHECK-NEXT: ret float 0x7FFF000000000000 +; + %r = call float @llvm.minimum.f32(float %x, float 0x7fff000000000000) + ret float %r +} + +define float @test_minnum_const_inf(float %x) { +; CHECK-LABEL: @test_minnum_const_inf( +; CHECK-NEXT: [[R:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float 0x7FF0000000000000) +; CHECK-NEXT: ret float [[R]] +; + %r = call float @llvm.minnum.f32(float %x, float 0x7ff0000000000000) + ret float %r +} + +define float @test_maxnum_const_inf(float %x) { +; CHECK-LABEL: @test_maxnum_const_inf( +; CHECK-NEXT: ret float 0x7FF0000000000000 +; + %r = call float @llvm.maxnum.f32(float %x, float 0x7ff0000000000000) + ret float %r +} + +define float @test_maximum_const_inf(float %x) { +; CHECK-LABEL: @test_maximum_const_inf( +; CHECK-NEXT: [[R:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float 0x7FF0000000000000) +; CHECK-NEXT: ret float [[R]] +; + %r = call float @llvm.maximum.f32(float %x, float 0x7ff0000000000000) + ret float %r +} + +define float @test_minimum_const_inf(float %x) { +; CHECK-LABEL: @test_minimum_const_inf( +; CHECK-NEXT: ret float [[X:%.*]] +; + %r = call float @llvm.minimum.f32(float %x, float 0x7ff0000000000000) + ret float %r +} + +define float @test_minnum_const_neg_inf(float %x) { +; CHECK-LABEL: @test_minnum_const_neg_inf( +; CHECK-NEXT: ret float 0xFFF0000000000000 +; + %r = call float @llvm.minnum.f32(float %x, float 0xfff0000000000000) + ret float %r +} + +define float @test_maxnum_const_neg_inf(float %x) { +; CHECK-LABEL: @test_maxnum_const_neg_inf( +; CHECK-NEXT: [[R:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float 0xFFF0000000000000) +; CHECK-NEXT: ret float [[R]] +; + %r = call float @llvm.maxnum.f32(float %x, float 0xfff0000000000000) + ret float %r +} + +define float @test_maximum_const_neg_inf(float %x) { +; CHECK-LABEL: @test_maximum_const_neg_inf( +; CHECK-NEXT: ret float [[X:%.*]] +; + %r = call float @llvm.maximum.f32(float %x, float 0xfff0000000000000) + ret float %r +} + +define float @test_minimum_const_neg_inf(float %x) { +; CHECK-LABEL: @test_minimum_const_neg_inf( +; CHECK-NEXT: [[R:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float 0xFFF0000000000000) +; CHECK-NEXT: ret float [[R]] +; + %r = call float @llvm.minimum.f32(float %x, float 0xfff0000000000000) + ret float %r +} + +define float @test_minnum_const_inf_nnan(float %x) { +; CHECK-LABEL: @test_minnum_const_inf_nnan( +; CHECK-NEXT: ret float [[X:%.*]] +; + %r = call nnan float @llvm.minnum.f32(float %x, float 0x7ff0000000000000) + ret float %r +} + +define float 
@test_maxnum_const_inf_nnan(float %x) { +; CHECK-LABEL: @test_maxnum_const_inf_nnan( +; CHECK-NEXT: ret float 0x7FF0000000000000 +; + %r = call nnan float @llvm.maxnum.f32(float %x, float 0x7ff0000000000000) + ret float %r +} + +define float @test_maximum_const_inf_nnan(float %x) { +; CHECK-LABEL: @test_maximum_const_inf_nnan( +; CHECK-NEXT: ret float 0x7FF0000000000000 +; + %r = call nnan float @llvm.maximum.f32(float %x, float 0x7ff0000000000000) + ret float %r +} + +define float @test_minimum_const_inf_nnan(float %x) { +; CHECK-LABEL: @test_minimum_const_inf_nnan( +; CHECK-NEXT: ret float [[X:%.*]] +; + %r = call nnan float @llvm.minimum.f32(float %x, float 0x7ff0000000000000) + ret float %r +} + +define float @test_minnum_const_inf_nnan_comm(float %x) { +; CHECK-LABEL: @test_minnum_const_inf_nnan_comm( +; CHECK-NEXT: ret float [[X:%.*]] +; + %r = call nnan float @llvm.minnum.f32(float 0x7ff0000000000000, float %x) + ret float %r +} + +define float @test_maxnum_const_inf_nnan_comm(float %x) { +; CHECK-LABEL: @test_maxnum_const_inf_nnan_comm( +; CHECK-NEXT: ret float 0x7FF0000000000000 +; + %r = call nnan float @llvm.maxnum.f32(float 0x7ff0000000000000, float %x) + ret float %r +} + +define float @test_maximum_const_inf_nnan_comm(float %x) { +; CHECK-LABEL: @test_maximum_const_inf_nnan_comm( +; CHECK-NEXT: ret float 0x7FF0000000000000 +; + %r = call nnan float @llvm.maximum.f32(float 0x7ff0000000000000, float %x) + ret float %r +} + +define float @test_minimum_const_inf_nnan_comm(float %x) { +; CHECK-LABEL: @test_minimum_const_inf_nnan_comm( +; CHECK-NEXT: ret float [[X:%.*]] +; + %r = call nnan float @llvm.minimum.f32(float 0x7ff0000000000000, float %x) + ret float %r +} + +define <2 x float> @test_minnum_const_inf_nnan_comm_vec(<2 x float> %x) { +; CHECK-LABEL: @test_minnum_const_inf_nnan_comm_vec( +; CHECK-NEXT: ret <2 x float> [[X:%.*]] +; + %r = call nnan <2 x float> @llvm.minnum.v2f32(<2 x float> <float 0x7ff0000000000000, float 0x7ff0000000000000>, <2 x float> %x) + ret <2 x float> %r +} + +define <2 x float> @test_maxnum_const_inf_nnan_comm_vec(<2 x float> %x) { +; CHECK-LABEL: @test_maxnum_const_inf_nnan_comm_vec( +; CHECK-NEXT: ret <2 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000> +; + %r = call nnan <2 x float> @llvm.maxnum.v2f32(<2 x float> <float 0x7ff0000000000000, float 0x7ff0000000000000>, <2 x float> %x) + ret <2 x float> %r +} + +define <2 x float> @test_maximum_const_inf_nnan_comm_vec(<2 x float> %x) { +; CHECK-LABEL: @test_maximum_const_inf_nnan_comm_vec( +; CHECK-NEXT: ret <2 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000> +; + %r = call nnan <2 x float> @llvm.maximum.v2f32(<2 x float> <float 0x7ff0000000000000, float 0x7ff0000000000000>, <2 x float> %x) + ret <2 x float> %r +} + +define <2 x float> @test_minimum_const_inf_nnan_comm_vec(<2 x float> %x) { +; CHECK-LABEL: @test_minimum_const_inf_nnan_comm_vec( +; CHECK-NEXT: ret <2 x float> [[X:%.*]] +; + %r = call nnan <2 x float> @llvm.minimum.v2f32(<2 x float> <float 0x7ff0000000000000, float 0x7ff0000000000000>, <2 x float> %x) + ret <2 x float> %r +} + +define float @test_minnum_const_neg_inf_nnan(float %x) { +; CHECK-LABEL: @test_minnum_const_neg_inf_nnan( +; CHECK-NEXT: ret float 0xFFF0000000000000 +; + %r = call nnan float @llvm.minnum.f32(float %x, float 0xfff0000000000000) + ret float %r +} + +define float @test_maxnum_const_neg_inf_nnan(float %x) { +; CHECK-LABEL: @test_maxnum_const_neg_inf_nnan( +; CHECK-NEXT: ret float [[X:%.*]] +; + %r = call nnan float @llvm.maxnum.f32(float %x, float 0xfff0000000000000) + ret float %r +} + +define float @test_maximum_const_neg_inf_nnan(float %x) { +; CHECK-LABEL: @test_maximum_const_neg_inf_nnan( +; CHECK-NEXT: ret float [[X:%.*]] +; + %r = call nnan float @llvm.maximum.f32(float %x, float 0xfff0000000000000) + ret float %r +} + +define float 
@test_minimum_const_neg_inf_nnan(float %x) { +; CHECK-LABEL: @test_minimum_const_neg_inf_nnan( +; CHECK-NEXT: ret float 0xFFF0000000000000 +; + %r = call nnan float @llvm.minimum.f32(float %x, float 0xfff0000000000000) + ret float %r +} + +define float @test_minnum_const_max(float %x) { +; CHECK-LABEL: @test_minnum_const_max( +; CHECK-NEXT: [[R:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float 0x47EFFFFFE0000000) +; CHECK-NEXT: ret float [[R]] +; + %r = call float @llvm.minnum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_maxnum_const_max(float %x) { +; CHECK-LABEL: @test_maxnum_const_max( +; CHECK-NEXT: [[R:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float 0x47EFFFFFE0000000) +; CHECK-NEXT: ret float [[R]] +; + %r = call float @llvm.maxnum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_maximum_const_max(float %x) { +; CHECK-LABEL: @test_maximum_const_max( +; CHECK-NEXT: [[R:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float 0x47EFFFFFE0000000) +; CHECK-NEXT: ret float [[R]] +; + %r = call float @llvm.maximum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_minimum_const_max(float %x) { +; CHECK-LABEL: @test_minimum_const_max( +; CHECK-NEXT: [[R:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float 0x47EFFFFFE0000000) +; CHECK-NEXT: ret float [[R]] +; + %r = call float @llvm.minimum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_minnum_const_neg_max(float %x) { +; CHECK-LABEL: @test_minnum_const_neg_max( +; CHECK-NEXT: [[R:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float 0xC7EFFFFFE0000000) +; CHECK-NEXT: ret float [[R]] +; + %r = call float @llvm.minnum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_maxnum_const_neg_max(float %x) { +; CHECK-LABEL: @test_maxnum_const_neg_max( +; CHECK-NEXT: [[R:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float 0xC7EFFFFFE0000000) +; CHECK-NEXT: ret float [[R]] +; + %r = call float @llvm.maxnum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_maximum_const_neg_max(float %x) { +; CHECK-LABEL: @test_maximum_const_neg_max( +; CHECK-NEXT: [[R:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float 0xC7EFFFFFE0000000) +; CHECK-NEXT: ret float [[R]] +; + %r = call float @llvm.maximum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_minimum_const_neg_max(float %x) { +; CHECK-LABEL: @test_minimum_const_neg_max( +; CHECK-NEXT: [[R:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float 0xC7EFFFFFE0000000) +; CHECK-NEXT: ret float [[R]] +; + %r = call float @llvm.minimum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_minnum_const_max_ninf(float %x) { +; CHECK-LABEL: @test_minnum_const_max_ninf( +; CHECK-NEXT: [[R:%.*]] = call ninf float @llvm.minnum.f32(float [[X:%.*]], float 0x47EFFFFFE0000000) +; CHECK-NEXT: ret float [[R]] +; + %r = call ninf float @llvm.minnum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_maxnum_const_max_ninf(float %x) { +; CHECK-LABEL: @test_maxnum_const_max_ninf( +; CHECK-NEXT: ret float 0x47EFFFFFE0000000 +; + %r = call ninf float @llvm.maxnum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_maximum_const_max_ninf(float %x) { +; CHECK-LABEL: @test_maximum_const_max_ninf( +; CHECK-NEXT: [[R:%.*]] = call ninf float @llvm.maximum.f32(float [[X:%.*]], float 
0x47EFFFFFE0000000) +; CHECK-NEXT: ret float [[R]] +; + %r = call ninf float @llvm.maximum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_minimum_const_max_ninf(float %x) { +; CHECK-LABEL: @test_minimum_const_max_ninf( +; CHECK-NEXT: ret float [[X:%.*]] +; + %r = call ninf float @llvm.minimum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_minnum_const_neg_max_ninf(float %x) { +; CHECK-LABEL: @test_minnum_const_neg_max_ninf( +; CHECK-NEXT: ret float 0xC7EFFFFFE0000000 +; + %r = call ninf float @llvm.minnum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_maxnum_const_neg_max_ninf(float %x) { +; CHECK-LABEL: @test_maxnum_const_neg_max_ninf( +; CHECK-NEXT: [[R:%.*]] = call ninf float @llvm.maxnum.f32(float [[X:%.*]], float 0xC7EFFFFFE0000000) +; CHECK-NEXT: ret float [[R]] +; + %r = call ninf float @llvm.maxnum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_maximum_const_neg_max_ninf(float %x) { +; CHECK-LABEL: @test_maximum_const_neg_max_ninf( +; CHECK-NEXT: ret float [[X:%.*]] +; + %r = call ninf float @llvm.maximum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_minimum_const_neg_max_ninf(float %x) { +; CHECK-LABEL: @test_minimum_const_neg_max_ninf( +; CHECK-NEXT: [[R:%.*]] = call ninf float @llvm.minimum.f32(float [[X:%.*]], float 0xC7EFFFFFE0000000) +; CHECK-NEXT: ret float [[R]] +; + %r = call ninf float @llvm.minimum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_minnum_const_max_nnan_ninf(float %x) { +; CHECK-LABEL: @test_minnum_const_max_nnan_ninf( +; CHECK-NEXT: ret float [[X:%.*]] +; + %r = call nnan ninf float @llvm.minnum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_maxnum_const_max_nnan_ninf(float %x) { +; CHECK-LABEL: @test_maxnum_const_max_nnan_ninf( +; CHECK-NEXT: ret float 0x47EFFFFFE0000000 +; + %r = call nnan ninf float @llvm.maxnum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_maximum_const_max_nnan_ninf(float %x) { +; CHECK-LABEL: @test_maximum_const_max_nnan_ninf( +; CHECK-NEXT: ret float 0x47EFFFFFE0000000 +; + %r = call nnan ninf float @llvm.maximum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_minimum_const_max_nnan_ninf(float %x) { +; CHECK-LABEL: @test_minimum_const_max_nnan_ninf( +; CHECK-NEXT: ret float [[X:%.*]] +; + %r = call nnan ninf float @llvm.minimum.f32(float %x, float 0x47efffffe0000000) + ret float %r +} + +define float @test_minnum_const_neg_max_nnan_ninf(float %x) { +; CHECK-LABEL: @test_minnum_const_neg_max_nnan_ninf( +; CHECK-NEXT: ret float 0xC7EFFFFFE0000000 +; + %r = call nnan ninf float @llvm.minnum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_maxnum_const_neg_max_nnan_ninf(float %x) { +; CHECK-LABEL: @test_maxnum_const_neg_max_nnan_ninf( +; CHECK-NEXT: ret float [[X:%.*]] +; + %r = call nnan ninf float @llvm.maxnum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_maximum_const_neg_max_nnan_ninf(float %x) { +; CHECK-LABEL: @test_maximum_const_neg_max_nnan_ninf( +; CHECK-NEXT: ret float [[X:%.*]] +; + %r = call nnan ninf float @llvm.maximum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +define float @test_minimum_const_neg_max_nnan_ninf(float %x) { +; CHECK-LABEL: @test_minimum_const_neg_max_nnan_ninf( +; CHECK-NEXT: ret float 0xC7EFFFFFE0000000 +; + %r = call nnan ninf float 
@llvm.minimum.f32(float %x, float 0xc7efffffe0000000) + ret float %r +} + +; From the LangRef for minnum/maxnum: +; "If either operand is a NaN, returns the other non-NaN operand." + +define double @maxnum_nan_op0(double %x) { +; CHECK-LABEL: @maxnum_nan_op0( +; CHECK-NEXT: ret double [[X:%.*]] +; + %r = call double @llvm.maxnum.f64(double 0x7ff8000000000000, double %x) + ret double %r +} + +define double @maxnum_nan_op1(double %x) { +; CHECK-LABEL: @maxnum_nan_op1( +; CHECK-NEXT: ret double [[X:%.*]] +; + %r = call double @llvm.maxnum.f64(double %x, double 0x7ff800000000dead) + ret double %r +} + +define double @minnum_nan_op0(double %x) { +; CHECK-LABEL: @minnum_nan_op0( +; CHECK-NEXT: ret double [[X:%.*]] +; + %r = call double @llvm.minnum.f64(double 0x7ff8000dead00000, double %x) + ret double %r +} + +define double @minnum_nan_op1(double %x) { +; CHECK-LABEL: @minnum_nan_op1( +; CHECK-NEXT: ret double [[X:%.*]] +; + %r = call double @llvm.minnum.f64(double %x, double 0x7ff800dead00dead) + ret double %r +} + +define <2 x double> @maxnum_nan_op0_vec(<2 x double> %x) { +; CHECK-LABEL: @maxnum_nan_op0_vec( +; CHECK-NEXT: ret <2 x double> [[X:%.*]] +; + %r = call <2 x double> @llvm.maxnum.v2f64(<2 x double> <double 0x7ff8000000000000, double 0x7ff8000000000000>, <2 x double> %x) + ret <2 x double> %r +} + +define <2 x double> @maxnum_nan_op1_vec(<2 x double> %x) { +; CHECK-LABEL: @maxnum_nan_op1_vec( +; CHECK-NEXT: ret <2 x double> [[X:%.*]] +; + %r = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %x, <2 x double> <double 0x7ff800000000dead, double 0x7ff800000000dead>) + ret <2 x double> %r +} + +define <2 x double> @minnum_nan_op0_vec(<2 x double> %x) { +; CHECK-LABEL: @minnum_nan_op0_vec( +; CHECK-NEXT: ret <2 x double> [[X:%.*]] +; + %r = call <2 x double> @llvm.minnum.v2f64(<2 x double> <double 0x7ff8000dead00000, double 0x7ff8000dead00000>, <2 x double> %x) + ret <2 x double> %r +} + +define <2 x double> @minnum_nan_op1_vec(<2 x double> %x) { +; CHECK-LABEL: @minnum_nan_op1_vec( +; CHECK-NEXT: ret <2 x double> [[X:%.*]] +; + %r = call <2 x double> @llvm.minnum.v2f64(<2 x double> %x, <2 x double> <double 0x7ff800dead00dead, double 0x7ff800dead00dead>) + ret <2 x double> %r +} + +define float @maxnum_undef_op1(float %x) { +; CHECK-LABEL: @maxnum_undef_op1( +; CHECK-NEXT: ret float [[X:%.*]] +; + %val = call float @llvm.maxnum.f32(float %x, float undef) + ret float %val +} + +define float @maxnum_undef_op0(float %x) { +; CHECK-LABEL: @maxnum_undef_op0( +; CHECK-NEXT: ret float [[X:%.*]] +; + %val = call float @llvm.maxnum.f32(float undef, float %x) + ret float %val +} + +define float @minnum_undef_op1(float %x) { +; CHECK-LABEL: @minnum_undef_op1( +; CHECK-NEXT: ret float [[X:%.*]] +; + %val = call float @llvm.minnum.f32(float %x, float undef) + ret float %val +} + +define float @minnum_undef_op0(float %x) { +; CHECK-LABEL: @minnum_undef_op0( +; CHECK-NEXT: ret float [[X:%.*]] +; + %val = call float @llvm.minnum.f32(float undef, float %x) + ret float %val +} + +define float @minnum_undef_undef(float %x) { +; CHECK-LABEL: @minnum_undef_undef( +; CHECK-NEXT: ret float undef +; + %val = call float @llvm.minnum.f32(float undef, float undef) + ret float %val +} + +define float @maxnum_undef_undef(float %x) { +; CHECK-LABEL: @maxnum_undef_undef( +; CHECK-NEXT: ret float undef +; + %val = call float @llvm.maxnum.f32(float undef, float undef) + ret float %val +} + +define float @minnum_same_args(float %x) { +; CHECK-LABEL: @minnum_same_args( +; CHECK-NEXT: ret float [[X:%.*]] +; + %y = call float @llvm.minnum.f32(float %x, float %x) + ret float %y +} + +define float @maxnum_same_args(float %x) { +; CHECK-LABEL: @maxnum_same_args( +; CHECK-NEXT: ret float [[X:%.*]] +; + %y = call float 
@llvm.maxnum.f32(float %x, float %x) + ret float %y +} + +define float @minnum_x_minnum_x_y(float %x, float %y) { +; CHECK-LABEL: @minnum_x_minnum_x_y( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: ret float [[A]] +; + %a = call float @llvm.minnum.f32(float %x, float %y) + %b = call float @llvm.minnum.f32(float %x, float %a) + ret float %b +} + +define float @minnum_y_minnum_x_y(float %x, float %y) { +; CHECK-LABEL: @minnum_y_minnum_x_y( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: ret float [[A]] +; + %a = call float @llvm.minnum.f32(float %x, float %y) + %b = call float @llvm.minnum.f32(float %y, float %a) + ret float %b +} + +define float @minnum_x_y_minnum_x(float %x, float %y) { +; CHECK-LABEL: @minnum_x_y_minnum_x( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: ret float [[A]] +; + %a = call float @llvm.minnum.f32(float %x, float %y) + %b = call float @llvm.minnum.f32(float %a, float %x) + ret float %b +} + +define float @minnum_x_y_minnum_y(float %x, float %y) { +; CHECK-LABEL: @minnum_x_y_minnum_y( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: ret float [[A]] +; + %a = call float @llvm.minnum.f32(float %x, float %y) + %b = call float @llvm.minnum.f32(float %a, float %y) + ret float %b +} + +; negative test + +define float @minnum_z_minnum_x_y(float %x, float %y, float %z) { +; CHECK-LABEL: @minnum_z_minnum_x_y( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: [[B:%.*]] = call float @llvm.minnum.f32(float [[Z:%.*]], float [[A]]) +; CHECK-NEXT: ret float [[B]] +; + %a = call float @llvm.minnum.f32(float %x, float %y) + %b = call float @llvm.minnum.f32(float %z, float %a) + ret float %b +} + +; negative test + +define float @minnum_x_y_minnum_z(float %x, float %y, float %z) { +; CHECK-LABEL: @minnum_x_y_minnum_z( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: [[B:%.*]] = call float @llvm.minnum.f32(float [[A]], float [[Z:%.*]]) +; CHECK-NEXT: ret float [[B]] +; + %a = call float @llvm.minnum.f32(float %x, float %y) + %b = call float @llvm.minnum.f32(float %a, float %z) + ret float %b +} + +; minnum(X, -INF) --> -INF + +define float @minnum_neginf(float %x) { +; CHECK-LABEL: @minnum_neginf( +; CHECK-NEXT: ret float 0xFFF0000000000000 +; + %val = call float @llvm.minnum.f32(float %x, float 0xFFF0000000000000) + ret float %val +} + +define <2 x double> @minnum_neginf_commute_vec(<2 x double> %x) { +; CHECK-LABEL: @minnum_neginf_commute_vec( +; CHECK-NEXT: ret <2 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000> +; + %r = call <2 x double> @llvm.minnum.v2f64(<2 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000>, <2 x double> %x) + ret <2 x double> %r +} + +; negative test + +define float @minnum_inf(float %x) { +; CHECK-LABEL: @minnum_inf( +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.minnum.f32(float 0x7FF0000000000000, float [[X:%.*]]) +; CHECK-NEXT: ret float [[VAL]] +; + %val = call float @llvm.minnum.f32(float 0x7FF0000000000000, float %x) + ret float %val +} +define float @maxnum_x_maxnum_x_y(float %x, float %y) { +; CHECK-LABEL: @maxnum_x_maxnum_x_y( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: ret float [[A]] +; + %a = call float @llvm.maxnum.f32(float %x, float %y) + %b = call float @llvm.maxnum.f32(float %x, float %a) + ret float %b +} + +define float 
@maxnum_y_maxnum_x_y(float %x, float %y) { +; CHECK-LABEL: @maxnum_y_maxnum_x_y( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: ret float [[A]] +; + %a = call float @llvm.maxnum.f32(float %x, float %y) + %b = call float @llvm.maxnum.f32(float %y, float %a) + ret float %b +} + +define float @maxnum_x_y_maxnum_x(float %x, float %y) { +; CHECK-LABEL: @maxnum_x_y_maxnum_x( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: ret float [[A]] +; + %a = call float @llvm.maxnum.f32(float %x, float %y) + %b = call float @llvm.maxnum.f32(float %a, float %x) + ret float %b +} + +define float @maxnum_x_y_maxnum_y(float %x, float %y) { +; CHECK-LABEL: @maxnum_x_y_maxnum_y( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: ret float [[A]] +; + %a = call float @llvm.maxnum.f32(float %x, float %y) + %b = call float @llvm.maxnum.f32(float %a, float %y) + ret float %b +} + +; negative test + +define float @maxnum_z_maxnum_x_y(float %x, float %y, float %z) { +; CHECK-LABEL: @maxnum_z_maxnum_x_y( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: [[B:%.*]] = call float @llvm.maxnum.f32(float [[Z:%.*]], float [[A]]) +; CHECK-NEXT: ret float [[B]] +; + %a = call float @llvm.maxnum.f32(float %x, float %y) + %b = call float @llvm.maxnum.f32(float %z, float %a) + ret float %b +} + +; negative test + +define float @maxnum_x_y_maxnum_z(float %x, float %y, float %z) { +; CHECK-LABEL: @maxnum_x_y_maxnum_z( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: [[B:%.*]] = call float @llvm.maxnum.f32(float [[A]], float [[Z:%.*]]) +; CHECK-NEXT: ret float [[B]] +; + %a = call float @llvm.maxnum.f32(float %x, float %y) + %b = call float @llvm.maxnum.f32(float %a, float %z) + ret float %b +} + +; maxnum(X, INF) --> INF + +define <2 x double> @maxnum_inf(<2 x double> %x) { +; CHECK-LABEL: @maxnum_inf( +; CHECK-NEXT: ret <2 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000> +; + %val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %x, <2 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000>) + ret <2 x double> %val +} + +define float @maxnum_inf_commute(float %x) { +; CHECK-LABEL: @maxnum_inf_commute( +; CHECK-NEXT: ret float 0x7FF0000000000000 +; + %val = call float @llvm.maxnum.f32(float 0x7FF0000000000000, float %x) + ret float %val +} + +; negative test + +define float @maxnum_neginf(float %x) { +; CHECK-LABEL: @maxnum_neginf( +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.maxnum.f32(float 0xFFF0000000000000, float [[X:%.*]]) +; CHECK-NEXT: ret float [[VAL]] +; + %val = call float @llvm.maxnum.f32(float 0xFFF0000000000000, float %x) + ret float %val +} + +; From the LangRef for minimum/maximum: +; "If either operand is a NaN, returns NaN." 
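+; For example (illustrative values, not from the tests below): maxnum(NaN, 1.0) --> 1.0, +; since minnum/maxnum ignore a NaN operand, while maximum(NaN, 1.0) --> NaN, +; since minimum/maximum propagate it.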
+ +define double @maximum_nan_op0(double %x) { +; CHECK-LABEL: @maximum_nan_op0( +; CHECK-NEXT: ret double 0x7FF8000000000000 +; + %r = call double @llvm.maximum.f64(double 0x7ff8000000000000, double %x) + ret double %r +} + +define double @maximum_nan_op1(double %x) { +; CHECK-LABEL: @maximum_nan_op1( +; CHECK-NEXT: ret double 0x7FF800000000DEAD +; + %r = call double @llvm.maximum.f64(double %x, double 0x7ff800000000dead) + ret double %r +} + +define double @minimum_nan_op0(double %x) { +; CHECK-LABEL: @minimum_nan_op0( +; CHECK-NEXT: ret double 0x7FF8000DEAD00000 +; + %r = call double @llvm.minimum.f64(double 0x7ff8000dead00000, double %x) + ret double %r +} + +define double @minimum_nan_op1(double %x) { +; CHECK-LABEL: @minimum_nan_op1( +; CHECK-NEXT: ret double 0x7FF800DEAD00DEAD +; + %r = call double @llvm.minimum.f64(double %x, double 0x7ff800dead00dead) + ret double %r +} + +define <2 x double> @maximum_nan_op0_vec(<2 x double> %x) { +; CHECK-LABEL: @maximum_nan_op0_vec( +; CHECK-NEXT: ret <2 x double> <double 0x7FF8000000000000, double 0x7FF8000000000000> +; + %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> <double 0x7ff8000000000000, double 0x7ff8000000000000>, <2 x double> %x) + ret <2 x double> %r +} + +define <2 x double> @maximum_nan_op1_vec(<2 x double> %x) { +; CHECK-LABEL: @maximum_nan_op1_vec( +; CHECK-NEXT: ret <2 x double> <double 0x7FF800000000DEAD, double 0x7FF800000000DEAD> +; + %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> <double 0x7ff800000000dead, double 0x7ff800000000dead>) + ret <2 x double> %r +} + +define <2 x double> @minimum_nan_op0_vec(<2 x double> %x) { +; CHECK-LABEL: @minimum_nan_op0_vec( +; CHECK-NEXT: ret <2 x double> <double 0x7FF8000DEAD00000, double 0x7FF8000DEAD00000> +; + %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> <double 0x7ff8000dead00000, double 0x7ff8000dead00000>, <2 x double> %x) + ret <2 x double> %r +} + +define <2 x double> @minimum_nan_op1_vec(<2 x double> %x) { +; CHECK-LABEL: @minimum_nan_op1_vec( +; CHECK-NEXT: ret <2 x double> <double 0x7FF800DEAD00DEAD, double 0x7FF800DEAD00DEAD> +; + %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> <double 0x7ff800dead00dead, double 0x7ff800dead00dead>) + ret <2 x double> %r +} + +define float @maximum_undef_op1(float %x) { +; CHECK-LABEL: @maximum_undef_op1( +; CHECK-NEXT: ret float [[X:%.*]] +; + %val = call float @llvm.maximum.f32(float %x, float undef) + ret float %val +} + +define float @maximum_undef_op0(float %x) { +; CHECK-LABEL: @maximum_undef_op0( +; CHECK-NEXT: ret float [[X:%.*]] +; + %val = call float @llvm.maximum.f32(float undef, float %x) + ret float %val +} + +define float @minimum_undef_op1(float %x) { +; CHECK-LABEL: @minimum_undef_op1( +; CHECK-NEXT: ret float [[X:%.*]] +; + %val = call float @llvm.minimum.f32(float %x, float undef) + ret float %val +} + +define float @minimum_undef_op0(float %x) { +; CHECK-LABEL: @minimum_undef_op0( +; CHECK-NEXT: ret float [[X:%.*]] +; + %val = call float @llvm.minimum.f32(float undef, float %x) + ret float %val +} + +define float @minimum_undef_undef(float %x) { +; CHECK-LABEL: @minimum_undef_undef( +; CHECK-NEXT: ret float undef +; + %val = call float @llvm.minimum.f32(float undef, float undef) + ret float %val +} + +define float @maximum_undef_undef(float %x) { +; CHECK-LABEL: @maximum_undef_undef( +; CHECK-NEXT: ret float undef +; + %val = call float @llvm.maximum.f32(float undef, float undef) + ret float %val +} + +define float @minimum_same_args(float %x) { +; CHECK-LABEL: @minimum_same_args( +; CHECK-NEXT: ret float [[X:%.*]] +; + %y = call float @llvm.minimum.f32(float %x, float %x) + ret float %y +} + +define float @maximum_same_args(float %x) { +; CHECK-LABEL: @maximum_same_args( +; CHECK-NEXT: ret float [[X:%.*]] +; + %y = call float @llvm.maximum.f32(float %x, float %x) + ret float %y +} + +define float @minimum_x_minimum_x_y(float %x, float %y) { +; CHECK-LABEL: @minimum_x_minimum_x_y( 
+; CHECK-NEXT: [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: ret float [[A]] +; + %a = call float @llvm.minimum.f32(float %x, float %y) + %b = call float @llvm.minimum.f32(float %x, float %a) + ret float %b +} + +define float @minimum_y_minimum_x_y(float %x, float %y) { +; CHECK-LABEL: @minimum_y_minimum_x_y( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: ret float [[A]] +; + %a = call float @llvm.minimum.f32(float %x, float %y) + %b = call float @llvm.minimum.f32(float %y, float %a) + ret float %b +} + +define float @minimum_x_y_minimum_x(float %x, float %y) { +; CHECK-LABEL: @minimum_x_y_minimum_x( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: ret float [[A]] +; + %a = call float @llvm.minimum.f32(float %x, float %y) + %b = call float @llvm.minimum.f32(float %a, float %x) + ret float %b +} + +define float @minimum_x_y_minimum_y(float %x, float %y) { +; CHECK-LABEL: @minimum_x_y_minimum_y( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: ret float [[A]] +; + %a = call float @llvm.minimum.f32(float %x, float %y) + %b = call float @llvm.minimum.f32(float %a, float %y) + ret float %b +} + +; negative test + +define float @minimum_z_minimum_x_y(float %x, float %y, float %z) { +; CHECK-LABEL: @minimum_z_minimum_x_y( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: [[B:%.*]] = call float @llvm.minimum.f32(float [[Z:%.*]], float [[A]]) +; CHECK-NEXT: ret float [[B]] +; + %a = call float @llvm.minimum.f32(float %x, float %y) + %b = call float @llvm.minimum.f32(float %z, float %a) + ret float %b +} + +; negative test + +define float @minimum_x_y_minimum_z(float %x, float %y, float %z) { +; CHECK-LABEL: @minimum_x_y_minimum_z( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: [[B:%.*]] = call float @llvm.minimum.f32(float [[A]], float [[Z:%.*]]) +; CHECK-NEXT: ret float [[B]] +; + %a = call float @llvm.minimum.f32(float %x, float %y) + %b = call float @llvm.minimum.f32(float %a, float %z) + ret float %b +} + +define float @maximum_x_maximum_x_y(float %x, float %y) { +; CHECK-LABEL: @maximum_x_maximum_x_y( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: ret float [[A]] +; + %a = call float @llvm.maximum.f32(float %x, float %y) + %b = call float @llvm.maximum.f32(float %x, float %a) + ret float %b +} + +define float @maximum_y_maximum_x_y(float %x, float %y) { +; CHECK-LABEL: @maximum_y_maximum_x_y( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: ret float [[A]] +; + %a = call float @llvm.maximum.f32(float %x, float %y) + %b = call float @llvm.maximum.f32(float %y, float %a) + ret float %b +} + +define float @maximum_x_y_maximum_x(float %x, float %y) { +; CHECK-LABEL: @maximum_x_y_maximum_x( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: ret float [[A]] +; + %a = call float @llvm.maximum.f32(float %x, float %y) + %b = call float @llvm.maximum.f32(float %a, float %x) + ret float %b +} + +define float @maximum_x_y_maximum_y(float %x, float %y) { +; CHECK-LABEL: @maximum_x_y_maximum_y( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: ret float [[A]] +; + %a 
= call float @llvm.maximum.f32(float %x, float %y) + %b = call float @llvm.maximum.f32(float %a, float %y) + ret float %b +} + +; negative test + +define float @maximum_z_maximum_x_y(float %x, float %y, float %z) { +; CHECK-LABEL: @maximum_z_maximum_x_y( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: [[B:%.*]] = call float @llvm.maximum.f32(float [[Z:%.*]], float [[A]]) +; CHECK-NEXT: ret float [[B]] +; + %a = call float @llvm.maximum.f32(float %x, float %y) + %b = call float @llvm.maximum.f32(float %z, float %a) + ret float %b +} + +; negative test + +define float @maximum_x_y_maximum_z(float %x, float %y, float %z) { +; CHECK-LABEL: @maximum_x_y_maximum_z( +; CHECK-NEXT: [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: [[B:%.*]] = call float @llvm.maximum.f32(float [[A]], float [[Z:%.*]]) +; CHECK-NEXT: ret float [[B]] +; + %a = call float @llvm.maximum.f32(float %x, float %y) + %b = call float @llvm.maximum.f32(float %a, float %z) + ret float %b +} + +; negative test - minimum(X, -INF) != -INF because X could be NaN + +define float @minimum_neginf(float %x) { +; CHECK-LABEL: @minimum_neginf( +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float 0xFFF0000000000000) +; CHECK-NEXT: ret float [[VAL]] +; + %val = call float @llvm.minimum.f32(float %x, float 0xFFF0000000000000) + ret float %val +} + +; negative test - minimum(-INF, X) != -INF because X could be NaN + +define <2 x double> @minimum_neginf_commute_vec(<2 x double> %x) { +; CHECK-LABEL: @minimum_neginf_commute_vec( +; CHECK-NEXT: [[R:%.*]] = call <2 x double> @llvm.minimum.v2f64(<2 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000>, <2 x double> [[X:%.*]]) +; CHECK-NEXT: ret <2 x double> [[R]] +; + %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000>, <2 x double> %x) + ret <2 x double> %r +} + +; TODO: minimum(INF, X) --> X + +define float @minimum_inf(float %x) { +; CHECK-LABEL: @minimum_inf( +; CHECK-NEXT: ret float [[X:%.*]] +; + %val = call float @llvm.minimum.f32(float 0x7FF0000000000000, float %x) + ret float %val +} + +; negative test - maximum(X, INF) != INF because X could be NaN + +define <2 x double> @maximum_inf(<2 x double> %x) { +; CHECK-LABEL: @maximum_inf( +; CHECK-NEXT: [[VAL:%.*]] = call <2 x double> @llvm.maximum.v2f64(<2 x double> [[X:%.*]], <2 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000>) +; CHECK-NEXT: ret <2 x double> [[VAL]] +; + %val = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000>) + ret <2 x double> %val +} + +; negative test - maximum(INF, X) != INF because X could be NaN + +define float @maximum_inf_commute(float %x) { +; CHECK-LABEL: @maximum_inf_commute( +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.maximum.f32(float 0x7FF0000000000000, float [[X:%.*]]) +; CHECK-NEXT: ret float [[VAL]] +; + %val = call float @llvm.maximum.f32(float 0x7FF0000000000000, float %x) + ret float %val +} diff --git a/llvm/test/Transforms/InstSimplify/known-non-zero.ll b/llvm/test/Transforms/InstSimplify/known-non-zero.ll index 524e51be76f54..2af4f27162061 100644 --- a/llvm/test/Transforms/InstSimplify/known-non-zero.ll +++ b/llvm/test/Transforms/InstSimplify/known-non-zero.ll @@ -145,3 +145,24 @@ for.body: ; preds = %for.cond %inc = add nuw nsw i32 %shift.0, 1 br label %for.cond } + +define i1 @freeze_nonzero(i8 %x, i8 %mask) { +; CHECK-LABEL: @freeze_nonzero( +; CHECK-NEXT: [[Y:%.*]] = or i8 [[X:%.*]], [[MASK:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[Y]], 0 +; CHECK-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; 
CHECK-NEXT: ret i1 false +; CHECK: B: +; CHECK-NEXT: ret i1 false +; + %y = or i8 %x, %mask + %c = icmp ne i8 %y, 0 + br i1 %c, label %A, label %B +A: + %fr = freeze i8 %y + %c2 = icmp eq i8 %fr, 0 + ret i1 %c2 +B: + ret i1 0 +} diff --git a/llvm/test/Transforms/JumpThreading/constant-fold-status.ll b/llvm/test/Transforms/JumpThreading/constant-fold-status.ll new file mode 100644 index 0000000000000..95cf8bab7a5ed --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/constant-fold-status.ll @@ -0,0 +1,28 @@ +; RUN: opt -jump-threading < %s -S -o - | FileCheck %s + +; Reproducer for PR47297. + +; The pass did previously not report a correct Modified status in the case +; where a terminator's condition was successfully constant folded, but there +; were no other transformations done. This was caught by the pass return +; status check that is hidden under EXPENSIVE_CHECKS. + +; CHECK-LABEL: entry: +; CHECK-NEXT: br i1 icmp eq (i32 ptrtoint (i16* @a to i32), i32 0), label %overflow, label %cont + +@a = internal global i16 0 + +define void @foo(i16 %d) { +entry: + %.not = icmp eq i16 zext (i1 icmp ne (i32 ptrtoint (i16* @a to i32), i32 0) to i16), 0 + br i1 %.not, label %overflow, label %cont + +overflow: ; preds = %entry + call void @bar() + br label %cont + +cont: ; preds = %overflow, %entry + ret void +} + +declare void @bar() diff --git a/llvm/test/Transforms/JumpThreading/select-unfold-freeze.ll b/llvm/test/Transforms/JumpThreading/select-unfold-freeze.ll new file mode 100644 index 0000000000000..12288fc272627 --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/select-unfold-freeze.ll @@ -0,0 +1,248 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -jump-threading-freeze-select-cond -jump-threading < %s | FileCheck %s + +declare void @foo() +declare void @bar() +declare void @baz() +declare void @quux() + + +define void @test_switch_cmp(i1 %cond, i32 %val, i8 %value) nounwind { +; CHECK-LABEL: @test_switch_cmp( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[L0:%.*]], label [[L0_THREAD:%.*]] +; CHECK: L0: +; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ [[VAL:%.*]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[VAL_PHI]], 0 +; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[CMP]] +; CHECK-NEXT: br i1 [[COND_FR]], label [[L1:%.*]], label [[TMP0:%.*]] +; CHECK: 0: +; CHECK-NEXT: [[TMP1:%.*]] = phi i8 [ [[VALUE:%.*]], [[L0]] ] +; CHECK-NEXT: switch i8 [[TMP1]], label [[L3:%.*]] [ +; CHECK-NEXT: i8 1, label [[L1]] +; CHECK-NEXT: i8 2, label [[L2:%.*]] +; CHECK-NEXT: ] +; CHECK: L1: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: ret void +; CHECK: L2: +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: ret void +; CHECK: L3: +; CHECK-NEXT: call void @baz() +; CHECK-NEXT: ret void +; CHECK: L0.thread: +; CHECK-NEXT: call void @quux() +; CHECK-NEXT: br label [[L1]] +; +entry: + br i1 %cond, label %L0, label %L4 +L0: + %val.phi = phi i32 [%val, %entry], [-1, %L4] + %cmp = icmp slt i32 %val.phi, 0 + %expr = select i1 %cmp, i8 1, i8 %value + switch i8 %expr, label %L3 [i8 1, label %L1 i8 2, label %L2] + +L1: + call void @foo() + ret void +L2: + call void @bar() + ret void +L3: + call void @baz() + ret void +L4: + call void @quux() + br label %L0 +} + +define i32 @unfold3(i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z, i32 %j) nounwind { +; CHECK-LABEL: @unfold3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[J:%.*]], 2 +; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[U:%.*]], [[V:%.*]] +; CHECK-NEXT: br i1 [[CMP_I]], label 
[[DOTEXIT_THREAD4:%.*]], label [[COND_FALSE_I:%.*]] +; CHECK: cond.false.i: +; CHECK-NEXT: [[CMP4_I:%.*]] = icmp sgt i32 [[U]], [[V]] +; CHECK-NEXT: br i1 [[CMP4_I]], label [[DOTEXIT_THREAD:%.*]], label [[COND_FALSE_6_I:%.*]] +; CHECK: cond.false.6.i: +; CHECK-NEXT: [[CMP8_I:%.*]] = icmp slt i32 [[W:%.*]], [[X:%.*]] +; CHECK-NEXT: br i1 [[CMP8_I]], label [[DOTEXIT_THREAD4]], label [[COND_FALSE_10_I:%.*]] +; CHECK: cond.false.10.i: +; CHECK-NEXT: [[CMP13_I:%.*]] = icmp sgt i32 [[W]], [[X]] +; CHECK-NEXT: br i1 [[CMP13_I]], label [[DOTEXIT_THREAD]], label [[DOTEXIT:%.*]] +; CHECK: .exit: +; CHECK-NEXT: [[PHITMP:%.*]] = icmp sge i32 [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[PHITMP]] +; CHECK-NEXT: br i1 [[COND_FR]], label [[DOTEXIT_THREAD]], label [[DOTEXIT_THREAD4]] +; CHECK: .exit.thread: +; CHECK-NEXT: br label [[DOTEXIT_THREAD4]] +; CHECK: .exit.thread4: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[J]], [[DOTEXIT_THREAD]] ], [ [[ADD3]], [[DOTEXIT]] ], [ [[ADD3]], [[ENTRY:%.*]] ], [ [[ADD3]], [[COND_FALSE_6_I]] ] +; CHECK-NEXT: ret i32 [[TMP0]] +; +entry: + %add3 = add nsw i32 %j, 2 + %cmp.i = icmp slt i32 %u, %v + br i1 %cmp.i, label %.exit, label %cond.false.i + +cond.false.i: ; preds = %entry + %cmp4.i = icmp sgt i32 %u, %v + br i1 %cmp4.i, label %.exit, label %cond.false.6.i + +cond.false.6.i: ; preds = %cond.false.i + %cmp8.i = icmp slt i32 %w, %x + br i1 %cmp8.i, label %.exit, label %cond.false.10.i + +cond.false.10.i: ; preds = %cond.false.6.i + %cmp13.i = icmp sgt i32 %w, %x + br i1 %cmp13.i, label %.exit, label %cond.false.15.i + +cond.false.15.i: ; preds = %cond.false.10.i + %phitmp = icmp sge i32 %y, %z + br label %.exit + +.exit: ; preds = %entry, %cond.false.i, %cond.false.6.i, %cond.false.10.i, %cond.false.15.i + %cond23.i = phi i1 [ false, %entry ], [ true, %cond.false.i ], [ false, %cond.false.6.i ], [ %phitmp, %cond.false.15.i ], [ true, %cond.false.10.i ] + %j.add3 = select i1 %cond23.i, i32 %j, i32 %add3 + ret i32 %j.add3 +} + +define i32 @unfold4(i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z, i32 %j) nounwind { +; CHECK-LABEL: @unfold4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[J:%.*]], 2 +; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[U:%.*]], [[V:%.*]] +; CHECK-NEXT: br i1 [[CMP_I]], label [[DOTEXIT_THREAD:%.*]], label [[COND_FALSE_I:%.*]] +; CHECK: cond.false.i: +; CHECK-NEXT: [[CMP4_I:%.*]] = icmp sgt i32 [[U]], [[V]] +; CHECK-NEXT: br i1 [[CMP4_I]], label [[DOTEXIT_THREAD5:%.*]], label [[COND_FALSE_6_I:%.*]] +; CHECK: cond.false.6.i: +; CHECK-NEXT: [[CMP8_I:%.*]] = icmp slt i32 [[W:%.*]], [[X:%.*]] +; CHECK-NEXT: br i1 [[CMP8_I]], label [[DOTEXIT_THREAD]], label [[COND_FALSE_10_I:%.*]] +; CHECK: cond.false.10.i: +; CHECK-NEXT: [[CMP13_I:%.*]] = icmp sgt i32 [[W]], [[X]] +; CHECK-NEXT: br i1 [[CMP13_I]], label [[DOTEXIT_THREAD5]], label [[DOTEXIT:%.*]] +; CHECK: .exit: +; CHECK-NEXT: [[CMP19_I:%.*]] = icmp sge i32 [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP19_I]] to i32 +; CHECK-NEXT: [[LNOT_I18:%.*]] = icmp eq i32 [[CONV]], 1 +; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[LNOT_I18]] +; CHECK-NEXT: br i1 [[COND_FR]], label [[DOTEXIT_THREAD]], label [[DOTEXIT_THREAD5]] +; CHECK: .exit.thread: +; CHECK-NEXT: br label [[DOTEXIT_THREAD5]] +; CHECK: .exit.thread5: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[J]], [[DOTEXIT_THREAD]] ], [ [[ADD3]], [[DOTEXIT]] ], [ [[ADD3]], [[COND_FALSE_I]] ], [ [[ADD3]], [[COND_FALSE_10_I]] ] +; CHECK-NEXT: ret i32 [[TMP0]] +; +entry: + %add3 = add nsw i32 %j, 2 + %cmp.i = icmp 
slt i32 %u, %v + br i1 %cmp.i, label %.exit, label %cond.false.i + +cond.false.i: ; preds = %entry + %cmp4.i = icmp sgt i32 %u, %v + br i1 %cmp4.i, label %.exit, label %cond.false.6.i + +cond.false.6.i: ; preds = %cond.false.i + %cmp8.i = icmp slt i32 %w, %x + br i1 %cmp8.i, label %.exit, label %cond.false.10.i + +cond.false.10.i: ; preds = %cond.false.6.i + %cmp13.i = icmp sgt i32 %w, %x + br i1 %cmp13.i, label %.exit, label %cond.false.15.i + +cond.false.15.i: ; preds = %cond.false.10.i + %cmp19.i = icmp sge i32 %y, %z + %conv = zext i1 %cmp19.i to i32 + br label %.exit + +.exit: ; preds = %entry, %cond.false.i, %cond.false.6.i, %cond.false.10.i, %cond.false.15.i + %cond23.i = phi i32 [ 1, %entry ], [ 0, %cond.false.i ], [ 1, %cond.false.6.i ], [ %conv, %cond.false.15.i ], [ 0, %cond.false.10.i ] + %lnot.i18 = icmp eq i32 %cond23.i, 1 + %j.add3 = select i1 %lnot.i18, i32 %j, i32 %add3 + ret i32 %j.add3 +} + +; TODO: cond23_i should be constant-folded. +define i32 @unfold5(i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z, i32 %j) nounwind { +; CHECK-LABEL: @unfold5( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[J:%.*]], 2 +; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[U:%.*]], [[V:%.*]] +; CHECK-NEXT: br i1 [[CMP_I]], label [[DOTEXIT_THREAD:%.*]], label [[COND_FALSE_I:%.*]] +; CHECK: cond.false.i: +; CHECK-NEXT: [[CMP4_I:%.*]] = icmp sgt i32 [[U]], [[V]] +; CHECK-NEXT: br i1 [[CMP4_I]], label [[DOTEXIT_THREAD]], label [[COND_FALSE_6_I:%.*]] +; CHECK: cond.false.6.i: +; CHECK-NEXT: [[CMP8_I:%.*]] = icmp slt i32 [[W:%.*]], [[X:%.*]] +; CHECK-NEXT: br i1 [[CMP8_I]], label [[DOTEXIT_THREAD]], label [[COND_FALSE_10_I:%.*]] +; CHECK: cond.false.10.i: +; CHECK-NEXT: [[CMP13_I:%.*]] = icmp sgt i32 [[W]], [[X]] +; CHECK-NEXT: br i1 [[CMP13_I]], label [[TMP0:%.*]], label [[COND_FALSE_15_I:%.*]] +; CHECK: cond.false.15.i: +; CHECK-NEXT: [[CMP19_I:%.*]] = icmp sge i32 [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP19_I]] to i32 +; CHECK-NEXT: br label [[DOTEXIT_THREAD]] +; CHECK: 0: +; CHECK-NEXT: [[COND23_I:%.*]] = phi i32 [ 7, [[COND_FALSE_10_I]] ] +; CHECK-NEXT: [[LNOT_I18:%.*]] = icmp sgt i32 [[COND23_I]], 5 +; CHECK-NEXT: br label [[DOTEXIT_THREAD]] +; CHECK: .exit.thread: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[J]], [[TMP0]] ], [ [[CONV]], [[COND_FALSE_15_I]] ], [ 1, [[COND_FALSE_6_I]] ], [ 3, [[COND_FALSE_I]] ], [ 2, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i32 [[TMP1]] +; +entry: + %add3 = add nsw i32 %j, 2 + %cmp.i = icmp slt i32 %u, %v + br i1 %cmp.i, label %.exit, label %cond.false.i + +cond.false.i: ; preds = %entry + %cmp4.i = icmp sgt i32 %u, %v + br i1 %cmp4.i, label %.exit, label %cond.false.6.i + +cond.false.6.i: ; preds = %cond.false.i + %cmp8.i = icmp slt i32 %w, %x + br i1 %cmp8.i, label %.exit, label %cond.false.10.i + +cond.false.10.i: ; preds = %cond.false.6.i + %cmp13.i = icmp sgt i32 %w, %x + br i1 %cmp13.i, label %.exit, label %cond.false.15.i + +cond.false.15.i: ; preds = %cond.false.10.i + %cmp19.i = icmp sge i32 %y, %z + %conv = zext i1 %cmp19.i to i32 + br label %.exit + +.exit: ; preds = %entry, %cond.false.i, %cond.false.6.i, %cond.false.10.i, %cond.false.15.i + %cond23.i = phi i32 [ 2, %entry ], [ 3, %cond.false.i ], [ 1, %cond.false.6.i ], [ %conv, %cond.false.15.i ], [ 7, %cond.false.10.i ] + %lnot.i18 = icmp sgt i32 %cond23.i, 5 + %j.add3 = select i1 %lnot.i18, i32 %j, i32 %cond23.i + ret i32 %j.add3 +} + +define i32 @TryToUnfoldSelectInCurrBB(i1 %b, i1 %ui, i32 %s, i1 %x) { +; CHECK-LABEL: @TryToUnfoldSelectInCurrBB( +; CHECK-NEXT: 
entry: +; CHECK-NEXT: br i1 [[B:%.*]], label [[IF_END_THREAD:%.*]], label [[IF_END:%.*]] +; CHECK: if.end: +; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[X:%.*]] +; CHECK-NEXT: br i1 [[COND_FR]], label [[TMP0:%.*]], label [[IF_END_THREAD]] +; CHECK: 0: +; CHECK-NEXT: br label [[IF_END_THREAD]] +; CHECK: if.end.thread: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[S:%.*]], [[TMP0]] ], [ 42, [[IF_END]] ], [ 42, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i32 [[TMP1]] +; +entry: + br i1 %b, label %if.end, label %if.else + +if.else: + br label %if.end + +if.end: + %v = phi i1 [ %x, %if.else ], [ false, %entry ] + %v1 = select i1 %v, i32 %s, i32 42 + ret i32 %v1 +} diff --git a/llvm/test/Transforms/LICM/AArch64/lit.local.cfg b/llvm/test/Transforms/LICM/AArch64/lit.local.cfg new file mode 100644 index 0000000000000..7184443994b69 --- /dev/null +++ b/llvm/test/Transforms/LICM/AArch64/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AArch64' in config.root.targets: + config.unsupported = True diff --git a/llvm/test/Transforms/LICM/AArch64/sve-load-hoist.ll b/llvm/test/Transforms/LICM/AArch64/sve-load-hoist.ll new file mode 100644 index 0000000000000..b0fcdb7d8dfcd --- /dev/null +++ b/llvm/test/Transforms/LICM/AArch64/sve-load-hoist.ll @@ -0,0 +1,30 @@ +; RUN: opt -licm -mtriple aarch64-linux-gnu -mattr=+sve -S < %s | FileCheck %s + +define void @no_hoist_load1_nxv2i64(<vscale x 2 x i64>* %out, i8* %in8, i32 %n) { +; CHECK-LABEL: @no_hoist_load1_nxv2i64( +; CHECK: entry: +; CHECK-NOT: load +; CHECK: for.body: +; CHECK: load +entry: + %cmp0 = icmp ugt i32 %n, 0 + %invst = call {}* @llvm.invariant.start.p0i8(i64 16, i8* %in8) + %in = bitcast i8* %in8 to <vscale x 2 x i64>* + br i1 %cmp0, label %for.body, label %for.end + +for.body: + %i = phi i32 [0, %entry], [%inc, %for.body] + %i2 = zext i32 %i to i64 + %ptr = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %out, i64 %i2 + %val = load <vscale x 2 x i64>, <vscale x 2 x i64>* %in, align 16 + store <vscale x 2 x i64> %val, <vscale x 2 x i64>* %ptr, align 16 + %inc = add nuw nsw i32 %i, 1 + %cmp = icmp ult i32 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: + ret void +} + +declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly + diff --git a/llvm/test/Transforms/LICM/Inputs/no-hoist-prof.prof b/llvm/test/Transforms/LICM/Inputs/no-hoist-prof.prof new file mode 100644 index 0000000000000..c1b2ee0873c00 --- /dev/null +++ b/llvm/test/Transforms/LICM/Inputs/no-hoist-prof.prof @@ -0,0 +1,7 @@ +_Z3fooii:200:1 + 0: 1 + 1: 1 _Z3bari:1 + 2: 200 + 3: 200 + 4: 0 + 5: 1 diff --git a/llvm/test/Transforms/LICM/hoisting.ll b/llvm/test/Transforms/LICM/hoisting.ll index 97609fa397e45..00ac0f5756dea 100644 --- a/llvm/test/Transforms/LICM/hoisting.ll +++ b/llvm/test/Transforms/LICM/hoisting.ll @@ -360,3 +360,36 @@ loop: loopexit: ret i32 %sum } + +; We can't hoist the invariant load out of the loop because +; the marker is given a variable size (-1). 
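+; (The first argument of llvm.invariant.start is the byte size of the +; invariant region; per the LangRef, -1 means the size is variable/unknown, +; so LICM cannot prove the load below stays within an invariant range.)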
+define i32 @test_fence5(i8* %addr, i32 %n, i8* %volatile) { +; CHECK-LABEL: @test_fence5 +; CHECK-LABEL: entry +; CHECK: invariant.start +; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8 +; CHECK: br label %loop +entry: + %gep = getelementptr inbounds i8, i8* %addr, i64 8 + %addr.i = bitcast i8* %gep to i32 * + store atomic i32 5, i32 * %addr.i unordered, align 8 + fence release + %invst = call {}* @llvm.invariant.start.p0i8(i64 -1, i8* %gep) + br label %loop + +loop: + %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ] + %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ] + %volload = load atomic i8, i8* %volatile unordered, align 8 + fence acquire + %volchk = icmp eq i8 %volload, 0 + %addrld = load atomic i32, i32* %addr.i unordered, align 8 + %sel = select i1 %volchk, i32 0, i32 %addrld + %sum.next = add i32 %sel, %sum + %indvar.next = add i32 %indvar, 1 + %cond = icmp slt i32 %indvar.next, %n + br i1 %cond, label %loop, label %loopexit + +loopexit: + ret i32 %sum +} diff --git a/llvm/test/Transforms/LICM/no-hoist-prof.ll b/llvm/test/Transforms/LICM/no-hoist-prof.ll new file mode 100644 index 0000000000000..1b18aa3c288e4 --- /dev/null +++ b/llvm/test/Transforms/LICM/no-hoist-prof.ll @@ -0,0 +1,88 @@ +; RUN: opt -enable-new-pm=1 -sample-profile -licm -S -sample-profile-file='%S/Inputs/no-hoist-prof.prof' < %s | FileCheck %s --check-prefix=CHECK-BFI-LICM +; RUN: opt -passes=licm -S < %s | FileCheck %s --check-prefix=CHECK-LICM + +; Original source code: +; +; int bar(int); +; int foo(int iter, int explode) { +; int base = bar(explode); +; for (int i = 0; i != iter; ++i) +; if (i == explode) +; iter = (base * base) + bar(iter); +; return iter; +; } + +; We need debug information in this .ll in order to leverage the pgo file, so: +; .ll generated by running `clang++ -O3 -g -S -emit-llvm`, then: +; - move hoisted mul back into cold section +; - give labels names +; - reindex variables +; - remove metadata calls, attributes, module header +; - remove unnecessary metadata + +; CHECK-LICM: .l.check.preheader:{{.*}} +; CHECK-LICM-NEXT: {{.*}} = mul {{.*}} +; CHECK-LICM-NEXT: br{{.*}} + +; CHECK-BFI-LICM: .l.cold:{{.*}} +; CHECK-BFI-LICM-NEXT: {{.*}} = mul {{.*}} + +define dso_local i32 @_Z3fooii(i32, i32) local_unnamed_addr #0 !dbg !7 { + %3 = tail call i32 @_Z3bari(i32 %1), !dbg !19 + %4 = icmp eq i32 %0, 0, !dbg !22 + br i1 %4, label %.l.ret, label %.l.check.preheader, !dbg !24 + +.l.check.preheader: + br label %.l.check, !dbg !24 + +.l.ret: + %5 = phi i32 [ 0, %2 ], [ %12, %.l.iterate ] + ret i32 %5, !dbg !25 + +.l.check: + %6 = phi i32 [ 0, %.l.check.preheader ], [ %13, %.l.iterate ] + %7 = phi i32 [ %0, %.l.check.preheader ], [ %12, %.l.iterate ] + %8 = icmp eq i32 %6, %1, !dbg !26 + br i1 %8, label %.l.cold, label %.l.iterate, !dbg !28 + +.l.cold: + %9 = mul nsw i32 %3, %3 + %10 = tail call i32 @_Z3bari(i32 %7), !dbg !29 + %11 = add nsw i32 %10, %9, !dbg !30 + br label %.l.iterate, !dbg !31 + +.l.iterate: + %12 = phi i32 [ %11, %.l.cold ], [ %7, %.l.check ] + %13 = add nuw nsw i32 %6, 1, !dbg !32 + %14 = icmp eq i32 %13, %12, !dbg !22 + br i1 %14, label %.l.ret, label %.l.check, !dbg !24, !llvm.loop !33 +} + +attributes #0 = { "use-sample-profile" } + +declare dso_local i32 @_Z3bari(i32) local_unnamed_addr #1 + +!llvm.module.flags = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.20181009 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, nameTableKind: None) +!1 = !DIFile(filename: 
"foo.cpp", directory: "/tmp/gather_pgo") +!4 = !{i32 2, !"Debug Info Version", i32 3} +!7 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooii", scope: !1, file: !1, line: 2, type: !8, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0) +!8 = !DISubroutineType(types: !9) +!9 = !{!10, !10, !10} +!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!16 = distinct !DILexicalBlock(scope: !7, file: !1, line: 4, column: 3) +!19 = !DILocation(line: 3, column: 14, scope: !7) +!22 = !DILocation(line: 4, column: 21, scope: !23) +!23 = distinct !DILexicalBlock(scope: !16, file: !1, line: 4, column: 3) +!24 = !DILocation(line: 4, column: 3, scope: !16) +!25 = !DILocation(line: 7, column: 3, scope: !7) +!26 = !DILocation(line: 5, column: 11, scope: !27) +!27 = distinct !DILexicalBlock(scope: !23, file: !1, line: 5, column: 9) +!28 = !DILocation(line: 5, column: 9, scope: !23) +!29 = !DILocation(line: 6, column: 30, scope: !27) +!30 = !DILocation(line: 6, column: 28, scope: !27) +!31 = !DILocation(line: 6, column: 7, scope: !27) +!32 = !DILocation(line: 4, column: 30, scope: !23) +!33 = distinct !{!33, !24, !34} +!34 = !DILocation(line: 6, column: 38, scope: !16) diff --git a/llvm/test/Transforms/LICM/sink.ll b/llvm/test/Transforms/LICM/sink.ll index 17170f5af1965..8a5da47847c86 100644 --- a/llvm/test/Transforms/LICM/sink.ll +++ b/llvm/test/Transforms/LICM/sink.ll @@ -1,8 +1,10 @@ -; RUN: opt -S -licm < %s | FileCheck %s --check-prefix=CHECK-LICM +; RUN: opt -S -licm -licm-coldness-threshold=0 < %s | FileCheck %s --check-prefix=CHECK-LICM +; RUN: opt -S -licm < %s | FileCheck %s --check-prefix=CHECK-BFI-LICM ; RUN: opt -S -licm < %s | opt -S -loop-sink | FileCheck %s --check-prefix=CHECK-SINK ; RUN: opt -S < %s -passes='require,loop(licm),loop-sink' \ ; RUN: | FileCheck %s --check-prefix=CHECK-SINK -; RUN: opt -S -licm -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s --check-prefix=CHECK-LICM +; RUN: opt -S -licm -licm-coldness-threshold=0 -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s --check-prefix=CHECK-LICM +; RUN: opt -S -licm -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s --check-prefix=CHECK-BFI-LICM ; Original source code: ; int g; @@ -29,6 +31,10 @@ define i32 @foo(i32, i32) #0 !prof !2 { ; CHECK-LICM: load i32, i32* @g ; CHECK-LICM: br label %.lr.ph +; CHECK-BFI-LICM: .lr.ph.preheader: +; CHECK-BFI-LICM-NOT: load i32, i32* @g +; CHECK-BFI-LICM: br label %.lr.ph + .lr.ph: %.03 = phi i32 [ %8, %.combine ], [ 0, %.lr.ph.preheader ] %.012 = phi i32 [ %.1, %.combine ], [ %1, %.lr.ph.preheader ] diff --git a/llvm/test/Transforms/LoopIdiom/memcpy-vectors.ll b/llvm/test/Transforms/LoopIdiom/memcpy-vectors.ll new file mode 100644 index 0000000000000..b4445c70cb57f --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/memcpy-vectors.ll @@ -0,0 +1,53 @@ +; RUN: opt -loop-idiom -S <%s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. 
+; WARN-NOT: warning + +define void @memcpy_fixed_vec(i64* noalias %a, i64* noalias %b) local_unnamed_addr #1 { +; CHECK-LABEL: @memcpy_fixed_vec( +; CHECK: entry: +; CHECK: memcpy +; CHECK: vector.body +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i64, i64* %a, i64 %index + %1 = bitcast i64* %0 to <2 x i64>* + %wide.load = load <2 x i64>, <2 x i64>* %1, align 8 + %2 = getelementptr inbounds i64, i64* %b, i64 %index + %3 = bitcast i64* %2 to <2 x i64>* + store <2 x i64> %wide.load, <2 x i64>* %3, align 8 + %index.next = add nuw nsw i64 %index, 2 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @memcpy_scalable_vec(i64* noalias %a, i64* noalias %b) local_unnamed_addr #1 { +; CHECK-LABEL: @memcpy_scalable_vec( +; CHECK: entry: +; CHECK-NOT: memcpy +; CHECK: vector.body +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = bitcast i64* %a to <vscale x 2 x i64>* + %1 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %0, i64 %index + %wide.load = load <vscale x 2 x i64>, <vscale x 2 x i64>* %1, align 16 + %2 = bitcast i64* %b to <vscale x 2 x i64>* + %3 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %2, i64 %index + store <vscale x 2 x i64> %wide.load, <vscale x 2 x i64>* %3, align 16 + %index.next = add nuw nsw i64 %index, 1 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} diff --git a/llvm/test/Transforms/LoopLoadElim/pr47457.ll b/llvm/test/Transforms/LoopLoadElim/pr47457.ll new file mode 100644 index 0000000000000..a58be5a8cf5e9 --- /dev/null +++ b/llvm/test/Transforms/LoopLoadElim/pr47457.ll @@ -0,0 +1,45 @@ +; RUN: opt -loop-load-elim -S %s | FileCheck %s +; RUN: opt -passes=loop-load-elim -S %s | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1-p2:32:8:8:32-ni:2" +target triple = "x86_64-unknown-linux-gnu" + +; Make sure it does not crash with assert. 
+define void @test() { +; CHECK-LABEL: test + +bb: + br label %bb1 + +bb1: ; preds = %bb6, %bb1, %bb + %tmp = phi i32 [ undef, %bb ], [ 0, %bb1 ], [ %tmp3, %bb6 ] + br i1 undef, label %bb1, label %bb2 + +bb2: ; preds = %bb1 + %tmp3 = add i32 %tmp, 1 + %tmp4 = icmp ult i32 %tmp, undef + br i1 %tmp4, label %bb6, label %bb5 + +bb5: ; preds = %bb2 + ret void + +bb6: ; preds = %bb2 + br i1 undef, label %bb7, label %bb1 + +bb7: ; preds = %bb7, %bb6 + %tmp8 = phi i32 [ %tmp15, %bb7 ], [ %tmp3, %bb6 ] + %tmp9 = phi i32 [ %tmp8, %bb7 ], [ %tmp, %bb6 ] + %tmp10 = zext i32 %tmp9 to i64 + %tmp11 = getelementptr inbounds float, float addrspace(1)* null, i64 %tmp10 + %tmp12 = load float, float addrspace(1)* %tmp11, align 4 + %tmp13 = zext i32 %tmp8 to i64 + %tmp14 = getelementptr inbounds float, float addrspace(1)* null, i64 %tmp13 + store float 1.000000e+00, float addrspace(1)* %tmp14, align 4 + %tmp15 = add nuw nsw i32 %tmp8, 1 + %tmp16 = icmp sgt i32 %tmp8, 78 + br i1 %tmp16, label %bb17, label %bb7 + +bb17: ; preds = %bb7 + unreachable +} diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/pr47329.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/pr47329.ll new file mode 100644 index 0000000000000..bd2d6b4b0b4ca --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/pr47329.ll @@ -0,0 +1,299 @@ +; RUN: opt < %s -loop-reduce +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +@d = internal unnamed_addr global i32** null, align 8 + +define dso_local i32 @main() local_unnamed_addr { +entry: + %.pre.pre = load i32**, i32*** @d, align 8 + br label %for.body9 + +for.body9: ; preds = %for.body9, %entry + %i = phi i32** [ %.pre.pre, %entry ], [ %incdec.ptr, %for.body9 ] + %incdec.ptr = getelementptr inbounds i32*, i32** %i, i64 -1 + br i1 undef, label %for.body9, label %for.inc + +for.inc: ; preds = %for.body9 + br label %for.body9.118 + +for.body9.1: ; preds = %for.inc.547, %for.body9.1 + %i1 = phi i32** [ %incdec.ptr.1, %for.body9.1 ], [ %incdec.ptr.542, %for.inc.547 ] + %incdec.ptr.1 = getelementptr inbounds i32*, i32** %i1, i64 -1 + br i1 undef, label %for.body9.1, label %for.inc.1 + +for.inc.1: ; preds = %for.body9.1 + br label %for.body9.1.1 + +for.body9.2: ; preds = %for.inc.1.5, %for.body9.2 + %i2 = phi i32** [ %incdec.ptr.2, %for.body9.2 ], [ %incdec.ptr.1.5, %for.inc.1.5 ] + %incdec.ptr.2 = getelementptr inbounds i32*, i32** %i2, i64 -1 + br i1 undef, label %for.body9.2, label %for.inc.2 + +for.inc.2: ; preds = %for.body9.2 + br label %for.body9.2.1 + +for.body9.3: ; preds = %for.inc.2.5, %for.body9.3 + %i3 = phi i32** [ %incdec.ptr.3, %for.body9.3 ], [ %incdec.ptr.2.5, %for.inc.2.5 ] + %incdec.ptr.3 = getelementptr inbounds i32*, i32** %i3, i64 -1 + br i1 undef, label %for.body9.3, label %for.inc.3 + +for.inc.3: ; preds = %for.body9.3 + br label %for.body9.3.1 + +for.body9.4: ; preds = %for.inc.3.5, %for.body9.4 + %i4 = phi i32** [ %incdec.ptr.4, %for.body9.4 ], [ %incdec.ptr.3.5, %for.inc.3.5 ] + %incdec.ptr.4 = getelementptr inbounds i32*, i32** %i4, i64 -1 + br i1 undef, label %for.body9.4, label %for.inc.4 + +for.inc.4: ; preds = %for.body9.4 + br label %for.body9.4.1 + +for.body9.5: ; preds = %for.inc.4.5, %for.body9.5 + %i5 = phi i32** [ %incdec.ptr.5, %for.body9.5 ], [ %incdec.ptr.4.5, %for.inc.4.5 ] + %incdec.ptr.5 = getelementptr inbounds i32*, i32** %i5, i64 -1 + br i1 undef, label %for.body9.5, label %for.inc.5 + +for.inc.5: ; preds = %for.body9.5 + br label %for.body9.5.1 + +for.body9.5.1: 
; preds = %for.body9.5.1, %for.inc.5 + %i6 = phi i32** [ %incdec.ptr.5.1, %for.body9.5.1 ], [ %incdec.ptr.5, %for.inc.5 ] + %incdec.ptr.5.1 = getelementptr inbounds i32*, i32** %i6, i64 -1 + br i1 undef, label %for.body9.5.1, label %for.inc.5.1 + +for.inc.5.1: ; preds = %for.body9.5.1 + br label %for.body9.5.2 + +for.body9.5.2: ; preds = %for.body9.5.2, %for.inc.5.1 + %i7 = phi i32** [ %incdec.ptr.5.2, %for.body9.5.2 ], [ %incdec.ptr.5.1, %for.inc.5.1 ] + %incdec.ptr.5.2 = getelementptr inbounds i32*, i32** %i7, i64 -1 + br i1 undef, label %for.body9.5.2, label %for.inc.5.2 + +for.inc.5.2: ; preds = %for.body9.5.2 + br label %for.body9.5.3 + +for.body9.5.3: ; preds = %for.body9.5.3, %for.inc.5.2 + %i8 = phi i32** [ %incdec.ptr.5.3, %for.body9.5.3 ], [ %incdec.ptr.5.2, %for.inc.5.2 ] + %incdec.ptr.5.3 = getelementptr inbounds i32*, i32** %i8, i64 -1 + br i1 undef, label %for.body9.5.3, label %for.inc.5.3 + +for.inc.5.3: ; preds = %for.body9.5.3 + br label %for.body9.5.4 + +for.body9.5.4: ; preds = %for.body9.5.4, %for.inc.5.3 + %i9 = phi i32** [ %incdec.ptr.5.4, %for.body9.5.4 ], [ %incdec.ptr.5.3, %for.inc.5.3 ] + %incdec.ptr.5.4 = getelementptr inbounds i32*, i32** %i9, i64 -1 + br i1 undef, label %for.body9.5.4, label %for.inc.5.4 + +for.inc.5.4: ; preds = %for.body9.5.4 + br label %for.body9.5.5 + +for.body9.5.5: ; preds = %for.body9.5.5, %for.inc.5.4 + %i10 = phi i32** [ undef, %for.body9.5.5 ], [ %incdec.ptr.5.4, %for.inc.5.4 ] + %i11 = bitcast i32** %i10 to i64* + %i12 = load i64, i64* %i11, align 8 + br label %for.body9.5.5 + +for.body9.4.1: ; preds = %for.body9.4.1, %for.inc.4 + %i13 = phi i32** [ %incdec.ptr.4.1, %for.body9.4.1 ], [ %incdec.ptr.4, %for.inc.4 ] + %incdec.ptr.4.1 = getelementptr inbounds i32*, i32** %i13, i64 -1 + br i1 undef, label %for.body9.4.1, label %for.inc.4.1 + +for.inc.4.1: ; preds = %for.body9.4.1 + br label %for.body9.4.2 + +for.body9.4.2: ; preds = %for.body9.4.2, %for.inc.4.1 + %i14 = phi i32** [ %incdec.ptr.4.2, %for.body9.4.2 ], [ %incdec.ptr.4.1, %for.inc.4.1 ] + %incdec.ptr.4.2 = getelementptr inbounds i32*, i32** %i14, i64 -1 + br i1 undef, label %for.body9.4.2, label %for.inc.4.2 + +for.inc.4.2: ; preds = %for.body9.4.2 + br label %for.body9.4.3 + +for.body9.4.3: ; preds = %for.body9.4.3, %for.inc.4.2 + %i15 = phi i32** [ %incdec.ptr.4.3, %for.body9.4.3 ], [ %incdec.ptr.4.2, %for.inc.4.2 ] + %incdec.ptr.4.3 = getelementptr inbounds i32*, i32** %i15, i64 -1 + br i1 undef, label %for.body9.4.3, label %for.inc.4.3 + +for.inc.4.3: ; preds = %for.body9.4.3 + br label %for.body9.4.4 + +for.body9.4.4: ; preds = %for.body9.4.4, %for.inc.4.3 + %i16 = phi i32** [ %incdec.ptr.4.4, %for.body9.4.4 ], [ %incdec.ptr.4.3, %for.inc.4.3 ] + %incdec.ptr.4.4 = getelementptr inbounds i32*, i32** %i16, i64 -1 + br i1 undef, label %for.body9.4.4, label %for.inc.4.4 + +for.inc.4.4: ; preds = %for.body9.4.4 + br label %for.body9.4.5 + +for.body9.4.5: ; preds = %for.body9.4.5, %for.inc.4.4 + %i17 = phi i32** [ %incdec.ptr.4.5, %for.body9.4.5 ], [ %incdec.ptr.4.4, %for.inc.4.4 ] + %incdec.ptr.4.5 = getelementptr inbounds i32*, i32** %i17, i64 -1 + br i1 undef, label %for.body9.4.5, label %for.inc.4.5 + +for.inc.4.5: ; preds = %for.body9.4.5 + br label %for.body9.5 + +for.body9.3.1: ; preds = %for.body9.3.1, %for.inc.3 + %i18 = phi i32** [ %incdec.ptr.3.1, %for.body9.3.1 ], [ %incdec.ptr.3, %for.inc.3 ] + %incdec.ptr.3.1 = getelementptr inbounds i32*, i32** %i18, i64 -1 + br i1 undef, label %for.body9.3.1, label %for.inc.3.1 + +for.inc.3.1: ; preds = %for.body9.3.1 + br 
label %for.body9.3.2 + +for.body9.3.2: ; preds = %for.body9.3.2, %for.inc.3.1 + %i19 = phi i32** [ %incdec.ptr.3.2, %for.body9.3.2 ], [ %incdec.ptr.3.1, %for.inc.3.1 ] + %incdec.ptr.3.2 = getelementptr inbounds i32*, i32** %i19, i64 -1 + br i1 undef, label %for.body9.3.2, label %for.inc.3.2 + +for.inc.3.2: ; preds = %for.body9.3.2 + br label %for.body9.3.3 + +for.body9.3.3: ; preds = %for.body9.3.3, %for.inc.3.2 + %i20 = phi i32** [ %incdec.ptr.3.3, %for.body9.3.3 ], [ %incdec.ptr.3.2, %for.inc.3.2 ] + %incdec.ptr.3.3 = getelementptr inbounds i32*, i32** %i20, i64 -1 + br i1 undef, label %for.body9.3.3, label %for.inc.3.3 + +for.inc.3.3: ; preds = %for.body9.3.3 + br label %for.body9.3.4 + +for.body9.3.4: ; preds = %for.body9.3.4, %for.inc.3.3 + %i21 = phi i32** [ %incdec.ptr.3.4, %for.body9.3.4 ], [ %incdec.ptr.3.3, %for.inc.3.3 ] + %incdec.ptr.3.4 = getelementptr inbounds i32*, i32** %i21, i64 -1 + br i1 undef, label %for.body9.3.4, label %for.inc.3.4 + +for.inc.3.4: ; preds = %for.body9.3.4 + br label %for.body9.3.5 + +for.body9.3.5: ; preds = %for.body9.3.5, %for.inc.3.4 + %i22 = phi i32** [ %incdec.ptr.3.5, %for.body9.3.5 ], [ %incdec.ptr.3.4, %for.inc.3.4 ] + %incdec.ptr.3.5 = getelementptr inbounds i32*, i32** %i22, i64 -1 + br i1 undef, label %for.body9.3.5, label %for.inc.3.5 + +for.inc.3.5: ; preds = %for.body9.3.5 + br label %for.body9.4 + +for.body9.2.1: ; preds = %for.body9.2.1, %for.inc.2 + %i23 = phi i32** [ %incdec.ptr.2.1, %for.body9.2.1 ], [ %incdec.ptr.2, %for.inc.2 ] + %incdec.ptr.2.1 = getelementptr inbounds i32*, i32** %i23, i64 -1 + br i1 undef, label %for.body9.2.1, label %for.inc.2.1 + +for.inc.2.1: ; preds = %for.body9.2.1 + br label %for.body9.2.2 + +for.body9.2.2: ; preds = %for.body9.2.2, %for.inc.2.1 + %i24 = phi i32** [ %incdec.ptr.2.2, %for.body9.2.2 ], [ %incdec.ptr.2.1, %for.inc.2.1 ] + %incdec.ptr.2.2 = getelementptr inbounds i32*, i32** %i24, i64 -1 + br i1 undef, label %for.body9.2.2, label %for.inc.2.2 + +for.inc.2.2: ; preds = %for.body9.2.2 + br label %for.body9.2.3 + +for.body9.2.3: ; preds = %for.body9.2.3, %for.inc.2.2 + %i25 = phi i32** [ %incdec.ptr.2.3, %for.body9.2.3 ], [ %incdec.ptr.2.2, %for.inc.2.2 ] + %incdec.ptr.2.3 = getelementptr inbounds i32*, i32** %i25, i64 -1 + br i1 undef, label %for.body9.2.3, label %for.inc.2.3 + +for.inc.2.3: ; preds = %for.body9.2.3 + br label %for.body9.2.4 + +for.body9.2.4: ; preds = %for.body9.2.4, %for.inc.2.3 + %i26 = phi i32** [ %incdec.ptr.2.4, %for.body9.2.4 ], [ %incdec.ptr.2.3, %for.inc.2.3 ] + %incdec.ptr.2.4 = getelementptr inbounds i32*, i32** %i26, i64 -1 + br i1 undef, label %for.body9.2.4, label %for.inc.2.4 + +for.inc.2.4: ; preds = %for.body9.2.4 + br label %for.body9.2.5 + +for.body9.2.5: ; preds = %for.body9.2.5, %for.inc.2.4 + %i27 = phi i32** [ %incdec.ptr.2.5, %for.body9.2.5 ], [ %incdec.ptr.2.4, %for.inc.2.4 ] + %incdec.ptr.2.5 = getelementptr inbounds i32*, i32** %i27, i64 -1 + br i1 undef, label %for.body9.2.5, label %for.inc.2.5 + +for.inc.2.5: ; preds = %for.body9.2.5 + br label %for.body9.3 + +for.body9.1.1: ; preds = %for.body9.1.1, %for.inc.1 + %i28 = phi i32** [ %incdec.ptr.1.1, %for.body9.1.1 ], [ %incdec.ptr.1, %for.inc.1 ] + %incdec.ptr.1.1 = getelementptr inbounds i32*, i32** %i28, i64 -1 + br i1 undef, label %for.body9.1.1, label %for.inc.1.1 + +for.inc.1.1: ; preds = %for.body9.1.1 + br label %for.body9.1.2 + +for.body9.1.2: ; preds = %for.body9.1.2, %for.inc.1.1 + %i29 = phi i32** [ %incdec.ptr.1.2, %for.body9.1.2 ], [ %incdec.ptr.1.1, %for.inc.1.1 ] + %incdec.ptr.1.2 = 
getelementptr inbounds i32*, i32** %i29, i64 -1 + br i1 undef, label %for.body9.1.2, label %for.inc.1.2 + +for.inc.1.2: ; preds = %for.body9.1.2 + br label %for.body9.1.3 + +for.body9.1.3: ; preds = %for.body9.1.3, %for.inc.1.2 + %i30 = phi i32** [ %incdec.ptr.1.3, %for.body9.1.3 ], [ %incdec.ptr.1.2, %for.inc.1.2 ] + %incdec.ptr.1.3 = getelementptr inbounds i32*, i32** %i30, i64 -1 + br i1 undef, label %for.body9.1.3, label %for.inc.1.3 + +for.inc.1.3: ; preds = %for.body9.1.3 + br label %for.body9.1.4 + +for.body9.1.4: ; preds = %for.body9.1.4, %for.inc.1.3 + %i31 = phi i32** [ %incdec.ptr.1.4, %for.body9.1.4 ], [ %incdec.ptr.1.3, %for.inc.1.3 ] + %incdec.ptr.1.4 = getelementptr inbounds i32*, i32** %i31, i64 -1 + br i1 undef, label %for.body9.1.4, label %for.inc.1.4 + +for.inc.1.4: ; preds = %for.body9.1.4 + br label %for.body9.1.5 + +for.body9.1.5: ; preds = %for.body9.1.5, %for.inc.1.4 + %i32 = phi i32** [ %incdec.ptr.1.5, %for.body9.1.5 ], [ %incdec.ptr.1.4, %for.inc.1.4 ] + %incdec.ptr.1.5 = getelementptr inbounds i32*, i32** %i32, i64 -1 + br i1 undef, label %for.body9.1.5, label %for.inc.1.5 + +for.inc.1.5: ; preds = %for.body9.1.5 + br label %for.body9.2 + +for.body9.118: ; preds = %for.body9.118, %for.inc + %i33 = phi i32** [ %incdec.ptr, %for.inc ], [ %incdec.ptr.114, %for.body9.118 ] + %incdec.ptr.114 = getelementptr inbounds i32*, i32** %i33, i64 -1 + br i1 undef, label %for.body9.118, label %for.inc.119 + +for.inc.119: ; preds = %for.body9.118 + br label %for.body9.225 + +for.body9.225: ; preds = %for.body9.225, %for.inc.119 + %i34 = phi i32** [ %incdec.ptr.114, %for.inc.119 ], [ %incdec.ptr.221, %for.body9.225 ] + %incdec.ptr.221 = getelementptr inbounds i32*, i32** %i34, i64 -1 + %i35 = bitcast i32** %i34 to i64* + %i36 = load i64, i64* %i35, align 8 + br i1 undef, label %for.body9.225, label %for.inc.226 + +for.inc.226: ; preds = %for.body9.225 + br label %for.body9.332 + +for.body9.332: ; preds = %for.body9.332, %for.inc.226 + %i37 = phi i32** [ %incdec.ptr.221, %for.inc.226 ], [ %incdec.ptr.328, %for.body9.332 ] + %incdec.ptr.328 = getelementptr inbounds i32*, i32** %i37, i64 -1 + br i1 undef, label %for.body9.332, label %for.inc.333 + +for.inc.333: ; preds = %for.body9.332 + br label %for.body9.439 + +for.body9.439: ; preds = %for.body9.439, %for.inc.333 + %i38 = phi i32** [ %incdec.ptr.328, %for.inc.333 ], [ %incdec.ptr.435, %for.body9.439 ] + %incdec.ptr.435 = getelementptr inbounds i32*, i32** %i38, i64 -1 + br i1 undef, label %for.body9.439, label %for.inc.440 + +for.inc.440: ; preds = %for.body9.439 + br label %for.body9.546 + +for.body9.546: ; preds = %for.body9.546, %for.inc.440 + %i39 = phi i32** [ %incdec.ptr.435, %for.inc.440 ], [ %incdec.ptr.542, %for.body9.546 ] + %incdec.ptr.542 = getelementptr inbounds i32*, i32** %i39, i64 -1 + br i1 undef, label %for.body9.546, label %for.inc.547 + +for.inc.547: ; preds = %for.body9.546 + br label %for.body9.1 +} diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll index 07ad549ebb9d8..af39bec33013e 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll @@ -18,7 +18,6 @@ define float @test1(float* nocapture readonly %arr, i64 %start, float %threshold) { ; CHECK-LABEL: test1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s2, #-7.00000000 ; CHECK-NEXT: cbz x1, .LBB0_4 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: add 
x8, x0, #28 // =28 @@ -32,7 +31,7 @@ define float @test1(float* nocapture readonly %arr, i64 %start, float %threshold ; CHECK-NEXT: add x1, x1, #1 // =1 ; CHECK-NEXT: cbnz x1, .LBB0_2 ; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: fmov s0, #-7.00000000 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB0_5: // %cleanup2 ; CHECK-NEXT: mov v0.16b, v1.16b @@ -64,23 +63,22 @@ cleanup2: ; preds = %for.cond, %for.body define float @test2(float* nocapture readonly %arr, i64 %start, float %threshold) { ; CHECK-LABEL: test2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s2, #-7.00000000 ; CHECK-NEXT: cbz x1, .LBB1_4 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: add x8, x0, #28 // =28 ; CHECK-NEXT: .LBB1_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr s1, [x8, x1, lsl #2] -; CHECK-NEXT: scvtf s3, x1 -; CHECK-NEXT: fadd s3, s3, s0 -; CHECK-NEXT: fcmp s1, s3 +; CHECK-NEXT: scvtf s2, x1 +; CHECK-NEXT: fadd s2, s2, s0 +; CHECK-NEXT: fcmp s1, s2 ; CHECK-NEXT: b.gt .LBB1_5 ; CHECK-NEXT: // %bb.3: // %for.cond ; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: add x1, x1, #1 // =1 ; CHECK-NEXT: cbnz x1, .LBB1_2 ; CHECK-NEXT: .LBB1_4: -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: fmov s0, #-7.00000000 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB1_5: // %cleanup4 ; CHECK-NEXT: mov v0.16b, v1.16b diff --git a/llvm/test/Transforms/LoopUnroll/FullUnroll.ll b/llvm/test/Transforms/LoopUnroll/FullUnroll.ll index 2d78714eae755..cc7950148ee33 100644 --- a/llvm/test/Transforms/LoopUnroll/FullUnroll.ll +++ b/llvm/test/Transforms/LoopUnroll/FullUnroll.ll @@ -1,4 +1,4 @@ -; RUN: opt -passes='loop-unroll-full' -disable-verify --mtriple x86_64-pc-linux-gnu -S -o - %s | FileCheck %s +; RUN: opt -passes='loop-unroll-full' -disable-verify -disable-loop-unrolling=true --mtriple x86_64-pc-linux-gnu -S -o - %s | FileCheck %s ; This checks that the loop full unroller will fire in the new pass manager ; when forced via #pragma in the source (or annotation in the code). 
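; NOTE (editorial sketch, not part of the test file): the "#pragma in the source"
; mentioned above is the clang loop pragma, e.g. in C:
;
;   #pragma clang loop unroll(full)
;   for (int i = 0; i < 4; ++i)
;     sum += a[i];
;
; Clang lowers this to !llvm.loop metadata on the loop latch branch, roughly:
;
;   br i1 %exitcond, label %exit, label %loop, !llvm.loop !0
;   !0 = distinct !{!0, !1}
;   !1 = !{!"llvm.loop.unroll.full"}
;
; It is this llvm.loop.unroll.full hint that lets the full unroller fire even
; with -disable-loop-unrolling=true now added to the RUN line above.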
@@ -39,7 +39,7 @@ bb24: ; preds = %bb3.loopexit ret void } -attributes #0 = { noinline nounwind optnone uwtable } +attributes #0 = { nounwind uwtable } !llvm.module.flags = !{!0} diff --git a/llvm/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll b/llvm/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll index b607221a052d3..a83632af7b4cd 100644 --- a/llvm/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll +++ b/llvm/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll @@ -1,5 +1,5 @@ -; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-runtime -unroll-partial-threshold=60 < %s -S | FileCheck %s -; RUN: opt -aa-pipeline=tbaa,basic-aa -passes='loop-unroll-and-jam' -allow-unroll-and-jam -unroll-runtime -unroll-partial-threshold=60 < %s -S | FileCheck %s +; RUN: opt -loop-unroll-and-jam -unroll-runtime -unroll-partial-threshold=60 < %s -S | FileCheck %s +; RUN: opt -aa-pipeline=tbaa,basic-aa -passes='loop-unroll-and-jam' -unroll-runtime -unroll-partial-threshold=60 < %s -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll index a315c7c7ca692..34a1c83721d4c 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll @@ -18,7 +18,7 @@ define i32 @mla_i32(i8* noalias nocapture readonly %A, i8* noalias nocapture rea ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 @@ -31,17 +31,17 @@ define i32 @mla_i32(i8* noalias nocapture readonly %A, i8* noalias nocapture rea ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 ; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i8> [[WIDE_LOAD1]] to <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP8]], [[TMP4]] -; CHECK-NEXT: [[TMP10]] = add <4 x i32> [[TMP9]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) +; CHECK-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP12]], 
[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -58,7 +58,7 @@ define i32 @mla_i32(i8* noalias nocapture readonly %A, i8* noalias nocapture rea ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !2 ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -102,7 +102,7 @@ define i32 @mla_i8(i8* noalias nocapture readonly %A, i8* noalias nocapture read ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 @@ -115,17 +115,17 @@ define i32 @mla_i8(i8* noalias nocapture readonly %A, i8* noalias nocapture read ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1 ; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP4]] -; CHECK-NEXT: [[TMP10]] = add <16 x i32> [[TMP9]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> [[TMP9]]) +; CHECK-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> [[TMP10]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -142,7 +142,7 @@ define i32 @mla_i8(i8* noalias nocapture readonly %A, i8* noalias nocapture read ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 
[[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !5 ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -186,23 +186,23 @@ define i32 @add_i32(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -214,7 +214,7 @@ define i32 @add_i32(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !7 ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll 
b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll index 0d4cc31677b80..614d055730d88 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -loop-vectorize < %s -S -o - | FileCheck %s +; RUN: opt -loop-vectorize -instcombine -simplifycfg -tail-predication=enabled < %s -S -o - | FileCheck %s target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main-arm-none-eabi" @@ -8,23 +8,18 @@ define i64 @add_i64_i64(i64* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i64_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[X:%.*]], i32 [[I_08]] ; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[ADD]] = add nsw i64 [[TMP0]], [[R_07]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]] -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ] -; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] ; entry: @@ -51,24 +46,19 @@ define i64 @add_i32_i64(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i32_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[I_08]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_07]], [[CONV]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq 
i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]] -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ] -; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] ; entry: @@ -96,24 +86,19 @@ define i64 @add_i16_i64(i16* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i16_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[I_08]] ; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 ; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i64 ; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_07]], [[CONV]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]] -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ] -; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] ; entry: @@ -141,24 +126,19 @@ define i64 @add_i8_i64(i8* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i8_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[I_08]] ; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i64 ; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[R_07]], [[CONV]] ; CHECK-NEXT: [[INC]] 
= add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]] -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ] -; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] ; entry: @@ -185,48 +165,28 @@ define i32 @add_i32_i32(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i32_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[VEC_PHI]], [[TMP2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], 
label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP7]], [[R_07]] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !2 -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[R_0_LCSSA]] ; entry: @@ -253,50 +213,29 @@ define i32 @add_i16_i32(i16* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i16_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP2]] to <4 x i16>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP3]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32> -; CHECK-NEXT: [[TMP5]] = add <4 x i32> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>* +; CHECK-NEXT: 
[[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP1]], i32 2, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i16> undef) +; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[VEC_PHI]], [[TMP3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP2:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[I_08]] -; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP8]] to i32 -; CHECK-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !5 -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[R_0_LCSSA]] ; entry: @@ -324,50 +263,29 @@ define i32 @add_i8_i32(i8* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i8_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], 
[ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32> -; CHECK-NEXT: [[TMP5]] = add <4 x i32> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP1]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> undef) +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[VEC_PHI]], [[TMP3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP3:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X]], i32 [[I_08]] -; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP8]] to i32 -; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[R_07]], [[CONV]] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !7 -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: 
[[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[R_0_LCSSA]] ; entry: @@ -394,48 +312,28 @@ define signext i16 @add_i16_i16(i16* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i16_i16( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 8 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP2]] to <8 x i16>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2 -; CHECK-NEXT: [[TMP4]] = add <8 x i16> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> undef) +; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[WIDE_MASKED_LOAD]], <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP3]] = add <8 x i16> [[VEC_PHI]], [[TMP2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP4]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[R_09:%.*]] = phi i16 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], 
[[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[I_010]] -; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -; CHECK-NEXT: [[ADD]] = add i16 [[TMP7]], [[R_09]] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !9 -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i16 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP5:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP3]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i16 [[R_0_LCSSA]] ; entry: @@ -462,50 +360,29 @@ define signext i16 @add_i8_i16(i8* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i8_i16( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 8 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i16> -; CHECK-NEXT: [[TMP5]] = add <8 x i16> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP1]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> undef) +; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[TMP2]], <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP4]] = add <8 x i16> [[VEC_PHI]], [[TMP3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP5]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[R_09:%.*]] = phi i16 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X]], i32 [[I_010]] -; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP8]] to i16 -; CHECK-NEXT: [[ADD]] = add i16 [[R_09]], [[CONV]] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !11 -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i16 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP4]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i16 [[R_0_LCSSA]] ; entry: @@ -532,48 +409,28 @@ define zeroext i8 @add_i8_i8(i8* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i8_i8( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[CMP7]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 16 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 1 -; CHECK-NEXT: [[TMP4]] = add <16 x i8> 
[[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> undef) +; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> [[WIDE_MASKED_LOAD]], <16 x i8> zeroinitializer +; CHECK-NEXT: [[TMP3]] = add <16 x i8> [[VEC_PHI]], [[TMP2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> [[TMP4]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[R_08:%.*]] = phi i8 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X]], i32 [[I_09]] -; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ADD]] = add i8 [[TMP7]], [[R_08]] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_09]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !13 -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i8 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> [[TMP3]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i8 [[R_0_LCSSA]] ; entry: @@ -599,12 +456,10 @@ define i64 @mla_i64_i64(i64* nocapture readonly %x, i64* nocapture readonly %y, ; CHECK-LABEL: @mla_i64_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[R_09:%.*]] = phi i64 [ 
[[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[R_09:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[X:%.*]], i32 [[I_010]] ; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[Y:%.*]], i32 [[I_010]] @@ -613,12 +468,9 @@ define i64 @mla_i64_i64(i64* nocapture readonly %x, i64* nocapture readonly %y, ; CHECK-NEXT: [[ADD]] = add nsw i64 [[MUL]], [[R_09]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]] -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ] -; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] ; entry: @@ -647,12 +499,10 @@ define i64 @mla_i32_i64(i32* nocapture readonly %x, i32* nocapture readonly %y, ; CHECK-LABEL: @mla_i32_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[R_09:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[R_09:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[I_010]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i32 [[I_010]] @@ -662,12 +512,9 @@ define i64 @mla_i32_i64(i32* nocapture readonly %x, i32* nocapture readonly %y, ; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_09]], [[CONV]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]] -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ] -; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] ; entry: @@ -697,12 +544,10 @@ define i64 @mla_i16_i64(i16* nocapture readonly %x, i16* nocapture readonly %y, ; CHECK-LABEL: @mla_i16_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br 
i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[I_012]] ; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 ; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 @@ -714,12 +559,9 @@ define i64 @mla_i16_i64(i16* nocapture readonly %x, i16* nocapture readonly %y, ; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_011]], [[CONV3]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_012]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]] -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ] -; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] ; entry: @@ -751,12 +593,10 @@ define i64 @mla_i8_i64(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 ; CHECK-LABEL: @mla_i8_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[I_012]] ; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32 @@ -768,12 +608,9 @@ define i64 @mla_i8_i64(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 ; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[R_011]], [[CONV3]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_012]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]] -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ] -; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: 
[[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] ; entry: @@ -805,56 +642,32 @@ define i32 @mla_i32_i32(i32* nocapture readonly %x, i32* nocapture readonly %y, ; CHECK-LABEL: @mla_i32_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP8]] = add <4 x i32> [[TMP7]], [[VEC_PHI]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6]] = add <4 x i32> [[VEC_PHI]], [[TMP5]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !14 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP7:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP10:%.*]] = call 
i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[R_09:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_010]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[Y]], i32 [[I_010]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], [[TMP11]] -; CHECK-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[R_09]] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !15 -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP6]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[R_0_LCSSA]] ; entry: @@ -883,60 +696,34 @@ define i32 @mla_i16_i32(i16* nocapture readonly %x, i16* nocapture readonly %y, ; CHECK-LABEL: @mla_i16_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP2]] to <4 x i16>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP3]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* 
[[Y:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP7]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i16> [[WIDE_LOAD1]] to <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP8]], [[TMP4]] -; CHECK-NEXT: [[TMP10]] = add <4 x i32> [[TMP9]], [[VEC_PHI]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP1]], i32 2, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i16> undef) +; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <4 x i16>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP4]], i32 2, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i16> undef) +; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD1]] to <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8]] = add <4 x i32> [[VEC_PHI]], [[TMP7]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !16 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[R_010:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[I_011]] -; CHECK-NEXT: [[TMP13:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[Y]], i32 [[I_011]] -; CHECK-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 -; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP14]] to i32 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]] -; CHECK-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[R_010]] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_011]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp 
eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !17 -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[R_0_LCSSA]] ; entry: @@ -967,60 +754,34 @@ define i32 @mla_i8_i32(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 ; CHECK-LABEL: @mla_i8_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[WIDE_LOAD1]] to <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw <4 x i32> [[TMP8]], [[TMP4]] -; CHECK-NEXT: [[TMP10]] = add <4 x i32> [[TMP9]], [[VEC_PHI]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP1]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> undef) +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 
x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP4]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> undef) +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD1]] to <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <4 x i32> [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8]] = add <4 x i32> [[VEC_PHI]], [[TMP7]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !18 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP9:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[R_010:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X]], i32 [[I_011]] -; CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP13]] to i32 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[Y]], i32 [[I_011]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP14]] to i32 -; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[CONV2]], [[CONV]] -; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[MUL]], [[R_010]] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_011]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !19 -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[R_0_LCSSA]] ; entry: @@ -1051,56 +812,32 @@ define signext i16 @mla_i16_i16(i16* nocapture readonly %x, i16* nocapture reado ; CHECK-LABEL: @mla_i16_i16( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[CMP11]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: 
vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 8 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i32 0 +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> undef) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP2]] to <8 x i16>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[TMP5]] to <8 x i16>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP6]], align 2 -; CHECK-NEXT: [[TMP7:%.*]] = mul <8 x i16> [[WIDE_LOAD1]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP8]] = add <8 x i16> [[TMP7]], [[VEC_PHI]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP3]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> undef) +; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i16> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[TMP4]], <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP6]] = add <8 x i16> [[VEC_PHI]], [[TMP5]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !20 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP10:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP8]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_013:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[R_012:%.*]] = phi i16 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 
[[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[I_013]] -; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[Y]], i32 [[I_013]] -; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 -; CHECK-NEXT: [[MUL:%.*]] = mul i16 [[TMP12]], [[TMP11]] -; CHECK-NEXT: [[ADD]] = add i16 [[MUL]], [[R_012]] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_013]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !21 -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i16 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP6]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i16 [[R_0_LCSSA]] ; entry: @@ -1129,60 +866,34 @@ define signext i16 @mla_i8_i16(i8* nocapture readonly %x, i8* nocapture readonly ; CHECK-LABEL: @mla_i8_i16( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[CMP11]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 8 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD1]] to <8 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = mul nuw <8 x i16> [[TMP8]], [[TMP4]] -; CHECK-NEXT: [[TMP10]] = add <8 x i16> [[TMP9]], [[VEC_PHI]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> 
@llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP1]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> undef) +; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP4]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> undef) +; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD1]] to <8 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = mul nuw <8 x i16> [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[TMP6]], <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP8]] = add <8 x i16> [[VEC_PHI]], [[TMP7]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !22 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP11:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP12:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP10]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_013:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[R_012:%.*]] = phi i16 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X]], i32 [[I_013]] -; CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP13]] to i16 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[Y]], i32 [[I_013]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP14]] to i16 -; CHECK-NEXT: [[MUL:%.*]] = mul nuw i16 [[CONV2]], [[CONV]] -; CHECK-NEXT: [[ADD]] = add i16 [[MUL]], [[R_012]] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_013]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !23 -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i16 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP10:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP8]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i16 [[R_0_LCSSA]] ; entry: @@ -1213,56 +924,32 @@ define zeroext i8 
@mla_i8_i8(i8* nocapture readonly %x, i8* nocapture readonly % ; CHECK-LABEL: @mla_i8_i8( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[CMP10]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 16 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> undef) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = mul <16 x i8> [[WIDE_LOAD1]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP8]] = add <16 x i8> [[TMP7]], [[VEC_PHI]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> undef) +; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> [[TMP4]], <16 x i8> zeroinitializer +; CHECK-NEXT: [[TMP6]] = add <16 x i8> [[VEC_PHI]], [[TMP5]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !24 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP10:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> [[TMP8]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label 
[[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[R_011:%.*]] = phi i8 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X]], i32 [[I_012]] -; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[Y]], i32 [[I_012]] -; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[TMP12]], [[TMP11]] -; CHECK-NEXT: [[ADD]] = add i8 [[MUL]], [[R_011]] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_012]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !25 -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i8 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP8:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> [[TMP6]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i8 [[R_0_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll new file mode 100644 index 0000000000000..cdcb81ec2dc28 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll @@ -0,0 +1,84 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -tail-predication=enabled -loop-vectorize -instcombine -simplifycfg %s -S -o - | FileCheck %s + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + +define arm_aapcs_vfpcc i32 @minmaxval4(i32* nocapture readonly %x, i32* nocapture %minp, i32 %N) { +; CHECK-LABEL: @minmaxval4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP26_NOT:%.*]] = icmp eq i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP26_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]],
align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI1]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 2147483647, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ -2147483648, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[MAX_0_LCSSA:%.*]] = phi i32 [ -2147483648, [[ENTRY:%.*]] ], [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[MIN_0_LCSSA:%.*]] = phi i32 [ 2147483647, [[ENTRY]] ], [ [[COND9:%.*]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: store i32 [[MIN_0_LCSSA]], i32* [[MINP:%.*]], align 4 +; CHECK-NEXT: ret i32 [[MAX_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[I_029:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[MIN_028:%.*]] = phi i32 [ [[COND9]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[MAX_027:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_029]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP9]], [[MAX_027]] +; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP9]], i32 [[MAX_027]] +; CHECK-NEXT: [[CMP4:%.*]] = icmp slt i32 [[TMP9]], [[MIN_028]] +; CHECK-NEXT: [[COND9]] = select i1 [[CMP4]], i32 [[TMP9]], i32 [[MIN_028]] +; CHECK-NEXT: [[INC]] = add nuw i32 [[I_029]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]] +; +entry: + %cmp26.not = icmp eq i32 %N, 0 + br i1 %cmp26.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %max.0.lcssa = phi i32 [ -2147483648, %entry ], [ %cond, %for.body ] + %min.0.lcssa = phi i32 [ 2147483647, %entry ], [ %cond9, %for.body ] + store i32 %min.0.lcssa, i32* %minp, align 4 + ret i32 %max.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.029 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %min.028 = phi i32 [ %cond9, %for.body ], [ 2147483647, %entry ] + %max.027 = phi i32 [ %cond, %for.body ], [ -2147483648, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.029 + %0 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp sgt i32 %0, %max.027 + %cond = select i1 %cmp1, i32 %0, i32 
%max.027 + %cmp4 = icmp slt i32 %0, %min.028 + %cond9 = select i1 %cmp4, i32 %0, i32 %min.028 + %inc = add nuw i32 %i.029, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll index baedc0a23daa2..95b22eb9660ad 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll @@ -1,13 +1,64 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \ ; RUN: -tail-predication=enabled -loop-vectorize -S < %s | \ ; RUN: FileCheck %s define void @trunc_not_allowed_different_vec_elemns(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i16* noalias nocapture %D) #0 { -; CHECK-LABEL: trunc_not_allowed_different_vec_elemns( +; CHECK-LABEL: @trunc_not_allowed_different_vec_elemns( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NOT: llvm.masked.load -; CHECK-NOT: llvm.masked.store -; CHECK: br i1 %{{.*}}, label %{{.*}}, label %vector.body +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i32> [[TMP7]] to <4 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = shl <4 x i16> [[TMP11]], <i16 1, i16 1, i16 1, i16 1> +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, i16* [[D:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[TMP13]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16* [[TMP14]] to <4 x i16>* +; CHECK-NEXT: store <4 x i16> [[TMP12]], <4 x i16>* [[TMP15]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428 +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 431, 428 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 428, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +;
CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_021:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD9:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_021]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[I_021]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_021]] +; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD_TR:%.*]] = trunc i32 [[ADD]] to i16 +; CHECK-NEXT: [[CONV7:%.*]] = shl i16 [[ADD_TR]], 1 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i16, i16* [[D]], i32 [[I_021]] +; CHECK-NEXT: store i16 [[CONV7]], i16* [[ARRAYIDX8]], align 2 +; CHECK-NEXT: [[ADD9]] = add nuw nsw i32 [[I_021]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ADD9]], 431 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]] +; entry: br label %for.body @@ -33,11 +84,24 @@ for.body: } define void @unsupported_i64_type(i64* noalias nocapture %A, i64* noalias nocapture readonly %B, i64* noalias nocapture readonly %C) #0 { -; CHECK-LABEL: unsupported_i64_type( -; CHECK-NOT: vector.body: -; CHECK-NOT: llvm.masked.load -; CHECK-NOT: llvm.masked.store +; CHECK-LABEL: @unsupported_i64_type( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void ; CHECK: for.body: +; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i32 [[I_09]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[C:%.*]], i32 [[I_09]] +; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i32 [[I_09]] +; CHECK-NEXT: store i64 [[ADD]], i64* [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[ADD3]] = add nuw nsw i32 [[I_09]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ADD3]], 431 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; entry: br label %for.body @@ -59,11 +123,53 @@ for.body: } define void @narrowing_load_not_allowed(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, i16* noalias nocapture readonly %C) #0 { -; CHECK-LABEL: narrowing_load_not_allowed( +; CHECK-LABEL: @narrowing_load_not_allowed( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NOT: llvm.masked.load -; CHECK-NOT: llvm.masked.store -; CHECK: br i1 %{{.*}}, label %{{.*}}, label %vector.body +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[C:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP2]] to <8 x i16>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x 
i16>* [[TMP3]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP6]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = trunc <8 x i16> [[WIDE_LOAD]] to <8 x i8> +; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i8> [[WIDE_LOAD1]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>* +; CHECK-NEXT: store <8 x i8> [[TMP8]], <8 x i8>* [[TMP11]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 424 +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 431, 424 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 424, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[C]], i32 [[I_012]] +; CHECK-NEXT: [[TMP13:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[I_012]] +; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CONV3:%.*]] = trunc i16 [[TMP13]] to i8 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP14]], [[CONV3]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[I_012]] +; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[ADD6]] = add nuw nsw i32 [[I_012]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ADD6]], 431 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP5:!llvm.loop !.*]] +; entry: br label %for.body @@ -91,11 +197,54 @@ for.body: ; preds = %for.body, %entry ; we could allow this case. 
; define void @trunc_not_allowed(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 { -; CHECK-LABEL: trunc_not_allowed( -; CHECK: vector.body: -; CHECK-NOT: llvm.masked.load -; CHECK-NOT: llvm.masked.store -; CHECK: br i1 %{{.*}}, label %{{.*}}, label %vector.body +; CHECK-LABEL: @trunc_not_allowed( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = add nuw nsw i32 [[TMP0]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i16 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 431, 428 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 428, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_09]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[I_09]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_09]] +; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD3]] = add nuw nsw i32 [[I_09]], 1 +; CHECK-NEXT: [[ADD_IV:%.*]] = trunc i32 [[ADD3]] to i16 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i16 [[ADD_IV]], 431 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]] +; entry: br label %for.body @@ -123,11 +272,67 @@ for.body: ; force vectorisation with a loop hint. 
; define void @strides_different_direction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) #0 { -; CHECK-LABEL: strides_different_direction( +; CHECK-LABEL: @strides_different_direction( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK: vector.scevcheck: +; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 430) +; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N:%.*]], [[MUL_RESULT]] +; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[N]], [[MUL_RESULT]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[TMP1]], [[N]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP0]], [[N]] +; CHECK-NEXT: [[TMP4:%.*]] = select i1 true, i1 [[TMP2]], i1 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW]] +; CHECK-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]] +; CHECK-NEXT: br i1 [[TMP6]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NOT: llvm.masked.load -; CHECK-NOT: llvm.masked.store -; CHECK: br i1 %{{.*}}, label %{{.*}}, label %vector.body +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = sub nsw i32 [[N]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 -3 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD1]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP16:%.*]] = add nsw <4 x i32> [[REVERSE]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP19]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428 +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 431, 428 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 428, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32,
i32* [[B]], i32 [[I_09]] +; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[N]], [[I_09]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[SUB]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_09]] +; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD3]] = add nuw nsw i32 [[I_09]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ADD3]], 431 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP9:!llvm.loop !.*]] +; entry: br label %for.body @@ -150,11 +355,53 @@ for.body: } define void @too_many_loop_blocks(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 { -; CHECK-LABEL: too_many_loop_blocks( +; CHECK-LABEL: @too_many_loop_blocks( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NOT: llvm.masked.load -; CHECK-NOT: llvm.masked.store -; CHECK: br i1 %{{.*}}, label %{{.*}}, label %vector.body +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428 +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 431, 428 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 428, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[LOOPINCR:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_09]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[I_09]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add 
nsw i32 [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_09]] +; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: br label [[LOOPINCR]] +; CHECK: loopincr: +; CHECK-NEXT: [[ADD3]] = add nuw nsw i32 [[I_09]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ADD3]], 431 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP11:!llvm.loop !.*]] +; entry: br label %for.body @@ -179,9 +426,24 @@ loopincr: } define void @double(double* noalias nocapture %A, double* noalias nocapture readonly %B, double* noalias nocapture readonly %C) #0 { -; CHECK-LABEL: double( +; CHECK-LABEL: @double( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void ; CHECK: for.body: -; CHECK-NOT: vector.body: +; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i32 [[I_09]] +; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i32 [[I_09]] +; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i32 [[I_09]] +; CHECK-NEXT: store double [[ADD]], double* [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[ADD3]] = add nuw nsw i32 [[I_09]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ADD3]], 431 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; entry: br label %for.body @@ -203,11 +465,28 @@ for.body: } define void @fptrunc_not_allowed(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C, half* noalias nocapture %D) #0 { -; CHECK-LABEL: fptrunc_not_allowed( -; CHECK-NOT: vector.body: -; CHECK-NOT: llvm.masked.load -; CHECK-NOT: llvm.masked.store -; CHECK: br i1 %{{.*}}, label %{{.*}}, label %for.body +; CHECK-LABEL: @fptrunc_not_allowed( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_017:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i32 [[I_017]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 [[I_017]] +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 [[I_017]] +; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[CONV:%.*]] = fptrunc float [[ADD]] to half +; CHECK-NEXT: [[FACTOR:%.*]] = fmul fast half [[CONV]], 0xH4000 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, half* [[D:%.*]], i32 [[I_017]] +; CHECK-NEXT: store half [[FACTOR]], half* [[ARRAYIDX5]], align 2 +; CHECK-NEXT: [[ADD6]] = add nuw nsw i32 [[I_017]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ADD6]], 431 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; entry: br label %for.body @@ -238,6 +517,30 @@ for.body: ; to be reverted which is 
expensive and what we would like to avoid. ; define dso_local void @select_not_allowed(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N, i32* noalias nocapture readonly %Cond) { +; CHECK-LABEL: @select_not_allowed( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i32 [[I_011]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 +; CHECK-NEXT: [[C_B:%.*]] = select i1 [[TOBOOL_NOT]], i32* [[C:%.*]], i32* [[B:%.*]] +; CHECK-NEXT: [[COND_IN:%.*]] = getelementptr inbounds i32, i32* [[C_B]], i32 [[I_011]] +; CHECK-NEXT: [[COND:%.*]] = load i32, i32* [[COND_IN]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_011]] +; CHECK-NEXT: store i32 [[COND]], i32* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_011]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]] +; entry: %cmp10 = icmp sgt i32 %N, 0 br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup @@ -267,11 +570,55 @@ for.body: ; preds = %for.body.preheader, } define i32 @i32_smin_reduction(i32* nocapture readonly %x, i32 %n) #0 { -; CHECK-LABEL: i32_smin_reduction( +; CHECK-LABEL: @i32_smin_reduction( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NOT: @llvm.masked.load -; CHECK-NOT: @llvm.masked.store -; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]],
[[LOOP12:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2147483647, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp slt i32 [[R_07]], [[TMP8]] +; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]] +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP13:!llvm.loop !.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 2147483647, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[R_0_LCSSA]] +; entry: %cmp6 = icmp sgt i32 %n, 0 br i1 %cmp6, label %for.body, label %for.cond.cleanup @@ -293,11 +640,55 @@ for.cond.cleanup: ; preds = %for.body, %entry } define i32 @i32_smax_reduction(i32* nocapture readonly %x, i32 %n) #0 { -; CHECK-LABEL: i32_smax_reduction( +; CHECK-LABEL: @i32_smax_reduction( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NOT: @llvm.masked.load -; CHECK-NOT: @llvm.masked.store -; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]],
[[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -2147483648, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i32 [[R_07]], [[TMP8]] +; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]] +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP15:!llvm.loop !.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ -2147483648, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[R_0_LCSSA]] +; entry: %cmp6 = icmp sgt i32 %n, 0 br i1 %cmp6, label %for.body, label %for.cond.cleanup @@ -319,11 +710,55 @@ for.cond.cleanup: ; preds = %for.body, %entry } define i32 @i32_umin_reduction(i32* nocapture readonly %x, i32 %n) #0 { -; CHECK-LABEL: i32_umin_reduction( +; CHECK-LABEL: @i32_umin_reduction( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NOT: @llvm.masked.load -; CHECK-NOT: @llvm.masked.store -; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] +; CHECK-NEXT:
[[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[R_07]], [[TMP8]] +; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]] +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP17:!llvm.loop !.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[R_0_LCSSA]] +; entry: %cmp6 = icmp sgt i32 %n, 0 br i1 %cmp6, label %for.body, label %for.cond.cleanup @@ -345,11 +780,55 @@ for.cond.cleanup: ; preds = %for.body, %entry } define i32 @i32_umax_reduction(i32* nocapture readonly %x, i32 %n) #0 { -; CHECK-LABEL: i32_umax_reduction( +; CHECK-LABEL: @i32_umax_reduction( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NOT: @llvm.masked.load -; CHECK-NOT: @llvm.masked.store -; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP5]] = 
select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[R_07]], [[TMP8]] +; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]] +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP19:!llvm.loop !.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[R_0_LCSSA]] +; entry: %cmp6 = icmp sgt i32 %n, 0 br i1 %cmp6, label %for.body, label %for.cond.cleanup diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll index dd6692d75e5f5..a6e191c3d6923 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll @@ -300,3 +300,117 @@ for.end: ; preds = %for.body !91 = distinct !{!31, !32, !33} !92 = !{!"llvm.loop.vectorize.width", i32 4} !93 = !{!"llvm.loop.vectorize.enable", i1 true} + +declare float @__log10f_finite(float) #0 + +; CHECK-LABEL: @log10_f32 +; CHECK: <4 x float> @__svml_log10f4 +; CHECK: ret +define void @log10_f32(float* nocapture %varray) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call fast float @__log10f_finite(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %indvars.iv + store float %call, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21 + +for.end: ; preds = %for.body + ret void +} + +!101 = distinct !{!21, !22, !23} +!102 = !{!"llvm.loop.vectorize.width", i32 4} +!103 = !{!"llvm.loop.vectorize.enable", i1 true} + + +declare double @__log10_finite(double) #0 + +; CHECK-LABEL: @log10_f64 +; CHECK: 
<4 x double> @__svml_log104 +; CHECK: ret +define void @log10_f64(double* nocapture %varray) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call fast double @__log10_finite(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %indvars.iv + store double %call, double* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !31 + +for.end: ; preds = %for.body + ret void +} + +!111 = distinct !{!31, !32, !33} +!112 = !{!"llvm.loop.vectorize.width", i32 4} +!113 = !{!"llvm.loop.vectorize.enable", i1 true} + +declare float @__sqrtf_finite(float) #0 + +; CHECK-LABEL: @sqrt_f32 +; CHECK: <4 x float> @__svml_sqrtf4 +; CHECK: ret +define void @sqrt_f32(float* nocapture %varray) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call fast float @__sqrtf_finite(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %indvars.iv + store float %call, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21 + +for.end: ; preds = %for.body + ret void +} + +!121 = distinct !{!21, !22, !23} +!122 = !{!"llvm.loop.vectorize.width", i32 4} +!123 = !{!"llvm.loop.vectorize.enable", i1 true} + + +declare double @__sqrt_finite(double) #0 + +; CHECK-LABEL: @sqrt_f64 +; CHECK: <4 x double> @__svml_sqrt4 +; CHECK: ret +define void @sqrt_f64(double* nocapture %varray) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call fast double @__sqrt_finite(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %indvars.iv + store double %call, double* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !31 + +for.end: ; preds = %for.body + ret void +} + +!131 = distinct !{!31, !32, !33} +!132 = !{!"llvm.loop.vectorize.width", i32 4} +!133 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll index c074830075521..da6b4696ba2ba 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll @@ -33,6 +33,16 @@ declare float @log2f(float) #0 declare double @llvm.log2.f64(double) #0 declare float @llvm.log2.f32(float) #0 +declare double @log10(double) #0 +declare float @log10f(float) #0 +declare double @llvm.log10.f64(double) #0 +declare float @llvm.log10.f32(float) #0 + +declare double @sqrt(double) #0 +declare float @sqrtf(float) #0 +declare double @llvm.sqrt.f64(double) #0 +declare float @llvm.sqrt.f32(float) #0 + declare double @exp2(double) #0 declare float @exp2f(float) #0 declare double @llvm.exp2.f64(double) #0 @@ -598,6 +608,190 @@ for.end: ret void } 
+define void @log10_f64(double* nocapture %varray) { +; CHECK-LABEL: @log10_f64( +; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @log10(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log10_f32(float* nocapture %varray) { +; CHECK-LABEL: @log10_f32( +; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @log10f(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log10_f64_intrinsic(double* nocapture %varray) { +; CHECK-LABEL: @log10_f64_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.log10.f64(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log10_f32_intrinsic(float* nocapture %varray) { +; CHECK-LABEL: @log10_f32_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.log10.f32(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @sqrt_f64(double* nocapture %varray) { +; CHECK-LABEL: @sqrt_f64( +; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sqrt4(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @sqrt(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @sqrt_f32(float* nocapture %varray) { +; CHECK-LABEL: @sqrt_f32( +; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sqrtf4(<4 x float> 
[[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @sqrtf(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @sqrt_f64_intrinsic(double* nocapture %varray) { +; CHECK-LABEL: @sqrt_f64_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sqrt4(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.sqrt.f64(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @sqrt_f32_intrinsic(float* nocapture %varray) { +; CHECK-LABEL: @sqrt_f32_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sqrtf4(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.sqrt.f32(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + define void @exp2_f64(double* nocapture %varray) { ; CHECK-LABEL: @exp2_f64( ; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]]) diff --git a/llvm/test/Transforms/LowerSwitch/feature.ll b/llvm/test/Transforms/LowerSwitch/feature.ll index 09d25f0b06d44..55427af498eac 100644 --- a/llvm/test/Transforms/LowerSwitch/feature.ll +++ b/llvm/test/Transforms/LowerSwitch/feature.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -lowerswitch -S | FileCheck %s +; RUN: opt < %s -passes=lowerswitch -S | FileCheck %s ; We have a switch on input. ; On output we should get a binary comparison tree. Check that all is fine.
diff --git a/llvm/test/Transforms/PGOProfile/chr.ll b/llvm/test/Transforms/PGOProfile/chr.ll index c2e1ae4f53a0f..1a22d7f0b8498 100644 --- a/llvm/test/Transforms/PGOProfile/chr.ll +++ b/llvm/test/Transforms/PGOProfile/chr.ll @@ -2006,9 +2006,16 @@ define i64 @test_chr_22(i1 %i, i64* %j, i64 %v0) !prof !14 { ; CHECK-NEXT: bb0: ; CHECK-NEXT: [[REASS_ADD:%.*]] = shl i64 [[V0:%.*]], 1 ; CHECK-NEXT: [[V2:%.*]] = add i64 [[REASS_ADD]], 3 +; CHECK-NEXT: [[C1:%.*]] = icmp slt i64 [[V2]], 100 +; CHECK-NEXT: br i1 [[C1]], label [[BB0_SPLIT:%.*]], label [[BB0_SPLIT_NONCHR:%.*]], !prof !15 +; CHECK: bb0.split: ; CHECK-NEXT: [[V299:%.*]] = mul i64 [[V2]], 7860086430977039991 ; CHECK-NEXT: store i64 [[V299]], i64* [[J:%.*]], align 4 ; CHECK-NEXT: ret i64 99 +; CHECK: bb0.split.nonchr: +; CHECK-NEXT: [[V299_NONCHR:%.*]] = mul i64 [[V2]], 7860086430977039991 +; CHECK-NEXT: store i64 [[V299_NONCHR]], i64* [[J]], align 4 +; CHECK-NEXT: ret i64 99 ; bb0: %v1 = add i64 %v0, 3 diff --git a/llvm/test/Transforms/PGOProfile/split-indirectbr-critical-edges.ll b/llvm/test/Transforms/PGOProfile/split-indirectbr-critical-edges.ll index dc834b7cd47cc..70daa54331a30 100644 --- a/llvm/test/Transforms/PGOProfile/split-indirectbr-critical-edges.ll +++ b/llvm/test/Transforms/PGOProfile/split-indirectbr-critical-edges.ll @@ -37,3 +37,27 @@ if.end: ; preds = %if.end.preheader, % indirectbr i8* %2, [label %for.cond2, label %if.end] ; CHECK: indirectbr i8* %2, [label %for.cond2, label %if.end] } + +;; If an indirectbr critical edge cannot be split, ignore it. +;; The edge will not be profiled. +; CHECK-LABEL: @cannot_split( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.instrprof.increment +; CHECK-NOT: call void @llvm.instrprof.increment +define i32 @cannot_split(i8* nocapture readonly %p) { +entry: + %targets = alloca <2 x i8*>, align 16 + store <2 x i8*> <i8* blockaddress(@cannot_split, %indirect), i8* blockaddress(@cannot_split, %end)>, <2 x i8*>* %targets, align 16 + %arrayidx2 = getelementptr inbounds i8, i8* %p, i64 1 + %0 = load i8, i8* %arrayidx2 + %idxprom = sext i8 %0 to i64 + %arrayidx3 = getelementptr inbounds <2 x i8*>, <2 x i8*>* %targets, i64 0, i64 %idxprom + %1 = load i8*, i8** %arrayidx3, align 8 + br label %indirect + +indirect: ; preds = %entry, %indirect + indirectbr i8* %1, [label %indirect, label %end] + +end: ; preds = %indirect + ret i32 0 +} diff --git a/llvm/test/Transforms/PartialInlining/intrinsic-call-cost.ll b/llvm/test/Transforms/PartialInlining/intrinsic-call-cost.ll new file mode 100644 index 0000000000000..8f5a92df8407c --- /dev/null +++ b/llvm/test/Transforms/PartialInlining/intrinsic-call-cost.ll @@ -0,0 +1,55 @@ +; RUN: opt -partial-inliner -S < %s | FileCheck %s + +; Checks that valid costs are computed for intrinsic calls.
+; https://bugs.llvm.org/show_bug.cgi?id=45932 + + +@emit_notes = external global i8, align 2 + +; CHECK: var_reg_delete +; CHECK-NEXT: bb +; CHECK-NEXT: tail call void @delete_variable_part() +; CHECK-NEXT: ret void + +define void @var_reg_delete() { +bb: + tail call void @delete_variable_part() + ret void +} + +; CHECK: delete_variable_part +; CHECK-NEXT: bb +; CHECK-NEXT: %tmp1.i = tail call i32 @find_variable_location_part() +; CHECK-NEXT: %tmp3.i = icmp sgt i32 %tmp1.i, -1 +; CHECK-NEXT: br i1 %tmp3.i, label %bb4.i, label %delete_slot_part.exit + +; CHECK: bb4.i +; CHECK-NEXT: %tmp.i.i = load i8, i8* @emit_notes +; CHECK-NEXT: %tmp1.i.i = icmp ne i8 %tmp.i.i, 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 %tmp1.i.i) +; CHECK-NEXT: unreachable + +; CHECK: delete_slot_part.exit +; CHECK-NEXT: ret void + +define void @delete_variable_part() { +bb: + %tmp1.i = tail call i32 @find_variable_location_part() + %tmp3.i = icmp sgt i32 %tmp1.i, -1 + br i1 %tmp3.i, label %bb4.i, label %delete_slot_part.exit + +bb4.i: + %tmp.i.i = load i8, i8* @emit_notes, align 2 + %tmp1.i.i = icmp ne i8 %tmp.i.i, 0 + tail call void @llvm.assume(i1 %tmp1.i.i) + unreachable + +delete_slot_part.exit: + ret void +} + +; CHECK: declare i32 @find_variable_location_part +declare i32 @find_variable_location_part() + +; CHECK: declare void @llvm.assume(i1 noundef) +declare void @llvm.assume(i1 noundef) diff --git a/llvm/test/Transforms/PhaseOrdering/X86/masked-memory-ops.ll b/llvm/test/Transforms/PhaseOrdering/X86/masked-memory-ops.ll new file mode 100644 index 0000000000000..96535892953f2 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/X86/masked-memory-ops.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -O2 -S | FileCheck %s --check-prefixes=CHECK,OLDPM +; RUN: opt < %s -passes='default<O2>' -aa-pipeline=default -S | FileCheck %s --check-prefixes=CHECK,NEWPM + +target triple = "x86_64--" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) #0 +declare void @llvm.masked.store.v8f32.p0v8f32(<8 x float>, <8 x float>*, i32, <8 x i1>) + +; PR11210: If we have been able to replace an AVX/AVX2 masked store with a +; generic masked store intrinsic, then we should be able to remove dead +; masked stores. + +define void @PR11210_v8f32_maskstore_maskstore(i8* %ptr, <8 x float> %x, <8 x float> %y, <8 x i32> %src) { +; CHECK-LABEL: @PR11210_v8f32_maskstore_maskstore( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <8 x i32> [[SRC:%.*]], zeroinitializer +; CHECK-NEXT: [[CASTVEC:%.*]] = bitcast i8* [[PTR:%.*]] to <8 x float>* +; CHECK-NEXT: tail call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[Y:%.*]], <8 x float>* [[CASTVEC]], i32 1, <8 x i1> [[CMP]]) +; CHECK-NEXT: ret void +; + %cmp = icmp sgt <8 x i32> %src, zeroinitializer + %mask = sext <8 x i1> %cmp to <8 x i32> + call void @llvm.x86.avx.maskstore.ps.256(i8* %ptr, <8 x i32> %mask, <8 x float> %x) + call void @llvm.x86.avx.maskstore.ps.256(i8* %ptr, <8 x i32> %mask, <8 x float> %y) + ret void +} + +; The contents of %mask are unknown so we don't replace this with a generic masked.store.
+define void @PR11210_v8f32_maskstore_maskstore_raw_mask(i8* %ptr, <8 x float> %x, <8 x float> %y, <8 x i32> %mask) { +; CHECK-LABEL: @PR11210_v8f32_maskstore_maskstore_raw_mask( +; CHECK-NEXT: tail call void @llvm.x86.avx.maskstore.ps.256(i8* [[PTR:%.*]], <8 x i32> [[MASK:%.*]], <8 x float> [[X:%.*]]) +; CHECK-NEXT: tail call void @llvm.x86.avx.maskstore.ps.256(i8* [[PTR]], <8 x i32> [[MASK]], <8 x float> [[Y:%.*]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx.maskstore.ps.256(i8* %ptr, <8 x i32> %mask, <8 x float> %x) + call void @llvm.x86.avx.maskstore.ps.256(i8* %ptr, <8 x i32> %mask, <8 x float> %y) + ret void +} + +; Mix AVX and generic masked stores. +define void @PR11210_v8f32_mstore_maskstore(i8* %ptr, <8 x float> %x, <8 x float> %y, <8 x i32> %src) { +; CHECK-LABEL: @PR11210_v8f32_mstore_maskstore( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <8 x i32> [[SRC:%.*]], zeroinitializer +; CHECK-NEXT: [[PTRF:%.*]] = bitcast i8* [[PTR:%.*]] to <8 x float>* +; CHECK-NEXT: tail call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[Y:%.*]], <8 x float>* [[PTRF]], i32 1, <8 x i1> [[CMP]]) +; CHECK-NEXT: ret void +; + %cmp = icmp sgt <8 x i32> %src, zeroinitializer + %mask = sext <8 x i1> %cmp to <8 x i32> + %ptrf = bitcast i8* %ptr to <8 x float>* + tail call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> %x, <8 x float>* %ptrf, i32 1, <8 x i1> %cmp) + call void @llvm.x86.avx.maskstore.ps.256(i8* %ptr, <8 x i32> %mask, <8 x float> %y) + ret void +} + diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll index 0e02a01291d84..c3699ff0d6b4f 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll @@ -12,7 +12,7 @@ define i32 @add_v4i32(i32* %p) #0 { ; CHECK-LABEL: @add_v4i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !tbaa !0 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, [[TBAA0:!tbaa !.*]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP1]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> @@ -51,7 +51,7 @@ define signext i16 @mul_v8i16(i16* %p) #0 { ; CHECK-LABEL: @mul_v8i16( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[P:%.*]] to <8 x i16>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2, !tbaa !4 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2, [[TBAA4:!tbaa !.*]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = mul <8 x i16> [[TMP1]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> undef, <8 x i32> @@ -95,7 +95,7 @@ define signext i8 @or_v16i8(i8* %p) #0 { ; CHECK-LABEL: @or_v16i8( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P:%.*]] to <16 x i8>* -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1, !tbaa !6 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1, [[TBAA6:!tbaa !.*]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = or <16 x i8> 
[[TMP1]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <16 x i8> [[BIN_RDX]], <16 x i8> undef, <16 x i32> @@ -141,7 +141,7 @@ define i32 @smin_v4i32(i32* %p) #0 { ; CHECK-LABEL: @smin_v4i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !tbaa !0 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, [[TBAA0]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i32> [[TMP1]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP1]], <4 x i32> [[RDX_SHUF]] @@ -195,7 +195,7 @@ define i32 @umax_v4i32(i32* %p) #0 { ; CHECK-LABEL: @umax_v4i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !tbaa !0 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, [[TBAA0]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ugt <4 x i32> [[TMP1]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP1]], <4 x i32> [[RDX_SHUF]] @@ -249,7 +249,7 @@ define float @fadd_v4i32(float* %p) #0 { ; CHECK-LABEL: @fadd_v4i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, !tbaa !7 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, [[TBAA7:!tbaa !.*]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP1]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> @@ -290,7 +290,7 @@ define float @fmul_v4i32(float* %p) #0 { ; CHECK-LABEL: @fmul_v4i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, !tbaa !7 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, [[TBAA7]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = fmul fast <4 x float> [[TMP1]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> @@ -330,18 +330,17 @@ for.end: define float @fmin_v4i32(float* %p) #0 { ; CHECK-LABEL: @fmin_v4i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[P:%.*]], align 4, !tbaa !7 -; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP0]], float 0x47EFFFFFE0000000) +; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[P:%.*]], align 4, [[TBAA7]] ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[P]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX_1]], align 4, !tbaa !7 -; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP2]], float [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4, [[TBAA7]] +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP1]], float [[TMP0]]) ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] 
= getelementptr inbounds float, float* [[P]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX_2]], align 4, !tbaa !7 -; CHECK-NEXT: [[TMP5:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP4]], float [[TMP3]]) +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX_2]], align 4, [[TBAA7]] +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP3]], float [[TMP2]]) ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX_3]], align 4, !tbaa !7 -; CHECK-NEXT: [[TMP7:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP6]], float [[TMP5]]) -; CHECK-NEXT: ret float [[TMP7]] +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX_3]], align 4, [[TBAA7]] +; CHECK-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP5]], float [[TMP4]]) +; CHECK-NEXT: ret float [[TMP6]] ; entry: br label %for.cond diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll index cf01ead15b0e5..4610febfdd3db 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll @@ -71,12 +71,10 @@ define i32 @TestVectorsEqual(i32* noalias %Vec0, i32* noalias %Vec1, i32 %Tolera ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[VEC1:%.*]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) -; CHECK-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP8]], [[TOLERANCE:%.*]] -; CHECK-NEXT: [[COND6:%.*]] = zext i1 [[CMP5]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP4]], i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[CMP5_NOT:%.*]] = icmp sle i32 [[TMP6]], [[TOLERANCE:%.*]] +; CHECK-NEXT: [[COND6:%.*]] = zext i1 [[CMP5_NOT]] to i32 ; CHECK-NEXT: ret i32 [[COND6]] ; entry: @@ -134,8 +132,8 @@ define i32 @TestVectorsEqual_alt(i32* noalias %Vec0, i32* noalias %Vec1, i32 %To ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) -; CHECK-NEXT: [[CMP3:%.*]] = icmp ule i32 [[TMP5]], [[TOLERANCE:%.*]] -; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP3]] to i32 +; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp ule i32 [[TMP5]], [[TOLERANCE:%.*]] +; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP3_NOT]] to i32 ; CHECK-NEXT: ret i32 [[COND]] ; entry: diff --git a/llvm/test/Transforms/PhaseOrdering/inlining-alignment-assumptions.ll b/llvm/test/Transforms/PhaseOrdering/inlining-alignment-assumptions.ll index 61287e35005ff..2605701d231d2 100644 --- a/llvm/test/Transforms/PhaseOrdering/inlining-alignment-assumptions.ll +++ b/llvm/test/Transforms/PhaseOrdering/inlining-alignment-assumptions.ll @@ -41,10 +41,7 @@ define void @caller1(i1 %c, i64* align 1 %ptr) { ; ASSUMPTIONS-ON-NEXT: br i1 [[C:%.*]], label [[TRUE2_CRITEDGE:%.*]], label 
[[FALSE1:%.*]] ; ASSUMPTIONS-ON: false1: ; ASSUMPTIONS-ON-NEXT: store volatile i64 1, i64* [[PTR:%.*]], align 8 -; ASSUMPTIONS-ON-NEXT: [[PTRINT:%.*]] = ptrtoint i64* [[PTR]] to i64 -; ASSUMPTIONS-ON-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 7 -; ASSUMPTIONS-ON-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 -; ASSUMPTIONS-ON-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; ASSUMPTIONS-ON-NEXT: call void @llvm.assume(i1 true) [ "align"(i64* [[PTR]], i64 8) ] ; ASSUMPTIONS-ON-NEXT: store volatile i64 0, i64* [[PTR]], align 8 ; ASSUMPTIONS-ON-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 ; ASSUMPTIONS-ON-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 @@ -54,10 +51,7 @@ define void @caller1(i1 %c, i64* align 1 %ptr) { ; ASSUMPTIONS-ON-NEXT: store volatile i64 3, i64* [[PTR]], align 8 ; ASSUMPTIONS-ON-NEXT: ret void ; ASSUMPTIONS-ON: true2.critedge: -; ASSUMPTIONS-ON-NEXT: [[PTRINT_C:%.*]] = ptrtoint i64* [[PTR]] to i64 -; ASSUMPTIONS-ON-NEXT: [[MASKEDPTR_C:%.*]] = and i64 [[PTRINT_C]], 7 -; ASSUMPTIONS-ON-NEXT: [[MASKCOND_C:%.*]] = icmp eq i64 [[MASKEDPTR_C]], 0 -; ASSUMPTIONS-ON-NEXT: tail call void @llvm.assume(i1 [[MASKCOND_C]]) +; ASSUMPTIONS-ON-NEXT: call void @llvm.assume(i1 true) [ "align"(i64* [[PTR]], i64 8) ] ; ASSUMPTIONS-ON-NEXT: store volatile i64 0, i64* [[PTR]], align 8 ; ASSUMPTIONS-ON-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 ; ASSUMPTIONS-ON-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 @@ -94,26 +88,17 @@ false2: ; This test checks that alignment assumptions do not prevent SROA. ; See PR45763. -define internal void @callee2(i64* noalias sret align 8 %arg) { +define internal void @callee2(i64* noalias sret align 32 %arg) { store i64 0, i64* %arg, align 8 ret void } define amdgpu_kernel void @caller2() { -; ASSUMPTIONS-OFF-LABEL: @caller2( -; ASSUMPTIONS-OFF-NEXT: ret void -; -; ASSUMPTIONS-ON-LABEL: @caller2( -; ASSUMPTIONS-ON-NEXT: [[ALLOCA:%.*]] = alloca i64, align 8, addrspace(5) -; ASSUMPTIONS-ON-NEXT: [[CAST:%.*]] = addrspacecast i64 addrspace(5)* [[ALLOCA]] to i64* -; ASSUMPTIONS-ON-NEXT: [[PTRINT:%.*]] = ptrtoint i64* [[CAST]] to i64 -; ASSUMPTIONS-ON-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 7 -; ASSUMPTIONS-ON-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 -; ASSUMPTIONS-ON-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) -; ASSUMPTIONS-ON-NEXT: ret void +; CHECK-LABEL: @caller2( +; CHECK-NEXT: ret void ; %alloca = alloca i64, align 8, addrspace(5) %cast = addrspacecast i64 addrspace(5)* %alloca to i64* - call void @callee2(i64* sret align 8 %cast) + call void @callee2(i64* sret align 32 %cast) ret void } diff --git a/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll b/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll index 1d8cce6879e9d..314af1c141454 100644 --- a/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll +++ b/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll @@ -5,14 +5,11 @@ ; RUN: opt -O3 -rotation-max-header-size=1 -S < %s | FileCheck %s --check-prefixes=HOIST,THR1,FALLBACK2 ; RUN: opt -passes='default<O3>' -rotation-max-header-size=1 -S < %s | FileCheck %s --check-prefixes=HOIST,THR1,FALLBACK3 -; RUN: opt -O3 -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefixes=HOIST,THR2,FALLBACK4 -; RUN: opt -passes='default<O3>' -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefixes=HOIST,THR2,FALLBACK5 +; RUN: opt -O3 -rotation-max-header-size=2 -S < %s | FileCheck %s
--check-prefixes=ROTATED_LATER,ROTATED_LATER_OLDPM,FALLBACK4 +; RUN: opt -passes='default<O3>' -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefixes=ROTATED_LATER,ROTATED_LATER_NEWPM,FALLBACK5 -; RUN: opt -O3 -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefixes=ROTATED_LATER,ROTATED_LATER_OLDPM,FALLBACK6 -; RUN: opt -passes='default<O3>' -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefixes=ROTATED_LATER,ROTATED_LATER_NEWPM,FALLBACK7 - -; RUN: opt -O3 -rotation-max-header-size=4 -S < %s | FileCheck %s --check-prefixes=ROTATE,ROTATE_OLDPM,FALLBACK8 -; RUN: opt -passes='default<O3>' -rotation-max-header-size=4 -S < %s | FileCheck %s --check-prefixes=ROTATE,ROTATE_NEWPM,FALLBACK9 +; RUN: opt -O3 -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefixes=ROTATE,ROTATE_OLDPM,FALLBACK6 +; RUN: opt -passes='default<O3>' -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefixes=ROTATE,ROTATE_NEWPM,FALLBACK7 ; This example is produced from a very basic C code: ; @@ -61,8 +58,8 @@ define void @_Z4loopi(i32 %width) { ; HOIST-NEXT: br label [[FOR_COND:%.*]] ; HOIST: for.cond: ; HOIST-NEXT: [[I_0:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ] -; HOIST-NEXT: tail call void @f0() ; HOIST-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[I_0]], [[TMP0]] +; HOIST-NEXT: tail call void @f0() ; HOIST-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] ; HOIST: for.cond.cleanup: ; HOIST-NEXT: tail call void @f2() @@ -80,17 +77,17 @@ define void @_Z4loopi(i32 %width) { ; ROTATED_LATER_OLDPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; ROTATED_LATER_OLDPM: for.cond.preheader: ; ROTATED_LATER_OLDPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1 -; ROTATED_LATER_OLDPM-NEXT: tail call void @f0() ; ROTATED_LATER_OLDPM-NEXT: [[EXITCOND_NOT3:%.*]] = icmp eq i32 [[TMP0]], 0 ; ROTATED_LATER_OLDPM-NEXT: br i1 [[EXITCOND_NOT3]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]] ; ROTATED_LATER_OLDPM: for.cond.cleanup: +; ROTATED_LATER_OLDPM-NEXT: tail call void @f0() ; ROTATED_LATER_OLDPM-NEXT: tail call void @f2() ; ROTATED_LATER_OLDPM-NEXT: br label [[RETURN]] ; ROTATED_LATER_OLDPM: for.body: ; ROTATED_LATER_OLDPM-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_COND_PREHEADER]] ] +; ROTATED_LATER_OLDPM-NEXT: tail call void @f0() ; ROTATED_LATER_OLDPM-NEXT: tail call void @f1() ; ROTATED_LATER_OLDPM-NEXT: [[INC]] = add nuw i32 [[I_04]], 1 -; ROTATED_LATER_OLDPM-NEXT: tail call void @f0() ; ROTATED_LATER_OLDPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[TMP0]] ; ROTATED_LATER_OLDPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; ROTATED_LATER_OLDPM: return: @@ -102,19 +99,19 @@ define void @_Z4loopi(i32 %width) { ; ROTATED_LATER_NEWPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; ROTATED_LATER_NEWPM: for.cond.preheader: ; ROTATED_LATER_NEWPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1 -; ROTATED_LATER_NEWPM-NEXT: tail call void @f0() ; ROTATED_LATER_NEWPM-NEXT: [[EXITCOND_NOT3:%.*]] = icmp eq i32 [[TMP0]], 0 ; ROTATED_LATER_NEWPM-NEXT: br i1 [[EXITCOND_NOT3]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND_PREHEADER_FOR_BODY_CRIT_EDGE:%.*]] ; ROTATED_LATER_NEWPM: for.cond.preheader.for.body_crit_edge: ; ROTATED_LATER_NEWPM-NEXT: [[INC_1:%.*]] = add nuw i32 0, 1 ; ROTATED_LATER_NEWPM-NEXT: br label [[FOR_BODY:%.*]] ; ROTATED_LATER_NEWPM: for.cond.cleanup: +; ROTATED_LATER_NEWPM-NEXT: tail
call void @f0() ; ROTATED_LATER_NEWPM-NEXT: tail call void @f2() ; ROTATED_LATER_NEWPM-NEXT: br label [[RETURN]] ; ROTATED_LATER_NEWPM: for.body: ; ROTATED_LATER_NEWPM-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[INC_0:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ], [ [[INC_1]], [[FOR_COND_PREHEADER_FOR_BODY_CRIT_EDGE]] ] -; ROTATED_LATER_NEWPM-NEXT: tail call void @f1() ; ROTATED_LATER_NEWPM-NEXT: tail call void @f0() +; ROTATED_LATER_NEWPM-NEXT: tail call void @f1() ; ROTATED_LATER_NEWPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_PHI]], [[TMP0]] ; ROTATED_LATER_NEWPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ; ROTATED_LATER_NEWPM: for.body.for.body_crit_edge: @@ -129,19 +126,19 @@ define void @_Z4loopi(i32 %width) { ; ROTATE_OLDPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; ROTATE_OLDPM: for.cond.preheader: ; ROTATE_OLDPM-NEXT: [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1 -; ROTATE_OLDPM-NEXT: tail call void @f0() ; ROTATE_OLDPM-NEXT: br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] ; ROTATE_OLDPM: for.body.preheader: ; ROTATE_OLDPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1 ; ROTATE_OLDPM-NEXT: br label [[FOR_BODY:%.*]] ; ROTATE_OLDPM: for.cond.cleanup: +; ROTATE_OLDPM-NEXT: tail call void @f0() ; ROTATE_OLDPM-NEXT: tail call void @f2() ; ROTATE_OLDPM-NEXT: br label [[RETURN]] ; ROTATE_OLDPM: for.body: ; ROTATE_OLDPM-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; ROTATE_OLDPM-NEXT: tail call void @f0() ; ROTATE_OLDPM-NEXT: tail call void @f1() ; ROTATE_OLDPM-NEXT: [[INC]] = add nuw nsw i32 [[I_04]], 1 -; ROTATE_OLDPM-NEXT: tail call void @f0() ; ROTATE_OLDPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[TMP0]] ; ROTATE_OLDPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; ROTATE_OLDPM: return: @@ -153,19 +150,19 @@ define void @_Z4loopi(i32 %width) { ; ROTATE_NEWPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; ROTATE_NEWPM: for.cond.preheader: ; ROTATE_NEWPM-NEXT: [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1 -; ROTATE_NEWPM-NEXT: tail call void @f0() ; ROTATE_NEWPM-NEXT: br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] ; ROTATE_NEWPM: for.body.preheader: ; ROTATE_NEWPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1 ; ROTATE_NEWPM-NEXT: [[INC_1:%.*]] = add nuw nsw i32 0, 1 ; ROTATE_NEWPM-NEXT: br label [[FOR_BODY:%.*]] ; ROTATE_NEWPM: for.cond.cleanup: +; ROTATE_NEWPM-NEXT: tail call void @f0() ; ROTATE_NEWPM-NEXT: tail call void @f2() ; ROTATE_NEWPM-NEXT: br label [[RETURN]] ; ROTATE_NEWPM: for.body: ; ROTATE_NEWPM-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[INC_0:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ], [ [[INC_1]], [[FOR_BODY_PREHEADER]] ] -; ROTATE_NEWPM-NEXT: tail call void @f1() ; ROTATE_NEWPM-NEXT: tail call void @f0() +; ROTATE_NEWPM-NEXT: tail call void @f1() ; ROTATE_NEWPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_PHI]], [[TMP0]] ; ROTATE_NEWPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ; ROTATE_NEWPM: for.body.for.body_crit_edge: diff --git a/llvm/test/Transforms/PhaseOrdering/min-max-abs-cse.ll b/llvm/test/Transforms/PhaseOrdering/min-max-abs-cse.ll index bdf75ca7e82e4..b94cabc780dd7 100644 --- a/llvm/test/Transforms/PhaseOrdering/min-max-abs-cse.ll +++ b/llvm/test/Transforms/PhaseOrdering/min-max-abs-cse.ll @@ -33,10 +33,8 @@ define i8 @smax_nsw(i8 %a, 
i8 %b) { define i8 @abs_swapped(i8 %a) { ; CHECK-LABEL: @abs_swapped( -; CHECK-NEXT: [[NEG:%.*]] = sub i8 0, [[A:%.*]] -; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i8 [[A]], 0 -; CHECK-NEXT: [[M1:%.*]] = select i1 [[CMP1]], i8 [[NEG]], i8 [[A]] -; CHECK-NEXT: ret i8 [[M1]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[A:%.*]], i1 false) +; CHECK-NEXT: ret i8 [[TMP1]] ; %neg = sub i8 0, %a %cmp1 = icmp sgt i8 %a, 0 @@ -81,9 +79,8 @@ define i8 @abs_different_constants(i8 %a) { define i8 @nabs_different_constants(i8 %a) { ; CHECK-LABEL: @nabs_different_constants( -; CHECK-NEXT: [[NEG:%.*]] = sub i8 0, [[A:%.*]] -; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i8 [[A]], 0 -; CHECK-NEXT: [[M1:%.*]] = select i1 [[CMP1]], i8 [[A]], i8 [[NEG]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[A:%.*]], i1 false) +; CHECK-NEXT: [[M1:%.*]] = sub i8 0, [[TMP1]] ; CHECK-NEXT: ret i8 [[M1]] ; %neg = sub i8 0, %a diff --git a/llvm/test/Transforms/SCCP/intrinsics.ll b/llvm/test/Transforms/SCCP/intrinsics.ll index d06b94162b5be..e261a59d3d6bc 100644 --- a/llvm/test/Transforms/SCCP/intrinsics.ll +++ b/llvm/test/Transforms/SCCP/intrinsics.ll @@ -12,10 +12,8 @@ define void @abs1(i8* %p) { ; CHECK-LABEL: @abs1( ; CHECK-NEXT: [[X:%.*]] = load i8, i8* [[P:%.*]], align 1, [[RNG0:!range !.*]] ; CHECK-NEXT: [[ABS:%.*]] = call i8 @llvm.abs.i8(i8 [[X]], i1 false) -; CHECK-NEXT: [[CMP1:%.*]] = icmp sge i8 [[ABS]], 0 -; CHECK-NEXT: call void @use(i1 [[CMP1]]) -; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i8 [[ABS]], 10 -; CHECK-NEXT: call void @use(i1 [[CMP2]]) +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: [[CMP3:%.*]] = icmp sge i8 [[ABS]], 1 ; CHECK-NEXT: call void @use(i1 [[CMP3]]) ; CHECK-NEXT: [[CMP4:%.*]] = icmp slt i8 [[ABS]], 9 @@ -40,8 +38,7 @@ define void @abs1(i8* %p) { define void @abs2(i8 %x) { ; CHECK-LABEL: @abs2( ; CHECK-NEXT: [[ABS:%.*]] = call i8 @llvm.abs.i8(i8 [[X:%.*]], i1 true) -; CHECK-NEXT: [[CMP:%.*]] = icmp sge i8 [[ABS]], 0 -; CHECK-NEXT: call void @use(i1 [[CMP]]) +; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: ret void ; %abs = call i8 @llvm.abs.i8(i8 %x, i1 true) @@ -68,10 +65,8 @@ define void @umax1(i8* %p1, i8* %p2) { ; CHECK-NEXT: [[X1:%.*]] = load i8, i8* [[P1:%.*]], align 1, [[RNG1:!range !.*]] ; CHECK-NEXT: [[X2:%.*]] = load i8, i8* [[P2:%.*]], align 1, [[RNG2:!range !.*]] ; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.umax.i8(i8 [[X1]], i8 [[X2]]) -; CHECK-NEXT: [[CMP1:%.*]] = icmp uge i8 [[M]], 5 -; CHECK-NEXT: call void @use(i1 [[CMP1]]) -; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i8 [[M]], 15 -; CHECK-NEXT: call void @use(i1 [[CMP2]]) +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: [[CMP3:%.*]] = icmp uge i8 [[M]], 6 ; CHECK-NEXT: call void @use(i1 [[CMP3]]) ; CHECK-NEXT: [[CMP4:%.*]] = icmp ult i8 [[M]], 14 @@ -95,8 +90,7 @@ define void @umax1(i8* %p1, i8* %p2) { define void @umax2(i8 %x) { ; CHECK-LABEL: @umax2( ; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 10) -; CHECK-NEXT: [[CMP:%.*]] = icmp uge i8 [[M]], 10 -; CHECK-NEXT: call void @use(i1 [[CMP]]) +; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: ret void ; %m = call i8 @llvm.umax.i8(i8 %x, i8 10) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll new file mode 100644 index 0000000000000..b5cab5d3186af --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py +; RUN: opt < %s -slp-vectorizer -S 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; WARN-NOT: warning + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +define <2 x float> @insertelement-fixed-vector() { +; CHECK-LABEL: @insertelement-fixed-vector( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x float> @llvm.fabs.v2f32(<2 x float> undef) +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; CHECK-NEXT: [[I0:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; CHECK-NEXT: [[I1:%.*]] = insertelement <2 x float> [[I0]], float [[TMP3]], i32 1 +; CHECK-NEXT: ret <2 x float> [[I1]] +; + %f0 = tail call fast float @llvm.fabs.f32(float undef) + %f1 = tail call fast float @llvm.fabs.f32(float undef) + %i0 = insertelement <2 x float> undef, float %f0, i32 0 + %i1 = insertelement <2 x float> %i0, float %f1, i32 1 + ret <2 x float> %i1 +} + +; TODO: llvm.fabs could be optimized in vector form. It's legal to extract +; elements from fixed-length vector and insert into scalable vector. +define <vscale x 2 x float> @insertelement-scalable-vector() { +; CHECK-LABEL: @insertelement-scalable-vector( +; CHECK-NEXT: [[F0:%.*]] = tail call fast float @llvm.fabs.f32(float undef) +; CHECK-NEXT: [[F1:%.*]] = tail call fast float @llvm.fabs.f32(float undef) +; CHECK-NEXT: [[I0:%.*]] = insertelement <vscale x 2 x float> undef, float [[F0]], i32 0 +; CHECK-NEXT: [[I1:%.*]] = insertelement <vscale x 2 x float> [[I0]], float [[F1]], i32 1 +; CHECK-NEXT: ret <vscale x 2 x float> [[I1]] +; + %f0 = tail call fast float @llvm.fabs.f32(float undef) + %f1 = tail call fast float @llvm.fabs.f32(float undef) + %i0 = insertelement <vscale x 2 x float> undef, float %f0, i32 0 + %i1 = insertelement <vscale x 2 x float> %i0, float %f1, i32 1 + ret <vscale x 2 x float> %i1 +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare float @llvm.fabs.f32(float) diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll index 4007a0d30edc5..397e98eb881df 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll @@ -107,6 +107,8 @@ define i64 @sminv6() { ret i64 %select5 } +; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select +; with fastmath on the select.
define float @fmaxv6() { ; GFX9-LABEL: @fmaxv6( ; GFX9-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @farr to <2 x float>*), align 16 @@ -114,19 +116,21 @@ define float @fmaxv6() { ; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 ; GFX9-NEXT: [[CMP1:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] ; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], float [[TMP2]], float [[TMP3]] -; GFX9-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2) to <4 x float>*), align 8 -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> -; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP4]], [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP4]], <4 x float> [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> -; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]] -; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0 -; GFX9-NEXT: [[TMP6:%.*]] = fcmp fast ogt float [[TMP5]], [[SELECT1]] -; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], float [[TMP5]], float [[SELECT1]] +; GFX9-NEXT: [[LOAD3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2), align 8 +; GFX9-NEXT: [[CMP2:%.*]] = fcmp fast ogt float [[SELECT1]], [[LOAD3]] +; GFX9-NEXT: [[SELECT2:%.*]] = select i1 [[CMP2]], float [[SELECT1]], float [[LOAD3]] +; GFX9-NEXT: [[LOAD4:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 3), align 4 +; GFX9-NEXT: [[CMP3:%.*]] = fcmp fast ogt float [[SELECT2]], [[LOAD4]] +; GFX9-NEXT: [[SELECT3:%.*]] = select i1 [[CMP3]], float [[SELECT2]], float [[LOAD4]] +; GFX9-NEXT: [[LOAD5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 4), align 16 +; GFX9-NEXT: [[CMP4:%.*]] = fcmp fast ogt float [[SELECT3]], [[LOAD5]] +; GFX9-NEXT: [[SELECT4:%.*]] = select i1 [[CMP4]], float [[SELECT3]], float [[LOAD5]] +; GFX9-NEXT: [[LOAD6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 5), align 4 +; GFX9-NEXT: [[CMP5:%.*]] = fcmp fast ogt float [[SELECT4]], [[LOAD6]] +; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], float [[SELECT4]], float [[LOAD6]] ; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], float 3.000000e+00, float 4.000000e+00 ; GFX9-NEXT: store float [[STORE_SELECT]], float* @fvar, align 8 -; GFX9-NEXT: ret float [[OP_EXTRA]] +; GFX9-NEXT: ret float [[SELECT5]] ; %load1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 0), align 16 %load2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 1), align 4 @@ -154,6 +158,8 @@ define float @fmaxv6() { ret float %select5 } +; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select +; with fastmath on the select.
define double @dminv6() { ; GFX9-LABEL: @dminv6( ; GFX9-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([32 x double]* @darr to <2 x double>*), align 16 @@ -161,19 +167,21 @@ define double @dminv6() { ; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 ; GFX9-NEXT: [[CMP1:%.*]] = fcmp fast olt double [[TMP2]], [[TMP3]] ; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], double [[TMP2]], double [[TMP3]] -; GFX9-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2) to <4 x double>*), align 8 -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> -; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x double> [[TMP4]], [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x double> [[TMP4]], <4 x double> [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> -; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x double> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> [[RDX_SHUF1]] -; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[RDX_MINMAX_SELECT3]], i32 0 -; GFX9-NEXT: [[TMP6:%.*]] = fcmp fast olt double [[TMP5]], [[SELECT1]] -; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], double [[TMP5]], double [[SELECT1]] +; GFX9-NEXT: [[LOAD3:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2), align 8 +; GFX9-NEXT: [[CMP2:%.*]] = fcmp fast olt double [[SELECT1]], [[LOAD3]] +; GFX9-NEXT: [[SELECT2:%.*]] = select i1 [[CMP2]], double [[SELECT1]], double [[LOAD3]] +; GFX9-NEXT: [[LOAD4:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 3), align 4 +; GFX9-NEXT: [[CMP3:%.*]] = fcmp fast olt double [[SELECT2]], [[LOAD4]] +; GFX9-NEXT: [[SELECT3:%.*]] = select i1 [[CMP3]], double [[SELECT2]], double [[LOAD4]] +; GFX9-NEXT: [[LOAD5:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 4), align 16 +; GFX9-NEXT: [[CMP4:%.*]] = fcmp fast olt double [[SELECT3]], [[LOAD5]] +; GFX9-NEXT: [[SELECT4:%.*]] = select i1 [[CMP4]], double [[SELECT3]], double [[LOAD5]] +; GFX9-NEXT: [[LOAD6:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 5), align 4 +; GFX9-NEXT: [[CMP5:%.*]] = fcmp fast olt double [[SELECT4]], [[LOAD6]] +; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], double [[SELECT4]], double [[LOAD6]] ; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], double 3.000000e+00, double 4.000000e+00 ; GFX9-NEXT: store double [[STORE_SELECT]], double* @dvar, align 8 -; GFX9-NEXT: ret double [[OP_EXTRA]] +; GFX9-NEXT: ret double [[SELECT5]] ; %load1 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 0), align 16 %load2 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 1), align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll index d7434394dcc39..f97b1243f9548 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll @@ -611,31 +611,22 @@ entry: ret i16 %max3 } +; FIXME: Use
fmaxnum intrinsics to match what InstCombine creates for fcmp+select +; with fastmath on the select. define half @reduction_fmax_v4half(<4 x half> %vec4) { -; GFX9-LABEL: @reduction_fmax_v4half( -; GFX9-NEXT: entry: -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[VEC4:%.*]], <4 x half> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x half> [[VEC4]], [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x half> [[VEC4]], <4 x half> [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x half> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> [[RDX_SHUF1]] -; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[RDX_MINMAX_SELECT3]], i32 0 -; GFX9-NEXT: ret half [[TMP0]] -; -; VI-LABEL: @reduction_fmax_v4half( -; VI-NEXT: entry: -; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0 -; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1 -; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2 -; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3 -; VI-NEXT: [[CMP1:%.*]] = fcmp fast ogt half [[ELT1]], [[ELT0]] -; VI-NEXT: [[MAX1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]] -; VI-NEXT: [[CMP2:%.*]] = fcmp fast ogt half [[ELT2]], [[MAX1]] -; VI-NEXT: [[MAX2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MAX1]] -; VI-NEXT: [[CMP3:%.*]] = fcmp fast ogt half [[ELT3]], [[MAX2]] -; VI-NEXT: [[MAX3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MAX2]] -; VI-NEXT: ret half [[MAX3]] +; GCN-LABEL: @reduction_fmax_v4half( +; GCN-NEXT: entry: +; GCN-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0 +; GCN-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1 +; GCN-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2 +; GCN-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3 +; GCN-NEXT: [[CMP1:%.*]] = fcmp fast ogt half [[ELT1]], [[ELT0]] +; GCN-NEXT: [[MAX1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]] +; GCN-NEXT: [[CMP2:%.*]] = fcmp fast ogt half [[ELT2]], [[MAX1]] +; GCN-NEXT: [[MAX2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MAX1]] +; GCN-NEXT: [[CMP3:%.*]] = fcmp fast ogt half [[ELT3]], [[MAX2]] +; GCN-NEXT: [[MAX3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MAX2]] +; GCN-NEXT: ret half [[MAX3]] ; entry: %elt0 = extractelement <4 x half> %vec4, i64 0 @@ -653,31 +644,22 @@ entry: ret half %max3 } +; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select +; with fastmath on the select. 
define half @reduction_fmin_v4half(<4 x half> %vec4) { -; GFX9-LABEL: @reduction_fmin_v4half( -; GFX9-NEXT: entry: -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[VEC4:%.*]], <4 x half> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x half> [[VEC4]], [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x half> [[VEC4]], <4 x half> [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x half> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> [[RDX_SHUF1]] -; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[RDX_MINMAX_SELECT3]], i32 0 -; GFX9-NEXT: ret half [[TMP0]] -; -; VI-LABEL: @reduction_fmin_v4half( -; VI-NEXT: entry: -; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0 -; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1 -; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2 -; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3 -; VI-NEXT: [[CMP1:%.*]] = fcmp fast olt half [[ELT1]], [[ELT0]] -; VI-NEXT: [[MIN1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]] -; VI-NEXT: [[CMP2:%.*]] = fcmp fast olt half [[ELT2]], [[MIN1]] -; VI-NEXT: [[MIN2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MIN1]] -; VI-NEXT: [[CMP3:%.*]] = fcmp fast olt half [[ELT3]], [[MIN2]] -; VI-NEXT: [[MIN3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MIN2]] -; VI-NEXT: ret half [[MIN3]] +; GCN-LABEL: @reduction_fmin_v4half( +; GCN-NEXT: entry: +; GCN-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0 +; GCN-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1 +; GCN-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2 +; GCN-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3 +; GCN-NEXT: [[CMP1:%.*]] = fcmp fast olt half [[ELT1]], [[ELT0]] +; GCN-NEXT: [[MIN1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]] +; GCN-NEXT: [[CMP2:%.*]] = fcmp fast olt half [[ELT2]], [[MIN1]] +; GCN-NEXT: [[MIN2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MIN1]] +; GCN-NEXT: [[CMP3:%.*]] = fcmp fast olt half [[ELT3]], [[MIN2]] +; GCN-NEXT: [[MIN3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MIN2]] +; GCN-NEXT: ret half [[MIN3]] ; entry: %elt0 = extractelement <4 x half> %vec4, i64 0 @@ -719,4 +701,4 @@ entry: %add3 = fadd fast float %elt3, %add2 ret float %add3 -} \ No newline at end of file +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll new file mode 100644 index 0000000000000..fb4ec00906adc --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll @@ -0,0 +1,411 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -mattr=-prefer-128-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -mattr=+prefer-128-bit -basic-aa -slp-vectorizer -S | 
FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=-prefer-128-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=+prefer-128-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 + +@a64 = common global [8 x i64] zeroinitializer, align 64 +@b64 = common global [8 x i64] zeroinitializer, align 64 +@c64 = common global [8 x i64] zeroinitializer, align 64 +@a32 = common global [16 x i32] zeroinitializer, align 64 +@b32 = common global [16 x i32] zeroinitializer, align 64 +@c32 = common global [16 x i32] zeroinitializer, align 64 +@a16 = common global [32 x i16] zeroinitializer, align 64 +@b16 = common global [32 x i16] zeroinitializer, align 64 +@c16 = common global [32 x i16] zeroinitializer, align 64 +@a8 = common global [64 x i8] zeroinitializer, align 64 +@b8 = common global [64 x i8] zeroinitializer, align 64 +@c8 = common global [64 x i8] zeroinitializer, align 64 + +define void @sdiv_v16i32_uniformconst() { +; SSE-LABEL: @sdiv_v16i32_uniformconst( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = sdiv <4 x i32> [[TMP1]], +; SSE-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[TMP2]], +; SSE-NEXT: [[TMP7:%.*]] = sdiv <4 x i32> [[TMP3]], +; SSE-NEXT: [[TMP8:%.*]] = sdiv <4 x i32> [[TMP4]], +; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: ret void +; +; SLM-LABEL: @sdiv_v16i32_uniformconst( +; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to 
<4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = sdiv <4 x i32> [[TMP1]], +; SLM-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[TMP2]], +; SLM-NEXT: [[TMP7:%.*]] = sdiv <4 x i32> [[TMP3]], +; SLM-NEXT: [[TMP8:%.*]] = sdiv <4 x i32> [[TMP4]], +; SLM-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: ret void +; +; AVX-LABEL: @sdiv_v16i32_uniformconst( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = sdiv <8 x i32> [[TMP1]], +; AVX-NEXT: [[TMP4:%.*]] = sdiv <8 x i32> [[TMP2]], +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @sdiv_v16i32_uniformconst( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4 +; AVX512-NEXT: [[TMP2:%.*]] = sdiv <16 x i32> [[TMP1]], +; AVX512-NEXT: store <16 x i32> [[TMP2]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4 +; AVX512-NEXT: ret void +; + %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4 + %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4 + %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4 + %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4 + %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4 + %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4 + %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4 + %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4 + %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4 + %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4 + %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 + %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 + %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 + %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 + %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 + 
%a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 + %r0 = sdiv i32 %a0 , 5 + %r1 = sdiv i32 %a1 , 5 + %r2 = sdiv i32 %a2 , 5 + %r3 = sdiv i32 %a3 , 5 + %r4 = sdiv i32 %a4 , 5 + %r5 = sdiv i32 %a5 , 5 + %r6 = sdiv i32 %a6 , 5 + %r7 = sdiv i32 %a7 , 5 + %r8 = sdiv i32 %a8 , 5 + %r9 = sdiv i32 %a9 , 5 + %r10 = sdiv i32 %a10, 5 + %r11 = sdiv i32 %a11, 5 + %r12 = sdiv i32 %a12, 5 + %r13 = sdiv i32 %a13, 5 + %r14 = sdiv i32 %a14, 5 + %r15 = sdiv i32 %a15, 5 + store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4 + store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4 + store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4 + store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4 + store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4 + store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4 + store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4 + store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4 + store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4 + store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4 + store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 + store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 + store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 + store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 + store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 + store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 + ret void +} + +define void @srem_v16i32_uniformconst() { +; SSE-LABEL: @srem_v16i32_uniformconst( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = srem <4 x i32> [[TMP1]], +; SSE-NEXT: [[TMP6:%.*]] = srem <4 x i32> [[TMP2]], +; SSE-NEXT: [[TMP7:%.*]] = srem <4 x i32> [[TMP3]], +; SSE-NEXT: [[TMP8:%.*]] = srem <4 x i32> [[TMP4]], +; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, 
i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: ret void +; +; SLM-LABEL: @srem_v16i32_uniformconst( +; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = srem <4 x i32> [[TMP1]], +; SLM-NEXT: [[TMP6:%.*]] = srem <4 x i32> [[TMP2]], +; SLM-NEXT: [[TMP7:%.*]] = srem <4 x i32> [[TMP3]], +; SLM-NEXT: [[TMP8:%.*]] = srem <4 x i32> [[TMP4]], +; SLM-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: ret void +; +; AVX-LABEL: @srem_v16i32_uniformconst( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = srem <8 x i32> [[TMP1]], +; AVX-NEXT: [[TMP4:%.*]] = srem <8 x i32> [[TMP2]], +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @srem_v16i32_uniformconst( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4 +; AVX512-NEXT: [[TMP2:%.*]] = srem <16 x i32> [[TMP1]], +; AVX512-NEXT: store <16 x i32> [[TMP2]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4 +; AVX512-NEXT: ret void +; + %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4 + %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4 + %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4 + %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4 + %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4 + %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4 + %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4 + %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4 + %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4 + %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4 + 
%a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 + %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 + %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 + %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 + %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 + %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 + %r0 = srem i32 %a0 , 5 + %r1 = srem i32 %a1 , 5 + %r2 = srem i32 %a2 , 5 + %r3 = srem i32 %a3 , 5 + %r4 = srem i32 %a4 , 5 + %r5 = srem i32 %a5 , 5 + %r6 = srem i32 %a6 , 5 + %r7 = srem i32 %a7 , 5 + %r8 = srem i32 %a8 , 5 + %r9 = srem i32 %a9 , 5 + %r10 = srem i32 %a10, 5 + %r11 = srem i32 %a11, 5 + %r12 = srem i32 %a12, 5 + %r13 = srem i32 %a13, 5 + %r14 = srem i32 %a14, 5 + %r15 = srem i32 %a15, 5 + store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4 + store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4 + store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4 + store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4 + store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4 + store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4 + store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4 + store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4 + store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4 + store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4 + store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 + store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 + store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 + store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 + store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 + store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 + ret void +} + +define void @udiv_v16i32_uniformconst() { +; SSE-LABEL: @udiv_v16i32_uniformconst( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = udiv <4 x i32> [[TMP1]], +; SSE-NEXT: [[TMP6:%.*]] = udiv <4 x i32> [[TMP2]], +; SSE-NEXT: [[TMP7:%.*]] = udiv <4 x i32> [[TMP3]], +; SSE-NEXT: [[TMP8:%.*]] = udiv <4 x i32> [[TMP4]], +; SSE-NEXT: store <4 
x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: ret void +; +; SLM-LABEL: @udiv_v16i32_uniformconst( +; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = udiv <4 x i32> [[TMP1]], +; SLM-NEXT: [[TMP6:%.*]] = udiv <4 x i32> [[TMP2]], +; SLM-NEXT: [[TMP7:%.*]] = udiv <4 x i32> [[TMP3]], +; SLM-NEXT: [[TMP8:%.*]] = udiv <4 x i32> [[TMP4]], +; SLM-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: ret void +; +; AVX-LABEL: @udiv_v16i32_uniformconst( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = udiv <8 x i32> [[TMP1]], +; AVX-NEXT: [[TMP4:%.*]] = udiv <8 x i32> [[TMP2]], +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @udiv_v16i32_uniformconst( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4 +; AVX512-NEXT: [[TMP2:%.*]] = udiv <16 x i32> [[TMP1]], +; AVX512-NEXT: store <16 x i32> [[TMP2]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4 +; AVX512-NEXT: ret void +; + %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4 + %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4 + %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4 + %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4 + %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4 + 
%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4 + %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4 + %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4 + %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4 + %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4 + %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 + %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 + %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 + %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 + %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 + %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 + %r0 = udiv i32 %a0 , 5 + %r1 = udiv i32 %a1 , 5 + %r2 = udiv i32 %a2 , 5 + %r3 = udiv i32 %a3 , 5 + %r4 = udiv i32 %a4 , 5 + %r5 = udiv i32 %a5 , 5 + %r6 = udiv i32 %a6 , 5 + %r7 = udiv i32 %a7 , 5 + %r8 = udiv i32 %a8 , 5 + %r9 = udiv i32 %a9 , 5 + %r10 = udiv i32 %a10, 5 + %r11 = udiv i32 %a11, 5 + %r12 = udiv i32 %a12, 5 + %r13 = udiv i32 %a13, 5 + %r14 = udiv i32 %a14, 5 + %r15 = udiv i32 %a15, 5 + store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4 + store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4 + store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4 + store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4 + store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4 + store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4 + store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4 + store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4 + store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4 + store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4 + store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 + store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 + store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 + store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 + store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 + store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 + ret void +} + +define void @urem_v16i32_uniformconst() { +; SSE-LABEL: @urem_v16i32_uniformconst( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x 
i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = urem <4 x i32> [[TMP1]], +; SSE-NEXT: [[TMP6:%.*]] = urem <4 x i32> [[TMP2]], +; SSE-NEXT: [[TMP7:%.*]] = urem <4 x i32> [[TMP3]], +; SSE-NEXT: [[TMP8:%.*]] = urem <4 x i32> [[TMP4]], +; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: ret void +; +; SLM-LABEL: @urem_v16i32_uniformconst( +; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = urem <4 x i32> [[TMP1]], +; SLM-NEXT: [[TMP6:%.*]] = urem <4 x i32> [[TMP2]], +; SLM-NEXT: [[TMP7:%.*]] = urem <4 x i32> [[TMP3]], +; SLM-NEXT: [[TMP8:%.*]] = urem <4 x i32> [[TMP4]], +; SLM-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: ret void +; +; AVX-LABEL: @urem_v16i32_uniformconst( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = urem <8 x i32> [[TMP1]], +; AVX-NEXT: [[TMP4:%.*]] = urem <8 x i32> [[TMP2]], +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @urem_v16i32_uniformconst( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4 +; AVX512-NEXT: [[TMP2:%.*]] = urem <16 x i32> [[TMP1]], +; AVX512-NEXT: store <16 x i32> [[TMP2]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4 +; AVX512-NEXT: ret void 
+; + %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4 + %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4 + %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4 + %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4 + %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4 + %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4 + %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4 + %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4 + %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4 + %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4 + %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 + %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 + %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 + %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 + %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 + %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 + %r0 = urem i32 %a0 , 5 + %r1 = urem i32 %a1 , 5 + %r2 = urem i32 %a2 , 5 + %r3 = urem i32 %a3 , 5 + %r4 = urem i32 %a4 , 5 + %r5 = urem i32 %a5 , 5 + %r6 = urem i32 %a6 , 5 + %r7 = urem i32 %a7 , 5 + %r8 = urem i32 %a8 , 5 + %r9 = urem i32 %a9 , 5 + %r10 = urem i32 %a10, 5 + %r11 = urem i32 %a11, 5 + %r12 = urem i32 %a12, 5 + %r13 = urem i32 %a13, 5 + %r14 = urem i32 %a14, 5 + %r15 = urem i32 %a15, 5 + store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4 + store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4 + store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4 + store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4 + store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4 + store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4 + store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4 + store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4 + store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4 + store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4 + store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 + store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 + store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 + store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 + store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 + store 
i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll index 3094f9bc2549a..e1028cf552762 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll @@ -15,14 +15,14 @@ define i64 @load_bswap(%v8i8* %p) { ; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5 ; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6 ; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7 -; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[G0]] -; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[G1]] -; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[G2]] -; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[G3]] -; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[G4]] -; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[G5]] -; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[G6]] -; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[G7]] +; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[G0]], align 1 +; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[G1]], align 1 +; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[G2]], align 1 +; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[G3]], align 1 +; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[G4]], align 1 +; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[G5]], align 1 +; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[G6]], align 1 +; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[G7]], align 1 ; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[T0]] to i64 ; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[T1]] to i64 ; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[T2]] to i64 @@ -103,14 +103,14 @@ define i64 @load_bswap_nop_shift(%v8i8* %p) { ; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5 ; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6 ; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7 -; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[G0]] -; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[G1]] -; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[G2]] -; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[G3]] -; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[G4]] -; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[G5]] -; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[G6]] -; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[G7]] +; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[G0]], align 1 +; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[G1]], align 1 +; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[G2]], align 1 +; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[G3]], align 1 +; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[G4]], align 1 +; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[G5]], align 1 +; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[G6]], align 1 +; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[G7]], align 1 ; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[T0]] to i64 ; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[T1]] to i64 ; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[T2]] to i64 @@ -537,3 +537,27 @@ define void @load_combine_constant_expression(i64* %t1) { store i64 or (i64 shl (i64 zext (i32 ptrtoint ([8 x i8]* @g1 to i32) to i64), i64 32), i64 zext (i32 ptrtoint ([5 x i8]* @g2 to i32) to i64)), i64* %t3, align 4 ret void } + +@output = dso_local local_unnamed_addr global [8 x i32] zeroinitializer, align 16 + +define void @PR47450(i16* nocapture readonly %p) { +; CHECK-LABEL: @PR47450( +; CHECK-NEXT: [[X:%.*]] = load i16, i16* [[P:%.*]], align 2 +; CHECK-NEXT: [[Z:%.*]] = zext i16 [[X]] to i32 +; 
CHECK-NEXT: [[S:%.*]] = shl nuw nsw i32 [[Z]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[S]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[S]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[S]], i32 3 +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @output to <4 x i32>*), align 16 +; CHECK-NEXT: ret void +; + %x = load i16, i16* %p, align 2 + %z = zext i16 %x to i32 + %s = shl nuw nsw i32 %z, 1 + store i32 %s, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @output, i64 0, i64 0), align 16 + store i32 %s, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @output, i64 0, i64 1), align 4 + store i32 %s, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @output, i64 0, i64 2), align 8 + store i32 %s, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @output, i64 0, i64 3), align 4 + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll index 3ac8c04774a4c..b0971dd804501 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll @@ -74,3 +74,123 @@ for.end: ; preds = %for.inc declare i32 @printf(i8* nocapture, ...) +; PR41312 - the order of the reduction ops should not prevent forming a reduction. +; The 'wrong' member of the reduction requires a greater cost if grouped with the +; other candidates in the reduction because it does not have matching predicate +; and/or constant operand. + +define float @merge_anyof_v4f32_wrong_first(<4 x float> %x) { +; CHECK-LABEL: @merge_anyof_v4f32_wrong_first( +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 +; CHECK-NEXT: [[CMP3WRONG:%.*]] = fcmp olt float [[TMP1]], 4.200000e+01 +; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <4 x float> [[X]], +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP4]], float -1.000000e+00, float 1.000000e+00 +; CHECK-NEXT: ret float [[R]] +; + %x0 = extractelement <4 x float> %x, i32 0 + %x1 = extractelement <4 x float> %x, i32 1 + %x2 = extractelement <4 x float> %x, i32 2 + %x3 = extractelement <4 x float> %x, i32 3 + %cmp3wrong = fcmp olt float %x3, 42.0 + %cmp0 = fcmp ogt float %x0, 1.0 + %cmp1 = fcmp ogt float %x1, 1.0 + %cmp2 = fcmp ogt float %x2, 1.0 + %cmp3 = fcmp ogt float %x3, 1.0 + %or03 = or i1 %cmp0, %cmp3wrong + %or031 = or i1 %or03, %cmp1 + %or0312 = or i1 %or031, %cmp2 + %or03123 = or i1 %or0312, %cmp3 + %r = select i1 %or03123, float -1.0, float 1.0 + ret float %r +} + +define float @merge_anyof_v4f32_wrong_last(<4 x float> %x) { +; CHECK-LABEL: @merge_anyof_v4f32_wrong_last( +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 +; CHECK-NEXT: [[CMP3WRONG:%.*]] = fcmp olt float [[TMP1]], 4.200000e+01 +; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <4 x float> [[X]], +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP4]], float -1.000000e+00, float 1.000000e+00 +; CHECK-NEXT: ret float [[R]] +; + %x0 = extractelement <4 x float> %x, i32 0 + %x1 = extractelement <4 x float> %x, i32 1 + %x2 = extractelement <4 x float> %x, i32 2 + %x3 = 
extractelement <4 x float> %x, i32 3 + %cmp3wrong = fcmp olt float %x3, 42.0 + %cmp0 = fcmp ogt float %x0, 1.0 + %cmp1 = fcmp ogt float %x1, 1.0 + %cmp2 = fcmp ogt float %x2, 1.0 + %cmp3 = fcmp ogt float %x3, 1.0 + %or03 = or i1 %cmp0, %cmp3 + %or031 = or i1 %or03, %cmp1 + %or0312 = or i1 %or031, %cmp2 + %or03123 = or i1 %or0312, %cmp3wrong + %r = select i1 %or03123, float -1.0, float 1.0 + ret float %r +} + +define i32 @merge_anyof_v4i32_wrong_middle(<4 x i32> %x) { +; CHECK-LABEL: @merge_anyof_v4i32_wrong_middle( +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3 +; CHECK-NEXT: [[CMP3WRONG:%.*]] = icmp slt i32 [[TMP1]], 42 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP4]], i32 -1, i32 1 +; CHECK-NEXT: ret i32 [[R]] +; + %x0 = extractelement <4 x i32> %x, i32 0 + %x1 = extractelement <4 x i32> %x, i32 1 + %x2 = extractelement <4 x i32> %x, i32 2 + %x3 = extractelement <4 x i32> %x, i32 3 + %cmp3wrong = icmp slt i32 %x3, 42 + %cmp0 = icmp sgt i32 %x0, 1 + %cmp1 = icmp sgt i32 %x1, 1 + %cmp2 = icmp sgt i32 %x2, 1 + %cmp3 = icmp sgt i32 %x3, 1 + %or03 = or i1 %cmp0, %cmp3 + %or033 = or i1 %or03, %cmp3wrong + %or0332 = or i1 %or033, %cmp2 + %or03321 = or i1 %or0332, %cmp1 + %r = select i1 %or03321, i32 -1, i32 1 + ret i32 %r +} + +; Operand/predicate swapping allows forming a reduction, but the +; ideal reduction groups all of the original 'sgt' ops together. + +define i32 @merge_anyof_v4i32_wrong_middle_better_rdx(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @merge_anyof_v4i32_wrong_middle_better_rdx( +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3 +; CHECK-NEXT: [[CMP3WRONG:%.*]] = icmp slt i32 [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]] +; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[CMP3WRONG]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP5]], i32 -1, i32 1 +; CHECK-NEXT: ret i32 [[R]] +; + %x0 = extractelement <4 x i32> %x, i32 0 + %x1 = extractelement <4 x i32> %x, i32 1 + %x2 = extractelement <4 x i32> %x, i32 2 + %x3 = extractelement <4 x i32> %x, i32 3 + %y0 = extractelement <4 x i32> %y, i32 0 + %y1 = extractelement <4 x i32> %y, i32 1 + %y2 = extractelement <4 x i32> %y, i32 2 + %y3 = extractelement <4 x i32> %y, i32 3 + %cmp3wrong = icmp slt i32 %x3, %y3 + %cmp0 = icmp sgt i32 %x0, %y0 + %cmp1 = icmp sgt i32 %x1, %y1 + %cmp2 = icmp sgt i32 %x2, %y2 + %cmp3 = icmp sgt i32 %x3, %y3 + %or03 = or i1 %cmp0, %cmp3 + %or033 = or i1 %or03, %cmp3wrong + %or0332 = or i1 %or033, %cmp2 + %or03321 = or i1 %or0332, %cmp1 + %r = select i1 %or03321, i32 -1, i32 1 + ret i32 %r +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll index 7b3acfb6c0c01..dd5d649c41bb4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -266,24 +266,52 @@ entry: ret i32 %conv4 } +; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select +; with fastmath on the select. 
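+;
+; Illustrative sketch only (not part of this patch, value names invented):
+; with fastmath on the select, InstCombine turns the compare+select max idiom
+;   %cmp = fcmp fast ogt float %a, %b
+;   %max = select fast i1 %cmp, float %a, float %b
+; into the intrinsic form
+;   %max = call fast float @llvm.maxnum.f32(float %a, float %b)
+; which is the form the FIXME above wants these reductions to be matched against.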
define float @bar() { ; CHECK-LABEL: @bar( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> [[TMP2]]) -; CHECK-NEXT: store float [[TMP3]], float* @res, align 4 -; CHECK-NEXT: ret float [[TMP3]] +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float [[TMP3]], float [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8 +; CHECK-NEXT: [[MUL3_1:%.*]] = fmul fast float [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[MUL3_1]] +; CHECK-NEXT: [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float [[MUL3_1]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4 +; CHECK-NEXT: [[MUL3_2:%.*]] = fmul fast float [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[MUL3_2]] +; CHECK-NEXT: [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float [[MUL3_2]] +; CHECK-NEXT: store float [[MAX_0_MUL3_2]], float* @res, align 4 +; CHECK-NEXT: ret float [[MAX_0_MUL3_2]] ; ; THRESHOLD-LABEL: @bar( ; THRESHOLD-NEXT: entry: -; THRESHOLD-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]] -; THRESHOLD-NEXT: [[TMP3:%.*]] = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> [[TMP2]]) -; THRESHOLD-NEXT: store float [[TMP3]], float* @res, align 4 -; THRESHOLD-NEXT: ret float [[TMP3]] +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; THRESHOLD-NEXT: [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]] +; THRESHOLD-NEXT: [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float [[TMP3]], float [[TMP4]] +; THRESHOLD-NEXT: [[TMP5:%.*]] = 
load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[MUL3_1:%.*]] = fmul fast float [[TMP6]], [[TMP5]] +; THRESHOLD-NEXT: [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[MUL3_1]] +; THRESHOLD-NEXT: [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float [[MUL3_1]] +; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[MUL3_2:%.*]] = fmul fast float [[TMP8]], [[TMP7]] +; THRESHOLD-NEXT: [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[MUL3_2]] +; THRESHOLD-NEXT: [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float [[MUL3_2]] +; THRESHOLD-NEXT: store float [[MAX_0_MUL3_2]], float* @res, align 4 +; THRESHOLD-NEXT: ret float [[MAX_0_MUL3_2]] ; entry: %0 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll index f06802eff9c7d..9663ede723cc6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -198,11 +198,59 @@ define i32 @maxi32(i32) { ret i32 %95 } +; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select +; with fastmath on the select. define float @maxf8(float) { -; CHECK-LABEL: @maxf8( -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16 -; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> [[TMP2]]) -; CHECK-NEXT: ret float [[TMP3]] +; DEFAULT-LABEL: @maxf8( +; DEFAULT-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 +; DEFAULT-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 +; DEFAULT-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] +; DEFAULT-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] +; DEFAULT-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 +; DEFAULT-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] +; DEFAULT-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] +; DEFAULT-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 +; DEFAULT-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] +; DEFAULT-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] +; DEFAULT-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 +; DEFAULT-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] +; DEFAULT-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] +; DEFAULT-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 +; DEFAULT-NEXT: [[TMP16:%.*]] = fcmp fast 
ogt float [[TMP14]], [[TMP15]] +; DEFAULT-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] +; DEFAULT-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 +; DEFAULT-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] +; DEFAULT-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] +; DEFAULT-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 +; DEFAULT-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] +; DEFAULT-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] +; DEFAULT-NEXT: ret float [[TMP23]] +; +; THRESH-LABEL: @maxf8( +; THRESH-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @arr1 to <2 x float>*), align 16 +; THRESH-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; THRESH-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; THRESH-NEXT: [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]] +; THRESH-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP4]] +; THRESH-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 +; THRESH-NEXT: [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]] +; THRESH-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float [[TMP7]] +; THRESH-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 +; THRESH-NEXT: [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]] +; THRESH-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float [[TMP10]] +; THRESH-NEXT: [[TMP13:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 +; THRESH-NEXT: [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]] +; THRESH-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float [[TMP13]] +; THRESH-NEXT: [[TMP16:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 +; THRESH-NEXT: [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]] +; THRESH-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float [[TMP16]] +; THRESH-NEXT: [[TMP19:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 +; THRESH-NEXT: [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]] +; THRESH-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float [[TMP19]] +; THRESH-NEXT: [[TMP22:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 +; THRESH-NEXT: [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]] +; THRESH-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float [[TMP22]] +; THRESH-NEXT: ret float [[TMP24]] ; %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 @@ -229,11 +277,107 @@ define float @maxf8(float) { ret float %23 } +; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select +; with fastmath on the select. 
define float @maxf16(float) { -; CHECK-LABEL: @maxf16( -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16 -; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> [[TMP2]]) -; CHECK-NEXT: ret float [[TMP3]] +; DEFAULT-LABEL: @maxf16( +; DEFAULT-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 +; DEFAULT-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 +; DEFAULT-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] +; DEFAULT-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] +; DEFAULT-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 +; DEFAULT-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] +; DEFAULT-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] +; DEFAULT-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 +; DEFAULT-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] +; DEFAULT-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] +; DEFAULT-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 +; DEFAULT-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] +; DEFAULT-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] +; DEFAULT-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 +; DEFAULT-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] +; DEFAULT-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] +; DEFAULT-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 +; DEFAULT-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] +; DEFAULT-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] +; DEFAULT-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 +; DEFAULT-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] +; DEFAULT-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] +; DEFAULT-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 +; DEFAULT-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] +; DEFAULT-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] +; DEFAULT-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 +; DEFAULT-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] +; DEFAULT-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] +; DEFAULT-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 +; DEFAULT-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] +; DEFAULT-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]] +; DEFAULT-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 +; 
DEFAULT-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] +; DEFAULT-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] +; DEFAULT-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 +; DEFAULT-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] +; DEFAULT-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] +; DEFAULT-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 +; DEFAULT-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] +; DEFAULT-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] +; DEFAULT-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 +; DEFAULT-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] +; DEFAULT-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] +; DEFAULT-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 +; DEFAULT-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] +; DEFAULT-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] +; DEFAULT-NEXT: ret float [[TMP47]] +; +; THRESH-LABEL: @maxf16( +; THRESH-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @arr1 to <2 x float>*), align 16 +; THRESH-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; THRESH-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; THRESH-NEXT: [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]] +; THRESH-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP4]] +; THRESH-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 +; THRESH-NEXT: [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]] +; THRESH-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float [[TMP7]] +; THRESH-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 +; THRESH-NEXT: [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]] +; THRESH-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float [[TMP10]] +; THRESH-NEXT: [[TMP13:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 +; THRESH-NEXT: [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]] +; THRESH-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float [[TMP13]] +; THRESH-NEXT: [[TMP16:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 +; THRESH-NEXT: [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]] +; THRESH-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float [[TMP16]] +; THRESH-NEXT: [[TMP19:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 +; THRESH-NEXT: [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]] +; THRESH-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float [[TMP19]] +; THRESH-NEXT: [[TMP22:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 +; THRESH-NEXT: [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]] +; THRESH-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], 
float [[TMP22]] +; THRESH-NEXT: [[TMP25:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 +; THRESH-NEXT: [[TMP26:%.*]] = fcmp fast ogt float [[TMP24]], [[TMP25]] +; THRESH-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], float [[TMP24]], float [[TMP25]] +; THRESH-NEXT: [[TMP28:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 +; THRESH-NEXT: [[TMP29:%.*]] = fcmp fast ogt float [[TMP27]], [[TMP28]] +; THRESH-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP27]], float [[TMP28]] +; THRESH-NEXT: [[TMP31:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 +; THRESH-NEXT: [[TMP32:%.*]] = fcmp fast ogt float [[TMP30]], [[TMP31]] +; THRESH-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], float [[TMP30]], float [[TMP31]] +; THRESH-NEXT: [[TMP34:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 +; THRESH-NEXT: [[TMP35:%.*]] = fcmp fast ogt float [[TMP33]], [[TMP34]] +; THRESH-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP33]], float [[TMP34]] +; THRESH-NEXT: [[TMP37:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 +; THRESH-NEXT: [[TMP38:%.*]] = fcmp fast ogt float [[TMP36]], [[TMP37]] +; THRESH-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], float [[TMP36]], float [[TMP37]] +; THRESH-NEXT: [[TMP40:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 +; THRESH-NEXT: [[TMP41:%.*]] = fcmp fast ogt float [[TMP39]], [[TMP40]] +; THRESH-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP39]], float [[TMP40]] +; THRESH-NEXT: [[TMP43:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 +; THRESH-NEXT: [[TMP44:%.*]] = fcmp fast ogt float [[TMP42]], [[TMP43]] +; THRESH-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], float [[TMP42]], float [[TMP43]] +; THRESH-NEXT: [[TMP46:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 +; THRESH-NEXT: [[TMP47:%.*]] = fcmp fast ogt float [[TMP45]], [[TMP46]] +; THRESH-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], float [[TMP45]], float [[TMP46]] +; THRESH-NEXT: ret float [[TMP48]] ; %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 @@ -284,11 +428,203 @@ define float @maxf16(float) { ret float %47 } +; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select +; with fastmath on the select. 
define float @maxf32(float) { -; CHECK-LABEL: @maxf32( -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16 -; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> [[TMP2]]) -; CHECK-NEXT: ret float [[TMP3]] +; DEFAULT-LABEL: @maxf32( +; DEFAULT-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 +; DEFAULT-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4 +; DEFAULT-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] +; DEFAULT-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]] +; DEFAULT-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 +; DEFAULT-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]] +; DEFAULT-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] +; DEFAULT-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 +; DEFAULT-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]] +; DEFAULT-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]] +; DEFAULT-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 +; DEFAULT-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] +; DEFAULT-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]] +; DEFAULT-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 +; DEFAULT-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]] +; DEFAULT-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]] +; DEFAULT-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 +; DEFAULT-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]] +; DEFAULT-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]] +; DEFAULT-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 +; DEFAULT-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]] +; DEFAULT-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]] +; DEFAULT-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 +; DEFAULT-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]] +; DEFAULT-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]] +; DEFAULT-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 +; DEFAULT-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]] +; DEFAULT-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]] +; DEFAULT-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 +; DEFAULT-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]] +; DEFAULT-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]] +; DEFAULT-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 +; 
DEFAULT-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]] +; DEFAULT-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]] +; DEFAULT-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 +; DEFAULT-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]] +; DEFAULT-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]] +; DEFAULT-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 +; DEFAULT-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]] +; DEFAULT-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]] +; DEFAULT-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 +; DEFAULT-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]] +; DEFAULT-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]] +; DEFAULT-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 +; DEFAULT-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]] +; DEFAULT-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]] +; DEFAULT-NEXT: [[TMP48:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16 +; DEFAULT-NEXT: [[TMP49:%.*]] = fcmp fast ogt float [[TMP47]], [[TMP48]] +; DEFAULT-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP47]], float [[TMP48]] +; DEFAULT-NEXT: [[TMP51:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4 +; DEFAULT-NEXT: [[TMP52:%.*]] = fcmp fast ogt float [[TMP50]], [[TMP51]] +; DEFAULT-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], float [[TMP50]], float [[TMP51]] +; DEFAULT-NEXT: [[TMP54:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8 +; DEFAULT-NEXT: [[TMP55:%.*]] = fcmp fast ogt float [[TMP53]], [[TMP54]] +; DEFAULT-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP53]], float [[TMP54]] +; DEFAULT-NEXT: [[TMP57:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4 +; DEFAULT-NEXT: [[TMP58:%.*]] = fcmp fast ogt float [[TMP56]], [[TMP57]] +; DEFAULT-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], float [[TMP56]], float [[TMP57]] +; DEFAULT-NEXT: [[TMP60:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16 +; DEFAULT-NEXT: [[TMP61:%.*]] = fcmp fast ogt float [[TMP59]], [[TMP60]] +; DEFAULT-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP59]], float [[TMP60]] +; DEFAULT-NEXT: [[TMP63:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4 +; DEFAULT-NEXT: [[TMP64:%.*]] = fcmp fast ogt float [[TMP62]], [[TMP63]] +; DEFAULT-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], float [[TMP62]], float [[TMP63]] +; DEFAULT-NEXT: [[TMP66:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8 +; DEFAULT-NEXT: [[TMP67:%.*]] = fcmp fast ogt float [[TMP65]], [[TMP66]] +; DEFAULT-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], float [[TMP65]], float [[TMP66]] +; DEFAULT-NEXT: [[TMP69:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4 +; 
DEFAULT-NEXT: [[TMP70:%.*]] = fcmp fast ogt float [[TMP68]], [[TMP69]] +; DEFAULT-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], float [[TMP68]], float [[TMP69]] +; DEFAULT-NEXT: [[TMP72:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16 +; DEFAULT-NEXT: [[TMP73:%.*]] = fcmp fast ogt float [[TMP71]], [[TMP72]] +; DEFAULT-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], float [[TMP71]], float [[TMP72]] +; DEFAULT-NEXT: [[TMP75:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4 +; DEFAULT-NEXT: [[TMP76:%.*]] = fcmp fast ogt float [[TMP74]], [[TMP75]] +; DEFAULT-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], float [[TMP74]], float [[TMP75]] +; DEFAULT-NEXT: [[TMP78:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8 +; DEFAULT-NEXT: [[TMP79:%.*]] = fcmp fast ogt float [[TMP77]], [[TMP78]] +; DEFAULT-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], float [[TMP77]], float [[TMP78]] +; DEFAULT-NEXT: [[TMP81:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4 +; DEFAULT-NEXT: [[TMP82:%.*]] = fcmp fast ogt float [[TMP80]], [[TMP81]] +; DEFAULT-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], float [[TMP80]], float [[TMP81]] +; DEFAULT-NEXT: [[TMP84:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16 +; DEFAULT-NEXT: [[TMP85:%.*]] = fcmp fast ogt float [[TMP83]], [[TMP84]] +; DEFAULT-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], float [[TMP83]], float [[TMP84]] +; DEFAULT-NEXT: [[TMP87:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4 +; DEFAULT-NEXT: [[TMP88:%.*]] = fcmp fast ogt float [[TMP86]], [[TMP87]] +; DEFAULT-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], float [[TMP86]], float [[TMP87]] +; DEFAULT-NEXT: [[TMP90:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8 +; DEFAULT-NEXT: [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]] +; DEFAULT-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]] +; DEFAULT-NEXT: [[TMP93:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4 +; DEFAULT-NEXT: [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]] +; DEFAULT-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]] +; DEFAULT-NEXT: ret float [[TMP95]] +; +; THRESH-LABEL: @maxf32( +; THRESH-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @arr1 to <2 x float>*), align 16 +; THRESH-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; THRESH-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; THRESH-NEXT: [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]] +; THRESH-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP4]] +; THRESH-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8 +; THRESH-NEXT: [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]] +; THRESH-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float [[TMP7]] +; THRESH-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4 +; THRESH-NEXT: [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]] +; THRESH-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], 
float [[TMP9]], float [[TMP10]] +; THRESH-NEXT: [[TMP13:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16 +; THRESH-NEXT: [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]] +; THRESH-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float [[TMP13]] +; THRESH-NEXT: [[TMP16:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4 +; THRESH-NEXT: [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]] +; THRESH-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float [[TMP16]] +; THRESH-NEXT: [[TMP19:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8 +; THRESH-NEXT: [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]] +; THRESH-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float [[TMP19]] +; THRESH-NEXT: [[TMP22:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4 +; THRESH-NEXT: [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]] +; THRESH-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float [[TMP22]] +; THRESH-NEXT: [[TMP25:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16 +; THRESH-NEXT: [[TMP26:%.*]] = fcmp fast ogt float [[TMP24]], [[TMP25]] +; THRESH-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], float [[TMP24]], float [[TMP25]] +; THRESH-NEXT: [[TMP28:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4 +; THRESH-NEXT: [[TMP29:%.*]] = fcmp fast ogt float [[TMP27]], [[TMP28]] +; THRESH-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP27]], float [[TMP28]] +; THRESH-NEXT: [[TMP31:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8 +; THRESH-NEXT: [[TMP32:%.*]] = fcmp fast ogt float [[TMP30]], [[TMP31]] +; THRESH-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], float [[TMP30]], float [[TMP31]] +; THRESH-NEXT: [[TMP34:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4 +; THRESH-NEXT: [[TMP35:%.*]] = fcmp fast ogt float [[TMP33]], [[TMP34]] +; THRESH-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP33]], float [[TMP34]] +; THRESH-NEXT: [[TMP37:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16 +; THRESH-NEXT: [[TMP38:%.*]] = fcmp fast ogt float [[TMP36]], [[TMP37]] +; THRESH-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], float [[TMP36]], float [[TMP37]] +; THRESH-NEXT: [[TMP40:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4 +; THRESH-NEXT: [[TMP41:%.*]] = fcmp fast ogt float [[TMP39]], [[TMP40]] +; THRESH-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP39]], float [[TMP40]] +; THRESH-NEXT: [[TMP43:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8 +; THRESH-NEXT: [[TMP44:%.*]] = fcmp fast ogt float [[TMP42]], [[TMP43]] +; THRESH-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], float [[TMP42]], float [[TMP43]] +; THRESH-NEXT: [[TMP46:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4 +; THRESH-NEXT: [[TMP47:%.*]] = fcmp fast ogt float [[TMP45]], [[TMP46]] +; THRESH-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], float [[TMP45]], float [[TMP46]] +; 
THRESH-NEXT: [[TMP49:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16 +; THRESH-NEXT: [[TMP50:%.*]] = fcmp fast ogt float [[TMP48]], [[TMP49]] +; THRESH-NEXT: [[TMP51:%.*]] = select i1 [[TMP50]], float [[TMP48]], float [[TMP49]] +; THRESH-NEXT: [[TMP52:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4 +; THRESH-NEXT: [[TMP53:%.*]] = fcmp fast ogt float [[TMP51]], [[TMP52]] +; THRESH-NEXT: [[TMP54:%.*]] = select i1 [[TMP53]], float [[TMP51]], float [[TMP52]] +; THRESH-NEXT: [[TMP55:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8 +; THRESH-NEXT: [[TMP56:%.*]] = fcmp fast ogt float [[TMP54]], [[TMP55]] +; THRESH-NEXT: [[TMP57:%.*]] = select i1 [[TMP56]], float [[TMP54]], float [[TMP55]] +; THRESH-NEXT: [[TMP58:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4 +; THRESH-NEXT: [[TMP59:%.*]] = fcmp fast ogt float [[TMP57]], [[TMP58]] +; THRESH-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], float [[TMP57]], float [[TMP58]] +; THRESH-NEXT: [[TMP61:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16 +; THRESH-NEXT: [[TMP62:%.*]] = fcmp fast ogt float [[TMP60]], [[TMP61]] +; THRESH-NEXT: [[TMP63:%.*]] = select i1 [[TMP62]], float [[TMP60]], float [[TMP61]] +; THRESH-NEXT: [[TMP64:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4 +; THRESH-NEXT: [[TMP65:%.*]] = fcmp fast ogt float [[TMP63]], [[TMP64]] +; THRESH-NEXT: [[TMP66:%.*]] = select i1 [[TMP65]], float [[TMP63]], float [[TMP64]] +; THRESH-NEXT: [[TMP67:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8 +; THRESH-NEXT: [[TMP68:%.*]] = fcmp fast ogt float [[TMP66]], [[TMP67]] +; THRESH-NEXT: [[TMP69:%.*]] = select i1 [[TMP68]], float [[TMP66]], float [[TMP67]] +; THRESH-NEXT: [[TMP70:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4 +; THRESH-NEXT: [[TMP71:%.*]] = fcmp fast ogt float [[TMP69]], [[TMP70]] +; THRESH-NEXT: [[TMP72:%.*]] = select i1 [[TMP71]], float [[TMP69]], float [[TMP70]] +; THRESH-NEXT: [[TMP73:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16 +; THRESH-NEXT: [[TMP74:%.*]] = fcmp fast ogt float [[TMP72]], [[TMP73]] +; THRESH-NEXT: [[TMP75:%.*]] = select i1 [[TMP74]], float [[TMP72]], float [[TMP73]] +; THRESH-NEXT: [[TMP76:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4 +; THRESH-NEXT: [[TMP77:%.*]] = fcmp fast ogt float [[TMP75]], [[TMP76]] +; THRESH-NEXT: [[TMP78:%.*]] = select i1 [[TMP77]], float [[TMP75]], float [[TMP76]] +; THRESH-NEXT: [[TMP79:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8 +; THRESH-NEXT: [[TMP80:%.*]] = fcmp fast ogt float [[TMP78]], [[TMP79]] +; THRESH-NEXT: [[TMP81:%.*]] = select i1 [[TMP80]], float [[TMP78]], float [[TMP79]] +; THRESH-NEXT: [[TMP82:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4 +; THRESH-NEXT: [[TMP83:%.*]] = fcmp fast ogt float [[TMP81]], [[TMP82]] +; THRESH-NEXT: [[TMP84:%.*]] = select i1 [[TMP83]], float [[TMP81]], float [[TMP82]] +; THRESH-NEXT: [[TMP85:%.*]] = load 
float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16
+; THRESH-NEXT: [[TMP86:%.*]] = fcmp fast ogt float [[TMP84]], [[TMP85]]
+; THRESH-NEXT: [[TMP87:%.*]] = select i1 [[TMP86]], float [[TMP84]], float [[TMP85]]
+; THRESH-NEXT: [[TMP88:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4
+; THRESH-NEXT: [[TMP89:%.*]] = fcmp fast ogt float [[TMP87]], [[TMP88]]
+; THRESH-NEXT: [[TMP90:%.*]] = select i1 [[TMP89]], float [[TMP87]], float [[TMP88]]
+; THRESH-NEXT: [[TMP91:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8
+; THRESH-NEXT: [[TMP92:%.*]] = fcmp fast ogt float [[TMP90]], [[TMP91]]
+; THRESH-NEXT: [[TMP93:%.*]] = select i1 [[TMP92]], float [[TMP90]], float [[TMP91]]
+; THRESH-NEXT: [[TMP94:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4
+; THRESH-NEXT: [[TMP95:%.*]] = fcmp fast ogt float [[TMP93]], [[TMP94]]
+; THRESH-NEXT: [[TMP96:%.*]] = select i1 [[TMP95]], float [[TMP93]], float [[TMP94]]
+; THRESH-NEXT: ret float [[TMP96]]
;
 %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
 %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
diff --git a/llvm/test/Transforms/Scalarizer/global-bug-2.ll b/llvm/test/Transforms/Scalarizer/global-bug-2.ll
new file mode 100644
index 0000000000000..60f61ab08184b
--- /dev/null
+++ b/llvm/test/Transforms/Scalarizer/global-bug-2.ll
@@ -0,0 +1,20 @@
+; RUN: opt < %s -scalarizer -S -o - | FileCheck %s
+; RUN: opt < %s -passes='function(scalarizer)' -S | FileCheck %s
+
+; The scalarizer used to change the name of the global variable.
+; Check that we don't do that any longer.
+;
+; CHECK: @c.a = global i16 0, align 1
+
+@c.a = global i16 0, align 1
+
+define void @c() {
+entry:
+ br label %for.cond1
+
+for.cond1: ; preds = %for.cond1, %entry
+ %d.sroa.0.0 = phi <4 x i16*> [ , %entry ], [ %d.sroa.0.1.vec.insert, %for.cond1 ]
+ %d.sroa.0.0.vec.extract = extractelement <4 x i16*> %d.sroa.0.0, i32 0
+ %d.sroa.0.1.vec.insert = shufflevector <4 x i16*> , <4 x i16*> %d.sroa.0.0, <4 x i32>
+ br label %for.cond1
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/common-code-hoisting.ll b/llvm/test/Transforms/SimplifyCFG/common-code-hoisting.ll
index b58017ba7ef0b..37cbc4640e415 100644
--- a/llvm/test/Transforms/SimplifyCFG/common-code-hoisting.ll
+++ b/llvm/test/Transforms/SimplifyCFG/common-code-hoisting.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -simplifycfg -hoist-common-insts=1 -S < %s | FileCheck %s --check-prefixes=HOIST
 ; RUN: opt -simplifycfg -hoist-common-insts=0 -S < %s | FileCheck %s --check-prefixes=NOHOIST
-; RUN: opt -simplifycfg -S < %s | FileCheck %s --check-prefixes=HOIST,DEFAULT
+; RUN: opt -simplifycfg -S < %s | FileCheck %s --check-prefixes=NOHOIST,DEFAULT

 ; This example is produced from a very basic C code:
 ;
diff --git a/llvm/test/Transforms/StripSymbols/2007-01-15-llvm.used.ll b/llvm/test/Transforms/StripSymbols/2007-01-15-llvm.used.ll
index 438fa96b41ef3..81ccc422c2bd0 100644
--- a/llvm/test/Transforms/StripSymbols/2007-01-15-llvm.used.ll
+++ b/llvm/test/Transforms/StripSymbols/2007-01-15-llvm.used.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -strip -S | FileCheck %s
+; RUN: opt < %s -passes=strip -S | FileCheck %s

 ; CHECK: foo
 ; CHECK: bar
diff --git a/llvm/test/Transforms/StripSymbols/strip-dead-debug-info.ll b/llvm/test/Transforms/StripSymbols/strip-dead-debug-info.ll
index e13e02cb4b558..d9b21d4a60fd5 100644
--- a/llvm/test/Transforms/StripSymbols/strip-dead-debug-info.ll
+++ b/llvm/test/Transforms/StripSymbols/strip-dead-debug-info.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -strip-dead-debug-info -verify %s -S | FileCheck %s
+; RUN: opt -passes='strip-dead-debug-info,verify' %s -S | FileCheck %s

 ; CHECK: ModuleID = '{{.*}}'
 ; CHECK-NOT: "bar"
diff --git a/llvm/test/Transforms/UnifyFunctionExitNodes/unreachable-blocks-status.ll b/llvm/test/Transforms/UnifyFunctionExitNodes/unreachable-blocks-status.ll
new file mode 100644
index 0000000000000..a9169e9ff15e9
--- /dev/null
+++ b/llvm/test/Transforms/UnifyFunctionExitNodes/unreachable-blocks-status.ll
@@ -0,0 +1,67 @@
+; RUN: opt -mergereturn -S < %s | FileCheck %s
+
+; The pass previously did not report the correct Modified status in the case
+; where a function had at most one return block and a unified unreachable
+; block was created. This was caught by the pass return status check that is
+; hidden under EXPENSIVE_CHECKS.
+ +; CHECK: for.foo.body2: +; CHECK-NEXT: br label %UnifiedUnreachableBlock + +; CHECK: for.foo.end: +; CHECK-NEXT: br label %UnifiedUnreachableBlock + +; CHECK: UnifiedUnreachableBlock: +; CHECK-NEXT: unreachable + +define i32 @foo() { +entry: + br label %for.foo.cond + +for.foo.cond: ; preds = %entry + br i1 false, label %for.foo.body, label %for.foo.end3 + +for.foo.body: ; preds = %for.foo.cond + br label %for.foo.cond1 + +for.foo.cond1: ; preds = %for.foo.body + br i1 false, label %for.foo.body2, label %for.foo.end + +for.foo.body2: ; preds = %for.foo.cond1 + unreachable + +for.foo.end: ; preds = %for.foo.cond1 + unreachable + +for.foo.end3: ; preds = %for.foo.cond + ret i32 undef +} + +; CHECK: for.bar.body2: +; CHECK-NEXT: br label %UnifiedUnreachableBlock + +; CHECK: for.bar.end: +; CHECK-NEXT: br label %UnifiedUnreachableBlock + +; CHECK: UnifiedUnreachableBlock: +; CHECK-NEXT: unreachable + +define void @bar() { +entry: + br label %for.bar.cond + +for.bar.cond: ; preds = %entry + br i1 false, label %for.bar.body, label %for.bar.end + +for.bar.body: ; preds = %for.bar.cond + br label %for.bar.cond1 + +for.bar.cond1: ; preds = %for.bar.body + br i1 false, label %for.bar.body2, label %for.bar.end + +for.bar.body2: ; preds = %for.bar.cond1 + unreachable + +for.bar.end: ; preds = %for.bar.cond1 + unreachable +} diff --git a/llvm/test/Transforms/Util/add-TLI-mappings.ll b/llvm/test/Transforms/Util/add-TLI-mappings.ll index c68a9c9a71c65..75e32528ac7c5 100644 --- a/llvm/test/Transforms/Util/add-TLI-mappings.ll +++ b/llvm/test/Transforms/Util/add-TLI-mappings.ll @@ -9,10 +9,13 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" ; COMMON-LABEL: @llvm.compiler.used = appending global -; SVML-SAME: [3 x i8*] [ +; SVML-SAME: [6 x i8*] [ ; SVML-SAME: i8* bitcast (<2 x double> (<2 x double>)* @__svml_sin2 to i8*), ; SVML-SAME: i8* bitcast (<4 x double> (<4 x double>)* @__svml_sin4 to i8*), -; SVML-SAME: i8* bitcast (<8 x double> (<8 x double>)* @__svml_sin8 to i8*) +; SVML-SAME: i8* bitcast (<8 x double> (<8 x double>)* @__svml_sin8 to i8*), +; SVML-SAME: i8* bitcast (<4 x float> (<4 x float>)* @__svml_log10f4 to i8*), +; SVML-SAME: i8* bitcast (<8 x float> (<8 x float>)* @__svml_log10f8 to i8*), +; SVML-SAME: i8* bitcast (<16 x float> (<16 x float>)* @__svml_log10f16 to i8*) ; MASSV-SAME: [2 x i8*] [ ; MASSV-SAME: i8* bitcast (<2 x double> (<2 x double>)* @__sind2_massv to i8*), ; MASSV-SAME: i8* bitcast (<4 x float> (<4 x float>)* @__log10f4_massv to i8*) diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll index f0c5b6ef7ad81..5842f1478040c 100644 --- a/llvm/test/Transforms/VectorCombine/X86/load.ll +++ b/llvm/test/Transforms/VectorCombine/X86/load.ll @@ -292,6 +292,66 @@ define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceabl ret <8 x i16> %r } +; Negative test - disable under asan because widened load can cause spurious +; use-after-poison issues when __asan_poison_memory_region is used. 
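+;
+; Hypothetical sketch of the hazard (pointer names invented for illustration):
+; widening the scalar form
+;   %s = load i16, i16* %gep, align 16          ; touches 2 bytes
+; into
+;   %v = load <8 x i16>, <8 x i16>* %vp, align 16   ; touches 16 bytes
+; can read trailing bytes that were poisoned via __asan_poison_memory_region,
+; even though the whole range is dereferenceable.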
+ +define <8 x i16> @gep10_load_i16_insert_v8i16_asan(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_address { +; CHECK-LABEL: @gep10_load_i16_insert_v8i16_asan( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0 +; CHECK-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 16 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 +; CHECK-NEXT: ret <8 x i16> [[R]] +; + %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0 + %s = load i16, i16* %gep, align 16 + %r = insertelement <8 x i16> undef, i16 %s, i64 0 + ret <8 x i16> %r +} + +; hwasan and memtag should be similarly suppressed. + +define <8 x i16> @gep10_load_i16_insert_v8i16_hwasan(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_hwaddress { +; CHECK-LABEL: @gep10_load_i16_insert_v8i16_hwasan( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0 +; CHECK-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 16 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 +; CHECK-NEXT: ret <8 x i16> [[R]] +; + %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0 + %s = load i16, i16* %gep, align 16 + %r = insertelement <8 x i16> undef, i16 %s, i64 0 + ret <8 x i16> %r +} + +define <8 x i16> @gep10_load_i16_insert_v8i16_memtag(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_memtag { +; CHECK-LABEL: @gep10_load_i16_insert_v8i16_memtag( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0 +; CHECK-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 16 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 +; CHECK-NEXT: ret <8 x i16> [[R]] +; + %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0 + %s = load i16, i16* %gep, align 16 + %r = insertelement <8 x i16> undef, i16 %s, i64 0 + ret <8 x i16> %r +} + +; Negative test - disable under tsan because widened load may overlap bytes +; being concurrently modified. tsan does not know that some bytes are undef. + +define <8 x i16> @gep10_load_i16_insert_v8i16_tsan(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_thread { +; CHECK-LABEL: @gep10_load_i16_insert_v8i16_tsan( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0 +; CHECK-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 16 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 +; CHECK-NEXT: ret <8 x i16> [[R]] +; + %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0 + %s = load i16, i16* %gep, align 16 + %r = insertelement <8 x i16> undef, i16 %s, i64 0 + ret <8 x i16> %r +} + ; Negative test - can't safely load the offset vector, but could load+shuffle. define <8 x i16> @gep10_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(31) %p) { @@ -393,3 +453,46 @@ define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p %r = insertelement <2 x float> undef, float %s, i32 0 ret <2 x float> %r } + +; Negative test - suppress load widening for asan/hwasan/memtag/tsan. 
+ +define <2 x float> @load_f32_insert_v2f32_asan(float* align 16 dereferenceable(16) %p) sanitize_address { +; CHECK-LABEL: @load_f32_insert_v2f32_asan( +; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <2 x float> undef, float [[S]], i32 0 +; CHECK-NEXT: ret <2 x float> [[R]] +; + %s = load float, float* %p, align 4 + %r = insertelement <2 x float> undef, float %s, i32 0 + ret <2 x float> %r +} + +declare float* @getscaleptr() +define void @PR47558_multiple_use_load(<2 x float>* nocapture nonnull %resultptr, <2 x float>* nocapture nonnull readonly %opptr) { +; CHECK-LABEL: @PR47558_multiple_use_load( +; CHECK-NEXT: [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr() +; CHECK-NEXT: [[OP:%.*]] = load <2 x float>, <2 x float>* [[OPPTR:%.*]], align 4 +; CHECK-NEXT: [[SCALE:%.*]] = load float, float* [[SCALEPTR]], align 16 +; CHECK-NEXT: [[T1:%.*]] = insertelement <2 x float> undef, float [[SCALE]], i32 0 +; CHECK-NEXT: [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1 +; CHECK-NEXT: [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]] +; CHECK-NEXT: [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0 +; CHECK-NEXT: [[RESULT0:%.*]] = insertelement <2 x float> undef, float [[T4]], i32 0 +; CHECK-NEXT: [[T5:%.*]] = extractelement <2 x float> [[T3]], i32 1 +; CHECK-NEXT: [[RESULT1:%.*]] = insertelement <2 x float> [[RESULT0]], float [[T5]], i32 1 +; CHECK-NEXT: store <2 x float> [[RESULT1]], <2 x float>* [[RESULTPTR:%.*]], align 8 +; CHECK-NEXT: ret void +; + %scaleptr = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr() + %op = load <2 x float>, <2 x float>* %opptr, align 4 + %scale = load float, float* %scaleptr, align 16 + %t1 = insertelement <2 x float> undef, float %scale, i32 0 + %t2 = insertelement <2 x float> %t1, float %scale, i32 1 + %t3 = fmul <2 x float> %op, %t2 + %t4 = extractelement <2 x float> %t3, i32 0 + %result0 = insertelement <2 x float> undef, float %t4, i32 0 + %t5 = extractelement <2 x float> %t3, i32 1 + %result1 = insertelement <2 x float> %result0, float %t5, i32 1 + store <2 x float> %result1, <2 x float>* %resultptr, align 8 + ret void +} diff --git a/llvm/test/Verifier/assume-bundles.ll b/llvm/test/Verifier/assume-bundles.ll index 302421715c797..6e260f25129ee 100644 --- a/llvm/test/Verifier/assume-bundles.ll +++ b/llvm/test/Verifier/assume-bundles.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: not opt -verify < %s 2>&1 | FileCheck %s declare void @llvm.assume(i1) @@ -6,14 +7,21 @@ define void @func(i32* %P, i32 %P1, i32* %P2, i32* %P3) { ; CHECK: tags must be valid attribute names call void @llvm.assume(i1 true) ["adazdazd"()] ; CHECK: the second argument should be a constant integral value - call void @llvm.assume(i1 true) ["align"(i32* %P, i32 %P1)] + call void @llvm.assume(i1 true) ["dereferenceable"(i32* %P, i32 %P1)] ; CHECK: to many arguments - call void @llvm.assume(i1 true) ["align"(i32* %P, i32 8, i32 8)] + call void @llvm.assume(i1 true) ["dereferenceable"(i32* %P, i32 8, i32 8)] ; CHECK: this attribute should have 2 arguments - call void @llvm.assume(i1 true) ["align"(i32* %P)] + call void @llvm.assume(i1 true) ["dereferenceable"(i32* %P)] ; CHECK: this attribute has no argument - call void @llvm.assume(i1 true) ["align"(i32* %P, i32 4), "cold"(i32* %P)] + call void @llvm.assume(i1 true) ["dereferenceable"(i32* %P, i32 4), "cold"(i32* %P)] ; CHECK: this attribute should have one 
argument call void @llvm.assume(i1 true) ["noalias"()] + call void @llvm.assume(i1 true) ["align"(i32* %P, i32 %P1, i32 4)] +; CHECK: alignment assumptions should have 2 or 3 arguments + call void @llvm.assume(i1 true) ["align"(i32* %P, i32 %P1, i32 4, i32 4)] +; CHECK: second argument should be an integer + call void @llvm.assume(i1 true) ["align"(i32* %P, i32* %P2)] +; CHECK: third argument should be an integer if present + call void @llvm.assume(i1 true) ["align"(i32* %P, i32 %P1, i32* %P2)] ret void } diff --git a/llvm/test/Verifier/get-active-lane-mask.ll b/llvm/test/Verifier/get-active-lane-mask.ll index 94d819b5c75b0..c637916faccfc 100644 --- a/llvm/test/Verifier/get-active-lane-mask.ll +++ b/llvm/test/Verifier/get-active-lane-mask.ll @@ -2,20 +2,20 @@ declare <4 x i32> @llvm.get.active.lane.mask.v4i32.i32(i32, i32) -define <4 x i32> @t1(i32 %IV, i32 %BTC) { +define <4 x i32> @t1(i32 %IV, i32 %TC) { ; CHECK: get_active_lane_mask: element type is not i1 -; CHECK-NEXT: %res = call <4 x i32> @llvm.get.active.lane.mask.v4i32.i32(i32 %IV, i32 %BTC) +; CHECK-NEXT: %res = call <4 x i32> @llvm.get.active.lane.mask.v4i32.i32(i32 %IV, i32 %TC) - %res = call <4 x i32> @llvm.get.active.lane.mask.v4i32.i32(i32 %IV, i32 %BTC) + %res = call <4 x i32> @llvm.get.active.lane.mask.v4i32.i32(i32 %IV, i32 %TC) ret <4 x i32> %res } declare i32 @llvm.get.active.lane.mask.i32.i32(i32, i32) -define i32 @t2(i32 %IV, i32 %BTC) { +define i32 @t2(i32 %IV, i32 %TC) { ; CHECK: Intrinsic has incorrect return type! ; CHECK-NEXT: i32 (i32, i32)* @llvm.get.active.lane.mask.i32.i32 - %res = call i32 @llvm.get.active.lane.mask.i32.i32(i32 %IV, i32 %BTC) + %res = call i32 @llvm.get.active.lane.mask.i32.i32(i32 %IV, i32 %TC) ret i32 %res } diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 4d7d3c861aba5..9a1dd4ebc5a4e 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -330,7 +330,8 @@ def have_ld64_plugin_support(): # Ask llvm-config about asserts llvm_config.feature_config( - [('--assertion-mode', {'ON': 'asserts'})]) + [('--assertion-mode', {'ON': 'asserts'}), + ('--build-mode', {'[Dd][Ee][Bb][Uu][Gg]': 'debug'})]) if 'darwin' == sys.platform: cmd = ['sysctl', 'hw.optional.fma'] @@ -356,8 +357,11 @@ def have_ld64_plugin_support(): if config.enable_threads: config.available_features.add('thread_support') -if config.llvm_libxml2_enabled: +if config.have_libxml2: config.available_features.add('libxml2') if config.have_opt_viewer_modules: config.available_features.add('have_opt_viewer_modules') + +if config.expensive_checks: + config.available_features.add('expensive_checks') diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in index 52f709f817ddd..9765d498b50d6 100644 --- a/llvm/test/lit.site.cfg.py.in +++ b/llvm/test/lit.site.cfg.py.in @@ -35,13 +35,13 @@ config.llvm_use_intel_jitevents = @LLVM_USE_INTEL_JITEVENTS@ config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" config.have_zlib = @LLVM_ENABLE_ZLIB@ config.have_libxar = @HAVE_LIBXAR@ +config.have_libxml2 = @LLVM_ENABLE_LIBXML2@ config.have_dia_sdk = @LLVM_ENABLE_DIA_SDK@ config.enable_ffi = @LLVM_ENABLE_FFI@ config.build_examples = @LLVM_BUILD_EXAMPLES@ config.enable_threads = @LLVM_ENABLE_THREADS@ config.build_shared_libs = @BUILD_SHARED_LIBS@ config.link_llvm_dylib = @LLVM_LINK_LLVM_DYLIB@ -config.llvm_libxml2_enabled = @LLVM_LIBXML2_ENABLED@ config.llvm_host_triple = '@LLVM_HOST_TRIPLE@' config.host_arch = "@HOST_ARCH@" config.have_opt_viewer_modules = @LLVM_HAVE_OPT_VIEWER_MODULES@ @@ -50,6 +50,7 @@ config.has_plugins = 
@LLVM_ENABLE_PLUGINS@ config.linked_bye_extension = @LLVM_BYE_LINK_INTO_TOOLS@ config.have_tf_aot = @LLVM_HAVE_TF_AOT@ config.have_tf_api = @LLVM_HAVE_TF_API@ +config.expensive_checks = @LLVM_ENABLE_EXPENSIVE_CHECKS@ # Support substitution of the tools_dir with user parameters. This is # used when we can't determine the tool dir at configuration time. diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_function_name.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_function_name.ll new file mode 100644 index 0000000000000..1ea9d20146f1e --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_function_name.ll @@ -0,0 +1,9 @@ +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=aarch64-unknown-linux < %s | FileCheck --check-prefix=LINUX %s +; RUN: llc -mtriple=aarch64-apple-darwin < %s | FileCheck --check-prefix=DARWIN %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_function_name.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_function_name.ll.expected new file mode 100644 index 0000000000000..fbe1caeea72d0 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_function_name.ll.expected @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=aarch64-unknown-linux < %s | FileCheck --check-prefix=LINUX %s +; RUN: llc -mtriple=aarch64-apple-darwin < %s | FileCheck --check-prefix=DARWIN %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +; LINUX-LABEL: _Z54bar$ompvariant$bar: +; LINUX: // %bb.0: // %entry +; LINUX-NEXT: mov w0, #2 +; LINUX-NEXT: ret +; +; DARWIN-LABEL: _Z54bar$ompvariant$bar: +; DARWIN: ; %bb.0: ; %entry +; DARWIN-NEXT: mov w0, #2 +; DARWIN-NEXT: ret +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_function_name.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_function_name.ll new file mode 100644 index 0000000000000..b48607d2955f0 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_function_name.ll @@ -0,0 +1,8 @@ +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_function_name.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_function_name.ll.expected new file mode 100644 index 0000000000000..e13058f32450e --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_function_name.ll.expected @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Check that we accept functions with '$' in the name. 
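A note on what these inputs exercise: '$' is a legal character in LLVM IR identifiers, both quoted and unquoted, which the update scripts' function-name matchers apparently missed before this change (that is the evident motivation for the new coverage). A small sketch of the two identifier spellings involved; the names are illustrative:

```llvm
; Unquoted identifiers may contain [-a-zA-Z$._0-9], so '$' needs no
; quoting at the IR level...
define i32 @plain$name() {
entry:
  ret i32 0
}

; ...but C++ manglings like the one used in these inputs are often
; emitted in quoted form; both spellings must now be recognized when
; the scripts scan for function definitions.
define i32 @"quoted$name"() {
entry:
  ret i32 0
}
```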
+; +; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +; CHECK-LABEL: _Z54bar$ompvariant$bar: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, 2 +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_function_name.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_function_name.ll new file mode 100644 index 0000000000000..6c0f9e971035d --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_function_name.ll @@ -0,0 +1,10 @@ +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=arm64-unknown-linux < %s | FileCheck --prefi=LINUX %s +; RUN: llc -mtriple=armv7-apple-darwin < %s | FileCheck --prefix=DARWIN %s +; RUN: llc -mtriple=armv7-apple-ios < %s | FileCheck --prefix=IOS %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_function_name.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_function_name.ll.expected new file mode 100644 index 0000000000000..e191b0497f0a9 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_function_name.ll.expected @@ -0,0 +1,15 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=arm64-unknown-linux < %s | FileCheck --prefi=LINUX %s +; RUN: llc -mtriple=armv7-apple-darwin < %s | FileCheck --prefix=DARWIN %s +; RUN: llc -mtriple=armv7-apple-ios < %s | FileCheck --prefix=IOS %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +; CHECK-LABEL: _Z54bar$ompvariant$bar: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w0, #2 +; CHECK-NEXT: ret +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_function_name.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_function_name.ll new file mode 100644 index 0000000000000..526f6bd5d4615 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_function_name.ll @@ -0,0 +1,8 @@ +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=hexagon-unknown-linux < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_function_name.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_function_name.ll.expected new file mode 100644 index 0000000000000..9033be4aefee2 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_function_name.ll.expected @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Check that we accept functions with '$' in the name. 
+; +; RUN: llc -mtriple=hexagon-unknown-linux < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +; CHECK-LABEL: _Z54bar$ompvariant$bar: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: { +; CHECK-NEXT: r0 = #2 +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_function_name.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_function_name.ll new file mode 100644 index 0000000000000..c1c7d4f612e3d --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_function_name.ll @@ -0,0 +1,8 @@ +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=lanai-unknown-linux < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_function_name.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_function_name.ll.expected new file mode 100644 index 0000000000000..4f30c23976654 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_function_name.ll.expected @@ -0,0 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=lanai-unknown-linux < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +; CHECK-LABEL: _Z54bar$ompvariant$bar: +; CHECK: ! %bb.0: ! %entry +; CHECK-NEXT: st %fp, [--%sp] +; CHECK-NEXT: add %sp, 0x8, %fp +; CHECK-NEXT: sub %sp, 0x8, %sp +; CHECK-NEXT: mov 0x2, %rv +; CHECK-NEXT: ld -4[%fp], %pc ! return +; CHECK-NEXT: add %fp, 0x0, %sp +; CHECK-NEXT: ld -8[%fp], %fp +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/mips_function_name.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/mips_function_name.ll new file mode 100644 index 0000000000000..1cf2e3cfcc0cc --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/mips_function_name.ll @@ -0,0 +1,8 @@ +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=mips-unknown-linux < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/mips_function_name.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/mips_function_name.ll.expected new file mode 100644 index 0000000000000..c1c4577542e82 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/mips_function_name.ll.expected @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Check that we accept functions with '$' in the name. 
+; +; RUN: llc -mtriple=mips-unknown-linux < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +; CHECK-LABEL: _Z54bar$ompvariant$bar: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $2, $zero, 2 +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/msp430_function_name.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/msp430_function_name.ll new file mode 100644 index 0000000000000..1bf6ea93fbd1e --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/msp430_function_name.ll @@ -0,0 +1,8 @@ +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=msp430-unknown-linux < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/msp430_function_name.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/msp430_function_name.ll.expected new file mode 100644 index 0000000000000..2cb55cde0b76f --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/msp430_function_name.ll.expected @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=msp430-unknown-linux < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +; CHECK-LABEL: _Z54bar$ompvariant$bar: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: mov #2, r12 +; CHECK-NEXT: clr r13 +; CHECK-NEXT: ret +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/ppc_function_name.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/ppc_function_name.ll new file mode 100644 index 0000000000000..d4d1c68fd0ac1 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/ppc_function_name.ll @@ -0,0 +1,8 @@ +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=ppc32-unknown-linux < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/ppc_function_name.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/ppc_function_name.ll.expected new file mode 100644 index 0000000000000..72edada3ff06c --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/ppc_function_name.ll.expected @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=ppc32-unknown-linux < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +; CHECK-LABEL: _Z54bar$ompvariant$bar: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 3, 2 +; CHECK-NEXT: blr +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/riscv_function_name.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/riscv_function_name.ll new file mode 100644 index 0000000000000..db4a1988a9b68 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/riscv_function_name.ll @@ -0,0 +1,8 @@ +; Check that we accept functions with '$' in the name. 
+; +; RUN: llc -mtriple=riscv32-unknown-linux < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/riscv_function_name.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/riscv_function_name.ll.expected new file mode 100644 index 0000000000000..d2ec3e0f9fcc0 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/riscv_function_name.ll.expected @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=riscv32-unknown-linux < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +; CHECK-LABEL: _Z54bar$ompvariant$bar: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi a0, zero, 2 +; CHECK-NEXT: ret +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/sparc_function_name.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/sparc_function_name.ll new file mode 100644 index 0000000000000..8b4ae66f764d5 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/sparc_function_name.ll @@ -0,0 +1,8 @@ +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=sparc-unknown-linux < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/sparc_function_name.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/sparc_function_name.ll.expected new file mode 100644 index 0000000000000..72307c73a4298 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/sparc_function_name.ll.expected @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=sparc-unknown-linux < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +; CHECK-LABEL: _Z54bar$ompvariant$bar: +; CHECK: .cfi_startproc +; CHECK-NEXT: ! %bb.0: ! %entry +; CHECK-NEXT: retl +; CHECK-NEXT: mov 2, %o0 +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/systemz_function_name.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/systemz_function_name.ll new file mode 100644 index 0000000000000..101bec2f0456e --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/systemz_function_name.ll @@ -0,0 +1,8 @@ +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=s390x-unknown-linux < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/systemz_function_name.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/systemz_function_name.ll.expected new file mode 100644 index 0000000000000..c5dade171110b --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/systemz_function_name.ll.expected @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Check that we accept functions with '$' in the name. 
+; +; RUN: llc -mtriple=s390x-unknown-linux < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +; CHECK-LABEL: _Z54bar$ompvariant$bar: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lhi %r2, 2 +; CHECK-NEXT: br %r14 +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/wasm_function_name.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/wasm_function_name.ll new file mode 100644 index 0000000000000..a55cd8efd60bd --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/wasm_function_name.ll @@ -0,0 +1,8 @@ +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=wasm32-unknown-linux < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/wasm_function_name.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/wasm_function_name.ll.expected new file mode 100644 index 0000000000000..e5a10a3e07c63 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/wasm_function_name.ll.expected @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=wasm32-unknown-linux < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +; CHECK-LABEL: _Z54bar$ompvariant$bar: +; CHECK: .functype _Z54bar$ompvariant$bar () -> (i32) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: i32.const 2 +; CHECK-NEXT: # fallthrough-return +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_function_name.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_function_name.ll new file mode 100644 index 0000000000000..231aa54d6978e --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_function_name.ll @@ -0,0 +1,8 @@ +; Check that we accept functions with '$' in the name. +; +; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_function_name.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_function_name.ll.expected new file mode 100644 index 0000000000000..32b05fccf62bf --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_function_name.ll.expected @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Check that we accept functions with '$' in the name. 
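One detail that makes the generated assertions above safe to emit verbatim: FileCheck matches pattern text literally except inside {{...}} (regex) and [[...]] (variables), so a '$' in a mangled label needs no escaping. A minimal illustration, not taken from the patch:

```llvm
; FileCheck treats each '$' below as a plain character; only {{...}}
; and [[...]] introduce non-literal matching.
; CHECK-LABEL: _Z54bar$ompvariant$bar:
; CHECK: ret
```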
+; +; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +; CHECK-LABEL: _Z54bar$ompvariant$bar: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl $2, %eax +; CHECK-NEXT: retq +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/aarch64-function-name.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/aarch64-function-name.test new file mode 100644 index 0000000000000..36c96cc329fdf --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/aarch64-function-name.test @@ -0,0 +1,5 @@ +# REQUIRES: aarch64-registered-target +## Check that functions names with '$' are processed correctly + +# RUN: cp -f %S/Inputs/aarch64_function_name.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/aarch64_function_name.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/amdgpu-function-name.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/amdgpu-function-name.test new file mode 100644 index 0000000000000..eb4092d5a460e --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/amdgpu-function-name.test @@ -0,0 +1,5 @@ +# REQUIRES: amdgpu-registered-target +## Check that functions names with '$' are processed correctly + +# RUN: cp -f %S/Inputs/amdgpu_function_name.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/amdgpu_function_name.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/arm-function-name.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/arm-function-name.test new file mode 100644 index 0000000000000..07455cbf13c0e --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/arm-function-name.test @@ -0,0 +1,5 @@ +# REQUIRES: arm-registered-target +## Check that functions names with '$' are processed correctly + +# RUN: cp -f %S/Inputs/arm_function_name.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/arm_function_name.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/hexagon-function-name.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/hexagon-function-name.test new file mode 100644 index 0000000000000..1e34074255fd5 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/hexagon-function-name.test @@ -0,0 +1,5 @@ +# REQUIRES: hexagon-registered-target +## Check that functions names with '$' are processed correctly + +# RUN: cp -f %S/Inputs/hexagon_function_name.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/hexagon_function_name.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/lanai-function-name.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/lanai-function-name.test new file mode 100644 index 0000000000000..cb5aa4e45ffae --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/lanai-function-name.test @@ -0,0 +1,5 @@ +# REQUIRES: lanai-registered-target +## Check that functions names with '$' are processed correctly + +# RUN: cp -f %S/Inputs/lanai_function_name.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/lanai_function_name.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/mips-function-name.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/mips-function-name.test new file mode 100644 index 0000000000000..03f9149d5c02b --- /dev/null +++ 
b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/mips-function-name.test @@ -0,0 +1,5 @@ +# REQUIRES: mips-registered-target +## Check that functions names with '$' are processed correctly + +# RUN: cp -f %S/Inputs/mips_function_name.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/mips_function_name.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/msp430-function-name.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/msp430-function-name.test new file mode 100644 index 0000000000000..8f676227aa324 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/msp430-function-name.test @@ -0,0 +1,5 @@ +# REQUIRES: msp430-registered-target +## Check that functions names with '$' are processed correctly + +# RUN: cp -f %S/Inputs/msp430_function_name.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/msp430_function_name.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/ppc-function-name.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/ppc-function-name.test new file mode 100644 index 0000000000000..824740cde6f58 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/ppc-function-name.test @@ -0,0 +1,5 @@ +# REQUIRES: powerpc-registered-target +## Check that functions names with '$' are processed correctly + +# RUN: cp -f %S/Inputs/ppc_function_name.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/ppc_function_name.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/riscv-function-name.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/riscv-function-name.test new file mode 100644 index 0000000000000..2e1e05d88f9a2 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/riscv-function-name.test @@ -0,0 +1,5 @@ +# REQUIRES: riscv-registered-target +## Check that functions names with '$' are processed correctly + +# RUN: cp -f %S/Inputs/riscv_function_name.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/riscv_function_name.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/sparc-function-name.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/sparc-function-name.test new file mode 100644 index 0000000000000..a223ee211da36 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/sparc-function-name.test @@ -0,0 +1,5 @@ +# REQUIRES: sparc-registered-target +## Check that functions names with '$' are processed correctly + +# RUN: cp -f %S/Inputs/sparc_function_name.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/sparc_function_name.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/systemz-function-name.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/systemz-function-name.test new file mode 100644 index 0000000000000..e6c47252d4541 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/systemz-function-name.test @@ -0,0 +1,5 @@ +# REQUIRES: systemz-registered-target +## Check that functions names with '$' are processed correctly + +# RUN: cp -f %S/Inputs/systemz_function_name.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/systemz_function_name.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/wasm-function-name.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/wasm-function-name.test new 
file mode 100644 index 0000000000000..fc45e28415dd3 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/wasm-function-name.test @@ -0,0 +1,5 @@ +# REQUIRES: webassembly-registered-target +## Check that functions names with '$' are processed correctly + +# RUN: cp -f %S/Inputs/wasm_function_name.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/wasm_function_name.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-function-name.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-function-name.test new file mode 100644 index 0000000000000..d395afb13971f --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-function-name.test @@ -0,0 +1,5 @@ +# REQUIRES: x86-registered-target +## Check that functions names with '$' are processed correctly + +# RUN: cp -f %S/Inputs/x86_function_name.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/x86_function_name.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/function_name.ll b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/function_name.ll new file mode 100644 index 0000000000000..173e7219cb3f9 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/function_name.ll @@ -0,0 +1,8 @@ +; Check that we accept functions with '$' in the name. +; +; RUN: opt < %s -instsimplify -S | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/function_name.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/function_name.ll.expected new file mode 100644 index 0000000000000..75e4235eb440e --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/function_name.ll.expected @@ -0,0 +1,9 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; Check that we accept functions with '$' in the name. 
+; +; RUN: opt < %s -instsimplify -S | FileCheck %s +; +define hidden i32 @"_Z54bar$ompvariant$bar"() { +entry: + ret i32 2 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/function-name.test b/llvm/test/tools/UpdateTestChecks/update_test_checks/function-name.test new file mode 100644 index 0000000000000..3d1a158e00bc7 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/function-name.test @@ -0,0 +1,7 @@ +# REQUIRES: x86-registered-target +## Basic test checking that update_test_checks.py works correctly +# RUN: cp -f %S/Inputs/function_name.ll %t.ll && %update_test_checks %t.ll +# RUN: diff -u %t.ll %S/Inputs/function_name.ll.expected +## Check that running the script again does not change the result: +# RUN: %update_test_checks %t.ll +# RUN: diff -u %t.ll %S/Inputs/function_name.ll.expected diff --git a/llvm/test/tools/llvm-cov/gcov/Inputs/abs-path.gcda b/llvm/test/tools/llvm-cov/gcov/Inputs/abs-path.gcda new file mode 100644 index 0000000000000..806dc6a2aa0f5 Binary files /dev/null and b/llvm/test/tools/llvm-cov/gcov/Inputs/abs-path.gcda differ diff --git a/llvm/test/tools/llvm-cov/gcov/Inputs/abs-path.gcno b/llvm/test/tools/llvm-cov/gcov/Inputs/abs-path.gcno new file mode 100644 index 0000000000000..1bd83064d67be Binary files /dev/null and b/llvm/test/tools/llvm-cov/gcov/Inputs/abs-path.gcno differ diff --git a/llvm/test/tools/llvm-cov/Inputs/gcov-4.7.gcda b/llvm/test/tools/llvm-cov/gcov/Inputs/gcov-4.7.gcda similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/gcov-4.7.gcda rename to llvm/test/tools/llvm-cov/gcov/Inputs/gcov-4.7.gcda diff --git a/llvm/test/tools/llvm-cov/Inputs/gcov-4.7.gcno b/llvm/test/tools/llvm-cov/gcov/Inputs/gcov-4.7.gcno similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/gcov-4.7.gcno rename to llvm/test/tools/llvm-cov/gcov/Inputs/gcov-4.7.gcno diff --git a/llvm/test/tools/llvm-cov/Inputs/gcov-8.gcda b/llvm/test/tools/llvm-cov/gcov/Inputs/gcov-8.gcda similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/gcov-8.gcda rename to llvm/test/tools/llvm-cov/gcov/Inputs/gcov-8.gcda diff --git a/llvm/test/tools/llvm-cov/Inputs/gcov-8.gcno b/llvm/test/tools/llvm-cov/gcov/Inputs/gcov-8.gcno similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/gcov-8.gcno rename to llvm/test/tools/llvm-cov/gcov/Inputs/gcov-8.gcno diff --git a/llvm/test/tools/llvm-cov/Inputs/gcov-9.gcda b/llvm/test/tools/llvm-cov/gcov/Inputs/gcov-9.gcda similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/gcov-9.gcda rename to llvm/test/tools/llvm-cov/gcov/Inputs/gcov-9.gcda diff --git a/llvm/test/tools/llvm-cov/Inputs/gcov-9.gcno b/llvm/test/tools/llvm-cov/gcov/Inputs/gcov-9.gcno similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/gcov-9.gcno rename to llvm/test/tools/llvm-cov/gcov/Inputs/gcov-9.gcno diff --git a/llvm/test/tools/llvm-cov/Inputs/gcov-fake-4.2.gcda b/llvm/test/tools/llvm-cov/gcov/Inputs/gcov-fake-4.2.gcda similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/gcov-fake-4.2.gcda rename to llvm/test/tools/llvm-cov/gcov/Inputs/gcov-fake-4.2.gcda diff --git a/llvm/test/tools/llvm-cov/Inputs/gcov-fake-4.2.gcno b/llvm/test/tools/llvm-cov/gcov/Inputs/gcov-fake-4.2.gcno similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/gcov-fake-4.2.gcno rename to llvm/test/tools/llvm-cov/gcov/Inputs/gcov-fake-4.2.gcno diff --git a/llvm/test/tools/llvm-cov/Inputs/test.cpp b/llvm/test/tools/llvm-cov/gcov/Inputs/test.cpp similarity index 100% rename from 
llvm/test/tools/llvm-cov/Inputs/test.cpp rename to llvm/test/tools/llvm-cov/gcov/Inputs/test.cpp diff --git a/llvm/test/tools/llvm-cov/Inputs/test.gcda b/llvm/test/tools/llvm-cov/gcov/Inputs/test.gcda similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/test.gcda rename to llvm/test/tools/llvm-cov/gcov/Inputs/test.gcda diff --git a/llvm/test/tools/llvm-cov/Inputs/test.gcno b/llvm/test/tools/llvm-cov/gcov/Inputs/test.gcno similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/test.gcno rename to llvm/test/tools/llvm-cov/gcov/Inputs/test.gcno diff --git a/llvm/test/tools/llvm-cov/Inputs/test.h b/llvm/test/tools/llvm-cov/gcov/Inputs/test.h similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/test.h rename to llvm/test/tools/llvm-cov/gcov/Inputs/test.h diff --git a/llvm/test/tools/llvm-cov/Inputs/test_file_checksum_fail.gcda b/llvm/test/tools/llvm-cov/gcov/Inputs/test_file_checksum_fail.gcda similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/test_file_checksum_fail.gcda rename to llvm/test/tools/llvm-cov/gcov/Inputs/test_file_checksum_fail.gcda diff --git a/llvm/test/tools/llvm-cov/Inputs/test_func_checksum_fail.gcda b/llvm/test/tools/llvm-cov/gcov/Inputs/test_func_checksum_fail.gcda similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/test_func_checksum_fail.gcda rename to llvm/test/tools/llvm-cov/gcov/Inputs/test_func_checksum_fail.gcda diff --git a/llvm/test/tools/llvm-cov/Inputs/test_no_gcda.cpp.gcov b/llvm/test/tools/llvm-cov/gcov/Inputs/test_no_gcda.cpp.gcov similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/test_no_gcda.cpp.gcov rename to llvm/test/tools/llvm-cov/gcov/Inputs/test_no_gcda.cpp.gcov diff --git a/llvm/test/tools/llvm-cov/Inputs/test_no_gcda.h.gcov b/llvm/test/tools/llvm-cov/gcov/Inputs/test_no_gcda.h.gcov similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/test_no_gcda.h.gcov rename to llvm/test/tools/llvm-cov/gcov/Inputs/test_no_gcda.h.gcov diff --git a/llvm/test/tools/llvm-cov/Inputs/test_no_options.cpp.gcov b/llvm/test/tools/llvm-cov/gcov/Inputs/test_no_options.cpp.gcov similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/test_no_options.cpp.gcov rename to llvm/test/tools/llvm-cov/gcov/Inputs/test_no_options.cpp.gcov diff --git a/llvm/test/tools/llvm-cov/Inputs/test_no_options.h.gcov b/llvm/test/tools/llvm-cov/gcov/Inputs/test_no_options.h.gcov similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/test_no_options.h.gcov rename to llvm/test/tools/llvm-cov/gcov/Inputs/test_no_options.h.gcov diff --git a/llvm/test/tools/llvm-cov/Inputs/test_paths.gcda b/llvm/test/tools/llvm-cov/gcov/Inputs/test_paths.gcda similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/test_paths.gcda rename to llvm/test/tools/llvm-cov/gcov/Inputs/test_paths.gcda diff --git a/llvm/test/tools/llvm-cov/Inputs/test_paths.gcno b/llvm/test/tools/llvm-cov/gcov/Inputs/test_paths.gcno similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/test_paths.gcno rename to llvm/test/tools/llvm-cov/gcov/Inputs/test_paths.gcno diff --git a/llvm/test/tools/llvm-cov/Inputs/test_read_fail.gcno b/llvm/test/tools/llvm-cov/gcov/Inputs/test_read_fail.gcno similarity index 100% rename from llvm/test/tools/llvm-cov/Inputs/test_read_fail.gcno rename to llvm/test/tools/llvm-cov/gcov/Inputs/test_read_fail.gcno diff --git a/llvm/test/tools/llvm-cov/llvm-cov.test b/llvm/test/tools/llvm-cov/gcov/basic.test similarity index 97% rename from llvm/test/tools/llvm-cov/llvm-cov.test rename to 
llvm/test/tools/llvm-cov/gcov/basic.test index 2256501cd5ea2..4a3b81ce2b7e3 100644 --- a/llvm/test/tools/llvm-cov/llvm-cov.test +++ b/llvm/test/tools/llvm-cov/gcov/basic.test @@ -38,7 +38,7 @@ RUN: llvm-cov gcov -n test.c | FileCheck %s --check-prefix=OUT # Print to stdout. RUN: llvm-cov gcov -t test.c > stdout RUN: llvm-cov gcov --stdout test.c | cmp stdout - -RUN: cat test_no_options.h.gcov test_no_options.cpp.gcov | diff -u - stdout +RUN: cat test_no_options.cpp.gcov test_no_options.h.gcov | diff -u - stdout RUN: llvm-cov gcov -n -t test.c | count 0 RUN: llvm-cov gcov test_paths.cpp 2>/dev/null | FileCheck %s --check-prefix=MISSING @@ -84,12 +84,7 @@ RUN: llvm-cov gcov test.c -a -b -f | FileCheck %s --check-prefixes=OUT,OUTFILE,O RUN: FileCheck %s --check-prefixes=C,C-A,C-B --match-full-lines --strict-whitespace < test.cpp.gcov RUN: FileCheck %s --check-prefixes=H,H-A,H-B --match-full-lines --strict-whitespace < test.h.gcov - OUT-F:Function '_ZN1AC2Ev' - OUT-F-NEXT:Lines executed:100.00% of 1 - OUT-FB-NEXT:No branches - OUT-FB-NEXT:No calls - OUT-F-EMPTY: - OUT-F-NEXT:Function '_ZN1A1BEv' + OUT-F:Function '_ZN1A1BEv' OUT-F-NEXT:Lines executed:100.00% of 1 OUT-FB-NEXT:No branches OUT-FB-NEXT:No calls @@ -121,14 +116,17 @@ RUN: FileCheck %s --check-prefixes=H,H-A,H-B --match-full-lines --strict-whitesp OUT-F-EMPTY: OUT-F-NEXT:Function '_Z15initialize_gridv' OUT-F-NEXT:Lines executed:100.00% of 5 - OUT-FB-NEXT:Branches executed:100.00% of 4 - OUT-FB-NEXT:Taken at least once:100.00% of 4 + OUT-FB-NEXT:No branches OUT-FB-NEXT:No calls OUT-F-EMPTY: OUT-F-NEXT:Function 'main' OUT-F-NEXT:Lines executed:92.00% of 25 - OUT-FB-NEXT:Branches executed:100.00% of 11 - OUT-FB-NEXT:Taken at least once:81.82% of 11 + OUT-FB-NEXT:No branches + OUT-FB-NEXT:No calls + OUT-F-EMPTY: + OUT-F-NEXT:Function '_ZN1AC2Ev' + OUT-F-NEXT:Lines executed:100.00% of 1 + OUT-FB-NEXT:No branches OUT-FB-NEXT:No calls OUT-F-EMPTY: OUT:File 'test.cpp' diff --git a/llvm/test/tools/llvm-cov/gcov/demangled-names.test b/llvm/test/tools/llvm-cov/gcov/demangled-names.test new file mode 100644 index 0000000000000..31cb05fdca574 --- /dev/null +++ b/llvm/test/tools/llvm-cov/gcov/demangled-names.test @@ -0,0 +1,10 @@ +# Test --demangled-names (-m). +RUN: rm -rf %t && mkdir %t && cd %t +RUN: cp %S/Inputs/test.cpp %S/Inputs/test.gcno %S/Inputs/test.gcda . + +RUN: llvm-cov gcov -b -f -m test.gcda | FileCheck %s +RUN: llvm-cov gcov -b -f --demangled-names test.gcda | FileCheck %s +RUN: FileCheck %s --check-prefix=BRANCH < test.cpp.gcov + +CHECK: Function 'A::B()' +BRANCH: function A::B() called diff --git a/llvm/test/tools/llvm-cov/gcov-4.7.c b/llvm/test/tools/llvm-cov/gcov/gcov-4.7.c similarity index 84% rename from llvm/test/tools/llvm-cov/gcov-4.7.c rename to llvm/test/tools/llvm-cov/gcov/gcov-4.7.c index d92953a6b0b65..211c635f51283 100644 --- a/llvm/test/tools/llvm-cov/gcov-4.7.c +++ b/llvm/test/tools/llvm-cov/gcov/gcov-4.7.c @@ -1,27 +1,25 @@ /// Test that llvm-cov supports gcov [4.7,8) compatible format. 
#include <math.h> #include <stdio.h> -int main() { // GCOV: #####: [[@LINE]]:int main - double a[11], result; // GCOV-NEXT: -: [[@LINE]]: - for (int i = 0; i < 11; i++) // GCOV-NEXT: #####: [[@LINE]]: +int main() { // GCOV: 1: [[@LINE]]:int main + double a[11], result; // GCOV-NEXT: -: [[@LINE]]: + for (int i = 0; i < 11; i++) // GCOV-NEXT: 12: [[@LINE]]: scanf("%lf", &a[i]); // GCOV-NEXT: 11: [[@LINE]]: - for (int i = 10; i >= 0; i--) { // GCOV-NEXT: 4: [[@LINE]]: + for (int i = 10; i >= 0; i--) { // GCOV-NEXT: 12: [[@LINE]]: result = sqrt(fabs(a[i])) + 5 * pow(a[i], 3); // GCOV-NEXT: 11: [[@LINE]]: printf("\nf(%lf) = "); // GCOV-NEXT: 11: [[@LINE]]: - if (result > 400) printf("Overflow!"); // GCOV-NEXT: #####: [[@LINE]]: - else printf("%lf", result); // GCOV-NEXT: 4: [[@LINE]]: - } // GCOV-NEXT: -: [[@LINE]]: - return 0; // GCOV-NEXT: #####: [[@LINE]]: -} // GCOV-NEXT: -: [[@LINE]]: -/// FIXME several lines do not match gcov 7 + if (result > 400) printf("Overflow!"); // GCOV-NEXT: 11: [[@LINE]]: + else printf("%lf", result); // GCOV-NEXT: 4: [[@LINE]]: + } // GCOV-NEXT: -: [[@LINE]]: + return 0; // GCOV-NEXT: 1: [[@LINE]]: +} // GCOV-NEXT: -: [[@LINE]]: // RUN: rm -rf %t && mkdir %t && cd %t // RUN: cp %s %p/Inputs/gcov-4.7.gc* . -/// FIXME Lines executed:100.00% of 12 // RUN: llvm-cov gcov gcov-4.7.c | FileCheck %s // CHECK: File 'gcov-4.7.c' -// CHECK-NEXT: Lines executed:55.56% of 9 +// CHECK-NEXT: Lines executed:100.00% of 9 // CHECK-NEXT: Creating 'gcov-4.7.c.gcov' // RUN: FileCheck --input-file=%t/gcov-4.7.c.gcov --check-prefix=HEADER %s diff --git a/llvm/test/tools/llvm-cov/gcov-8.c b/llvm/test/tools/llvm-cov/gcov/gcov-8.c similarity index 81% rename from llvm/test/tools/llvm-cov/gcov-8.c rename to llvm/test/tools/llvm-cov/gcov/gcov-8.c index eef3511e93a7c..d557d84130183 100644 --- a/llvm/test/tools/llvm-cov/gcov-8.c +++ b/llvm/test/tools/llvm-cov/gcov/gcov-8.c @@ -1,29 +1,27 @@ /// Test that llvm-cov supports gcov 8 compatible format. #include <math.h> #include <stdio.h> -int main() { // GCOV: 1: [[@LINE]]:int main - double a[11], result; // GCOV-NEXT: -: [[@LINE]]: +int main() { // GCOV: 1: [[@LINE]]:int main + double a[11], result; // GCOV-NEXT: -: [[@LINE]]: for (int i = 0; i < 11; i++) // GCOV-NEXT: 12: [[@LINE]]: scanf("%lf", &a[i]); // GCOV-NEXT: 11: [[@LINE]]: - for (int i = 10; i >= 0; i--) { // GCOV-NEXT: 7: [[@LINE]]: + for (int i = 10; i >= 0; i--) { // GCOV-NEXT: 12: [[@LINE]]: result = sqrt(fabs(a[i])) + 5 * pow(a[i], 3); // GCOV-NEXT: 11: [[@LINE]]: printf("\nf(%lf) = "); // GCOV-NEXT: 11: [[@LINE]]: if (result > 400) printf("Overflow!"); // GCOV-NEXT: 11: [[@LINE]]: - else printf("%lf", result); // GCOV-NEXT: #####: [[@LINE]]: - } // GCOV-NEXT: -: [[@LINE]]: - return 0; // GCOV-NEXT: #####: [[@LINE]]: -} // GCOV-NEXT: -: [[@LINE]]: -/// FIXME several lines do not match gcov 8 + else printf("%lf", result); // GCOV-NEXT: 4: [[@LINE]]: + } // GCOV-NEXT: -: [[@LINE]]: + return 0; // GCOV-NEXT: 1: [[@LINE]]: +} // GCOV-NEXT: -: [[@LINE]]: // RUN: rm -rf %t && mkdir %t && cd %t // RUN: cp %s %p/Inputs/gcov-8.gc* .
-/// FIXME Lines executed:100.00% of 12 // RUN: llvm-cov gcov gcov-8.c | FileCheck %s --check-prefixes=OUT,OUTFILE // OUT: File 'gcov-8.c' -// OUT-NEXT: Lines executed:77.78% of 9 -// OUT-B-NEXT: Branches executed:85.71% of 14 -// OUT-B-NEXT: Taken at least once:42.86% of 14 +// OUT-NEXT: Lines executed:100.00% of 9 +// OUT-B-NEXT: Branches executed:100.00% of 14 +// OUT-B-NEXT: Taken at least once:71.43% of 14 // OUT-B-NEXT: No calls // OUTFILE-NEXT: Creating 'gcov-8.c.gcov' // OUT-EMPTY: @@ -51,23 +49,23 @@ int main() { // GCOV: 1: [[@LINE]]:int // I-NEXT:lcount:4,1 // I-NEXT:lcount:6,12 // I-B-NEXT:branch:6,taken -// I-B-NEXT:branch:6,nottaken +// I-B-NEXT:branch:6,taken // I-NEXT:lcount:7,11 // I-B-NEXT:branch:7,taken // I-B-NEXT:branch:7,nottaken -// I-NEXT:lcount:8,7 +// I-NEXT:lcount:8,12 +// I-B-NEXT:branch:8,taken // I-B-NEXT:branch:8,taken -// I-B-NEXT:branch:8,nottaken // I-NEXT:lcount:9,11 // I-NEXT:lcount:10,11 // I-B-NEXT:branch:10,taken // I-B-NEXT:branch:10,nottaken // I-NEXT:lcount:11,11 // I-B-NEXT:branch:11,taken -// I-B-NEXT:branch:11,nottaken +// I-B-NEXT:branch:11,taken // I-B-NEXT:branch:11,taken // I-B-NEXT:branch:11,nottaken -// I-NEXT:lcount:12,0 -// I-B-NEXT:branch:12,notexec -// I-B-NEXT:branch:12,notexec -// I-NEXT:lcount:14,0 +// I-NEXT:lcount:12,4 +// I-B-NEXT:branch:12,taken +// I-B-NEXT:branch:12,nottaken +// I-NEXT:lcount:14,1 diff --git a/llvm/test/tools/llvm-cov/gcov-9.c b/llvm/test/tools/llvm-cov/gcov/gcov-9.c similarity index 86% rename from llvm/test/tools/llvm-cov/gcov-9.c rename to llvm/test/tools/llvm-cov/gcov/gcov-9.c index 335e6c0663dbe..a2e9cf4749736 100644 --- a/llvm/test/tools/llvm-cov/gcov-9.c +++ b/llvm/test/tools/llvm-cov/gcov/gcov-9.c @@ -1,27 +1,25 @@ /// Test that llvm-cov supports gcov 9 compatible format. #include <math.h> #include <stdio.h> -int main() { // GCOV: 1: [[@LINE]]:int main - double a[11], result; // GCOV-NEXT: -: [[@LINE]]: +int main() { // GCOV: 1: [[@LINE]]:int main + double a[11], result; // GCOV-NEXT: -: [[@LINE]]: for (int i = 0; i < 11; i++) // GCOV-NEXT: 12: [[@LINE]]: scanf("%lf", &a[i]); // GCOV-NEXT: 11: [[@LINE]]: - for (int i = 10; i >= 0; i--) { // GCOV-NEXT: 7: [[@LINE]]: + for (int i = 10; i >= 0; i--) { // GCOV-NEXT: 12: [[@LINE]]: result = sqrt(fabs(a[i])) + 5 * pow(a[i], 3); // GCOV-NEXT: 11: [[@LINE]]: printf("\nf(%lf) = "); // GCOV-NEXT: 11: [[@LINE]]: if (result > 400) printf("Overflow!"); // GCOV-NEXT: 11: [[@LINE]]: - else printf("%lf", result); // GCOV-NEXT: #####: [[@LINE]]: - } // GCOV-NEXT: -: [[@LINE]]: - return 0; // GCOV-NEXT: #####: [[@LINE]]: -} // GCOV-NEXT: -: [[@LINE]]: -/// FIXME several lines do not match gcov 9 + else printf("%lf", result); // GCOV-NEXT: 4: [[@LINE]]: + } // GCOV-NEXT: -: [[@LINE]]: + return 0; // GCOV-NEXT: 1: [[@LINE]]: +} // GCOV-NEXT: -: [[@LINE]]: // RUN: rm -rf %t && mkdir %t && cd %t // RUN: cp %s %p/Inputs/gcov-9.gc* .
-/// FIXME Lines executed:100.00% of 12 // RUN: llvm-cov gcov gcov-9.c | FileCheck %s // CHECK: File 'gcov-9.c' -// CHECK-NEXT: Lines executed:77.78% of 9 +// CHECK-NEXT: Lines executed:100.00% of 9 // CHECK-NEXT: Creating 'gcov-9.c.gcov' // RUN: FileCheck --input-file=%t/gcov-9.c.gcov --check-prefix=HEADER %s diff --git a/llvm/test/tools/llvm-cov/gcov-fake-4.2.c b/llvm/test/tools/llvm-cov/gcov/gcov-fake-4.2.c similarity index 95% rename from llvm/test/tools/llvm-cov/gcov-fake-4.2.c rename to llvm/test/tools/llvm-cov/gcov/gcov-fake-4.2.c index 7e8eb2f2a5ff2..470a14ff7e41c 100644 --- a/llvm/test/tools/llvm-cov/gcov-fake-4.2.c +++ b/llvm/test/tools/llvm-cov/gcov/gcov-fake-4.2.c @@ -1,6 +1,7 @@ /// Test that llvm-cov supports a fake gcov 4.2 format used before clang 11. // RUN: rm -rf %t && mkdir %t && cd %t +// RUN: echo -e '\n\n\n\n\n\n\n\n\n' > test.cpp && echo > test.h // RUN: llvm-cov gcov test. --gcno=%S/Inputs/gcov-fake-4.2.gcno --gcda=%S/Inputs/gcov-fake-4.2.gcda | FileCheck %s // RUN: FileCheck %s --check-prefix=C < test.cpp.gcov // RUN: FileCheck %s --check-prefix=H < test.h.gcov diff --git a/llvm/test/tools/llvm-cov/gcov-intermediate-format.test b/llvm/test/tools/llvm-cov/gcov/intermediate-format.test similarity index 100% rename from llvm/test/tools/llvm-cov/gcov-intermediate-format.test rename to llvm/test/tools/llvm-cov/gcov/intermediate-format.test diff --git a/llvm/test/tools/llvm-cov/gcov/relative-only.test b/llvm/test/tools/llvm-cov/gcov/relative-only.test new file mode 100644 index 0000000000000..20be39683fbeb --- /dev/null +++ b/llvm/test/tools/llvm-cov/gcov/relative-only.test @@ -0,0 +1,38 @@ +# Test -r (--relative-only) and -s (--source-prefix). +# UNSUPPORTED: system-windows +RUN: rm -rf %t && mkdir %t && cd %t +RUN: cp %S/Inputs/abs-path.gcno %S/Inputs/abs-path.gcda . + +RUN: llvm-cov gcov abs-path.gcda | FileCheck %s +RUN: rm abs-path.c.gcov a.h.gcov +CHECK: File '/tmp/c/abs-path.c' +CHECK: File '/tmp/h/a.h' + +# If there is no source file with a relative path, nothing is dumped. +RUN: llvm-cov gcov -r abs-path.gcda 2>&1 | count 0 +RUN: llvm-cov gcov -r -s /t abs-path.gcda 2>&1 | count 0 +RUN: not ls abs-path.c.gcov 2> /dev/null + +# -s strips a prefix from filenames and can change filtering of -r. +RUN: llvm-cov gcov -r -s /tmp abs-path.gcda | FileCheck %s --check-prefix=STRIP1 --match-full-lines --strict-whitespace +RUN: FileCheck %s --check-prefix=STRIP1_C < abs-path.c.gcov +RUN: FileCheck %s --check-prefix=STRIP1_H < a.h.gcov + +# Test full option names. 
+RUN: llvm-cov gcov --relative-only --source-prefix=/tmp abs-path.gcda | FileCheck %s --check-prefix=STRIP1 --match-full-lines --strict-whitespace + + STRIP1:File 'c/abs-path.c' + STRIP1-NEXT:Lines executed:100.00% of 1 + STRIP1-NEXT:Creating 'abs-path.c.gcov' +STRIP1-EMPTY: + STRIP1-NEXT:File 'h/a.h' + STRIP1-NEXT:Lines executed:0.00% of 1 + STRIP1-NEXT:Creating 'a.h.gcov' + +STRIP1_C: 0:Source:c/abs-path.c +STRIP1_H: 0:Source:h/a.h + +RUN: llvm-cov gcov -r -s /tmp/h abs-path.gcda | FileCheck %s --check-prefix=STRIP2 + +STRIP2-NOT: File +STRIP2: File 'a.h' diff --git a/llvm/test/tools/llvm-dwarfdump/X86/lookup.s b/llvm/test/tools/llvm-dwarfdump/X86/lookup.s index 74f3314a4f4ec..fed2271f70a06 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/lookup.s +++ b/llvm/test/tools/llvm-dwarfdump/X86/lookup.s @@ -37,9 +37,9 @@ # LEX: DW_AT_low_pc (0x0000000000000004) # LEX: DW_AT_high_pc (0x0000000000000014) -# A: Line info: file 'foo.c', line 3, column 9, start line 1 -# B: Line info: file 'foo.c', line 4, column 6, start line 1 -# C: Line info: file 'foo.c', line 6, column 1, start line 1 +# A: Line info: file 'foo.c', line 3, column 9, start file 'foo.c', start line 1 +# B: Line info: file 'foo.c', line 4, column 6, start file 'foo.c', start line 1 +# C: Line info: file 'foo.c', line 6, column 1, start file 'foo.c', start line 1 .section __TEXT,__text,regular,pure_instructions .macosx_version_min 10, 13 diff --git a/llvm/test/tools/llvm-mca/AArch64/Exynos/load.s b/llvm/test/tools/llvm-mca/AArch64/Exynos/load.s index 04f30d353ae0d..2e90e5ab6f162 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Exynos/load.s +++ b/llvm/test/tools/llvm-mca/AArch64/Exynos/load.s @@ -20,7 +20,7 @@ ldpsw x0, x1, [sp, #8]! # ALL: Iterations: 100 # ALL-NEXT: Instructions: 1200 -# ALL-NEXT: Total Cycles: 1904 +# ALL-NEXT: Total Cycles: 1304 # M3-NEXT: Total uOps: 1600 # M4-NEXT: Total uOps: 1400 @@ -28,11 +28,11 @@ ldpsw x0, x1, [sp, #8]! 
# ALL: Dispatch Width: 6 -# M3-NEXT: uOps Per Cycle: 0.84 -# M4-NEXT: uOps Per Cycle: 0.74 -# M5-NEXT: uOps Per Cycle: 0.74 +# M3-NEXT: uOps Per Cycle: 1.23 +# M4-NEXT: uOps Per Cycle: 1.07 +# M5-NEXT: uOps Per Cycle: 1.07 -# ALL-NEXT: IPC: 0.63 +# ALL-NEXT: IPC: 0.92 # ALL-NEXT: Block RThroughput: 6.0 # ALL: Instruction Info: diff --git a/llvm/test/tools/llvm-ml/builtin_types.test b/llvm/test/tools/llvm-ml/builtin_types.test new file mode 100644 index 0000000000000..b99c491cb8dd8 --- /dev/null +++ b/llvm/test/tools/llvm-ml/builtin_types.test @@ -0,0 +1,77 @@ +# RUN: llvm-ml -filetype=asm %s | FileCheck %s + +.data + +t1_long BYTE 1 +t1_short DB 1 +t1_signed SBYTE -1 + +; CHECK-LABEL: t1_long: +; CHECK: .byte 1 +; CHECK-LABEL: t1_short: +; CHECK: .byte 1 +; CHECK-LABEL: t1_signed: +; CHECK: .byte -1 + +t2_long WORD 2 +t2_short DW 2 +t2_signed SWORD -2 + +; CHECK-LABEL: t2_long: +; CHECK: .short 2 +; CHECK-LABEL: t2_short: +; CHECK: .short 2 +; CHECK-LABEL: t2_signed: +; CHECK: .short -2 + +t3_long DWORD 3 +t3_short DD 3 +t3_signed SDWORD -3 + +; CHECK-LABEL: t3_long: +; CHECK: .long 3 +; CHECK-LABEL: t3_short: +; CHECK: .long 3 +; CHECK-LABEL: t3_signed: +; CHECK: .long -3 + +t4_long FWORD 4 +t4_short DF 4 +t4_long_large FWORD 4294967298 +t4_short_large FWORD 4294967298 + +; CHECK-LABEL: t4_long: +; CHECK-NEXT: .long 4 +; CHECK-NEXT: .short 0 +; CHECK-LABEL: t4_short: +; CHECK-NEXT: .long 4 +; CHECK-NEXT: .short 0 +; CHECK-LABEL: t4_long_large: +; CHECK-NEXT: .long 2 +; CHECK-NEXT: .short 1 +; CHECK-LABEL: t4_short_large: +; CHECK-NEXT: .long 2 +; CHECK-NEXT: .short 1 + +t5_long QWORD 4611686018427387904 +t5_short DQ 4611686018427387904 +t5_signed SQWORD -4611686018427387904 + +; CHECK-LABEL: t5_long: +; CHECK-NEXT: .quad 4611686018427387904 +; CHECK-LABEL: t5_short: +; CHECK-NEXT: .quad 4611686018427387904 +; CHECK-LABEL: t5_signed: +; CHECK-NEXT: .quad -4611686018427387904 + +t6_single REAL4 1.3 +t6_double REAL8 1.3 + +; CHECK-LABEL: t6_single: +; CHECK-NEXT: .long 1067869798 +; CHECK-LABEL: t6_double: +; CHECK-NEXT: .quad 4608533498688228557 + +.code + +END diff --git a/llvm/test/tools/llvm-ml/proc.test b/llvm/test/tools/llvm-ml/proc.test new file mode 100644 index 0000000000000..ad117f7fb1dde --- /dev/null +++ b/llvm/test/tools/llvm-ml/proc.test @@ -0,0 +1,18 @@ +# RUN: llvm-ml -m32 -filetype=asm %s | FileCheck %s +# RUN: llvm-ml -m64 -filetype=asm %s | FileCheck %s + +.code + +t1 PROC + ret +t1 ENDP + +; CHECK: .def t1 +; CHECK-NEXT: .scl 2 +; CHECK-NEXT: .type 32 +; CHECK-NEXT: .endef + +; CHECK: t1: +; CHECK: ret + +END diff --git a/llvm/test/tools/llvm-ml/proc_frame.test b/llvm/test/tools/llvm-ml/proc_frame.test new file mode 100644 index 0000000000000..3bf1c3a3ca4ba --- /dev/null +++ b/llvm/test/tools/llvm-ml/proc_frame.test @@ -0,0 +1,34 @@ +# RUN: llvm-ml -m64 -filetype=asm %s | FileCheck %s + +.code + +t1 PROC FRAME + push rbp + .pushreg rbp + mov rbp, rsp + .setframe rbp, 0 + pushfq + .allocstack 8 + .endprolog + ret +t1 ENDP + +; CHECK: .def t1 +; CHECK-NEXT: .scl 2 +; CHECK-NEXT: .type 32 +; CHECK-NEXT: .endef + +; CHECK: .seh_proc t1 + +; CHECK: t1: +; CHECK: push rbp +; CHECK: .seh_pushreg rbp +; CHECK: mov rbp, rsp +; CHECK: .seh_setframe rbp, 0 +; CHECK: pushfq +; CHECK: .seh_stackalloc 8 +; CHECK: .seh_endprologue +; CHECK: ret +; CHECK: .seh_endproc + +END diff --git a/llvm/test/tools/llvm-ml/size_inference.test b/llvm/test/tools/llvm-ml/size_inference.test new file mode 100644 index 0000000000000..c24eb51fad42a --- /dev/null +++ 
b/llvm/test/tools/llvm-ml/size_inference.test @@ -0,0 +1,27 @@ +; RUN: not llvm-ml -filetype=asm %s 2>&1 | FileCheck %s --dump-input=always + +.data + +FOO STRUCT + dword_field DWORD 3 + byte_field BYTE 4 DUP (1) +FOO ENDS + +var FOO <> + +.code + +t1 PROC + +mov eax, var.byte_field +; CHECK: error: invalid operand for instruction + +mov eax, [var].byte_field +; CHECK: error: invalid operand for instruction + +mov eax, [var.byte_field] +; CHECK: error: invalid operand for instruction + +t1 ENDP + +END diff --git a/llvm/test/tools/llvm-ml/struct.test b/llvm/test/tools/llvm-ml/struct.test index 38fc763fc7e1f..facd7c14e4f4d 100644 --- a/llvm/test/tools/llvm-ml/struct.test +++ b/llvm/test/tools/llvm-ml/struct.test @@ -78,70 +78,70 @@ t2 FOOBAR <"gh",,<10,11>,<12>,"ijk"> .code t3: -mov eax, t2.f.h -mov eax, [t2].f.h -mov eax, [t2.f.h] +mov al, t2.f.h +mov al, [t2].f.h +mov al, [t2.f.h] ; CHECK: t3: -; CHECK-NEXT: mov eax, dword ptr [rip + t2+11] -; CHECK-NEXT: mov eax, dword ptr [rip + t2+11] -; CHECK-NEXT: mov eax, dword ptr [rip + t2+11] +; CHECK-NEXT: mov al, byte ptr [rip + t2+11] +; CHECK-NEXT: mov al, byte ptr [rip + t2+11] +; CHECK-NEXT: mov al, byte ptr [rip + t2+11] t4: -mov eax, j.FOOBAR.f.h -mov eax, j.baz.b +mov al, j.FOOBAR.f.h +mov al, j.baz.b ; CHECK: t4: -; CHECK-NEXT: mov eax, dword ptr [rip + j+11] -; CHECK-NEXT: mov eax, dword ptr [rip + j+1] +; CHECK-NEXT: mov al, byte ptr [rip + j+11] +; CHECK-NEXT: mov al, byte ptr [rip + j+1] t5: -mov eax, [ebx].FOOBAR.f.h -mov eax, [ebx.FOOBAR].f.h -mov eax, [ebx.FOOBAR.f.h] +mov al, [ebx].FOOBAR.f.h +mov al, [ebx.FOOBAR].f.h +mov al, [ebx.FOOBAR.f.h] ; CHECK: t5: -; CHECK-NEXT: mov eax, dword ptr [ebx + 11] -; CHECK-NEXT: mov eax, dword ptr [ebx + 11] -; CHECK-NEXT: mov eax, dword ptr [ebx + 11] +; CHECK-NEXT: mov al, byte ptr [ebx + 11] +; CHECK-NEXT: mov al, byte ptr [ebx + 11] +; CHECK-NEXT: mov al, byte ptr [ebx + 11] t6: -mov eax, t2.FOOBAR.f.h -mov eax, [t2].FOOBAR.f.h -mov eax, [t2.FOOBAR].f.h -mov eax, [t2.FOOBAR.f.h] +mov al, t2.FOOBAR.f.h +mov al, [t2].FOOBAR.f.h +mov al, [t2.FOOBAR].f.h +mov al, [t2.FOOBAR.f.h] ; CHECK: t6: -; CHECK-NEXT: mov eax, dword ptr [rip + t2+11] -; CHECK-NEXT: mov eax, dword ptr [rip + t2+11] -; CHECK-NEXT: mov eax, dword ptr [rip + t2+11] -; CHECK-NEXT: mov eax, dword ptr [rip + t2+11] +; CHECK-NEXT: mov al, byte ptr [rip + t2+11] +; CHECK-NEXT: mov al, byte ptr [rip + t2+11] +; CHECK-NEXT: mov al, byte ptr [rip + t2+11] +; CHECK-NEXT: mov al, byte ptr [rip + t2+11] t7: -mov eax, [ebx].FOOBAR.e.b -mov eax, [ebx.FOOBAR].e.b -mov eax, [ebx.FOOBAR.e].b -mov eax, [ebx.FOOBAR.e.b] +mov al, [ebx].FOOBAR.e.b +mov al, [ebx.FOOBAR].e.b +mov al, [ebx.FOOBAR.e].b +mov al, [ebx.FOOBAR.e.b] ; CHECK: t7: -; CHECK-NEXT: mov eax, dword ptr [ebx + 9] -; CHECK-NEXT: mov eax, dword ptr [ebx + 9] -; CHECK-NEXT: mov eax, dword ptr [ebx + 9] -; CHECK-NEXT: mov eax, dword ptr [ebx + 9] +; CHECK-NEXT: mov al, byte ptr [ebx + 9] +; CHECK-NEXT: mov al, byte ptr [ebx + 9] +; CHECK-NEXT: mov al, byte ptr [ebx + 9] +; CHECK-NEXT: mov al, byte ptr [ebx + 9] t8: -mov eax, t2.FOOBAR.e.b -mov eax, [t2].FOOBAR.e.b -mov eax, [t2.FOOBAR].e.b -mov eax, [t2.FOOBAR.e].b -mov eax, [t2.FOOBAR.e.b] +mov al, t2.FOOBAR.e.b +mov al, [t2].FOOBAR.e.b +mov al, [t2.FOOBAR].e.b +mov al, [t2.FOOBAR.e].b +mov al, [t2.FOOBAR.e.b] ; CHECK: t8: -; CHECK-NEXT: mov eax, dword ptr [rip + t2+9] -; CHECK-NEXT: mov eax, dword ptr [rip + t2+9] -; CHECK-NEXT: mov eax, dword ptr [rip + t2+9] -; CHECK-NEXT: mov eax, dword ptr [rip + (t2+8)+1] -; CHECK-NEXT: mov 
eax, dword ptr [rip + t2+9] +; CHECK-NEXT: mov al, byte ptr [rip + t2+9] +; CHECK-NEXT: mov al, byte ptr [rip + t2+9] +; CHECK-NEXT: mov al, byte ptr [rip + t2+9] +; CHECK-NEXT: mov al, byte ptr [rip + (t2+8)+1] +; CHECK-NEXT: mov al, byte ptr [rip + t2+9] QUUX STRUCT u DWORD ? @@ -159,20 +159,20 @@ QUUX ENDS t9: mov eax, [ebx].QUUX.u -mov eax, [ebx].QUUX.v +mov ax, [ebx].QUUX.v mov eax, [ebx].QUUX.w -mov eax, [ebx].QUUX.x -mov eax, [ebx].QUUX.y -mov eax, [ebx].QUUX.after_struct +mov al, [ebx].QUUX.x +mov al, [ebx].QUUX.y +mov al, [ebx].QUUX.after_struct mov eax, [ebx].QUUX.z ; CHECK: t9: ; CHECK-NEXT: mov eax, dword ptr [ebx] +; CHECK-NEXT: mov ax, word ptr [ebx + 4] ; CHECK-NEXT: mov eax, dword ptr [ebx + 4] -; CHECK-NEXT: mov eax, dword ptr [ebx + 4] -; CHECK-NEXT: mov eax, dword ptr [ebx + 4] -; CHECK-NEXT: mov eax, dword ptr [ebx + 5] -; CHECK-NEXT: mov eax, dword ptr [ebx + 4] +; CHECK-NEXT: mov al, byte ptr [ebx + 4] +; CHECK-NEXT: mov al, byte ptr [ebx + 5] +; CHECK-NEXT: mov al, byte ptr [ebx + 4] ; CHECK-NEXT: mov eax, dword ptr [ebx + 8] t10: @@ -184,11 +184,11 @@ mov eax, FOOBAR.f.h ; CHECK-NEXT: mov eax, 11 t11: -mov eax, (FOOBAR PTR [ebx]).f -mov eax, (FOOBAR PTR t1).f +mov ax, (FOOBAR PTR [ebx]).f +mov ax, (FOOBAR PTR t1).f ; CHECK: t11: -; CHECK-NEXT: mov eax, dword ptr [ebx + 10] -; CHECK-NEXT: mov eax, dword ptr [rip + t1+10] +; CHECK-NEXT: mov ax, word ptr [ebx + 10] +; CHECK-NEXT: mov ax, word ptr [rip + t1+10] END diff --git a/llvm/test/tools/llvm-ml/struct_alignment.test b/llvm/test/tools/llvm-ml/struct_alignment.test new file mode 100644 index 0000000000000..cfe803872c3ba --- /dev/null +++ b/llvm/test/tools/llvm-ml/struct_alignment.test @@ -0,0 +1,44 @@ +; RUN: llvm-ml -filetype=asm %s | FileCheck %s + +.data + +FOO STRUCT 8 + f FWORD -1 +FOO ENDS + +t1 FOO <> +; CHECK-LABEL: t1: +; CHECK-NEXT: .long 4294967295 +; CHECK-NEXT: .short 65535 +; CHECK-NOT: .zero + +BAZ STRUCT + b BYTE 3 DUP (-1) + f FWORD -1 +BAZ ENDS + +FOOBAR STRUCT 8 + f1 BAZ <> + f2 BAZ <> + h BYTE -1 +FOOBAR ENDS + +t2 FOOBAR <> +; CHECK-LABEL: t2: +; CHECK-NEXT: .byte -1 +; CHECK-NEXT: .byte -1 +; CHECK-NEXT: .byte -1 +; CHECK-NEXT: .long 4294967295 +; CHECK-NEXT: .short 65535 +; CHECK-NEXT: .zero 3 +; CHECK-NEXT: .byte -1 +; CHECK-NEXT: .byte -1 +; CHECK-NEXT: .byte -1 +; CHECK-NEXT: .long 4294967295 +; CHECK-NEXT: .short 65535 +; CHECK-NEXT: .byte -1 +; CHECK-NEXT: .zero 2 + +.code + +END diff --git a/llvm/test/tools/llvm-ml/type_operators.test b/llvm/test/tools/llvm-ml/type_operators.test new file mode 100644 index 0000000000000..b8546927e3efb --- /dev/null +++ b/llvm/test/tools/llvm-ml/type_operators.test @@ -0,0 +1,237 @@ +# RUN: llvm-ml -filetype=asm %s | FileCheck %s + +.data + +FOO STRUCT 2 + x BYTE ? + y WORD 5 DUP (?) 
+FOO ENDS + +.code + +t1: +; CHECK-LABEL: t1: + +mov eax, sizeof BYTE +mov eax, (sizeof sBYTE) +mov eax, sizeof(Db) +mov eax, type BYTE +mov eax, (type sBYTE) +mov eax, type(Db) +; CHECK: mov eax, 1 +; CHECK: mov eax, 1 +; CHECK: mov eax, 1 +; CHECK: mov eax, 1 +; CHECK: mov eax, 1 +; CHECK: mov eax, 1 + +mov eax, sizeof(word) +mov eax, type(word) +; CHECK: mov eax, 2 +; CHECK: mov eax, 2 +mov eax, sizeof(dword) +mov eax, type(dword) +; CHECK: mov eax, 4 +; CHECK: mov eax, 4 +mov eax, sizeof(fword) +mov eax, type(fword) +; CHECK: mov eax, 6 +; CHECK: mov eax, 6 +mov eax, sizeof(qword) +mov eax, type(qword) +; CHECK: mov eax, 8 +; CHECK: mov eax, 8 + +mov eax, sizeof(real4) +mov eax, type(real4) +; CHECK: mov eax, 4 +; CHECK: mov eax, 4 +mov eax, sizeof(real8) +mov eax, type(real8) +; CHECK: mov eax, 8 +; CHECK: mov eax, 8 + +mov eax, sizeof(FOO) +mov eax, type(FOO) +; CHECK: mov eax, 12 +; CHECK: mov eax, 12 + + +t2_full BYTE "ab" +t2_short DB ? +t2_signed SBYTE 3 DUP (?) + +t2: +; CHECK-LABEL: t2: + +mov eax, sizeof(t2_full) +mov eax, lengthof(t2_full) +mov eax, type(t2_full) +; CHECK: mov eax, 2 +; CHECK: mov eax, 2 +; CHECK: mov eax, 1 + +mov eax, sizeof(t2_short) +mov eax, lengthof(t2_short) +mov eax, type(t2_short) +; CHECK: mov eax, 1 +; CHECK: mov eax, 1 +; CHECK: mov eax, 1 + +mov eax, sizeof(t2_signed) +mov eax, lengthof(t2_signed) +mov eax, type(t2_signed) +; CHECK: mov eax, 3 +; CHECK: mov eax, 3 +; CHECK: mov eax, 1 + + +t3_full WORD 2 DUP (?) +t3_short DW ? +t3_signed SWORD 3 DUP (?) + +t3: +; CHECK-LABEL: t3: + +mov eax, sizeof(t3_full) +mov eax, lengthof(t3_full) +mov eax, type(t3_full) +; CHECK: mov eax, 4 +; CHECK: mov eax, 2 +; CHECK: mov eax, 2 + +mov eax, sizeof(t3_short) +mov eax, lengthof(t3_short) +mov eax, type(t3_short) +; CHECK: mov eax, 2 +; CHECK: mov eax, 1 +; CHECK: mov eax, 2 + +mov eax, sizeof(t3_signed) +mov eax, lengthof(t3_signed) +mov eax, type(t3_signed) +; CHECK: mov eax, 6 +; CHECK: mov eax, 3 +; CHECK: mov eax, 2 + + +t4_full DWORD 2 DUP (?) +t4_short DD ? +t4_signed SDWORD 3 DUP (?) + +t4: +; CHECK-LABEL: t4: + +mov eax, sizeof(t4_full) +mov eax, lengthof(t4_full) +mov eax, type(t4_full) +; CHECK: mov eax, 8 +; CHECK: mov eax, 2 +; CHECK: mov eax, 4 + +mov eax, sizeof(t4_short) +mov eax, lengthof(t4_short) +mov eax, type(t4_short) +; CHECK: mov eax, 4 +; CHECK: mov eax, 1 +; CHECK: mov eax, 4 + +mov eax, sizeof(t4_signed) +mov eax, lengthof(t4_signed) +mov eax, type(t4_signed) +; CHECK: mov eax, 12 +; CHECK: mov eax, 3 +; CHECK: mov eax, 4 + + +t5_full FWORD 2 DUP (?) +t5_short DF ? + +t5: +; CHECK-LABEL: t5: + +mov eax, sizeof(t5_full) +mov eax, lengthof(t5_full) +mov eax, type(t5_full) +; CHECK: mov eax, 12 +; CHECK: mov eax, 2 +; CHECK: mov eax, 6 + +mov eax, sizeof(t5_short) +mov eax, lengthof(t5_short) +mov eax, type(t5_short) +; CHECK: mov eax, 6 +; CHECK: mov eax, 1 +; CHECK: mov eax, 6 + + +t6_full QWORD 2 DUP (?) +t6_short DQ ? +t6_signed SQWORD 3 DUP (?) + +t6: +; CHECK-LABEL: t6: + +mov eax, sizeof(t6_full) +mov eax, lengthof(t6_full) +mov eax, type(t6_full) +; CHECK: mov eax, 16 +; CHECK: mov eax, 2 +; CHECK: mov eax, 8 + +mov eax, sizeof(t6_short) +mov eax, lengthof(t6_short) +mov eax, type(t6_short) +; CHECK: mov eax, 8 +; CHECK: mov eax, 1 +; CHECK: mov eax, 8 + +mov eax, sizeof(t6_signed) +mov eax, lengthof(t6_signed) +mov eax, type(t6_signed) +; CHECK: mov eax, 24 +; CHECK: mov eax, 3 +; CHECK: mov eax, 8 + + +t7_single REAL4 2 DUP (?) +t7_double REAL8 ? 
+ +t7: +; CHECK-LABEL: t7: + +mov eax, sizeof(t7_single) +mov eax, lengthof(t7_single) +mov eax, type(t7_single) +; CHECK: mov eax, 8 +; CHECK: mov eax, 2 +; CHECK: mov eax, 4 + +mov eax, sizeof(t7_double) +mov eax, lengthof(t7_double) +mov eax, type(t7_double) +; CHECK: mov eax, 8 +; CHECK: mov eax, 1 +; CHECK: mov eax, 8 + + +t8_var FOO <>, <> + +t8: +; CHECK-LABEL: t8: + +mov eax, sizeof(t8_var) +mov eax, lengthof(t8_var) +mov eax, type(t8_var) +; CHECK: mov eax, 24 +; CHECK: mov eax, 2 +; CHECK: mov eax, 12 + +mov eax, sizeof(t8_var.y) +mov eax, lengthof(t8_var.y) +mov eax, type(t8_var.y) +; CHECK: mov eax, 10 +; CHECK: mov eax, 5 +; CHECK: mov eax, 2 + +END diff --git a/llvm/test/tools/llvm-objcopy/ELF/objcopy-version.test b/llvm/test/tools/llvm-objcopy/ELF/objcopy-version.test deleted file mode 100644 index 7494ccd2866d3..0000000000000 --- a/llvm/test/tools/llvm-objcopy/ELF/objcopy-version.test +++ /dev/null @@ -1,4 +0,0 @@ -# RUN: llvm-objcopy --version | FileCheck %s -# RUN: llvm-objcopy -V | FileCheck %s - -# CHECK: {{ version }} diff --git a/llvm/test/tools/llvm-objcopy/ELF/strip-version.test b/llvm/test/tools/llvm-objcopy/ELF/strip-version.test deleted file mode 100644 index 4b2f137ce2aad..0000000000000 --- a/llvm/test/tools/llvm-objcopy/ELF/strip-version.test +++ /dev/null @@ -1,5 +0,0 @@ -# RUN: llvm-strip --version | FileCheck %s -# RUN: llvm-strip -V | FileCheck %s - -# CHECK-DAG: {{ version }} -# CHECK-DAG: GNU strip diff --git a/llvm/test/tools/llvm-objcopy/MachO/install-name-tool-version.test b/llvm/test/tools/llvm-objcopy/MachO/install-name-tool-version.test deleted file mode 100644 index 295e573561012..0000000000000 --- a/llvm/test/tools/llvm-objcopy/MachO/install-name-tool-version.test +++ /dev/null @@ -1,2 +0,0 @@ -# RUN: llvm-install-name-tool --version | FileCheck %s -# CHECK: {{ version }} diff --git a/llvm/test/tools/llvm-objcopy/tool-help-message.test b/llvm/test/tools/llvm-objcopy/tool-help-message.test index 1a0712b7a7ce5..3f99d910ee97e 100644 --- a/llvm/test/tools/llvm-objcopy/tool-help-message.test +++ b/llvm/test/tools/llvm-objcopy/tool-help-message.test @@ -18,6 +18,7 @@ # RUN: not llvm-install-name-tool -abcabc 2>&1 | FileCheck --check-prefix=UNKNOWN-ARG %s # RUN: not llvm-install-name-tool --abcabc 2>&1 | FileCheck --check-prefix=UNKNOWN-ARG %s # RUN: not llvm-install-name-tool -add_rpath @executable 2>&1 | FileCheck %s --check-prefix=NO-INPUT-FILES +# RUN: not llvm-install-name-tool -add_rpath @executable f1 f2 2>&1 | FileCheck %s --check-prefix=MULTIPLE-INPUT-FILES # OBJCOPY-USAGE: USAGE: llvm-objcopy [options] input [output] # OBJCOPY-USAGE: Pass @FILE as argument to read options from FILE. 
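
The data-directive expectations in builtin_types.test above are just the raw encodings of the initializers: REAL4/REAL8 emit the IEEE-754 bit pattern of the literal, and FWORD emits a 48-bit integer as a 32-bit low half followed by a 16-bit high half. A minimal Python sketch reproducing those constants (not part of the patch; the helper name fword_parts is invented for illustration):

import struct

# REAL4/REAL8 initializers become the IEEE-754 bit patterns of the literal,
# matching the .long/.quad values expected for t6 in builtin_types.test.
assert struct.unpack('<I', struct.pack('<f', 1.3))[0] == 1067869798
assert struct.unpack('<Q', struct.pack('<d', 1.3))[0] == 4608533498688228557

def fword_parts(value):
    """Split a 48-bit FWORD initializer into its (.long, .short) halves."""
    assert 0 <= value < 1 << 48
    return value & 0xFFFFFFFF, value >> 32

assert fword_parts(4) == (4, 0)             # t4_long: .long 4 / .short 0
assert fword_parts(4294967298) == (2, 1)    # 0x1_0000_0002: .long 2 / .short 1
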
@@ -30,3 +31,4 @@ # UNKNOWN-ARG: unknown argument '{{-+}}abcabc' # NO-INPUT-FILES: no input file specified +# MULTIPLE-INPUT-FILES: expects a single input file diff --git a/llvm/test/tools/llvm-objcopy/tool-version.test b/llvm/test/tools/llvm-objcopy/tool-version.test new file mode 100644 index 0000000000000..a6cc8f96221d2 --- /dev/null +++ b/llvm/test/tools/llvm-objcopy/tool-version.test @@ -0,0 +1,16 @@ +# RUN: llvm-objcopy --version | FileCheck --check-prefix=OBJCOPY %s +# RUN: llvm-objcopy -V | FileCheck --check-prefix=OBJCOPY %s + +# RUN: llvm-strip --version | FileCheck --check-prefix=STRIP %s +# RUN: llvm-strip -V | FileCheck --check-prefix=STRIP %s + +# RUN: llvm-install-name-tool --version | FileCheck %s +# RUN: llvm-install-name-tool -V | FileCheck %s + +# OBJCOPY-DAG: {{ version }} +# OBJCOPY-DAG: GNU objcopy + +# STRIP-DAG: {{ version }} +# STRIP-DAG: GNU strip + +# CHECK: {{ version }} diff --git a/llvm/test/tools/llvm-rc/Inputs/tag-versioninfo.rc b/llvm/test/tools/llvm-rc/Inputs/tag-versioninfo.rc index 54dbff55067cb..4b567dabcb2bc 100644 --- a/llvm/test/tools/llvm-rc/Inputs/tag-versioninfo.rc +++ b/llvm/test/tools/llvm-rc/Inputs/tag-versioninfo.rc @@ -1,6 +1,6 @@ 1 VERSIONINFO FILEVERSION 1, 2, 3, 4 -PRODUCTVERSION 5, 6, 7, 8 +PRODUCTVERSION 5, 6, 7 FILEFLAGSMASK 50 FILEFLAGS 555 FILEOS 110 diff --git a/llvm/test/tools/llvm-rc/tag-versioninfo.test b/llvm/test/tools/llvm-rc/tag-versioninfo.test index 92c91972a221f..3ce534b880960 100644 --- a/llvm/test/tools/llvm-rc/tag-versioninfo.test +++ b/llvm/test/tools/llvm-rc/tag-versioninfo.test @@ -14,7 +14,7 @@ ; CHECK-NEXT: 0000: A0023400 00005600 53005F00 56004500 |..4...V.S._.V.E.| ; CHECK-NEXT: 0010: 52005300 49004F00 4E005F00 49004E00 |R.S.I.O.N._.I.N.| ; CHECK-NEXT: 0020: 46004F00 00000000 BD04EFFE 00000100 |F.O.............| -; CHECK-NEXT: 0030: 02000100 04000300 06000500 08000700 |................| +; CHECK-NEXT: 0030: 02000100 04000300 06000500 00000700 |................| ; CHECK-NEXT: 0040: 32000000 2B020000 6E000000 237A0800 |2...+...n...#z..| ; CHECK-NEXT: 0050: 0E000000 00000000 00000000 00020000 |................| ; CHECK-NEXT: 0060: 01005300 74007200 69006E00 67004600 |..S.t.r.i.n.g.F.| diff --git a/llvm/test/tools/llvm-readobj/COFF/arm64-packed-epilog.s b/llvm/test/tools/llvm-readobj/COFF/arm64-packed-epilog.s new file mode 100644 index 0000000000000..c3bfe5a9cf559 --- /dev/null +++ b/llvm/test/tools/llvm-readobj/COFF/arm64-packed-epilog.s @@ -0,0 +1,34 @@ +// REQUIRES: aarch64-registered-target +// RUN: llvm-mc -filetype=obj -triple aarch64-windows %s -o %t.o +// RUN: llvm-readobj --unwind %t.o | FileCheck %s + +// CHECK: ExceptionData { +// CHECK-NEXT: FunctionLength: 4 +// CHECK-NEXT: Version: 0 +// CHECK-NEXT: ExceptionData: Yes +// CHECK-NEXT: EpiloguePacked: Yes +// CHECK-NEXT: EpilogueOffset: 0 +// CHECK-NEXT: ByteCodeLength: 4 +// CHECK-NEXT: Prologue [ +// CHECK-NEXT: 0xe4 ; end +// CHECK-NEXT: ] +// CHECK-NEXT: ExceptionHandler [ +// CHECK-NEXT: Routine: 0x11223344 +// CHECK-NEXT: Parameter: 0x55667788 +// CHECK-NEXT: ] + +.section .pdata,"dr" + .long func@IMGREL + .long "$unwind$func"@IMGREL + + .text + .globl func +func: + ret + +.section .xdata,"dr" +"$unwind$func": +.byte 0x01, 0x00, 0x30, 0x08 +.byte 0xe4, 0xe3, 0xe3, 0xe3 +.byte 0x44, 0x33, 0x22, 0x11 +.byte 0x88, 0x77, 0x66, 0x55 diff --git a/llvm/test/tools/llvm-readobj/COFF/arm64-packed-unwind.s b/llvm/test/tools/llvm-readobj/COFF/arm64-packed-unwind.s new file mode 100644 index 0000000000000..f8c4d5e3074f9 --- /dev/null +++ 
b/llvm/test/tools/llvm-readobj/COFF/arm64-packed-unwind.s @@ -0,0 +1,332 @@ +## Check interpretation of the packed unwind info format. + +// REQUIRES: aarch64-registered-target +// RUN: llvm-mc -filetype=obj -triple aarch64-windows %s -o %t.o +// RUN: llvm-readobj --unwind %t.o | FileCheck %s + +// CHECK: UnwindInformation [ +// CHECK-NEXT: RuntimeFunction { +// CHECK-NEXT: Function: func1 +// CHECK-NEXT: Fragment: No +// CHECK-NEXT: FunctionLength: 88 +// CHECK-NEXT: RegF: 7 +// CHECK-NEXT: RegI: 10 +// CHECK-NEXT: HomedParameters: No +// CHECK-NEXT: CR: 0 +// CHECK-NEXT: FrameSize: 160 +// CHECK-NEXT: Prologue [ +// CHECK-NEXT: sub sp, sp, #16 +// CHECK-NEXT: stp d14, d15, [sp, #128] +// CHECK-NEXT: stp d12, d13, [sp, #112] +// CHECK-NEXT: stp d10, d11, [sp, #96] +// CHECK-NEXT: stp d8, d9, [sp, #80] +// CHECK-NEXT: stp x27, x28, [sp, #64] +// CHECK-NEXT: stp x25, x26, [sp, #48] +// CHECK-NEXT: stp x23, x24, [sp, #32] +// CHECK-NEXT: stp x21, x22, [sp, #16] +// CHECK-NEXT: stp x19, x20, [sp, #-144]! +// CHECK-NEXT: end +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: RuntimeFunction { +// CHECK-NEXT: Function: func2 +// CHECK-NEXT: Fragment: No +// CHECK-NEXT: FunctionLength: 48 +// CHECK-NEXT: RegF: 2 +// CHECK-NEXT: RegI: 3 +// CHECK-NEXT: HomedParameters: No +// CHECK-NEXT: CR: 0 +// CHECK-NEXT: FrameSize: 48 +// CHECK-NEXT: Prologue [ +// CHECK-NEXT: str d10, [sp, #40] +// CHECK-NEXT: stp d8, d9, [sp, #24] +// CHECK-NEXT: str x21, [sp, #16] +// CHECK-NEXT: stp x19, x20, [sp, #-48]! +// CHECK-NEXT: end +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: RuntimeFunction { +// CHECK-NEXT: Function: func3 +// CHECK-NEXT: Fragment: No +// CHECK-NEXT: FunctionLength: 40 +// CHECK-NEXT: RegF: 3 +// CHECK-NEXT: RegI: 1 +// CHECK-NEXT: HomedParameters: No +// CHECK-NEXT: CR: 0 +// CHECK-NEXT: FrameSize: 48 +// CHECK-NEXT: Prologue [ +// CHECK-NEXT: stp d10, d11, [sp, #24] +// CHECK-NEXT: stp d8, d9, [sp, #8] +// CHECK-NEXT: str x19, [sp, #-48]! +// CHECK-NEXT: end +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: RuntimeFunction { +// CHECK-NEXT: Function: func4 +// CHECK-NEXT: Fragment: No +// CHECK-NEXT: FunctionLength: 24 +// CHECK-NEXT: RegF: 1 +// CHECK-NEXT: RegI: 0 +// CHECK-NEXT: HomedParameters: No +// CHECK-NEXT: CR: 0 +// CHECK-NEXT: FrameSize: 48 +// CHECK-NEXT: Prologue [ +// CHECK-NEXT: sub sp, sp, #32 +// CHECK-NEXT: stp d8, d9, [sp, #-16]! +// CHECK-NEXT: end +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: RuntimeFunction { +// CHECK-NEXT: Function: func5 +// CHECK-NEXT: Fragment: No +// CHECK-NEXT: FunctionLength: 56 +// CHECK-NEXT: RegF: 0 +// CHECK-NEXT: RegI: 1 +// CHECK-NEXT: HomedParameters: Yes +// CHECK-NEXT: CR: 0 +// CHECK-NEXT: FrameSize: 112 +// CHECK-NEXT: Prologue [ +// CHECK-NEXT: sub sp, sp, #32 +// CHECK-NEXT: stp x6, x7, [sp, #56] +// CHECK-NEXT: stp x4, x5, [sp, #40] +// CHECK-NEXT: stp x2, x3, [sp, #24] +// CHECK-NEXT: stp x0, x1, [sp, #8] +// CHECK-NEXT: str x19, [sp, #-80]! +// CHECK-NEXT: end +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: RuntimeFunction { +// CHECK-NEXT: Function: func6 +// CHECK-NEXT: Fragment: No +// CHECK-NEXT: FunctionLength: 48 +// CHECK-NEXT: RegF: 0 +// CHECK-NEXT: RegI: 0 +// CHECK-NEXT: HomedParameters: Yes +// CHECK-NEXT: CR: 0 +// CHECK-NEXT: FrameSize: 112 +// CHECK-NEXT: Prologue [ +// CHECK-NEXT: sub sp, sp, #48 +// CHECK-NEXT: stp x6, x7, [sp, #48] +// CHECK-NEXT: stp x4, x5, [sp, #32] +// CHECK-NEXT: stp x2, x3, [sp, #16] +// CHECK-NEXT: stp x0, x1, [sp, #-64]! 
+// CHECK-NEXT: end +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: RuntimeFunction { +// CHECK-NEXT: Function: func7 +// CHECK-NEXT: Fragment: No +// CHECK-NEXT: FunctionLength: 24 +// CHECK-NEXT: RegF: 0 +// CHECK-NEXT: RegI: 0 +// CHECK-NEXT: HomedParameters: No +// CHECK-NEXT: CR: 1 +// CHECK-NEXT: FrameSize: 32 +// CHECK-NEXT: Prologue [ +// CHECK-NEXT: sub sp, sp, #16 +// CHECK-NEXT: str lr, [sp, #-16]! +// CHECK-NEXT: end +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: RuntimeFunction { +// CHECK-NEXT: Function: func8 +// CHECK-NEXT: Fragment: No +// CHECK-NEXT: FunctionLength: 24 +// CHECK-NEXT: RegF: 0 +// CHECK-NEXT: RegI: 1 +// CHECK-NEXT: HomedParameters: No +// CHECK-NEXT: CR: 1 +// CHECK-NEXT: FrameSize: 32 +// CHECK-NEXT: Prologue [ +// CHECK-NEXT: sub sp, sp, #16 +// CHECK-NEXT: stp x19, lr, [sp, #-16]! +// CHECK-NEXT: end +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: RuntimeFunction { +// CHECK-NEXT: Function: func9 +// CHECK-NEXT: Fragment: No +// CHECK-NEXT: FunctionLength: 32 +// CHECK-NEXT: RegF: 0 +// CHECK-NEXT: RegI: 2 +// CHECK-NEXT: HomedParameters: No +// CHECK-NEXT: CR: 1 +// CHECK-NEXT: FrameSize: 32 +// CHECK-NEXT: Prologue [ +// CHECK-NEXT: str lr, [sp, #16] +// CHECK-NEXT: stp x19, x20, [sp, #-32]! +// CHECK-NEXT: end +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: RuntimeFunction { +// CHECK-NEXT: Function: func10 +// CHECK-NEXT: Fragment: No +// CHECK-NEXT: FunctionLength: 32 +// CHECK-NEXT: RegF: 0 +// CHECK-NEXT: RegI: 3 +// CHECK-NEXT: HomedParameters: No +// CHECK-NEXT: CR: 1 +// CHECK-NEXT: FrameSize: 48 +// CHECK-NEXT: Prologue [ +// CHECK-NEXT: sub sp, sp, #16 +// CHECK-NEXT: stp x21, lr, [sp, #16] +// CHECK-NEXT: stp x19, x20, [sp, #-32]! +// CHECK-NEXT: end +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: RuntimeFunction { +// CHECK-NEXT: Function: func11 +// CHECK-NEXT: Fragment: No +// CHECK-NEXT: FunctionLength: 32 +// CHECK-NEXT: RegF: 0 +// CHECK-NEXT: RegI: 2 +// CHECK-NEXT: HomedParameters: No +// CHECK-NEXT: CR: 3 +// CHECK-NEXT: FrameSize: 48 +// CHECK-NEXT: Prologue [ +// CHECK-NEXT: mov x29, sp +// CHECK-NEXT: stp x29, lr, [sp, #-32]! +// CHECK-NEXT: stp x19, x20, [sp, #-16]! +// CHECK-NEXT: end +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: RuntimeFunction { +// CHECK-NEXT: Function: func12 +// CHECK-NEXT: Fragment: No +// CHECK-NEXT: FunctionLength: 40 +// CHECK-NEXT: RegF: 0 +// CHECK-NEXT: RegI: 2 +// CHECK-NEXT: HomedParameters: No +// CHECK-NEXT: CR: 3 +// CHECK-NEXT: FrameSize: 544 +// CHECK-NEXT: Prologue [ +// CHECK-NEXT: mov x29, sp +// CHECK-NEXT: stp x29, lr, [sp, #0] +// CHECK-NEXT: sub sp, sp, #528 +// CHECK-NEXT: stp x19, x20, [sp, #-16]! +// CHECK-NEXT: end +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: RuntimeFunction { +// CHECK-NEXT: Function: func13 +// CHECK-NEXT: Fragment: No +// CHECK-NEXT: FunctionLength: 48 +// CHECK-NEXT: RegF: 0 +// CHECK-NEXT: RegI: 2 +// CHECK-NEXT: HomedParameters: No +// CHECK-NEXT: CR: 3 +// CHECK-NEXT: FrameSize: 4112 +// CHECK-NEXT: Prologue [ +// CHECK-NEXT: mov x29, sp +// CHECK-NEXT: stp x29, lr, [sp, #0] +// CHECK-NEXT: sub sp, sp, #16 +// CHECK-NEXT: sub sp, sp, #4080 +// CHECK-NEXT: stp x19, x20, [sp, #-16]! 
+// CHECK-NEXT: end +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: RuntimeFunction { +// CHECK-NEXT: Function: func14 +// CHECK-NEXT: Fragment: No +// CHECK-NEXT: FunctionLength: 32 +// CHECK-NEXT: RegF: 0 +// CHECK-NEXT: RegI: 2 +// CHECK-NEXT: HomedParameters: No +// CHECK-NEXT: CR: 0 +// CHECK-NEXT: FrameSize: 4112 +// CHECK-NEXT: Prologue [ +// CHECK-NEXT: sub sp, sp, #16 +// CHECK-NEXT: sub sp, sp, #4080 +// CHECK-NEXT: stp x19, x20, [sp, #-16]! +// CHECK-NEXT: end +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: RuntimeFunction { +// CHECK-NEXT: Function: func15 +// CHECK-NEXT: Fragment: No +// CHECK-NEXT: FunctionLength: 24 +// CHECK-NEXT: RegF: 0 +// CHECK-NEXT: RegI: 2 +// CHECK-NEXT: HomedParameters: No +// CHECK-NEXT: CR: 0 +// CHECK-NEXT: FrameSize: 560 +// CHECK-NEXT: Prologue [ +// CHECK-NEXT: sub sp, sp, #544 +// CHECK-NEXT: stp x19, x20, [sp, #-16]! +// CHECK-NEXT: end +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: RuntimeFunction { +// CHECK-NEXT: Function: func16 +// CHECK-NEXT: Fragment: No +// CHECK-NEXT: FunctionLength: 56 +// CHECK-NEXT: RegF: 0 +// CHECK-NEXT: RegI: 0 +// CHECK-NEXT: HomedParameters: Yes +// CHECK-NEXT: CR: 1 +// CHECK-NEXT: FrameSize: 112 +// CHECK-NEXT: Prologue [ +// CHECK-NEXT: sub sp, sp, #32 +// CHECK-NEXT: stp x6, x7, [sp, #56] +// CHECK-NEXT: stp x4, x5, [sp, #40] +// CHECK-NEXT: stp x2, x3, [sp, #24] +// CHECK-NEXT: stp x0, x1, [sp, #8] +// CHECK-NEXT: str lr, [sp, #-80]! +// CHECK-NEXT: end +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: ] + + .text + .globl func1 +func1: +func2: +func3: +func4: +func5: +func6: +func7: +func8: +func9: +func10: +func11: +func12: +func13: +func14: +func15: +func16: + ret + + .section .pdata,"dr" + .long func1@IMGREL + .long 0x050ae059 // FunctionLength=22 RegF=7 RegI=10 H=0 CR=0 FrameSize=10 + .long func2@IMGREL + .long 0x01834031 // FunctionLength=12 RegF=2 RegI=3 H=0 CR=0 FrameSize=3 + .long func3@IMGREL + .long 0x01816029 // FunctionLength=10 RegF=3 RegI=1 H=0 CR=0 FrameSize=3 + .long func4@IMGREL + .long 0x01802019 // FunctionLength=6 RegF=1 RegI=0 H=0 CR=0 FrameSize=3 + .long func5@IMGREL + .long 0x03910039 // FunctionLength=14 RegF=0 RegI=1 H=1 CR=0 FrameSize=7 + .long func6@IMGREL + .long 0x03900031 // FunctionLength=12 RegF=0 RegI=0 H=1 CR=0 FrameSize=7 + .long func7@IMGREL + .long 0x01200019 // FunctionLength=6 RegF=0 RegI=0 H=0 CR=1 FrameSize=2 + .long func8@IMGREL + .long 0x01210019 // FunctionLength=6 RegF=0 RegI=1 H=0 CR=1 FrameSize=2 + .long func9@IMGREL + .long 0x01220021 // FunctionLength=8 RegF=0 RegI=2 H=0 CR=1 FrameSize=2 + .long func10@IMGREL + .long 0x01a30021 // FunctionLength=8 RegF=0 RegI=3 H=0 CR=1 FrameSize=3 + .long func11@IMGREL + .long 0x01e20021 // FunctionLength=8 RegF=0 RegI=2 H=0 CR=3 FrameSize=3 + .long func12@IMGREL + .long 0x11620029 // FunctionLength=10 RegF=0 RegI=2 H=0 CR=3 FrameSize=34 + .long func13@IMGREL + .long 0x80e20031 // FunctionLength=12 RegF=0 RegI=2 H=0 CR=3 FrameSize=257 + .long func14@IMGREL + .long 0x80820021 // FunctionLength=8 RegF=0 RegI=2 H=0 CR=0 FrameSize=257 + .long func15@IMGREL + .long 0x11820019 // FunctionLength=6 RegF=0 RegI=2 H=0 CR=0 FrameSize=34 + .long func16@IMGREL + .long 0x03b00039 // FunctionLength=14 RegF=0 RegI=0 H=1 CR=1 FrameSize=7 diff --git a/llvm/test/tools/llvm-readobj/COFF/arm64-unwind-opcodes.s b/llvm/test/tools/llvm-readobj/COFF/arm64-unwind-opcodes.s index 98e2da8fb226b..8ac8f6c98e272 100644 --- a/llvm/test/tools/llvm-readobj/COFF/arm64-unwind-opcodes.s +++ 
b/llvm/test/tools/llvm-readobj/COFF/arm64-unwind-opcodes.s @@ -1,12 +1,25 @@ // REQUIRES: aarch64-registered-target -// RUN: llvm-mc -filetype=obj -triple aarch64-windows %s -o - \ -// RUN: | llvm-readobj --unwind - | FileCheck %s +// RUN: llvm-mc -filetype=obj -triple aarch64-windows %s -o %t.o +// RUN: llvm-readobj --unwind %t.o | FileCheck --strict-whitespace %s // CHECK: Prologue [ +// CHECK-NEXT: 0xe202 ; add fp, sp, #16 +// CHECK-NEXT: 0xe1 ; mov fp, sp // CHECK-NEXT: 0xdc01 ; str d8, [sp, #8] // CHECK-NEXT: 0xd400 ; str x19, [sp, #-8]! // CHECK-NEXT: 0xe4 ; end // CHECK-NEXT: ] +// CHECK-NEXT: EpilogueScopes [ +// CHECK-NEXT: EpilogueScope { +// CHECK-NEXT: StartOffset: +// CHECK-NEXT: EpilogueStartIndex: +// CHECK-NEXT: Opcodes [ +// CHECK-NEXT: 0xe202 ; sub sp, fp, #16 +// CHECK-NEXT: 0xe1 ; mov sp, fp +// CHECK-NEXT: 0xe4 ; end +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: ] .section .pdata,"dr" .long func@IMGREL @@ -16,9 +29,18 @@ .globl func func: str x19, [sp, #-8]! - str d8, [sp, #8] + str d8, [sp, #8] + mov x29, sp + add x29, sp, #16 + nop + sub sp, x29, #16 + mov sp, x29 ret .section .xdata,"dr" "$unwind$func": -.long 0x10000002, 0x00d401dc, 0xe3e3e3e4 +.byte 0x08, 0x00, 0x40, 0x18 +.byte 0x05, 0x00, 0x00, 0x02 +.byte 0xe2, 0x02, 0xe1, 0xdc +.byte 0x01, 0xd4, 0x00, 0xe4 +.byte 0xe2, 0x02, 0xe1, 0xe4 diff --git a/llvm/test/tools/llvm-readobj/ELF/addrsig.test b/llvm/test/tools/llvm-readobj/ELF/addrsig.test index f6e29c7a46819..24621d80f79e6 100644 --- a/llvm/test/tools/llvm-readobj/ELF/addrsig.test +++ b/llvm/test/tools/llvm-readobj/ELF/addrsig.test @@ -31,12 +31,15 @@ Symbols: # RUN: llvm-readobj --all %t1.o | FileCheck %s --check-prefix LLVM # RUN: llvm-readelf --all %t1.o 2>&1 | FileCheck %s --implicit-check-not=warning --implicit-check-not=error -## Check we report a warning when SHT_LLVM_ADDRSIG is broken (e.g. contains a malformed uleb128). +## Check we report a warning when the content of the SHT_LLVM_ADDRSIG section +## is broken (e.g. contains a malformed uleb128). -# RUN: yaml2obj --docnum=2 %s -o %t2.o -# RUN: llvm-readobj --addrsig %t2.o 2>&1 | FileCheck %s -DFILE=%t2.o --check-prefix=MALFORMED +# RUN: yaml2obj --docnum=2 %s -o %t2.1.o +# RUN: llvm-readobj --addrsig %t2.1.o 2>&1 | FileCheck %s -DFILE=%t2.1.o --check-prefix=MALFORMED -# MALFORMED: warning: '[[FILE]]': malformed uleb128, extends past end +# MALFORMED: Addrsig [ +# MALFORMED-NEXT: warning: '[[FILE]]': unable to decode SHT_LLVM_ADDRSIG section with index 1: malformed uleb128, extends past end +# MALFORMED-NEXT: ] --- !ELF FileHeader: @@ -44,9 +47,19 @@ FileHeader: Data: ELFDATA2LSB Type: ET_DYN Sections: - - Name: .llvm_addrsig - Type: SHT_LLVM_ADDRSIG - Content: "FF" + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Content: "FF" + ShOffset: [[OFFSET=]] + +## Check we report a warning when the content of the SHT_LLVM_ADDRSIG section can't be read. + +# RUN: yaml2obj --docnum=2 -DOFFSET=0xffffffff %s -o %t2.2.o +# RUN: llvm-readobj --addrsig %t2.2.o 2>&1 | FileCheck %s -DFILE=%t2.2.o --check-prefix=BROKEN-SEC + +# BROKEN-SEC: Addrsig [ +# BROKEN-SEC-NEXT: warning: '[[FILE]]': section [index 1] has a sh_offset (0xffffffff) + sh_size (0x1) that is greater than the file size (0x168) +# BROKEN-SEC-NEXT: ] ## Check we report a warning when SHT_LLVM_ADDRSIG references a symbol that can't be ## dumped (e.g. the index value is larger than the number of symbols in .symtab). 
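
The MALFORMED case in addrsig.test above comes from ULEB128's continuation bit: a lone 0xFF byte has bit 7 set, so the decoder expects another byte that lies past the end of the section. A minimal decoder sketch in Python (an illustration, not LLVM's implementation) showing why Content: "FF" is rejected:

def decode_uleb128(data, offset=0):
    """Decode one ULEB128 value; raise if it runs past the end of the buffer."""
    result, shift = 0, 0
    while True:
        if offset >= len(data):
            raise ValueError("malformed uleb128, extends past end")
        byte = data[offset]
        offset += 1
        result |= (byte & 0x7F) << shift
        if not byte & 0x80:          # high bit clear: last byte of the value
            return result, offset
        shift += 7

assert decode_uleb128(b"\x2a") == (42, 1)
try:
    decode_uleb128(b"\xff")          # the Content: "FF" from the test above
except ValueError as e:
    print(e)                         # malformed uleb128, extends past end
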
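Relatedly, the hand-annotated .long descriptors in arm64-packed-unwind.s above follow the documented Windows ARM64 packed .pdata layout: Flag in bits 0-1, FunctionLength (in 4-byte units) in bits 2-12, RegF in bits 13-15, RegI in bits 16-19, H in bit 20, CR in bits 21-22, and FrameSize (in 16-byte units) in bits 23-31. A Python sketch of that decoding (an illustration, not llvm-readobj's code):

def decode_packed_unwind(word):
    """Decode a packed ARM64 .pdata unwind word into its bit fields.
    Scaled fields are returned in bytes, as llvm-readobj prints them."""
    assert word & 0x3 == 1, "flag 1 = packed unwind data"
    return {
        "FunctionLength": ((word >> 2) & 0x7FF) * 4,   # units of 4 bytes
        "RegF": (word >> 13) & 0x7,
        "RegI": (word >> 16) & 0xF,
        "HomedParameters": bool((word >> 20) & 0x1),
        "CR": (word >> 21) & 0x3,
        "FrameSize": ((word >> 23) & 0x1FF) * 16,      # units of 16 bytes
    }

# func1's descriptor: RegF=7 RegI=10 H=0 CR=0, per the comments in the test.
print(decode_packed_unwind(0x050AE059))

Running it on func1's word 0x050AE059 yields FunctionLength 88 and FrameSize 160, matching the CHECK lines above.
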
diff --git a/llvm/test/tools/llvm-readobj/ELF/dyn-symbols-size-from-hash-table.test b/llvm/test/tools/llvm-readobj/ELF/dyn-symbols-size-from-hash-table.test index df9ff8d95ecad..dc421c14eae90 100644 --- a/llvm/test/tools/llvm-readobj/ELF/dyn-symbols-size-from-hash-table.test +++ b/llvm/test/tools/llvm-readobj/ELF/dyn-symbols-size-from-hash-table.test @@ -324,3 +324,94 @@ ProgramHeaders: # LLVM3: DynamicSymbols [ # LLVM3: ] + +## Case 4: The size of the dynamic symbol table, inferred from the hash table, is broken. +## It is so large that symbol table goes past the end of the file. We have a dynamic +## relocation which refers to a symbol with an index that is also too large to be +## in the file. Check we report a warning when trying to dump this relocation. + +# RUN: yaml2obj --docnum=3 %s -o %t4.1 + +## Remember the size of the output produced. +# RUN: wc -c %t4.1 > %t4.out.gnu.txt +# RUN: llvm-readelf --sections --dyn-relocations %t4.1 >> %t4.out.gnu.txt 2>&1 +# RUN: FileCheck %s -DFILE=%t4.1 --input-file=%t4.out.gnu.txt --check-prefix=BROKEN-NCHAIN-GNU + +# BROKEN-NCHAIN-GNU: [[#%u, FILESIZE:]] +# BROKEN-NCHAIN-GNU: warning: '[[FILE]]': the size (0x17ffffffe8) of the dynamic symbol table at 0x[[#%x, DYNSYMOFF:]], derived from the hash table, goes past the end of the file (0x[[#%x, FILESIZE]]) and will be ignored + +# BROKEN-NCHAIN-GNU: [Nr] Name Type Address Off +# BROKEN-NCHAIN-GNU: [ 1] .rela.plt RELA 0000000000001000 0000[[#%x, RELAOFF:]] +# BROKEN-NCHAIN-GNU: [ 4] .dynsym DYNSYM 0000000000001078 0000[[#%x, DYNSYMOFF]] + +# BROKEN-NCHAIN-GNU: 'PLT' relocation section at offset 0x[[#%x, RELAOFF]] contains 24 bytes: +# BROKEN-NCHAIN-GNU-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend +# BROKEN-NCHAIN-GNU-NEXT: warning: '[[FILE]]': unable to get name of the dynamic symbol with index 4292739037: index is greater than or equal to the number of dynamic symbols (1) +# BROKEN-NCHAIN-GNU-NEXT: 0000000000000000 ffddffdd00000000 R_X86_64_NONE + 0 + +# RUN: wc -c %t4.1 > %t4.out.llvm.txt +# RUN: llvm-readobj --sections --dyn-relocations %t4.1 2>&1 >> %t4.out.llvm.txt 2>&1 +# RUN: FileCheck %s -DFILE=%t4.1 --input-file=%t4.out.llvm.txt --check-prefix=BROKEN-NCHAIN-LLVM + +# BROKEN-NCHAIN-LLVM: [[#%u, FILESIZE:]] +# BROKEN-NCHAIN-LLVM: warning: '[[FILE]]': the size (0x17ffffffe8) of the dynamic symbol table at 0x[[#%x, DYNSYMOFF:]], derived from the hash table, goes past the end of the file (0x[[#%x, FILESIZE]]) and will be ignored + +# BROKEN-NCHAIN-LLVM: Name: .dynsym +# BROKEN-NCHAIN-LLVM-NEXT: Type: SHT_DYNSYM +# BROKEN-NCHAIN-LLVM-NEXT: Flags [ +# BROKEN-NCHAIN-LLVM-NEXT: SHF_ALLOC +# BROKEN-NCHAIN-LLVM-NEXT: ] +# BROKEN-NCHAIN-LLVM-NEXT: Address: 0x1078 +# BROKEN-NCHAIN-LLVM-NEXT: Offset: 0x[[#%X, DYNSYMOFF]] + +# BROKEN-NCHAIN-LLVM: Dynamic Relocations { +# BROKEN-NCHAIN-LLVM-NEXT: warning: '[[FILE]]': unable to get name of the dynamic symbol with index 4292739037: index is greater than or equal to the number of dynamic symbols (1) +# BROKEN-NCHAIN-LLVM-NEXT: 0x0 R_X86_64_NONE 0x0 +# BROKEN-NCHAIN-LLVM-NEXT: } + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .rela.plt + Type: SHT_RELA + Flags: [ SHF_ALLOC ] + Address: 0x1000 + Relocations: + - Type: R_X86_64_NONE + Symbol: 0xFFDDFFDD + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_ALLOC ] + Entries: + - Tag: DT_PLTRELSZ + Value: 0x18 + - Tag: DT_JMPREL +## 0x1000 - PT_LOAD's p_vaddr (0x1000) == 0x0. 
+## 0x0 + PT_LOAD's p_offset (0x78) == .rela.plt section offset (0x78). + Value: 0x1000 + - Tag: DT_PLTREL + Value: 0x7 ## 7 == DT_RELA + - Tag: DT_HASH +## 0x1068 - PT_LOAD's p_vaddr (0x1000) == 0x68. +## 0x68 + PT_LOAD's p_offset (0x78) == .hash section offset (0xE0). + Value: 0x1068 + - Tag: DT_NULL + Value: 0x0 + - Name: .hash + Type: SHT_HASH + Flags: [ SHF_ALLOC ] + Bucket: [ 0 ] + Chain: [ 0 ] + NChain: 0xFFFFFFFF +DynamicSymbols: [] +ProgramHeaders: + - Type: PT_LOAD + Sections: + - Section: .rela.plt + - Section: .dynamic + - Section: .hash + VAddr: 0x1000 diff --git a/llvm/test/tools/llvm-readobj/ELF/dyn-symbols.test b/llvm/test/tools/llvm-readobj/ELF/dyn-symbols.test index f57b21cb6e974..a438535cc1c8d 100644 --- a/llvm/test/tools/llvm-readobj/ELF/dyn-symbols.test +++ b/llvm/test/tools/llvm-readobj/ELF/dyn-symbols.test @@ -322,8 +322,32 @@ Sections: - NonDefault DynamicSymbols: - Name: foo - - Name: bar - - Name: zed + - Name: [[NAME=bar]] + Type: [[TYPE=STT_NOTYPE]] + Index: [[INDEX=]] + - Name: [[NAME=zed]] + Type: [[TYPE=STT_NOTYPE]] + +## Check the behavior for unnamed versioned section symbols. +## TODO: we should print proper symbol names instead of descriptions. +# RUN: yaml2obj %s -DTYPE=STT_SECTION -DNAME="''" -DINDEX=SHN_ABS --docnum=6 -o %t6.sec.sym +# RUN: llvm-readobj -V --dyn-symbols %t6.sec.sym | FileCheck %s --check-prefix=VERSIONED-SEC-SYM-LLVM +# RUN: llvm-readelf -V --dyn-symbols %t6.sec.sym | FileCheck %s --check-prefix=VERSIONED-SEC-SYM-GNU + +# VERSIONED-SEC-SYM-LLVM: DynamicSymbols [ +# VERSIONED-SEC-SYM-LLVM: Name: foo (12) +# VERSIONED-SEC-SYM-LLVM: Name: Absolute (0) +# VERSIONED-SEC-SYM-LLVM: Name: Undefined (0) +# VERSIONED-SEC-SYM-LLVM: VersionSymbols [ +# VERSIONED-SEC-SYM-LLVM: Name: foo +# VERSIONED-SEC-SYM-LLVM: Name: Absolute +# VERSIONED-SEC-SYM-LLVM: Name: Undefined + +# VERSIONED-SEC-SYM-GNU: Symbol table '.dynsym' contains 4 entries: +# VERSIONED-SEC-SYM-GNU: Num: {{.*}} Ndx Name +# VERSIONED-SEC-SYM-GNU: 1: {{.*}} UND foo +# VERSIONED-SEC-SYM-GNU-NEXT: 2: {{.*}} ABS Absolute +# VERSIONED-SEC-SYM-GNU-NEXT: 3: {{.*}} UND Undefined ## Case 8: Check what we print when: ## a) The dynamic symbol table does not exist. 
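
The "derived from the hash table" sizes in these warnings are plain arithmetic: nchain entries times the symbol-entry size (16 bytes for Elf32_Sym, 24 bytes for Elf64_Sym). A quick Python cross-check, not part of the patch; the 32-bit assumption for the 93/94-entry cases below is inferred from the 0x5d0/0x5e0 values:

ELF32_SYM_SIZE, ELF64_SYM_SIZE = 16, 24

def derived_dynsym_size(nchain, sym_size):
    """Size of the dynamic symbol table as inferred from the hash table."""
    return nchain * sym_size

# Case 4 above: nchain = 0xFFFFFFFF in a 64-bit object.
assert derived_dynsym_size(0xFFFFFFFF, ELF64_SYM_SIZE) == 0x17FFFFFFE8

# The nchain = 93 and 94 warnings in the hash-histogram/hash-table tests
# below correspond to 32-bit objects.
assert derived_dynsym_size(93, ELF32_SYM_SIZE) == 0x5D0
assert derived_dynsym_size(94, ELF32_SYM_SIZE) == 0x5E0
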
diff --git a/llvm/test/tools/llvm-readobj/ELF/dynamic-not-in-pt-dynamic.test b/llvm/test/tools/llvm-readobj/ELF/dynamic-not-in-pt-dynamic.test index 8c33931468c6b..20dd7c0ef630b 100644 --- a/llvm/test/tools/llvm-readobj/ELF/dynamic-not-in-pt-dynamic.test +++ b/llvm/test/tools/llvm-readobj/ELF/dynamic-not-in-pt-dynamic.test @@ -11,7 +11,7 @@ # RUN: llvm-readelf --dynamic-table %t1.o 2>&1 \ # RUN: | FileCheck -DFILE=%t1.o --check-prefixes=WARNING1,GNU1 %s -# WARNING1: warning: '[[FILE]]': The SHT_DYNAMIC section '.dynamic' is not contained within the PT_DYNAMIC segment +# WARNING1: warning: '[[FILE]]': SHT_DYNAMIC section with index 1 is not contained within the PT_DYNAMIC segment # WARNING1: warning: '[[FILE]]': invalid PT_DYNAMIC size (0x1){{$}} # WARNING1: warning: '[[FILE]]': SHT_DYNAMIC section header and PT_DYNAMIC program header disagree about the location of the dynamic table # WARNING1: warning: '[[FILE]]': PT_DYNAMIC dynamic table is invalid: SHT_DYNAMIC will be used @@ -69,7 +69,7 @@ ProgramHeaders: # RUN: llvm-readelf --dynamic-table %t2.o 2>&1 \ # RUN: | FileCheck -DFILE=%t2.o --check-prefixes=WARNING2,GNU2 %s -# WARNING2: warning: '[[FILE]]': The SHT_DYNAMIC section '.dynamic' is not contained within the PT_DYNAMIC segment +# WARNING2: warning: '[[FILE]]': SHT_DYNAMIC section with index 1 is not contained within the PT_DYNAMIC segment # WARNING2: warning: '[[FILE]]': SHT_DYNAMIC section header and PT_DYNAMIC program header disagree about the location of the dynamic table # LLVM2: DynamicSection [ (1 entries) diff --git a/llvm/test/tools/llvm-readobj/ELF/hash-histogram.test b/llvm/test/tools/llvm-readobj/ELF/hash-histogram.test index b6df8ff2a82ff..d6158e66acc74 100644 --- a/llvm/test/tools/llvm-readobj/ELF/hash-histogram.test +++ b/llvm/test/tools/llvm-readobj/ELF/hash-histogram.test @@ -167,6 +167,7 @@ ProgramHeaders: # RUN: llvm-readelf --elf-hash-histogram %t4.3.o 2>&1 | \ # RUN: FileCheck %s --check-prefix=ERR3 -DFILE=%t4.3.o --implicit-check-not="warning:" # ERR3: warning: '[[FILE]]': hash table nchain (93) differs from symbol count derived from SHT_DYNSYM section header (1){{$}} +# ERR3: warning: '[[FILE]]': the size (0x5d0) of the dynamic symbol table at 0x78, derived from the hash table, goes past the end of the file (0x1d4) and will be ignored ## Case B.2: the hash table ends 1 byte past the EOF. We have a broken nchain ## field that has a value larger than the number of chains. 
@@ -174,6 +175,7 @@ ProgramHeaders: # RUN: llvm-readelf --elf-hash-histogram %t4.4.o 2>&1 | \ # RUN: FileCheck %s --check-prefix=ERR4 -DFILE=%t4.4.o --implicit-check-not="warning:" # ERR4: warning: '[[FILE]]': hash table nchain (94) differs from symbol count derived from SHT_DYNSYM section header (1){{$}} +# ERR4: warning: '[[FILE]]': the size (0x5e0) of the dynamic symbol table at 0x78, derived from the hash table, goes past the end of the file (0x1d4) and will be ignored # ERR4: warning: '[[FILE]]': the hash table at offset 0x54 goes past the end of the file (0x1d4), nbucket = 1, nchain = 94{{$}} --- !ELF diff --git a/llvm/test/tools/llvm-readobj/ELF/hash-symbols.test b/llvm/test/tools/llvm-readobj/ELF/hash-symbols.test index e398ba7af99c6..7488bd5514e5a 100644 --- a/llvm/test/tools/llvm-readobj/ELF/hash-symbols.test +++ b/llvm/test/tools/llvm-readobj/ELF/hash-symbols.test @@ -81,23 +81,28 @@ Sections: - Tag: DT_NULL Value: 0x0000000000000000 DynamicSymbols: - - Name: ccc + - Name: [[NAME=ccc]] Binding: STB_GLOBAL - - Name: aaa + Type: [[TYPE=STT_NOTYPE]] + - Name: [[NAME=aaa]] Section: .hash Binding: STB_GLOBAL Value: 0x0000000000001000 - - Name: ddd + Type: [[TYPE=STT_NOTYPE]] + - Name: [[NAME=ddd]] Index: SHN_ABS Binding: STB_GLOBAL Value: 0x0000000000000001 - - Name: eee + Type: [[TYPE=STT_NOTYPE]] + - Name: [[NAME=eee]] Section: .gnu.hash Binding: STB_GLOBAL - - Name: bbb + Type: [[TYPE=STT_NOTYPE]] + - Name: [[NAME=bbb]] Section: .hash Binding: STB_WEAK Value: 0x0000000000001001 + Type: [[TYPE=STT_NOTYPE]] ProgramHeaders: - Type: PT_LOAD Flags: [ PF_R, PF_X ] @@ -106,6 +111,26 @@ ProgramHeaders: - Section: .gnu.hash - Section: .dynamic +## Check what we print for unnamed section symbols. +## TODO: we should print proper symbol names instead of descriptions. +# RUN: yaml2obj --docnum=1 -DBITS=64 -DTYPE=STT_SECTION -DNAME="''" %s -o %t1-sec-syms.so +# RUN: llvm-readelf --hash-symbols %t1-sec-syms.so | FileCheck %s --check-prefix=UNNAMED-SEC-SYMS + +# UNNAMED-SEC-SYMS: Symbol table of .hash for image: +# UNNAMED-SEC-SYMS-NEXT: Num {{.*}} Ndx Name +# UNNAMED-SEC-SYMS-NEXT: 1 {{.*}} UND Undefined +# UNNAMED-SEC-SYMS-NEXT: 5 {{.*}} 1 .hash +# UNNAMED-SEC-SYMS-NEXT: 3 {{.*}} ABS Absolute +# UNNAMED-SEC-SYMS-NEXT: 2 {{.*}} 1 .hash +# UNNAMED-SEC-SYMS-NEXT: 4 {{.*}} 2 .gnu.hash +# UNNAMED-SEC-SYMS-EMPTY: +# UNNAMED-SEC-SYMS: Symbol table of .gnu.hash for image: +# UNNAMED-SEC-SYMS-NEXT: Num {{.*}} Ndx Name +# UNNAMED-SEC-SYMS-NEXT: 2 {{.*}} 1 .hash +# UNNAMED-SEC-SYMS-NEXT: 3 {{.*}} ABS Absolute +# UNNAMED-SEC-SYMS-NEXT: 4 {{.*}} 2 .gnu.hash +# UNNAMED-SEC-SYMS-NEXT: 5 {{.*}} 1 .hash + ## Check the output when only .hash section is present. 
# RUN: yaml2obj --docnum=2 %s -o %t2-32.so @@ -402,6 +427,7 @@ ProgramHeaders: # RUN: llvm-readelf --hash-symbols %t7.3.o 2>&1 | \ # RUN: FileCheck %s --implicit-check-not="warning:" --check-prefix=NOERR2 -DFILE=%t7.3.o # NOERR2: warning: '[[FILE]]': hash table nchain (93) differs from symbol count derived from SHT_DYNSYM section header (1) +# NOERR2: warning: '[[FILE]]': the size (0x5d0) of the dynamic symbol table at 0x78, derived from the hash table, goes past the end of the file (0x1d4) and will be ignored # NOERR2: Symbol table of .hash for image: # NOERR2-NEXT: Num Buc: Value Size Type Bind Vis Ndx Name # NOERR2-NOT: {{.}} diff --git a/llvm/test/tools/llvm-readobj/ELF/hash-table.test b/llvm/test/tools/llvm-readobj/ELF/hash-table.test index 823c6c8ece9c3..1102d848f03e4 100644 --- a/llvm/test/tools/llvm-readobj/ELF/hash-table.test +++ b/llvm/test/tools/llvm-readobj/ELF/hash-table.test @@ -169,6 +169,7 @@ ProgramHeaders: # RUN: FileCheck %s --check-prefix=NOERR2 -DFILE=%t5.3.o --implicit-check-not="warning:" # NOERR2: warning: '[[FILE]]': hash table nchain (93) differs from symbol count derived from SHT_DYNSYM section header (1) +# NOERR2: warning: '[[FILE]]': the size (0x5d0) of the dynamic symbol table at 0x78, derived from the hash table, goes past the end of the file (0x1d4) and will be ignored # NOERR2: HashTable { # NOERR2-NEXT: Num Buckets: 1 # NOERR2-NEXT: Num Chains: 93 @@ -187,6 +188,7 @@ ProgramHeaders: # RUN: FileCheck %s --check-prefix=ERR3 -DFILE=%t5.4.o --implicit-check-not="warning:" # ERR3: warning: '[[FILE]]': hash table nchain (94) differs from symbol count derived from SHT_DYNSYM section header (1) +# ERR3: warning: '[[FILE]]': the size (0x5e0) of the dynamic symbol table at 0x78, derived from the hash table, goes past the end of the file (0x1d4) and will be ignored # ERR3: HashTable { # ERR3-NEXT: Num Buckets: 1 # ERR3-NEXT: Num Chains: 94 diff --git a/llvm/test/tools/llvm-readobj/ELF/mips-got.test b/llvm/test/tools/llvm-readobj/ELF/mips-got.test index 24a06dd2b3bbd..f1c3e4d1fc224 100644 --- a/llvm/test/tools/llvm-readobj/ELF/mips-got.test +++ b/llvm/test/tools/llvm-readobj/ELF/mips-got.test @@ -651,3 +651,58 @@ Sections: Value: 0x1122 DynamicSymbols: - Name: foo + +## Check how we print global GOT entries when they are unnamed section symbols. +## TODO: we should print proper symbol names instead of descriptions. 
+# RUN: yaml2obj --docnum=5 %s -o %t.err8.o +# RUN: llvm-readobj -A %t.err8.o 2>&1 | FileCheck %s -DFILE=%t.err8.o --check-prefix=SEC-SYMS-LLVM +# RUN: llvm-readelf -A %t.err8.o 2>&1 | FileCheck %s -DFILE=%t.err8.o --check-prefix=SEC-SYMS-GNU + +# SEC-SYMS-LLVM: Global entries [ +# SEC-SYMS-LLVM-NEXT: Entry { +# SEC-SYMS-LLVM: Section: Absolute (0xFFF1) +# SEC-SYMS-LLVM-NEXT: Name: Absolute (0) +# SEC-SYMS-LLVM-NEXT: } +# SEC-SYMS-LLVM-NEXT: Entry { +# SEC-SYMS-LLVM: Section: .got (0x1) +# SEC-SYMS-LLVM-NEXT: Name: .got (0) +# SEC-SYMS-LLVM-NEXT: } +# SEC-SYMS-LLVM-NEXT: Entry { +# SEC-SYMS-LLVM: Section: Common (0xFFF2) +# SEC-SYMS-LLVM-NEXT: Name: Common (0) +# SEC-SYMS-LLVM-NEXT: } +# SEC-SYMS-LLVM-NEXT: ] + +# SEC-SYMS-GNU: Global entries: +# SEC-SYMS-GNU-NEXT: {{.*}} Ndx Name +# SEC-SYMS-GNU-NEXT: {{.*}} ABS Absolute +# SEC-SYMS-GNU-NEXT: {{.*}} 1 .got +# SEC-SYMS-GNU-NEXT: {{.*}} COM Common + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_MIPS +Sections: + - Name: .got + Type: SHT_PROGBITS + Address: 0x1122 + Size: 48 + - Name: .dynamic + Type: SHT_DYNAMIC + Entries: + - Tag: DT_MIPS_LOCAL_GOTNO + Value: 1 + - Tag: DT_MIPS_GOTSYM + Value: 1 + - Tag: DT_PLTGOT + Value: 0x1122 +DynamicSymbols: + - Type: STT_SECTION + Index: SHN_ABS + - Type: STT_SECTION + Section: .got + - Type: STT_SECTION + Index: SHN_COMMON diff --git a/llvm/test/tools/llvm-readobj/ELF/mips-plt.test b/llvm/test/tools/llvm-readobj/ELF/mips-plt.test index 95b310ba664c1..7f3fd0897747f 100644 --- a/llvm/test/tools/llvm-readobj/ELF/mips-plt.test +++ b/llvm/test/tools/llvm-readobj/ELF/mips-plt.test @@ -140,3 +140,75 @@ DynamicSymbols: [] # RUN: not llvm-readobj -A %t.err7.o 2>&1 | FileCheck %s -DFILE=%t.err7.o -check-prefix ERR7 # ERR7: error: '[[FILE]]': unable to get a string table for the SHT_DYNAMIC section with index 1: invalid sh_type for symbol table, expected SHT_SYMTAB or SHT_DYNSYM + +## Check how we print PLT entries when they are unnamed section symbols. +## TODO: we should print proper symbol names instead of descriptions. 
+# RUN: yaml2obj --docnum=3 %s -o %t.3 +# RUN: llvm-readobj -A %t.3 2>&1 | FileCheck %s -DFILE=%t.err8.o --check-prefix=SEC-SYMS-LLVM +# RUN: llvm-readelf -A %t.3 2>&1 | FileCheck %s -DFILE=%t.err8.o --check-prefix=SEC-SYMS-GNU + +# SEC-SYMS-LLVM: PLT GOT { +# SEC-SYMS-LLVM: Entries [ +# SEC-SYMS-LLVM: Entry { +# SEC-SYMS-LLVM: Section: Absolute (0xFFF1) +# SEC-SYMS-LLVM-NEXT: Name: Absolute (0) +# SEC-SYMS-LLVM-NEXT: } +# SEC-SYMS-LLVM-NEXT: Entry { +# SEC-SYMS-LLVM: Section: .got.plt (0x2) +# SEC-SYMS-LLVM-NEXT: Name: .got.plt (0) +# SEC-SYMS-LLVM-NEXT: } +# SEC-SYMS-LLVM-NEXT: Entry { +# SEC-SYMS-LLVM: Section: Common (0xFFF2) +# SEC-SYMS-LLVM-NEXT: Name: Common (0) +# SEC-SYMS-LLVM-NEXT: } +# SEC-SYMS-LLVM-NEXT: ] +# SEC-SYMS-LLVM-NEXT: } + +# SEC-SYMS-GNU: PLT GOT: +# SEC-SYMS-GNU: Entries: +# SEC-SYMS-GNU-NEXT: Address {{.*}} Ndx Name +# SEC-SYMS-GNU-NEXT: 0000000000002010 {{.*}} ABS Absolute +# SEC-SYMS-GNU-NEXT: 0000000000002018 {{.*}} 2 .got.plt +# SEC-SYMS-GNU-NEXT: 0000000000002020 {{.*}} COM Common + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_MIPS +Sections: + - Name: .rel.plt + Type: SHT_REL + Flags: [ SHF_ALLOC ] + Address: 0x1000 + Link: .dynsym + Relocations: + - Offset: 0x1 + Symbol: 1 + Type: R_MIPS_JUMP_SLOT + - Offset: 0x2 + Symbol: 2 + Type: R_MIPS_JUMP_SLOT + - Offset: 0x2 + Symbol: 3 + Type: R_MIPS_JUMP_SLOT + - Name: .got.plt + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x2000 + Size: 40 ## (dynamic symbols number + 2) * 8 + - Name: .dynamic + Type: SHT_DYNAMIC + Entries: + - Tag: DT_JMPREL + Value: 0x1000 + - Tag: DT_MIPS_PLTGOT + Value: 0x2000 +DynamicSymbols: + - Type: STT_SECTION + Index: SHN_ABS + - Type: STT_SECTION + Section: .got.plt + - Type: STT_SECTION + Index: SHN_COMMON diff --git a/llvm/test/tools/llvm-readobj/ELF/non-dynamic-in-pt-dynamic.test b/llvm/test/tools/llvm-readobj/ELF/non-dynamic-in-pt-dynamic.test index 5905ccb2902cc..12bcdf6b7216b 100644 --- a/llvm/test/tools/llvm-readobj/ELF/non-dynamic-in-pt-dynamic.test +++ b/llvm/test/tools/llvm-readobj/ELF/non-dynamic-in-pt-dynamic.test @@ -10,7 +10,7 @@ # RUN: llvm-readelf --dynamic-table %t1.o 2>&1 \ # RUN: | FileCheck %s --DFILE=%t1.o --check-prefixes=WARNING,GNU -# WARNING: warning: '[[FILE]]': The SHT_DYNAMIC section '.dynamic' is not at the start of PT_DYNAMIC segment +# WARNING: warning: '[[FILE]]': SHT_DYNAMIC section with index 2 is not at the start of PT_DYNAMIC segment # WARNING: warning: '[[FILE]]': invalid PT_DYNAMIC size (0x21){{$}} # WARNING: warning: '[[FILE]]': SHT_DYNAMIC section header and PT_DYNAMIC program header disagree about the location of the dynamic table # WARNING: warning: '[[FILE]]': PT_DYNAMIC dynamic table is invalid: SHT_DYNAMIC will be used diff --git a/llvm/test/tools/llvm-readobj/ELF/note-core.test b/llvm/test/tools/llvm-readobj/ELF/note-core.test index c283519aec492..d7ec0c39ca4c2 100644 --- a/llvm/test/tools/llvm-readobj/ELF/note-core.test +++ b/llvm/test/tools/llvm-readobj/ELF/note-core.test @@ -1,8 +1,263 @@ ## Test that note values are interpreted correctly for core files. -# RUN: yaml2obj %s -o %t.o -# RUN: llvm-readelf --notes %t.o | FileCheck %s --check-prefix=GNU -# RUN: llvm-readobj --notes %t.o | FileCheck %s --check-prefix=LLVM +## Check NT_PRSTATUS. 
+# RUN: yaml2obj %s -DTYPE=0x1 -o %t1.o +# RUN: llvm-readelf --notes %t1.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PRSTATUS (prstatus structure)" +# RUN: llvm-readobj --notes %t1.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PRSTATUS (prstatus structure)" + +## Check NT_FPREGSET. +# RUN: yaml2obj %s -DTYPE=0x2 -o %t2.o +# RUN: llvm-readelf --notes %t2.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_FPREGSET (floating point registers)" +# RUN: llvm-readobj --notes %t2.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_FPREGSET (floating point registers)" + +## Check NT_PRPSINFO. +# RUN: yaml2obj %s -DTYPE=0x3 -o %t3.o +# RUN: llvm-readelf --notes %t3.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PRPSINFO (prpsinfo structure)" +# RUN: llvm-readobj --notes %t3.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PRPSINFO (prpsinfo structure)" + +## Check NT_TASKSTRUCT. +# RUN: yaml2obj %s -DTYPE=0x4 -o %t4.o +# RUN: llvm-readelf --notes %t4.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_TASKSTRUCT (task structure)" +# RUN: llvm-readobj --notes %t4.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_TASKSTRUCT (task structure)" + +## Check NT_AUXV. +# RUN: yaml2obj %s -DTYPE=0x6 -o %t5.o +# RUN: llvm-readelf --notes %t5.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_AUXV (auxiliary vector)" +# RUN: llvm-readobj --notes %t5.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_AUXV (auxiliary vector)" + +## Check NT_PSTATUS. +# RUN: yaml2obj %s -DTYPE=0xA -o %t6.o +# RUN: llvm-readelf --notes %t6.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PSTATUS (pstatus structure)" +# RUN: llvm-readobj --notes %t6.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PSTATUS (pstatus structure)" + +## Check NT_FPREGS. +# RUN: yaml2obj %s -DTYPE=0xC -o %t7.o +# RUN: llvm-readelf --notes %t7.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_FPREGS (floating point registers)" +# RUN: llvm-readobj --notes %t7.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_FPREGS (floating point registers)" + +## Check NT_PSINFO. +# RUN: yaml2obj %s -DTYPE=0xD -o %t8.o +# RUN: llvm-readelf --notes %t8.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PSINFO (psinfo structure)" +# RUN: llvm-readobj --notes %t8.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PSINFO (psinfo structure)" + +## Check NT_LWPSTATUS. +# RUN: yaml2obj %s -DTYPE=0x10 -o %t9.o +# RUN: llvm-readelf --notes %t9.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_LWPSTATUS (lwpstatus_t structure)" +# RUN: llvm-readobj --notes %t9.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_LWPSTATUS (lwpstatus_t structure)" + +## Check NT_LWPSINFO. +# RUN: yaml2obj %s -DTYPE=0x11 -o %t10.o +# RUN: llvm-readelf --notes %t10.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_LWPSINFO (lwpsinfo_t structure)" +# RUN: llvm-readobj --notes %t10.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_LWPSINFO (lwpsinfo_t structure)" + +## Check NT_WIN32PSTATUS. +# RUN: yaml2obj %s -DTYPE=0x12 -o %t11.o +# RUN: llvm-readelf --notes %t11.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_WIN32PSTATUS (win32_pstatus structure)" +# RUN: llvm-readobj --notes %t11.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_WIN32PSTATUS (win32_pstatus structure)" + +## Check ELF::NT_PPC_VMX. 
+# RUN: yaml2obj %s -DTYPE=0x100 -o %t12.o +# RUN: llvm-readelf --notes %t12.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PPC_VMX (ppc Altivec registers)" +# RUN: llvm-readobj --notes %t12.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PPC_VMX (ppc Altivec registers)" + +## Check ELF::NT_PPC_VSX. +# RUN: yaml2obj %s -DTYPE=0x102 -o %t13.o +# RUN: llvm-readelf --notes %t13.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PPC_VSX (ppc VSX registers)" +# RUN: llvm-readobj --notes %t13.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PPC_VSX (ppc VSX registers)" + +## Check ELF::NT_PPC_TAR. +# RUN: yaml2obj %s -DTYPE=0x103 -o %t14.o +# RUN: llvm-readelf --notes %t14.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PPC_TAR (ppc TAR register)" +# RUN: llvm-readobj --notes %t14.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PPC_TAR (ppc TAR register)" + +## Check ELF::NT_PPC_PPR. +# RUN: yaml2obj %s -DTYPE=0x104 -o %t15.o +# RUN: llvm-readelf --notes %t15.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PPC_PPR (ppc PPR register)" +# RUN: llvm-readobj --notes %t15.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PPC_PPR (ppc PPR register)" + +## Check ELF::NT_PPC_DSCR. +# RUN: yaml2obj %s -DTYPE=0x105 -o %t16.o +# RUN: llvm-readelf --notes %t16.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PPC_DSCR (ppc DSCR register)" +# RUN: llvm-readobj --notes %t16.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PPC_DSCR (ppc DSCR register)" + +## Check ELF::NT_PPC_EBB. +# RUN: yaml2obj %s -DTYPE=0x106 -o %t17.o +# RUN: llvm-readelf --notes %t17.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PPC_EBB (ppc EBB registers)" +# RUN: llvm-readobj --notes %t17.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PPC_EBB (ppc EBB registers)" + +## Check ELF::NT_PPC_PMU. +# RUN: yaml2obj %s -DTYPE=0x107 -o %t18.o +# RUN: llvm-readelf --notes %t18.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PPC_PMU (ppc PMU registers)" +# RUN: llvm-readobj --notes %t18.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PPC_PMU (ppc PMU registers)" + +## Check ELF::NT_PPC_TM_CGPR. +# RUN: yaml2obj %s -DTYPE=0x108 -o %t19.o +# RUN: llvm-readelf --notes %t19.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PPC_TM_CGPR (ppc checkpointed GPR registers)" +# RUN: llvm-readobj --notes %t19.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PPC_TM_CGPR (ppc checkpointed GPR registers)" + +## Check ELF::NT_PPC_TM_CFPR. +# RUN: yaml2obj %s -DTYPE=0x109 -o %t20.o +# RUN: llvm-readelf --notes %t20.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PPC_TM_CFPR (ppc checkpointed floating point registers)" +# RUN: llvm-readobj --notes %t20.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PPC_TM_CFPR (ppc checkpointed floating point registers)" + +## Check ELF::NT_PPC_TM_CVMX. +# RUN: yaml2obj %s -DTYPE=0x10a -o %t21.o +# RUN: llvm-readelf --notes %t21.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PPC_TM_CVMX (ppc checkpointed Altivec registers)" +# RUN: llvm-readobj --notes %t21.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PPC_TM_CVMX (ppc checkpointed Altivec registers)" + +## Check ELF::NT_PPC_TM_CVSX. +# RUN: yaml2obj %s -DTYPE=0x10b -o %t22.o +# RUN: llvm-readelf --notes %t22.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PPC_TM_CVSX (ppc checkpointed VSX registers)" +# RUN: llvm-readobj --notes %t22.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PPC_TM_CVSX (ppc checkpointed VSX registers)" + +## Check ELF::NT_PPC_TM_SPR. 
+# RUN: yaml2obj %s -DTYPE=0x10c -o %t23.o +# RUN: llvm-readelf --notes %t23.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PPC_TM_SPR (ppc TM special purpose registers)" +# RUN: llvm-readobj --notes %t23.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PPC_TM_SPR (ppc TM special purpose registers)" + +## Check ELF::NT_PPC_TM_CTAR. +# RUN: yaml2obj %s -DTYPE=0x10d -o %t24.o +# RUN: llvm-readelf --notes %t24.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PPC_TM_CTAR (ppc checkpointed TAR register)" +# RUN: llvm-readobj --notes %t24.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PPC_TM_CTAR (ppc checkpointed TAR register)" + +## Check ELF::NT_PPC_TM_CPPR. +# RUN: yaml2obj %s -DTYPE=0x10e -o %t25.o +# RUN: llvm-readelf --notes %t25.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PPC_TM_CPPR (ppc checkpointed PPR register)" +# RUN: llvm-readobj --notes %t25.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PPC_TM_CPPR (ppc checkpointed PPR register)" + +## Check ELF::NT_PPC_TM_CDSCR. +# RUN: yaml2obj %s -DTYPE=0x10f -o %t26.o +# RUN: llvm-readelf --notes %t26.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PPC_TM_CDSCR (ppc checkpointed DSCR register)" +# RUN: llvm-readobj --notes %t26.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PPC_TM_CDSCR (ppc checkpointed DSCR register)" + +## Check ELF::NT_386_TLS. +# RUN: yaml2obj %s -DTYPE=0x200 -o %t27.o +# RUN: llvm-readelf --notes %t27.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_386_TLS (x86 TLS information)" +# RUN: llvm-readobj --notes %t27.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_386_TLS (x86 TLS information)" + +## Check ELF::NT_386_IOPERM. +# RUN: yaml2obj %s -DTYPE=0x201 -o %t28.o +# RUN: llvm-readelf --notes %t28.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_386_IOPERM (x86 I/O permissions)" +# RUN: llvm-readobj --notes %t28.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_386_IOPERM (x86 I/O permissions)" + +## Check ELF::NT_X86_XSTATE. +# RUN: yaml2obj %s -DTYPE=0x202 -o %t29.o +# RUN: llvm-readelf --notes %t29.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_X86_XSTATE (x86 XSAVE extended state)" +# RUN: llvm-readobj --notes %t29.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_X86_XSTATE (x86 XSAVE extended state)" + +## Check ELF::NT_S390_HIGH_GPRS. +# RUN: yaml2obj %s -DTYPE=0x300 -o %t30.o +# RUN: llvm-readelf --notes %t30.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_S390_HIGH_GPRS (s390 upper register halves)" +# RUN: llvm-readobj --notes %t30.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_S390_HIGH_GPRS (s390 upper register halves)" + +## Check ELF::NT_S390_TIMER. +# RUN: yaml2obj %s -DTYPE=0x301 -o %t31.o +# RUN: llvm-readelf --notes %t31.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_S390_TIMER (s390 timer register)" +# RUN: llvm-readobj --notes %t31.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_S390_TIMER (s390 timer register)" + +## Check ELF::NT_S390_TODCMP. +# RUN: yaml2obj %s -DTYPE=0x302 -o %t32.o +# RUN: llvm-readelf --notes %t32.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_S390_TODCMP (s390 TOD comparator register)" +# RUN: llvm-readobj --notes %t32.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_S390_TODCMP (s390 TOD comparator register)" + +## Check ELF::NT_S390_TODPREG. 
+# RUN: yaml2obj %s -DTYPE=0x303 -o %t33.o +# RUN: llvm-readelf --notes %t33.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_S390_TODPREG (s390 TOD programmable register)" +# RUN: llvm-readobj --notes %t33.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_S390_TODPREG (s390 TOD programmable register)" + +## Check ELF::NT_S390_CTRS. +# RUN: yaml2obj %s -DTYPE=0x304 -o %t34.o +# RUN: llvm-readelf --notes %t34.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_S390_CTRS (s390 control registers)" +# RUN: llvm-readobj --notes %t34.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_S390_CTRS (s390 control registers)" + +## Check ELF::NT_S390_PREFIX. +# RUN: yaml2obj %s -DTYPE=0x305 -o %t35.o +# RUN: llvm-readelf --notes %t35.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_S390_PREFIX (s390 prefix register)" +# RUN: llvm-readobj --notes %t35.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_S390_PREFIX (s390 prefix register)" + +## Check ELF::NT_S390_LAST_BREAK. +# RUN: yaml2obj %s -DTYPE=0x306 -o %t36.o +# RUN: llvm-readelf --notes %t36.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_S390_LAST_BREAK (s390 last breaking event address)" +# RUN: llvm-readobj --notes %t36.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_S390_LAST_BREAK (s390 last breaking event address)" + +## Check ELF::NT_S390_SYSTEM_CALL. +# RUN: yaml2obj %s -DTYPE=0x307 -o %t37.o +# RUN: llvm-readelf --notes %t37.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_S390_SYSTEM_CALL (s390 system call restart data)" +# RUN: llvm-readobj --notes %t37.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_S390_SYSTEM_CALL (s390 system call restart data)" + +## Check ELF::NT_S390_TDB. +# RUN: yaml2obj %s -DTYPE=0x308 -o %t38.o +# RUN: llvm-readelf --notes %t38.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_S390_TDB (s390 transaction diagnostic block)" +# RUN: llvm-readobj --notes %t38.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_S390_TDB (s390 transaction diagnostic block)" + +## Check ELF::NT_S390_VXRS_LOW. +# RUN: yaml2obj %s -DTYPE=0x309 -o %t39.o +# RUN: llvm-readelf --notes %t39.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_S390_VXRS_LOW (s390 vector registers 0-15 upper half)" +# RUN: llvm-readobj --notes %t39.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_S390_VXRS_LOW (s390 vector registers 0-15 upper half)" + +## Check ELF::NT_S390_VXRS_HIGH. +# RUN: yaml2obj %s -DTYPE=0x30a -o %t40.o +# RUN: llvm-readelf --notes %t40.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_S390_VXRS_HIGH (s390 vector registers 16-31)" +# RUN: llvm-readobj --notes %t40.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_S390_VXRS_HIGH (s390 vector registers 16-31)" + +## Check ELF::NT_S390_GS_CB. +# RUN: yaml2obj %s -DTYPE=0x30b -o %t41.o +# RUN: llvm-readelf --notes %t41.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_S390_GS_CB (s390 guarded-storage registers)" +# RUN: llvm-readobj --notes %t41.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_S390_GS_CB (s390 guarded-storage registers)" + +## Check ELF::NT_S390_GS_BC. +# RUN: yaml2obj %s -DTYPE=0x30c -o %t42.o +# RUN: llvm-readelf --notes %t42.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_S390_GS_BC (s390 guarded-storage broadcast control)" +# RUN: llvm-readobj --notes %t42.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_S390_GS_BC (s390 guarded-storage broadcast control)" + +## Check ELF::NT_ARM_VFP. 
+# RUN: yaml2obj %s -DTYPE=0x400 -o %t43.o +# RUN: llvm-readelf --notes %t43.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_ARM_VFP (arm VFP registers)" +# RUN: llvm-readobj --notes %t43.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_ARM_VFP (arm VFP registers)" + +## Check ELF::NT_ARM_TLS. +# RUN: yaml2obj %s -DTYPE=0x401 -o %t44.o +# RUN: llvm-readelf --notes %t44.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_ARM_TLS (AArch TLS registers)" +# RUN: llvm-readobj --notes %t44.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_ARM_TLS (AArch TLS registers)" + +## Check ELF::NT_ARM_HW_BREAK. +# RUN: yaml2obj %s -DTYPE=0x402 -o %t45.o +# RUN: llvm-readelf --notes %t45.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_ARM_HW_BREAK (AArch hardware breakpoint registers)" +# RUN: llvm-readobj --notes %t45.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_ARM_HW_BREAK (AArch hardware breakpoint registers)" + +## Check ELF::NT_ARM_HW_WATCH. +# RUN: yaml2obj %s -DTYPE=0x403 -o %t46.o +# RUN: llvm-readelf --notes %t46.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_ARM_HW_WATCH (AArch hardware watchpoint registers)" +# RUN: llvm-readobj --notes %t46.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_ARM_HW_WATCH (AArch hardware watchpoint registers)" + +## Check ELF::NT_FILE. +# RUN: yaml2obj %s -DTYPE=0x46494c45 -o %t47.o +# RUN: llvm-readelf --notes %t47.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_FILE (mapped files)" +# RUN: llvm-readobj --notes %t47.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_FILE (mapped files)" + +## Check ELF::NT_PRXFPREG. +# RUN: yaml2obj %s -DTYPE=0x46e62b7f -o %t48.o +# RUN: llvm-readelf --notes %t48.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_PRXFPREG (user_xfpregs structure)" +# RUN: llvm-readobj --notes %t48.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_PRXFPREG (user_xfpregs structure)" + +## Check ELF::NT_SIGINFO. +# RUN: yaml2obj %s -DTYPE=0x53494749 -o %t49.o +# RUN: llvm-readelf --notes %t49.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="NT_SIGINFO (siginfo_t data)" +# RUN: llvm-readobj --notes %t49.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="NT_SIGINFO (siginfo_t data)" + +## Check an arbitrary unknown type. 
+# RUN: yaml2obj %s -DTYPE=0x12345678 -o %t50.o +# RUN: llvm-readelf --notes %t50.o | FileCheck %s --check-prefix=CHECK-GNU -DDESC="Unknown note type: (0x12345678)" +# RUN: llvm-readobj --notes %t50.o | FileCheck %s --check-prefix=CHECK-LLVM -DDESC="Unknown (0x12345678)" + +# CHECK-GNU: Owner Data size Description +# CHECK-GNU-NEXT: CORE 0x00000000 [[DESC]] + +# CHECK-LLVM: Note { +# CHECK-LLVM-NEXT: Owner: CORE +# CHECK-LLVM-NEXT: Data size: 0x0 +# CHECK-LLVM-NEXT: Type: [[DESC]] +# CHECK-LLVM-NEXT: } --- !ELF FileHeader: @@ -10,52 +265,12 @@ FileHeader: Data: ELFDATA2LSB Type: ET_CORE Sections: - - Name: .note.foo - Type: SHT_NOTE - # Note: format is 0500000000000000434F524500000000 repeated - Content: 050000000000000001000000434F524500000000050000000000000002000000434F524500000000050000000000000003000000434F524500000000050000000000000004000000434F524500000000050000000000000006000000434F524500000000 + - Name: .note.foo + Type: SHT_NOTE + Notes: + - Name: CORE + Type: [[TYPE]] ProgramHeaders: - - Type: PT_NOTE + - Type: PT_NOTE Sections: - Section: .note.foo - -# GNU: Displaying notes found -# GNU-NEXT: Owner Data size Description -# GNU-NEXT: CORE 0x00000000 NT_PRSTATUS (prstatus structure) -# GNU-NEXT: CORE 0x00000000 NT_FPREGSET (floating point registers) -# GNU-NEXT: CORE 0x00000000 NT_PRPSINFO (prpsinfo structure) -# GNU-NEXT: CORE 0x00000000 NT_TASKSTRUCT (task structure) -# GNU-NEXT: CORE 0x00000000 NT_AUXV (auxiliary vector) - -# LLVM: Notes [ -# LLVM-NEXT: NoteSection { -# LLVM-NEXT: Name: -# LLVM-NEXT: Offset: -# LLVM-NEXT: Size: -# LLVM-NEXT: Note { -# LLVM-NEXT: Owner: CORE -# LLVM-NEXT: Data size: 0x0 -# LLVM-NEXT: Type: NT_PRSTATUS (prstatus structure) -# LLVM-NEXT: } -# LLVM-NEXT: Note { -# LLVM-NEXT: Owner: CORE -# LLVM-NEXT: Data size: 0x0 -# LLVM-NEXT: Type: NT_FPREGSET (floating point registers) -# LLVM-NEXT: } -# LLVM-NEXT: Note { -# LLVM-NEXT: Owner: CORE -# LLVM-NEXT: Data size: 0x0 -# LLVM-NEXT: Type: NT_PRPSINFO (prpsinfo structure) -# LLVM-NEXT: } -# LLVM-NEXT: Note { -# LLVM-NEXT: Owner: CORE -# LLVM-NEXT: Data size: 0x0 -# LLVM-NEXT: Type: NT_TASKSTRUCT (task structure) -# LLVM-NEXT: } -# LLVM-NEXT: Note { -# LLVM-NEXT: Owner: CORE -# LLVM-NEXT: Data size: 0x0 -# LLVM-NEXT: Type: NT_AUXV (auxiliary vector) -# LLVM-NEXT: } -# LLVM-NEXT: } -# LLVM-NEXT: ] diff --git a/llvm/test/tools/llvm-readobj/ELF/note-freebsd.s b/llvm/test/tools/llvm-readobj/ELF/note-freebsd.s index 3d4b461f1feb2..3caca6cc0d718 100644 --- a/llvm/test/tools/llvm-readobj/ELF/note-freebsd.s +++ b/llvm/test/tools/llvm-readobj/ELF/note-freebsd.s @@ -13,7 +13,7 @@ // GNU-NEXT: FreeBSD 0x00000000 NT_PROCSTAT_FILES (files data) // GNU-NEXT: Displaying notes found in: .note.baz // GNU-NEXT: Owner Data size Description -// GNU-NEXT: FreeBSD 0x0000001c Unknown note type (0x00000003) +// GNU-NEXT: FreeBSD 0x0000001c Unknown note type: (0x00000003) // GNU-NEXT: description data: 4c 6f 72 65 6d 20 69 70 73 75 6d 20 64 6f 6c 6f 72 20 73 69 74 20 61 6d 65 74 00 00 // LLVM: Notes [ @@ -49,7 +49,7 @@ // LLVM-NEXT: Note { // LLVM-NEXT: Owner: FreeBSD // LLVM-NEXT: Data size: 0x1C -// LLVM-NEXT: Type: Unknown note type (0x00000003) +// LLVM-NEXT: Type: Unknown (0x00000003) // LLVM-NEXT: Description data ( // LLVM-NEXT: 0000: 4C6F7265 6D206970 73756D20 646F6C6F |Lorem ipsum dolo| // LLVM-NEXT: 0010: 72207369 7420616D 65740000 |r sit amet..| diff --git a/llvm/test/tools/llvm-readobj/ELF/section-symbols.test b/llvm/test/tools/llvm-readobj/ELF/section-symbols.test index 3b6a2eca4fc4e..1aac1e6f06e8f 100644 --- 
a/llvm/test/tools/llvm-readobj/ELF/section-symbols.test +++ b/llvm/test/tools/llvm-readobj/ELF/section-symbols.test @@ -1,35 +1,71 @@ -## ELF section symbols use the section names when printing. This test verifies -## this and also that appropriate things are printed if the section is somehow -## invalid. +## ELF section symbols use the corresponding section names when printing +## unnamed symbols. This test verifies this and also that appropriate things +## are printed if the section is somehow invalid. # RUN: yaml2obj %s -o %t1 -# RUN: llvm-readobj %t1 --symbols 2> %t.llvm.err1 | FileCheck %s --check-prefix=LLVM1 -# RUN: FileCheck %s --input-file %t.llvm.err1 --check-prefix=WARN1 --implicit-check-not=warning -# RUN: llvm-readelf %t1 --symbols 2> %t.gnu.err1 | FileCheck %s --check-prefix=GNU1 -# RUN: FileCheck %s --input-file %t.gnu.err1 --check-prefix=WARN1 --implicit-check-not=warning +## FIXME: 1) Relocations should print section symbol names when they are not empty. +## 2) We should still print a relocation even when we are unable to lookup a symbol name. +# RUN: llvm-readobj %t1 --symbols --relocations 2>&1 | \ +# RUN: FileCheck %s -DFILE=%t1 --check-prefix=LLVM1 --implicit-check-not="warning:" +# RUN: llvm-readelf %t1 --symbols --relocations 2>&1 | \ +# RUN: FileCheck %s -DFILE=%t1 --check-prefix=GNU1 --implicit-check-not="warning:" + +# LLVM1: Relocations [ +# LLVM1-NEXT: Section (4) .rela.foo { +# LLVM1-NEXT: 0x1 R_X86_64_NONE .foo 0x0 +# LLVM1-NEXT: 0x2 R_X86_64_NONE .foo 0x0 +# LLVM1-NEXT: warning: '[[FILE]]': unable to print relocation 3 in SHT_RELA section with index 4: invalid section index: 67 +# LLVM1-NEXT: warning: '[[FILE]]': unable to print relocation 4 in SHT_RELA section with index 4: invalid section index: 67 +# LLVM1-NEXT: 0x5 R_X86_64_NONE .bar 0x0 +# LLVM1-NEXT: 0x6 R_X86_64_NONE .bar 0x0 +# LLVM1-NEXT: warning: '[[FILE]]': unable to print relocation 7 in SHT_RELA section with index 4: invalid section index: 66 +# LLVM1-NEXT: warning: '[[FILE]]': unable to print relocation 8 in SHT_RELA section with index 4: invalid section index: 66 +# LLVM1-NEXT: } +# LLVM1-NEXT: ] # LLVM1: Name: (0) # LLVM1: Name: .foo (0) +# LLVM1: Name: symbol1 (25) +# LLVM1: warning: '[[FILE]]': invalid section index: 67 # LLVM1: Name: <?> (0) +# LLVM1: Name: symbol2 (17) # LLVM1: Name: .bar (0) +# LLVM1: Name: symbol3 (9) +# LLVM1: warning: '[[FILE]]': invalid section index: 66 # LLVM1: Name: <?> (0) +# LLVM1: Name: symbol4 (1) + +# GNU1: Relocation section '.rela.foo' at offset 0x58 contains 8 entries: +# GNU1-NEXT: Offset Info Type Sym. Value Symbol's Name + Addend +# GNU1-NEXT: 00000001 00000100 R_X86_64_NONE 00000000 .foo + 0 +# GNU1-NEXT: 00000002 00000200 R_X86_64_NONE 00000000 .foo + 0 +# GNU1-NEXT: warning: '[[FILE]]': unable to print relocation 3 in SHT_RELA section with index 4: invalid section index: 67 +# GNU1-NEXT: warning: '[[FILE]]': unable to print relocation 4 in SHT_RELA section with index 4: invalid section index: 67 +# GNU1-NEXT: 00000005 00000500 R_X86_64_NONE 00000000 .bar + 0 +# GNU1-NEXT: 00000006 00000600 R_X86_64_NONE 00000000 .bar + 0 +# GNU1-NEXT: warning: '[[FILE]]': unable to print relocation 7 in SHT_RELA section with index 4: invalid section index: 66 +# GNU1-NEXT: warning: '[[FILE]]': unable to print relocation 8 in SHT_RELA section with index 4: invalid section index: 66 -# GNU1: Symbol table '.symtab' contains 5 entries: # GNU1-NEXT: Num: {{.*}} Type {{.*}} Ndx Name # GNU1-NEXT: 0: {{.*}} NOTYPE {{.*}} UND {{$}} -# GNU1-NEXT: 1: {{.*}} SECTION {{.*}} 1 .foo -# GNU1-NEXT: 2: {{.*}} SECTION {{.*}} 67 <?> -# GNU1-NEXT: 3: {{.*}} SECTION {{.*}} 2 .bar -# GNU1-NEXT: 4: {{.*}} SECTION {{.*}} 66 <?> - -# WARN1: warning: '{{.*}}.tmp1': invalid section index: 67 -# WARN1: warning: '{{.*}}.tmp1': invalid section index: 66 +# GNU1: Symbol table '.symtab' contains 9 entries: +# GNU1-NEXT: Num: {{.*}} Type {{.*}} Ndx Name +# GNU1-NEXT: 0: {{.*}} NOTYPE {{.*}} UND {{$}} +# GNU1-NEXT: 1: {{.*}} SECTION {{.*}} 1 .foo +# GNU1-NEXT: 2: {{.*}} SECTION {{.*}} 1 symbol1 +# GNU1-NEXT: warning: '[[FILE]]': invalid section index: 67 +# GNU1-NEXT: 3: {{.*}} SECTION {{.*}} 67 <?> +# GNU1-NEXT: 4: {{.*}} SECTION {{.*}} 67 symbol2 +# GNU1-NEXT: 5: {{.*}} SECTION {{.*}} 2 .bar +# GNU1-NEXT: 6: {{.*}} SECTION {{.*}} 2 symbol3 +# GNU1-NEXT: warning: '[[FILE]]': invalid section index: 66 +# GNU1-NEXT: 7: {{.*}} SECTION {{.*}} 66 <?> +# GNU1-NEXT: 8: {{.*}} SECTION {{.*}} 66 symbol4 --- !ELF FileHeader: - Class: ELFCLASS32 - Data: ELFDATA2LSB - Type: ET_REL + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_X86_64 Sections: - Name: .foo Type: SHT_PROGBITS @@ -38,22 +74,69 @@ Sections: - Name: .symtab_shndx Type: SHT_SYMTAB_SHNDX Link: .symtab - Entries: [ 0, 0, 0, 2, 0x42 ] + Entries: [ 0, 0, 0, 0, 0, 2, 2, 0x42, 0x42 ] + - Name: .rela.foo + Type: SHT_RELA + Link: .symtab + Info: .foo + Relocations: + - Offset: 0x1 + Symbol: 1 + Type: R_X86_64_NONE + - Offset: 0x2 + Symbol: 2 + Type: R_X86_64_NONE + - Offset: 0x3 + Symbol: 3 + Type: R_X86_64_NONE + - Offset: 0x4 + Symbol: 4 + Type: R_X86_64_NONE + - Offset: 0x5 + Symbol: 5 + Type: R_X86_64_NONE + - Offset: 0x6 + Symbol: 6 + Type: R_X86_64_NONE + - Offset: 0x7 + Symbol: 7 + Type: R_X86_64_NONE + - Offset: 0x8 + Symbol: 8 + Type: R_X86_64_NONE Symbols: +## Case 1: a valid unnamed section symbol. - Name: "" Section: .foo Type: STT_SECTION +## Case 2: a valid named section symbol. + - Name: "symbol1" + Section: .foo + Type: STT_SECTION +## Case 3: an unnamed section symbol with invalid index. - Name: "" Index: 0x43 Type: STT_SECTION - # Section symbol via SHT_SYMTAB_SHNDX. +## Case 4: a named section symbol with invalid index. + - Name: "symbol2" + Index: 0x43 + Type: STT_SECTION +## Case 5: a valid unnamed section symbol via SHT_SYMTAB_SHNDX. - Name: "" Index: SHN_XINDEX Type: STT_SECTION - # Section symbol via SHT_SYMTAB_SHNDX with invalid index. +## Case 6: a valid named section symbol via SHT_SYMTAB_SHNDX. + - Name: "symbol3" + Index: SHN_XINDEX + Type: STT_SECTION +## Case 7: an unnamed section symbol via SHT_SYMTAB_SHNDX with invalid index. - Name: "" Index: SHN_XINDEX Type: STT_SECTION +## Case 8: a named section symbol via SHT_SYMTAB_SHNDX with invalid index. + - Name: "symbol4" + Index: SHN_XINDEX + Type: STT_SECTION # RUN: yaml2obj %s --docnum=2 -o %t2 # RUN: llvm-readobj %t2 --symbols 2> %t.llvm.err2 | FileCheck %s --check-prefix=LLVM2 diff --git a/llvm/test/tools/llvm-readobj/ELF/symbol-shndx.test b/llvm/test/tools/llvm-readobj/ELF/symbol-shndx.test index 0d9c225c99fd2..b2d1e2f6d2ecd 100644 --- a/llvm/test/tools/llvm-readobj/ELF/symbol-shndx.test +++ b/llvm/test/tools/llvm-readobj/ELF/symbol-shndx.test @@ -57,29 +57,88 @@ Sections: Link: .symtab Entries: [ 0, 0, 0, 0, 0, 0, 0, 0, 1 ] Symbols: - - Name: undef + - Name: [[NAME=undef]] Binding: STB_GLOBAL - - Name: normal + Type: [[TYPE=STT_NOTYPE]] + - Name: [[NAME=normal]] Section: .text Binding: STB_GLOBAL - - Name: common + Type: [[TYPE=STT_NOTYPE]] + - Name: [[NAME=common]] Index: SHN_COMMON Binding: STB_GLOBAL - - Name: absolute + Type: [[TYPE=STT_NOTYPE]] + - Name: [[NAME=absolute]] Index: SHN_ABS Binding: STB_GLOBAL - - Name: proc + Type: [[TYPE=STT_NOTYPE]] + - Name: [[NAME=proc]] Index: 0xff01 Binding: STB_GLOBAL - - Name: os + Type: [[TYPE=STT_NOTYPE]] + - Name: [[NAME=os]] Index: 0xff21 Binding: STB_GLOBAL - - Name: reserved + Type: [[TYPE=STT_NOTYPE]] + - Name: [[NAME=reserved]] Index: 0xfffe Binding: STB_GLOBAL - - Name: xindex + Type: [[TYPE=STT_NOTYPE]] + - Name: [[NAME=xindex]] Index: SHN_XINDEX Binding: STB_GLOBAL + Type: [[TYPE=STT_NOTYPE]] + +## Check the behavior for section symbols. +# RUN: yaml2obj --docnum=1 -DTYPE=STT_SECTION %s -o %t1-sec +# RUN: llvm-readobj --symbols %t1-sec | FileCheck %s --check-prefix=LLVM1 +# RUN: llvm-readelf --symbols %t1-sec | FileCheck %s --check-prefix=GNU1 + +## Check the behavior for unnamed section symbols.
+## TODO: we should print proper symbol names instead of descriptions. +# RUN: yaml2obj --docnum=1 -DTYPE=STT_SECTION -DNAME="''" %s -o %t1-sec-unnamed +# RUN: llvm-readobj --symbols %t1-sec-unnamed | FileCheck %s --check-prefix=LLVM1-SEC-SYMS +# RUN: llvm-readelf --symbols %t1-sec-unnamed | FileCheck %s --check-prefix=GNU1-SEC-SYMS + +# LLVM1-SEC-SYMS: Symbols [ +# LLVM1-SEC-SYMS-NEXT: Symbol { +# LLVM1-SEC-SYMS-NEXT: Name: (0) +# LLVM1-SEC-SYMS: Section: Undefined (0x0) +# LLVM1-SEC-SYMS: Symbol { +# LLVM1-SEC-SYMS-NEXT: Name: Undefined (0) +# LLVM1-SEC-SYMS: Section: Undefined (0x0) +# LLVM1-SEC-SYMS: Symbol { +# LLVM1-SEC-SYMS-NEXT: Name: .text (0) +# LLVM1-SEC-SYMS: Section: .text (0x1) +# LLVM1-SEC-SYMS: Symbol { +# LLVM1-SEC-SYMS-NEXT: Name: Common (0) +# LLVM1-SEC-SYMS: Section: Common (0xFFF2) +# LLVM1-SEC-SYMS: Symbol { +# LLVM1-SEC-SYMS-NEXT: Name: Absolute (0) +# LLVM1-SEC-SYMS: Section: Absolute (0xFFF1) +# LLVM1-SEC-SYMS: Symbol { +# LLVM1-SEC-SYMS-NEXT: Name: Processor Specific (0) +# LLVM1-SEC-SYMS: Section: Processor Specific (0xFF01) +# LLVM1-SEC-SYMS: Symbol { +# LLVM1-SEC-SYMS-NEXT: Name: Operating System Specific (0) +# LLVM1-SEC-SYMS: Section: Operating System Specific (0xFF21) +# LLVM1-SEC-SYMS: Symbol { +# LLVM1-SEC-SYMS-NEXT: Name: Reserved (0) +# LLVM1-SEC-SYMS: Section: Reserved (0xFFFE) +# LLVM1-SEC-SYMS: Symbol { +# LLVM1-SEC-SYMS-NEXT: Name: .text (0) +# LLVM1-SEC-SYMS: Section: .text (0x1) + +# GNU1-SEC-SYMS: Num: {{.*}} Ndx Name +# GNU1-SEC-SYMS-NEXT: 0: {{.*}} UND +# GNU1-SEC-SYMS-NEXT: 1: {{.*}} UND Undefined +# GNU1-SEC-SYMS-NEXT: 2: {{.*}} 1 .text +# GNU1-SEC-SYMS-NEXT: 3: {{.*}} COM Common +# GNU1-SEC-SYMS-NEXT: 4: {{.*}} ABS Absolute +# GNU1-SEC-SYMS-NEXT: 5: {{.*}} PRC[0xff01] Processor Specific +# GNU1-SEC-SYMS-NEXT: 6: {{.*}} OS[0xff21] Operating System Specific +# GNU1-SEC-SYMS-NEXT: 7: {{.*}} RSV[0xfffe] Reserved +# GNU1-SEC-SYMS-NEXT: 8: {{.*}} 1 .text ## In this case, the index does not correspond to a real section. 
Check that GNU ## style just prints the section index as normal and LLVM style prints a warning diff --git a/llvm/test/tools/llvm-symbolizer/sym-verbose.test b/llvm/test/tools/llvm-symbolizer/sym-verbose.test index c12eb3b530e1b..1529290379093 100644 --- a/llvm/test/tools/llvm-symbolizer/sym-verbose.test +++ b/llvm/test/tools/llvm-symbolizer/sym-verbose.test @@ -18,11 +18,13 @@ RUN: llvm-symbolizer -verbose -print-address -obj=%p/Inputs/discrim < %p/Inputs/ #CHECK: 0x400590 #CHECK-NEXT: foo #CHECK-NEXT: Filename: /tmp{{[\\/]}}discrim.c +#CHECK-NEXT: Function start filename: /tmp{{[\\/]}}discrim.c #CHECK-NEXT: Function start line: 4 #CHECK-NEXT: Line: 5 #CHECK-NEXT: Column: 7 #CHECK-NEXT: main #CHECK-NEXT: Filename: /tmp{{[\\/]}}discrim.c +#CHECK-NEXT: Function start filename: /tmp{{[\\/]}}discrim.c #CHECK-NEXT: Function start line: 9 #CHECK-NEXT: Line: 10 #CHECK-NEXT: Column: 0 @@ -30,12 +32,14 @@ RUN: llvm-symbolizer -verbose -print-address -obj=%p/Inputs/discrim < %p/Inputs/ #CHECK: 0x4005a5 #CHECK-NEXT: foo #CHECK-NEXT: Filename: /tmp{{[\\/]}}discrim.c +#CHECK-NEXT: Function start filename: /tmp{{[\\/]}}discrim.c #CHECK-NEXT: Function start line: 4 #CHECK-NEXT: Line: 5 #CHECK-NEXT: Column: 17 #CHECK-NEXT: Discriminator: 2 #CHECK-NEXT: main #CHECK-NEXT: Filename: /tmp{{[\\/]}}discrim.c +#CHECK-NEXT: Function start filename: /tmp{{[\\/]}}discrim.c #CHECK-NEXT: Function start line: 9 #CHECK-NEXT: Line: 10 #CHECK-NEXT: Column: 0 @@ -43,12 +47,14 @@ RUN: llvm-symbolizer -verbose -print-address -obj=%p/Inputs/discrim < %p/Inputs/ #CHECK: 0x4005ad #CHECK-NEXT: foo #CHECK-NEXT: Filename: /tmp{{[\\/]}}discrim.c +#CHECK-NEXT: Function start filename: /tmp{{[\\/]}}discrim.c #CHECK-NEXT: Function start line: 4 #CHECK-NEXT: Line: 0 #CHECK-NEXT: Column: 30 #CHECK-NEXT: Discriminator: 4 #CHECK-NEXT: main #CHECK-NEXT: Filename: /tmp{{[\\/]}}discrim.c +#CHECK-NEXT: Function start filename: /tmp{{[\\/]}}discrim.c #CHECK-NEXT: Function start line: 9 #CHECK-NEXT: Line: 10 #CHECK-NEXT: Column: 0 @@ -56,11 +62,13 @@ RUN: llvm-symbolizer -verbose -print-address -obj=%p/Inputs/discrim < %p/Inputs/ #CHECK: 0x4005b9 #CHECK-NEXT: foo #CHECK-NEXT: Filename: /tmp{{[\\/]}}discrim.c +#CHECK-NEXT: Function start filename: /tmp{{[\\/]}}discrim.c #CHECK-NEXT: Function start line: 4 #CHECK-NEXT: Line: 5 #CHECK-NEXT: Column: 7 #CHECK-NEXT: main #CHECK-NEXT: Filename: /tmp{{[\\/]}}discrim.c +#CHECK-NEXT: Function start filename: /tmp{{[\\/]}}discrim.c #CHECK-NEXT: Function start line: 9 #CHECK-NEXT: Line: 10 #CHECK-NEXT: Column: 0 @@ -69,12 +77,14 @@ RUN: llvm-symbolizer -verbose -print-address -obj=%p/Inputs/discrim < %p/Inputs/ #CHECK: 0x4005ce #CHECK-NEXT: foo #CHECK-NEXT: Filename: /tmp{{[\\/]}}discrim.c +#CHECK-NEXT: Function start filename: /tmp{{[\\/]}}discrim.c #CHECK-NEXT: Function start line: 4 #CHECK-NEXT: Line: 5 #CHECK-NEXT: Column: 17 #CHECK-NEXT: Discriminator: 2 #CHECK-NEXT: main #CHECK-NEXT: Filename: /tmp{{[\\/]}}discrim.c +#CHECK-NEXT: Function start filename: /tmp{{[\\/]}}discrim.c #CHECK-NEXT: Function start line: 9 #CHECK-NEXT: Line: 10 #CHECK-NEXT: Column: 0 @@ -83,12 +93,14 @@ RUN: llvm-symbolizer -verbose -print-address -obj=%p/Inputs/discrim < %p/Inputs/ #CHECK: 0x4005d4 #CHECK-NEXT: foo #CHECK-NEXT: Filename: /tmp{{[\\/]}}discrim.c +#CHECK-NEXT: Function start filename: /tmp{{[\\/]}}discrim.c #CHECK-NEXT: Function start line: 4 #CHECK-NEXT: Line: 5 #CHECK-NEXT: Column: 30 #CHECK-NEXT: Discriminator: 4 #CHECK-NEXT: main #CHECK-NEXT: Filename: /tmp{{[\\/]}}discrim.c +#CHECK-NEXT: Function start 
filename: /tmp{{[\\/]}}discrim.c #CHECK-NEXT: Function start line: 9 #CHECK-NEXT: Line: 10 #CHECK-NEXT: Column: 0 diff --git a/llvm/test/tools/obj2yaml/ELF/DWARF/debug-addr.yaml b/llvm/test/tools/obj2yaml/ELF/DWARF/debug-addr.yaml new file mode 100644 index 0000000000000..b294adff5cbd7 --- /dev/null +++ b/llvm/test/tools/obj2yaml/ELF/DWARF/debug-addr.yaml @@ -0,0 +1,215 @@ +## Test how we dump the .debug_addr section. + +## a) Dumping address tables from various object files. + +## Dumping address tables from a little endian 64-bit object file. +# RUN: yaml2obj --docnum=1 %s -DADDRESS=0xFFFFFFFFFFFFFFFF \ +# RUN: -DADDRSIZE=4 | obj2yaml | \ +# RUN: FileCheck %s --check-prefix=BASIC --implicit-check-not=Sections: \ +# RUN: -DLENGTH1=0x0000000000000014 \ +# RUN: -DADDRSIZE1=0x08 \ +# RUN: -DADDR=0xFFFFFFFFFFFFFFFF \ +# RUN: -DLENGTH2=0x000000000000000C \ +# RUN: -DADDRSIZE2=0x04 + +## Dumping address tables from a big endian 64-bit object file. +# RUN: yaml2obj --docnum=1 %s -DENDIAN=MSB -DADDRESS=0xFFFFFFFFFFFFFFFF \ +# RUN: -DADDRSIZE=4 | obj2yaml | \ +# RUN: FileCheck %s --check-prefix=BASIC --implicit-check-not=Sections: \ +# RUN: -DLENGTH1=0x0000000000000014 \ +# RUN: -DADDRSIZE1=0x08 \ +# RUN: -DADDR=0xFFFFFFFFFFFFFFFF \ +# RUN: -DLENGTH2=0x000000000000000C \ +# RUN: -DADDRSIZE2=0x04 + +## Dumping address tables from a little endian 32-bit object file. +# RUN: yaml2obj --docnum=1 %s -DBITS=32 -DADDRESS=0xFFFFFFFF \ +# RUN: -DADDRSIZE=8 | obj2yaml | \ +# RUN: FileCheck %s --check-prefix=BASIC --implicit-check-not=Sections: \ +# RUN: -DLENGTH1=0x000000000000000C \ +# RUN: -DADDRSIZE1=0x04 \ +# RUN: -DADDR=0x00000000FFFFFFFF \ +# RUN: -DLENGTH2=0x0000000000000014 \ +# RUN: -DADDRSIZE2=0x08 + +## Dumping address tables from a big endian 32-bit object file. +# RUN: yaml2obj --docnum=1 %s -DBITS=32 -DENDIAN=MSB -DADDRESS=0xFFFFFFFF \ +# RUN: -DADDRSIZE=8 | obj2yaml | \ +# RUN: FileCheck %s --check-prefix=BASIC --implicit-check-not=Sections: \ +# RUN: -DLENGTH1=0x000000000000000C \ +# RUN: -DADDRSIZE1=0x04 \ +# RUN: -DADDR=0x00000000FFFFFFFF \ +# RUN: -DLENGTH2=0x0000000000000014 \ +# RUN: -DADDRSIZE2=0x08 + +# BASIC: DWARF: +# BASIC-NEXT: debug_addr: +# BASIC-NEXT: - Length: [[LENGTH1]] +# BASIC-NEXT: Version: 0x0005 +# BASIC-NEXT: AddressSize: [[ADDRSIZE1]] +# BASIC-NEXT: Entries: +# BASIC-NEXT: - Address: 0x0000000000001234 +# BASIC-NEXT: - Address: 0x0000000000005678 +# BASIC-NEXT: - Format: DWARF64 +# BASIC-NEXT: Length: [[LENGTH1]] +# BASIC-NEXT: Version: 0x0005 +# BASIC-NEXT: AddressSize: [[ADDRSIZE1]] +# BASIC-NEXT: Entries: +# BASIC-NEXT: - Address: 0x0000000000001234 +# BASIC-NEXT: - Address: [[ADDR]] +# BASIC-NEXT: - Length: [[LENGTH2]] +# BASIC-NEXT: Version: 0x0005 +# BASIC-NEXT: AddressSize: [[ADDRSIZE2]] +# BASIC-NEXT: Entries: +# BASIC-NEXT: - Address: 0x0000000000001234 +# BASIC-NEXT: - Address: 0x0000000000005678 +# BASIC-NEXT: - Format: DWARF64 +# BASIC-NEXT: Length: [[LENGTH2]] +# BASIC-NEXT: Version: 0x0005 +# BASIC-NEXT: AddressSize: [[ADDRSIZE2]] +# BASIC-NEXT: Entries: +# BASIC-NEXT: - Address: 0x0000000000001234 +# BASIC-NEXT: - Address: 0x0000000000005678 +# BASIC-NEXT: ... + +--- !ELF +FileHeader: + Class: ELFCLASS[[BITS=64]] + Data: ELFDATA2[[ENDIAN=LSB]] + Type: ET_EXEC +DWARF: + debug_addr: + ## A DWARF32 address table. + - Version: 5 + Entries: + - Address: 0x1234 + - Address: 0x5678 + ## A DWARF64 address table. + - Format: DWARF64 + Version: 5 + Entries: + - Address: 0x1234 + - Address: [[ADDRESS]] + ## A DWARF32 address table with a mutable address size. 
+ - Version: 5 + AddressSize: [[ADDRSIZE]] + Entries: + - Address: 0x1234 + - Address: 0x5678 + ## A DWARF64 address table with a mutable address size. + - Format: DWARF64 + Version: 5 + AddressSize: [[ADDRSIZE]] + Entries: + - Address: 0x1234 + - Address: 0x5678 + +## b) Test dumping a .debug_addr section whose section header properties are +## overridden. + +## Override the sh_type field. +# RUN: yaml2obj --docnum=2 %s -DTYPE=SHT_STRTAB | obj2yaml | \ +# RUN: FileCheck %s -DTYPE=SHT_STRTAB --check-prefix=COMMON + +## Override the sh_flags field. +# RUN: yaml2obj --docnum=2 %s -DFLAGS='[ SHF_ALLOC ]' | obj2yaml | \ +# RUN: FileCheck %s -DTYPE=SHT_PROGBITS --check-prefixes=COMMON,FLAGS + +## Override the sh_link field. +# RUN: yaml2obj --docnum=2 %s -DLINK=.sec | obj2yaml | \ +# RUN: FileCheck %s -DTYPE=SHT_PROGBITS --check-prefixes=COMMON,LINK + +## Override the sh_addr field. +# RUN: yaml2obj --docnum=2 %s -DADDRESS=0x2020 | obj2yaml | \ +# RUN: FileCheck %s -DTYPE=SHT_PROGBITS --check-prefixes=COMMON,ADDR + +## Override the sh_addralign field. +# RUN: yaml2obj --docnum=2 %s -DADDRALIGN=3 | obj2yaml | \ +# RUN: FileCheck %s -DTYPE=SHT_PROGBITS --check-prefixes=COMMON,ADDRALIGN + +## Override the sh_entsize field. +# RUN: yaml2obj --docnum=2 %s -DENTSIZE=3 | obj2yaml | \ +# RUN: FileCheck %s -DTYPE=SHT_PROGBITS --check-prefixes=COMMON,ENTSIZE + +## Override the sh_info field. +# RUN: yaml2obj --docnum=2 %s -DINFO=3 | obj2yaml | \ +# RUN: FileCheck %s -DTYPE=SHT_PROGBITS --check-prefixes=COMMON,INFO + +# COMMON: Sections: +# COMMON-NEXT: - Name: .debug_addr +# COMMON-NEXT: Type: [[TYPE]] +# FLAGS-NEXT: Flags: [ SHF_ALLOC ] +# LINK-NEXT: Link: .sec +# ADDR-NEXT: Address: 0x0000000000002020 +# ADDRALIGN-NEXT: AddressAlign: 0x0000000000000003 +# ENTSIZE-NEXT: EntSize: 0x0000000000000003 +# INFO-NEXT: Info: 0x0000000000000003 +# COMMON-NEXT: - Name: .sec +# COMMON-NEXT: Type: SHT_PROGBITS +# COMMON-NEXT: DWARF: +# COMMON-NEXT: debug_addr: +# COMMON-NEXT: - Length: 0x0000000000000014 +# COMMON-NEXT: Version: 0x0005 +# COMMON-NEXT: AddressSize: 0x08 +# COMMON-NEXT: Entries: +# COMMON-NEXT: - Address: 0x0000000000001234 +# COMMON-NEXT: - Address: 0x0000000000005678 +# COMMON-NEXT: ... + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +Sections: + - Name: .debug_addr + Type: [[TYPE=SHT_PROGBITS]] + Flags: [[FLAGS=]] + Link: [[LINK='']] + EntSize: [[ENTSIZE=]] + Info: [[INFO=]] + AddressAlign: [[ADDRALIGN=0]] + Address: [[ADDRESS=]] + - Name: .sec + Type: SHT_PROGBITS +DWARF: + debug_addr: + - Version: 5 + Entries: + - Address: 0x1234 + - Address: 0x5678 + +## c) Test dumping an address table whose version isn't 5. +## This causes the DWARF parser to fail to parse it and we will dump it as a raw +## content section. + +# RUN: yaml2obj --docnum=3 %s -DCONTENT="AABBCC" | obj2yaml | \ +# RUN: FileCheck %s --check-prefix=RAW --implicit-check-not=DWARF: + +# RAW: Sections: +# RAW-NEXT: - Name: .debug_addr +# RAW-NEXT: Type: SHT_PROGBITS +# RAW-NEXT: AddressAlign: 0x0000000000000001 +# RAW-NEXT: Content: AABBCC +# RAW-NEXT: ... + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +Sections: + - Name: .debug_addr + Type: SHT_PROGBITS + AddressAlign: 1 + Size: [[SIZE=]] + Content: [[CONTENT=]] + +## d) Test dumping an empty .debug_addr section. + +# RUN: yaml2obj --docnum=3 %s -DSIZE=0 | obj2yaml | \ +# RUN: FileCheck %s --check-prefix=EMPTY --implicit-check-not=Sections: + +# EMPTY: DWARF: +# EMPTY-NEXT: debug_addr: [] +# EMPTY-NEXT: ... 
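A minimal round-trip for the new debug_addr support, as a sketch (addr.yaml and addr.o are illustrative names, not files from this patch): yaml2obj synthesizes the section from the description below, and obj2yaml is expected to print the same description back, computing the omitted Length from the entries and taking the omitted AddressSize from the file's class.

# RUN: yaml2obj addr.yaml -o addr.o
# RUN: obj2yaml addr.o

--- !ELF
FileHeader:
  Class: ELFCLASS64
  Data:  ELFDATA2LSB
  Type:  ET_EXEC
DWARF:
  debug_addr:
    - Version: 5
      Entries:
        - Address: 0x1234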
diff --git a/llvm/test/tools/obj2yaml/ELF/DWARF/debug-ranges.yaml b/llvm/test/tools/obj2yaml/ELF/DWARF/debug-ranges.yaml new file mode 100644 index 0000000000000..0e3fbae130711 --- /dev/null +++ b/llvm/test/tools/obj2yaml/ELF/DWARF/debug-ranges.yaml @@ -0,0 +1,233 @@ +## Test how we dump the .debug_ranges section. + +## a) Test dumping the .debug_ranges section from various object files with +## different endian and bits. + +## Dump the .debug_ranges section from a 32-bit little endian object file where +## the address_size of debug_info is 4. +# RUN: yaml2obj --docnum=1 %s -DBITS=32 -DLOWOFFSET=0xFFFFFFFF \ +# RUN: -DHIGHOFFSET=0x10 | obj2yaml | \ +# RUN: FileCheck %s --check-prefix=BASIC -DADDRSIZE=0x04 \ +# RUN: -DOFFSET=0x0000000000000018 -DLOWOFFSET=0x00000000FFFFFFFF \ +# RUN: -DHIGHOFFSET=0x0000000000000010 + +## Dump the .debug_ranges section from a 32-bit big endian object file where the +## address_size of debug_info is 4. +# RUN: yaml2obj --docnum=1 %s -DBITS=32 -DENDIAN=MSB -DLOWOFFSET=0xFFFFFFFF \ +# RUN: -DHIGHOFFSET=0x10 | obj2yaml | \ +# RUN: FileCheck %s --check-prefix=BASIC -DADDRSIZE=0x04 \ +# RUN: -DOFFSET=0x0000000000000018 -DLOWOFFSET=0x00000000FFFFFFFF \ +# RUN: -DHIGHOFFSET=0x0000000000000010 + +## Dump the .debug_ranges section from a 32-bit little endian object file where +## the address_size of debug_info is 8. +# RUN: yaml2obj --docnum=1 %s -DBITS=32 -DADDRSIZE1=8 \ +# RUN: -DADDRSIZE2=8 -DADDRSIZE3=8 -DADDRSIZE4=8 \ +# RUN: -DLOWOFFSET=0xFFFFFFFFFFFFFFFF -DHIGHOFFSET=0x10 | obj2yaml | \ +# RUN: FileCheck %s --check-prefix=BASIC -DADDRSIZE=0x08 \ +# RUN: -DOFFSET=0x0000000000000030 -DLOWOFFSET=0xFFFFFFFFFFFFFFFF \ +# RUN: -DHIGHOFFSET=0x0000000000000010 + +## Dump the .debug_ranges section from a 32-bit big endian object file where the +## address_size of debug_info is 8. +# RUN: yaml2obj --docnum=1 %s -DBITS=32 -DENDIAN=MSB -DADDRSIZE1=8 \ +# RUN: -DADDRSIZE2=8 -DADDRSIZE3=8 -DADDRSIZE4=8 \ +# RUN: -DLOWOFFSET=0xFFFFFFFFFFFFFFFF -DHIGHOFFSET=0x10 | obj2yaml | \ +# RUN: FileCheck %s --check-prefix=BASIC -DADDRSIZE=0x08 \ +# RUN: -DOFFSET=0x0000000000000030 -DLOWOFFSET=0xFFFFFFFFFFFFFFFF \ +# RUN: -DHIGHOFFSET=0x0000000000000010 + +## Dump the .debug_ranges section from a 64-bit little endian object file where +## the address_size of debug_info is 8. +# RUN: yaml2obj --docnum=1 %s -DLOWOFFSET=0xFFFFFFFFFFFFFFFF \ +# RUN: -DHIGHOFFSET=0x10 | obj2yaml | \ +# RUN: FileCheck %s --check-prefix=BASIC -DADDRSIZE=0x08 \ +# RUN: -DOFFSET=0x0000000000000030 -DLOWOFFSET=0xFFFFFFFFFFFFFFFF \ +# RUN: -DHIGHOFFSET=0x0000000000000010 + +## Dump the .debug_ranges section from a 64-bit big endian object file where the +## address_size of debug_info is 8. +# RUN: yaml2obj --docnum=1 %s -DENDIAN=MSB -DLOWOFFSET=0xFFFFFFFFFFFFFFFF \ +# RUN: -DHIGHOFFSET=0x10 | obj2yaml | \ +# RUN: FileCheck %s --check-prefix=BASIC -DADDRSIZE=0x08 \ +# RUN: -DOFFSET=0x0000000000000030 -DLOWOFFSET=0xFFFFFFFFFFFFFFFF \ +# RUN: -DHIGHOFFSET=0x0000000000000010 + +## Dump the .debug_ranges section from a 64-bit little endian object file where +## the address_size of debug_info is 4. 
+# RUN: yaml2obj --docnum=1 %s -DADDRSIZE1=4 -DADDRSIZE2=4 -DADDRSIZE3=4 \ +# RUN: -DADDRSIZE4=4 -DLOWOFFSET=0xFFFFFFFFFFFFFFFF \ +# RUN: -DHIGHOFFSET=0x10 | obj2yaml | \ +# RUN: FileCheck %s --check-prefix=BASIC -DADDRSIZE=0x04 \ +# RUN: -DOFFSET=0x0000000000000018 -DLOWOFFSET=0x00000000FFFFFFFF \ +# RUN: -DHIGHOFFSET=0x0000000000000010 + +## Dump the .debug_ranges section from a 64-bit big endian object file where the +## address_size of debug_info is 4. +# RUN: yaml2obj --docnum=1 %s -DADDRSIZE1=4 -DADDRSIZE2=4 -DADDRSIZE3=4 \ +# RUN: -DADDRSIZE4=4 -DLOWOFFSET=0xFFFFFFFFFFFFFFFF \ +# RUN: -DHIGHOFFSET=0x10 | obj2yaml | \ +# RUN: FileCheck %s --check-prefix=BASIC -DADDRSIZE=0x04 \ +# RUN: -DOFFSET=0x0000000000000018 -DLOWOFFSET=0x00000000FFFFFFFF \ +# RUN: -DHIGHOFFSET=0x0000000000000010 + + +# BASIC-NOT: debug_ranges +# BASIC: debug_ranges: +# BASIC-NEXT: - Offset: 0x0000000000000000 +# BASIC-NEXT: AddrSize: [[ADDRSIZE]] +# BASIC-NEXT: Entries: +# BASIC-NEXT: - LowOffset: 0x0000000000000010 +# BASIC-NEXT: HighOffset: 0x0000000000000020 +# BASIC-NEXT: - LowOffset: 0x0000000000000030 +# BASIC-NEXT: HighOffset: 0x0000000000000040 +# BASIC-NEXT: - Offset: [[OFFSET]] +# BASIC-NEXT: AddrSize: [[ADDRSIZE]] +# BASIC-NEXT: Entries: +# BASIC-NEXT: - LowOffset: [[LOWOFFSET]] +# BASIC-NEXT: HighOffset: [[HIGHOFFSET]] + +--- !ELF +FileHeader: + Class: ELFCLASS[[BITS=64]] + Data: ELFDATA2[[ENDIAN=LSB]] + Type: ET_EXEC +DWARF: + ## The debug_ranges parser depends on the address_size field + ## of compilation units. We add the .debug_info section to + ## assist the parser. + debug_info: + - Version: 4 + AddrSize: [[ADDRSIZE1=]] + - Version: 4 + AddrSize: [[ADDRSIZE2=]] + debug_ranges: + - AddrSize: [[ADDRSIZE3=]] + Entries: + - LowOffset: 0x10 + HighOffset: 0x20 + - LowOffset: 0x30 + HighOffset: 0x40 + - AddrSize: [[ADDRSIZE4=]] + Entries: + - LowOffset: [[LOWOFFSET=0x10]] + HighOffset: [[HIGHOFFSET=0x20]] + +## b) Test that obj2yaml dumps the .debug_ranges as a raw content section when +## the parser fails. In this case, the address_size of the two compilation units +## doesn't match. + +# RUN: yaml2obj --docnum=1 -DADDRSIZE1=4 -DADDRSIZE2=8 %s | obj2yaml | \ +# RUN: FileCheck %s --check-prefix=RAW --implicit-check-not=debug_ranges + +# RAW: - Name: .debug_ranges +# RAW-NEXT: Type: SHT_PROGBITS +# RAW-NEXT: AddressAlign: 0x0000000000000001 +# RAW-NEXT: Content: '1000000000000000 +## ^--------------- LowOffset +# RAW-SAME: {{^}}2000000000000000 +## ^--------------- HighOffset +# RAW-SAME: {{^}}3000000000000000 +## ^--------------- LowOffset +# RAW-SAME: {{^}}4000000000000000 +## ^--------------- HighOffset +# RAW-SAME: {{^}}0000000000000000 +## ^--------------- +# RAW-SAME: {{^}}0000000000000000 +## ---------------- terminator +# RAW-SAME: {{^}}1000000000000000 +## ^--------------- LowOffset +# RAW-SAME: {{^}}2000000000000000 +## ^--------------- HighOffset +# RAW-SAME: {{^}}0000000000000000 +## ^--------------- +# RAW-SAME: {{^}}0000000000000000' +## ---------------- terminator + +## c) Test dumping an empty .debug_ranges section. + +# RUN: yaml2obj --docnum=2 %s | obj2yaml | \ +# RUN: FileCheck %s --check-prefix=EMPTY --implicit-check-not=Sections: + +# EMPTY: DWARF: +# EMPTY-NEXT: debug_ranges: [] + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_ranges: [] + +## d) Test dumping a .debug_ranges section whose section header properties are +## overridden. + +## Override the sh_type field. 
+# RUN: yaml2obj --docnum=3 -DTYPE=SHT_STRTAB %s | obj2yaml | \ +# RUN: FileCheck %s -DTYPE=STRTAB --check-prefixes=COMMON + +## Override the sh_flags field. +# RUN: yaml2obj --docnum=3 -DFLAGS=[SHF_ALLOC] %s | obj2yaml | \ +# RUN: FileCheck %s -DTYPE=PROGBITS --check-prefixes=COMMON,FLAGS + +## Override the sh_link field. +# RUN: yaml2obj --docnum=3 -DLINK='.sec' %s | obj2yaml | \ +# RUN: FileCheck %s -DTYPE=PROGBITS --check-prefixes=COMMON,LINK + +## Override the sh_entsize field. +# RUN: yaml2obj --docnum=3 -DENTSIZE=3 %s | obj2yaml | \ +# RUN: FileCheck %s -DTYPE=PROGBITS --check-prefixes=COMMON,ENTSIZE + +## Override the sh_info field. +# RUN: yaml2obj --docnum=3 -DINFO=3 %s | obj2yaml | \ +# RUN: FileCheck %s -DTYPE=PROGBITS --check-prefixes=COMMON,INFO + +## Override the sh_addralign field. +# RUN: yaml2obj --docnum=3 -DADDRALIGN=3 %s | obj2yaml | \ +# RUN: FileCheck %s -DTYPE=PROGBITS --check-prefixes=COMMON,ADDRALIGN + +## Override the sh_address field. +# RUN: yaml2obj --docnum=3 -DADDRESS=0x2020 %s | obj2yaml | \ +# RUN: FileCheck %s -DTYPE=PROGBITS --check-prefixes=COMMON,ADDRESS + +# COMMON: - Name: .debug_ranges +# COMMON-NEXT: Type: SHT_[[TYPE]] +# FLAGS-NEXT: Flags: [ SHF_ALLOC ] +# LINK-NEXT: Link: .sec +# ENTSIZE-NEXT: EntSize: 0x0000000000000003 +# INFO-NEXT: Info: 0x0000000000000003 +# ADDRALIGN-NEXT: AddressAlign: 0x0000000000000003 +# ADDRESS-NEXT: Address: 0x0000000000002020 + +# COMMON: debug_ranges: +# COMMON-NEXT: - Offset: 0x0000000000000000 +# COMMON-NEXT: AddrSize: 0x08 +# COMMON-NEXT: Entries: +# COMMON-NEXT: - LowOffset: 0x0000000000000010 +# COMMON-NEXT: HighOffset: 0x0000000000000020 + --- !ELF FileHeader: Class: ELFCLASS64 Data: ELFDATA2LSB Type: ET_EXEC Sections: - Name: .debug_ranges Type: [[TYPE=SHT_PROGBITS]] Flags: [[FLAGS=]] Link: [[LINK='']] EntSize: [[ENTSIZE=]] Info: [[INFO=]] AddressAlign: [[ADDRALIGN=0]] Address: [[ADDRESS=]] - Name: .sec Type: SHT_PROGBITS DWARF: debug_info: - Version: 4 AddrSize: 8 debug_ranges: - Entries: - LowOffset: 0x10 HighOffset: 0x20 diff --git a/llvm/test/tools/obj2yaml/ELF/DWARF/debug-str.yaml b/llvm/test/tools/obj2yaml/ELF/DWARF/debug-str.yaml index e058642877243..76c1c5c1b3650 100644 --- a/llvm/test/tools/obj2yaml/ELF/DWARF/debug-str.yaml +++ b/llvm/test/tools/obj2yaml/ELF/DWARF/debug-str.yaml @@ -99,3 +99,27 @@ FileHeader: Type: ET_EXEC DWARF: debug_str: [] + +## d) Test that obj2yaml stops parsing the .debug_str section if it encounters a +## string without a null terminator. The output uses a raw content section instead of +## the DWARF tag to represent the broken .debug_str section. + +# RUN: yaml2obj --docnum=3 %s | obj2yaml | FileCheck %s --check-prefix=NO-TERMINATOR + +# NO-TERMINATOR-NOT: DWARF: +# NO-TERMINATOR: Sections: +# NO-TERMINATOR-NEXT: - Name: .debug_str +# NO-TERMINATOR-NEXT: Type: SHT_PROGBITS +# NO-TERMINATOR-NEXT: Flags: [ SHF_MERGE, SHF_STRINGS ] +# NO-TERMINATOR-NEXT: Content: '61626300616263' +# NO-TERMINATOR-NEXT: ...
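## An illustrative aside (assumed example, not part of the patch): the bytes
## '61626300616263' decode to "abc\0abc", so the trailing "abc" has no NUL
## terminator, which is what forces the raw Content fallback checked above.
## A fully terminated counterpart such as Content: '6162630061626300'
## ("abc\0abc\0") would instead be dumped through the DWARF tag, roughly as:
##   DWARF:
##     debug_str:
##       - abc
##       - abc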
+ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +Sections: + - Name: .debug_str + Type: SHT_PROGBITS + Content: "61626300616263" ## "abc\0abc" diff --git a/llvm/test/tools/obj2yaml/ELF/DWARF/unrecognized-debug-section.yaml b/llvm/test/tools/obj2yaml/ELF/DWARF/unrecognized-debug-section.yaml new file mode 100644 index 0000000000000..618ac3592b6df --- /dev/null +++ b/llvm/test/tools/obj2yaml/ELF/DWARF/unrecognized-debug-section.yaml @@ -0,0 +1,19 @@ +## Test dumping a debug section when its name is not recognized by obj2yaml. + +# RUN: yaml2obj %s | obj2yaml | FileCheck %s + +# CHECK: Sections: +# CHECK-NEXT: - Name: .debug_foo +# CHECK-NEXT: Type: SHT_PROGBITS +# CHECK-NEXT: Content: '01020304' +# CHECK-NEXT: ... + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +Sections: + - Name: .debug_foo + Type: SHT_PROGBITS + Content: '01020304' diff --git a/llvm/test/tools/obj2yaml/ELF/call-graph-profile-section.yaml b/llvm/test/tools/obj2yaml/ELF/call-graph-profile-section.yaml index bc8b631beea83..2e3fcd98065be 100644 --- a/llvm/test/tools/obj2yaml/ELF/call-graph-profile-section.yaml +++ b/llvm/test/tools/obj2yaml/ELF/call-graph-profile-section.yaml @@ -51,10 +51,9 @@ Symbols: # INVALID: --- !ELF # INVALID-NEXT: FileHeader: -# INVALID-NEXT: Class: ELFCLASS32 -# INVALID-NEXT: Data: ELFDATA2MSB -# INVALID-NEXT: Type: ET_DYN -# INVALID-NEXT: Machine: EM_NONE +# INVALID-NEXT: Class: ELFCLASS32 +# INVALID-NEXT: Data: ELFDATA2MSB +# INVALID-NEXT: Type: ET_DYN # INVALID-NEXT: Sections: # INVALID-NEXT: - Name: .empty # INVALID-NEXT: Type: SHT_LLVM_CALL_GRAPH_PROFILE diff --git a/llvm/test/tools/obj2yaml/ELF/duplicate-symbol-and-section-names.yaml b/llvm/test/tools/obj2yaml/ELF/duplicate-symbol-and-section-names.yaml index bea942327a5bb..9e6b8fca67ac4 100644 --- a/llvm/test/tools/obj2yaml/ELF/duplicate-symbol-and-section-names.yaml +++ b/llvm/test/tools/obj2yaml/ELF/duplicate-symbol-and-section-names.yaml @@ -24,10 +24,9 @@ # CASE1: --- !ELF # CASE1-NEXT: FileHeader: -# CASE1-NEXT: Class: ELFCLASS64 -# CASE1-NEXT: Data: ELFDATA2LSB -# CASE1-NEXT: Type: ET_REL -# CASE1-NEXT: Machine: EM_NONE +# CASE1-NEXT: Class: ELFCLASS64 +# CASE1-NEXT: Data: ELFDATA2LSB +# CASE1-NEXT: Type: ET_REL # CASE1-NEXT: Sections: # CASE1-NEXT: - Name: .foo # CASE1-NEXT: Type: SHT_PROGBITS diff --git a/llvm/test/tools/obj2yaml/ELF/emachine.yaml b/llvm/test/tools/obj2yaml/ELF/emachine.yaml index d351505aa2845..10d72bed87f4e 100644 --- a/llvm/test/tools/obj2yaml/ELF/emachine.yaml +++ b/llvm/test/tools/obj2yaml/ELF/emachine.yaml @@ -2,38 +2,36 @@ ## Check it dumps an unknown e_machine as a number. -# RUN: yaml2obj --docnum=1 %s -o %t1 -# RUN: obj2yaml %t1 | FileCheck %s --check-prefix=UNKNOWN +# RUN: yaml2obj -DMACHINE=0x1234 %s -o %t1 +# RUN: obj2yaml %t1 | FileCheck %s -DMACHINE=0x1234 -# UNKNOWN: --- !ELF -# UNKNOWN-NEXT: FileHeader: -# UNKNOWN-NEXT: Class: ELFCLASS64 -# UNKNOWN-NEXT: Data: ELFDATA2MSB -# UNKNOWN-NEXT: Type: ET_REL -# UNKNOWN-NEXT: Machine: 0x1234 +# CHECK: --- !ELF +# CHECK-NEXT: FileHeader: +# CHECK-NEXT: Class: ELFCLASS64 +# CHECK-NEXT: Data: ELFDATA2MSB +# CHECK-NEXT: Type: ET_REL +# CHECK-NEXT: Machine: [[MACHINE]] --- !ELF FileHeader: Class: ELFCLASS64 Data: ELFDATA2MSB Type: ET_REL - Machine: 0x1234 + Machine: [[MACHINE]] ## Check it dumps a known e_machine value as an enum string. 
-# RUN: yaml2obj --docnum=2 %s -o %t2 -# RUN: obj2yaml %t2 | FileCheck %s --check-prefix=KNOWN +# RUN: yaml2obj %s -DMACHINE=0x1 -o %t2 +# RUN: obj2yaml %t2 | FileCheck %s -DMACHINE=EM_M32 -# KNOWN: --- !ELF -# KNOWN-NEXT: FileHeader: -# KNOWN-NEXT: Class: ELFCLASS64 -# KNOWN-NEXT: Data: ELFDATA2MSB -# KNOWN-NEXT: Type: ET_REL -# KNOWN-NEXT: Machine: EM_NONE +## Check it doesn't dump e_machine when it is EM_NONE (0). ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2MSB - Type: ET_REL - Machine: 0 +# RUN: yaml2obj %s -DMACHINE=0x0 -o %t3 +# RUN: obj2yaml %t3 | FileCheck %s --check-prefix=DEFAULT + +# DEFAULT: --- !ELF +# DEFAULT-NEXT: FileHeader: +# DEFAULT-NEXT: Class: ELFCLASS64 +# DEFAULT-NEXT: Data: ELFDATA2MSB +# DEFAULT-NEXT: Type: ET_REL +# DEFAULT-NEXT: ... diff --git a/llvm/test/tools/obj2yaml/ELF/gnu-unique-symbols.yaml b/llvm/test/tools/obj2yaml/ELF/gnu-unique-symbols.yaml index 2668dad25fb4b..c34ab3e3fc0ad 100644 --- a/llvm/test/tools/obj2yaml/ELF/gnu-unique-symbols.yaml +++ b/llvm/test/tools/obj2yaml/ELF/gnu-unique-symbols.yaml @@ -5,11 +5,10 @@ # CHECK: --- !ELF # CHECK-NEXT: FileHeader: -# CHECK-NEXT: Class: ELFCLASS64 -# CHECK-NEXT: Data: ELFDATA2LSB -# CHECK-NEXT: OSABI: ELFOSABI_GNU -# CHECK-NEXT: Type: ET_REL -# CHECK-NEXT: Machine: EM_NONE +# CHECK-NEXT: Class: ELFCLASS64 +# CHECK-NEXT: Data: ELFDATA2LSB +# CHECK-NEXT: OSABI: ELFOSABI_GNU +# CHECK-NEXT: Type: ET_REL # CHECK-NEXT: Symbols: # CHECK-NEXT: - Name: foo # CHECK-NEXT: Type: STT_OBJECT diff --git a/llvm/test/tools/obj2yaml/ELF/implicit-sections-order.yaml b/llvm/test/tools/obj2yaml/ELF/implicit-sections-order.yaml index 502b8e62688b1..e400d00eb5418 100644 --- a/llvm/test/tools/obj2yaml/ELF/implicit-sections-order.yaml +++ b/llvm/test/tools/obj2yaml/ELF/implicit-sections-order.yaml @@ -34,10 +34,9 @@ # OUTPUT: --- !ELF # OUTPUT-NEXT: FileHeader: -# OUTPUT-NEXT: Class: ELFCLASS64 -# OUTPUT-NEXT: Data: ELFDATA2LSB -# OUTPUT-NEXT: Type: ET_DYN -# OUTPUT-NEXT: Machine: EM_NONE +# OUTPUT-NEXT: Class: ELFCLASS64 +# OUTPUT-NEXT: Data: ELFDATA2LSB +# OUTPUT-NEXT: Type: ET_DYN # OUTPUT-NEXT: Sections: # OUTPUT-NEXT: - Name: .foo.1 # OUTPUT-NEXT: Type: SHT_PROGBITS @@ -124,10 +123,9 @@ DynamicSymbols: ## SHT_STRTAB/SHT_SYMTAB/SHT_DYNSYM sections. 
# OUTPUT2: --- !ELF # OUTPUT2-NEXT: FileHeader: -# OUTPUT2-NEXT: Class: ELFCLASS64 -# OUTPUT2-NEXT: Data: ELFDATA2LSB -# OUTPUT2-NEXT: Type: ET_DYN -# OUTPUT2-NEXT: Machine: EM_NONE +# OUTPUT2-NEXT: Class: ELFCLASS64 +# OUTPUT2-NEXT: Data: ELFDATA2LSB +# OUTPUT2-NEXT: Type: ET_DYN # OUTPUT2-NEXT: Sections: # OUTPUT2-NEXT: - Name: .foo.1 # OUTPUT2-NEXT: Type: SHT_PROGBITS diff --git a/llvm/test/tools/obj2yaml/ELF/invalid-section-name.yaml b/llvm/test/tools/obj2yaml/ELF/invalid-section-name.yaml index 3f46563b980a5..40667b57a9749 100644 --- a/llvm/test/tools/obj2yaml/ELF/invalid-section-name.yaml +++ b/llvm/test/tools/obj2yaml/ELF/invalid-section-name.yaml @@ -8,10 +8,9 @@ # CHECK: --- !ELF # CHECK-NEXT: FileHeader: -# CHECK-NEXT: Class: ELFCLASS64 -# CHECK-NEXT: Data: ELFDATA2LSB -# CHECK-NEXT: Type: ET_REL -# CHECK-NEXT: Machine: EM_NONE +# CHECK-NEXT: Class: ELFCLASS64 +# CHECK-NEXT: Data: ELFDATA2LSB +# CHECK-NEXT: Type: ET_REL # CHECK-NEXT: Sections: # CHECK-NEXT: - Name: "{{.*}}" # CHECK-NEXT: Type: SHT_PROGBITS diff --git a/llvm/test/tools/obj2yaml/ELF/no-symtab.yaml b/llvm/test/tools/obj2yaml/ELF/no-symtab.yaml index 1566693339cda..8f9fb82856452 100644 --- a/llvm/test/tools/obj2yaml/ELF/no-symtab.yaml +++ b/llvm/test/tools/obj2yaml/ELF/no-symtab.yaml @@ -6,10 +6,9 @@ # NOSYMTAB: --- !ELF # NOSYMTAB-NEXT: FileHeader: -# NOSYMTAB-NEXT: Class: ELFCLASS64 -# NOSYMTAB-NEXT: Data: ELFDATA2LSB -# NOSYMTAB-NEXT: Type: ET_DYN -# NOSYMTAB-NEXT: Machine: EM_NONE +# NOSYMTAB-NEXT: Class: ELFCLASS64 +# NOSYMTAB-NEXT: Data: ELFDATA2LSB +# NOSYMTAB-NEXT: Type: ET_DYN # NOSYMTAB-NEXT: ... --- !ELF @@ -26,10 +25,9 @@ FileHeader: # SYMTAB: --- !ELF # SYMTAB-NEXT: FileHeader: -# SYMTAB-NEXT: Class: ELFCLASS64 -# SYMTAB-NEXT: Data: ELFDATA2LSB -# SYMTAB-NEXT: Type: ET_DYN -# SYMTAB-NEXT: Machine: EM_NONE +# SYMTAB-NEXT: Class: ELFCLASS64 +# SYMTAB-NEXT: Data: ELFDATA2LSB +# SYMTAB-NEXT: Type: ET_DYN # SYMTAB-NEXT: Symbols: [] # SYMTAB-NEXT: ... 
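The FileHeader churn in these checks reflects a change applied throughout this patch: obj2yaml no longer emits e_machine when it is EM_NONE (0) and realigns the remaining FileHeader keys. A sketch of the new behavior (min.yaml and min.o are illustrative names, not from this patch):

# RUN: yaml2obj min.yaml -o min.o
# RUN: obj2yaml min.o

--- !ELF
FileHeader:
  Class: ELFCLASS64
  Data:  ELFDATA2LSB
  Type:  ET_REL

The dumped FileHeader contains no Machine key for this input, while a non-zero e_machine still round-trips, either as an enum name (EM_M32 for 0x1) or as a plain number (0x1234) when unrecognized.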
diff --git a/llvm/test/tools/obj2yaml/ELF/null-section.yaml b/llvm/test/tools/obj2yaml/ELF/null-section.yaml index 4d1e6ee1e7dbd..abba576fb4c78 100644 --- a/llvm/test/tools/obj2yaml/ELF/null-section.yaml +++ b/llvm/test/tools/obj2yaml/ELF/null-section.yaml @@ -6,10 +6,9 @@ # FIRST-SEC: --- !ELF # FIRST-SEC-NEXT: FileHeader: -# FIRST-SEC-NEXT: Class: ELFCLASS64 -# FIRST-SEC-NEXT: Data: ELFDATA2LSB -# FIRST-SEC-NEXT: Type: ET_REL -# FIRST-SEC-NEXT: Machine: EM_NONE +# FIRST-SEC-NEXT: Class: ELFCLASS64 +# FIRST-SEC-NEXT: Data: ELFDATA2LSB +# FIRST-SEC-NEXT: Type: ET_REL # FIRST-SEC-NEXT: Sections: # FIRST-SEC-NEXT: - Type: SHT_NULL # FIRST-SEC-NEXT: Flags: [ SHF_ALLOC ] @@ -48,10 +47,9 @@ Sections: # SECOND-SEC: --- !ELF # SECOND-SEC-NEXT: FileHeader: -# SECOND-SEC-NEXT: Class: ELFCLASS64 -# SECOND-SEC-NEXT: Data: ELFDATA2LSB -# SECOND-SEC-NEXT: Type: ET_REL -# SECOND-SEC-NEXT: Machine: EM_NONE +# SECOND-SEC-NEXT: Class: ELFCLASS64 +# SECOND-SEC-NEXT: Data: ELFDATA2LSB +# SECOND-SEC-NEXT: Type: ET_REL # SECOND-SEC-NEXT: Sections: # SECOND-SEC-NEXT: - Name: .foo # SECOND-SEC-NEXT: Type: SHT_PROGBITS @@ -91,10 +89,9 @@ Sections: # NULL-SEC: --- !ELF # NULL-SEC-NEXT: FileHeader: -# NULL-SEC-NEXT: Class: ELFCLASS64 -# NULL-SEC-NEXT: Data: ELFDATA2LSB -# NULL-SEC-NEXT: Type: ET_REL -# NULL-SEC-NEXT: Machine: EM_NONE +# NULL-SEC-NEXT: Class: ELFCLASS64 +# NULL-SEC-NEXT: Data: ELFDATA2LSB +# NULL-SEC-NEXT: Type: ET_REL # NULL-SEC-NEXT: Sections: # NULL-SEC-NEXT: - Name: .foo # NULL-SEC-NEXT: Type: SHT_PROGBITS @@ -118,10 +115,9 @@ Sections: # NULL-SEC-MIDDLE: --- !ELF # NULL-SEC-MIDDLE-NEXT: FileHeader: -# NULL-SEC-MIDDLE-NEXT: Class: ELFCLASS64 -# NULL-SEC-MIDDLE-NEXT: Data: ELFDATA2LSB -# NULL-SEC-MIDDLE-NEXT: Type: ET_REL -# NULL-SEC-MIDDLE-NEXT: Machine: EM_NONE +# NULL-SEC-MIDDLE-NEXT: Class: ELFCLASS64 +# NULL-SEC-MIDDLE-NEXT: Data: ELFDATA2LSB +# NULL-SEC-MIDDLE-NEXT: Type: ET_REL # NULL-SEC-MIDDLE-NEXT: Sections: # NULL-SEC-MIDDLE-NEXT: - Name: .foo # NULL-SEC-MIDDLE-NEXT: Type: SHT_PROGBITS diff --git a/llvm/test/tools/obj2yaml/ELF/sht-symtab-shndx.yaml b/llvm/test/tools/obj2yaml/ELF/sht-symtab-shndx.yaml index cc20a036daaaf..27decbe76d926 100644 --- a/llvm/test/tools/obj2yaml/ELF/sht-symtab-shndx.yaml +++ b/llvm/test/tools/obj2yaml/ELF/sht-symtab-shndx.yaml @@ -7,10 +7,9 @@ # CASE1: --- !ELF # CASE1-NEXT: FileHeader: -# CASE1-NEXT: Class: ELFCLASS64 -# CASE1-NEXT: Data: ELFDATA2LSB -# CASE1-NEXT: Type: ET_REL -# CASE1-NEXT: Machine: EM_NONE +# CASE1-NEXT: Class: ELFCLASS64 +# CASE1-NEXT: Data: ELFDATA2LSB +# CASE1-NEXT: Type: ET_REL # CASE1-NEXT: Sections: # CASE1-NEXT: - Name: bar # CASE1-NEXT: Type: SHT_PROGBITS diff --git a/llvm/test/tools/obj2yaml/ELF/stack-sizes.yaml b/llvm/test/tools/obj2yaml/ELF/stack-sizes.yaml index 8e6c66729c4e0..a2ef5f1f3770f 100644 --- a/llvm/test/tools/obj2yaml/ELF/stack-sizes.yaml +++ b/llvm/test/tools/obj2yaml/ELF/stack-sizes.yaml @@ -8,10 +8,9 @@ # VALID: --- !ELF # VALID-NEXT: FileHeader: -# VALID-NEXT: Class: ELFCLASS64 -# VALID-NEXT: Data: ELFDATA2LSB -# VALID-NEXT: Type: ET_EXEC -# VALID-NEXT: Machine: EM_NONE +# VALID-NEXT: Class: ELFCLASS64 +# VALID-NEXT: Data: ELFDATA2LSB +# VALID-NEXT: Type: ET_EXEC # VALID-NEXT: Sections: # VALID-NEXT: - Name: .stack_sizes # VALID-NEXT: Type: SHT_PROGBITS @@ -39,10 +38,9 @@ Sections: # INVALID: --- !ELF # INVALID-NEXT: FileHeader: -# INVALID-NEXT: Class: ELFCLASS64 -# INVALID-NEXT: Data: ELFDATA2LSB -# INVALID-NEXT: Type: ET_EXEC -# INVALID-NEXT: Machine: EM_NONE +# INVALID-NEXT: Class: ELFCLASS64 +# INVALID-NEXT: 
Data: ELFDATA2LSB +# INVALID-NEXT: Type: ET_EXEC # INVALID-NEXT: Sections: # INVALID-NEXT: - Name: .stack_sizes # INVALID-NEXT: Type: SHT_PROGBITS @@ -65,10 +63,9 @@ Sections: # EMPTY: --- !ELF # EMPTY-NEXT: FileHeader: -# EMPTY-NEXT: Class: ELFCLASS64 -# EMPTY-NEXT: Data: ELFDATA2LSB -# EMPTY-NEXT: Type: ET_EXEC -# EMPTY-NEXT: Machine: EM_NONE +# EMPTY-NEXT: Class: ELFCLASS64 +# EMPTY-NEXT: Data: ELFDATA2LSB +# EMPTY-NEXT: Type: ET_EXEC # EMPTY-NEXT: Sections: # EMPTY-NEXT: - Name: .stack_sizes # EMPTY-NEXT: Type: SHT_PROGBITS @@ -83,3 +80,50 @@ Sections: - Name: .stack_sizes Type: SHT_PROGBITS Content: "" + +## Check obj2yaml can dump multiple .stack_sizes. + +# RUN: yaml2obj --docnum=4 %s -o %t4 +# RUN: obj2yaml %t4 | FileCheck %s --check-prefix=MULTI + +# MULTI: --- !ELF +# MULTI-NEXT: FileHeader: +# MULTI-NEXT: Class: ELFCLASS64 +# MULTI-NEXT: Data: ELFDATA2LSB +# MULTI-NEXT: Type: ET_EXEC +# MULTI-NEXT: Sections: +# MULTI-NEXT: - Name: .stack_sizes +# MULTI-NEXT: Type: SHT_PROGBITS +# MULTI-NEXT: Entries: +# MULTI-NEXT: - Address: 0x0000000000000010 +# MULTI-NEXT: Size: 0x0000000000000020 +# MULTI-NEXT: - Address: 0x0000000000000030 +# MULTI-NEXT: Size: 0x0000000000000040 +# MULTI-NEXT: - Name: '.stack_sizes (1)' +# MULTI-NEXT: Type: SHT_PROGBITS +# MULTI-NEXT: Entries: +# MULTI-NEXT: - Address: 0x0000000000000050 +# MULTI-NEXT: Size: 0x0000000000000001 +# MULTI-NEXT: - Address: 0x0000000000000060 +# MULTI-NEXT: Size: 0x0000000000000002 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +Sections: + - Name: .stack_sizes + Type: SHT_PROGBITS + Entries: + - Address: 0x0000000000000010 + Size: 0x0000000000000020 + - Address: 0x0000000000000030 + Size: 0x0000000000000040 + - Name: '.stack_sizes (1)' + Type: SHT_PROGBITS + Entries: + - Address: 0x0000000000000050 + Size: 0x0000000000000001 + - Address: 0x0000000000000060 + Size: 0x0000000000000002 diff --git a/llvm/test/tools/obj2yaml/ELF/symbol-visibility.yaml b/llvm/test/tools/obj2yaml/ELF/symbol-visibility.yaml index 7659def7eb9f8..0c6020062fab2 100644 --- a/llvm/test/tools/obj2yaml/ELF/symbol-visibility.yaml +++ b/llvm/test/tools/obj2yaml/ELF/symbol-visibility.yaml @@ -4,10 +4,9 @@ # CHECK: --- !ELF # CHECK-NEXT: FileHeader: -# CHECK-NEXT: Class: ELFCLASS64 -# CHECK-NEXT: Data: ELFDATA2LSB -# CHECK-NEXT: Type: ET_REL -# CHECK-NEXT: Machine: EM_NONE +# CHECK-NEXT: Class: ELFCLASS64 +# CHECK-NEXT: Data: ELFDATA2LSB +# CHECK-NEXT: Type: ET_REL # CHECK-NEXT: Symbols: # CHECK-NEXT: - Name: default # CHECK-NEXT: - Name: internal diff --git a/llvm/test/tools/obj2yaml/ELF/versym-section.yaml b/llvm/test/tools/obj2yaml/ELF/versym-section.yaml index e394c325af0f2..fd63f553dc401 100644 --- a/llvm/test/tools/obj2yaml/ELF/versym-section.yaml +++ b/llvm/test/tools/obj2yaml/ELF/versym-section.yaml @@ -5,11 +5,10 @@ # CHECK: --- !ELF # CHECK-NEXT: FileHeader: -# CHECK-NEXT: Class: ELFCLASS64 -# CHECK-NEXT: Data: ELFDATA2LSB -# CHECK-NEXT: Type: ET_EXEC -# CHECK-NEXT: Machine: EM_NONE -# CHECK-NEXT: Entry: 0x0000000000201000 +# CHECK-NEXT: Class: ELFCLASS64 +# CHECK-NEXT: Data: ELFDATA2LSB +# CHECK-NEXT: Type: ET_EXEC +# CHECK-NEXT: Entry: 0x0000000000201000 # CHECK-NEXT: Sections: # CHECK-NEXT: - Name: .gnu.version # CHECK-NEXT: Type: SHT_GNU_versym diff --git a/llvm/test/tools/yaml2obj/ELF/DWARF/debug-ranges.yaml b/llvm/test/tools/yaml2obj/ELF/DWARF/debug-ranges.yaml index 6a9cd7a6195e7..f80dd6de53689 100644 --- a/llvm/test/tools/yaml2obj/ELF/DWARF/debug-ranges.yaml +++ 
b/llvm/test/tools/yaml2obj/ELF/DWARF/debug-ranges.yaml @@ -407,3 +407,17 @@ DWARF: Entries: - LowOffset: 0x1234 HighOffset: 0x5678 + +## l) Test that the .debug_ranges section header is emitted if the "debug_ranges" +## entry is empty. + +# RUN: yaml2obj --docnum=12 %s -o %t12.o +# RUN: llvm-readobj -S %t12.o | FileCheck -DSIZE=0 -DADDRALIGN=1 %s --check-prefix=DWARF-HEADER + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_ranges: [] diff --git a/llvm/tools/bugpoint/OptimizerDriver.cpp b/llvm/tools/bugpoint/OptimizerDriver.cpp index 25a970bd68785..ca78735202fcb 100644 --- a/llvm/tools/bugpoint/OptimizerDriver.cpp +++ b/llvm/tools/bugpoint/OptimizerDriver.cpp @@ -205,6 +205,9 @@ bool BugDriver::runPasses(Module &Program, for (unsigned i = 0, e = OptArgs.size(); i != e; ++i) Args.push_back(OptArgs[i]); + // Pin to legacy PM since bugpoint has lots of infra and hacks revolving + // around the legacy PM. + Args.push_back("-enable-new-pm=0"); Args.push_back("-disable-symbolication"); Args.push_back("-o"); Args.push_back(OutputFilename); diff --git a/llvm/tools/llvm-config/llvm-config.cpp b/llvm/tools/llvm-config/llvm-config.cpp index a9d3f64aaa5b3..1a2f04552d137 100644 --- a/llvm/tools/llvm-config/llvm-config.cpp +++ b/llvm/tools/llvm-config/llvm-config.cpp @@ -381,6 +381,7 @@ int main(int argc, char **argv) { SharedExt = "dll"; SharedVersionedExt = LLVM_DYLIB_VERSION ".dll"; if (HostTriple.isOSCygMing()) { + SharedPrefix = "lib"; StaticExt = "a"; StaticPrefix = "lib"; } else { diff --git a/llvm/tools/llvm-cov/gcov.cpp b/llvm/tools/llvm-cov/gcov.cpp index d99e792c68a95..d42e7cd3b551e 100644 --- a/llvm/tools/llvm-cov/gcov.cpp +++ b/llvm/tools/llvm-cov/gcov.cpp @@ -77,9 +77,7 @@ static void reportCoverage(StringRef SourceFile, StringRef ObjectDir, if (DumpGCOV) GF.print(errs()); - FileInfo FI(Options); - GF.collectLineCounts(FI); - FI.print(llvm::outs(), SourceFile, GCNO, GCDA, GF); + gcovOneInput(Options, SourceFile, GCNO, GCDA, GF); } int gcovMain(int argc, const char *argv[]) { @@ -117,6 +115,11 @@ int gcovMain(int argc, const char *argv[]) { cl::Grouping, cl::NotHidden, cl::aliasopt(Intermediate)); + cl::opt Demangle("demangled-names", cl::init(false), + cl::desc("Demangle function names")); + cl::alias DemangleA("m", cl::desc("Alias for --demangled-names"), + cl::Grouping, cl::NotHidden, cl::aliasopt(Demangle)); + cl::opt NoOutput("n", cl::Grouping, cl::init(false), cl::desc("Do not output any .gcov files")); cl::alias NoOutputA("no-output", cl::aliasopt(NoOutput)); @@ -131,6 +134,14 @@ int gcovMain(int argc, const char *argv[]) { cl::desc("Preserve path components")); cl::alias PreservePathsA("preserve-paths", cl::aliasopt(PreservePaths)); + cl::opt RelativeOnly( + "r", cl::Grouping, + cl::desc("Only dump files with relative paths or absolute paths with the " + "prefix specified by -s")); + cl::alias RelativeOnlyA("relative-only", cl::aliasopt(RelativeOnly)); + cl::opt SourcePrefix("s", cl::desc("Source prefix to elide")); + cl::alias SourcePrefixA("source-prefix", cl::aliasopt(SourcePrefix)); + cl::opt UseStdout("t", cl::Grouping, cl::init(false), cl::desc("Print to stdout")); cl::alias UseStdoutA("stdout", cl::aliasopt(UseStdout)); @@ -157,7 +168,8 @@ int gcovMain(int argc, const char *argv[]) { GCOV::Options Options(AllBlocks, BranchProb, BranchCount, FuncSummary, PreservePaths, UncondBranch, Intermediate, LongNames, - NoOutput, UseStdout, HashFilenames); + Demangle, NoOutput, RelativeOnly, UseStdout, + HashFilenames, SourcePrefix); for 
(const auto &SourceFile : SourceFiles) reportCoverage(SourceFile, ObjectDir, InputGCNO, InputGCDA, DumpGCOV, diff --git a/llvm/tools/llvm-nm/llvm-nm.cpp b/llvm/tools/llvm-nm/llvm-nm.cpp index ecd1e21e15bfb..a34352d1512c5 100644 --- a/llvm/tools/llvm-nm/llvm-nm.cpp +++ b/llvm/tools/llvm-nm/llvm-nm.cpp @@ -1635,8 +1635,7 @@ static void dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName, } if (!found) { LastSymbolName = Entry.symbolName(); - NMSymbol W; - memset(&W, '\0', sizeof(NMSymbol)); + NMSymbol W = {}; W.Name = Entry.symbolName(); W.Address = 0; W.Size = 0; diff --git a/llvm/tools/llvm-objcopy/ELF/Object.cpp b/llvm/tools/llvm-objcopy/ELF/Object.cpp index e15fb24f4c425..e19285ee97eac 100644 --- a/llvm/tools/llvm-objcopy/ELF/Object.cpp +++ b/llvm/tools/llvm-objcopy/ELF/Object.cpp @@ -1320,7 +1320,7 @@ void ELFBuilder::readProgramHeaders(const ELFFile &HeadersFile) { ElfHdr.Index = Index++; ElfHdr.OriginalOffset = ElfHdr.Offset = EhdrOffset; - const auto &Ehdr = *HeadersFile.getHeader(); + const typename ELFT::Ehdr &Ehdr = HeadersFile.getHeader(); auto &PrHdr = Obj.ProgramHdrSegment; PrHdr.Type = PT_PHDR; PrHdr.Flags = 0; @@ -1398,7 +1398,7 @@ void ELFBuilder::initSymbolTable(SymbolTableSection *SymTab) { const Elf_Shdr &ShndxSec = *unwrapOrError(ElfFile.getSection(SymTab->getShndxTable()->Index)); ShndxData = unwrapOrError( - ElfFile.template getSectionContentsAsArray(&ShndxSec)); + ElfFile.template getSectionContentsAsArray(ShndxSec)); if (ShndxData.size() != Symbols.size()) error("symbol section index table does not have the same number of " "entries as the symbol table"); @@ -1476,7 +1476,7 @@ SectionBase &ELFBuilder::makeSection(const Elf_Shdr &Shdr) { case SHT_REL: case SHT_RELA: if (Shdr.sh_flags & SHF_ALLOC) { - Data = unwrapOrError(ElfFile.getSectionContents(&Shdr)); + Data = unwrapOrError(ElfFile.getSectionContents(Shdr)); return Obj.addSection(Data); } return Obj.addSection(); @@ -1485,7 +1485,7 @@ SectionBase &ELFBuilder::makeSection(const Elf_Shdr &Shdr) { // mean altering the memory image. There are no special link types or // anything so we can just use a Section. if (Shdr.sh_flags & SHF_ALLOC) { - Data = unwrapOrError(ElfFile.getSectionContents(&Shdr)); + Data = unwrapOrError(ElfFile.getSectionContents(Shdr)); return Obj.addSection
<Section>(Data); } return Obj.addSection<StringTableSection>(); @@ -1493,16 +1493,16 @@ SectionBase &ELFBuilder<ELFT>::makeSection(const Elf_Shdr &Shdr) { case SHT_GNU_HASH: // Hash tables should refer to SHT_DYNSYM which we're not going to change. // Because of this we don't need to mess with the hash tables either. - Data = unwrapOrError(ElfFile.getSectionContents(&Shdr)); + Data = unwrapOrError(ElfFile.getSectionContents(Shdr)); return Obj.addSection<Section>
(Data); case SHT_GROUP: - Data = unwrapOrError(ElfFile.getSectionContents(&Shdr)); + Data = unwrapOrError(ElfFile.getSectionContents(Shdr)); return Obj.addSection<GroupSection>(Data); case SHT_DYNSYM: - Data = unwrapOrError(ElfFile.getSectionContents(&Shdr)); + Data = unwrapOrError(ElfFile.getSectionContents(Shdr)); return Obj.addSection<DynamicSymbolTableSection>(Data); case SHT_DYNAMIC: - Data = unwrapOrError(ElfFile.getSectionContents(&Shdr)); + Data = unwrapOrError(ElfFile.getSectionContents(Shdr)); return Obj.addSection<DynamicSection>(Data); case SHT_SYMTAB: { auto &SymTab = Obj.addSection<SymbolTableSection>(); @@ -1517,9 +1517,9 @@ SectionBase &ELFBuilder<ELFT>::makeSection(const Elf_Shdr &Shdr) { case SHT_NOBITS: return Obj.addSection<Section>
    (Data); default: { - Data = unwrapOrError(ElfFile.getSectionContents(&Shdr)); + Data = unwrapOrError(ElfFile.getSectionContents(Shdr)); - StringRef Name = unwrapOrError(ElfFile.getSectionName(&Shdr)); + StringRef Name = unwrapOrError(ElfFile.getSectionName(Shdr)); if (Name.startswith(".zdebug") || (Shdr.sh_flags & ELF::SHF_COMPRESSED)) { uint64_t DecompressedSize, DecompressedAlign; std::tie(DecompressedSize, DecompressedAlign) = @@ -1541,7 +1541,7 @@ template void ELFBuilder::readSectionHeaders() { continue; } auto &Sec = makeSection(Shdr); - Sec.Name = std::string(unwrapOrError(ElfFile.getSectionName(&Shdr))); + Sec.Name = std::string(unwrapOrError(ElfFile.getSectionName(Shdr))); Sec.Type = Sec.OriginalType = Shdr.sh_type; Sec.Flags = Sec.OriginalFlags = Shdr.sh_flags; Sec.Addr = Shdr.sh_addr; @@ -1560,7 +1560,7 @@ template void ELFBuilder::readSectionHeaders() { } template void ELFBuilder::readSections(bool EnsureSymtab) { - uint32_t ShstrIndex = ElfFile.getHeader()->e_shstrndx; + uint32_t ShstrIndex = ElfFile.getHeader().e_shstrndx; if (ShstrIndex == SHN_XINDEX) ShstrIndex = unwrapOrError(ElfFile.getSection(0))->sh_link; @@ -1602,10 +1602,10 @@ template void ELFBuilder::readSections(bool EnsureSymtab) { auto Shdr = unwrapOrError(ElfFile.sections()).begin() + RelSec->Index; if (RelSec->Type == SHT_REL) initRelocations(RelSec, Obj.SymbolTable, - unwrapOrError(ElfFile.rels(Shdr))); + unwrapOrError(ElfFile.rels(*Shdr))); else initRelocations(RelSec, Obj.SymbolTable, - unwrapOrError(ElfFile.relas(Shdr))); + unwrapOrError(ElfFile.relas(*Shdr))); } else if (auto GroupSec = dyn_cast(&Sec)) { initGroupSection(GroupSec); } @@ -1622,7 +1622,7 @@ template void ELFBuilder::build(bool EnsureSymtab) { ELFFile HeadersFile = unwrapOrError(ELFFile::create(toStringRef( {ElfFile.base() + EhdrOffset, ElfFile.getBufSize() - EhdrOffset}))); - auto &Ehdr = *HeadersFile.getHeader(); + auto &Ehdr = HeadersFile.getHeader(); Obj.OSABI = Ehdr.e_ident[EI_OSABI]; Obj.ABIVersion = Ehdr.e_ident[EI_ABIVERSION]; Obj.Type = Ehdr.e_type; diff --git a/llvm/tools/llvm-objcopy/InstallNameToolOpts.td b/llvm/tools/llvm-objcopy/InstallNameToolOpts.td index 04ffe62c42fca..7998041513cb1 100644 --- a/llvm/tools/llvm-objcopy/InstallNameToolOpts.td +++ b/llvm/tools/llvm-objcopy/InstallNameToolOpts.td @@ -32,3 +32,7 @@ def change: MultiArg<["-", "--"], "change", 2>, def version : Flag<["--"], "version">, HelpText<"Print the version and exit.">; + +def V : Flag<["-"], "V">, + Alias, + HelpText<"Alias for --version">; diff --git a/llvm/tools/llvm-objdump/ELFDump.cpp b/llvm/tools/llvm-objdump/ELFDump.cpp index 602bc63882527..c7a84385ffd50 100644 --- a/llvm/tools/llvm-objdump/ELFDump.cpp +++ b/llvm/tools/llvm-objdump/ELFDump.cpp @@ -92,7 +92,7 @@ static Error getRelocationValueString(const ELFObjectFile *Obj, return SymSI.takeError(); const typename ELFT::Shdr *SymSec = Obj->getSection((*SymSI)->getRawDataRefImpl()); - auto SecName = EF.getSectionName(SymSec); + auto SecName = EF.getSectionName(*SymSec); if (!SecName) return SecName.takeError(); Fmt << *SecName; @@ -338,10 +338,10 @@ static void printSymbolVersionInfo(const ELFFile *Elf, continue; ArrayRef Contents = - unwrapOrError(Elf->getSectionContents(&Shdr), FileName); + unwrapOrError(Elf->getSectionContents(Shdr), FileName); const typename ELFT::Shdr *StrTabSec = unwrapOrError(Elf->getSection(Shdr.sh_link), FileName); - StringRef StrTab = unwrapOrError(Elf->getStringTable(StrTabSec), FileName); + StringRef StrTab = unwrapOrError(Elf->getStringTable(*StrTabSec), FileName); if 
(Shdr.sh_type == ELF::SHT_GNU_verneed) printSymbolVersionDependency(Contents, StrTab); diff --git a/llvm/tools/llvm-rc/Opts.td b/llvm/tools/llvm-rc/Opts.td index 873dd785b12bd..613f0a0db31ed 100644 --- a/llvm/tools/llvm-rc/Opts.td +++ b/llvm/tools/llvm-rc/Opts.td @@ -4,55 +4,55 @@ include "llvm/Option/OptParser.td" // These options seem to be important for the tool // and should be implemented. -def FILEOUT : JoinedOrSeparate<[ "/", "-" ], "FO">, +def fileout : JoinedOrSeparate<[ "/", "-" ], "FO">, HelpText<"Change the output file location.">; -def DEFINE : Separate<[ "/", "-" ], "D">, +def define : Separate<[ "/", "-" ], "D">, HelpText<"Define a symbol for the C preprocessor.">; -def UNDEF : Separate<[ "/", "-" ], "U">, +def undef : Separate<[ "/", "-" ], "U">, HelpText<"Undefine a symbol for the C preprocessor.">; -def LANG_ID : JoinedOrSeparate<[ "/", "-" ], "L">, +def lang_id : JoinedOrSeparate<[ "/", "-" ], "L">, HelpText<"Set the default language identifier.">; -def LANG_NAME : Separate<[ "/", "-" ], "LN">, +def lang_name : Separate<[ "/", "-" ], "LN">, HelpText<"Set the default language name.">; -def INCLUDE : Separate<[ "/", "-" ], "I">, HelpText<"Add an include path.">; -def NOINCLUDE : Flag<[ "/", "-" ], "X">, HelpText<"Ignore 'include' variable.">; +def includepath : Separate<[ "/", "-" ], "I">, HelpText<"Add an include path.">; +def noinclude : Flag<[ "/", "-" ], "X">, HelpText<"Ignore 'include' variable.">; -def ADD_NULL : Flag<[ "/", "-" ], "N">, +def add_null : Flag<[ "/", "-" ], "N">, HelpText<"Null-terminate all strings in the string table.">; -def DUPID_NOWARN : Flag<[ "/", "-" ], "Y">, +def dupid_nowarn : Flag<[ "/", "-" ], "Y">, HelpText<"Suppress warnings on duplicate resource IDs.">; -def VERBOSE : Flag<[ "/", "-" ], "V">, HelpText<"Be verbose.">; -def HELP : Flag<[ "/", "-" ], "?">, HelpText<"Display this help and exit.">; -def H : Flag<[ "/", "-" ], "H">, - Alias, +def verbose : Flag<[ "/", "-" ], "V">, HelpText<"Be verbose.">; +def help : Flag<[ "/", "-" ], "?">, HelpText<"Display this help and exit.">; +def h : Flag<[ "/", "-" ], "H">, + Alias, HelpText<"Display this help and exit.">; -def DRY_RUN : Flag<[ "/", "-" ], "dry-run">, +def dry_run : Flag<[ "/", "-" ], "dry-run">, HelpText<"Don't compile the input; only try to parse it.">; -def CODEPAGE : JoinedOrSeparate<[ "/", "-" ], "C">, +def codepage : JoinedOrSeparate<[ "/", "-" ], "C">, HelpText<"Set the codepage used for input strings.">; // Unused switches (at least for now). These will stay unimplemented // in an early stage of development and can be ignored. However, we need to // parse them in order to preserve the compatibility with the original tool. -def NOLOGO : Flag<[ "/", "-" ], "NOLOGO">; -def R : Flag<[ "/", "-" ], "R">; -def SL : Flag<[ "/", "-" ], "SL">; +def nologo : Flag<[ "/", "-" ], "NOLOGO">; +def r : Flag<[ "/", "-" ], "R">; +def sl : Flag<[ "/", "-" ], "SL">; // (Codepages support.) -def W : Flag<[ "/", "-" ], "W">; +def w : Flag<[ "/", "-" ], "W">; // (Support of MUI and similar.) 
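// Editor's note: a minimal, hypothetical sketch of why the Opts.td renames
// above ripple into llvm-rc.cpp below. The llvm::opt tablegen backend emits
// one OPT_<def name> enumerator per record, so lowercasing `def HELP` to
// `def help` renames the enumerator every caller must spell out. The enum
// here is hand-written for illustration, not the generated Opts.inc.
#include <cassert>

namespace sketch {
enum ID {
  OPT_INVALID = 0,
  OPT_INPUT,
  OPT_help,    // was OPT_HELP before the rename
  OPT_verbose, // was OPT_VERBOSE
};
} // namespace sketch

int main() {
  // Callers such as llvm-rc.cpp name options only through these enumerators,
  // which is why the .td rename forces the mechanical edits further down.
  assert(sketch::OPT_help != sketch::OPT_INVALID);
  return 0;
}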
-def FM : Separate<[ "/", "-" ], "FM">; -def Q : Separate<[ "/", "-" ], "Q">; -def G : Flag<[ "/", "-" ], "G">; -def GN : Flag<[ "/", "-" ], "GN">; -def G1 : Flag<[ "/", "-" ], "G1">; -def G2 : Flag<[ "/", "-" ], "G2">; +def fm : Separate<[ "/", "-" ], "FM">; +def q : Separate<[ "/", "-" ], "Q">; +def g : Flag<[ "/", "-" ], "G">; +def gn : Flag<[ "/", "-" ], "GN">; +def g1 : Flag<[ "/", "-" ], "G1">; +def g2 : Flag<[ "/", "-" ], "G2">; diff --git a/llvm/tools/llvm-rc/ResourceFileWriter.cpp b/llvm/tools/llvm-rc/ResourceFileWriter.cpp index 09b078c94cd29..c80605aed4465 100644 --- a/llvm/tools/llvm-rc/ResourceFileWriter.cpp +++ b/llvm/tools/llvm-rc/ResourceFileWriter.cpp @@ -138,7 +138,8 @@ enum class NullHandlingMethod { }; // Parses an identifier or string and returns a processed version of it: -// * String the string boundary quotes. +// * Strip the string boundary quotes. +// * Convert the input code page characters to UTF16. // * Squash "" to a single ". // * Replace the escape sequences with their processed version. // For identifiers, this is no-op. diff --git a/llvm/tools/llvm-rc/ResourceScriptParser.cpp b/llvm/tools/llvm-rc/ResourceScriptParser.cpp index 2155985c61b8b..5141ac0c3864f 100644 --- a/llvm/tools/llvm-rc/ResourceScriptParser.cpp +++ b/llvm/tools/llvm-rc/ResourceScriptParser.cpp @@ -777,8 +777,10 @@ RCParser::parseVersionInfoFixed() { // VERSION variations take multiple integers. size_t NumInts = RetType::isVersionType(FixedType) ? 4 : 1; - ASSIGN_OR_RETURN(ArgsResult, readIntsWithCommas(NumInts, NumInts)); + ASSIGN_OR_RETURN(ArgsResult, readIntsWithCommas(1, NumInts)); SmallVector ArgInts(ArgsResult->begin(), ArgsResult->end()); + while (ArgInts.size() < NumInts) + ArgInts.push_back(0); Result.setValue(FixedType, ArgInts); } diff --git a/llvm/tools/llvm-rc/llvm-rc.cpp b/llvm/tools/llvm-rc/llvm-rc.cpp index 71954804f2552..e9027a21d46b8 100644 --- a/llvm/tools/llvm-rc/llvm-rc.cpp +++ b/llvm/tools/llvm-rc/llvm-rc.cpp @@ -92,12 +92,12 @@ int main(int Argc, const char **Argv) { opt::InputArgList InputArgs = T.ParseArgs(ArgsArr, MAI, MAC); // The tool prints nothing when invoked with no command-line arguments. 
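// Editor's note: a standalone sketch (illustrative names only) of the
// ResourceScriptParser change above: VERSION-style fixed fields now accept
// between 1 and 4 integers, and any omitted trailing components default to
// zero, matching rc.exe, which treats `FILEVERSION 1` as `1,0,0,0`.
#include <cstdint>
#include <iostream>
#include <vector>

static std::vector<uint32_t> padVersionFields(std::vector<uint32_t> Ints,
                                              size_t NumInts = 4) {
  // Mirrors `while (ArgInts.size() < NumInts) ArgInts.push_back(0);`.
  while (Ints.size() < NumInts)
    Ints.push_back(0);
  return Ints;
}

int main() {
  for (uint32_t V : padVersionFields({1, 2}))
    std::cout << V << ' '; // prints: 1 2 0 0
  std::cout << '\n';
  return 0;
}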
- if (InputArgs.hasArg(OPT_HELP)) { + if (InputArgs.hasArg(OPT_help)) { T.PrintHelp(outs(), "rc [options] file...", "Resource Converter", false); return 0; } - const bool BeVerbose = InputArgs.hasArg(OPT_VERBOSE); + const bool BeVerbose = InputArgs.hasArg(OPT_verbose); std::vector InArgsInfo = InputArgs.getAllArgValues(OPT_INPUT); if (DashDash != Argv + Argc) @@ -141,14 +141,14 @@ int main(int Argc, const char **Argv) { SmallString<128> InputFile(InArgsInfo[0]); llvm::sys::fs::make_absolute(InputFile); Params.InputFilePath = InputFile; - Params.Include = InputArgs.getAllArgValues(OPT_INCLUDE); - Params.NoInclude = InputArgs.getAllArgValues(OPT_NOINCLUDE); + Params.Include = InputArgs.getAllArgValues(OPT_includepath); + Params.NoInclude = InputArgs.getAllArgValues(OPT_noinclude); - if (InputArgs.hasArg(OPT_CODEPAGE)) { - if (InputArgs.getLastArgValue(OPT_CODEPAGE) + if (InputArgs.hasArg(OPT_codepage)) { + if (InputArgs.getLastArgValue(OPT_codepage) .getAsInteger(10, Params.CodePage)) fatalError("Invalid code page: " + - InputArgs.getLastArgValue(OPT_CODEPAGE)); + InputArgs.getLastArgValue(OPT_codepage)); switch (Params.CodePage) { case CpAcp: case CpWin1252: @@ -161,10 +161,10 @@ int main(int Argc, const char **Argv) { } std::unique_ptr Visitor; - bool IsDryRun = InputArgs.hasArg(OPT_DRY_RUN); + bool IsDryRun = InputArgs.hasArg(OPT_dry_run); if (!IsDryRun) { - auto OutArgsInfo = InputArgs.getAllArgValues(OPT_FILEOUT); + auto OutArgsInfo = InputArgs.getAllArgValues(OPT_fileout); if (OutArgsInfo.empty()) { SmallString<128> OutputFile = InputFile; llvm::sys::path::replace_extension(OutputFile, "res"); @@ -182,17 +182,17 @@ int main(int Argc, const char **Argv) { fatalError("Error opening output file '" + OutArgsInfo[0] + "': " + EC.message()); Visitor = std::make_unique(Params, std::move(FOut)); - Visitor->AppendNull = InputArgs.hasArg(OPT_ADD_NULL); + Visitor->AppendNull = InputArgs.hasArg(OPT_add_null); ExitOnErr(NullResource().visit(Visitor.get())); // Set the default language; choose en-US arbitrarily. 
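// Editor's note: a small self-contained sketch of the LANGID split performed
// just below. A Windows language identifier packs a 10-bit primary language
// in the low bits and the sublanguage in the high bits, so the en-US default
// 0x0409 decodes to primary 0x09, sub 0x01.
#include <cstdio>

int main() {
  unsigned LangId = 0x0409; // en-US, the default chosen above
  unsigned PrimaryLangId = LangId & 0x3ff;
  unsigned SubLangId = LangId >> 10;
  std::printf("primary=0x%02x sub=0x%02x\n", PrimaryLangId, SubLangId);
  return 0; // prints: primary=0x09 sub=0x01
}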
unsigned PrimaryLangId = 0x09, SubLangId = 0x01; - if (InputArgs.hasArg(OPT_LANG_ID)) { + if (InputArgs.hasArg(OPT_lang_id)) { unsigned LangId; - if (InputArgs.getLastArgValue(OPT_LANG_ID).getAsInteger(16, LangId)) + if (InputArgs.getLastArgValue(OPT_lang_id).getAsInteger(16, LangId)) fatalError("Invalid language id: " + - InputArgs.getLastArgValue(OPT_LANG_ID)); + InputArgs.getLastArgValue(OPT_lang_id)); PrimaryLangId = LangId & 0x3ff; SubLangId = LangId >> 10; } diff --git a/llvm/tools/llvm-readobj/ARMEHABIPrinter.h b/llvm/tools/llvm-readobj/ARMEHABIPrinter.h index dfa2a3538d893..613c4b78b1c21 100644 --- a/llvm/tools/llvm-readobj/ARMEHABIPrinter.h +++ b/llvm/tools/llvm-readobj/ARMEHABIPrinter.h @@ -407,7 +407,7 @@ PrinterContext::FindExceptionTable(unsigned IndexSectionIndex, reportError(SymTabOrErr.takeError(), FileName); const Elf_Shdr *SymTab = *SymTabOrErr; - for (const Elf_Rel &R : unwrapOrError(FileName, ELF->rels(&Sec))) { + for (const Elf_Rel &R : unwrapOrError(FileName, ELF->rels(Sec))) { if (R.r_offset != static_cast(IndexTableOffset)) continue; @@ -417,9 +417,9 @@ PrinterContext::FindExceptionTable(unsigned IndexSectionIndex, RelA.r_addend = 0; const Elf_Sym *Symbol = - unwrapOrError(FileName, ELF->getRelocationSymbol(&RelA, SymTab)); + unwrapOrError(FileName, ELF->getRelocationSymbol(RelA, SymTab)); - auto Ret = ELF->getSection(Symbol, SymTab, ShndxTable); + auto Ret = ELF->getSection(*Symbol, SymTab, ShndxTable); if (!Ret) report_fatal_error(errorToErrorCode(Ret.takeError()).message()); return *Ret; @@ -432,7 +432,7 @@ template void PrinterContext::PrintExceptionTable(const Elf_Shdr *IT, const Elf_Shdr *EHT, uint64_t TableEntryOffset) const { - Expected> Contents = ELF->getSectionContents(EHT); + Expected> Contents = ELF->getSectionContents(*EHT); if (!Contents) return; @@ -499,7 +499,7 @@ void PrinterContext::PrintOpcodes(const uint8_t *Entry, template void PrinterContext::PrintIndexTable(unsigned SectionIndex, const Elf_Shdr *IT) const { - Expected> Contents = ELF->getSectionContents(IT); + Expected> Contents = ELF->getSectionContents(*IT); if (!Contents) return; @@ -553,7 +553,7 @@ void PrinterContext::PrintIndexTable(unsigned SectionIndex, FindExceptionTable(SectionIndex, Entry * IndexTableEntrySize + 4); if (EHT) - if (auto Name = ELF->getSectionName(EHT)) + if (auto Name = ELF->getSectionName(*EHT)) SW.printString("ExceptionHandlingTable", *Name); uint64_t TableEntryOffset = PREL31(Word1, IT->sh_addr); @@ -575,7 +575,7 @@ void PrinterContext::PrintUnwindInformation() const { DictScope UIT(SW, "UnwindIndexTable"); SW.printNumber("SectionIndex", SectionIndex); - if (auto SectionName = ELF->getSectionName(&Sec)) + if (auto SectionName = ELF->getSectionName(Sec)) SW.printString("SectionName", *SectionName); SW.printHex("SectionOffset", Sec.sh_offset); diff --git a/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp b/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp index d753185177050..46a949b990459 100644 --- a/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp +++ b/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp @@ -746,7 +746,9 @@ bool Decoder::opcode_alloc_l(const uint8_t *OC, unsigned &Offset, bool Decoder::opcode_setfp(const uint8_t *OC, unsigned &Offset, unsigned Length, bool Prologue) { - SW.startLine() << format("0x%02x ; mov fp, sp\n", OC[Offset]); + SW.startLine() << format("0x%02x ; mov %s, %s\n", OC[Offset], + static_cast(Prologue ? "fp" : "sp"), + static_cast(Prologue ? 
"sp" : "fp")); ++Offset; return false; } @@ -754,8 +756,11 @@ bool Decoder::opcode_setfp(const uint8_t *OC, unsigned &Offset, unsigned Length, bool Decoder::opcode_addfp(const uint8_t *OC, unsigned &Offset, unsigned Length, bool Prologue) { unsigned NumBytes = OC[Offset + 1] << 3; - SW.startLine() << format("0x%02x%02x ; add fp, sp, #%u\n", - OC[Offset], OC[Offset + 1], NumBytes); + SW.startLine() << format( + "0x%02x%02x ; %s %s, %s, #%u\n", OC[Offset], OC[Offset + 1], + static_cast(Prologue ? "add" : "sub"), + static_cast(Prologue ? "fp" : "sp"), + static_cast(Prologue ? "sp" : "fp"), NumBytes); Offset += 2; return false; } @@ -1106,6 +1111,143 @@ bool Decoder::dumpPackedEntry(const object::COFFObjectFile &COFF, return true; } +bool Decoder::dumpPackedARM64Entry(const object::COFFObjectFile &COFF, + const SectionRef Section, uint64_t Offset, + unsigned Index, + const RuntimeFunctionARM64 &RF) { + assert((RF.Flag() == RuntimeFunctionFlag::RFF_Packed || + RF.Flag() == RuntimeFunctionFlag::RFF_PackedFragment) && + "unpacked entry cannot be treated as a packed entry"); + + ErrorOr Function = getRelocatedSymbol(COFF, Section, Offset); + if (!Function) + Function = getSymbol(COFF, RF.BeginAddress, /*FunctionOnly=*/true); + + StringRef FunctionName; + uint64_t FunctionAddress; + if (Function) { + Expected FunctionNameOrErr = Function->getName(); + if (!FunctionNameOrErr) { + std::string Buf; + llvm::raw_string_ostream OS(Buf); + logAllUnhandledErrors(FunctionNameOrErr.takeError(), OS); + OS.flush(); + report_fatal_error(Buf); + } + FunctionName = *FunctionNameOrErr; + Expected FunctionAddressOrErr = Function->getAddress(); + if (!FunctionAddressOrErr) { + std::string Buf; + llvm::raw_string_ostream OS(Buf); + logAllUnhandledErrors(FunctionAddressOrErr.takeError(), OS); + OS.flush(); + report_fatal_error(Buf); + } + FunctionAddress = *FunctionAddressOrErr; + } else { + FunctionAddress = COFF.getPE32PlusHeader()->ImageBase + RF.BeginAddress; + } + + SW.printString("Function", formatSymbol(FunctionName, FunctionAddress)); + SW.printBoolean("Fragment", + RF.Flag() == RuntimeFunctionFlag::RFF_PackedFragment); + SW.printNumber("FunctionLength", RF.FunctionLength()); + SW.printNumber("RegF", RF.RegF()); + SW.printNumber("RegI", RF.RegI()); + SW.printBoolean("HomedParameters", RF.H()); + SW.printNumber("CR", RF.CR()); + SW.printNumber("FrameSize", RF.FrameSize() << 4); + ListScope PS(SW, "Prologue"); + + // Synthesize the equivalent prologue according to the documentation + // at https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling, + // printed in reverse order compared to the docs, to match how prologues + // are printed for the non-packed case. 
+ int IntSZ = 8 * RF.RegI(); + if (RF.CR() == 1) + IntSZ += 8; + int FpSZ = 8 * RF.RegF(); + if (RF.RegF()) + FpSZ += 8; + int SavSZ = (IntSZ + FpSZ + 8 * 8 * RF.H() + 0xf) & ~0xf; + int LocSZ = (RF.FrameSize() << 4) - SavSZ; + + if (RF.CR() == 3) { + SW.startLine() << "mov x29, sp\n"; + if (LocSZ <= 512) { + SW.startLine() << format("stp x29, lr, [sp, #-%d]!\n", LocSZ); + } else { + SW.startLine() << "stp x29, lr, [sp, #0]\n"; + } + } + if (LocSZ > 4080) { + SW.startLine() << format("sub sp, sp, #%d\n", LocSZ - 4080); + SW.startLine() << "sub sp, sp, #4080\n"; + } else if ((RF.CR() != 3 && LocSZ > 0) || LocSZ > 512) { + SW.startLine() << format("sub sp, sp, #%d\n", LocSZ); + } + if (RF.H()) { + SW.startLine() << format("stp x6, x7, [sp, #%d]\n", IntSZ + FpSZ + 48); + SW.startLine() << format("stp x4, x5, [sp, #%d]\n", IntSZ + FpSZ + 32); + SW.startLine() << format("stp x2, x3, [sp, #%d]\n", IntSZ + FpSZ + 16); + if (RF.RegI() > 0 || RF.RegF() > 0 || RF.CR() == 1) { + SW.startLine() << format("stp x0, x1, [sp, #%d]\n", IntSZ + FpSZ); + } else { + // This case isn't documented; if neither RegI nor RegF nor CR=1 + // have decremented the stack pointer by SavSZ, we need to do it here + // (as the final stack adjustment of LocSZ excludes SavSZ). + SW.startLine() << format("stp x0, x1, [sp, #-%d]!\n", SavSZ); + } + } + int FloatRegs = RF.RegF() > 0 ? RF.RegF() + 1 : 0; + for (int I = (FloatRegs + 1) / 2 - 1; I >= 0; I--) { + if (I == (FloatRegs + 1) / 2 - 1 && FloatRegs % 2 == 1) { + // The last register, an odd register without a pair + SW.startLine() << format("str d%d, [sp, #%d]\n", 8 + 2 * I, + IntSZ + 16 * I); + } else if (I == 0 && RF.RegI() == 0 && RF.CR() != 1) { + SW.startLine() << format("stp d%d, d%d, [sp, #-%d]!\n", 8 + 2 * I, + 8 + 2 * I + 1, SavSZ); + } else { + SW.startLine() << format("stp d%d, d%d, [sp, #%d]\n", 8 + 2 * I, + 8 + 2 * I + 1, IntSZ + 16 * I); + } + } + if (RF.CR() == 1 && (RF.RegI() % 2) == 0) { + if (RF.RegI() == 0) + SW.startLine() << format("str lr, [sp, #-%d]!\n", SavSZ); + else + SW.startLine() << format("str lr, [sp, #%d]\n", IntSZ - 8); + } + for (int I = (RF.RegI() + 1) / 2 - 1; I >= 0; I--) { + if (I == (RF.RegI() + 1) / 2 - 1 && RF.RegI() % 2 == 1) { + // The last register, an odd register without a pair + if (RF.CR() == 1) { + if (I == 0) // If this is the only register pair + SW.startLine() << format("stp x%d, lr, [sp, #-%d]!\n", 19 + 2 * I, + SavSZ); + else + SW.startLine() << format("stp x%d, lr, [sp, #%d]\n", 19 + 2 * I, + 16 * I); + } else { + if (I == 0) + SW.startLine() << format("str x%d, [sp, #-%d]!\n", 19 + 2 * I, SavSZ); + else + SW.startLine() << format("str x%d, [sp, #%d]\n", 19 + 2 * I, 16 * I); + } + } else if (I == 0) { + // The first register pair + SW.startLine() << format("stp x19, x20, [sp, #-%d]!\n", SavSZ); + } else { + SW.startLine() << format("stp x%d, x%d, [sp, #%d]\n", 19 + 2 * I, + 19 + 2 * I + 1, 16 * I); + } + } + SW.startLine() << "end\n"; + + return true; +} + bool Decoder::dumpProcedureDataEntry(const COFFObjectFile &COFF, const SectionRef Section, unsigned Index, ArrayRef Contents) { @@ -1118,8 +1260,8 @@ bool Decoder::dumpProcedureDataEntry(const COFFObjectFile &COFF, if (Entry.Flag() == RuntimeFunctionFlag::RFF_Unpacked) return dumpUnpackedEntry(COFF, Section, Offset, Index, Entry); if (isAArch64) { - SW.startLine() << "Packed unwind data not yet supported for ARM64\n"; - return true; + const RuntimeFunctionARM64 EntryARM64(Data); + return dumpPackedARM64Entry(COFF, Section, Offset, Index, EntryARM64); } return 
dumpPackedEntry(COFF, Section, Offset, Index, Entry); } diff --git a/llvm/tools/llvm-readobj/ARMWinEHPrinter.h b/llvm/tools/llvm-readobj/ARMWinEHPrinter.h index 36fe5d6f4b2b4..3263841a267bc 100644 --- a/llvm/tools/llvm-readobj/ARMWinEHPrinter.h +++ b/llvm/tools/llvm-readobj/ARMWinEHPrinter.h @@ -17,6 +17,7 @@ namespace llvm { namespace ARM { namespace WinEH { class RuntimeFunction; +class RuntimeFunctionARM64; class Decoder { static const size_t PDataEntrySize; @@ -154,6 +155,9 @@ class Decoder { bool dumpPackedEntry(const object::COFFObjectFile &COFF, const object::SectionRef Section, uint64_t Offset, unsigned Index, const RuntimeFunction &Entry); + bool dumpPackedARM64Entry(const object::COFFObjectFile &COFF, + const object::SectionRef Section, uint64_t Offset, + unsigned Index, const RuntimeFunctionARM64 &Entry); bool dumpProcedureDataEntry(const object::COFFObjectFile &COFF, const object::SectionRef Section, unsigned Entry, ArrayRef Contents); diff --git a/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h b/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h index 035037f4eebc1..52db477ba7267 100644 --- a/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h +++ b/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h @@ -85,7 +85,7 @@ void PrinterContext::printUnwindInformation() const { reportError(SectionsOrErr.takeError(), ObjF->getFileName()); for (const Elf_Shdr &Shdr : *SectionsOrErr) { - Expected NameOrErr = Obj->getSectionName(&Shdr); + Expected NameOrErr = Obj->getSectionName(Shdr); if (!NameOrErr) reportError(NameOrErr.takeError(), ObjF->getFileName()); if (*NameOrErr == ".eh_frame") @@ -104,13 +104,13 @@ void PrinterContext::printEHFrameHdr(const Elf_Phdr *EHFramePHdr) const { const object::ELFFile *Obj = ObjF->getELFFile(); if (const Elf_Shdr *EHFrameHdr = findSectionByAddress(ObjF, EHFramePHdr->p_vaddr)) { - Expected NameOrErr = Obj->getSectionName(EHFrameHdr); + Expected NameOrErr = Obj->getSectionName(*EHFrameHdr); if (!NameOrErr) reportError(NameOrErr.takeError(), ObjF->getFileName()); W.printString("Corresponding Section", *NameOrErr); } - Expected> Content = Obj->getSegmentContents(EHFramePHdr); + Expected> Content = Obj->getSegmentContents(*EHFramePHdr); if (!Content) reportError(Content.takeError(), ObjF->getFileName()); @@ -181,7 +181,7 @@ void PrinterContext::printEHFrame(const Elf_Shdr *EHFrameShdr) const { W.indent(); Expected> DataOrErr = - ObjF->getELFFile()->getSectionContents(EHFrameShdr); + ObjF->getELFFile()->getSectionContents(*EHFrameShdr); if (!DataOrErr) reportError(DataOrErr.takeError(), ObjF->getFileName()); diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index df3799c8fbe67..051308ed7d448 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -203,6 +203,11 @@ struct VerNeed { std::vector AuxV; }; +struct NoteType { + uint32_t ID; + StringRef Name; +}; + } // namespace template class Relocation { @@ -399,7 +404,7 @@ template static std::string describe(const ELFFile &Obj, const typename ELFT::Shdr &Sec) { unsigned SecNdx = &Sec - &cantFail(Obj.sections()).front(); - return (object::getELFSectionTypeName(Obj.getHeader()->e_machine, + return (object::getELFSectionTypeName(Obj.getHeader().e_machine, Sec.sh_type) + " section with index " + Twine(SecNdx)) .str(); @@ -419,7 +424,7 @@ static Expected getLinkAsStrtab(const ELFFile &Obj, return createError("invalid section linked to " + describe(Obj, *Sec) + ": " + toString(StrTabSecOrErr.takeError())); - Expected StrTabOrErr = 
Obj.getStringTable(*StrTabSecOrErr); + Expected StrTabOrErr = Obj.getStringTable(**StrTabSecOrErr); if (!StrTabOrErr) return createError("invalid string table linked to " + describe(Obj, *Sec) + ": " + toString(StrTabOrErr.takeError())); @@ -438,13 +443,12 @@ getLinkAsSymtab(const ELFFile &Obj, const typename ELFT::Shdr *Sec, ": " + toString(SymtabOrErr.takeError())); if ((*SymtabOrErr)->sh_type != ExpectedType) - return createError("invalid section linked to " + describe(Obj, *Sec) + - ": expected " + - object::getELFSectionTypeName(Obj.getHeader()->e_machine, - ExpectedType) + - ", but got " + - object::getELFSectionTypeName(Obj.getHeader()->e_machine, - (*SymtabOrErr)->sh_type)); + return createError( + "invalid section linked to " + describe(Obj, *Sec) + ": expected " + + object::getELFSectionTypeName(Obj.getHeader().e_machine, ExpectedType) + + ", but got " + + object::getELFSectionTypeName(Obj.getHeader().e_machine, + (*SymtabOrErr)->sh_type)); Expected StrTabOrErr = getLinkAsStrtab(Obj, *SymtabOrErr); if (!StrTabOrErr) @@ -472,7 +476,7 @@ ELFDumper::getVersionTable(const Elf_Shdr *Sec, ArrayRef *SymTab, return createError("the " + describe(*Sec) + " is misaligned"); Expected> VersionsOrErr = - Obj->template getSectionContentsAsArray(Sec); + Obj->template getSectionContentsAsArray(*Sec); if (!VersionsOrErr) return createError("cannot read content of " + describe(*Sec) + ": " + toString(VersionsOrErr.takeError())); @@ -506,7 +510,7 @@ ELFDumper::getVersionDefinitions(const Elf_Shdr *Sec) const { if (!StrTabOrErr) return StrTabOrErr.takeError(); - Expected> ContentsOrErr = Obj->getSectionContents(Sec); + Expected> ContentsOrErr = Obj->getSectionContents(*Sec); if (!ContentsOrErr) return createError("cannot read content of " + describe(*Sec) + ": " + toString(ContentsOrErr.takeError())); @@ -595,7 +599,7 @@ ELFDumper::getVersionDependencies(const Elf_Shdr *Sec) const { else StrTab = *StrTabOrErr; - Expected> ContentsOrErr = Obj->getSectionContents(Sec); + Expected> ContentsOrErr = Obj->getSectionContents(*Sec); if (!ContentsOrErr) return createError("cannot read content of " + describe(*Sec) + ": " + toString(ContentsOrErr.takeError())); @@ -718,8 +722,9 @@ template class DumpStyle { TYPEDEF_ELF_TYPES(ELFT) DumpStyle(ELFDumper *Dumper) - : Obj(*Dumper->getElfObject()->getELFFile()), Dumper(Dumper) { - FileName = this->Dumper->getElfObject()->getFileName(); + : Obj(*Dumper->getElfObject()->getELFFile()), + ElfObj(*Dumper->getElfObject()), Dumper(Dumper) { + FileName = ElfObj.getFileName(); } virtual ~DumpStyle() = default; @@ -748,17 +753,15 @@ template class DumpStyle { virtual void printAddrsig() = 0; virtual void printNotes() = 0; virtual void printELFLinkerOptions() = 0; - virtual void printStackSizes(const ELFObjectFile *Obj) = 0; - void printNonRelocatableStackSizes(const ELFObjectFile *Obj, - std::function PrintHeader); - void printRelocatableStackSizes(const ELFObjectFile *Obj, - std::function PrintHeader); - void printFunctionStackSize(const ELFObjectFile *Obj, uint64_t SymValue, - Optional FunctionSec, + virtual void printStackSizes() = 0; + void printNonRelocatableStackSizes(std::function PrintHeader); + void printRelocatableStackSizes(std::function PrintHeader); + void printFunctionStackSize(uint64_t SymValue, + Optional FunctionSec, const Elf_Shdr &StackSizeSec, DataExtractor Data, uint64_t *Offset); - void printStackSize(const ELFObjectFile *Obj, RelocationRef Rel, - SectionRef FunctionSec, const Elf_Shdr &StackSizeSec, + void printStackSize(RelocationRef Rel, const 
Elf_Shdr *FunctionSec, + const Elf_Shdr &StackSizeSec, const RelocationResolver &Resolver, DataExtractor Data); virtual void printStackSizeEntry(uint64_t Size, StringRef FuncName) = 0; virtual void printMipsGOT(const MipsGOTParser &Parser) = 0; @@ -786,6 +789,7 @@ template class DumpStyle { StringRef FileName; const ELFFile &Obj; + const ELFObjectFile &ElfObj; private: const ELFDumper *Dumper; @@ -824,7 +828,7 @@ template class GNUStyle : public DumpStyle { void printAddrsig() override; void printNotes() override; void printELFLinkerOptions() override; - void printStackSizes(const ELFObjectFile *Obj) override; + void printStackSizes() override; void printStackSizeEntry(uint64_t Size, StringRef FuncName) override; void printMipsGOT(const MipsGOTParser &Parser) override; void printMipsPLT(const MipsGOTParser &Parser) override; @@ -948,7 +952,7 @@ template class LLVMStyle : public DumpStyle { void printAddrsig() override; void printNotes() override; void printELFLinkerOptions() override; - void printStackSizes(const ELFObjectFile *Obj) override; + void printStackSizes() override; void printStackSizeEntry(uint64_t Size, StringRef FuncName) override; void printMipsGOT(const MipsGOTParser &Parser) override; void printMipsPLT(const MipsGOTParser &Parser) override; @@ -1064,7 +1068,7 @@ Expected ELFDumper::getSymbolVersion(const Elf_Sym *Sym, // Get the corresponding version index entry. if (Expected EntryOrErr = ObjF->getELFFile()->template getEntry( - SymbolVersionSection, EntryIndex)) + *SymbolVersionSection, EntryIndex)) return this->getSymbolVersionByIndex((*EntryOrErr)->vs_index, IsDefault); else return EntryOrErr.takeError(); @@ -1079,7 +1083,7 @@ ELFDumper::getRelocationTarget(const Relocation &R, const ELFFile &Obj = *ObjF->getELFFile(); Expected SymOrErr = - Obj.template getEntry(SymTab, R.Symbol); + Obj.template getEntry(*SymTab, R.Symbol); if (!SymOrErr) return SymOrErr.takeError(); const Elf_Sym *Sym = *SymOrErr; @@ -1090,14 +1094,14 @@ ELFDumper::getRelocationTarget(const Relocation &R, // This code block returns the section name. if (Sym->getType() == ELF::STT_SECTION) { Expected SecOrErr = - Obj.getSection(Sym, SymTab, ShndxTable); + Obj.getSection(*Sym, SymTab, ShndxTable); if (!SecOrErr) return SecOrErr.takeError(); // A section symbol describes the section at index 0. if (*SecOrErr == nullptr) return RelSymbol(Sym, ""); - Expected NameOrErr = Obj.getSectionName(*SecOrErr); + Expected NameOrErr = Obj.getSectionName(**SecOrErr); if (!NameOrErr) return NameOrErr.takeError(); return RelSymbol(Sym, NameOrErr->str()); @@ -1222,7 +1226,7 @@ Expected ELFDumper::getSymbolSectionIndex(const Elf_Sym *Symbol, const Elf_Sym *FirstSym) const { return Symbol->st_shndx == SHN_XINDEX - ? object::getExtendedSymbolTableIndex(Symbol, FirstSym, + ? 
object::getExtendedSymbolTableIndex(*Symbol, *FirstSym, ShndxTable) : Symbol->st_shndx; } @@ -1254,7 +1258,7 @@ ELFDumper::getSymbolSectionName(const Elf_Sym *Symbol, Obj->getSection(SectionIndex); if (!SecOrErr) return SecOrErr.takeError(); - return Obj->getSectionName(*SecOrErr); + return Obj->getSectionName(**SecOrErr); } template @@ -1881,19 +1885,17 @@ ELFDumper::findDynamic(const ELFFile *Obj) { } if (DynamicPhdr && DynamicSec) { - StringRef Name = - unwrapOrError(ObjF->getFileName(), Obj->getSectionName(DynamicSec)); if (DynamicSec->sh_addr + DynamicSec->sh_size > DynamicPhdr->p_vaddr + DynamicPhdr->p_memsz || DynamicSec->sh_addr < DynamicPhdr->p_vaddr) - reportWarning(createError("The SHT_DYNAMIC section '" + Name + - "' is not contained within the " + reportWarning(createError(describe(*DynamicSec) + + " is not contained within the " "PT_DYNAMIC segment"), ObjF->getFileName()); if (DynamicSec->sh_addr != DynamicPhdr->p_vaddr) - reportWarning(createError("The SHT_DYNAMIC section '" + Name + - "' is not at the start of " + reportWarning(createError(describe(*DynamicSec) + + " is not at the start of " "PT_DYNAMIC segment"), ObjF->getFileName()); } @@ -2248,8 +2250,21 @@ void ELFDumper::parseDynamicTable(const ELFFile *Obj) { // Derive the dynamic symbol table size from the DT_HASH hash table, if // present. - if (HashTable && DynSymRegion) - DynSymRegion->Size = HashTable->nchain * DynSymRegion->EntSize; + if (HashTable && DynSymRegion) { + const uint64_t FileSize = ObjF->getELFFile()->getBufSize(); + const uint64_t DerivedSize = + (uint64_t)HashTable->nchain * DynSymRegion->EntSize; + const uint64_t Offset = + (const uint8_t *)DynSymRegion->Addr - ObjF->getELFFile()->base(); + if (DerivedSize > FileSize - Offset) + reportUniqueWarning(createError( + "the size (0x" + Twine::utohexstr(DerivedSize) + + ") of the dynamic symbol table at 0x" + Twine::utohexstr(Offset) + + ", derived from the hash table, goes past the end of the file (0x" + + Twine::utohexstr(FileSize) + ") and will be ignored")); + else + DynSymRegion->Size = HashTable->nchain * DynSymRegion->EntSize; + } } template @@ -2331,7 +2346,7 @@ template void ELFDumper::printELFLinkerOptions() { } template void ELFDumper::printStackSizes() { - ELFDumperStyle->printStackSizes(ObjF); + ELFDumperStyle->printStackSizes(); } #define LLVM_READOBJ_DT_FLAG_ENT(prefix, enum) \ @@ -2420,7 +2435,7 @@ const typename ELFT::Shdr * ELFDumper::findSectionByName(StringRef Name) const { const ELFFile *Obj = ObjF->getELFFile(); for (const Elf_Shdr &Shdr : cantFail(Obj->sections())) { - if (Expected NameOrErr = Obj->getSectionName(&Shdr)) { + if (Expected NameOrErr = Obj->getSectionName(Shdr)) { if (*NameOrErr == Name) return &Shdr; } else { @@ -2453,7 +2468,7 @@ std::string ELFDumper::getDynamicEntry(uint64_t Type, }; // Handle custom printing of architecture specific tags - switch (ObjF->getELFFile()->getHeader()->e_machine) { + switch (ObjF->getELFFile()->getHeader().e_machine) { case EM_AARCH64: switch (Type) { case DT_AARCH64_BTI_PLT: @@ -2650,7 +2665,7 @@ namespace { template <> void ELFDumper::printUnwindInfo() { const ELFFile *Obj = ObjF->getELFFile(); - const unsigned Machine = Obj->getHeader()->e_machine; + const unsigned Machine = Obj->getHeader().e_machine; if (Machine == EM_ARM) { ARM::EHABI::PrinterContext Ctx(W, Obj, ObjF->getFileName(), DotSymtabSec); @@ -2829,7 +2844,7 @@ template void ELFDumper::printLoadName() { template void ELFDumper::printArchSpecificInfo() { const ELFFile *Obj = ObjF->getELFFile(); - switch 
(Obj->getHeader()->e_machine) { + switch (Obj->getHeader().e_machine) { case EM_ARM: case EM_RISCV: printAttributes(); @@ -2864,7 +2879,7 @@ template void ELFDumper::printAttributes() { return; } - const unsigned Machine = Obj->getHeader()->e_machine; + const unsigned Machine = Obj->getHeader().e_machine; assert((Machine == EM_ARM || Machine == EM_RISCV) && "Attributes not implemented."); @@ -2875,7 +2890,7 @@ template void ELFDumper::printAttributes() { continue; ArrayRef Contents = - unwrapOrError(ObjF->getFileName(), Obj->getSectionContents(&Sec)); + unwrapOrError(ObjF->getFileName(), Obj->getSectionContents(Sec)); if (Contents[0] != ELFAttrs::Format_Version) { reportWarning(createError(Twine("unrecognised FormatVersion: 0x") + Twine::utohexstr(Contents[0])), @@ -2975,7 +2990,7 @@ Error MipsGOTParser::findGOT(Elf_Dyn_Range DynTable, return Error::success(); ArrayRef Content = - unwrapOrError(FileName, Obj->getSectionContents(GotSec)); + unwrapOrError(FileName, Obj->getSectionContents(*GotSec)); GotEntries = Entries(reinterpret_cast(Content.data()), Content.size() / sizeof(Entry)); LocalNum = GotEntries.size(); @@ -3025,7 +3040,7 @@ Error MipsGOTParser::findGOT(Elf_Dyn_Range DynTable, GlobalNum = DynSymTotal - *DtGotSym; ArrayRef Content = - unwrapOrError(FileName, Obj->getSectionContents(GotSec)); + unwrapOrError(FileName, Obj->getSectionContents(*GotSec)); GotEntries = Entries(reinterpret_cast(Content.data()), Content.size() / sizeof(Entry)); GotDynSyms = DynSyms.drop_front(*DtGotSym); @@ -3069,7 +3084,7 @@ Error MipsGOTParser::findPLT(Elf_Dyn_Range DynTable) { Twine::utohexstr(*DtJmpRel)); if (Expected> PltContentOrErr = - Obj->getSectionContents(PltSec)) + Obj->getSectionContents(*PltSec)) PltEntries = Entries(reinterpret_cast(PltContentOrErr->data()), PltContentOrErr->size() / sizeof(Entry)); @@ -3193,13 +3208,13 @@ const typename MipsGOTParser::Elf_Sym * MipsGOTParser::getPltSym(const Entry *E) const { int64_t Offset = std::distance(getPltEntries().data(), E); if (PltRelSec->sh_type == ELF::SHT_REL) { - Elf_Rel_Range Rels = unwrapOrError(FileName, Obj->rels(PltRelSec)); + Elf_Rel_Range Rels = unwrapOrError(FileName, Obj->rels(*PltRelSec)); return unwrapOrError(FileName, - Obj->getRelocationSymbol(&Rels[Offset], PltSymTable)); + Obj->getRelocationSymbol(Rels[Offset], PltSymTable)); } else { - Elf_Rela_Range Rels = unwrapOrError(FileName, Obj->relas(PltRelSec)); + Elf_Rela_Range Rels = unwrapOrError(FileName, Obj->relas(*PltRelSec)); return unwrapOrError(FileName, - Obj->getRelocationSymbol(&Rels[Offset], PltSymTable)); + Obj->getRelocationSymbol(Rels[Offset], PltSymTable)); } } @@ -3296,7 +3311,7 @@ template void ELFDumper::printMipsReginfo() { const ELFFile *Obj = ObjF->getELFFile(); Expected> ContentsOrErr = - Obj->getSectionContents(RegInfoSec); + Obj->getSectionContents(*RegInfoSec); if (!ContentsOrErr) { this->reportUniqueWarning(createError( "unable to read the content of the .reginfo section (" + @@ -3364,7 +3379,7 @@ template void ELFDumper::printMipsOptions() { DictScope GS(W, "MIPS Options"); ArrayRef Data = - unwrapOrError(ObjF->getFileName(), Obj->getSectionContents(MipsOpts)); + unwrapOrError(ObjF->getFileName(), Obj->getSectionContents(*MipsOpts)); const uint8_t *const SecBegin = Data.begin(); while (!Data.empty()) { bool IsSupported; @@ -3404,7 +3419,7 @@ template void ELFDumper::printStackMap() const { }; Expected> ContentOrErr = - Obj->getSectionContents(StackMapSection); + Obj->getSectionContents(*StackMapSection); if (!ContentOrErr) { 
Warn(ContentOrErr.takeError()); return; @@ -3439,9 +3454,9 @@ static inline void printFields(formatted_raw_ostream &OS, StringRef Str1, template static std::string getSectionHeadersNumString(const ELFFile &Obj, StringRef FileName) { - const typename ELFT::Ehdr *ElfHeader = Obj.getHeader(); - if (ElfHeader->e_shnum != 0) - return to_string(ElfHeader->e_shnum); + const typename ELFT::Ehdr &ElfHeader = Obj.getHeader(); + if (ElfHeader.e_shnum != 0) + return to_string(ElfHeader.e_shnum); ArrayRef Arr = cantFail(Obj.sections()); if (Arr.empty()) @@ -3452,71 +3467,71 @@ static std::string getSectionHeadersNumString(const ELFFile &Obj, template static std::string getSectionHeaderTableIndexString(const ELFFile &Obj, StringRef FileName) { - const typename ELFT::Ehdr *ElfHeader = Obj.getHeader(); - if (ElfHeader->e_shstrndx != SHN_XINDEX) - return to_string(ElfHeader->e_shstrndx); + const typename ELFT::Ehdr &ElfHeader = Obj.getHeader(); + if (ElfHeader.e_shstrndx != SHN_XINDEX) + return to_string(ElfHeader.e_shstrndx); ArrayRef Arr = cantFail(Obj.sections()); if (Arr.empty()) return "65535 (corrupt: out of range)"; - return to_string(ElfHeader->e_shstrndx) + " (" + to_string(Arr[0].sh_link) + + return to_string(ElfHeader.e_shstrndx) + " (" + to_string(Arr[0].sh_link) + ")"; } template void GNUStyle::printFileHeaders() { - const Elf_Ehdr *e = this->Obj.getHeader(); + const Elf_Ehdr &e = this->Obj.getHeader(); OS << "ELF Header:\n"; OS << " Magic: "; std::string Str; for (int i = 0; i < ELF::EI_NIDENT; i++) - OS << format(" %02x", static_cast(e->e_ident[i])); + OS << format(" %02x", static_cast(e.e_ident[i])); OS << "\n"; - Str = printEnum(e->e_ident[ELF::EI_CLASS], makeArrayRef(ElfClass)); + Str = printEnum(e.e_ident[ELF::EI_CLASS], makeArrayRef(ElfClass)); printFields(OS, "Class:", Str); - Str = printEnum(e->e_ident[ELF::EI_DATA], makeArrayRef(ElfDataEncoding)); + Str = printEnum(e.e_ident[ELF::EI_DATA], makeArrayRef(ElfDataEncoding)); printFields(OS, "Data:", Str); OS.PadToColumn(2u); OS << "Version:"; OS.PadToColumn(37u); - OS << to_hexString(e->e_ident[ELF::EI_VERSION]); - if (e->e_version == ELF::EV_CURRENT) + OS << to_hexString(e.e_ident[ELF::EI_VERSION]); + if (e.e_version == ELF::EV_CURRENT) OS << " (current)"; OS << "\n"; - Str = printEnum(e->e_ident[ELF::EI_OSABI], makeArrayRef(ElfOSABI)); + Str = printEnum(e.e_ident[ELF::EI_OSABI], makeArrayRef(ElfOSABI)); printFields(OS, "OS/ABI:", Str); printFields(OS, - "ABI Version:", std::to_string(e->e_ident[ELF::EI_ABIVERSION])); - Str = printEnum(e->e_type, makeArrayRef(ElfObjectFileType)); + "ABI Version:", std::to_string(e.e_ident[ELF::EI_ABIVERSION])); + Str = printEnum(e.e_type, makeArrayRef(ElfObjectFileType)); printFields(OS, "Type:", Str); - Str = printEnum(e->e_machine, makeArrayRef(ElfMachineType)); + Str = printEnum(e.e_machine, makeArrayRef(ElfMachineType)); printFields(OS, "Machine:", Str); - Str = "0x" + to_hexString(e->e_version); + Str = "0x" + to_hexString(e.e_version); printFields(OS, "Version:", Str); - Str = "0x" + to_hexString(e->e_entry); + Str = "0x" + to_hexString(e.e_entry); printFields(OS, "Entry point address:", Str); - Str = to_string(e->e_phoff) + " (bytes into file)"; + Str = to_string(e.e_phoff) + " (bytes into file)"; printFields(OS, "Start of program headers:", Str); - Str = to_string(e->e_shoff) + " (bytes into file)"; + Str = to_string(e.e_shoff) + " (bytes into file)"; printFields(OS, "Start of section headers:", Str); std::string ElfFlags; - if (e->e_machine == EM_MIPS) + if (e.e_machine == EM_MIPS) ElfFlags = - 
printFlags(e->e_flags, makeArrayRef(ElfHeaderMipsFlags), + printFlags(e.e_flags, makeArrayRef(ElfHeaderMipsFlags), unsigned(ELF::EF_MIPS_ARCH), unsigned(ELF::EF_MIPS_ABI), unsigned(ELF::EF_MIPS_MACH)); - else if (e->e_machine == EM_RISCV) - ElfFlags = printFlags(e->e_flags, makeArrayRef(ElfHeaderRISCVFlags)); - Str = "0x" + to_hexString(e->e_flags); + else if (e.e_machine == EM_RISCV) + ElfFlags = printFlags(e.e_flags, makeArrayRef(ElfHeaderRISCVFlags)); + Str = "0x" + to_hexString(e.e_flags); if (!ElfFlags.empty()) Str = Str + ", " + ElfFlags; printFields(OS, "Flags:", Str); - Str = to_string(e->e_ehsize) + " (bytes)"; + Str = to_string(e.e_ehsize) + " (bytes)"; printFields(OS, "Size of this header:", Str); - Str = to_string(e->e_phentsize) + " (bytes)"; + Str = to_string(e.e_phentsize) + " (bytes)"; printFields(OS, "Size of program headers:", Str); - Str = to_string(e->e_phnum); + Str = to_string(e.e_phnum); printFields(OS, "Number of program headers:", Str); - Str = to_string(e->e_shentsize) + " (bytes)"; + Str = to_string(e.e_shentsize) + " (bytes)"; printFields(OS, "Size of section headers:", Str); Str = getSectionHeadersNumString(this->Obj, this->FileName); printFields(OS, "Number of section headers:", Str); @@ -3560,11 +3575,11 @@ std::vector getGroups(const ELFFile &Obj, StringRef StrTable = unwrapOrError(FileName, Obj.getStringTableForSymtab(*Symtab)); const Elf_Sym *Sym = unwrapOrError( - FileName, Obj.template getEntry(Symtab, Sec.sh_info)); + FileName, Obj.template getEntry(*Symtab, Sec.sh_info)); auto Data = unwrapOrError( - FileName, Obj.template getSectionContentsAsArray(&Sec)); + FileName, Obj.template getSectionContentsAsArray(Sec)); - StringRef Name = unwrapOrError(FileName, Obj.getSectionName(&Sec)); + StringRef Name = unwrapOrError(FileName, Obj.getSectionName(Sec)); StringRef Signature = StrTable.data() + Sym->st_name; Ret.push_back({Name, maybeDemangle(Signature), @@ -3577,7 +3592,7 @@ std::vector getGroups(const ELFFile &Obj, std::vector &GM = Ret.back().Members; for (uint32_t Ndx : Data.slice(1)) { - auto Sec = unwrapOrError(FileName, Obj.getSection(Ndx)); + const Elf_Shdr &Sec = *unwrapOrError(FileName, Obj.getSection(Ndx)); const StringRef Name = unwrapOrError(FileName, Obj.getSectionName(Sec)); GM.push_back({Name, Ndx}); } @@ -3724,7 +3739,7 @@ template void GNUStyle::printRelocations() { if (Sec.sh_type == ELF::SHT_ANDROID_REL || Sec.sh_type == ELF::SHT_ANDROID_RELA) { Expected> RelasOrErr = - this->Obj.android_relas(&Sec); + this->Obj.android_relas(Sec); if (!RelasOrErr) return RelasOrErr.takeError(); return RelasOrErr->size(); @@ -3732,7 +3747,7 @@ template void GNUStyle::printRelocations() { if (!opts::RawRelr && (Sec.sh_type == ELF::SHT_RELR || Sec.sh_type == ELF::SHT_ANDROID_RELR)) { - Expected RelrsOrErr = this->Obj.relrs(&Sec); + Expected RelrsOrErr = this->Obj.relrs(Sec); if (!RelrsOrErr) return RelrsOrErr.takeError(); return this->Obj.decode_relrs(*RelrsOrErr).size(); @@ -3824,7 +3839,7 @@ template void GNUStyle::printSectionHeaders() { ArrayRef Sections = cantFail(this->Obj.sections()); OS << "There are " << to_string(Sections.size()) << " section headers, starting at offset " - << "0x" << to_hexString(this->Obj.getHeader()->e_shoff, false) << ":\n\n"; + << "0x" << to_hexString(this->Obj.getHeader().e_shoff, false) << ":\n\n"; OS << "Section Headers:\n"; Field Fields[11] = { {"[Nr]", 2}, {"Name", 7}, {"Type", 25}, @@ -3849,15 +3864,15 @@ template void GNUStyle::printSectionHeaders() { Fields[1].Str = ""; else Fields[1].Str = std::string(unwrapOrError( 
- this->FileName, this->Obj.getSectionName(&Sec, SecStrTable))); + this->FileName, this->Obj.getSectionName(Sec, SecStrTable))); Fields[2].Str = - getSectionTypeString(this->Obj.getHeader()->e_machine, Sec.sh_type); + getSectionTypeString(this->Obj.getHeader().e_machine, Sec.sh_type); Fields[3].Str = to_string(format_hex_no_prefix(Sec.sh_addr, ELFT::Is64Bits ? 16 : 8)); Fields[4].Str = to_string(format_hex_no_prefix(Sec.sh_offset, 6)); Fields[5].Str = to_string(format_hex_no_prefix(Sec.sh_size, 6)); Fields[6].Str = to_string(format_hex_no_prefix(Sec.sh_entsize, 2)); - Fields[7].Str = getGNUFlags(this->Obj.getHeader()->e_machine, Sec.sh_flags); + Fields[7].Str = getGNUFlags(this->Obj.getHeader().e_machine, Sec.sh_flags); Fields[8].Str = to_string(Sec.sh_link); Fields[9].Str = to_string(Sec.sh_info); Fields[10].Str = to_string(Sec.sh_addralign); @@ -3877,7 +3892,7 @@ template void GNUStyle::printSectionHeaders() { OS << "\n"; ++SectionIndex; } - printSectionDescription(OS, this->Obj.getHeader()->e_machine); + printSectionDescription(OS, this->Obj.getHeader().e_machine); } template @@ -3915,7 +3930,7 @@ std::string GNUStyle::getSymbolSectionNdx(const Elf_Sym *Symbol, return "COM"; case ELF::SHN_XINDEX: { Expected IndexOrErr = object::getExtendedSymbolTableIndex( - Symbol, FirstSym, this->dumper()->getShndxTable()); + *Symbol, *FirstSym, this->dumper()->getShndxTable()); if (!IndexOrErr) { assert(Symbol->st_shndx == SHN_XINDEX && "getSymbolSectionIndex should only fail due to an invalid " @@ -3958,7 +3973,7 @@ void GNUStyle::printSymbol(const Elf_Sym *Symbol, const Elf_Sym *FirstSym, Fields[2].Str = to_string(format_decimal(Symbol->st_size, 5)); unsigned char SymbolType = Symbol->getType(); - if (this->Obj.getHeader()->e_machine == ELF::EM_AMDGPU && + if (this->Obj.getHeader().e_machine == ELF::EM_AMDGPU && SymbolType >= ELF::STT_LOOS && SymbolType < ELF::STT_HIOS) Fields[3].Str = printEnum(SymbolType, makeArrayRef(AMDGPUSymbolTypes)); else @@ -3997,7 +4012,7 @@ void GNUStyle::printHashedSymbol(const Elf_Sym *FirstSym, uint32_t Sym, Fields[3].Str = to_string(format_decimal(Symbol->st_size, 5)); unsigned char SymbolType = Symbol->getType(); - if (this->Obj.getHeader()->e_machine == ELF::EM_AMDGPU && + if (this->Obj.getHeader().e_machine == ELF::EM_AMDGPU && SymbolType >= ELF::STT_LOOS && SymbolType < ELF::STT_HIOS) Fields[4].Str = printEnum(SymbolType, makeArrayRef(AMDGPUSymbolTypes)); else @@ -4224,14 +4239,14 @@ void GNUStyle::printProgramHeaders( template void GNUStyle::printProgramHeaders() { unsigned Bias = ELFT::Is64Bits ? 
8 : 0; - const Elf_Ehdr *Header = this->Obj.getHeader(); + const Elf_Ehdr &Header = this->Obj.getHeader(); Field Fields[8] = {2, 17, 26, 37 + Bias, 48 + Bias, 56 + Bias, 64 + Bias, 68 + Bias}; OS << "\nElf file type is " - << printEnum(Header->e_type, makeArrayRef(ElfObjectFileType)) << "\n" - << "Entry point " << format_hex(Header->e_entry, 3) << "\n" - << "There are " << Header->e_phnum << " program headers," - << " starting at offset " << Header->e_phoff << "\n\n" + << printEnum(Header.e_type, makeArrayRef(ElfObjectFileType)) << "\n" + << "Entry point " << format_hex(Header.e_entry, 3) << "\n" + << "There are " << Header.e_phnum << " program headers," + << " starting at offset " << Header.e_phoff << "\n\n" << "Program Headers:\n"; if (ELFT::Is64Bits) OS << " Type Offset VirtAddr PhysAddr " @@ -4251,7 +4266,7 @@ template void GNUStyle::printProgramHeaders() { } for (const Elf_Phdr &Phdr : *PhdrsOrErr) { - Fields[0].Str = getGNUPtType(Header->e_machine, Phdr.p_type); + Fields[0].Str = getGNUPtType(Header.e_machine, Phdr.p_type); Fields[1].Str = to_string(format_hex(Phdr.p_offset, 8)); Fields[2].Str = to_string(format_hex(Phdr.p_vaddr, Width)); Fields[3].Str = to_string(format_hex(Phdr.p_paddr, Width)); @@ -4319,8 +4334,7 @@ template void GNUStyle::printSectionMapping() { if (checkTLSSections(Phdr, Sec) && checkOffsets(Phdr, Sec) && checkVMA(Phdr, Sec) && checkPTDynamic(Phdr, Sec)) { Sections += - unwrapOrError(this->FileName, this->Obj.getSectionName(&Sec)) - .str() + + unwrapOrError(this->FileName, this->Obj.getSectionName(Sec)).str() + " "; BelongsToSegment.insert(&Sec); } @@ -4334,7 +4348,7 @@ template void GNUStyle::printSectionMapping() { for (const Elf_Shdr &Sec : cantFail(this->Obj.sections())) { if (BelongsToSegment.find(&Sec) == BelongsToSegment.end()) Sections += - unwrapOrError(this->FileName, this->Obj.getSectionName(&Sec)).str() + + unwrapOrError(this->FileName, this->Obj.getSectionName(Sec)).str() + ' '; } if (!Sections.empty()) { @@ -4475,7 +4489,7 @@ template void GNUStyle::printGNUVersionSectionProlog( const typename ELFT::Shdr *Sec, const Twine &Label, unsigned EntriesNum) { StringRef SecName = - unwrapOrError(this->FileName, this->Obj.getSectionName(Sec)); + unwrapOrError(this->FileName, this->Obj.getSectionName(*Sec)); OS << Label << " section '" << SecName << "' " << "contains " << EntriesNum << " entries:\n"; @@ -4484,7 +4498,7 @@ void GNUStyle::printGNUVersionSectionProlog( this->Obj.getSection(Sec->sh_link); if (SymTabOrErr) SymTabName = - unwrapOrError(this->FileName, this->Obj.getSectionName(*SymTabOrErr)); + unwrapOrError(this->FileName, this->Obj.getSectionName(**SymTabOrErr)); else this->reportUniqueWarning(createError("invalid section linked to " + describe(this->Obj, *Sec) + ": " + @@ -4764,184 +4778,6 @@ template void GNUStyle::printAddrsig() { reportError(createError("--addrsig: not implemented"), this->FileName); } -static StringRef getGenericNoteTypeName(const uint32_t NT) { - static const struct { - uint32_t ID; - const char *Name; - } Notes[] = { - {ELF::NT_VERSION, "NT_VERSION (version)"}, - {ELF::NT_ARCH, "NT_ARCH (architecture)"}, - {ELF::NT_GNU_BUILD_ATTRIBUTE_OPEN, "OPEN"}, - {ELF::NT_GNU_BUILD_ATTRIBUTE_FUNC, "func"}, - }; - - for (const auto &Note : Notes) - if (Note.ID == NT) - return Note.Name; - - return ""; -} - -static StringRef getCoreNoteTypeName(const uint32_t NT) { - static const struct { - uint32_t ID; - const char *Name; - } Notes[] = { - {ELF::NT_PRSTATUS, "NT_PRSTATUS (prstatus structure)"}, - {ELF::NT_FPREGSET, "NT_FPREGSET 
(floating point registers)"}, - {ELF::NT_PRPSINFO, "NT_PRPSINFO (prpsinfo structure)"}, - {ELF::NT_TASKSTRUCT, "NT_TASKSTRUCT (task structure)"}, - {ELF::NT_AUXV, "NT_AUXV (auxiliary vector)"}, - {ELF::NT_PSTATUS, "NT_PSTATUS (pstatus structure)"}, - {ELF::NT_FPREGS, "NT_FPREGS (floating point registers)"}, - {ELF::NT_PSINFO, "NT_PSINFO (psinfo structure)"}, - {ELF::NT_LWPSTATUS, "NT_LWPSTATUS (lwpstatus_t structure)"}, - {ELF::NT_LWPSINFO, "NT_LWPSINFO (lwpsinfo_t structure)"}, - {ELF::NT_WIN32PSTATUS, "NT_WIN32PSTATUS (win32_pstatus structure)"}, - - {ELF::NT_PPC_VMX, "NT_PPC_VMX (ppc Altivec registers)"}, - {ELF::NT_PPC_VSX, "NT_PPC_VSX (ppc VSX registers)"}, - {ELF::NT_PPC_TAR, "NT_PPC_TAR (ppc TAR register)"}, - {ELF::NT_PPC_PPR, "NT_PPC_PPR (ppc PPR register)"}, - {ELF::NT_PPC_DSCR, "NT_PPC_DSCR (ppc DSCR register)"}, - {ELF::NT_PPC_EBB, "NT_PPC_EBB (ppc EBB registers)"}, - {ELF::NT_PPC_PMU, "NT_PPC_PMU (ppc PMU registers)"}, - {ELF::NT_PPC_TM_CGPR, "NT_PPC_TM_CGPR (ppc checkpointed GPR registers)"}, - {ELF::NT_PPC_TM_CFPR, - "NT_PPC_TM_CFPR (ppc checkpointed floating point registers)"}, - {ELF::NT_PPC_TM_CVMX, - "NT_PPC_TM_CVMX (ppc checkpointed Altivec registers)"}, - {ELF::NT_PPC_TM_CVSX, "NT_PPC_TM_CVSX (ppc checkpointed VSX registers)"}, - {ELF::NT_PPC_TM_SPR, "NT_PPC_TM_SPR (ppc TM special purpose registers)"}, - {ELF::NT_PPC_TM_CTAR, "NT_PPC_TM_CTAR (ppc checkpointed TAR register)"}, - {ELF::NT_PPC_TM_CPPR, "NT_PPC_TM_CPPR (ppc checkpointed PPR register)"}, - {ELF::NT_PPC_TM_CDSCR, - "NT_PPC_TM_CDSCR (ppc checkpointed DSCR register)"}, - - {ELF::NT_386_TLS, "NT_386_TLS (x86 TLS information)"}, - {ELF::NT_386_IOPERM, "NT_386_IOPERM (x86 I/O permissions)"}, - {ELF::NT_X86_XSTATE, "NT_X86_XSTATE (x86 XSAVE extended state)"}, - - {ELF::NT_S390_HIGH_GPRS, - "NT_S390_HIGH_GPRS (s390 upper register halves)"}, - {ELF::NT_S390_TIMER, "NT_S390_TIMER (s390 timer register)"}, - {ELF::NT_S390_TODCMP, "NT_S390_TODCMP (s390 TOD comparator register)"}, - {ELF::NT_S390_TODPREG, - "NT_S390_TODPREG (s390 TOD programmable register)"}, - {ELF::NT_S390_CTRS, "NT_S390_CTRS (s390 control registers)"}, - {ELF::NT_S390_PREFIX, "NT_S390_PREFIX (s390 prefix register)"}, - {ELF::NT_S390_LAST_BREAK, - "NT_S390_LAST_BREAK (s390 last breaking event address)"}, - {ELF::NT_S390_SYSTEM_CALL, - "NT_S390_SYSTEM_CALL (s390 system call restart data)"}, - {ELF::NT_S390_TDB, "NT_S390_TDB (s390 transaction diagnostic block)"}, - {ELF::NT_S390_VXRS_LOW, - "NT_S390_VXRS_LOW (s390 vector registers 0-15 upper half)"}, - {ELF::NT_S390_VXRS_HIGH, - "NT_S390_VXRS_HIGH (s390 vector registers 16-31)"}, - {ELF::NT_S390_GS_CB, "NT_S390_GS_CB (s390 guarded-storage registers)"}, - {ELF::NT_S390_GS_BC, - "NT_S390_GS_BC (s390 guarded-storage broadcast control)"}, - - {ELF::NT_ARM_VFP, "NT_ARM_VFP (arm VFP registers)"}, - {ELF::NT_ARM_TLS, "NT_ARM_TLS (AArch TLS registers)"}, - {ELF::NT_ARM_HW_BREAK, - "NT_ARM_HW_BREAK (AArch hardware breakpoint registers)"}, - {ELF::NT_ARM_HW_WATCH, - "NT_ARM_HW_WATCH (AArch hardware watchpoint registers)"}, - - {ELF::NT_FILE, "NT_FILE (mapped files)"}, - {ELF::NT_PRXFPREG, "NT_PRXFPREG (user_xfpregs structure)"}, - {ELF::NT_SIGINFO, "NT_SIGINFO (siginfo_t data)"}, - }; - - for (const auto &Note : Notes) - if (Note.ID == NT) - return Note.Name; - - return ""; -} - -static std::string getGNUNoteTypeName(const uint32_t NT) { - static const struct { - uint32_t ID; - const char *Name; - } Notes[] = { - {ELF::NT_GNU_ABI_TAG, "NT_GNU_ABI_TAG (ABI version tag)"}, - {ELF::NT_GNU_HWCAP, "NT_GNU_HWCAP 
(DSO-supplied software HWCAP info)"}, - {ELF::NT_GNU_BUILD_ID, "NT_GNU_BUILD_ID (unique build ID bitstring)"}, - {ELF::NT_GNU_GOLD_VERSION, "NT_GNU_GOLD_VERSION (gold version)"}, - {ELF::NT_GNU_PROPERTY_TYPE_0, "NT_GNU_PROPERTY_TYPE_0 (property note)"}, - }; - - for (const auto &Note : Notes) - if (Note.ID == NT) - return std::string(Note.Name); - - std::string string; - raw_string_ostream OS(string); - OS << format("Unknown note type (0x%08x)", NT); - return OS.str(); -} - -static std::string getFreeBSDNoteTypeName(const uint32_t NT) { - static const struct { - uint32_t ID; - const char *Name; - } Notes[] = { - {ELF::NT_FREEBSD_THRMISC, "NT_THRMISC (thrmisc structure)"}, - {ELF::NT_FREEBSD_PROCSTAT_PROC, "NT_PROCSTAT_PROC (proc data)"}, - {ELF::NT_FREEBSD_PROCSTAT_FILES, "NT_PROCSTAT_FILES (files data)"}, - {ELF::NT_FREEBSD_PROCSTAT_VMMAP, "NT_PROCSTAT_VMMAP (vmmap data)"}, - {ELF::NT_FREEBSD_PROCSTAT_GROUPS, "NT_PROCSTAT_GROUPS (groups data)"}, - {ELF::NT_FREEBSD_PROCSTAT_UMASK, "NT_PROCSTAT_UMASK (umask data)"}, - {ELF::NT_FREEBSD_PROCSTAT_RLIMIT, "NT_PROCSTAT_RLIMIT (rlimit data)"}, - {ELF::NT_FREEBSD_PROCSTAT_OSREL, "NT_PROCSTAT_OSREL (osreldate data)"}, - {ELF::NT_FREEBSD_PROCSTAT_PSSTRINGS, - "NT_PROCSTAT_PSSTRINGS (ps_strings data)"}, - {ELF::NT_FREEBSD_PROCSTAT_AUXV, "NT_PROCSTAT_AUXV (auxv data)"}, - }; - - for (const auto &Note : Notes) - if (Note.ID == NT) - return std::string(Note.Name); - - std::string string; - raw_string_ostream OS(string); - OS << format("Unknown note type (0x%08x)", NT); - return OS.str(); -} - -static std::string getAMDNoteTypeName(const uint32_t NT) { - static const struct { - uint32_t ID; - const char *Name; - } Notes[] = {{ELF::NT_AMD_AMDGPU_HSA_METADATA, - "NT_AMD_AMDGPU_HSA_METADATA (HSA Metadata)"}, - {ELF::NT_AMD_AMDGPU_ISA, "NT_AMD_AMDGPU_ISA (ISA Version)"}, - {ELF::NT_AMD_AMDGPU_PAL_METADATA, - "NT_AMD_AMDGPU_PAL_METADATA (PAL Metadata)"}}; - - for (const auto &Note : Notes) - if (Note.ID == NT) - return std::string(Note.Name); - - std::string string; - raw_string_ostream OS(string); - OS << format("Unknown note type (0x%08x)", NT); - return OS.str(); -} - -static std::string getAMDGPUNoteTypeName(const uint32_t NT) { - if (NT == ELF::NT_AMDGPU_METADATA) - return std::string("NT_AMDGPU_METADATA (AMDGPU Metadata)"); - - std::string string; - raw_string_ostream OS(string); - OS << format("Unknown note type (0x%08x)", NT); - return OS.str(); -} - template static std::string getGNUProperty(uint32_t Type, uint32_t DataSize, ArrayRef Data) { @@ -5291,6 +5127,138 @@ static void printCoreNote(raw_ostream &OS, const CoreNote &Note) { } } +static const NoteType GenericNoteTypes[] = { + {ELF::NT_VERSION, "NT_VERSION (version)"}, + {ELF::NT_ARCH, "NT_ARCH (architecture)"}, + {ELF::NT_GNU_BUILD_ATTRIBUTE_OPEN, "OPEN"}, + {ELF::NT_GNU_BUILD_ATTRIBUTE_FUNC, "func"}, +}; + +static const NoteType GNUNoteTypes[] = { + {ELF::NT_GNU_ABI_TAG, "NT_GNU_ABI_TAG (ABI version tag)"}, + {ELF::NT_GNU_HWCAP, "NT_GNU_HWCAP (DSO-supplied software HWCAP info)"}, + {ELF::NT_GNU_BUILD_ID, "NT_GNU_BUILD_ID (unique build ID bitstring)"}, + {ELF::NT_GNU_GOLD_VERSION, "NT_GNU_GOLD_VERSION (gold version)"}, + {ELF::NT_GNU_PROPERTY_TYPE_0, "NT_GNU_PROPERTY_TYPE_0 (property note)"}, +}; + +static const NoteType FreeBSDNoteTypes[] = { + {ELF::NT_FREEBSD_THRMISC, "NT_THRMISC (thrmisc structure)"}, + {ELF::NT_FREEBSD_PROCSTAT_PROC, "NT_PROCSTAT_PROC (proc data)"}, + {ELF::NT_FREEBSD_PROCSTAT_FILES, "NT_PROCSTAT_FILES (files data)"}, + {ELF::NT_FREEBSD_PROCSTAT_VMMAP, "NT_PROCSTAT_VMMAP 
(vmmap data)"}, + {ELF::NT_FREEBSD_PROCSTAT_GROUPS, "NT_PROCSTAT_GROUPS (groups data)"}, + {ELF::NT_FREEBSD_PROCSTAT_UMASK, "NT_PROCSTAT_UMASK (umask data)"}, + {ELF::NT_FREEBSD_PROCSTAT_RLIMIT, "NT_PROCSTAT_RLIMIT (rlimit data)"}, + {ELF::NT_FREEBSD_PROCSTAT_OSREL, "NT_PROCSTAT_OSREL (osreldate data)"}, + {ELF::NT_FREEBSD_PROCSTAT_PSSTRINGS, + "NT_PROCSTAT_PSSTRINGS (ps_strings data)"}, + {ELF::NT_FREEBSD_PROCSTAT_AUXV, "NT_PROCSTAT_AUXV (auxv data)"}, +}; + +static const NoteType AMDNoteTypes[] = { + {ELF::NT_AMD_AMDGPU_HSA_METADATA, + "NT_AMD_AMDGPU_HSA_METADATA (HSA Metadata)"}, + {ELF::NT_AMD_AMDGPU_ISA, "NT_AMD_AMDGPU_ISA (ISA Version)"}, + {ELF::NT_AMD_AMDGPU_PAL_METADATA, + "NT_AMD_AMDGPU_PAL_METADATA (PAL Metadata)"}, +}; + +static const NoteType AMDGPUNoteTypes[] = { + {ELF::NT_AMDGPU_METADATA, "NT_AMDGPU_METADATA (AMDGPU Metadata)"}, +}; + +static const NoteType CoreNoteTypes[] = { + {ELF::NT_PRSTATUS, "NT_PRSTATUS (prstatus structure)"}, + {ELF::NT_FPREGSET, "NT_FPREGSET (floating point registers)"}, + {ELF::NT_PRPSINFO, "NT_PRPSINFO (prpsinfo structure)"}, + {ELF::NT_TASKSTRUCT, "NT_TASKSTRUCT (task structure)"}, + {ELF::NT_AUXV, "NT_AUXV (auxiliary vector)"}, + {ELF::NT_PSTATUS, "NT_PSTATUS (pstatus structure)"}, + {ELF::NT_FPREGS, "NT_FPREGS (floating point registers)"}, + {ELF::NT_PSINFO, "NT_PSINFO (psinfo structure)"}, + {ELF::NT_LWPSTATUS, "NT_LWPSTATUS (lwpstatus_t structure)"}, + {ELF::NT_LWPSINFO, "NT_LWPSINFO (lwpsinfo_t structure)"}, + {ELF::NT_WIN32PSTATUS, "NT_WIN32PSTATUS (win32_pstatus structure)"}, + + {ELF::NT_PPC_VMX, "NT_PPC_VMX (ppc Altivec registers)"}, + {ELF::NT_PPC_VSX, "NT_PPC_VSX (ppc VSX registers)"}, + {ELF::NT_PPC_TAR, "NT_PPC_TAR (ppc TAR register)"}, + {ELF::NT_PPC_PPR, "NT_PPC_PPR (ppc PPR register)"}, + {ELF::NT_PPC_DSCR, "NT_PPC_DSCR (ppc DSCR register)"}, + {ELF::NT_PPC_EBB, "NT_PPC_EBB (ppc EBB registers)"}, + {ELF::NT_PPC_PMU, "NT_PPC_PMU (ppc PMU registers)"}, + {ELF::NT_PPC_TM_CGPR, "NT_PPC_TM_CGPR (ppc checkpointed GPR registers)"}, + {ELF::NT_PPC_TM_CFPR, + "NT_PPC_TM_CFPR (ppc checkpointed floating point registers)"}, + {ELF::NT_PPC_TM_CVMX, + "NT_PPC_TM_CVMX (ppc checkpointed Altivec registers)"}, + {ELF::NT_PPC_TM_CVSX, "NT_PPC_TM_CVSX (ppc checkpointed VSX registers)"}, + {ELF::NT_PPC_TM_SPR, "NT_PPC_TM_SPR (ppc TM special purpose registers)"}, + {ELF::NT_PPC_TM_CTAR, "NT_PPC_TM_CTAR (ppc checkpointed TAR register)"}, + {ELF::NT_PPC_TM_CPPR, "NT_PPC_TM_CPPR (ppc checkpointed PPR register)"}, + {ELF::NT_PPC_TM_CDSCR, "NT_PPC_TM_CDSCR (ppc checkpointed DSCR register)"}, + + {ELF::NT_386_TLS, "NT_386_TLS (x86 TLS information)"}, + {ELF::NT_386_IOPERM, "NT_386_IOPERM (x86 I/O permissions)"}, + {ELF::NT_X86_XSTATE, "NT_X86_XSTATE (x86 XSAVE extended state)"}, + + {ELF::NT_S390_HIGH_GPRS, "NT_S390_HIGH_GPRS (s390 upper register halves)"}, + {ELF::NT_S390_TIMER, "NT_S390_TIMER (s390 timer register)"}, + {ELF::NT_S390_TODCMP, "NT_S390_TODCMP (s390 TOD comparator register)"}, + {ELF::NT_S390_TODPREG, "NT_S390_TODPREG (s390 TOD programmable register)"}, + {ELF::NT_S390_CTRS, "NT_S390_CTRS (s390 control registers)"}, + {ELF::NT_S390_PREFIX, "NT_S390_PREFIX (s390 prefix register)"}, + {ELF::NT_S390_LAST_BREAK, + "NT_S390_LAST_BREAK (s390 last breaking event address)"}, + {ELF::NT_S390_SYSTEM_CALL, + "NT_S390_SYSTEM_CALL (s390 system call restart data)"}, + {ELF::NT_S390_TDB, "NT_S390_TDB (s390 transaction diagnostic block)"}, + {ELF::NT_S390_VXRS_LOW, + "NT_S390_VXRS_LOW (s390 vector registers 0-15 upper half)"}, + {ELF::NT_S390_VXRS_HIGH, 
"NT_S390_VXRS_HIGH (s390 vector registers 16-31)"}, + {ELF::NT_S390_GS_CB, "NT_S390_GS_CB (s390 guarded-storage registers)"}, + {ELF::NT_S390_GS_BC, + "NT_S390_GS_BC (s390 guarded-storage broadcast control)"}, + + {ELF::NT_ARM_VFP, "NT_ARM_VFP (arm VFP registers)"}, + {ELF::NT_ARM_TLS, "NT_ARM_TLS (AArch TLS registers)"}, + {ELF::NT_ARM_HW_BREAK, + "NT_ARM_HW_BREAK (AArch hardware breakpoint registers)"}, + {ELF::NT_ARM_HW_WATCH, + "NT_ARM_HW_WATCH (AArch hardware watchpoint registers)"}, + + {ELF::NT_FILE, "NT_FILE (mapped files)"}, + {ELF::NT_PRXFPREG, "NT_PRXFPREG (user_xfpregs structure)"}, + {ELF::NT_SIGINFO, "NT_SIGINFO (siginfo_t data)"}, +}; + +template +const StringRef getNoteTypeName(const typename ELFT::Note &Note, + unsigned ELFType) { + uint32_t Type = Note.getType(); + auto FindNote = [&](ArrayRef V) -> StringRef { + for (const NoteType &N : V) + if (N.ID == Type) + return N.Name; + return ""; + }; + + StringRef Name = Note.getName(); + if (Name == "GNU") + return FindNote(GNUNoteTypes); + if (Name == "FreeBSD") + return FindNote(FreeBSDNoteTypes); + if (Name == "AMD") + return FindNote(AMDNoteTypes); + if (Name == "AMDGPU") + return FindNote(AMDGPUNoteTypes); + + if (ELFType == ELF::ET_CORE) + return FindNote(CoreNoteTypes); + return FindNote(GenericNoteTypes); +} + template void GNUStyle::printNotes() { auto PrintHeader = [&](Optional SecName, const typename ELFT::Off Offset, @@ -5314,23 +5282,13 @@ template void GNUStyle::printNotes() { // Print the note owner/type. OS << " " << left_justify(Name, 20) << ' ' << format_hex(Descriptor.size(), 10) << '\t'; - if (Name == "GNU") { - OS << getGNUNoteTypeName(Type) << '\n'; - } else if (Name == "FreeBSD") { - OS << getFreeBSDNoteTypeName(Type) << '\n'; - } else if (Name == "AMD") { - OS << getAMDNoteTypeName(Type) << '\n'; - } else if (Name == "AMDGPU") { - OS << getAMDGPUNoteTypeName(Type) << '\n'; - } else { - StringRef NoteType = this->Obj.getHeader()->e_type == ELF::ET_CORE - ? getCoreNoteTypeName(Type) - : getGenericNoteTypeName(Type); - if (!NoteType.empty()) - OS << NoteType << '\n'; - else - OS << "Unknown note type: (" << format_hex(Type, 10) << ")\n"; - } + + StringRef NoteType = + getNoteTypeName(Note, this->Obj.getHeader().e_type); + if (!NoteType.empty()) + OS << NoteType << '\n'; + else + OS << "Unknown note type: (" << format_hex(Type, 10) << ")\n"; // Print the description, or fallback to printing raw bytes for unknown // owners. 
@@ -5364,11 +5322,11 @@ template void GNUStyle::printNotes() { }; ArrayRef Sections = cantFail(this->Obj.sections()); - if (this->Obj.getHeader()->e_type != ELF::ET_CORE && !Sections.empty()) { - for (const auto &S : Sections) { + if (this->Obj.getHeader().e_type != ELF::ET_CORE && !Sections.empty()) { + for (const Elf_Shdr &S : Sections) { if (S.sh_type != SHT_NOTE) continue; - PrintHeader(expectedToOptional(this->Obj.getSectionName(&S)), S.sh_offset, + PrintHeader(expectedToOptional(this->Obj.getSectionName(S)), S.sh_offset, S.sh_size); Error Err = Error::success(); for (auto Note : this->Obj.notes(S, Err)) @@ -5420,7 +5378,7 @@ void DumpStyle::printDependentLibsHelper( OnSectionStart(Shdr); - Expected> ContentsOrErr = Obj.getSectionContents(&Shdr); + Expected> ContentsOrErr = Obj.getSectionContents(Shdr); if (!ContentsOrErr) { Warn(I, toString(ContentsOrErr.takeError())); continue; @@ -5465,7 +5423,7 @@ void DumpStyle::printRelocationsHelper(const Elf_Shdr &Sec) { const bool IsMips64EL = this->Obj.isMips64EL(); switch (Sec.sh_type) { case ELF::SHT_REL: - if (Expected RangeOrErr = Obj.rels(&Sec)) { + if (Expected RangeOrErr = Obj.rels(Sec)) { for (const Elf_Rel &R : *RangeOrErr) printReloc(Relocation(R, IsMips64EL), ++RelNdx, Sec, SymTab); } else { @@ -5473,7 +5431,7 @@ void DumpStyle::printRelocationsHelper(const Elf_Shdr &Sec) { } break; case ELF::SHT_RELA: - if (Expected RangeOrErr = Obj.relas(&Sec)) { + if (Expected RangeOrErr = Obj.relas(Sec)) { for (const Elf_Rela &R : *RangeOrErr) printReloc(Relocation(R, IsMips64EL), ++RelNdx, Sec, SymTab); } else { @@ -5482,7 +5440,7 @@ void DumpStyle::printRelocationsHelper(const Elf_Shdr &Sec) { break; case ELF::SHT_RELR: case ELF::SHT_ANDROID_RELR: { - Expected RangeOrErr = Obj.relrs(&Sec); + Expected RangeOrErr = Obj.relrs(Sec); if (!RangeOrErr) { Warn(RangeOrErr.takeError()); break; @@ -5500,7 +5458,7 @@ void DumpStyle::printRelocationsHelper(const Elf_Shdr &Sec) { } case ELF::SHT_ANDROID_REL: case ELF::SHT_ANDROID_RELA: - if (Expected> RelasOrErr = Obj.android_relas(&Sec)) { + if (Expected> RelasOrErr = Obj.android_relas(Sec)) { for (const Elf_Rela &R : *RelasOrErr) printReloc(Relocation(R, IsMips64EL), ++RelNdx, Sec, SymTab); } else { @@ -5514,7 +5472,7 @@ template StringRef DumpStyle::getPrintableSectionName(const Elf_Shdr &Sec) const { StringRef Name = ""; if (Expected SecNameOrErr = - Obj.getSectionName(&Sec, this->dumper()->WarningHandler)) + Obj.getSectionName(Sec, this->dumper()->WarningHandler)) Name = *SecNameOrErr; else this->reportUniqueWarning(createError("unable to get the name of " + @@ -5558,16 +5516,6 @@ template void GNUStyle::printDependentLibs() { PrintSection(); } -// Used for printing section names in places where possible errors can be -// ignored. -static StringRef getSectionName(const SectionRef &Sec) { - Expected NameOrErr = Sec.getName(); - if (NameOrErr) - return *NameOrErr; - consumeError(NameOrErr.takeError()); - return ""; -} - // Used for printing symbol names in places where possible errors can be // ignored. 
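// (The idiom used by these "best effort" helpers, sketched under the
// assumption of an Expected<T>-returning accessor; the error must be
// consumed, or the Expected destructor asserts in builds with assertions:
//
//   Expected<StringRef> NameOrErr = Sym.getName();
//   if (!NameOrErr) {
//     consumeError(NameOrErr.takeError()); // Swallow the error deliberately.
//     return "<?>";
//   }
//   return std::string(*NameOrErr);
//
// The removed getSectionName above followed the same pattern; section-name
// callers now go through getPrintableSectionName, which reports a unique
// warning instead of silently consuming the error.)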
static std::string getSymbolName(const ELFSymbolRef &Sym) { @@ -5579,16 +5527,13 @@ static std::string getSymbolName(const ELFSymbolRef &Sym) { } template -void DumpStyle::printFunctionStackSize(const ELFObjectFile *Obj, - uint64_t SymValue, - Optional FunctionSec, - const Elf_Shdr &StackSizeSec, - DataExtractor Data, - uint64_t *Offset) { +void DumpStyle::printFunctionStackSize( + uint64_t SymValue, Optional FunctionSec, + const Elf_Shdr &StackSizeSec, DataExtractor Data, uint64_t *Offset) { // This function ignores potentially erroneous input, unless it is directly // related to stack size reporting. SymbolRef FuncSym; - for (const ELFSymbolRef &Symbol : Obj->symbols()) { + for (const ELFSymbolRef &Symbol : ElfObj.symbols()) { Expected SymAddrOrErr = Symbol.getAddress(); if (!SymAddrOrErr) { consumeError(SymAddrOrErr.takeError()); @@ -5602,7 +5547,8 @@ void DumpStyle::printFunctionStackSize(const ELFObjectFile *Obj, if (Symbol.getELFType() == ELF::STT_FUNC && *SymAddrOrErr == SymValue) { // Check if the symbol is in the right section. FunctionSec == None means // "any section". - if (!FunctionSec || FunctionSec->containsSymbol(Symbol)) { + if (!FunctionSec || + ElfObj.toSectionRef(*FunctionSec).containsSymbol(Symbol)) { FuncSym = Symbol; break; } @@ -5616,7 +5562,7 @@ void DumpStyle::printFunctionStackSize(const ELFObjectFile *Obj, else reportWarning( createError("could not identify function symbol for stack size entry"), - Obj->getFileName()); + FileName); // Extract the size. The expectation is that Offset is pointing to the right // place, i.e. past the function address. @@ -5625,11 +5571,10 @@ void DumpStyle::printFunctionStackSize(const ELFObjectFile *Obj, // getULEB128() does not advance Offset if it is not able to extract a valid // integer. if (*Offset == PrevOffset) { - reportWarning( - createStringError(object_error::parse_failed, - "could not extract a valid stack size in " + - describe(*Obj->getELFFile(), StackSizeSec)), - Obj->getFileName()); + reportWarning(createStringError(object_error::parse_failed, + "could not extract a valid stack size in " + + describe(Obj, StackSizeSec)), + FileName); return; } @@ -5645,9 +5590,8 @@ void GNUStyle::printStackSizeEntry(uint64_t Size, StringRef FuncName) { } template -void DumpStyle::printStackSize(const ELFObjectFile *Obj, - RelocationRef Reloc, - SectionRef FunctionSec, +void DumpStyle::printStackSize(RelocationRef Reloc, + const Elf_Shdr *FunctionSec, const Elf_Shdr &StackSizeSec, const RelocationResolver &Resolver, DataExtractor Data) { @@ -5655,8 +5599,7 @@ void DumpStyle::printStackSize(const ELFObjectFile *Obj, // related to stack size reporting. object::symbol_iterator RelocSym = Reloc.getSymbol(); uint64_t RelocSymValue = 0; - StringRef FileStr = Obj->getFileName(); - if (RelocSym != Obj->symbol_end()) { + if (RelocSym != ElfObj.symbol_end()) { // Ensure that the relocation symbol is in the function section, i.e. the // section where the functions whose stack sizes we are reporting are // located. 
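// (For context on the code in the next hunk: once the relocation symbol's
// value is known, the stack-size entry's function address is computed as
//
//   uint64_t SymValue = Resolver(Reloc, RelocSymValue, Addend);
//
// where Resolver comes from getRelocationResolver(); for a simple absolute
// relocation such as R_X86_64_64 this evaluates to S + A.)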
@@ -5665,16 +5608,16 @@ void DumpStyle::printStackSize(const ELFObjectFile *Obj, reportWarning( createError("cannot identify the section for relocation symbol '" + getSymbolName(*RelocSym) + "'"), - FileStr); + FileName); consumeError(SectionOrErr.takeError()); - } else if (*SectionOrErr != FunctionSec) { + } else if (*SectionOrErr != ElfObj.toSectionRef(FunctionSec)) { reportWarning(createError("relocation symbol '" + getSymbolName(*RelocSym) + "' is not in the expected section"), - FileStr); + FileName); // Pretend that the symbol is in the correct section and report its // stack size anyway. - FunctionSec = **SectionOrErr; + FunctionSec = ElfObj.getSection((*SectionOrErr)->getRawDataRefImpl()); } Expected RelocSymValueOrErr = RelocSym->getValue(); @@ -5689,31 +5632,29 @@ void DumpStyle::printStackSize(const ELFObjectFile *Obj, reportUniqueWarning(createStringError( object_error::parse_failed, "found invalid relocation offset (0x" + Twine::utohexstr(Offset) + - ") into " + describe(*Obj->getELFFile(), StackSizeSec) + + ") into " + describe(Obj, StackSizeSec) + " while trying to extract a stack size entry")); return; } uint64_t Addend = Data.getAddress(&Offset); uint64_t SymValue = Resolver(Reloc, RelocSymValue, Addend); - this->printFunctionStackSize(Obj, SymValue, FunctionSec, StackSizeSec, Data, + this->printFunctionStackSize(SymValue, FunctionSec, StackSizeSec, Data, &Offset); } template void DumpStyle::printNonRelocatableStackSizes( - const ELFObjectFile *Obj, std::function PrintHeader) { + std::function PrintHeader) { // This function ignores potentially erroneous input, unless it is directly // related to stack size reporting. - const ELFFile *EF = Obj->getELFFile(); - for (const SectionRef &Sec : Obj->sections()) { - if (getSectionName(Sec) != ".stack_sizes") + for (const Elf_Shdr &Sec : cantFail(Obj.sections())) { + if (this->getPrintableSectionName(Sec) != ".stack_sizes") continue; PrintHeader(); - const Elf_Shdr *ElfSec = Obj->getSection(Sec.getRawDataRefImpl()); ArrayRef Contents = - unwrapOrError(this->FileName, EF->getSectionContents(ElfSec)); - DataExtractor Data(Contents, Obj->isLittleEndian(), sizeof(Elf_Addr)); + unwrapOrError(this->FileName, Obj.getSectionContents(Sec)); + DataExtractor Data(Contents, Obj.isLE(), sizeof(Elf_Addr)); uint64_t Offset = 0; while (Offset < Contents.size()) { // The function address is followed by a ULEB representing the stack @@ -5721,12 +5662,12 @@ void DumpStyle::printNonRelocatableStackSizes( if (!Data.isValidOffsetForDataOfSize(Offset, sizeof(Elf_Addr) + 1)) { reportUniqueWarning(createStringError( object_error::parse_failed, - describe(*EF, *ElfSec) + + describe(Obj, Sec) + " ended while trying to extract a stack size entry")); break; } uint64_t SymValue = Data.getAddress(&Offset); - printFunctionStackSize(Obj, SymValue, /*FunctionSec=*/None, *ElfSec, Data, + printFunctionStackSize(SymValue, /*FunctionSec=*/None, Sec, Data, &Offset); } } @@ -5734,17 +5675,13 @@ void DumpStyle::printNonRelocatableStackSizes( template void DumpStyle::printRelocatableStackSizes( - const ELFObjectFile *Obj, std::function PrintHeader) { - const ELFFile *EF = Obj->getELFFile(); - + std::function PrintHeader) { // Build a map between stack size sections and their corresponding relocation // sections. 
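// (Illustrative layout for the mapping built here, relying on the ELF rule
// that a SHT_REL/SHT_RELA section's sh_info holds the index of the section
// it relocates:
//
//   [4] .stack_sizes       PROGBITS
//   [5] .rela.stack_sizes  RELA   ... Info: 4
//
// yields StackSizeRelocMap = { Shdr[4] -> Shdr[5] }. A .stack_sizes section
// whose mapped value is still null at the end has no relocation section and
// is diagnosed below; section indices here are hypothetical.)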
- llvm::MapVector StackSizeRelocMap; - const SectionRef NullSection{}; - - for (const SectionRef &Sec : Obj->sections()) { + llvm::MapVector StackSizeRelocMap; + for (const Elf_Shdr &Sec : cantFail(Obj.sections())) { StringRef SectionName; - if (Expected NameOrErr = Sec.getName()) + if (Expected NameOrErr = Obj.getSectionName(Sec)) SectionName = *NameOrErr; else consumeError(NameOrErr.takeError()); @@ -5752,92 +5689,80 @@ void DumpStyle::printRelocatableStackSizes( // A stack size section that we haven't encountered yet is mapped to the // null section until we find its corresponding relocation section. if (SectionName == ".stack_sizes") - if (StackSizeRelocMap.count(Sec) == 0) { - StackSizeRelocMap[Sec] = NullSection; + if (StackSizeRelocMap + .insert(std::make_pair(&Sec, (const Elf_Shdr *)nullptr)) + .second) continue; - } // Check relocation sections if they are relocating contents of a // stack sizes section. - const Elf_Shdr *ElfSec = Obj->getSection(Sec.getRawDataRefImpl()); - uint32_t SectionType = ElfSec->sh_type; - if (SectionType != ELF::SHT_RELA && SectionType != ELF::SHT_REL) + if (Sec.sh_type != ELF::SHT_RELA && Sec.sh_type != ELF::SHT_REL) continue; - Expected RelSecOrErr = Sec.getRelocatedSection(); + Expected RelSecOrErr = Obj.getSection(Sec.sh_info); if (!RelSecOrErr) { - reportUniqueWarning( - createStringError(object_error::parse_failed, - describe(*Obj->getELFFile(), *ElfSec) + - ": failed to get a relocated section: " + - toString(RelSecOrErr.takeError()))); + reportUniqueWarning(createStringError( + object_error::parse_failed, + describe(Obj, Sec) + ": failed to get a relocated section: " + + toString(RelSecOrErr.takeError()))); continue; } - const Elf_Shdr *ContentsSec = - Obj->getSection((*RelSecOrErr)->getRawDataRefImpl()); - Expected ContentsSectionNameOrErr = - EF->getSectionName(ContentsSec); - if (!ContentsSectionNameOrErr) { - consumeError(ContentsSectionNameOrErr.takeError()); - continue; - } - if (*ContentsSectionNameOrErr != ".stack_sizes") + const Elf_Shdr *ContentsSec = *RelSecOrErr; + if (this->getPrintableSectionName(**RelSecOrErr) != ".stack_sizes") continue; + // Insert a mapping from the stack sizes section to its relocation section. - StackSizeRelocMap[Obj->toSectionRef(ContentsSec)] = Sec; + StackSizeRelocMap[ContentsSec] = &Sec; } for (const auto &StackSizeMapEntry : StackSizeRelocMap) { PrintHeader(); - const SectionRef &StackSizesSec = StackSizeMapEntry.first; - const SectionRef &RelocSec = StackSizeMapEntry.second; - const Elf_Shdr *StackSizesELFSec = - Obj->getSection(StackSizesSec.getRawDataRefImpl()); + const Elf_Shdr *StackSizesELFSec = StackSizeMapEntry.first; + const Elf_Shdr *RelocSec = StackSizeMapEntry.second; // Warn about stack size sections without a relocation section. - if (RelocSec == NullSection) { - reportWarning( - createError(".stack_sizes (" + - describe(*Obj->getELFFile(), *StackSizesELFSec) + - ") does not have a corresponding " - "relocation section"), - Obj->getFileName()); + if (!RelocSec) { + reportWarning(createError(".stack_sizes (" + + describe(Obj, *StackSizesELFSec) + + ") does not have a corresponding " + "relocation section"), + FileName); continue; } // A .stack_sizes section header's sh_link field is supposed to point // to the section that contains the functions whose stack sizes are // described in it. 
- const SectionRef FunctionSec = Obj->toSectionRef(unwrapOrError( - this->FileName, EF->getSection(StackSizesELFSec->sh_link))); - + const Elf_Shdr *FunctionSec = unwrapOrError( + this->FileName, Obj.getSection(StackSizesELFSec->sh_link)); bool (*IsSupportedFn)(uint64_t); RelocationResolver Resolver; - std::tie(IsSupportedFn, Resolver) = getRelocationResolver(*Obj); - auto Contents = unwrapOrError(this->FileName, StackSizesSec.getContents()); - DataExtractor Data(Contents, Obj->isLittleEndian(), sizeof(Elf_Addr)); + std::tie(IsSupportedFn, Resolver) = getRelocationResolver(ElfObj); + ArrayRef Contents = + unwrapOrError(this->FileName, Obj.getSectionContents(*StackSizesELFSec)); + DataExtractor Data(Contents, Obj.isLE(), sizeof(Elf_Addr)); + size_t I = 0; - for (const RelocationRef &Reloc : RelocSec.relocations()) { + for (const RelocationRef &Reloc : + ElfObj.toSectionRef(RelocSec).relocations()) { ++I; if (!IsSupportedFn || !IsSupportedFn(Reloc.getType())) { - const Elf_Shdr *RelocSecShdr = - Obj->getSection(RelocSec.getRawDataRefImpl()); reportUniqueWarning(createStringError( object_error::parse_failed, - describe(*EF, *RelocSecShdr) + + describe(Obj, *RelocSec) + " contains an unsupported relocation with index " + Twine(I) + - ": " + EF->getRelocationTypeName(Reloc.getType()))); + ": " + Obj.getRelocationTypeName(Reloc.getType()))); continue; } - this->printStackSize(Obj, Reloc, FunctionSec, *StackSizesELFSec, Resolver, + this->printStackSize(Reloc, FunctionSec, *StackSizesELFSec, Resolver, Data); } } } template -void GNUStyle::printStackSizes(const ELFObjectFile *Obj) { +void GNUStyle::printStackSizes() { bool HeaderHasBeenPrinted = false; auto PrintHeader = [&]() { if (HeaderHasBeenPrinted) @@ -5852,10 +5777,10 @@ void GNUStyle::printStackSizes(const ELFObjectFile *Obj) { // For non-relocatable objects, look directly for sections whose name starts // with .stack_sizes and process the contents. 
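// (The .stack_sizes payload is a series of (address, ULEB128 size) records.
// A minimal standalone decode, assuming 8-byte little-endian addresses;
// Contents and the reporting step are placeholders:
//
//   DataExtractor Data(Contents, /*IsLittleEndian=*/true, /*AddressSize=*/8);
//   uint64_t Offset = 0;
//   while (Offset < Contents.size()) {
//     uint64_t FuncAddr = Data.getAddress(&Offset);  // fixed-width address
//     uint64_t StackSize = Data.getULEB128(&Offset); // variable-width size
//     // report {FuncAddr, StackSize}
//   }
//
// This is what printNonRelocatableStackSizes drives via
// printFunctionStackSize.)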
- if (Obj->isRelocatableObject()) - this->printRelocatableStackSizes(Obj, PrintHeader); + if (this->Obj.getHeader().e_type == ELF::ET_REL) + this->printRelocatableStackSizes(PrintHeader); else - this->printNonRelocatableStackSizes(Obj, PrintHeader); + this->printNonRelocatableStackSizes(PrintHeader); } template @@ -5989,7 +5914,7 @@ getMipsAbiFlagsSection(const ELFObjectFile *ObjF, const ELFFile *Obj = ObjF->getELFFile(); constexpr StringRef ErrPrefix = "unable to read the .MIPS.abiflags section: "; - Expected> DataOrErr = Obj->getSectionContents(Sec); + Expected> DataOrErr = Obj->getSectionContents(*Sec); if (!DataOrErr) return createError(ErrPrefix + toString(DataOrErr.takeError())); @@ -6034,21 +5959,21 @@ void GNUStyle::printMipsABIFlags(const ELFObjectFile *ObjF) { } template void LLVMStyle::printFileHeaders() { - const Elf_Ehdr *E = this->Obj.getHeader(); + const Elf_Ehdr &E = this->Obj.getHeader(); { DictScope D(W, "ElfHeader"); { DictScope D(W, "Ident"); - W.printBinary("Magic", makeArrayRef(E->e_ident).slice(ELF::EI_MAG0, 4)); - W.printEnum("Class", E->e_ident[ELF::EI_CLASS], makeArrayRef(ElfClass)); - W.printEnum("DataEncoding", E->e_ident[ELF::EI_DATA], + W.printBinary("Magic", makeArrayRef(E.e_ident).slice(ELF::EI_MAG0, 4)); + W.printEnum("Class", E.e_ident[ELF::EI_CLASS], makeArrayRef(ElfClass)); + W.printEnum("DataEncoding", E.e_ident[ELF::EI_DATA], makeArrayRef(ElfDataEncoding)); - W.printNumber("FileVersion", E->e_ident[ELF::EI_VERSION]); + W.printNumber("FileVersion", E.e_ident[ELF::EI_VERSION]); auto OSABI = makeArrayRef(ElfOSABI); - if (E->e_ident[ELF::EI_OSABI] >= ELF::ELFOSABI_FIRST_ARCH && - E->e_ident[ELF::EI_OSABI] <= ELF::ELFOSABI_LAST_ARCH) { - switch (E->e_machine) { + if (E.e_ident[ELF::EI_OSABI] >= ELF::ELFOSABI_FIRST_ARCH && + E.e_ident[ELF::EI_OSABI] <= ELF::ELFOSABI_LAST_ARCH) { + switch (E.e_machine) { case ELF::EM_AMDGPU: OSABI = makeArrayRef(AMDGPUElfOSABI); break; @@ -6060,32 +5985,32 @@ template void LLVMStyle::printFileHeaders() { break; } } - W.printEnum("OS/ABI", E->e_ident[ELF::EI_OSABI], OSABI); - W.printNumber("ABIVersion", E->e_ident[ELF::EI_ABIVERSION]); - W.printBinary("Unused", makeArrayRef(E->e_ident).slice(ELF::EI_PAD)); + W.printEnum("OS/ABI", E.e_ident[ELF::EI_OSABI], OSABI); + W.printNumber("ABIVersion", E.e_ident[ELF::EI_ABIVERSION]); + W.printBinary("Unused", makeArrayRef(E.e_ident).slice(ELF::EI_PAD)); } - W.printEnum("Type", E->e_type, makeArrayRef(ElfObjectFileType)); - W.printEnum("Machine", E->e_machine, makeArrayRef(ElfMachineType)); - W.printNumber("Version", E->e_version); - W.printHex("Entry", E->e_entry); - W.printHex("ProgramHeaderOffset", E->e_phoff); - W.printHex("SectionHeaderOffset", E->e_shoff); - if (E->e_machine == EM_MIPS) - W.printFlags("Flags", E->e_flags, makeArrayRef(ElfHeaderMipsFlags), + W.printEnum("Type", E.e_type, makeArrayRef(ElfObjectFileType)); + W.printEnum("Machine", E.e_machine, makeArrayRef(ElfMachineType)); + W.printNumber("Version", E.e_version); + W.printHex("Entry", E.e_entry); + W.printHex("ProgramHeaderOffset", E.e_phoff); + W.printHex("SectionHeaderOffset", E.e_shoff); + if (E.e_machine == EM_MIPS) + W.printFlags("Flags", E.e_flags, makeArrayRef(ElfHeaderMipsFlags), unsigned(ELF::EF_MIPS_ARCH), unsigned(ELF::EF_MIPS_ABI), unsigned(ELF::EF_MIPS_MACH)); - else if (E->e_machine == EM_AMDGPU) - W.printFlags("Flags", E->e_flags, makeArrayRef(ElfHeaderAMDGPUFlags), + else if (E.e_machine == EM_AMDGPU) + W.printFlags("Flags", E.e_flags, makeArrayRef(ElfHeaderAMDGPUFlags), 
unsigned(ELF::EF_AMDGPU_MACH)); - else if (E->e_machine == EM_RISCV) - W.printFlags("Flags", E->e_flags, makeArrayRef(ElfHeaderRISCVFlags)); + else if (E.e_machine == EM_RISCV) + W.printFlags("Flags", E.e_flags, makeArrayRef(ElfHeaderRISCVFlags)); else - W.printFlags("Flags", E->e_flags); - W.printNumber("HeaderSize", E->e_ehsize); - W.printNumber("ProgramHeaderEntrySize", E->e_phentsize); - W.printNumber("ProgramHeaderCount", E->e_phnum); - W.printNumber("SectionHeaderEntrySize", E->e_shentsize); + W.printFlags("Flags", E.e_flags); + W.printNumber("HeaderSize", E.e_ehsize); + W.printNumber("ProgramHeaderEntrySize", E.e_phentsize); + W.printNumber("ProgramHeaderCount", E.e_phnum); + W.printNumber("SectionHeaderEntrySize", E.e_shentsize); W.printString("SectionHeaderCount", getSectionHeadersNumString(this->Obj, this->FileName)); W.printString("StringTableSectionIndex", @@ -6186,13 +6111,13 @@ template void LLVMStyle::printSectionHeaders() { int SectionIndex = -1; std::vector> FlagsList = - getSectionFlagsForTarget(this->Obj.getHeader()->e_machine); + getSectionFlagsForTarget(this->Obj.getHeader().e_machine); for (const Elf_Shdr &Sec : cantFail(this->Obj.sections())) { DictScope SectionD(W, "Section"); W.printNumber("Index", ++SectionIndex); W.printNumber("Name", this->getPrintableSectionName(Sec), Sec.sh_name); W.printHex("Type", - object::getELFSectionTypeName(this->Obj.getHeader()->e_machine, + object::getELFSectionTypeName(this->Obj.getHeader().e_machine, Sec.sh_type), Sec.sh_type); W.printFlags("Flags", Sec.sh_flags, makeArrayRef(FlagsList)); @@ -6220,7 +6145,7 @@ template void LLVMStyle::printSectionHeaders() { const Elf_Shdr *SymSec = unwrapOrError(this->FileName, this->Obj.getSection( - &Sym, Symtab, this->dumper()->getShndxTable())); + Sym, Symtab, this->dumper()->getShndxTable())); if (SymSec == &Sec) printSymbol(&Sym, unwrapOrError(this->FileName, this->Obj.symbols(Symtab)) @@ -6232,7 +6157,7 @@ template void LLVMStyle::printSectionHeaders() { if (opts::SectionData && Sec.sh_type != ELF::SHT_NOBITS) { ArrayRef Data = - unwrapOrError(this->FileName, this->Obj.getSectionContents(&Sec)); + unwrapOrError(this->FileName, this->Obj.getSectionContents(Sec)); W.printBinaryBlock( "SectionData", StringRef(reinterpret_cast(Data.data()), Data.size())); @@ -6282,7 +6207,7 @@ void LLVMStyle::printSymbol(const Elf_Sym *Symbol, const Elf_Sym *First, W.printHex("Value", Symbol->st_value); W.printNumber("Size", Symbol->st_size); W.printEnum("Binding", Symbol->getBinding(), makeArrayRef(ElfSymbolBindings)); - if (this->Obj.getHeader()->e_machine == ELF::EM_AMDGPU && + if (this->Obj.getHeader().e_machine == ELF::EM_AMDGPU && SymbolType >= ELF::STT_LOOS && SymbolType < ELF::STT_HIOS) W.printEnum("Type", SymbolType, makeArrayRef(AMDGPUSymbolTypes)); else @@ -6294,7 +6219,7 @@ void LLVMStyle::printSymbol(const Elf_Sym *Symbol, const Elf_Sym *First, else { std::vector> SymOtherFlags(std::begin(ElfSymOtherFlags), std::end(ElfSymOtherFlags)); - if (this->Obj.getHeader()->e_machine == EM_MIPS) { + if (this->Obj.getHeader().e_machine == EM_MIPS) { // Someones in their infinite wisdom decided to make STO_MIPS_MIPS16 // flag overlapped with other ST_MIPS_xxx flags. So consider both // cases separately. 
@@ -6395,7 +6320,7 @@ template void LLVMStyle::printProgramHeaders() { for (const Elf_Phdr &Phdr : *PhdrsOrErr) { DictScope P(W, "ProgramHeader"); StringRef Type = - segmentTypeToString(this->Obj.getHeader()->e_machine, Phdr.p_type); + segmentTypeToString(this->Obj.getHeader().e_machine, Phdr.p_type); W.printHex("Type", Type.empty() ? "Unknown" : Type, Phdr.p_type); W.printHex("Offset", Phdr.p_offset); @@ -6505,7 +6430,7 @@ template void LLVMStyle::printCGProfile() { Expected> CGProfileOrErr = this->Obj.template getSectionContentsAsArray( - this->dumper()->getDotCGProfileSec()); + *this->dumper()->getDotCGProfileSec()); if (!CGProfileOrErr) { this->reportUniqueWarning( createError("unable to dump the SHT_LLVM_CALL_GRAPH_PROFILE section: " + @@ -6540,26 +6465,27 @@ static Expected> toULEB128Array(ArrayRef Data) { template void LLVMStyle::printAddrsig() { ListScope L(W, "Addrsig"); - if (!this->dumper()->getDotAddrsigSec()) + const Elf_Shdr *Sec = this->dumper()->getDotAddrsigSec(); + if (!Sec) return; - ArrayRef Contents = unwrapOrError( - this->FileName, - this->Obj.getSectionContents(this->dumper()->getDotAddrsigSec())); - Expected> V = toULEB128Array(Contents); - if (!V) { - reportWarning(V.takeError(), this->FileName); + + Expected> ContentsOrErr = + this->Obj.getSectionContents(*Sec); + if (!ContentsOrErr) { + this->reportUniqueWarning(ContentsOrErr.takeError()); return; } - for (uint64_t Sym : *V) { - Expected NameOrErr = this->dumper()->getStaticSymbolName(Sym); - if (NameOrErr) { - W.printNumber("Sym", *NameOrErr, Sym); - continue; - } - reportWarning(NameOrErr.takeError(), this->FileName); - W.printNumber("Sym", "", Sym); + Expected> SymsOrErr = toULEB128Array(*ContentsOrErr); + if (!SymsOrErr) { + this->reportUniqueWarning(createError("unable to decode " + + describe(this->Obj, *Sec) + ": " + + toString(SymsOrErr.takeError()))); + return; } + + for (uint64_t Sym : *SymsOrErr) + W.printNumber("Sym", this->dumper()->getStaticSymbolName(Sym), Sym); } template @@ -6624,24 +6550,14 @@ template void LLVMStyle::printNotes() { // Print the note owner/type. W.printString("Owner", Name); W.printHex("Data size", Descriptor.size()); - if (Name == "GNU") { - W.printString("Type", getGNUNoteTypeName(Type)); - } else if (Name == "FreeBSD") { - W.printString("Type", getFreeBSDNoteTypeName(Type)); - } else if (Name == "AMD") { - W.printString("Type", getAMDNoteTypeName(Type)); - } else if (Name == "AMDGPU") { - W.printString("Type", getAMDGPUNoteTypeName(Type)); - } else { - StringRef NoteType = this->Obj.getHeader()->e_type == ELF::ET_CORE - ? getCoreNoteTypeName(Type) - : getGenericNoteTypeName(Type); - if (!NoteType.empty()) - W.printString("Type", NoteType); - else - W.printString("Type", - "Unknown (" + to_string(format_hex(Type, 10)) + ")"); - } + + StringRef NoteType = + getNoteTypeName(Note, this->Obj.getHeader().e_type); + if (!NoteType.empty()) + W.printString("Type", NoteType); + else + W.printString("Type", + "Unknown (" + to_string(format_hex(Type, 10)) + ")"); // Print the description, or fallback to printing raw bytes for unknown // owners. 
@@ -6672,12 +6588,12 @@ template void LLVMStyle::printNotes() { }; ArrayRef Sections = cantFail(this->Obj.sections()); - if (this->Obj.getHeader()->e_type != ELF::ET_CORE && !Sections.empty()) { - for (const auto &S : Sections) { + if (this->Obj.getHeader().e_type != ELF::ET_CORE && !Sections.empty()) { + for (const Elf_Shdr &S : Sections) { if (S.sh_type != SHT_NOTE) continue; DictScope D(W, "NoteSection"); - PrintHeader(expectedToOptional(this->Obj.getSectionName(&S)), S.sh_offset, + PrintHeader(expectedToOptional(this->Obj.getSectionName(S)), S.sh_offset, S.sh_size); Error Err = Error::success(); for (auto Note : this->Obj.notes(S, Err)) @@ -6718,7 +6634,7 @@ template void LLVMStyle::printELFLinkerOptions() { continue; Expected> ContentsOrErr = - this->Obj.getSectionContents(&Shdr); + this->Obj.getSectionContents(Shdr); if (!ContentsOrErr) { this->reportUniqueWarning( createError("unable to read the content of the " @@ -6761,12 +6677,12 @@ template void LLVMStyle::printDependentLibs() { } template -void LLVMStyle::printStackSizes(const ELFObjectFile *Obj) { +void LLVMStyle::printStackSizes() { ListScope L(W, "StackSizes"); - if (Obj->isRelocatableObject()) - this->printRelocatableStackSizes(Obj, []() {}); + if (this->Obj.getHeader().e_type == ELF::ET_REL) + this->printRelocatableStackSizes([]() {}); else - this->printNonRelocatableStackSizes(Obj, []() {}); + this->printNonRelocatableStackSizes([]() {}); } template diff --git a/llvm/tools/obj2yaml/dwarf2yaml.cpp b/llvm/tools/obj2yaml/dwarf2yaml.cpp index 513fa0fdef01d..10e8ecaeec089 100644 --- a/llvm/tools/obj2yaml/dwarf2yaml.cpp +++ b/llvm/tools/obj2yaml/dwarf2yaml.cpp @@ -8,6 +8,7 @@ #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/DWARF/DWARFDebugAddr.h" #include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h" #include "llvm/DebugInfo/DWARF/DWARFDebugPubTable.h" #include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h" @@ -46,14 +47,52 @@ void dumpDebugAbbrev(DWARFContext &DCtx, DWARFYAML::Data &Y) { } } -void dumpDebugStrings(DWARFContext &DCtx, DWARFYAML::Data &Y) { - StringRef RemainingTable = DCtx.getDWARFObj().getStrSection(); - Y.DebugStrings.emplace(); - while (RemainingTable.size() > 0) { - auto SymbolPair = RemainingTable.split('\0'); - RemainingTable = SymbolPair.second; - Y.DebugStrings->push_back(SymbolPair.first); +Error dumpDebugAddr(DWARFContext &DCtx, DWARFYAML::Data &Y) { + DWARFDebugAddrTable AddrTable; + DWARFDataExtractor AddrData(DCtx.getDWARFObj(), + DCtx.getDWARFObj().getAddrSection(), + DCtx.isLittleEndian(), /*AddrSize=*/0); + std::vector AddrTables; + uint64_t Offset = 0; + while (AddrData.isValidOffset(Offset)) { + // We ignore any errors that don't prevent parsing the section, since we can + // still represent such sections. + if (Error Err = AddrTable.extractV5(AddrData, &Offset, /*CUAddrSize=*/0, + consumeError)) + return Err; + AddrTables.emplace_back(); + for (uint64_t Addr : AddrTable.getAddressEntries()) { + // Currently, the parser doesn't support parsing an address table with non + // linear addresses (segment_selector_size != 0). The segment selectors + // are specified to be zero. 
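// (For reference, the DWARF v5 .debug_addr table parsed here is laid out as
//
//   unit_length (4 or 12 bytes) | version (2) | address_size (1) |
//   segment_selector_size (1)   | addresses[...]
//
// so a DWARF32 table holding two 8-byte addresses has
// unit_length = 2 + 1 + 1 + 2 * 8 = 20. These numbers are illustrative,
// not values taken from this patch.)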
+ AddrTables.back().SegAddrPairs.push_back( + {/*SegmentSelector=*/0, /*Address=*/Addr}); + } + + AddrTables.back().Format = AddrTable.getFormat(); + AddrTables.back().Length = AddrTable.getLength(); + AddrTables.back().Version = AddrTable.getVersion(); + AddrTables.back().AddrSize = AddrTable.getAddressSize(); + AddrTables.back().SegSelectorSize = AddrTable.getSegmentSelectorSize(); } + Y.DebugAddr = std::move(AddrTables); + return Error::success(); +} + +Error dumpDebugStrings(DWARFContext &DCtx, DWARFYAML::Data &Y) { + DataExtractor StrData = DCtx.getStringExtractor(); + uint64_t Offset = 0; + std::vector DebugStr; + Error Err = Error::success(); + while (StrData.isValidOffset(Offset)) { + const char *CStr = StrData.getCStr(&Offset, &Err); + if (Err) + return Err; + DebugStr.push_back(CStr); + } + + Y.DebugStrings = DebugStr; + return Err; } Error dumpDebugARanges(DWARFContext &DCtx, DWARFYAML::Data &Y) { @@ -108,6 +147,7 @@ Error dumpDebugRanges(DWARFContext &DCtx, DWARFYAML::Data &Y) { DCtx.isLittleEndian(), AddrSize); uint64_t Offset = 0; DWARFDebugRangeList DwarfRanges; + std::vector DebugRanges; while (Data.isValidOffset(Offset)) { DWARFYAML::Ranges YamlRanges; @@ -117,8 +157,10 @@ Error dumpDebugRanges(DWARFContext &DCtx, DWARFYAML::Data &Y) { return E; for (const auto &RLE : DwarfRanges.getEntries()) YamlRanges.Entries.push_back({RLE.StartAddress, RLE.EndAddress}); - Y.DebugRanges.push_back(std::move(YamlRanges)); + DebugRanges.push_back(std::move(YamlRanges)); } + + Y.DebugRanges = DebugRanges; return ErrorSuccess(); } diff --git a/llvm/tools/obj2yaml/elf2yaml.cpp b/llvm/tools/obj2yaml/elf2yaml.cpp index 9f524479bb04c..75f63795cb08b 100644 --- a/llvm/tools/obj2yaml/elf2yaml.cpp +++ b/llvm/tools/obj2yaml/elf2yaml.cpp @@ -124,7 +124,7 @@ ELFDumper::getUniquedSectionName(const Elf_Shdr *Sec) { if (!SectionNames[SecIndex].empty()) return SectionNames[SecIndex]; - auto NameOrErr = Obj.getSectionName(Sec); + auto NameOrErr = Obj.getSectionName(*Sec); if (!NameOrErr) return NameOrErr; StringRef Name = *NameOrErr; @@ -153,7 +153,7 @@ ELFDumper::getUniquedSymbolName(const Elf_Sym *Sym, StringRef StrTable, return SymbolNameOrErr; StringRef Name = *SymbolNameOrErr; if (Name.empty() && Sym->getType() == ELF::STT_SECTION) { - auto ShdrOrErr = Obj.getSection(Sym, SymTab, ShndxTable); + auto ShdrOrErr = Obj.getSection(*Sym, SymTab, ShndxTable); if (!ShdrOrErr) return ShdrOrErr.takeError(); return getUniquedSectionName(*ShdrOrErr); @@ -235,14 +235,15 @@ template Expected ELFDumper::dump() { // Dump header. We do not dump EPh* and ESh* fields. When not explicitly set, // the values are set by yaml2obj automatically and there is no need to dump // them here. 
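// (Related detail for the hunk below: "Machine" is now only emitted when
// e_machine is non-zero, on the assumption that yaml2obj defaults an absent
// Machine key to EM_NONE (0), so headers with e_machine == EM_NONE still
// round-trip. A hypothetical minimal YAML for such a file:
//
//   --- !ELF
//   FileHeader:
//     Class: ELFCLASS64
//     Data:  ELFDATA2LSB
//     Type:  ET_REL
// )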
- Y->Header.Class = ELFYAML::ELF_ELFCLASS(Obj.getHeader()->getFileClass()); - Y->Header.Data = ELFYAML::ELF_ELFDATA(Obj.getHeader()->getDataEncoding()); - Y->Header.OSABI = Obj.getHeader()->e_ident[ELF::EI_OSABI]; - Y->Header.ABIVersion = Obj.getHeader()->e_ident[ELF::EI_ABIVERSION]; - Y->Header.Type = Obj.getHeader()->e_type; - Y->Header.Machine = ELFYAML::ELF_EM(Obj.getHeader()->e_machine); - Y->Header.Flags = Obj.getHeader()->e_flags; - Y->Header.Entry = Obj.getHeader()->e_entry; + Y->Header.Class = ELFYAML::ELF_ELFCLASS(Obj.getHeader().getFileClass()); + Y->Header.Data = ELFYAML::ELF_ELFDATA(Obj.getHeader().getDataEncoding()); + Y->Header.OSABI = Obj.getHeader().e_ident[ELF::EI_OSABI]; + Y->Header.ABIVersion = Obj.getHeader().e_ident[ELF::EI_ABIVERSION]; + Y->Header.Type = Obj.getHeader().e_type; + if (Obj.getHeader().e_machine != 0) + Y->Header.Machine = ELFYAML::ELF_EM(Obj.getHeader().e_machine); + Y->Header.Flags = Obj.getHeader().e_flags; + Y->Header.Entry = Obj.getHeader().e_entry; // Dump sections auto SectionsOrErr = Obj.sections(); @@ -415,7 +416,13 @@ Optional ELFDumper::dumpDWARFSections( if (RawSec->Name == ".debug_aranges") Err = dumpDebugARanges(*DWARFCtx.get(), DWARF); else if (RawSec->Name == ".debug_str") - dumpDebugStrings(*DWARFCtx.get(), DWARF); + Err = dumpDebugStrings(*DWARFCtx.get(), DWARF); + else if (RawSec->Name == ".debug_ranges") + Err = dumpDebugRanges(*DWARFCtx.get(), DWARF); + else if (RawSec->Name == ".debug_addr") + Err = dumpDebugAddr(*DWARFCtx.get(), DWARF); + else + continue; // If the DWARF section cannot be successfully parsed, emit raw content // instead of an entry in the DWARF section of the YAML. @@ -516,7 +523,7 @@ ELFDumper::dumpSections() { // Recognize some special SHT_PROGBITS sections by name. if (Sec.sh_type == ELF::SHT_PROGBITS) { - auto NameOrErr = getUniquedSectionName(&Sec); + auto NameOrErr = Obj.getSectionName(Sec); if (!NameOrErr) return NameOrErr.takeError(); @@ -584,7 +591,7 @@ Error ELFDumper::dumpSymbol(const Elf_Sym *Sym, const Elf_Shdr *SymTab, return Error::success(); } - auto ShdrOrErr = Obj.getSection(Sym, SymTab, ShndxTable); + auto ShdrOrErr = Obj.getSection(*Sym, SymTab, ShndxTable); if (!ShdrOrErr) return ShdrOrErr.takeError(); const Elf_Shdr *Shdr = *ShdrOrErr; @@ -607,7 +614,7 @@ Error ELFDumper::dumpRelocation(const RelT *Rel, const Elf_Shdr *SymTab, R.Offset = Rel->r_offset; R.Addend = 0; - auto SymOrErr = Obj.getRelocationSymbol(Rel, SymTab); + auto SymOrErr = Obj.getRelocationSymbol(*Rel, SymTab); if (!SymOrErr) return SymOrErr.takeError(); @@ -620,7 +627,7 @@ Error ELFDumper::dumpRelocation(const RelT *Rel, const Elf_Shdr *SymTab, auto StrTabSec = Obj.getSection(SymTab->sh_link); if (!StrTabSec) return StrTabSec.takeError(); - auto StrTabOrErr = Obj.getStringTable(*StrTabSec); + auto StrTabOrErr = Obj.getStringTable(**StrTabSec); if (!StrTabOrErr) return StrTabOrErr.takeError(); @@ -721,7 +728,7 @@ ELFDumper::dumpStackSizesSection(const Elf_Shdr *Shdr) { if (Error E = dumpCommonSection(Shdr, *S)) return std::move(E); - auto ContentOrErr = Obj.getSectionContents(Shdr); + auto ContentOrErr = Obj.getSectionContents(*Shdr); if (!ContentOrErr) return ContentOrErr.takeError(); @@ -754,7 +761,7 @@ ELFDumper::dumpAddrsigSection(const Elf_Shdr *Shdr) { if (Error E = dumpCommonSection(Shdr, *S)) return std::move(E); - auto ContentOrErr = Obj.getSectionContents(Shdr); + auto ContentOrErr = Obj.getSectionContents(*Shdr); if (!ContentOrErr) return ContentOrErr.takeError(); @@ -795,7 +802,7 @@ 
ELFDumper::dumpLinkerOptionsSection(const Elf_Shdr *Shdr) { if (Error E = dumpCommonSection(Shdr, *S)) return std::move(E); - auto ContentOrErr = Obj.getSectionContents(Shdr); + auto ContentOrErr = Obj.getSectionContents(*Shdr); if (!ContentOrErr) return ContentOrErr.takeError(); @@ -826,7 +833,7 @@ ELFDumper::dumpDependentLibrariesSection(const Elf_Shdr *Shdr) { if (Error E = dumpCommonSection(Shdr, *DL)) return std::move(E); - Expected> ContentOrErr = Obj.getSectionContents(Shdr); + Expected> ContentOrErr = Obj.getSectionContents(*Shdr); if (!ContentOrErr) return ContentOrErr.takeError(); @@ -853,7 +860,7 @@ ELFDumper::dumpCallGraphProfileSection(const Elf_Shdr *Shdr) { if (Error E = dumpCommonSection(Shdr, *S)) return std::move(E); - Expected> ContentOrErr = Obj.getSectionContents(Shdr); + Expected> ContentOrErr = Obj.getSectionContents(*Shdr); if (!ContentOrErr) return ContentOrErr.takeError(); ArrayRef Content = *ContentOrErr; @@ -909,7 +916,7 @@ ELFDumper::dumpDynamicSection(const Elf_Shdr *Shdr) { if (Error E = dumpCommonSection(Shdr, *S)) return std::move(E); - auto DynTagsOrErr = Obj.template getSectionContentsAsArray(Shdr); + auto DynTagsOrErr = Obj.template getSectionContentsAsArray(*Shdr); if (!DynTagsOrErr) return DynTagsOrErr.takeError(); @@ -932,7 +939,7 @@ ELFDumper::dumpRelocSection(const Elf_Shdr *Shdr) { const Elf_Shdr *SymTab = *SymTabOrErr; if (Shdr->sh_type == ELF::SHT_REL) { - auto Rels = Obj.rels(Shdr); + auto Rels = Obj.rels(*Shdr); if (!Rels) return Rels.takeError(); for (const Elf_Rel &Rel : *Rels) { @@ -942,7 +949,7 @@ ELFDumper::dumpRelocSection(const Elf_Shdr *Shdr) { S->Relocations.push_back(R); } } else { - auto Rels = Obj.relas(Shdr); + auto Rels = Obj.relas(*Shdr); if (!Rels) return Rels.takeError(); for (const Elf_Rela &Rel : *Rels) { @@ -964,7 +971,7 @@ ELFDumper::dumpRelrSection(const Elf_Shdr *Shdr) { if (auto E = dumpCommonSection(Shdr, *S)) return std::move(E); - if (Expected> Relrs = Obj.relrs(Shdr)) { + if (Expected> Relrs = Obj.relrs(*Shdr)) { S->Entries.emplace(); for (Elf_Relr Rel : *Relrs) S->Entries->emplace_back(Rel); @@ -974,7 +981,7 @@ ELFDumper::dumpRelrSection(const Elf_Shdr *Shdr) { consumeError(Relrs.takeError()); } - Expected> ContentOrErr = Obj.getSectionContents(Shdr); + Expected> ContentOrErr = Obj.getSectionContents(*Shdr); if (!ContentOrErr) return ContentOrErr.takeError(); S->Content = *ContentOrErr; @@ -990,7 +997,7 @@ ELFDumper::dumpContentSection(const Elf_Shdr *Shdr) { unsigned SecIndex = Shdr - &Sections[0]; if (SecIndex != 0 || Shdr->sh_type != ELF::SHT_NULL) { - auto ContentOrErr = Obj.getSectionContents(Shdr); + auto ContentOrErr = Obj.getSectionContents(*Shdr); if (!ContentOrErr) return ContentOrErr.takeError(); ArrayRef Content = *ContentOrErr; @@ -1012,7 +1019,7 @@ ELFDumper::dumpSymtabShndxSection(const Elf_Shdr *Shdr) { if (Error E = dumpCommonSection(Shdr, *S)) return std::move(E); - auto EntriesOrErr = Obj.template getSectionContentsAsArray(Shdr); + auto EntriesOrErr = Obj.template getSectionContentsAsArray(*Shdr); if (!EntriesOrErr) return EntriesOrErr.takeError(); for (const Elf_Word &E : *EntriesOrErr) @@ -1038,7 +1045,7 @@ ELFDumper::dumpNoteSection(const Elf_Shdr *Shdr) { if (Error E = dumpCommonSection(Shdr, *S)) return std::move(E); - auto ContentOrErr = Obj.getSectionContents(Shdr); + auto ContentOrErr = Obj.getSectionContents(*Shdr); if (!ContentOrErr) return ContentOrErr.takeError(); @@ -1074,7 +1081,7 @@ ELFDumper::dumpHashSection(const Elf_Shdr *Shdr) { if (Error E = dumpCommonSection(Shdr, *S)) return 
std::move(E); - auto ContentOrErr = Obj.getSectionContents(Shdr); + auto ContentOrErr = Obj.getSectionContents(*Shdr); if (!ContentOrErr) return ContentOrErr.takeError(); @@ -1115,7 +1122,7 @@ ELFDumper::dumpGnuHashSection(const Elf_Shdr *Shdr) { if (Error E = dumpCommonSection(Shdr, *S)) return std::move(E); - auto ContentOrErr = Obj.getSectionContents(Shdr); + auto ContentOrErr = Obj.getSectionContents(*Shdr); if (!ContentOrErr) return ContentOrErr.takeError(); @@ -1175,11 +1182,11 @@ ELFDumper::dumpVerdefSection(const Elf_Shdr *Shdr) { if (!StringTableShdrOrErr) return StringTableShdrOrErr.takeError(); - auto StringTableOrErr = Obj.getStringTable(*StringTableShdrOrErr); + auto StringTableOrErr = Obj.getStringTable(**StringTableShdrOrErr); if (!StringTableOrErr) return StringTableOrErr.takeError(); - auto Contents = Obj.getSectionContents(Shdr); + auto Contents = Obj.getSectionContents(*Shdr); if (!Contents) return Contents.takeError(); @@ -1220,7 +1227,7 @@ ELFDumper::dumpSymverSection(const Elf_Shdr *Shdr) { if (Error E = dumpCommonSection(Shdr, *S)) return std::move(E); - auto VersionsOrErr = Obj.template getSectionContentsAsArray(Shdr); + auto VersionsOrErr = Obj.template getSectionContentsAsArray(*Shdr); if (!VersionsOrErr) return VersionsOrErr.takeError(); for (const Elf_Half &E : *VersionsOrErr) @@ -1241,7 +1248,7 @@ ELFDumper::dumpVerneedSection(const Elf_Shdr *Shdr) { S->Info = Shdr->sh_info; - auto Contents = Obj.getSectionContents(Shdr); + auto Contents = Obj.getSectionContents(*Shdr); if (!Contents) return Contents.takeError(); @@ -1249,7 +1256,7 @@ ELFDumper::dumpVerneedSection(const Elf_Shdr *Shdr) { if (!StringTableShdrOrErr) return StringTableShdrOrErr.takeError(); - auto StringTableOrErr = Obj.getStringTable(*StringTableShdrOrErr); + auto StringTableOrErr = Obj.getStringTable(**StringTableShdrOrErr); if (!StringTableOrErr) return StringTableOrErr.takeError(); @@ -1318,7 +1325,7 @@ Expected ELFDumper::dumpGroup(const Elf_Shdr *Shdr) { return SymbolName.takeError(); S->Signature = *SymbolName; - auto MembersOrErr = Obj.template getSectionContentsAsArray(Shdr); + auto MembersOrErr = Obj.template getSectionContentsAsArray(*Shdr); if (!MembersOrErr) return MembersOrErr.takeError(); @@ -1348,7 +1355,7 @@ ELFDumper::dumpMipsABIFlags(const Elf_Shdr *Shdr) { if (Error E = dumpCommonSection(Shdr, *S)) return std::move(E); - auto ContentOrErr = Obj.getSectionContents(Shdr); + auto ContentOrErr = Obj.getSectionContents(*Shdr); if (!ContentOrErr) return ContentOrErr.takeError(); diff --git a/llvm/tools/obj2yaml/macho2yaml.cpp b/llvm/tools/obj2yaml/macho2yaml.cpp index 3a93d5c6846b5..49347431b9a4f 100644 --- a/llvm/tools/obj2yaml/macho2yaml.cpp +++ b/llvm/tools/obj2yaml/macho2yaml.cpp @@ -154,10 +154,8 @@ static Error dumpDebugSection(StringRef SecName, DWARFContext &DCtx, } if (SecName == "__debug_ranges") return dumpDebugRanges(DCtx, DWARF); - if (SecName == "__debug_str") { - dumpDebugStrings(DCtx, DWARF); - return Error::success(); - } + if (SecName == "__debug_str") + return dumpDebugStrings(DCtx, DWARF); return createStringError(errc::not_supported, "dumping " + SecName + " section is not supported"); } diff --git a/llvm/tools/obj2yaml/obj2yaml.h b/llvm/tools/obj2yaml/obj2yaml.h index 85a7ac9a4787b..c41010f111b68 100644 --- a/llvm/tools/obj2yaml/obj2yaml.h +++ b/llvm/tools/obj2yaml/obj2yaml.h @@ -41,12 +41,14 @@ struct Data; } void dumpDebugAbbrev(llvm::DWARFContext &DCtx, llvm::DWARFYAML::Data &Y); +llvm::Error dumpDebugAddr(llvm::DWARFContext &DCtx, llvm::DWARFYAML::Data &Y); 
llvm::Error dumpDebugARanges(llvm::DWARFContext &DCtx, llvm::DWARFYAML::Data &Y); void dumpDebugPubSections(llvm::DWARFContext &DCtx, llvm::DWARFYAML::Data &Y); void dumpDebugInfo(llvm::DWARFContext &DCtx, llvm::DWARFYAML::Data &Y); void dumpDebugLines(llvm::DWARFContext &DCtx, llvm::DWARFYAML::Data &Y); llvm::Error dumpDebugRanges(llvm::DWARFContext &DCtx, llvm::DWARFYAML::Data &Y); -void dumpDebugStrings(llvm::DWARFContext &DCtx, llvm::DWARFYAML::Data &Y); +llvm::Error dumpDebugStrings(llvm::DWARFContext &DCtx, + llvm::DWARFYAML::Data &Y); #endif diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp index a5c2a1bf1feeb..b38f67ac45197 100644 --- a/llvm/tools/opt/NewPMDriver.cpp +++ b/llvm/tools/opt/NewPMDriver.cpp @@ -336,15 +336,12 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM, } // For compatibility with legacy pass manager. // Alias analyses are not specially specified when using the legacy PM. - SmallVector NonAAPasses; for (auto PassName : Passes) { if (PB.isAAPassName(PassName)) { if (auto Err = PB.parseAAPipeline(AA, PassName)) { errs() << Arg0 << ": " << toString(std::move(Err)) << "\n"; return false; } - } else { - NonAAPasses.push_back(PassName); } } // For compatibility with the legacy PM AA pipeline. @@ -389,7 +386,7 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM, return false; } } - for (auto PassName : NonAAPasses) { + for (auto PassName : Passes) { std::string ModifiedPassName(PassName.begin(), PassName.end()); if (PB.isAnalysisPassName(PassName)) ModifiedPassName = "require<" + ModifiedPassName + ">"; diff --git a/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp b/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp index d35a77fa379be..946368e1cb947 100644 --- a/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp +++ b/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp @@ -546,3 +546,41 @@ TEST(AssumeQueryAPI, AssumptionCache) { ASSERT_EQ(AR[0].Index, 1u); ASSERT_EQ(AR[0].Assume, &*First); } + +TEST(AssumeQueryAPI, Alignment) { + LLVMContext C; + SMDiagnostic Err; + std::unique_ptr Mod = parseAssemblyString( + "declare void @llvm.assume(i1)\n" + "define void @test(i32* %P, i32* %P1, i32* %P2, i32 %I3, i1 %B) {\n" + "call void @llvm.assume(i1 true) [\"align\"(i32* %P, i32 8, i32 %I3)]\n" + "call void @llvm.assume(i1 true) [\"align\"(i32* %P1, i32 %I3, i32 " + "%I3)]\n" + "call void @llvm.assume(i1 true) [\"align\"(i32* %P2, i32 16, i32 8)]\n" + "ret void\n}\n", + Err, C); + if (!Mod) + Err.print("AssumeQueryAPI", errs()); + + Function *F = Mod->getFunction("test"); + BasicBlock::iterator Start = F->begin()->begin(); + IntrinsicInst *II; + RetainedKnowledge RK; + II = cast(&*Start); + RK = getKnowledgeFromBundle(*II, II->bundle_op_info_begin()[0]); + ASSERT_EQ(RK.AttrKind, Attribute::Alignment); + ASSERT_EQ(RK.WasOn, F->getArg(0)); + ASSERT_EQ(RK.ArgValue, 1u); + Start++; + II = cast(&*Start); + RK = getKnowledgeFromBundle(*II, II->bundle_op_info_begin()[0]); + ASSERT_EQ(RK.AttrKind, Attribute::Alignment); + ASSERT_EQ(RK.WasOn, F->getArg(1)); + ASSERT_EQ(RK.ArgValue, 1u); + Start++; + II = cast(&*Start); + RK = getKnowledgeFromBundle(*II, II->bundle_op_info_begin()[0]); + ASSERT_EQ(RK.AttrKind, Attribute::Alignment); + ASSERT_EQ(RK.WasOn, F->getArg(2)); + ASSERT_EQ(RK.ArgValue, 8u); +} diff --git a/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp b/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp index 2dad605395c37..e0ff4e891ab65 100644 --- a/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp 
+++ b/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp
@@ -1766,5 +1766,60 @@ TEST_F(CGSCCPassManagerTest, TestInsertionOfNewRefSCC) {
   MPM.run(*M, MAM);
 }
+TEST_F(CGSCCPassManagerTest, TestInsertionOfNewRefSCCMutuallyRecursive) {
+  std::unique_ptr<Module> M = parseIR("define void @f() {\n"
+                                      "entry:\n"
+                                      "  ret void\n"
+                                      "}\n");
+
+  CGSCCPassManager CGPM(/*DebugLogging*/ true);
+  CGPM.addPass(LambdaSCCPassNoPreserve([&](LazyCallGraph::SCC &C,
+                                           CGSCCAnalysisManager &AM,
+                                           LazyCallGraph &CG,
+                                           CGSCCUpdateResult &UR) {
+    auto &FAM =
+        AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
+
+    for (auto &N : C) {
+      auto &F = N.getFunction();
+      if (F.getName() != "f")
+        continue;
+
+      // Create mutually recursive functions (ref only) 'h1' and 'h2'.
+      auto *H1 = Function::Create(F.getFunctionType(), F.getLinkage(),
+                                  F.getAddressSpace(), "h1", F.getParent());
+      auto *H2 = Function::Create(F.getFunctionType(), F.getLinkage(),
+                                  F.getAddressSpace(), "h2", F.getParent());
+      BasicBlock *H1BB =
+          BasicBlock::Create(F.getParent()->getContext(), "entry", H1);
+      BasicBlock *H2BB =
+          BasicBlock::Create(F.getParent()->getContext(), "entry", H2);
+      (void)CastInst::CreatePointerCast(H2, Type::getInt8PtrTy(F.getContext()),
+                                        "h2.ref", H1BB);
+      (void)ReturnInst::Create(H1->getContext(), H1BB);
+      (void)CastInst::CreatePointerCast(H1, Type::getInt8PtrTy(F.getContext()),
+                                        "h1.ref", H2BB);
+      (void)ReturnInst::Create(H2->getContext(), H2BB);
+
+      // Add 'f -> h1' ref edge.
+      (void)CastInst::CreatePointerCast(H1, Type::getInt8PtrTy(F.getContext()),
+                                        "h.ref", &F.getEntryBlock().front());
+
+      CG.addNewFunctionIntoRefSCC(*H1, C.getOuterRefSCC());
+      CG.addNewFunctionIntoRefSCC(*H2, C.getOuterRefSCC());
+
+      ASSERT_NO_FATAL_FAILURE(
+          updateCGAndAnalysisManagerForCGSCCPass(CG, C, N, AM, UR, FAM))
+          << "Updating the call graph with a newly inserted ref edge "
+             "'f -> h1' and mutually recursive functions 'h1' <-> 'h2' "
+             "caused a fatal failure";
+    }
+  }));
+
+  ModulePassManager MPM(/*DebugLogging*/ true);
+  MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
+  MPM.run(*M, MAM);
+}
+
 #endif
 } // namespace
diff --git a/llvm/unittests/Analysis/CMakeLists.txt b/llvm/unittests/Analysis/CMakeLists.txt
index eb97f6289b67a..0480649352214 100644
--- a/llvm/unittests/Analysis/CMakeLists.txt
+++ b/llvm/unittests/Analysis/CMakeLists.txt
@@ -23,11 +23,13 @@ add_llvm_unittest_with_input_files(AnalysisTests
   CaptureTrackingTest.cpp
   CFGTest.cpp
   CGSCCPassManagerTest.cpp
+  ConstraintSystemTest.cpp
   DDGTest.cpp
   DivergenceAnalysisTest.cpp
   DomTreeUpdaterTest.cpp
   GlobalsModRefTest.cpp
   FunctionPropertiesAnalysisTest.cpp
+  IRSimilarityIdentifierTest.cpp
   IVDescriptorsTest.cpp
   LazyCallGraphTest.cpp
   LoadsTest.cpp
diff --git a/llvm/unittests/Analysis/ConstraintSystemTest.cpp b/llvm/unittests/Analysis/ConstraintSystemTest.cpp
new file mode 100644
index 0000000000000..337a111634186
--- /dev/null
+++ b/llvm/unittests/Analysis/ConstraintSystemTest.cpp
@@ -0,0 +1,153 @@
+//===--- ConstraintSystemTest.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/ConstraintSystem.h" +#include "gtest/gtest.h" + +using namespace llvm; + +namespace { + +TEST(ConstraintSolverTest, TestSolutionChecks) { + { + ConstraintSystem CS; + // x + y <= 10, x >= 5, y >= 6, x <= 10, y <= 10 + CS.addVariableRow({10, 1, 1}); + CS.addVariableRow({-5, -1, 0}); + CS.addVariableRow({-6, 0, -1}); + CS.addVariableRow({10, 1, 0}); + CS.addVariableRow({10, 0, 1}); + + EXPECT_FALSE(CS.mayHaveSolution()); + } + + { + ConstraintSystem CS; + // x + y <= 10, x >= 2, y >= 3, x <= 10, y <= 10 + CS.addVariableRow({10, 1, 1}); + CS.addVariableRow({-2, -1, 0}); + CS.addVariableRow({-3, 0, -1}); + CS.addVariableRow({10, 1, 0}); + CS.addVariableRow({10, 0, 1}); + + EXPECT_TRUE(CS.mayHaveSolution()); + } + + { + ConstraintSystem CS; + // x + y <= 10, 10 >= x, 10 >= y; does not have a solution. + CS.addVariableRow({10, 1, 1}); + CS.addVariableRow({-10, -1, 0}); + CS.addVariableRow({-10, 0, -1}); + + EXPECT_FALSE(CS.mayHaveSolution()); + } + + { + ConstraintSystem CS; + // x + y >= 20, 10 >= x, 10 >= y; does HAVE a solution. + CS.addVariableRow({-20, -1, -1}); + CS.addVariableRow({-10, -1, 0}); + CS.addVariableRow({-10, 0, -1}); + + EXPECT_TRUE(CS.mayHaveSolution()); + } + + { + ConstraintSystem CS; + + // 2x + y + 3z <= 10, 2x + y >= 10, y >= 1 + CS.addVariableRow({10, 2, 1, 3}); + CS.addVariableRow({-10, -2, -1, 0}); + CS.addVariableRow({-1, 0, 0, -1}); + + EXPECT_FALSE(CS.mayHaveSolution()); + } + + { + ConstraintSystem CS; + + // 2x + y + 3z <= 10, 2x + y >= 10 + CS.addVariableRow({10, 2, 1, 3}); + CS.addVariableRow({-10, -2, -1, 0}); + + EXPECT_TRUE(CS.mayHaveSolution()); + } +} + +TEST(ConstraintSolverTest, IsConditionImplied) { + { + // For the test below, we assume we know + // x <= 5 && y <= 3 + ConstraintSystem CS; + CS.addVariableRow({5, 1, 0}); + CS.addVariableRow({3, 0, 1}); + + // x + y <= 6 does not hold. + EXPECT_FALSE(CS.isConditionImplied({6, 1, 1})); + // x + y <= 7 does not hold. + EXPECT_FALSE(CS.isConditionImplied({7, 1, 1})); + // x + y <= 8 does hold. + EXPECT_TRUE(CS.isConditionImplied({8, 1, 1})); + + // 2 * x + y <= 12 does hold. + EXPECT_FALSE(CS.isConditionImplied({12, 2, 1})); + // 2 * x + y <= 13 does hold. + EXPECT_TRUE(CS.isConditionImplied({13, 2, 1})); + + // x + y <= 12 does hold. + EXPECT_FALSE(CS.isConditionImplied({12, 2, 1})); + // 2 * x + y <= 13 does hold. + EXPECT_TRUE(CS.isConditionImplied({13, 2, 1})); + + // x <= y == x - y <= 0 does not hold. + EXPECT_FALSE(CS.isConditionImplied({0, 1, -1})); + // y <= x == -x + y <= 0 does not hold. + EXPECT_FALSE(CS.isConditionImplied({0, -1, 1})); + } + + { + // For the test below, we assume we know + // x + 1 <= y + 1 == x - y <= 0 + ConstraintSystem CS; + CS.addVariableRow({0, 1, -1}); + + // x <= y == x - y <= 0 does hold. + EXPECT_TRUE(CS.isConditionImplied({0, 1, -1})); + // y <= x == -x + y <= 0 does not hold. + EXPECT_FALSE(CS.isConditionImplied({0, -1, 1})); + + // x <= y + 10 == x - y <= 10 does hold. + EXPECT_TRUE(CS.isConditionImplied({10, 1, -1})); + // x + 10 <= y == x - y <= -10 does NOT hold. + EXPECT_FALSE(CS.isConditionImplied({-10, 1, -1})); + } + + { + // For the test below, we assume we know + // x <= y == x - y <= 0 + // y <= z == y - x <= 0 + ConstraintSystem CS; + CS.addVariableRow({0, 1, -1, 0}); + CS.addVariableRow({0, 0, 1, -1}); + + // z <= y == -y + z <= 0 does not hold. 
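A quick recap of the row encoding these tests rely on (an editorial sketch, not part of the patch): a row {c0, c1, ..., cn} passed to addVariableRow appears to encode the constraint c1*x1 + ... + cn*xn <= c0, with the constant bound first. Under that reading, the two rows added above express x - y <= 0 and y - z <= 0, and the two assertions that follow check that z <= y is not implied while the transitive x <= z is. A hypothetical helper, shown only to make the encoding explicit:

// Hypothetical helper; encodeLE({1, -1, 0}, 0) stands for x - y <= 0.
static SmallVector<int64_t, 8> encodeLE(ArrayRef<int64_t> Coefficients,
                                        int64_t Bound) {
  SmallVector<int64_t, 8> Row;
  Row.push_back(Bound);                                 // constant term first
  Row.append(Coefficients.begin(), Coefficients.end()); // then coefficients
  return Row;
}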
+ EXPECT_FALSE(CS.isConditionImplied({0, 0, -1, 1})); + // x <= z == x - z <= 0 does hold. + EXPECT_TRUE(CS.isConditionImplied({0, 1, 0, -1})); + } +} + +TEST(ConstraintSolverTest, IsConditionImpliedOverflow) { + ConstraintSystem CS; + // Make sure isConditionImplied returns false when there is an overflow. + int64_t Limit = std::numeric_limits::max(); + CS.addVariableRow({Limit - 1, Limit - 2, Limit - 3}); + EXPECT_FALSE(CS.isConditionImplied({Limit - 1, Limit - 2, Limit - 3})); +} +} // namespace diff --git a/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp new file mode 100644 index 0000000000000..6b61021363fab --- /dev/null +++ b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp @@ -0,0 +1,1177 @@ +//===- IRSimilarityIdentifierTest.cpp - IRSimilarityIdentifier unit tests -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Tests for components for finding similarity such as the instruction mapper, +// suffix tree usage, and structural analysis. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/IRSimilarityIdentifier.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/SourceMgr.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace IRSimilarity; + +static std::unique_ptr makeLLVMModule(LLVMContext &Context, + StringRef ModuleStr) { + SMDiagnostic Err; + std::unique_ptr M = parseAssemblyString(ModuleStr, Err, Context); + assert(M && "Bad LLVM IR?"); + return M; +} + +void getVectors(Module &M, std::vector &InstrList, + std::vector &UnsignedVec) { + SpecificBumpPtrAllocator InstDataAllocator; + IRInstructionMapper Mapper(&InstDataAllocator); + + for (Function &F : M) + for (BasicBlock &BB : F) + Mapper.convertToUnsignedVec(BB, InstrList, UnsignedVec); +} + +// Checks that different opcodes are mapped to different values. +TEST(IRInstructionMapper, OpcodeDifferentiation) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = add i32 %a, %b + %1 = mul i32 %a, %b + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + getVectors(*M, InstrList, UnsignedVec); + + // Check that the size of the unsigned vector and the instruction list are the + // same as a safety check. + ASSERT_TRUE(InstrList.size() == UnsignedVec.size()); + + // Make sure that the unsigned vector is the expected size. + ASSERT_TRUE(UnsignedVec.size() == 3); + + // Check whether the instructions are not mapped to the same value. + ASSERT_TRUE(UnsignedVec[0] != UnsignedVec[1]); +} + +// Checks that the same opcodes and types are mapped to the same values. 
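One detail worth noting before the remaining mapper tests (editorial note, inferred from the assertions rather than stated by the patch): each module holds two instructions under test plus a terminating ret, and the tests consistently expect a vector of size 3, so the terminator occupies the third slot. A hypothetical helper capturing the shape every "similarity" test asserts; the OpcodeTypeSimilarity test announced above then follows:

// Sketch only; the real tests spell this out inline.
static void expectMappedSame(const std::vector<unsigned> &UnsignedVec,
                             const std::vector<IRInstructionData *> &InstrList) {
  ASSERT_EQ(InstrList.size(), UnsignedVec.size()); // safety check
  ASSERT_EQ(UnsignedVec.size(), 3u);               // two insts + terminator
  EXPECT_EQ(UnsignedVec[0], UnsignedVec[1]);       // similar => same ID
}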
+TEST(IRInstructionMapper, OpcodeTypeSimilarity) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = add i32 %a, %b + %1 = add i32 %b, %a + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + getVectors(*M, InstrList, UnsignedVec); + + ASSERT_TRUE(InstrList.size() == UnsignedVec.size()); + ASSERT_TRUE(UnsignedVec.size() == 3); + + // Check whether the instructions are mapped to the same value. + ASSERT_TRUE(UnsignedVec[0] == UnsignedVec[1]); +} + +// Checks that the same opcode and different types are mapped to different +// values. +TEST(IRInstructionMapper, TypeDifferentiation) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b, i64 %c, i64 %d) { + bb0: + %0 = add i32 %a, %b + %1 = add i64 %c, %d + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + getVectors(*M, InstrList, UnsignedVec); + + ASSERT_TRUE(InstrList.size() == UnsignedVec.size()); + ASSERT_TRUE(UnsignedVec.size() == 3); + ASSERT_TRUE(UnsignedVec[0] != UnsignedVec[1]); +} + +// Checks that different predicates map to different values. +TEST(IRInstructionMapper, PredicateDifferentiation) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = icmp sge i32 %b, %a + %1 = icmp slt i32 %a, %b + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + getVectors(*M, InstrList, UnsignedVec); + + ASSERT_TRUE(InstrList.size() == UnsignedVec.size()); + ASSERT_TRUE(UnsignedVec.size() == 3); + ASSERT_TRUE(UnsignedVec[0] != UnsignedVec[1]); +} + +// Checks that predicates with the same swapped predicate map to different +// values. +TEST(IRInstructionMapper, PredicateIsomorphism) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = icmp sgt i32 %a, %b + %1 = icmp slt i32 %b, %a + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + getVectors(*M, InstrList, UnsignedVec); + + ASSERT_TRUE(InstrList.size() == UnsignedVec.size()); + ASSERT_TRUE(UnsignedVec.size() == 3); + ASSERT_TRUE(UnsignedVec[0] != UnsignedVec[1]); +} + +// Checks that the same predicate maps to the same value. +TEST(IRInstructionMapper, PredicateSimilarity) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = icmp slt i32 %a, %b + %1 = icmp slt i32 %b, %a + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + getVectors(*M, InstrList, UnsignedVec); + + ASSERT_TRUE(InstrList.size() == UnsignedVec.size()); + ASSERT_TRUE(UnsignedVec.size() == 3); + ASSERT_TRUE(UnsignedVec[0] == UnsignedVec[1]); +} + +// Checks that the same predicate maps to the same value for floating point +// CmpInsts. 
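The PredicateIsomorphism case above is worth dwelling on: `icmp sgt %a, %b` and `icmp slt %b, %a` are semantically identical, yet the mapper keys on the predicate as written and treats them as different. A canonicalization that would merge such pairs could look like the sketch below (this is what the mapper does *not* currently do; it uses the existing CmpInst helpers but the function itself is hypothetical). The floating-point tests follow.

// Rewrite a compare into a canonical predicate/operand order so that
// swapped-but-equivalent forms would receive the same ID.
static CmpInst::Predicate canonicalPredicate(CmpInst::Predicate P,
                                             bool &SwapOperands) {
  SwapOperands = P == CmpInst::ICMP_SGT || P == CmpInst::ICMP_SGE;
  return SwapOperands ? CmpInst::getSwappedPredicate(P) : P;
}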
+TEST(IRInstructionMapper, FPPredicateSimilarity) { + StringRef ModuleString = R"( + define i32 @f(double %a, double %b) { + bb0: + %0 = fcmp olt double %a, %b + %1 = fcmp olt double %b, %a + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + getVectors(*M, InstrList, UnsignedVec); + + ASSERT_TRUE(InstrList.size() == UnsignedVec.size()); + ASSERT_TRUE(UnsignedVec.size() == 3); + ASSERT_TRUE(UnsignedVec[0] == UnsignedVec[1]); +} + +// Checks that the different predicate maps to a different value for floating +// point CmpInsts. +TEST(IRInstructionMapper, FPPredicatDifference) { + StringRef ModuleString = R"( + define i32 @f(double %a, double %b) { + bb0: + %0 = fcmp olt double %a, %b + %1 = fcmp oge double %b, %a + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + getVectors(*M, InstrList, UnsignedVec); + + ASSERT_TRUE(InstrList.size() == UnsignedVec.size()); + ASSERT_TRUE(UnsignedVec.size() == 3); + ASSERT_TRUE(UnsignedVec[0] != UnsignedVec[1]); +} + +// Checks that the zexts that have the same type parameters map to the same +// unsigned integer. +TEST(IRInstructionMapper, ZextTypeSimilarity) { + StringRef ModuleString = R"( + define i32 @f(i32 %a) { + bb0: + %0 = zext i32 %a to i64 + %1 = zext i32 %a to i64 + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + getVectors(*M, InstrList, UnsignedVec); + + ASSERT_TRUE(InstrList.size() == UnsignedVec.size()); + ASSERT_TRUE(UnsignedVec.size() == 3); + ASSERT_TRUE(UnsignedVec[0] == UnsignedVec[1]); +} + +// Checks that the sexts that have the same type parameters map to the same +// unsigned integer. +TEST(IRInstructionMapper, SextTypeSimilarity) { + StringRef ModuleString = R"( + define i32 @f(i32 %a) { + bb0: + %0 = sext i32 %a to i64 + %1 = sext i32 %a to i64 + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + getVectors(*M, InstrList, UnsignedVec); + + ASSERT_TRUE(InstrList.size() == UnsignedVec.size()); + ASSERT_TRUE(UnsignedVec.size() == 3); + ASSERT_TRUE(UnsignedVec[0] == UnsignedVec[1]); +} + +// Checks that the zexts that have the different type parameters map to the +// different unsigned integers. +TEST(IRInstructionMapper, ZextTypeDifference) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i8 %b) { + bb0: + %0 = zext i32 %a to i64 + %1 = zext i8 %b to i32 + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + getVectors(*M, InstrList, UnsignedVec); + + ASSERT_TRUE(InstrList.size() == UnsignedVec.size()); + ASSERT_TRUE(UnsignedVec.size() == 3); + ASSERT_TRUE(UnsignedVec[0] != UnsignedVec[1]); +} + + +// Checks that the sexts that have the different type parameters map to the +// different unsigned integers. 
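For the cast cases, both the source and the destination type take part in the comparison. An illustrative predicate, stated as an assumption pinned down by these tests rather than the mapper's actual code:

static bool sameCastShape(const CastInst &A, const CastInst &B) {
  // zext i32 -> i64 matches zext i32 -> i64, but not zext i8 -> i32.
  return A.getOpcode() == B.getOpcode() && A.getSrcTy() == B.getSrcTy() &&
         A.getDestTy() == B.getDestTy();
}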
+TEST(IRInstructionMapper, SextTypeDifference) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32 %a, i8 %b) {
+                          bb0:
+                             %0 = sext i32 %a to i64
+                             %1 = sext i8 %b to i32
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_TRUE(InstrList.size() == UnsignedVec.size());
+  ASSERT_TRUE(UnsignedVec.size() == 3);
+  ASSERT_TRUE(UnsignedVec[0] != UnsignedVec[1]);
+}
+
+// Checks that loads that have the same type are mapped to the same unsigned
+// integer.
+TEST(IRInstructionMapper, LoadSimilarType) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32* %a, i32* %b) {
+                          bb0:
+                             %0 = load i32, i32* %a
+                             %1 = load i32, i32* %b
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_TRUE(InstrList.size() == UnsignedVec.size());
+  ASSERT_TRUE(UnsignedVec.size() == 3);
+  ASSERT_TRUE(UnsignedVec[0] == UnsignedVec[1]);
+}
+
+// Checks that loads that have different types are mapped to different
+// unsigned integers.
+TEST(IRInstructionMapper, LoadDifferentType) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32* %a, i64* %b) {
+                          bb0:
+                             %0 = load i32, i32* %a
+                             %1 = load i64, i64* %b
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_TRUE(InstrList.size() == UnsignedVec.size());
+  ASSERT_TRUE(UnsignedVec.size() == 3);
+  ASSERT_TRUE(UnsignedVec[0] != UnsignedVec[1]);
+}
+
+// Checks that loads that have different alignments are mapped to different
+// unsigned integers.
+TEST(IRInstructionMapper, LoadDifferentAlign) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32* %a, i32* %b) {
+                          bb0:
+                             %0 = load i32, i32* %a, align 4
+                             %1 = load i32, i32* %b, align 8
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_TRUE(InstrList.size() == UnsignedVec.size());
+  ASSERT_TRUE(UnsignedVec.size() == 3);
+  ASSERT_TRUE(UnsignedVec[0] != UnsignedVec[1]);
+}
+
+// Checks that loads that have different volatile settings are mapped to
+// different unsigned integers.
+TEST(IRInstructionMapper, LoadDifferentVolatile) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32* %a, i32* %b) {
+                          bb0:
+                             %0 = load volatile i32, i32* %a
+                             %1 = load i32, i32* %b
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_TRUE(InstrList.size() == UnsignedVec.size());
+  ASSERT_TRUE(UnsignedVec.size() == 3);
+  ASSERT_TRUE(UnsignedVec[0] != UnsignedVec[1]);
+}
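Taken together, the load tests show that the loaded type, alignment, volatility, and atomic ordering all feed the comparison. A compact way to state the attribute part (the accessors are the real LoadInst ones, but the helper itself is a hypothetical summary, not the mapper's code):

static bool sameLoadAttributes(const LoadInst &A, const LoadInst &B) {
  return A.getAlign() == B.getAlign() && A.isVolatile() == B.isVolatile() &&
         A.getOrdering() == B.getOrdering();
}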
+// Checks that loads that have the same volatile settings are mapped to the
+// same unsigned integer.
+TEST(IRInstructionMapper, LoadSameVolatile) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32* %a, i32* %b) {
+                          bb0:
+                             %0 = load volatile i32, i32* %a
+                             %1 = load volatile i32, i32* %b
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_TRUE(InstrList.size() == UnsignedVec.size());
+  ASSERT_TRUE(UnsignedVec.size() == 3);
+  ASSERT_TRUE(UnsignedVec[0] == UnsignedVec[1]);
+}
+
+// Checks that loads that have different atomicity settings are mapped to
+// different unsigned integers.
+TEST(IRInstructionMapper, LoadDifferentAtomic) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32* %a, i32* %b) {
+                          bb0:
+                             %0 = load atomic i32, i32* %a unordered, align 4
+                             %1 = load atomic i32, i32* %b monotonic, align 4
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_TRUE(InstrList.size() == UnsignedVec.size());
+  ASSERT_TRUE(UnsignedVec.size() == 3);
+  ASSERT_TRUE(UnsignedVec[0] != UnsignedVec[1]);
+}
+
+// Checks that loads that have the same atomicity settings are mapped to the
+// same unsigned integer.
+TEST(IRInstructionMapper, LoadSameAtomic) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32* %a, i32* %b) {
+                          bb0:
+                             %0 = load atomic i32, i32* %a unordered, align 4
+                             %1 = load atomic i32, i32* %b unordered, align 4
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_TRUE(InstrList.size() == UnsignedVec.size());
+  ASSERT_TRUE(UnsignedVec.size() == 3);
+  ASSERT_TRUE(UnsignedVec[0] == UnsignedVec[1]);
+}
+
+// Checks that stores that have the same type are mapped to the same unsigned
+// integer.
+TEST(IRInstructionMapper, StoreSimilarType) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32* %a, i32* %b) {
+                          bb0:
+                             store i32 1, i32* %a
+                             store i32 2, i32* %a
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_TRUE(InstrList.size() == UnsignedVec.size());
+  ASSERT_TRUE(UnsignedVec.size() == 3);
+  ASSERT_TRUE(UnsignedVec[0] == UnsignedVec[1]);
+}
+
+// Checks that stores that have different types are mapped to different
+// unsigned integers.
+TEST(IRInstructionMapper, StoreDifferentType) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32* %a, i64* %b) {
+                          bb0:
+                             store i32 1, i32* %a
+                             store i64 1, i64* %b
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_TRUE(InstrList.size() == UnsignedVec.size());
+  ASSERT_TRUE(UnsignedVec.size() == 3);
+  ASSERT_TRUE(UnsignedVec[0] != UnsignedVec[1]);
+}
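Note what StoreSimilarType above establishes: the stored constants differ (1 vs. 2) yet the IDs match, so constant operand *values* do not participate in the mapping; only types and structural positions do. In sketch form (hypothetical helper, not the mapper's code):

static bool sameOperandShape(const Value &A, const Value &B) {
  // i32 1 and i32 2 compare equal here; i32 1 and i64 1 do not.
  return A.getType() == B.getType();
}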
+// Checks that stores that have different alignments are mapped to different
+// unsigned integers.
+TEST(IRInstructionMapper, StoreDifferentAlign) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32* %a, i32* %b) {
+                          bb0:
+                             store i32 1, i32* %a, align 4
+                             store i32 1, i32* %b, align 8
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_TRUE(InstrList.size() == UnsignedVec.size());
+  ASSERT_TRUE(UnsignedVec.size() == 3);
+  ASSERT_TRUE(UnsignedVec[0] != UnsignedVec[1]);
+}
+
+// Checks that stores that have different volatile settings are mapped to
+// different unsigned integers.
+TEST(IRInstructionMapper, StoreDifferentVolatile) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32* %a, i32* %b) {
+                          bb0:
+                             store volatile i32 1, i32* %a
+                             store i32 1, i32* %b
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_TRUE(InstrList.size() == UnsignedVec.size());
+  ASSERT_TRUE(UnsignedVec.size() == 3);
+  ASSERT_TRUE(UnsignedVec[0] != UnsignedVec[1]);
+}
+
+// Checks that stores that have the same volatile settings are mapped to the
+// same unsigned integer.
+TEST(IRInstructionMapper, StoreSameVolatile) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32* %a, i32* %b) {
+                          bb0:
+                             store volatile i32 1, i32* %a
+                             store volatile i32 1, i32* %b
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_TRUE(InstrList.size() == UnsignedVec.size());
+  ASSERT_TRUE(UnsignedVec.size() == 3);
+  ASSERT_TRUE(UnsignedVec[0] == UnsignedVec[1]);
+}
+
+// Checks that stores that have the same atomicity settings are mapped to the
+// same unsigned integer.
+TEST(IRInstructionMapper, StoreSameAtomic) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32* %a, i32* %b) {
+                          bb0:
+                             store atomic i32 1, i32* %a unordered, align 4
+                             store atomic i32 1, i32* %b unordered, align 4
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_TRUE(InstrList.size() == UnsignedVec.size());
+  ASSERT_TRUE(UnsignedVec.size() == 3);
+  ASSERT_TRUE(UnsignedVec[0] == UnsignedVec[1]);
+}
+
+// Checks that stores that have different atomicity settings are mapped to
+// different unsigned integers.
+TEST(IRInstructionMapper, StoreDifferentAtomic) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32* %a, i32* %b) {
+                          bb0:
+                             store atomic i32 1, i32* %a unordered, align 4
+                             store atomic i32 1, i32* %b monotonic, align 4
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_TRUE(InstrList.size() == UnsignedVec.size());
+  ASSERT_TRUE(UnsignedVec.size() == 3);
+  ASSERT_TRUE(UnsignedVec[0] != UnsignedVec[1]);
+}
+
+// In most cases, the illegal instructions we are collecting don't require any
+// sort of setup. In those cases, the test module can consist solely of illegal
+// instructions, the mapper will produce length-0 vectors, and we can check
+// exactly that.
+
+// In cases where legal instructions are needed to set up an illegal one, we
+// exploit the numbering scheme instead: illegal instructions are assigned
+// unsigned integers counting down from the maximum value, so an illegal ID is
+// always greater than any legal ID. To confirm that an instruction is treated
+// as illegal, we therefore place a legal instruction after it and check that
+// the illegal instruction's unsigned integer is greater than that of the
+// legal instruction.
+
+// Checks that the branch is mapped to be illegal since there is extra checking
+// needed to ensure that a branch in one region is branching to an isomorphic
+// location in a different region.
+TEST(IRInstructionMapper, BranchIllegal) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32 %a, i32 %b) {
+                          bb0:
+                             %0 = icmp slt i32 %a, %b
+                             br i1 %0, label %bb0, label %bb1
+                          bb1:
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_EQ(InstrList.size(), UnsignedVec.size());
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(0));
+}
+
+// Checks that a PHINode is mapped to be illegal since there is extra checking
+// needed to ensure that a branch in one region is branching to an isomorphic
+// location in a different region.
+TEST(IRInstructionMapper, PhiIllegal) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32 %a, i32 %b) {
+                          bb0:
+                             %0 = phi i1 [ 0, %bb0 ], [ %0, %bb1 ]
+                             ret i32 0
+                          bb1:
+                             ret i32 1
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_EQ(InstrList.size(), UnsignedVec.size());
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(0));
+}
+
+// Checks that an alloca instruction is mapped to be illegal.
+TEST(IRInstructionMapper, AllocaIllegal) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32 %a, i32 %b) {
+                          bb0:
+                             %0 = alloca i32
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_EQ(InstrList.size(), UnsignedVec.size());
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(0));
+}
+
+// Checks that a getelementptr instruction is mapped to be illegal. There is
+// extra checking required for the parameters if a getelementptr has more than
+// two operands.
+TEST(IRInstructionMapper, GetElementPtrIllegal) {
+  StringRef ModuleString = R"(
+                          %struct.RT = type { i8, [10 x [20 x i32]], i8 }
+                          %struct.ST = type { i32, double, %struct.RT }
+                          define i32 @f(%struct.ST* %s, i32 %a, i32 %b) {
+                          bb0:
+                             %0 = getelementptr inbounds %struct.ST, %struct.ST* %s, i64 1
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_EQ(InstrList.size(), UnsignedVec.size());
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(0));
+}
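The ordering trick described above can be sketched as two counters. This is only an illustration of the invariant the later `<` assertions depend on, not the mapper's actual implementation (which tracks its counters inside IRInstructionMapper):

// Legal IDs count up from zero; illegal IDs count down from the top of the
// unsigned range (needs <limits>).
unsigned NextLegalId = 0;
unsigned NextIllegalId = std::numeric_limits<unsigned>::max();

unsigned mapLegal() { return NextLegalId++; }     // 0, 1, 2, ...
unsigned mapIllegal() { return NextIllegalId--; } // max, max-1, ...
// Consequence: any illegal ID compares greater than any legal ID.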
+// Checks that a call instruction is mapped to be illegal. We have to perform
+// extra checks to ensure that both the name and function type are the same.
+TEST(IRInstructionMapper, CallIllegal) {
+  StringRef ModuleString = R"(
+                          declare i32 @f1(i32, i32)
+                          define i32 @f(i32 %a, i32 %b) {
+                          bb0:
+                             %0 = call i32 @f1(i32 %a, i32 %b)
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_EQ(InstrList.size(), UnsignedVec.size());
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(0));
+}
+
+// Checks that an invoke instruction is mapped to be illegal. Invoke
+// instructions are considered illegal because of the change in control flow
+// that is currently not recognized.
+TEST(IRInstructionMapper, InvokeIllegal) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i8 *%gep1, i32 %b) {
+                          then:
+                            invoke i32 undef(i8* undef)
+                               to label %invoke unwind label %lpad
+
+                          invoke:
+                            unreachable
+
+                          lpad:
+                            landingpad { i8*, i32 }
+                               catch i8* null
+                            unreachable
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_EQ(InstrList.size(), UnsignedVec.size());
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(0));
+}
+
+// Checks that callbr instructions are mapped to be illegal, likewise because
+// of the change in control flow that is currently not recognized.
+TEST(IRInstructionMapper, CallBrInstIllegal) {
+  StringRef ModuleString = R"(
+                          define void @test() {
+                          fail:
+                            ret void
+                          }
+
+                          define i32 @f(i32 %a, i32 %b) {
+                          bb0:
+                            callbr void asm "xorl $0, $0; jmp ${1:l}", "r,X,~{dirflag},~{fpsr},~{flags}"(i32 %a, i8* blockaddress(@test, %fail)) to label %normal [label %fail]
+                          fail:
+                            ret i32 0
+                          normal:
+                            ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_EQ(InstrList.size(), UnsignedVec.size());
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(0));
+}
+
+// Checks that debug-info intrinsics are mapped to be invisible. Since they
+// do not semantically change the program, they can be recognized as similar.
+TEST(IRInstructionMapper, DebugInfoInvisible) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32 %a, i32 %b) {
+                          then:
+                            %0 = add i32 %a, %b
+                            call void @llvm.dbg.value(metadata !0)
+                            %1 = add i32 %a, %b
+                            ret i32 0
+                          }
+
+                          declare void @llvm.dbg.value(metadata)
+                          !0 = distinct !{!"test\00", i32 10})";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_EQ(InstrList.size(), UnsignedVec.size());
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(3));
+}
+
+// The following are all exception handling intrinsics. We do not currently
+// handle these instructions because they are very context dependent.
+
+// Checks that an eh.typeid.for intrinsic is mapped to be illegal.
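Unlike the debug intrinsics above, which are skipped outright and contribute nothing to the vector, the exception-handling intrinsics are mapped as illegal. A plausible filter over intrinsic IDs (hypothetical; the mapper's real check may be structured differently) would be the sketch below; the individual tests then follow, starting with eh.typeid.for.

static bool isContextDependentIntrinsic(Intrinsic::ID ID) {
  switch (ID) {
  case Intrinsic::eh_typeid_for:
  case Intrinsic::eh_exceptioncode:
  case Intrinsic::eh_unwind_init:
  case Intrinsic::eh_exceptionpointer:
    return true; // meaning depends on the enclosing EH context
  default:
    return false;
  }
}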
+TEST(IRInstructionMapper, ExceptionHandlingTypeIdIllegal) { + StringRef ModuleString = R"( + @_ZTIi = external constant i8* + define i32 @f() { + then: + %0 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) + ret i32 0 + } + + declare i32 @llvm.eh.typeid.for(i8*))"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + getVectors(*M, InstrList, UnsignedVec); + + ASSERT_EQ(InstrList.size(), UnsignedVec.size()); + ASSERT_EQ(UnsignedVec.size(), static_cast(0)); +} + +// Checks that an eh.exceptioncode intrinsic is mapped to be illegal. +TEST(IRInstructionMapper, ExceptionHandlingExceptionCodeIllegal) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + entry: + %0 = catchswitch within none [label %__except] unwind to caller + + __except: + %1 = catchpad within %0 [i8* null] + catchret from %1 to label %__except + + then: + %2 = call i32 @llvm.eh.exceptioncode(token %1) + ret i32 0 + } + + declare i32 @llvm.eh.exceptioncode(token))"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + getVectors(*M, InstrList, UnsignedVec); + + ASSERT_EQ(InstrList.size(), UnsignedVec.size()); + ASSERT_EQ(UnsignedVec.size(), static_cast(0)); +} + +// Checks that an eh.unwind intrinsic is mapped to be illegal. +TEST(IRInstructionMapper, ExceptionHandlingUnwindIllegal) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + entry: + call void @llvm.eh.unwind.init() + ret i32 0 + } + + declare void @llvm.eh.unwind.init())"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + getVectors(*M, InstrList, UnsignedVec); + + ASSERT_EQ(InstrList.size(), UnsignedVec.size()); + ASSERT_EQ(UnsignedVec.size(), static_cast(0)); +} + +// Checks that an eh.exceptionpointer intrinsic is mapped to be illegal. +TEST(IRInstructionMapper, ExceptionHandlingExceptionPointerIllegal) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + entry: + %0 = call i8* @llvm.eh.exceptionpointer.p0i8(i32 0) + ret i32 0 + } + + declare i8* @llvm.eh.exceptionpointer.p0i8(i32))"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + getVectors(*M, InstrList, UnsignedVec); + + ASSERT_EQ(InstrList.size(), UnsignedVec.size()); + ASSERT_EQ(UnsignedVec.size(), static_cast(0)); +} + +// Checks that a catchpad instruction is mapped to an illegal value. +TEST(IRInstructionMapper, CatchpadIllegal) { + StringRef ModuleString = R"( + declare void @llvm.donothing() nounwind readnone + + define void @function() personality i8 3 { + entry: + invoke void @llvm.donothing() to label %normal unwind label %exception + exception: + %cs1 = catchswitch within none [label %catchpad1] unwind to caller + catchpad1: + catchpad within %cs1 [] + br label %normal + normal: + ret void + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + getVectors(*M, InstrList, UnsignedVec); + + ASSERT_EQ(InstrList.size(), UnsignedVec.size()); + ASSERT_EQ(UnsignedVec.size(), static_cast(0)); +} + +// Checks that a cleanuppad instruction is mapped to an illegal value. 
+TEST(IRInstructionMapper, CleanuppadIllegal) {
+  StringRef ModuleString = R"(
+                      declare void @llvm.donothing() nounwind readnone
+
+                      define void @function() personality i8 3 {
+                      entry:
+                        invoke void @llvm.donothing() to label %normal unwind label %exception
+                      exception:
+                        %cs1 = catchswitch within none [label %catchpad1] unwind to caller
+                      catchpad1:
+                        %clean = cleanuppad within none []
+                        br label %normal
+                      normal:
+                        ret void
+                      })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_EQ(InstrList.size(), UnsignedVec.size());
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(0));
+}
+
+// The following three instructions are memory transfer and memory setting
+// intrinsics, which are considered illegal since extra checking is needed to
+// handle their address space arguments.
+
+// Checks that a memset instruction is mapped to an illegal value.
+TEST(IRInstructionMapper, MemSetIllegal) {
+  StringRef ModuleString = R"(
+  declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1)
+
+  define i64 @function(i64 %x, i64 %z, i64 %n) {
+  entry:
+    %pool = alloca [59 x i64], align 4
+    %tmp = bitcast [59 x i64]* %pool to i8*
+    call void @llvm.memset.p0i8.i64(i8* nonnull %tmp, i8 0, i64 236, i32 4, i1 false)
+    %cmp3 = icmp eq i64 %n, 0
+    %a = add i64 %x, %z
+    %c = add i64 %x, %z
+    ret i64 0
+  })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_EQ(InstrList.size(), UnsignedVec.size());
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(6));
+  ASSERT_TRUE(UnsignedVec[2] < UnsignedVec[1]);
+}
+
+// Checks that a memcpy instruction is mapped to an illegal value.
+TEST(IRInstructionMapper, MemCpyIllegal) {
+  StringRef ModuleString = R"(
+  declare void @llvm.memcpy.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1)
+
+  define i64 @function(i64 %x, i64 %z, i64 %n) {
+  entry:
+    %pool = alloca [59 x i64], align 4
+    %tmp = bitcast [59 x i64]* %pool to i8*
+    call void @llvm.memcpy.p0i8.i64(i8* nonnull %tmp, i8 0, i64 236, i32 4, i1 false)
+    %cmp3 = icmp eq i64 %n, 0
+    %a = add i64 %x, %z
+    %c = add i64 %x, %z
+    ret i64 0
+  })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_EQ(InstrList.size(), UnsignedVec.size());
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(6));
+  ASSERT_TRUE(UnsignedVec[2] < UnsignedVec[1]);
+}
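The address-space concern mentioned above is visible in the intrinsic names themselves: `llvm.memset.p0i8.i64` is specialized to an `i8*` in address space 0, and pointers in other address spaces instantiate differently named intrinsics. A sketch of the kind of check a structural comparison would need here (hypothetical helper):

static bool samePointerAddressSpace(const Value &A, const Value &B) {
  auto *TA = dyn_cast<PointerType>(A.getType());
  auto *TB = dyn_cast<PointerType>(B.getType());
  return TA && TB && TA->getAddressSpace() == TB->getAddressSpace();
}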
+// Checks that a memmove instruction is mapped to an illegal value.
+TEST(IRInstructionMapper, MemMoveIllegal) {
+  StringRef ModuleString = R"(
+  declare void @llvm.memmove.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1)
+
+  define i64 @function(i64 %x, i64 %z, i64 %n) {
+  entry:
+    %pool = alloca [59 x i64], align 4
+    %tmp = bitcast [59 x i64]* %pool to i8*
+    call void @llvm.memmove.p0i8.i64(i8* nonnull %tmp, i8 0, i64 236, i32 4, i1 false)
+    %cmp3 = icmp eq i64 %n, 0
+    %a = add i64 %x, %z
+    %c = add i64 %x, %z
+    ret i64 0
+  })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_EQ(InstrList.size(), UnsignedVec.size());
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(6));
+  ASSERT_TRUE(UnsignedVec[2] < UnsignedVec[1]);
+}
+
+// Checks that variable argument instructions are mapped to an illegal value.
+// We exclude variable argument instructions since variable arguments require
+// extra checking of the argument list.
+TEST(IRInstructionMapper, VarArgsIllegal) {
+  StringRef ModuleString = R"(
+  declare void @llvm.va_start(i8*)
+  declare void @llvm.va_copy(i8*, i8*)
+  declare void @llvm.va_end(i8*)
+
+  define i32 @func1(i32 %a, double %b, i8* %v, ...) nounwind {
+  entry:
+    %a.addr = alloca i32, align 4
+    %b.addr = alloca double, align 8
+    %ap = alloca i8*, align 4
+    %c = alloca i32, align 4
+    store i32 %a, i32* %a.addr, align 4
+    store double %b, double* %b.addr, align 8
+    %ap1 = bitcast i8** %ap to i8*
+    call void @llvm.va_start(i8* %ap1)
+    store double %b, double* %b.addr, align 8
+    store double %b, double* %b.addr, align 8
+    %0 = va_arg i8** %ap, i32
+    store double %b, double* %b.addr, align 8
+    store double %b, double* %b.addr, align 8
+    call void @llvm.va_copy(i8* %v, i8* %ap1)
+    store double %b, double* %b.addr, align 8
+    store double %b, double* %b.addr, align 8
+    call void @llvm.va_end(i8* %ap1)
+    store i32 %0, i32* %c, align 4
+    %tmp = load i32, i32* %c, align 4
+    ret i32 %tmp
+  })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  ASSERT_EQ(InstrList.size(), UnsignedVec.size());
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(16));
+  ASSERT_TRUE(UnsignedVec[4] < UnsignedVec[3]);
+  ASSERT_TRUE(UnsignedVec[7] < UnsignedVec[6]);
+  ASSERT_TRUE(UnsignedVec[10] < UnsignedVec[9]);
+  ASSERT_TRUE(UnsignedVec[13] < UnsignedVec[12]);
+}
+
+// Check the length of adding two illegal instructions one after the other. We
+// should find that only one element is added for each illegal range.
+TEST(IRInstructionMapper, RepeatedIllegalLength) {
+  StringRef ModuleString = R"(
+                          define i32 @f(i32 %a, i32 %b) {
+                          bb0:
+                             %0 = add i32 %a, %b
+                             %1 = mul i32 %a, %b
+                             %2 = call i32 @f(i32 %a, i32 %b)
+                             %3 = call i32 @f(i32 %a, i32 %b)
+                             %4 = add i32 %a, %b
+                             %5 = mul i32 %a, %b
+                             ret i32 0
+                          })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  getVectors(*M, InstrList, UnsignedVec);
+
+  // Check that the size of the unsigned vector and the instruction list are
+  // the same as a safety check.
+  ASSERT_TRUE(InstrList.size() == UnsignedVec.size());
+
+  // Make sure that the unsigned vector is the expected size.
+ ASSERT_TRUE(UnsignedVec.size() == 6); +} diff --git a/llvm/unittests/Analysis/MemorySSATest.cpp b/llvm/unittests/Analysis/MemorySSATest.cpp index b470f16261263..5c0c48b788310 100644 --- a/llvm/unittests/Analysis/MemorySSATest.cpp +++ b/llvm/unittests/Analysis/MemorySSATest.cpp @@ -1066,7 +1066,7 @@ TEST_F(MemorySSATest, TestStoreMustAlias) { MemoryDef *MemDef = dyn_cast_or_null(MSSA.getMemoryAccess(V)); EXPECT_EQ(MemDef->isOptimized(), false) << "Store " << I << " is optimized from the start?"; - EXPECT_EQ(MemDef->getOptimizedAccessType(), MayAlias) + EXPECT_EQ(MemDef->getOptimizedAccessType(), None) << "Store " << I << " has correct alias information before being optimized?"; if (V == SA1) @@ -1170,7 +1170,7 @@ TEST_F(MemorySSATest, TestStoreMayAlias) { MemoryDef *MemDef = dyn_cast_or_null(MSSA.getMemoryAccess(V)); EXPECT_EQ(MemDef->isOptimized(), false) << "Store " << I << " is optimized from the start?"; - EXPECT_EQ(MemDef->getOptimizedAccessType(), MayAlias) + EXPECT_EQ(MemDef->getOptimizedAccessType(), None) << "Store " << I << " has correct alias information before being optimized?"; ++I; diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp index 3df5dc1fb82d4..c45bca1c53bf7 100644 --- a/llvm/unittests/Analysis/ValueTrackingTest.cpp +++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp @@ -10,6 +10,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/ConstantRange.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" @@ -716,12 +717,57 @@ TEST(ValueTracking, propagatesPoison) { for (auto &I : BB) { if (isa(&I)) break; - EXPECT_EQ(propagatesPoison(&I), Data[Index].first) + EXPECT_EQ(propagatesPoison(cast(&I)), Data[Index].first) << "Incorrect answer at instruction " << Index << " = " << I; Index++; } } +TEST_F(ValueTrackingTest, programUndefinedIfPoison) { + parseAssembly("declare i32 @any_num()" + "define void @test(i32 %mask) {\n" + " %A = call i32 @any_num()\n" + " %B = or i32 %A, %mask\n" + " udiv i32 1, %B" + " ret void\n" + "}\n"); + // If %A was poison, udiv raises UB regardless of %mask's value + EXPECT_EQ(programUndefinedIfPoison(A), true); +} + +TEST_F(ValueTrackingTest, programUndefinedIfUndefOrPoison) { + parseAssembly("declare i32 @any_num()" + "define void @test(i32 %mask) {\n" + " %A = call i32 @any_num()\n" + " %B = or i32 %A, %mask\n" + " udiv i32 1, %B" + " ret void\n" + "}\n"); + // If %A was undef and %mask was 1, udiv does not raise UB + EXPECT_EQ(programUndefinedIfUndefOrPoison(A), false); +} + +TEST_F(ValueTrackingTest, isGuaranteedNotToBePoison_exploitBranchCond) { + parseAssembly("declare i1 @any_bool()" + "define void @test(i1 %y) {\n" + " %A = call i1 @any_bool()\n" + " %cond = and i1 %A, %y\n" + " br i1 %cond, label %BB1, label %BB2\n" + "BB1:\n" + " ret void\n" + "BB2:\n" + " ret void\n" + "}\n"); + DominatorTree DT(*F); + for (auto &BB : *F) { + if (&BB == &F->getEntryBlock()) + continue; + + EXPECT_EQ(isGuaranteedNotToBePoison(A, BB.getTerminator(), &DT), true) + << "isGuaranteedNotToBePoison does not hold at " << *BB.getTerminator(); + } +} + TEST(ValueTracking, canCreatePoisonOrUndef) { std::string AsmHead = "declare i32 @g(i32)\n" @@ -1013,6 +1059,24 @@ TEST_F(ComputeKnownBitsTest, ComputeKnownBitsPtrToIntZext) { EXPECT_EQ(Known.One.getZExtValue(), 0u); } +TEST_F(ComputeKnownBitsTest, ComputeKnownBitsFreeze) { + parseAssembly("define void @test() {\n" + " %m = call i32 
@any_num()\n" + " %A = freeze i32 %m\n" + " %n = and i32 %m, 31\n" + " %c = icmp eq i32 %n, 0\n" + " call void @llvm.assume(i1 %c)\n" + " ret void\n" + "}\n" + "declare void @llvm.assume(i1)\n" + "declare i32 @any_num()\n"); + AssumptionCache AC(*F); + KnownBits Known = computeKnownBits(A, M->getDataLayout(), /* Depth */ 0, &AC, + F->front().getTerminator()); + EXPECT_EQ(Known.Zero.getZExtValue(), 31u); + EXPECT_EQ(Known.One.getZExtValue(), 0u); +} + class IsBytewiseValueTest : public ValueTrackingTest, public ::testing::WithParamInterface< std::pair> { diff --git a/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp b/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp new file mode 100644 index 0000000000000..5c53f39fd9a3e --- /dev/null +++ b/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp @@ -0,0 +1,370 @@ +//===- llvm/unittest/CodeGen/AsmPrinterDwarfTest.cpp ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "TestAsmPrinter.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/Testing/Support/Error.h" + +using namespace llvm; +using testing::_; +using testing::InSequence; +using testing::SaveArg; + +namespace { + +class AsmPrinterFixtureBase : public testing::Test { + void setupTestPrinter(const std::string &TripleStr, unsigned DwarfVersion, + dwarf::DwarfFormat DwarfFormat) { + auto ExpectedTestPrinter = + TestAsmPrinter::create(TripleStr, DwarfVersion, DwarfFormat); + ASSERT_THAT_EXPECTED(ExpectedTestPrinter, Succeeded()); + TestPrinter = std::move(ExpectedTestPrinter.get()); + } + +protected: + bool init(const std::string &TripleStr, unsigned DwarfVersion, + dwarf::DwarfFormat DwarfFormat) { + setupTestPrinter(TripleStr, DwarfVersion, DwarfFormat); + return TestPrinter != nullptr; + } + + std::unique_ptr TestPrinter; +}; + +class AsmPrinterEmitDwarfSymbolReferenceTest : public AsmPrinterFixtureBase { +protected: + bool init(const std::string &TripleStr, unsigned DwarfVersion, + dwarf::DwarfFormat DwarfFormat) { + if (!AsmPrinterFixtureBase::init(TripleStr, DwarfVersion, DwarfFormat)) + return false; + + // Create a symbol which will be emitted in the tests and associate it + // with a section because that is required in some code paths. 
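An aside on the mocking pattern used throughout the DWARF emission tests below: the fixture wraps the AsmPrinter around a gmock-mocked MCStreamer, and each test either expects a concrete streamer call outright, or captures the MCExpr argument with SaveArg and inspects it afterwards. A condensed sketch of the capture variant, with `MockStreamer` standing in for the mock the fixture exposes via getMS():

// Capture the expression the AsmPrinter hands to the streamer.
const MCExpr *Captured = nullptr;
EXPECT_CALL(MockStreamer, emitValueImpl(testing::_, /*Size=*/4, testing::_))
    .WillOnce(testing::SaveArg<0>(&Captured));
// ...exercise the AsmPrinter, then downcast and check the referenced symbol:
const auto *Ref = dyn_cast_or_null<MCSymbolRefExpr>(Captured);
ASSERT_NE(Ref, nullptr);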
+ + Val = TestPrinter->getCtx().createTempSymbol(); + Sec = TestPrinter->getCtx().getELFSection(".tst", ELF::SHT_PROGBITS, 0); + SecBeginSymbol = Sec->getBeginSymbol(); + TestPrinter->getMS().SwitchSection(Sec); + TestPrinter->getMS().emitLabel(Val); + return true; + } + + MCSymbol *Val = nullptr; + MCSection *Sec = nullptr; + MCSymbol *SecBeginSymbol = nullptr; +}; + +TEST_F(AsmPrinterEmitDwarfSymbolReferenceTest, COFF) { + if (!init("x86_64-pc-windows", /*DwarfVersion=*/4, dwarf::DWARF32)) + return; + + EXPECT_CALL(TestPrinter->getMS(), EmitCOFFSecRel32(Val, 0)); + TestPrinter->getAP()->emitDwarfSymbolReference(Val, false); +} + +TEST_F(AsmPrinterEmitDwarfSymbolReferenceTest, COFFForceOffset) { + if (!init("x86_64-pc-windows", /*DwarfVersion=*/4, dwarf::DWARF32)) + return; + + EXPECT_CALL(TestPrinter->getMS(), + emitAbsoluteSymbolDiff(Val, SecBeginSymbol, 4)); + TestPrinter->getAP()->emitDwarfSymbolReference(Val, true); +} + +TEST_F(AsmPrinterEmitDwarfSymbolReferenceTest, ELFDWARF32) { + if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF32)) + return; + + const MCExpr *Arg0 = nullptr; + EXPECT_CALL(TestPrinter->getMS(), emitValueImpl(_, 4, _)) + .WillOnce(SaveArg<0>(&Arg0)); + TestPrinter->getAP()->emitDwarfSymbolReference(Val, false); + + const MCSymbolRefExpr *ActualArg0 = dyn_cast_or_null(Arg0); + ASSERT_NE(ActualArg0, nullptr); + EXPECT_EQ(&(ActualArg0->getSymbol()), Val); +} + +TEST_F(AsmPrinterEmitDwarfSymbolReferenceTest, ELFDWARF32ForceOffset) { + if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF32)) + return; + + EXPECT_CALL(TestPrinter->getMS(), + emitAbsoluteSymbolDiff(Val, SecBeginSymbol, 4)); + TestPrinter->getAP()->emitDwarfSymbolReference(Val, true); +} + +TEST_F(AsmPrinterEmitDwarfSymbolReferenceTest, ELFDWARF64) { + if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF64)) + return; + + const MCExpr *Arg0 = nullptr; + EXPECT_CALL(TestPrinter->getMS(), emitValueImpl(_, 8, _)) + .WillOnce(SaveArg<0>(&Arg0)); + TestPrinter->getAP()->emitDwarfSymbolReference(Val, false); + + const MCSymbolRefExpr *ActualArg0 = dyn_cast_or_null(Arg0); + ASSERT_NE(ActualArg0, nullptr); + EXPECT_EQ(&(ActualArg0->getSymbol()), Val); +} + +TEST_F(AsmPrinterEmitDwarfSymbolReferenceTest, ELFDWARF64ForceOffset) { + if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF64)) + return; + + EXPECT_CALL(TestPrinter->getMS(), + emitAbsoluteSymbolDiff(Val, SecBeginSymbol, 8)); + TestPrinter->getAP()->emitDwarfSymbolReference(Val, true); +} + +class AsmPrinterEmitDwarfStringOffsetTest : public AsmPrinterFixtureBase { +protected: + bool init(const std::string &TripleStr, unsigned DwarfVersion, + dwarf::DwarfFormat DwarfFormat) { + if (!AsmPrinterFixtureBase::init(TripleStr, DwarfVersion, DwarfFormat)) + return false; + + Val.Index = DwarfStringPoolEntry::NotIndexed; + Val.Symbol = TestPrinter->getCtx().createTempSymbol(); + Val.Offset = 42; + return true; + } + + DwarfStringPoolEntry Val; +}; + +TEST_F(AsmPrinterEmitDwarfStringOffsetTest, DWARF32) { + if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF32)) + return; + + const MCExpr *Arg0 = nullptr; + EXPECT_CALL(TestPrinter->getMS(), emitValueImpl(_, 4, _)) + .WillOnce(SaveArg<0>(&Arg0)); + TestPrinter->getAP()->emitDwarfStringOffset(Val); + + const MCSymbolRefExpr *ActualArg0 = dyn_cast_or_null(Arg0); + ASSERT_NE(ActualArg0, nullptr); + EXPECT_EQ(&(ActualArg0->getSymbol()), Val.Symbol); +} + +TEST_F(AsmPrinterEmitDwarfStringOffsetTest, + DWARF32NoRelocationsAcrossSections) { + if (!init("x86_64-pc-linux", 
/*DwarfVersion=*/4, dwarf::DWARF32)) + return; + + TestPrinter->setDwarfUsesRelocationsAcrossSections(false); + EXPECT_CALL(TestPrinter->getMS(), emitIntValue(Val.Offset, 4)); + TestPrinter->getAP()->emitDwarfStringOffset(Val); +} + +TEST_F(AsmPrinterEmitDwarfStringOffsetTest, DWARF64) { + if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF64)) + return; + + const MCExpr *Arg0 = nullptr; + EXPECT_CALL(TestPrinter->getMS(), emitValueImpl(_, 8, _)) + .WillOnce(SaveArg<0>(&Arg0)); + TestPrinter->getAP()->emitDwarfStringOffset(Val); + + const MCSymbolRefExpr *ActualArg0 = dyn_cast_or_null(Arg0); + ASSERT_NE(ActualArg0, nullptr); + EXPECT_EQ(&(ActualArg0->getSymbol()), Val.Symbol); +} + +TEST_F(AsmPrinterEmitDwarfStringOffsetTest, + DWARF64NoRelocationsAcrossSections) { + if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF64)) + return; + + TestPrinter->setDwarfUsesRelocationsAcrossSections(false); + EXPECT_CALL(TestPrinter->getMS(), emitIntValue(Val.Offset, 8)); + TestPrinter->getAP()->emitDwarfStringOffset(Val); +} + +class AsmPrinterEmitDwarfOffsetTest : public AsmPrinterFixtureBase { +protected: + bool init(const std::string &TripleStr, unsigned DwarfVersion, + dwarf::DwarfFormat DwarfFormat) { + if (!AsmPrinterFixtureBase::init(TripleStr, DwarfVersion, DwarfFormat)) + return false; + + Label = TestPrinter->getCtx().createTempSymbol(); + return true; + } + + MCSymbol *Label = nullptr; + uint64_t Offset = 42; +}; + +TEST_F(AsmPrinterEmitDwarfOffsetTest, DWARF32) { + if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF32)) + return; + + const MCExpr *Arg0 = nullptr; + EXPECT_CALL(TestPrinter->getMS(), emitValueImpl(_, 4, _)) + .WillOnce(SaveArg<0>(&Arg0)); + TestPrinter->getAP()->emitDwarfOffset(Label, Offset); + + const MCBinaryExpr *ActualArg0 = dyn_cast_or_null(Arg0); + ASSERT_NE(ActualArg0, nullptr); + EXPECT_EQ(ActualArg0->getOpcode(), MCBinaryExpr::Add); + + const MCSymbolRefExpr *ActualLHS = + dyn_cast_or_null(ActualArg0->getLHS()); + ASSERT_NE(ActualLHS, nullptr); + EXPECT_EQ(&(ActualLHS->getSymbol()), Label); + + const MCConstantExpr *ActualRHS = + dyn_cast_or_null(ActualArg0->getRHS()); + ASSERT_NE(ActualRHS, nullptr); + EXPECT_EQ(static_cast(ActualRHS->getValue()), Offset); +} + +TEST_F(AsmPrinterEmitDwarfOffsetTest, DWARF64) { + if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF64)) + return; + + const MCExpr *Arg0 = nullptr; + EXPECT_CALL(TestPrinter->getMS(), emitValueImpl(_, 8, _)) + .WillOnce(SaveArg<0>(&Arg0)); + TestPrinter->getAP()->emitDwarfOffset(Label, Offset); + + const MCBinaryExpr *ActualArg0 = dyn_cast_or_null(Arg0); + ASSERT_NE(ActualArg0, nullptr); + EXPECT_EQ(ActualArg0->getOpcode(), MCBinaryExpr::Add); + + const MCSymbolRefExpr *ActualLHS = + dyn_cast_or_null(ActualArg0->getLHS()); + ASSERT_NE(ActualLHS, nullptr); + EXPECT_EQ(&(ActualLHS->getSymbol()), Label); + + const MCConstantExpr *ActualRHS = + dyn_cast_or_null(ActualArg0->getRHS()); + ASSERT_NE(ActualRHS, nullptr); + EXPECT_EQ(static_cast(ActualRHS->getValue()), Offset); +} + +class AsmPrinterEmitDwarfLengthOrOffsetTest : public AsmPrinterFixtureBase { +protected: + uint64_t Val = 42; +}; + +TEST_F(AsmPrinterEmitDwarfLengthOrOffsetTest, DWARF32) { + if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF32)) + return; + + EXPECT_CALL(TestPrinter->getMS(), emitIntValue(Val, 4)); + TestPrinter->getAP()->emitDwarfLengthOrOffset(Val); +} + +TEST_F(AsmPrinterEmitDwarfLengthOrOffsetTest, DWARF64) { + if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF64)) + 
return; + + EXPECT_CALL(TestPrinter->getMS(), emitIntValue(Val, 8)); + TestPrinter->getAP()->emitDwarfLengthOrOffset(Val); +} + +class AsmPrinterGetUnitLengthFieldByteSizeTest : public AsmPrinterFixtureBase { +}; + +TEST_F(AsmPrinterGetUnitLengthFieldByteSizeTest, DWARF32) { + if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF32)) + return; + + EXPECT_EQ(TestPrinter->getAP()->getUnitLengthFieldByteSize(), 4u); +} + +TEST_F(AsmPrinterGetUnitLengthFieldByteSizeTest, DWARF64) { + if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF64)) + return; + + EXPECT_EQ(TestPrinter->getAP()->getUnitLengthFieldByteSize(), 12u); +} + +class AsmPrinterMaybeEmitDwarf64MarkTest : public AsmPrinterFixtureBase {}; + +TEST_F(AsmPrinterMaybeEmitDwarf64MarkTest, DWARF32) { + if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF32)) + return; + + EXPECT_CALL(TestPrinter->getMS(), emitIntValue(_, _)).Times(0); + TestPrinter->getAP()->maybeEmitDwarf64Mark(); +} + +TEST_F(AsmPrinterMaybeEmitDwarf64MarkTest, DWARF64) { + if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF64)) + return; + + EXPECT_CALL(TestPrinter->getMS(), emitIntValue(dwarf::DW_LENGTH_DWARF64, 4)); + TestPrinter->getAP()->maybeEmitDwarf64Mark(); +} + +class AsmPrinterEmitDwarfUnitLengthAsIntTest : public AsmPrinterFixtureBase { +protected: + uint64_t Val = 42; +}; + +TEST_F(AsmPrinterEmitDwarfUnitLengthAsIntTest, DWARF32) { + if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF32)) + return; + + EXPECT_CALL(TestPrinter->getMS(), emitIntValue(Val, 4)); + TestPrinter->getAP()->emitDwarfUnitLength(Val, ""); +} + +TEST_F(AsmPrinterEmitDwarfUnitLengthAsIntTest, DWARF64) { + if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF64)) + return; + + InSequence S; + EXPECT_CALL(TestPrinter->getMS(), emitIntValue(dwarf::DW_LENGTH_DWARF64, 4)); + EXPECT_CALL(TestPrinter->getMS(), emitIntValue(Val, 8)); + + TestPrinter->getAP()->emitDwarfUnitLength(Val, ""); +} + +class AsmPrinterEmitDwarfUnitLengthAsHiLoDiffTest + : public AsmPrinterFixtureBase { +protected: + bool init(const std::string &TripleStr, unsigned DwarfVersion, + dwarf::DwarfFormat DwarfFormat) { + if (!AsmPrinterFixtureBase::init(TripleStr, DwarfVersion, DwarfFormat)) + return false; + + Hi = TestPrinter->getCtx().createTempSymbol(); + Lo = TestPrinter->getCtx().createTempSymbol(); + return true; + } + + MCSymbol *Hi = nullptr; + MCSymbol *Lo = nullptr; +}; + +TEST_F(AsmPrinterEmitDwarfUnitLengthAsHiLoDiffTest, DWARF32) { + if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF32)) + return; + + EXPECT_CALL(TestPrinter->getMS(), emitAbsoluteSymbolDiff(Hi, Lo, 4)); + TestPrinter->getAP()->emitDwarfUnitLength(Hi, Lo, ""); +} + +TEST_F(AsmPrinterEmitDwarfUnitLengthAsHiLoDiffTest, DWARF64) { + if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF64)) + return; + + InSequence S; + EXPECT_CALL(TestPrinter->getMS(), emitIntValue(dwarf::DW_LENGTH_DWARF64, 4)); + EXPECT_CALL(TestPrinter->getMS(), emitAbsoluteSymbolDiff(Hi, Lo, 8)); + + TestPrinter->getAP()->emitDwarfUnitLength(Hi, Lo, ""); +} + +} // end namespace diff --git a/llvm/unittests/CodeGen/CMakeLists.txt b/llvm/unittests/CodeGen/CMakeLists.txt index 831eb66e82cf4..817ddb1bbf26c 100644 --- a/llvm/unittests/CodeGen/CMakeLists.txt +++ b/llvm/unittests/CodeGen/CMakeLists.txt @@ -15,7 +15,9 @@ set(LLVM_LINK_COMPONENTS add_llvm_unittest(CodeGenTests AArch64SelectionDAGTest.cpp + AsmPrinterDwarfTest.cpp DIEHashTest.cpp + DIETest.cpp LowLevelTypeTest.cpp LexicalScopesTest.cpp 
   MachineInstrBundleIteratorTest.cpp
@@ -25,6 +27,9 @@ add_llvm_unittest(CodeGenTests
   ScalableVectorMVTsTest.cpp
   TypeTraitsTest.cpp
   TargetOptionsTest.cpp
+  TestAsmPrinter.cpp
   )
 
 add_subdirectory(GlobalISel)
+
+target_link_libraries(CodeGenTests PRIVATE LLVMTestingSupport)
diff --git a/llvm/unittests/CodeGen/DIEHashTest.cpp b/llvm/unittests/CodeGen/DIEHashTest.cpp
index 649e13208f0c1..03bb7de5a0ae1 100644
--- a/llvm/unittests/CodeGen/DIEHashTest.cpp
+++ b/llvm/unittests/CodeGen/DIEHashTest.cpp
@@ -7,12 +7,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "../lib/CodeGen/AsmPrinter/DIEHash.h"
+#include "TestAsmPrinter.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/CodeGen/DIE.h"
 #include "llvm/CodeGen/DwarfStringPoolEntry.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Testing/Support/Error.h"
 #include "gtest/gtest.h"
 
 using namespace llvm;
@@ -26,6 +29,14 @@ class DIEHashTest : public testing::Test {
 private:
   StringMap<DwarfStringPoolEntry> Pool;
+  std::unique_ptr<TestAsmPrinter> TestPrinter;
+
+  void setupTestPrinter() {
+    auto ExpectedTestPrinter = TestAsmPrinter::create(
+        sys::getDefaultTargetTriple(), /*DwarfVersion=*/4, dwarf::DWARF32);
+    ASSERT_THAT_EXPECTED(ExpectedTestPrinter, Succeeded());
+    TestPrinter = std::move(ExpectedTestPrinter.get());
+  }
 
 public:
   DIEString getString(StringRef S) {
     return DIEString(DwarfStringPoolEntryRef(
         *Pool.insert(std::make_pair(S, Entry)).first, Entry.isIndexed()));
   }
+
+  AsmPrinter *getAsmPrinter() {
+    if (!TestPrinter)
+      setupTestPrinter();
+    return TestPrinter ? TestPrinter->getAP() : nullptr;
+  }
 };
 
 TEST_F(DIEHashTest, Data1) {
@@ -644,6 +661,10 @@ TEST_F(DIEHashTest, MemberSdata) {
 // };
 // A a;
 TEST_F(DIEHashTest, MemberBlock) {
+  if (!this->getAsmPrinter())
+    // TODO: Use GTEST_SKIP() when GTest is updated to version 1.10.0
+    return;
+
   DIE &A = *DIE::get(Alloc, dwarf::DW_TAG_structure_type);
   DIEInteger One(1);
   DIEString AStr = getString("A");
@@ -692,7 +713,7 @@
   A.addChild(std::move(PI));
 
-  uint64_t MD5Res = DIEHash().computeTypeSignature(A);
+  uint64_t MD5Res = DIEHash(this->getAsmPrinter()).computeTypeSignature(A);
   ASSERT_EQ(0x493af53ad3d3f651ULL, MD5Res);
 }
 }
diff --git a/llvm/unittests/CodeGen/DIETest.cpp b/llvm/unittests/CodeGen/DIETest.cpp
new file mode 100644
index 0000000000000..44fb0c0bf6c88
--- /dev/null
+++ b/llvm/unittests/CodeGen/DIETest.cpp
@@ -0,0 +1,193 @@
+//===- llvm/unittest/CodeGen/DIETest.cpp ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/DIE.h"
+#include "TestAsmPrinter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Testing/Support/Error.h"
+
+using namespace llvm;
+using testing::_;
+using testing::SaveArg;
+
+namespace {
+
+using DIETestParams =
+    std::tuple<unsigned, dwarf::DwarfFormat, dwarf::Form, unsigned>;
+
+class DIEFixtureBase : public testing::TestWithParam<DIETestParams> {
+protected:
+  void SetUp() override {
+    unsigned Version;
+    dwarf::DwarfFormat Format;
+    std::tie(Version, Format, Form, Size) = GetParam();
+    auto ExpectedTestPrinter =
+        TestAsmPrinter::create("x86_64-pc-linux", Version, Format);
+    ASSERT_THAT_EXPECTED(ExpectedTestPrinter, Succeeded());
+    TestPrinter = std::move(ExpectedTestPrinter.get());
+  }
+
+  dwarf::Form Form;
+  unsigned Size;
+  std::unique_ptr<TestAsmPrinter> TestPrinter;
+};
+
+struct DIEExprFixture : public DIEFixtureBase {
+  void SetUp() override {
+    DIEFixtureBase::SetUp();
+    if (!TestPrinter)
+      return;
+
+    Val = MCConstantExpr::create(42, TestPrinter->getCtx());
+  }
+
+  const MCExpr *Val = nullptr;
+};
+
+TEST_P(DIEExprFixture, SizeOf) {
+  if (!TestPrinter)
+    return;
+
+  DIEExpr Tst(Val);
+  EXPECT_EQ(Size, Tst.SizeOf(TestPrinter->getAP(), Form));
+}
+
+TEST_P(DIEExprFixture, EmitValue) {
+  if (!TestPrinter)
+    return;
+
+  DIEExpr Tst(Val);
+  EXPECT_CALL(TestPrinter->getMS(), emitValueImpl(Val, Size, _));
+  Tst.emitValue(TestPrinter->getAP(), Form);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DIETestParams, DIEExprFixture,
+    testing::Values(
+        DIETestParams{4, dwarf::DWARF32, dwarf::DW_FORM_data4, 4u},
+        DIETestParams{4, dwarf::DWARF32, dwarf::DW_FORM_data8, 8u},
+        DIETestParams{4, dwarf::DWARF32, dwarf::DW_FORM_sec_offset, 4u},
+        DIETestParams{4, dwarf::DWARF64, dwarf::DW_FORM_data4, 4u},
+        DIETestParams{4, dwarf::DWARF64, dwarf::DW_FORM_data8, 8u},
+        DIETestParams{4, dwarf::DWARF64, dwarf::DW_FORM_sec_offset, 8u}), );
+
+struct DIELabelFixture : public DIEFixtureBase {
+  void SetUp() override {
+    DIEFixtureBase::SetUp();
+    if (!TestPrinter)
+      return;
+
+    Val = TestPrinter->getCtx().createTempSymbol();
+  }
+
+  const MCSymbol *Val = nullptr;
+};
+
+TEST_P(DIELabelFixture, SizeOf) {
+  if (!TestPrinter)
+    return;
+
+  DIELabel Tst(Val);
+  EXPECT_EQ(Size, Tst.SizeOf(TestPrinter->getAP(), Form));
+}
+
+TEST_P(DIELabelFixture, EmitValue) {
+  if (!TestPrinter)
+    return;
+
+  DIELabel Tst(Val);
+
+  const MCExpr *Arg0 = nullptr;
+  EXPECT_CALL(TestPrinter->getMS(), emitValueImpl(_, Size, _))
+      .WillOnce(SaveArg<0>(&Arg0));
+  Tst.emitValue(TestPrinter->getAP(), Form);
+
+  const MCSymbolRefExpr *ActualArg0 = dyn_cast_or_null<MCSymbolRefExpr>(Arg0);
+  ASSERT_NE(ActualArg0, nullptr);
+  EXPECT_EQ(&(ActualArg0->getSymbol()), Val);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DIETestParams, DIELabelFixture,
+    testing::Values(
+        DIETestParams{4, dwarf::DWARF32, dwarf::DW_FORM_data4, 4u},
+        DIETestParams{4, dwarf::DWARF32, dwarf::DW_FORM_data8, 8u},
+        DIETestParams{4, dwarf::DWARF32, dwarf::DW_FORM_sec_offset, 4u},
+        DIETestParams{4, dwarf::DWARF32, dwarf::DW_FORM_strp, 4u},
+        DIETestParams{4, dwarf::DWARF32, dwarf::DW_FORM_addr, 8u},
+        DIETestParams{4, dwarf::DWARF64, dwarf::DW_FORM_data4, 4u},
+        DIETestParams{4, dwarf::DWARF64, dwarf::DW_FORM_data8, 8u},
+        DIETestParams{4, dwarf::DWARF64, dwarf::DW_FORM_sec_offset, 8u},
+        DIETestParams{4, dwarf::DWARF64, dwarf::DW_FORM_strp, 8u},
+        DIETestParams{4, dwarf::DWARF64, dwarf::DW_FORM_addr, 8u}), );
+
+struct DIEDeltaFixture : public DIEFixtureBase {
+  void SetUp() override {
+ DIEFixtureBase::SetUp(); + if (!TestPrinter) + return; + + Hi = TestPrinter->getCtx().createTempSymbol(); + Lo = TestPrinter->getCtx().createTempSymbol(); + } + + const MCSymbol *Hi = nullptr; + const MCSymbol *Lo = nullptr; +}; + +TEST_P(DIEDeltaFixture, SizeOf) { + if (!TestPrinter) + return; + + DIEDelta Tst(Hi, Lo); + EXPECT_EQ(Size, Tst.SizeOf(TestPrinter->getAP(), Form)); +} + +TEST_P(DIEDeltaFixture, EmitValue) { + if (!TestPrinter) + return; + + DIEDelta Tst(Hi, Lo); + EXPECT_CALL(TestPrinter->getMS(), emitAbsoluteSymbolDiff(Hi, Lo, Size)); + Tst.emitValue(TestPrinter->getAP(), Form); +} + +INSTANTIATE_TEST_CASE_P( + DIETestParams, DIEDeltaFixture, + testing::Values( + DIETestParams{4, dwarf::DWARF32, dwarf::DW_FORM_data4, 4u}, + DIETestParams{4, dwarf::DWARF32, dwarf::DW_FORM_data8, 8u}, + DIETestParams{4, dwarf::DWARF32, dwarf::DW_FORM_sec_offset, 4u}, + DIETestParams{4, dwarf::DWARF64, dwarf::DW_FORM_data4, 4u}, + DIETestParams{4, dwarf::DWARF64, dwarf::DW_FORM_data8, 8u}, + DIETestParams{4, dwarf::DWARF64, dwarf::DW_FORM_sec_offset, 8u}), ); + +struct DIELocListFixture : public DIEFixtureBase { + void SetUp() override { DIEFixtureBase::SetUp(); } +}; + +TEST_P(DIELocListFixture, SizeOf) { + if (!TestPrinter) + return; + + DIELocList Tst(999); + EXPECT_EQ(Size, Tst.SizeOf(TestPrinter->getAP(), Form)); +} + +INSTANTIATE_TEST_CASE_P( + DIETestParams, DIELocListFixture, + testing::Values( + DIETestParams{4, dwarf::DWARF32, dwarf::DW_FORM_loclistx, 2u}, + DIETestParams{4, dwarf::DWARF32, dwarf::DW_FORM_data4, 4u}, + DIETestParams{4, dwarf::DWARF32, dwarf::DW_FORM_sec_offset, 4u}, + DIETestParams{4, dwarf::DWARF64, dwarf::DW_FORM_loclistx, 2u}, + DIETestParams{4, dwarf::DWARF64, dwarf::DW_FORM_data8, 8u}, + DIETestParams{4, dwarf::DWARF64, dwarf::DW_FORM_sec_offset, 8u}), ); + +} // end namespace diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp index 7fd2ea453a2ac..ac9112fe5aa49 100644 --- a/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp @@ -406,3 +406,13 @@ TEST(LegalizerInfoTest, MMOAlignment) { 32, 8, AtomicOrdering::NotAtomic })); } } + +// This code sequence doesn't do anything, but it covers a previously uncovered +// codepath that used to crash in MSVC x86_32 debug mode. +TEST(LegalizerInfoTest, MSVCDebugMiscompile) { + const LLT S1 = LLT::scalar(1); + const LLT P0 = LLT::pointer(0, 32); + LegalizerInfo LI; + auto Builder = LI.getActionDefinitionsBuilder(TargetOpcode::G_PTRTOINT); + (void)Builder.legalForCartesianProduct({S1}, {P0}); +} diff --git a/llvm/unittests/CodeGen/TestAsmPrinter.cpp b/llvm/unittests/CodeGen/TestAsmPrinter.cpp new file mode 100644 index 0000000000000..7d04202067689 --- /dev/null +++ b/llvm/unittests/CodeGen/TestAsmPrinter.cpp @@ -0,0 +1,88 @@ +//===--- unittests/CodeGen/TestAsmPrinter.cpp -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TestAsmPrinter.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+using ::testing::StrictMock;
+
+// Note: a non-const reference argument cannot be passed through
+// testing::StrictMock, thus, we pass a pointer and dereference it here.
+MockMCStreamer::MockMCStreamer(MCContext *Ctx) : MCStreamer(*Ctx) {}
+
+MockMCStreamer::~MockMCStreamer() = default;
+
+TestAsmPrinter::TestAsmPrinter() = default;
+
+TestAsmPrinter::~TestAsmPrinter() = default;
+
+llvm::Expected<std::unique_ptr<TestAsmPrinter>>
+TestAsmPrinter::create(const std::string &TripleStr, uint16_t DwarfVersion,
+                       dwarf::DwarfFormat DwarfFormat) {
+  std::string ErrorStr;
+  const Target *TheTarget = TargetRegistry::lookupTarget(TripleStr, ErrorStr);
+  if (!TheTarget)
+    return std::unique_ptr<TestAsmPrinter>();
+
+  std::unique_ptr<TestAsmPrinter> TestPrinter(new TestAsmPrinter);
+  if (llvm::Error E =
+          TestPrinter->init(TheTarget, TripleStr, DwarfVersion, DwarfFormat))
+    return std::move(E);
+
+  return std::move(TestPrinter);
+}
+
+// Note: based on dwarfgen::Generator::init() from
+// llvm/unittests/DebugInfo/DWARF/DwarfGenerator.cpp
+llvm::Error TestAsmPrinter::init(const Target *TheTarget, StringRef TripleName,
+                                 uint16_t DwarfVersion,
+                                 dwarf::DwarfFormat DwarfFormat) {
+  TM.reset(TheTarget->createTargetMachine(TripleName, "", "", TargetOptions(),
+                                          None));
+  if (!TM)
+    return make_error<StringError>("no target machine for target " + TripleName,
+                                   inconvertibleErrorCode());
+
+  MC.reset(new MCContext(TM->getMCAsmInfo(), TM->getMCRegisterInfo(),
+                         TM->getObjFileLowering()));
+  TM->getObjFileLowering()->Initialize(*MC, *TM);
+
+  MS = new StrictMock<MockMCStreamer>(MC.get());
+
+  Asm.reset(
+      TheTarget->createAsmPrinter(*TM, std::unique_ptr<MCStreamer>(MS)));
+  if (!Asm)
+    return make_error<StringError>("no asm printer for target " + TripleName,
+                                   inconvertibleErrorCode());
+
+  // Set the DWARF version correctly on all classes that we use.
+  MC->setDwarfVersion(DwarfVersion);
+  Asm->setDwarfVersion(DwarfVersion);
+
+  // Set the DWARF format.
+  MC->setDwarfFormat(DwarfFormat);
+
+  return Error::success();
+}
+
+void TestAsmPrinter::setDwarfUsesRelocationsAcrossSections(bool Enable) {
+  struct HackMCAsmInfo : MCAsmInfo {
+    void setDwarfUsesRelocationsAcrossSections(bool Enable) {
+      DwarfUsesRelocationsAcrossSections = Enable;
+    }
+  };
+  static_cast<HackMCAsmInfo *>(const_cast<MCAsmInfo *>(TM->getMCAsmInfo()))
+      ->setDwarfUsesRelocationsAcrossSections(Enable);
+}
diff --git a/llvm/unittests/CodeGen/TestAsmPrinter.h b/llvm/unittests/CodeGen/TestAsmPrinter.h
new file mode 100644
index 0000000000000..65e557b9b4a60
--- /dev/null
+++ b/llvm/unittests/CodeGen/TestAsmPrinter.h
@@ -0,0 +1,82 @@
+//===--- unittests/CodeGen/TestAsmPrinter.h ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_UNITTESTS_CODEGEN_TESTASMPRINTER_H
+#define LLVM_UNITTESTS_CODEGEN_TESTASMPRINTER_H
+
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/MC/MCStreamer.h"
+#include "gmock/gmock.h"
+
+#include <memory>
+
+namespace llvm {
+class AsmPrinter;
+class MCContext;
+class Target;
+class TargetMachine;
+
+class MockMCStreamer : public MCStreamer {
+public:
+  explicit MockMCStreamer(MCContext *Ctx);
+  ~MockMCStreamer();
+
+  // These methods are pure virtual in MCStreamer, thus, have to be overridden:
+
+  MOCK_METHOD2(emitSymbolAttribute,
+               bool(MCSymbol *Symbol, MCSymbolAttr Attribute));
+  MOCK_METHOD3(emitCommonSymbol,
+               void(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment));
+  MOCK_METHOD5(emitZerofill,
+               void(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
+                    unsigned ByteAlignment, SMLoc Loc));
+
+  // The following are mock methods to be used in tests.
+
+  MOCK_METHOD2(emitIntValue, void(uint64_t Value, unsigned Size));
+  MOCK_METHOD3(emitValueImpl,
+               void(const MCExpr *Value, unsigned Size, SMLoc Loc));
+  MOCK_METHOD3(emitAbsoluteSymbolDiff,
+               void(const MCSymbol *Hi, const MCSymbol *Lo, unsigned Size));
+  MOCK_METHOD2(EmitCOFFSecRel32, void(MCSymbol const *Symbol, uint64_t Offset));
+};
+
+class TestAsmPrinter {
+  std::unique_ptr<MCContext> MC;
+  MockMCStreamer *MS = nullptr; // Owned by AsmPrinter
+  std::unique_ptr<TargetMachine> TM;
+  std::unique_ptr<AsmPrinter> Asm;
+
+  /// Private constructor; call TestAsmPrinter::create(...)
+  /// to create an instance.
+  TestAsmPrinter();
+
+  /// Initialize an AsmPrinter instance with a mocked MCStreamer.
+  llvm::Error init(const Target *TheTarget, StringRef TripleStr,
+                   uint16_t DwarfVersion, dwarf::DwarfFormat DwarfFormat);
+
+public:
+  /// Create an AsmPrinter and accompanied objects.
+  /// Returns ErrorSuccess() with an empty value if the requested target is not
+  /// supported so that the corresponding test can be gracefully skipped.
+  static llvm::Expected<std::unique_ptr<TestAsmPrinter>>
+  create(const std::string &TripleStr, uint16_t DwarfVersion,
+         dwarf::DwarfFormat DwarfFormat);
+
+  ~TestAsmPrinter();
+
+  void setDwarfUsesRelocationsAcrossSections(bool Enable);
+
+  AsmPrinter *getAP() const { return Asm.get(); }
+  MCContext &getCtx() const { return *MC; }
+  MockMCStreamer &getMS() const { return *MS; }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_UNITTESTS_CODEGEN_TESTASMPRINTER_H
diff --git a/llvm/unittests/DebugInfo/CodeView/TypeHashingTest.cpp b/llvm/unittests/DebugInfo/CodeView/TypeHashingTest.cpp
index 8b9dc7ab285e9..b4501c36fd2b9 100644
--- a/llvm/unittests/DebugInfo/CodeView/TypeHashingTest.cpp
+++ b/llvm/unittests/DebugInfo/CodeView/TypeHashingTest.cpp
@@ -8,6 +8,7 @@
 
 #include "llvm/DebugInfo/CodeView/TypeHashing.h"
 #include "llvm/DebugInfo/CodeView/AppendingTypeTableBuilder.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "gtest/gtest.h"
 
diff --git a/llvm/unittests/DebugInfo/DWARF/DwarfGenerator.cpp b/llvm/unittests/DebugInfo/DWARF/DwarfGenerator.cpp
index 5d53c0d31bdf8..69746dd638ed9 100644
--- a/llvm/unittests/DebugInfo/DWARF/DwarfGenerator.cpp
+++ b/llvm/unittests/DebugInfo/DWARF/DwarfGenerator.cpp
@@ -504,7 +504,7 @@ llvm::Error dwarfgen::Generator::init(Triple TheTriple, uint16_t V) {
 StringRef dwarfgen::Generator::generate() {
   // Offset from the first CU in the debug info section is 0 initially.
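The TestAsmPrinter/MockMCStreamer harness above is what the AsmPrinterDwarfTest and DIETest cases are built on. As a quick orientation, here is a minimal sketch of a test written against it; the test name and the emitted value are invented, and the expectations simply mirror the DWARF64 cases earlier in this patch, where a unit length is the 4-byte 0xffffffff escape (DW_LENGTH_DWARF64) followed by an 8-byte length.

```cpp
// Illustrative gtest using the harness above (not part of the patch).
#include "TestAsmPrinter.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/Testing/Support/Error.h"
#include "gtest/gtest.h"

using namespace llvm;

TEST(TestAsmPrinterSketch, Dwarf64UnitLength) {
  auto ExpectedTP = TestAsmPrinter::create("x86_64-pc-linux",
                                           /*DwarfVersion=*/4, dwarf::DWARF64);
  ASSERT_THAT_EXPECTED(ExpectedTP, Succeeded());
  std::unique_ptr<TestAsmPrinter> TP = std::move(*ExpectedTP);
  if (!TP) // Empty value: the target is not compiled in; skip gracefully.
    return;

  // Two ordered emitIntValue() calls: the DWARF64 escape, then the length.
  ::testing::InSequence S;
  EXPECT_CALL(TP->getMS(), emitIntValue(dwarf::DW_LENGTH_DWARF64, 4));
  EXPECT_CALL(TP->getMS(), emitIntValue(42, 8));
  TP->getAP()->emitDwarfUnitLength(42, "");
}
```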
- unsigned SecOffset = 0; + uint64_t SecOffset = 0; // Iterate over each compile unit and set the size and offsets for each // DIE within each compile unit. All offsets are CU relative. diff --git a/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp b/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp index 2c008dfdbd33e..9a1dbbb172517 100644 --- a/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp @@ -35,12 +35,12 @@ TEST_F(CoreAPIsStandardTest, BasicSuccessfulLookup) { OnCompletionRun = true; }; - std::shared_ptr FooMR; + std::unique_ptr FooMR; cantFail(JD.define(std::make_unique( SymbolFlagsMap({{Foo, FooSym.getFlags()}}), - [&](MaterializationResponsibility R) { - FooMR = std::make_shared(std::move(R)); + [&](std::unique_ptr R) { + FooMR = std::move(R); }))); ES.lookup(LookupKind::Static, makeJITDylibSearchOrder(&JD), @@ -99,9 +99,9 @@ TEST_F(CoreAPIsStandardTest, ResolveUnrequestedSymbol) { cantFail(JD.define(std::make_unique( SymbolFlagsMap({{Foo, FooSym.getFlags()}, {Bar, BarSym.getFlags()}}), - [this](MaterializationResponsibility R) { - cantFail(R.notifyResolved({{Foo, FooSym}, {Bar, BarSym}})); - cantFail(R.notifyEmitted()); + [this](std::unique_ptr R) { + cantFail(R->notifyResolved({{Foo, FooSym}, {Bar, BarSym}})); + cantFail(R->notifyEmitted()); }))); auto Result = @@ -116,14 +116,16 @@ TEST_F(CoreAPIsStandardTest, MaterializationSideEffctsOnlyBasic) { // don't return until they're emitted, and that they don't appear in query // results. - Optional FooR; + std::unique_ptr FooR; Optional Result; cantFail(JD.define(std::make_unique( SymbolFlagsMap( {{Foo, JITSymbolFlags::Exported | JITSymbolFlags::MaterializationSideEffectsOnly}}), - [&](MaterializationResponsibility R) { FooR.emplace(std::move(R)); }))); + [&](std::unique_ptr R) { + FooR = std::move(R); + }))); ES.lookup( LookupKind::Static, makeJITDylibSearchOrder(&JD), @@ -155,7 +157,9 @@ TEST_F(CoreAPIsStandardTest, MaterializationSideEffectsOnlyFailuresPersist) { SymbolFlagsMap( {{Foo, JITSymbolFlags::Exported | JITSymbolFlags::MaterializationSideEffectsOnly}}), - [&](MaterializationResponsibility R) { R.failMaterialization(); }))); + [&](std::unique_ptr R) { + R->failMaterialization(); + }))); EXPECT_THAT_EXPECTED( ES.lookup(makeJITDylibSearchOrder(&JD), SymbolLookupSet({Foo})), @@ -182,10 +186,10 @@ TEST_F(CoreAPIsStandardTest, RemoveSymbolsTest) { bool BarMaterializerDestructed = false; cantFail(JD.define(std::make_unique( SymbolFlagsMap({{Bar, BarSym.getFlags()}}), - [this](MaterializationResponsibility R) { + [this](std::unique_ptr R) { ADD_FAILURE() << "Unexpected materialization of \"Bar\""; - cantFail(R.notifyResolved({{Bar, BarSym}})); - cantFail(R.notifyEmitted()); + cantFail(R->notifyResolved({{Bar, BarSym}})); + cantFail(R->notifyEmitted()); }, nullptr, [&](const JITDylib &JD, const SymbolStringPtr &Name) { @@ -197,10 +201,12 @@ TEST_F(CoreAPIsStandardTest, RemoveSymbolsTest) { // Baz will be in the materializing state initially, then // materialized for the final removal attempt. 
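The CoreAPIsTest hunks in this region all make the same mechanical change: materializer callbacks now receive the MaterializationResponsibility by std::unique_ptr rather than by value, so completion calls go through operator-> and the object can be stashed or handed to another thread without the old shared_ptr workaround. A hedged sketch of the new shape, reusing the SimpleMaterializationUnit helper that this patch updates in OrcTestCommon.h; JD, Foo, and FooSym stand in for a test fixture's JITDylib and symbol definitions.

```cpp
// Sketch only: defining a symbol under the unique_ptr-based ORC API.
// SimpleMaterializationUnit is the test helper from OrcTestCommon.h;
// JD, Foo, and FooSym are assumed to come from a fixture.
auto MU = std::make_unique<SimpleMaterializationUnit>(
    SymbolFlagsMap({{Foo, FooSym.getFlags()}}),
    [=](std::unique_ptr<MaterializationResponsibility> R) {
      // Ownership arrives via unique_ptr; completion uses '->'.
      cantFail(R->notifyResolved({{Foo, FooSym}}));
      cantFail(R->notifyEmitted());
    });
cantFail(JD.define(std::move(MU)));
```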
- Optional BazR; + std::unique_ptr BazR; cantFail(JD.define(std::make_unique( SymbolFlagsMap({{Baz, BazSym.getFlags()}}), - [&](MaterializationResponsibility R) { BazR.emplace(std::move(R)); }, + [&](std::unique_ptr R) { + BazR = std::move(R); + }, nullptr, [](const JITDylib &JD, const SymbolStringPtr &Name) { ADD_FAILURE() << "\"Baz\" discarded unexpectedly"; @@ -297,7 +303,7 @@ TEST_F(CoreAPIsStandardTest, LookupFlagsTest) { JITSymbolFlags::Exported | JITSymbolFlags::Weak)); auto MU = std::make_unique( SymbolFlagsMap({{Bar, BarSym.getFlags()}}), - [](MaterializationResponsibility R) { + [](std::unique_ptr R) { llvm_unreachable("Symbol materialized on flags lookup"); }); @@ -400,10 +406,10 @@ TEST_F(CoreAPIsStandardTest, TestThatReExportsDontUnnecessarilyMaterialize) { bool BarMaterialized = false; auto BarMU = std::make_unique( SymbolFlagsMap({{Bar, BarSym.getFlags()}}), - [&](MaterializationResponsibility R) { + [&](std::unique_ptr R) { BarMaterialized = true; - cantFail(R.notifyResolved({{Bar, BarSym}})); - cantFail(R.notifyEmitted()); + cantFail(R->notifyResolved({{Bar, BarSym}})); + cantFail(R->notifyEmitted()); }); cantFail(JD.define(BarMU)); @@ -444,10 +450,12 @@ TEST_F(CoreAPIsStandardTest, TestReexportsGenerator) { } TEST_F(CoreAPIsStandardTest, TestTrivialCircularDependency) { - Optional FooR; + std::unique_ptr FooR; auto FooMU = std::make_unique( SymbolFlagsMap({{Foo, FooSym.getFlags()}}), - [&](MaterializationResponsibility R) { FooR.emplace(std::move(R)); }); + [&](std::unique_ptr R) { + FooR = std::move(R); + }); cantFail(JD.define(FooMU)); @@ -476,26 +484,29 @@ TEST_F(CoreAPIsStandardTest, TestCircularDependenceInOneJITDylib) { // does not prevent any symbol from becoming 'ready' once all symbols are // emitted. - // Create three MaterializationResponsibility objects: one for each of Foo, - // Bar and Baz. These are optional because MaterializationResponsibility - // does not have a default constructor). - Optional FooR; - Optional BarR; - Optional BazR; + std::unique_ptr FooR; + std::unique_ptr BarR; + std::unique_ptr BazR; // Create a MaterializationUnit for each symbol that moves the // MaterializationResponsibility into one of the locals above. auto FooMU = std::make_unique( SymbolFlagsMap({{Foo, FooSym.getFlags()}}), - [&](MaterializationResponsibility R) { FooR.emplace(std::move(R)); }); + [&](std::unique_ptr R) { + FooR = std::move(R); + }); auto BarMU = std::make_unique( SymbolFlagsMap({{Bar, BarSym.getFlags()}}), - [&](MaterializationResponsibility R) { BarR.emplace(std::move(R)); }); + [&](std::unique_ptr R) { + BarR = std::move(R); + }); auto BazMU = std::make_unique( SymbolFlagsMap({{Baz, BazSym.getFlags()}}), - [&](MaterializationResponsibility R) { BazR.emplace(std::move(R)); }); + [&](std::unique_ptr R) { + BazR = std::move(R); + }); // Define the symbols. cantFail(JD.define(FooMU)); @@ -622,18 +633,22 @@ TEST_F(CoreAPIsStandardTest, TestCircularDependenceInOneJITDylib) { } TEST_F(CoreAPIsStandardTest, FailureInDependency) { - Optional FooR; - Optional BarR; + std::unique_ptr FooR; + std::unique_ptr BarR; // Create a MaterializationUnit for each symbol that moves the // MaterializationResponsibility into one of the locals above. 
auto FooMU = std::make_unique( SymbolFlagsMap({{Foo, FooSym.getFlags()}}), - [&](MaterializationResponsibility R) { FooR.emplace(std::move(R)); }); + [&](std::unique_ptr R) { + FooR = std::move(R); + }); auto BarMU = std::make_unique( SymbolFlagsMap({{Bar, BarSym.getFlags()}}), - [&](MaterializationResponsibility R) { BarR.emplace(std::move(R)); }); + [&](std::unique_ptr R) { + BarR = std::move(R); + }); // Define the symbols. cantFail(JD.define(FooMU)); @@ -687,18 +702,22 @@ TEST_F(CoreAPIsStandardTest, FailureInDependency) { } TEST_F(CoreAPIsStandardTest, FailureInCircularDependency) { - Optional FooR; - Optional BarR; + std::unique_ptr FooR; + std::unique_ptr BarR; // Create a MaterializationUnit for each symbol that moves the // MaterializationResponsibility into one of the locals above. auto FooMU = std::make_unique( SymbolFlagsMap({{Foo, FooSym.getFlags()}}), - [&](MaterializationResponsibility R) { FooR.emplace(std::move(R)); }); + [&](std::unique_ptr R) { + FooR = std::move(R); + }); auto BarMU = std::make_unique( SymbolFlagsMap({{Bar, BarSym.getFlags()}}), - [&](MaterializationResponsibility R) { BarR.emplace(std::move(R)); }); + [&](std::unique_ptr R) { + BarR = std::move(R); + }); // Define the symbols. cantFail(JD.define(FooMU)); @@ -753,18 +772,22 @@ TEST_F(CoreAPIsStandardTest, FailureInCircularDependency) { } TEST_F(CoreAPIsStandardTest, AddDependencyOnFailedSymbol) { - Optional FooR; - Optional BarR; + std::unique_ptr FooR; + std::unique_ptr BarR; // Create a MaterializationUnit for each symbol that moves the // MaterializationResponsibility into one of the locals above. auto FooMU = std::make_unique( SymbolFlagsMap({{Foo, FooSym.getFlags()}}), - [&](MaterializationResponsibility R) { FooR.emplace(std::move(R)); }); + [&](std::unique_ptr R) { + FooR = std::move(R); + }); auto BarMU = std::make_unique( SymbolFlagsMap({{Bar, BarSym.getFlags()}}), - [&](MaterializationResponsibility R) { BarR.emplace(std::move(R)); }); + [&](std::unique_ptr R) { + BarR = std::move(R); + }); // Define the symbols. cantFail(JD.define(FooMU)); @@ -819,18 +842,22 @@ TEST_F(CoreAPIsStandardTest, AddDependencyOnFailedSymbol) { } TEST_F(CoreAPIsStandardTest, FailAfterMaterialization) { - Optional FooR; - Optional BarR; + std::unique_ptr FooR; + std::unique_ptr BarR; // Create a MaterializationUnit for each symbol that moves the // MaterializationResponsibility into one of the locals above. auto FooMU = std::make_unique( SymbolFlagsMap({{Foo, FooSym.getFlags()}}), - [&](MaterializationResponsibility R) { FooR.emplace(std::move(R)); }); + [&](std::unique_ptr R) { + FooR = std::move(R); + }); auto BarMU = std::make_unique( SymbolFlagsMap({{Bar, BarSym.getFlags()}}), - [&](MaterializationResponsibility R) { BarR.emplace(std::move(R)); }); + [&](std::unique_ptr R) { + BarR = std::move(R); + }); // Define the symbols. 
cantFail(JD.define(FooMU)); @@ -882,9 +909,9 @@ TEST_F(CoreAPIsStandardTest, FailMaterializerWithUnqueriedSymbols) { auto MU = std::make_unique( SymbolFlagsMap( {{Foo, JITSymbolFlags::Exported}, {Bar, JITSymbolFlags::Exported}}), - [&](MaterializationResponsibility R) { + [&](std::unique_ptr R) { MaterializerRun = true; - R.failMaterialization(); + R->failMaterialization(); }); cantFail(JD.define(std::move(MU))); @@ -911,7 +938,7 @@ TEST_F(CoreAPIsStandardTest, DropMaterializerWhenEmpty) { auto MU = std::make_unique( SymbolFlagsMap({{Foo, WeakExported}, {Bar, WeakExported}}), - [](MaterializationResponsibility R) { + [](std::unique_ptr R) { llvm_unreachable("Unexpected call to materialize"); }, nullptr, @@ -943,10 +970,10 @@ TEST_F(CoreAPIsStandardTest, AddAndMaterializeLazySymbol) { auto MU = std::make_unique( SymbolFlagsMap({{Foo, JITSymbolFlags::Exported}, {Bar, WeakExported}}), - [&](MaterializationResponsibility R) { + [&](std::unique_ptr R) { assert(BarDiscarded && "Bar should have been discarded by this point"); - cantFail(R.notifyResolved(SymbolMap({{Foo, FooSym}}))); - cantFail(R.notifyEmitted()); + cantFail(R->notifyResolved(SymbolMap({{Foo, FooSym}}))); + cantFail(R->notifyEmitted()); FooMaterialized = true; }, nullptr, @@ -985,18 +1012,18 @@ TEST_F(CoreAPIsStandardTest, TestBasicWeakSymbolMaterialization) { bool BarMaterialized = false; auto MU1 = std::make_unique( SymbolFlagsMap({{Foo, FooSym.getFlags()}, {Bar, BarSym.getFlags()}}), - [&](MaterializationResponsibility R) { - cantFail(R.notifyResolved(SymbolMap({{Foo, FooSym}, {Bar, BarSym}}))); - cantFail(R.notifyEmitted()); + [&](std::unique_ptr R) { + cantFail(R->notifyResolved(SymbolMap({{Foo, FooSym}, {Bar, BarSym}}))); + cantFail(R->notifyEmitted()); BarMaterialized = true; }); bool DuplicateBarDiscarded = false; auto MU2 = std::make_unique( SymbolFlagsMap({{Bar, BarSym.getFlags()}}), - [&](MaterializationResponsibility R) { + [&](std::unique_ptr R) { ADD_FAILURE() << "Attempt to materialize Bar from the wrong unit"; - R.failMaterialization(); + R->failMaterialization(); }, nullptr, [&](const JITDylib &JD, SymbolStringPtr Name) { @@ -1026,20 +1053,21 @@ TEST_F(CoreAPIsStandardTest, TestBasicWeakSymbolMaterialization) { TEST_F(CoreAPIsStandardTest, DefineMaterializingSymbol) { bool ExpectNoMoreMaterialization = false; - ES.setDispatchMaterialization([&](std::unique_ptr MU, - MaterializationResponsibility MR) { - if (ExpectNoMoreMaterialization) - ADD_FAILURE() << "Unexpected materialization"; - MU->materialize(std::move(MR)); - }); + ES.setDispatchMaterialization( + [&](std::unique_ptr MU, + std::unique_ptr MR) { + if (ExpectNoMoreMaterialization) + ADD_FAILURE() << "Unexpected materialization"; + MU->materialize(std::move(MR)); + }); auto MU = std::make_unique( SymbolFlagsMap({{Foo, FooSym.getFlags()}}), - [&](MaterializationResponsibility R) { + [&](std::unique_ptr R) { cantFail( - R.defineMaterializing(SymbolFlagsMap({{Bar, BarSym.getFlags()}}))); - cantFail(R.notifyResolved(SymbolMap({{Foo, FooSym}, {Bar, BarSym}}))); - cantFail(R.notifyEmitted()); + R->defineMaterializing(SymbolFlagsMap({{Bar, BarSym.getFlags()}}))); + cantFail(R->notifyResolved(SymbolMap({{Foo, FooSym}, {Bar, BarSym}}))); + cantFail(R->notifyEmitted()); }); cantFail(JD.define(MU)); @@ -1093,8 +1121,8 @@ TEST_F(CoreAPIsStandardTest, FailResolution) { auto MU = std::make_unique( SymbolFlagsMap({{Foo, JITSymbolFlags::Exported | JITSymbolFlags::Weak}, {Bar, JITSymbolFlags::Exported | JITSymbolFlags::Weak}}), - [&](MaterializationResponsibility R) { - 
R.failMaterialization(); + [&](std::unique_ptr R) { + R->failMaterialization(); }); cantFail(JD.define(MU)); @@ -1129,23 +1157,23 @@ TEST_F(CoreAPIsStandardTest, FailEmissionAfterResolution) { auto MU = std::make_unique( SymbolFlagsMap({{Foo, FooSym.getFlags()}, {Bar, BarSym.getFlags()}}), - [&](MaterializationResponsibility R) { - cantFail(R.notifyResolved(SymbolMap({{Foo, FooSym}, {Bar, BarSym}}))); + [&](std::unique_ptr R) { + cantFail(R->notifyResolved(SymbolMap({{Foo, FooSym}, {Bar, BarSym}}))); ES.lookup( LookupKind::Static, makeJITDylibSearchOrder(&JD), SymbolLookupSet({Baz}), SymbolState::Resolved, - [&R](Expected Result) { + [&](Expected Result) { // Called when "baz" is resolved. We don't actually depend // on or care about baz, but use it to trigger failure of // this materialization before Baz has been finalized in // order to test that error propagation is correct in this // scenario. cantFail(std::move(Result)); - R.failMaterialization(); + R->failMaterialization(); }, [&](const SymbolDependenceMap &Deps) { - R.addDependenciesForAll(Deps); + R->addDependenciesForAll(Deps); }); }); @@ -1165,7 +1193,9 @@ TEST_F(CoreAPIsStandardTest, FailAfterPartialResolution) { // Fail materialization of bar. auto BarMU = std::make_unique( SymbolFlagsMap({{Bar, BarSym.getFlags()}}), - [&](MaterializationResponsibility R) { R.failMaterialization(); }); + [&](std::unique_ptr R) { + R->failMaterialization(); + }); cantFail(JD.define(std::move(BarMU))); @@ -1185,9 +1215,9 @@ TEST_F(CoreAPIsStandardTest, FailAfterPartialResolution) { TEST_F(CoreAPIsStandardTest, TestLookupWithUnthreadedMaterialization) { auto MU = std::make_unique( SymbolFlagsMap({{Foo, JITSymbolFlags::Exported}}), - [&](MaterializationResponsibility R) { - cantFail(R.notifyResolved({{Foo, FooSym}})); - cantFail(R.notifyEmitted()); + [&](std::unique_ptr R) { + cantFail(R->notifyResolved({{Foo, FooSym}})); + cantFail(R->notifyEmitted()); }); cantFail(JD.define(MU)); @@ -1204,15 +1234,14 @@ TEST_F(CoreAPIsStandardTest, TestLookupWithThreadedMaterialization) { #if LLVM_ENABLE_THREADS std::thread MaterializationThread; - ES.setDispatchMaterialization([&](std::unique_ptr MU, - MaterializationResponsibility MR) { - auto SharedMR = - std::make_shared(std::move(MR)); - MaterializationThread = - std::thread([MU = std::move(MU), MR = std::move(SharedMR)] { - MU->materialize(std::move(*MR)); - }); - }); + ES.setDispatchMaterialization( + [&](std::unique_ptr MU, + std::unique_ptr MR) { + MaterializationThread = + std::thread([MU = std::move(MU), MR = std::move(MR)]() mutable { + MU->materialize(std::move(MR)); + }); + }); cantFail(JD.define(absoluteSymbols({{Foo, FooSym}}))); @@ -1238,23 +1267,23 @@ TEST_F(CoreAPIsStandardTest, TestGetRequestedSymbolsAndReplace) { auto MU = std::make_unique( SymbolFlagsMap({{Foo, FooSym.getFlags()}, {Bar, BarSym.getFlags()}}), - [&](MaterializationResponsibility R) { - auto Requested = R.getRequestedSymbols(); + [&](std::unique_ptr R) { + auto Requested = R->getRequestedSymbols(); EXPECT_EQ(Requested.size(), 1U) << "Expected one symbol requested"; EXPECT_EQ(*Requested.begin(), Foo) << "Expected \"Foo\" requested"; auto NewMU = std::make_unique( SymbolFlagsMap({{Bar, BarSym.getFlags()}}), - [&](MaterializationResponsibility R2) { - cantFail(R2.notifyResolved(SymbolMap({{Bar, BarSym}}))); - cantFail(R2.notifyEmitted()); + [&](std::unique_ptr R2) { + cantFail(R2->notifyResolved(SymbolMap({{Bar, BarSym}}))); + cantFail(R2->notifyEmitted()); BarMaterialized = true; }); - R.replace(std::move(NewMU)); + 
R->replace(std::move(NewMU)); - cantFail(R.notifyResolved(SymbolMap({{Foo, FooSym}}))); - cantFail(R.notifyEmitted()); + cantFail(R->notifyResolved(SymbolMap({{Foo, FooSym}}))); + cantFail(R->notifyEmitted()); FooMaterialized = true; }); @@ -1280,13 +1309,13 @@ TEST_F(CoreAPIsStandardTest, TestGetRequestedSymbolsAndReplace) { TEST_F(CoreAPIsStandardTest, TestMaterializationResponsibilityDelegation) { auto MU = std::make_unique( SymbolFlagsMap({{Foo, FooSym.getFlags()}, {Bar, BarSym.getFlags()}}), - [&](MaterializationResponsibility R) { - auto R2 = R.delegate({Bar}); + [&](std::unique_ptr R) { + auto R2 = R->delegate({Bar}); - cantFail(R.notifyResolved({{Foo, FooSym}})); - cantFail(R.notifyEmitted()); - cantFail(R2.notifyResolved({{Bar, BarSym}})); - cantFail(R2.notifyEmitted()); + cantFail(R->notifyResolved({{Foo, FooSym}})); + cantFail(R->notifyEmitted()); + cantFail(R2->notifyResolved({{Bar, BarSym}})); + cantFail(R2->notifyEmitted()); }); cantFail(JD.define(MU)); @@ -1309,12 +1338,11 @@ TEST_F(CoreAPIsStandardTest, TestMaterializeWeakSymbol) { JITSymbolFlags WeakExported = JITSymbolFlags::Exported; WeakExported &= JITSymbolFlags::Weak; - std::unique_ptr FooResponsibility; + std::unique_ptr FooR; auto MU = std::make_unique( SymbolFlagsMap({{Foo, FooSym.getFlags()}}), - [&](MaterializationResponsibility R) { - FooResponsibility = - std::make_unique(std::move(R)); + [&](std::unique_ptr R) { + FooR = std::move(R); }); cantFail(JD.define(MU)); @@ -1328,7 +1356,7 @@ TEST_F(CoreAPIsStandardTest, TestMaterializeWeakSymbol) { auto MU2 = std::make_unique( SymbolFlagsMap({{Foo, JITSymbolFlags::Exported}}), - [](MaterializationResponsibility R) { + [](std::unique_ptr R) { llvm_unreachable("This unit should never be materialized"); }); @@ -1339,8 +1367,8 @@ TEST_F(CoreAPIsStandardTest, TestMaterializeWeakSymbol) { consumeError(std::move(Err)); // No dependencies registered, can't fail: - cantFail(FooResponsibility->notifyResolved(SymbolMap({{Foo, FooSym}}))); - cantFail(FooResponsibility->notifyEmitted()); + cantFail(FooR->notifyResolved(SymbolMap({{Foo, FooSym}}))); + cantFail(FooR->notifyEmitted()); } static bool linkOrdersEqual(const std::vector> &LHS, diff --git a/llvm/unittests/ExecutionEngine/Orc/LazyCallThroughAndReexportsTest.cpp b/llvm/unittests/ExecutionEngine/Orc/LazyCallThroughAndReexportsTest.cpp index 50e7b60a2df4e..81ff3e7a87b30 100644 --- a/llvm/unittests/ExecutionEngine/Orc/LazyCallThroughAndReexportsTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/LazyCallThroughAndReexportsTest.cpp @@ -39,15 +39,15 @@ TEST_F(LazyReexportsTest, BasicLocalCallThroughManagerOperation) { cantFail(JD.define(std::make_unique( SymbolFlagsMap({{DummyTarget, JITSymbolFlags::Exported}}), - [&](MaterializationResponsibility R) { + [&](std::unique_ptr R) { DummyTargetMaterialized = true; // No dependencies registered, can't fail. 
- cantFail(R.notifyResolved( + cantFail(R->notifyResolved( {{DummyTarget, JITEvaluatedSymbol(static_cast( reinterpret_cast(&dummyTarget)), JITSymbolFlags::Exported)}})); - cantFail(R.notifyEmitted()); + cantFail(R->notifyEmitted()); }))); unsigned NotifyResolvedCount = 0; diff --git a/llvm/unittests/ExecutionEngine/Orc/OrcTestCommon.h b/llvm/unittests/ExecutionEngine/Orc/OrcTestCommon.h index b25851d8f796c..afbc4a9ffaa5c 100644 --- a/llvm/unittests/ExecutionEngine/Orc/OrcTestCommon.h +++ b/llvm/unittests/ExecutionEngine/Orc/OrcTestCommon.h @@ -86,7 +86,7 @@ class OrcNativeTarget { class SimpleMaterializationUnit : public orc::MaterializationUnit { public: using MaterializeFunction = - std::function; + std::function)>; using DiscardFunction = std::function; using DestructorFunction = std::function; @@ -108,7 +108,8 @@ class SimpleMaterializationUnit : public orc::MaterializationUnit { StringRef getName() const override { return ""; } - void materialize(orc::MaterializationResponsibility R) override { + void + materialize(std::unique_ptr R) override { Materialize(std::move(R)); } diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt index c459ef49ae552..d5211cb275b74 100644 --- a/llvm/unittests/Support/CMakeLists.txt +++ b/llvm/unittests/Support/CMakeLists.txt @@ -89,6 +89,7 @@ add_llvm_unittest(SupportTests YAMLIOTest.cpp YAMLParserTest.cpp formatted_raw_ostream_test.cpp + raw_fd_stream_test.cpp raw_ostream_test.cpp raw_pwrite_stream_test.cpp raw_sha1_ostream_test.cpp diff --git a/llvm/unittests/Support/GlobPatternTest.cpp b/llvm/unittests/Support/GlobPatternTest.cpp index 17d60b2b85087..7acd311b0bb92 100644 --- a/llvm/unittests/Support/GlobPatternTest.cpp +++ b/llvm/unittests/Support/GlobPatternTest.cpp @@ -133,4 +133,17 @@ TEST_F(GlobPatternTest, ExtSym) { EXPECT_TRUE((bool)Pat2); EXPECT_TRUE(Pat2->match("\xFF")); } + +TEST_F(GlobPatternTest, IsTrivialMatchAll) { + Expected Pat1 = GlobPattern::create("*"); + EXPECT_TRUE((bool)Pat1); + EXPECT_TRUE(Pat1->isTrivialMatchAll()); + + const char *NegativeCases[] = {"a*", "*a", "?*", "*?", "**", "\\*"}; + for (auto *P : NegativeCases) { + Expected Pat2 = GlobPattern::create(P); + EXPECT_TRUE((bool)Pat2); + EXPECT_FALSE(Pat2->isTrivialMatchAll()); + } +} } diff --git a/llvm/unittests/Support/LockFileManagerTest.cpp b/llvm/unittests/Support/LockFileManagerTest.cpp index 587e442be1966..0b5a0d982a8fc 100644 --- a/llvm/unittests/Support/LockFileManagerTest.cpp +++ b/llvm/unittests/Support/LockFileManagerTest.cpp @@ -81,7 +81,7 @@ TEST(LockFileManagerTest, RelativePath) { char PathBuf[1024]; const char *OrigPath = getcwd(PathBuf, 1024); - ASSERT_FALSE(chdir(LockFileManagerTestDir.path().data())); + ASSERT_FALSE(chdir(LockFileManagerTestDir.c_str())); TempDir inner("inner"); SmallString<64> LockedFile(inner.path()); diff --git a/llvm/unittests/Support/TargetParserTest.cpp b/llvm/unittests/Support/TargetParserTest.cpp index f9392751de4e4..bec8a395f5586 100644 --- a/llvm/unittests/Support/TargetParserTest.cpp +++ b/llvm/unittests/Support/TargetParserTest.cpp @@ -782,12 +782,12 @@ TEST(TargetParserTest, ARMparseArchVersion) { } bool testAArch64CPU(StringRef CPUName, StringRef ExpectedArch, - StringRef ExpectedFPU, unsigned ExpectedFlags, + StringRef ExpectedFPU, uint64_t ExpectedFlags, StringRef CPUAttr) { AArch64::ArchKind AK = AArch64::parseCPUArch(CPUName); bool pass = AArch64::getArchName(AK).equals(ExpectedArch); - unsigned ExtKind = AArch64::getDefaultExtensions(CPUName, AK); + uint64_t ExtKind = 
AArch64::getDefaultExtensions(CPUName, AK); if (ExtKind > 1 && (ExtKind & AArch64::AEK_NONE)) pass &= ((ExtKind ^ AArch64::AEK_NONE) == ExpectedFlags); else @@ -1201,7 +1201,7 @@ TEST(TargetParserTest, testAArch64Extension) { } TEST(TargetParserTest, AArch64ExtensionFeatures) { - std::vector Extensions = { + std::vector Extensions = { AArch64::AEK_CRC, AArch64::AEK_CRYPTO, AArch64::AEK_FP, AArch64::AEK_SIMD, AArch64::AEK_FP16, AArch64::AEK_PROFILE, @@ -1214,7 +1214,7 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { std::vector Features; - unsigned ExtVal = 0; + uint64_t ExtVal = 0; for (auto Ext : Extensions) ExtVal |= Ext; diff --git a/llvm/unittests/Support/raw_fd_stream_test.cpp b/llvm/unittests/Support/raw_fd_stream_test.cpp new file mode 100644 index 0000000000000..00d834da32101 --- /dev/null +++ b/llvm/unittests/Support/raw_fd_stream_test.cpp @@ -0,0 +1,67 @@ +//===- llvm/unittest/Support/raw_fd_stream_test.cpp - raw_fd_stream tests -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallString.h" +#include "llvm/Config/llvm-config.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/FileUtilities.h" +#include "llvm/Support/raw_ostream.h" +#include "gtest/gtest.h" + +using namespace llvm; + +namespace { + +TEST(raw_fd_streamTest, ReadAfterWrite) { + SmallString<64> Path; + int FD; + ASSERT_FALSE(sys::fs::createTemporaryFile("foo", "bar", FD, Path)); + FileRemover Cleanup(Path); + std::error_code EC; + raw_fd_stream OS(Path, EC); + EXPECT_TRUE(!EC); + + char Bytes[8]; + + OS.write("01234567", 8); + + OS.seek(3); + EXPECT_EQ(OS.read(Bytes, 2), 2); + EXPECT_EQ(Bytes[0], '3'); + EXPECT_EQ(Bytes[1], '4'); + + OS.seek(4); + OS.write("xyz", 3); + + OS.seek(0); + EXPECT_EQ(OS.read(Bytes, 8), 8); + EXPECT_EQ(Bytes[0], '0'); + EXPECT_EQ(Bytes[1], '1'); + EXPECT_EQ(Bytes[2], '2'); + EXPECT_EQ(Bytes[3], '3'); + EXPECT_EQ(Bytes[4], 'x'); + EXPECT_EQ(Bytes[5], 'y'); + EXPECT_EQ(Bytes[6], 'z'); + EXPECT_EQ(Bytes[7], '7'); +} + +TEST(raw_fd_streamTest, DynCast) { + { + std::error_code EC; + raw_fd_stream OS("-", EC); + EXPECT_TRUE(dyn_cast(&OS)); + } + { + std::error_code EC; + raw_fd_ostream OS("-", EC); + EXPECT_FALSE(dyn_cast(&OS)); + } +} + +} // namespace diff --git a/llvm/unittests/Target/ARM/MachineInstrTest.cpp b/llvm/unittests/Target/ARM/MachineInstrTest.cpp index 876e011e1ce8a..08cc81860a166 100644 --- a/llvm/unittests/Target/ARM/MachineInstrTest.cpp +++ b/llvm/unittests/Target/ARM/MachineInstrTest.cpp @@ -383,12 +383,20 @@ TEST(MachineInstrValidTailPredication, IsCorrect) { case MVE_ASRLi: case MVE_ASRLr: case MVE_LSRL: + case MVE_LSLLi: + case MVE_LSLLr: case MVE_SQRSHR: + case MVE_SQRSHRL: case MVE_SQSHL: + case MVE_SQSHLL: case MVE_SRSHR: + case MVE_SRSHRL: case MVE_UQRSHL: + case MVE_UQRSHLL: case MVE_UQSHL: + case MVE_UQSHLL: case MVE_URSHR: + case MVE_URSHRL: case MVE_VABDf16: case MVE_VABDf32: case MVE_VABDs16: @@ -754,6 +762,12 @@ TEST(MachineInstrValidTailPredication, IsCorrect) { case MVE_VQADDu16: case MVE_VQADDu32: case MVE_VQADDu8: + case MVE_VQDMULH_qr_s16: + case MVE_VQDMULH_qr_s32: + case MVE_VQDMULH_qr_s8: + case MVE_VQDMULHi16: + case MVE_VQDMULHi32: + case MVE_VQDMULHi8: case MVE_VQDMULL_qr_s16bh: case MVE_VQDMULL_qr_s16th: case MVE_VQDMULL_qr_s32bh: 
@@ -762,6 +776,12 @@ TEST(MachineInstrValidTailPredication, IsCorrect) { case MVE_VQDMULLs16th: case MVE_VQDMULLs32bh: case MVE_VQDMULLs32th: + case MVE_VQRDMULH_qr_s16: + case MVE_VQRDMULH_qr_s32: + case MVE_VQRDMULH_qr_s8: + case MVE_VQRDMULHi16: + case MVE_VQRDMULHi32: + case MVE_VQRDMULHi8: case MVE_VQNEGs16: case MVE_VQNEGs32: case MVE_VQNEGs8: @@ -960,6 +980,20 @@ TEST(MachineInstrValidTailPredication, IsCorrect) { case MVE_VSUBi16: case MVE_VSUBi32: case MVE_VSUBi8: + case VLDR_P0_off: + case VLDR_P0_post: + case VLDR_P0_pre: + case VLDR_VPR_off: + case VLDR_VPR_post: + case VLDR_VPR_pre: + case VSTR_P0_off: + case VSTR_P0_post: + case VSTR_P0_pre: + case VSTR_VPR_off: + case VSTR_VPR_post: + case VSTR_VPR_pre: + case VMRS_P0: + case VMRS_VPR: return true; } }; @@ -984,27 +1018,16 @@ TEST(MachineInstrValidTailPredication, IsCorrect) { ARMSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()), std::string(TM->getTargetFeatureString()), *static_cast(TM.get()), false); - const ARMBaseInstrInfo *TII = ST.getInstrInfo(); - auto MII = TM->getMCInstrInfo(); + auto MII = TM->getMCInstrInfo(); for (unsigned i = 0; i < ARM::INSTRUCTION_LIST_END; ++i) { - const MCInstrDesc &Desc = TII->get(i); - - for (auto &Op : Desc.operands()) { - // Only check instructions that access the MQPR regs. - if ((Op.OperandType & MCOI::OPERAND_REGISTER) == 0 || - (Op.RegClass != ARM::MQPRRegClassID && - Op.RegClass != ARM::QQPRRegClassID && - Op.RegClass != ARM::QQQQPRRegClassID)) - continue; - - uint64_t Flags = MII->get(i).TSFlags; - bool Valid = (Flags & ARMII::ValidForTailPredication) != 0; - ASSERT_EQ(IsValidTPOpcode(i), Valid) - << MII->getName(i) - << ": mismatched expectation for tail-predicated safety\n"; - break; - } + uint64_t Flags = MII->get(i).TSFlags; + if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE) + continue; + bool Valid = (Flags & ARMII::ValidForTailPredication) != 0; + ASSERT_EQ(IsValidTPOpcode(i), Valid) + << MII->getName(i) + << ": mismatched expectation for tail-predicated safety\n"; } } diff --git a/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp b/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp index 8142eaf90de10..8bec9629c5540 100644 --- a/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp +++ b/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp @@ -9,7 +9,10 @@ #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -294,6 +297,9 @@ class LoopPassManagerTest : public ::testing::Test { // those. 
   FAM.registerPass([&] { return AAManager(); });
   FAM.registerPass([&] { return AssumptionAnalysis(); });
+  FAM.registerPass([&] { return BlockFrequencyAnalysis(); });
+  FAM.registerPass([&] { return BranchProbabilityAnalysis(); });
+  FAM.registerPass([&] { return PostDominatorTreeAnalysis(); });
   FAM.registerPass([&] { return MemorySSAAnalysis(); });
   FAM.registerPass([&] { return ScalarEvolutionAnalysis(); });
   FAM.registerPass([&] { return TargetLibraryAnalysis(); });
diff --git a/llvm/utils/TableGen/CodeGenRegisters.cpp b/llvm/utils/TableGen/CodeGenRegisters.cpp
index eeb715dded43e..18a2de18c3e93 100644
--- a/llvm/utils/TableGen/CodeGenRegisters.cpp
+++ b/llvm/utils/TableGen/CodeGenRegisters.cpp
@@ -999,6 +999,8 @@ CodeGenRegisterClass::getMatchingSubClassWithSubRegs(
                 const CodeGenRegisterClass *B) {
     // If there are multiple, identical register classes, prefer the original
     // register class.
+    if (A == B)
+      return false;
     if (A->getMembers().size() == B->getMembers().size())
       return A == this;
     return A->getMembers().size() > B->getMembers().size();
diff --git a/llvm/utils/TableGen/DFAEmitter.cpp b/llvm/utils/TableGen/DFAEmitter.cpp
index 7391f6845a4b2..e877650852898 100644
--- a/llvm/utils/TableGen/DFAEmitter.cpp
+++ b/llvm/utils/TableGen/DFAEmitter.cpp
@@ -174,7 +174,7 @@ namespace {
 struct Action {
   Record *R = nullptr;
   unsigned I = 0;
-  std::string S = nullptr;
+  std::string S;
 
   Action() = default;
   Action(Record *R, unsigned I, std::string S) : R(R), I(I), S(S) {}
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
index d74cfae629f54..67b68217cbd87 100644
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -389,6 +389,10 @@ getNameForFeatureBitset(const std::vector<Record *> &FeatureBitset) {
   return Name;
 }
 
+static std::string getScopedName(unsigned Scope, const std::string &Name) {
+  return ("pred:" + Twine(Scope) + ":" + Name).str();
+}
+
 //===- MatchTable Helpers -------------------------------------------------===//
 
 class MatchTable;
@@ -852,6 +856,11 @@ class RuleMatcher : public Matcher {
       DefinedComplexPatternSubOperandMap;
   /// A map of Symbolic Names to ComplexPattern sub-operands.
   DefinedComplexPatternSubOperandMap ComplexSubOperands;
+  /// A map used for the multiple-reference error check on ComplexSubOperands.
+  /// A ComplexSubOperand can't be referenced from different operands, but
+  /// multiple references from the same operand are allowed, since that is
+  /// how 'same operand checks' are generated.
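The one-line DFAEmitter fix above is worth dwelling on: std::string has no constructor from std::nullptr_t here, so `std::string S = nullptr;` selects the `const char *` constructor, and passing it a null pointer is undefined behavior (typical implementations crash or throw at runtime). A minimal standalone demonstration of the bug class and the fix:

```cpp
// Minimal illustration of the bug class fixed in DFAEmitter.cpp above.
#include <string>

int main() {
  // std::string S = nullptr;  // UB: binds to string(const char *) with null
  std::string S;               // the fix: default-construct an empty string
  return S.empty() ? 0 : 1;
}
```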
+  StringMap<std::string> ComplexSubOperandsParentName;
 
   uint64_t RuleID;
   static uint64_t NextRuleID;
@@ -917,14 +926,24 @@
   void definePhysRegOperand(Record *Reg, OperandMatcher &OM);
 
   Error defineComplexSubOperand(StringRef SymbolicName, Record *ComplexPattern,
-                                unsigned RendererID, unsigned SubOperandID) {
-    if (ComplexSubOperands.count(SymbolicName))
-      return failedImport(
-          "Complex suboperand referenced more than once (Operand: " +
-          SymbolicName + ")");
+                                unsigned RendererID, unsigned SubOperandID,
+                                StringRef ParentSymbolicName) {
+    std::string ParentName(ParentSymbolicName);
+    if (ComplexSubOperands.count(SymbolicName)) {
+      auto RecordedParentName = ComplexSubOperandsParentName[SymbolicName];
+      if (RecordedParentName.compare(ParentName) != 0)
+        return failedImport("Error: Complex suboperand " + SymbolicName +
+                            " referenced by different operands: " +
+                            RecordedParentName + " and " + ParentName + ".");
+      // A complex suboperand referenced more than once from the same operand
+      // is used to generate a 'same operand check'. Emitting the
+      // GIR_ComplexSubOperandRenderer for it is already handled.
+      return Error::success();
+    }
 
     ComplexSubOperands[SymbolicName] =
         std::make_tuple(ComplexPattern, RendererID, SubOperandID);
+    ComplexSubOperandsParentName[SymbolicName] = ParentName;
 
     return Error::success();
   }
@@ -1102,6 +1121,7 @@ class PredicateMatcher {
     OPM_PointerToAny,
     OPM_RegBank,
     OPM_MBB,
+    OPM_RecordNamedOperand,
   };
 
 protected:
@@ -1290,6 +1310,40 @@ class PointerToAnyOperandMatcher : public OperandPredicateMatcher {
   }
 };
 
+/// Generates code to record a named operand into the RecordedOperands list at
+/// StoreIdx. Predicates with 'let PredicateCodeUsesOperands = 1' get
+/// RecordedOperands as an argument to the predicate's C++ code once all
+/// operands have been matched.
+class RecordNamedOperandMatcher : public OperandPredicateMatcher {
+protected:
+  unsigned StoreIdx;
+  std::string Name;
+
+public:
+  RecordNamedOperandMatcher(unsigned InsnVarID, unsigned OpIdx,
+                            unsigned StoreIdx, StringRef Name)
+      : OperandPredicateMatcher(OPM_RecordNamedOperand, InsnVarID, OpIdx),
+        StoreIdx(StoreIdx), Name(Name) {}
+
+  static bool classof(const PredicateMatcher *P) {
+    return P->getKind() == OPM_RecordNamedOperand;
+  }
+
+  bool isIdentical(const PredicateMatcher &B) const override {
+    return OperandPredicateMatcher::isIdentical(B) &&
+           StoreIdx == cast<RecordNamedOperandMatcher>(&B)->StoreIdx &&
+           Name.compare(cast<RecordNamedOperandMatcher>(&B)->Name) == 0;
+  }
+
+  void emitPredicateOpcodes(MatchTable &Table,
+                            RuleMatcher &Rule) const override {
+    Table << MatchTable::Opcode("GIM_RecordNamedOperand")
+          << MatchTable::Comment("MI") << MatchTable::IntValue(InsnVarID)
+          << MatchTable::Comment("Op") << MatchTable::IntValue(OpIdx)
+          << MatchTable::Comment("StoreIdx") << MatchTable::IntValue(StoreIdx)
+          << MatchTable::Comment("Name : " + Name) << MatchTable::LineBreak;
+  }
+};
+
 /// Generates code to check that an operand is a particular target constant.
 class ComplexPatternOperandMatcher : public OperandPredicateMatcher {
 protected:
@@ -3459,6 +3513,16 @@ class GlobalISelEmitter {
   // Rule coverage information.
   Optional<CodeGenCoverage> RuleCoverage;
 
+  /// Variables used to help with collecting named operands for predicates
+  /// with 'let PredicateCodeUsesOperands = 1'. WaitingForNamedOperands is set
+  /// to the number of named operands the predicate expects. Store locations in
+  /// StoreIdxForName correspond to the order in which operand names appear in
+  /// the predicate's argument list.
+ /// When we visit named leaf operand and WaitingForNamedOperands is not zero, + /// add matcher that will record operand and decrease counter. + unsigned WaitingForNamedOperands = 0; + StringMap StoreIdxForName; + void gatherOpcodeValues(); void gatherTypeIDValues(); void gatherNodeEquivs(); @@ -3511,7 +3575,8 @@ class GlobalISelEmitter { void emitCxxPredicateFns(raw_ostream &OS, StringRef CodeFieldName, StringRef TypeIdentifier, StringRef ArgType, - StringRef ArgName, StringRef AdditionalDeclarations, + StringRef ArgName, StringRef AdditionalArgs, + StringRef AdditionalDeclarations, std::function Filter); void emitImmPredicateFns(raw_ostream &OS, StringRef TypeIdentifier, StringRef ArgType, @@ -3863,6 +3928,15 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( return std::move(Error); if (Predicate.hasGISelPredicateCode()) { + if (Predicate.usesOperands()) { + assert(WaitingForNamedOperands == 0 && + "previous predicate didn't find all operands or " + "nested predicate that uses operands"); + TreePattern *TP = Predicate.getOrigPatFragRecord(); + WaitingForNamedOperands = TP->getNumArgs(); + for (unsigned i = 0; i < WaitingForNamedOperands; ++i) + StoreIdxForName[getScopedName(Call.Scope, TP->getArgName(i))] = i; + } InsnMatcher.addPredicate(Predicate); continue; } @@ -4041,12 +4115,22 @@ Error GlobalISelEmitter::importChildMatcher( bool OperandIsImmArg, unsigned OpIdx, unsigned &TempOpIdx) { Record *PhysReg = nullptr; - StringRef SrcChildName = getSrcChildName(SrcChild, PhysReg); + std::string SrcChildName = std::string(getSrcChildName(SrcChild, PhysReg)); + if (!SrcChild->isLeaf() && + SrcChild->getOperator()->isSubClassOf("ComplexPattern")) { + // The "name" of a non-leaf complex pattern (MY_PAT $op1, $op2) is + // "MY_PAT:op1:op2" and the ones with same "name" represent same operand. + std::string PatternName = std::string(SrcChild->getOperator()->getName()); + for (unsigned i = 0; i < SrcChild->getNumChildren(); ++i) { + PatternName += ":"; + PatternName += SrcChild->getChild(i)->getName(); + } + SrcChildName = PatternName; + } OperandMatcher &OM = - PhysReg - ? InsnMatcher.addPhysRegInput(PhysReg, OpIdx, TempOpIdx) - : InsnMatcher.addOperand(OpIdx, std::string(SrcChildName), TempOpIdx); + PhysReg ? InsnMatcher.addPhysRegInput(PhysReg, OpIdx, TempOpIdx) + : InsnMatcher.addOperand(OpIdx, SrcChildName, TempOpIdx); if (OM.isSameAsAnotherOperand()) return Error::success(); @@ -4093,9 +4177,9 @@ Error GlobalISelEmitter::importChildMatcher( for (unsigned i = 0, e = SrcChild->getNumChildren(); i != e; ++i) { auto *SubOperand = SrcChild->getChild(i); if (!SubOperand->getName().empty()) { - if (auto Error = Rule.defineComplexSubOperand(SubOperand->getName(), - SrcChild->getOperator(), - RendererID, i)) + if (auto Error = Rule.defineComplexSubOperand( + SubOperand->getName(), SrcChild->getOperator(), RendererID, i, + SrcChildName)) return Error; } } @@ -4141,6 +4225,13 @@ Error GlobalISelEmitter::importChildMatcher( if (auto *ChildDefInit = dyn_cast(SrcChild->getLeafValue())) { auto *ChildRec = ChildDefInit->getDef(); + if (WaitingForNamedOperands) { + auto PA = SrcChild->getNamesAsPredicateArg().begin(); + std::string Name = getScopedName(PA->getScope(), PA->getIdentifier()); + OM.addPredicate(StoreIdxForName[Name], Name); + --WaitingForNamedOperands; + } + // Check for register classes. 
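To make the data flow above concrete: each GIM_RecordNamedOperand check stores a MachineOperand pointer at its StoreIdx, and the generated testMIPredicate_MI hands that array to the PatFrag's C++ body. Below is a rough, hand-written sketch of what the emitted function might look like; the target name, predicate ID, and predicate body are invented, while the std::array size of 3 matches the signature this patch emits.

```cpp
// Hypothetical emitter output for a PatFrag with
// 'let PredicateCodeUsesOperands = 1'; illustrative only.
bool MyTargetInstructionSelector::testMIPredicate_MI(
    unsigned PredicateID, const MachineInstr &MI,
    const std::array<const MachineOperand *, 3> &Operands) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  (void)MRI;
  switch (PredicateID) {
  case GIPFP_MI_Predicate_hypothetical_pred:
    // Operands[i] was filled in by a GIM_RecordNamedOperand check, in the
    // order the names appear in the PatFrag's argument list.
    return Operands[0]->getReg() == Operands[1]->getReg();
  }
  llvm_unreachable("Unknown predicate");
}
```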
if (ChildRec->isSubClassOf("RegisterClass") || ChildRec->isSubClassOf("RegisterOperand")) { @@ -5236,7 +5327,8 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { // trouble than it's worth. void GlobalISelEmitter::emitCxxPredicateFns( raw_ostream &OS, StringRef CodeFieldName, StringRef TypeIdentifier, - StringRef ArgType, StringRef ArgName, StringRef AdditionalDeclarations, + StringRef ArgType, StringRef ArgName, StringRef AdditionalArgs, + StringRef AdditionalDeclarations, std::function Filter) { std::vector MatchedRecords; const auto &Defs = RK.getAllDerivedDefinitions("PatFrag"); @@ -5261,7 +5353,7 @@ void GlobalISelEmitter::emitCxxPredicateFns( OS << "bool " << Target.getName() << "InstructionSelector::test" << ArgName << "Predicate_" << TypeIdentifier << "(unsigned PredicateID, " << ArgType << " " - << ArgName << ") const {\n" + << ArgName << AdditionalArgs <<") const {\n" << AdditionalDeclarations; if (!AdditionalDeclarations.empty()) OS << "\n"; @@ -5287,12 +5379,13 @@ void GlobalISelEmitter::emitImmPredicateFns( raw_ostream &OS, StringRef TypeIdentifier, StringRef ArgType, std::function Filter) { return emitCxxPredicateFns(OS, "ImmediateCode", TypeIdentifier, ArgType, - "Imm", "", Filter); + "Imm", "", "", Filter); } void GlobalISelEmitter::emitMIPredicateFns(raw_ostream &OS) { return emitCxxPredicateFns( OS, "GISelPredicateCode", "MI", "const MachineInstr &", "MI", + ", const std::array &Operands", " const MachineFunction &MF = *MI.getParent()->getParent();\n" " const MachineRegisterInfo &MRI = MF.getRegInfo();\n" " (void)MRI;", @@ -5525,7 +5618,8 @@ void GlobalISelEmitter::run(raw_ostream &OS) { << " bool testImmPredicate_APFloat(unsigned PredicateID, const APFloat " "&Imm) const override;\n" << " const int64_t *getMatchTable() const override;\n" - << " bool testMIPredicate_MI(unsigned PredicateID, const MachineInstr &MI) " + << " bool testMIPredicate_MI(unsigned PredicateID, const MachineInstr &MI" + ", const std::array &Operands) " "const override;\n" << "#endif // ifdef GET_GLOBALISEL_TEMPORARIES_DECL\n\n"; diff --git a/llvm/utils/TableGen/LLVMBuild.txt b/llvm/utils/TableGen/LLVMBuild.txt index 5eec4e060be58..6293aa0e40248 100644 --- a/llvm/utils/TableGen/LLVMBuild.txt +++ b/llvm/utils/TableGen/LLVMBuild.txt @@ -18,4 +18,4 @@ type = BuildTool name = tblgen parent = BuildTools -required_libraries = Support TableGen MC +required_libraries = Support TableGen diff --git a/llvm/utils/UpdateTestChecks/asm.py b/llvm/utils/UpdateTestChecks/asm.py index 588a2870b9895..dc35859606e0f 100644 --- a/llvm/utils/UpdateTestChecks/asm.py +++ b/llvm/utils/UpdateTestChecks/asm.py @@ -15,7 +15,7 @@ class string: ##### Assembly parser ASM_FUNCTION_X86_RE = re.compile( - r'^_?(?P[^:]+):[ \t]*#+[ \t]*@(?P=func)\n(?:\s*\.?Lfunc_begin[^:\n]*:\n)?[^:]*?' + r'^_?(?P[^:]+):[ \t]*#+[ \t]*@"?(?P=func)"?\n(?:\s*\.?Lfunc_begin[^:\n]*:\n)?[^:]*?' r'(?P^##?[ \t]+[^:]+:.*?)\s*' r'^\s*(?:[^:\n]+?:\s*\n\s*\.size|\.cfi_endproc|\.globl|\.comm|\.(?:sub)?section|#+ -- End function)', flags=(re.M | re.S)) @@ -28,7 +28,7 @@ class string: flags=(re.M | re.S)) ASM_FUNCTION_AARCH64_RE = re.compile( - r'^_?(?P[^:]+):[ \t]*\/\/[ \t]*@(?P=func)\n' + r'^_?(?P[^:]+):[ \t]*\/\/[ \t]*@"?(?P=func)"?\n' r'(?:[ \t]+.cfi_startproc\n)?' # drop optional cfi noise r'(?P.*?)\n' # This list is incomplete @@ -36,21 +36,21 @@ class string: flags=(re.M | re.S)) ASM_FUNCTION_AMDGPU_RE = re.compile( - r'^_?(?P[^:]+):[ \t]*;+[ \t]*@(?P=func)\n[^:]*?' + r'^_?(?P[^:]+):[ \t]*;+[ \t]*@"?(?P=func)"?\n[^:]*?' 
r'(?P.*?)\n' # (body of the function) # This list is incomplete r'^\s*(\.Lfunc_end[0-9]+:\n|\.section)', flags=(re.M | re.S)) ASM_FUNCTION_HEXAGON_RE = re.compile( - r'^_?(?P[^:]+):[ \t]*//[ \t]*@(?P=func)\n[^:]*?' + r'^_?(?P[^:]+):[ \t]*//[ \t]*@"?(?P=func)"?\n[^:]*?' r'(?P.*?)\n' # (body of the function) # This list is incomplete r'.Lfunc_end[0-9]+:\n', flags=(re.M | re.S)) ASM_FUNCTION_MIPS_RE = re.compile( - r'^_?(?P[^:]+):[ \t]*#+[ \t]*@(?P=func)\n[^:]*?' # f: (name of func) + r'^_?(?P[^:]+):[ \t]*#+[ \t]*@"?(?P=func)"?\n[^:]*?' # f: (name of func) r'(?:^[ \t]+\.(frame|f?mask|set).*?\n)+' # Mips+LLVM standard asm prologue r'(?P.*?)\n' # (body of the function) # Mips+LLVM standard asm epilogue @@ -60,13 +60,13 @@ class string: flags=(re.M | re.S)) ASM_FUNCTION_MSP430_RE = re.compile( - r'^_?(?P[^:]+):[ \t]*;+[ \t]*@(?P=func)\n[^:]*?' + r'^_?(?P[^:]+):[ \t]*;+[ \t]*@"?(?P=func)"?\n[^:]*?' r'(?P.*?)\n' r'(\$|\.L)func_end[0-9]+:\n', # $func_end0: flags=(re.M | re.S)) ASM_FUNCTION_PPC_RE = re.compile( - r'^_?(?P[^:]+):[ \t]*#+[ \t]*@(?P=func)\n' + r'^_?(?P[^:]+):[ \t]*#+[ \t]*@"?(?P=func)"?\n' r'.*?' r'\.Lfunc_begin[0-9]+:\n' r'(?:[ \t]+.cfi_startproc\n)?' @@ -78,7 +78,7 @@ class string: flags=(re.M | re.S)) ASM_FUNCTION_RISCV_RE = re.compile( - r'^_?(?P[^:]+):[ \t]*#+[ \t]*@(?P=func)\n' + r'^_?(?P[^:]+):[ \t]*#+[ \t]*@"?(?P=func)"?\n' r'(?:\s*\.?L(?P=func)\$local:\n)?' # optional .L$local: due to -fno-semantic-interposition r'(?:\s*\.?Lfunc_begin[^:\n]*:\n)?[^:]*?' r'(?P^##?[ \t]+[^:]+:.*?)\s*' @@ -86,27 +86,27 @@ class string: flags=(re.M | re.S)) ASM_FUNCTION_LANAI_RE = re.compile( - r'^_?(?P[^:]+):[ \t]*!+[ \t]*@(?P=func)\n' + r'^_?(?P[^:]+):[ \t]*!+[ \t]*@"?(?P=func)"?\n' r'(?:[ \t]+.cfi_startproc\n)?' # drop optional cfi noise r'(?P.*?)\s*' r'.Lfunc_end[0-9]+:\n', flags=(re.M | re.S)) ASM_FUNCTION_SPARC_RE = re.compile( - r'^_?(?P[^:]+):[ \t]*!+[ \t]*@(?P=func)\n' + r'^_?(?P[^:]+):[ \t]*!+[ \t]*@"?(?P=func)"?\n' r'(?P.*?)\s*' r'.Lfunc_end[0-9]+:\n', flags=(re.M | re.S)) ASM_FUNCTION_SYSTEMZ_RE = re.compile( - r'^_?(?P[^:]+):[ \t]*#+[ \t]*@(?P=func)\n' + r'^_?(?P[^:]+):[ \t]*#+[ \t]*@"?(?P=func)"?\n' r'[ \t]+.cfi_startproc\n' r'(?P.*?)\n' r'.Lfunc_end[0-9]+:\n', flags=(re.M | re.S)) ASM_FUNCTION_AARCH64_DARWIN_RE = re.compile( - r'^_(?P[^:]+):[ \t]*;[ \t]@(?P=func)\n' + r'^_(?P[^:]+):[ \t]*;[ \t]@"?(?P=func)"?\n' r'([ \t]*.cfi_startproc\n[\s]*)?' r'(?P.*?)' r'([ \t]*.cfi_endproc\n[\s]*)?' @@ -114,7 +114,7 @@ class string: flags=(re.M | re.S)) ASM_FUNCTION_ARM_DARWIN_RE = re.compile( - r'^[ \t]*\.globl[ \t]*_(?P[^ \t])[ \t]*@[ \t]--[ \t]Begin[ \t]function[ \t](?P=func)' + r'^[ \t]*\.globl[ \t]*_(?P[^ \t])[ \t]*@[ \t]--[ \t]Begin[ \t]function[ \t]"?(?P=func)"?' 
r'(?P.*?)' r'^_(?P=func):\n[ \t]*' r'(?P.*?)' @@ -137,7 +137,7 @@ class string: flags=(re.M | re.S)) ASM_FUNCTION_WASM32_RE = re.compile( - r'^_?(?P[^:]+):[ \t]*#+[ \t]*@(?P=func)\n' + r'^_?(?P[^:]+):[ \t]*#+[ \t]*@"?(?P=func)"?\n' r'(?P.*?)\n' r'^\s*(\.Lfunc_end[0-9]+:\n|end_function)', flags=(re.M | re.S)) diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index dd0e132969da3..d49fe50e5b1c3 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -145,16 +145,16 @@ def invoke_tool(exe, cmd_args, ir): UTC_ADVERT = 'NOTE: Assertions have been autogenerated by ' OPT_FUNCTION_RE = re.compile( - r'^(\s*;\s*Function\sAttrs:\s(?P[\w\s]+?))?\s*define\s+(?:internal\s+)?[^@]*@(?P[\w.-]+?)\s*' + r'^(\s*;\s*Function\sAttrs:\s(?P[\w\s]+?))?\s*define\s+(?:internal\s+)?[^@]*@(?P[\w.$-]+?)\s*' r'(?P\((\)|(.*?[\w.-]+?)\))[^{]*\{)\n(?P.*?)^\}$', flags=(re.M | re.S)) ANALYZE_FUNCTION_RE = re.compile( - r'^\s*\'(?P[\w\s-]+?)\'\s+for\s+function\s+\'(?P[\w.-]+?)\':' + r'^\s*\'(?P[\w\s-]+?)\'\s+for\s+function\s+\'(?P[\w.$-]+?)\':' r'\s*\n(?P.*)$', flags=(re.X | re.S)) -IR_FUNCTION_RE = re.compile(r'^\s*define\s+(?:internal\s+)?[^@]*@([\w.-]+)\s*\(') +IR_FUNCTION_RE = re.compile(r'^\s*define\s+(?:internal\s+)?[^@]*@"?([\w.$-]+)"?\s*\(') TRIPLE_IR_RE = re.compile(r'^\s*target\s+triple\s*=\s*"([^"]+)"$') TRIPLE_ARG_RE = re.compile(r'-mtriple[= ]([^ ]+)') MARCH_ARG_RE = re.compile(r'-march[= ]([^ ]+)') @@ -379,7 +379,7 @@ def get_value_use(var, match): return '[[' + get_value_name(var, match) + ']]' # Replace IR value defs and uses with FileCheck variables. -def genericize_check_lines(lines, is_analyze, vars_seen, global_vars_seen): +def generalize_check_lines(lines, is_analyze, vars_seen, global_vars_seen): # This gets called for each match that occurs in # a line. We transform variables we haven't seen # into defs, and variables we have seen into uses. @@ -466,7 +466,7 @@ def add_checks(output_lines, comment_marker, prefix_list, func_dict, func_name, if attrs: output_lines.append('%s %s: Function Attrs: %s' % (comment_marker, checkprefix, attrs)) args_and_sig = str(func_dict[checkprefix][func_name].args_and_sig) - args_and_sig = genericize_check_lines([args_and_sig], is_analyze, vars_seen, global_vars_seen)[0] + args_and_sig = generalize_check_lines([args_and_sig], is_analyze, vars_seen, global_vars_seen)[0] if '[[' in args_and_sig: output_lines.append(check_label_format % (checkprefix, func_name, '')) output_lines.append('%s %s-SAME: %s' % (comment_marker, checkprefix, args_and_sig)) @@ -486,7 +486,7 @@ def add_checks(output_lines, comment_marker, prefix_list, func_dict, func_name, # For IR output, change all defs to FileCheck variables, so we're immune # to variable naming fashions. - func_body = genericize_check_lines(func_body, is_analyze, vars_seen, global_vars_seen) + func_body = generalize_check_lines(func_body, is_analyze, vars_seen, global_vars_seen) # This could be selectively enabled with an optional invocation argument. # Disabled for now: better to check everything. Be safe rather than sorry. diff --git a/llvm/utils/convert-constraint-log-to-z3.py b/llvm/utils/convert-constraint-log-to-z3.py new file mode 100755 index 0000000000000..77b0a3d95b6d4 --- /dev/null +++ b/llvm/utils/convert-constraint-log-to-z3.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python + +""" +Helper script to convert the log generated by '-debug-only=constraint-system' +to a Python script that uses Z3 to verify the decisions using Z3's Python API. 
+ +Example usage: + +> cat path/to/file.log +--- +x6 + -1 * x7 <= -1 +x6 + -1 * x7 <= -2 +sat + +> ./convert-constraint-log-to-z3.py path/to/file.log > check.py && python ./check.py + +> cat check.py + from z3 import * +x3 = Int("x3") +x1 = Int("x1") +x2 = Int("x2") +s = Solver() +s.add(x1 + -1 * x2 <= 0) +s.add(x2 + -1 * x3 <= 0) +s.add(-1 * x1 + x3 <= -1) +assert(s.check() == unsat) +print('all checks passed') +""" + + +import argparse +import re + + +def main(): + parser = argparse.ArgumentParser( + description='Convert constraint log to script to verify using Z3.') + parser.add_argument('log_file', metavar='log', type=str, + help='constraint-system log file') + args = parser.parse_args() + + content = '' + with open(args.log_file, 'rt') as f: + content = f.read() + + groups = content.split('---') + var_re = re.compile('x\d+') + + print('from z3 import *') + for group in groups: + constraints = [g.strip() for g in group.split('\n') if g.strip() != ''] + variables = set() + for c in constraints[:-1]: + for m in var_re.finditer(c): + variables.add(m.group()) + if len(variables) == 0: + continue + for v in variables: + print('{} = Int("{}")'.format(v, v)) + print('s = Solver()') + for c in constraints[:-1]: + print('s.add({})'.format(c)) + expected = constraints[-1].strip() + print('assert(s.check() == {})'.format(expected)) + print('print("all checks passed")') + + +if __name__ == '__main__': + main() diff --git a/llvm/utils/gn/build/sync_source_lists_from_cmake.py b/llvm/utils/gn/build/sync_source_lists_from_cmake.py index e0c550ed7085b..a54483da8e55d 100755 --- a/llvm/utils/gn/build/sync_source_lists_from_cmake.py +++ b/llvm/utils/gn/build/sync_source_lists_from_cmake.py @@ -29,6 +29,9 @@ def patch_gn_file(gn_file, add, remove): srcs_tok = 'sources = [' tokloc = gn_contents.find(srcs_tok) + while tokloc != -1 and tokloc + len(srcs_tok) < len(gn_contents) and \ + gn_contents[tokloc + len(srcs_tok)] == ']': + tokloc = gn_contents.find(srcs_tok, tokloc + 1) if tokloc == -1: raise ValueError(gn_file + ': Failed to find source list') if gn_contents.find(srcs_tok, tokloc + 1) != -1: diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/BUILD.gn index 81c9ec0ede11f..69217b702a601 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/BUILD.gn @@ -1,9 +1,32 @@ import("//clang/lib/StaticAnalyzer/Frontend/enable.gni") +import("//llvm/utils/gn/build/write_cmake_config.gni") +import("enable.gni") + +config("clang-tidy-config_Config") { + visibility = [ ":clang-tidy-config" ] + include_dirs = [ "$target_gen_dir" ] +} + +write_cmake_config("clang-tidy-config") { + input = "clang-tidy-config.h.cmake" + output = "$target_gen_dir/clang-tidy-config.h" + values = [] + + if (clang_tidy_enable_static_analyzer) { + values += [ "CLANG_TIDY_ENABLE_STATIC_ANALYZER=1" ] + } else { + values += [ "CLANG_TIDY_ENABLE_STATIC_ANALYZER=" ] + } + + # Let targets depending on this find the generated file. 
+ public_configs = [ ":clang-tidy-config_Config" ] +} static_library("clang-tidy") { output_name = "clangTidy" configs += [ "//llvm/utils/gn/build:clang_code" ] deps = [ + ":clang-tidy-config", "//clang/include/clang/StaticAnalyzer/Checkers", "//clang/lib/AST", "//clang/lib/ASTMatchers", @@ -19,7 +42,7 @@ static_library("clang-tidy") { "//llvm/lib/Support", ] - if (clang_enable_static_analyzer) { + if (clang_tidy_enable_static_analyzer) { deps += [ "//clang/lib/StaticAnalyzer/Core", "//clang/lib/StaticAnalyzer/Frontend", @@ -42,6 +65,7 @@ group("all-checks") { # If you add a check, also add it to ClangTidyForceLinker.h. deps = [ "//clang-tools-extra/clang-tidy/abseil", + "//clang-tools-extra/clang-tidy/altera", "//clang-tools-extra/clang-tidy/android", "//clang-tools-extra/clang-tidy/boost", "//clang-tools-extra/clang-tidy/bugprone", @@ -63,7 +87,7 @@ group("all-checks") { "//clang-tools-extra/clang-tidy/readability", "//clang-tools-extra/clang-tidy/zircon", ] - if (clang_enable_static_analyzer) { + if (clang_tidy_enable_static_analyzer) { deps += [ "//clang-tools-extra/clang-tidy/mpi" ] } } diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/altera/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/altera/BUILD.gn new file mode 100644 index 0000000000000..52f2e3d5f23d6 --- /dev/null +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/altera/BUILD.gn @@ -0,0 +1,18 @@ +static_library("altera") { + output_name = "clangTidyAlteraModule" + configs += [ "//llvm/utils/gn/build:clang_code" ] + deps = [ + "//clang-tools-extra/clang-tidy", + "//clang-tools-extra/clang-tidy/utils", + "//clang/lib/AST", + "//clang/lib/ASTMatchers", + "//clang/lib/Analysis", + "//clang/lib/Basic", + "//clang/lib/Lex", + "//llvm/lib/Support", + ] + sources = [ + "AlteraTidyModule.cpp", + "StructPackAlignCheck.cpp", + ] +} diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn index ff8b4e4c7d148..c31078df039d9 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn @@ -25,7 +25,6 @@ static_library("cppcoreguidelines") { "NarrowingConversionsCheck.cpp", "NoMallocCheck.cpp", "OwningMemoryCheck.cpp", - "PreferMemberInitializerCheck.cpp", "ProBoundsArrayToPointerDecayCheck.cpp", "ProBoundsConstantArrayIndexCheck.cpp", "ProBoundsPointerArithmeticCheck.cpp", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/enable.gni b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/enable.gni new file mode 100644 index 0000000000000..9fc3e6e4d64b2 --- /dev/null +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/enable.gni @@ -0,0 +1,4 @@ +declare_args() { + # Whether to include the static analyzer in the clang-tidy binary. 
+ clang_tidy_enable_static_analyzer = true +} diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/tool/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/tool/BUILD.gn index 3f06214498d60..7ee93b521c812 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/tool/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/tool/BUILD.gn @@ -3,6 +3,7 @@ executable("clang-tidy") { deps = [ "//clang-tools-extra/clang-tidy", "//clang-tools-extra/clang-tidy:all-checks", + "//clang-tools-extra/clang-tidy:clang-tidy-config", "//clang/lib/AST", "//clang/lib/ASTMatchers", "//clang/lib/Basic", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn index 84d3f14bb2f27..7fa4cc8fd32c1 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn @@ -27,6 +27,7 @@ static_library("clangd") { ":features", "//clang-tools-extra/clang-tidy", "//clang-tools-extra/clang-tidy:all-checks", + "//clang-tools-extra/clang-tidy:clang-tidy-config", "//clang-tools-extra/clangd/support", "//clang/lib/AST", "//clang/lib/ASTMatchers", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn index dfd320164feb8..f732e837a88ef 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn @@ -63,6 +63,7 @@ unittest("ClangdTests") { "IndexTests.cpp", "JSONTransportTests.cpp", "LSPClient.cpp", + "LoggerTests.cpp", "ModulesTests.cpp", "ParsedASTTests.cpp", "PathMappingTests.cpp", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/test/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/test/BUILD.gn index 383cb2e1b15cd..e8b1f155a5205 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/test/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/test/BUILD.gn @@ -1,3 +1,4 @@ +import("//clang-tools-extra/clang-tidy/enable.gni") import("//clang/lib/StaticAnalyzer/Frontend/enable.gni") import("//clang/tools/libclang/include_clang_tools_extra.gni") import("//llvm/triples.gni") @@ -38,10 +39,10 @@ write_lit_config("lit_site_cfg") { "Python3_EXECUTABLE=$python_path", ] - if (clang_enable_static_analyzer) { - extra_values += [ "CLANG_ENABLE_STATIC_ANALYZER=1" ] + if (clang_tidy_enable_static_analyzer) { + extra_values += [ "CLANG_TIDY_ENABLE_STATIC_ANALYZER=1" ] } else { - extra_values += [ "CLANG_ENABLE_STATIC_ANALYZER=0" ] + extra_values += [ "CLANG_TIDY_ENABLE_STATIC_ANALYZER=0" ] } if (libclang_include_clang_tools_extra) { diff --git a/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn index bb3d69d046bef..4d645799dbf65 100644 --- a/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn @@ -81,7 +81,6 @@ static_library("AST") { "ExternalASTMerger.cpp", "ExternalASTSource.cpp", "FormatString.cpp", - "IgnoreExpr.cpp", "InheritViz.cpp", "Interp/ByteCodeEmitter.cpp", "Interp/ByteCodeExprGen.cpp", diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn index d1fc6ad4d9799..c43e531fc7180 100644 --- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn @@ -159,6 +159,7 @@ copy("Headers") { "openmp_wrappers/__clang_openmp_device_functions.h", 
"openmp_wrappers/cmath", "openmp_wrappers/complex.h", + "openmp_wrappers/complex_cmath.h", "openmp_wrappers/math.h", "pconfigintrin.h", "pkuintrin.h", diff --git a/llvm/utils/gn/secondary/clang/unittests/Tooling/Syntax/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Tooling/Syntax/BUILD.gn index d6072517391ff..4716d42bfdc18 100644 --- a/llvm/utils/gn/secondary/clang/unittests/Tooling/Syntax/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/unittests/Tooling/Syntax/BUILD.gn @@ -18,6 +18,7 @@ unittest("SyntaxTests") { sources = [ "BuildTreeTest.cpp", "MutationsTest.cpp", + "SynthesisTest.cpp", "TokensTest.cpp", "TreeTestBase.cpp", ] diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn index 024a2aa0dfbc6..5ce3cba59ac46 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn @@ -64,6 +64,7 @@ static_library("builtins") { "divdi3.c", "divmoddi4.c", "divmodsi4.c", + "divmodti4.c", "divsc3.c", "divsf3.c", "divsi3.c", diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/scudo/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/scudo/BUILD.gn index 1143b265a3773..c8c057f85cd3c 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/scudo/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/scudo/BUILD.gn @@ -27,6 +27,8 @@ group("scudo") { # This target is unused, it only exists to satisfy # sync_source_lists_from_cmake.py. source_set("sources") { + configs -= [ "//llvm/utils/gn/build:llvm_code" ] + configs += [ "//llvm/utils/gn/build:crt_code" ] sources = [ "scudo_allocator.cpp", "scudo_allocator.h", diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 904ace07585f0..e30622f52195f 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -23,7 +23,7 @@ if (libcxx_needs_site_config) { values += [ "_LIBCPP_ABI_NAMESPACE=$libcxx_abi_namespace" ] } if (libcxx_abi_unstable) { - values += [ "_LIBCPP_ABI_UNSTABLE=" ] + values += [ "_LIBCPP_ABI_UNSTABLE=1" ] } } diff --git a/llvm/utils/gn/secondary/lld/test/BUILD.gn b/llvm/utils/gn/secondary/lld/test/BUILD.gn index bfb63a39ba65a..00cb2f2c024c8 100644 --- a/llvm/utils/gn/secondary/lld/test/BUILD.gn +++ b/llvm/utils/gn/secondary/lld/test/BUILD.gn @@ -43,9 +43,9 @@ write_lit_cfg("lit_site_cfg") { } if (llvm_enable_libxml2) { - extra_values += [ "LLVM_LIBXML2_ENABLED=1" ] + extra_values += [ "LLVM_ENABLE_LIBXML2=1" ] } else { - extra_values += [ "LLVM_LIBXML2_ENABLED=0" ] # Must be 0. + extra_values += [ "LLVM_ENABLE_LIBXML2=0" ] # Must be 0. 
} if (llvm_enable_zlib) { diff --git a/llvm/utils/gn/secondary/lld/wasm/BUILD.gn b/llvm/utils/gn/secondary/lld/wasm/BUILD.gn index c32205f9f9f63..98bc93e3cdc8f 100644 --- a/llvm/utils/gn/secondary/lld/wasm/BUILD.gn +++ b/llvm/utils/gn/secondary/lld/wasm/BUILD.gn @@ -22,6 +22,7 @@ static_library("wasm") { "InputChunks.cpp", "InputFiles.cpp", "LTO.cpp", + "MapFile.cpp", "MarkLive.cpp", "OutputSections.cpp", "Relocations.cpp", diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index d54242da38cca..acbd66aca4ded 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -304,9 +304,9 @@ write_cmake_config("config") { } if (llvm_enable_libxml2) { - values += [ "LLVM_LIBXML2_ENABLED=1" ] + values += [ "LLVM_ENABLE_LIBXML2=1" ] } else { - values += [ "LLVM_LIBXML2_ENABLED=" ] + values += [ "LLVM_ENABLE_LIBXML2=" ] } } diff --git a/llvm/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn index 1c6d22dd672af..8f86e7fdddcc3 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn @@ -35,6 +35,7 @@ static_library("Analysis") { "CmpInstAnalysis.cpp", "CodeMetrics.cpp", "ConstantFolding.cpp", + "ConstraintSystem.cpp", "CostModel.cpp", "DDG.cpp", "Delinearization.cpp", @@ -51,6 +52,7 @@ static_library("Analysis") { "GlobalsModRef.cpp", "GuardUtils.cpp", "HeatUtils.cpp", + "IRSimilarityIdentifier.cpp", "IVDescriptors.cpp", "IVUsers.cpp", "IndirectCallPromotionAnalysis.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn index 9afe48db159b2..bb8a671dd6a7d 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn @@ -8,6 +8,7 @@ static_library("Passes") { "//llvm/lib/Target", "//llvm/lib/Transforms/AggressiveInstCombine", "//llvm/lib/Transforms/Coroutines", + "//llvm/lib/Transforms/HelloNew", "//llvm/lib/Transforms/IPO", "//llvm/lib/Transforms/InstCombine", "//llvm/lib/Transforms/Instrumentation", diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn index 3a452fc6e0601..9adb514705d44 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn @@ -18,17 +18,32 @@ tablegen("PPCGenFastISel") { td_file = "PPC.td" } +tablegen("PPCGenGlobalISel") { + visibility = [ ":LLVMPowerPCCodeGen" ] + args = [ "-gen-global-isel" ] + td_file = "PPC.td" +} + +tablegen("PPCGenRegisterBank") { + visibility = [ ":LLVMPowerPCCodeGen" ] + args = [ "-gen-register-bank" ] + td_file = "PPC.td" +} + static_library("LLVMPowerPCCodeGen") { deps = [ ":PPCGenCallingConv", ":PPCGenDAGISel", ":PPCGenFastISel", + ":PPCGenGlobalISel", + ":PPCGenRegisterBank", "MCTargetDesc", "TargetInfo", "//llvm/include/llvm/Config:llvm-config", "//llvm/lib/Analysis", "//llvm/lib/CodeGen", "//llvm/lib/CodeGen/AsmPrinter", + "//llvm/lib/CodeGen/GlobalISel", "//llvm/lib/CodeGen/SelectionDAG", "//llvm/lib/IR", "//llvm/lib/MC", @@ -38,6 +53,10 @@ static_library("LLVMPowerPCCodeGen") { ] include_dirs = [ "." 
] sources = [ + "GISel/PPCCallLowering.cpp", + "GISel/PPCInstructionSelector.cpp", + "GISel/PPCLegalizerInfo.cpp", + "GISel/PPCRegisterBankInfo.cpp", "PPCAsmPrinter.cpp", "PPCBoolRetToInt.cpp", "PPCBranchCoalescing.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/HelloNew/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/HelloNew/BUILD.gn new file mode 100644 index 0000000000000..5e6167324a4ae --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/HelloNew/BUILD.gn @@ -0,0 +1,9 @@ +static_library("HelloNew") { + output_name = "LLVMHelloNew" + deps = [ + "//llvm/lib/Analysis", + "//llvm/lib/IR", + "//llvm/lib/Support", + ] + sources = [ "HelloWorld.cpp" ] +} diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn index dbac54ab97041..edcf13309a578 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn @@ -16,11 +16,11 @@ static_library("Instrumentation") { "DataFlowSanitizer.cpp", "GCOVProfiling.cpp", "HWAddressSanitizer.cpp", - "HeapProfiler.cpp", "IndirectCallPromotion.cpp", "InstrOrderFile.cpp", "InstrProfiling.cpp", "Instrumentation.cpp", + "MemProfiler.cpp", "MemorySanitizer.cpp", "PGOInstrumentation.cpp", "PGOMemOPSizeOpt.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn index 60fcbe0318713..9d4c7a06c9402 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn @@ -15,6 +15,7 @@ static_library("Scalar") { "BDCE.cpp", "CallSiteSplitting.cpp", "ConstantHoisting.cpp", + "ConstraintElimination.cpp", "CorrelatedValuePropagation.cpp", "DCE.cpp", "DeadStoreElimination.cpp", diff --git a/llvm/utils/gn/secondary/llvm/test/BUILD.gn b/llvm/utils/gn/secondary/llvm/test/BUILD.gn index c714d9b5ba7b1..1b48d08751212 100644 --- a/llvm/utils/gn/secondary/llvm/test/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/test/BUILD.gn @@ -162,9 +162,15 @@ write_lit_config("lit_site_cfg") { } if (llvm_enable_libxml2) { - extra_values += [ "LLVM_LIBXML2_ENABLED=1" ] + extra_values += [ "LLVM_ENABLE_LIBXML2=1" ] } else { - extra_values += [ "LLVM_LIBXML2_ENABLED=0" ] # Must be 0. + extra_values += [ "LLVM_ENABLE_LIBXML2=0" ] # Must be 0. + } + + if (llvm_enable_expensive_checks) { + extra_values += [ "LLVM_ENABLE_EXPENSIVE_CHECKS=1" ] + } else { + extra_values += [ "LLVM_ENABLE_EXPENSIVE_CHECKS=0" ] # Must be 0. 
} if (llvm_enable_threads) { diff --git a/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn index c4bed481e051b..50c02aa2214ef 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn @@ -19,11 +19,13 @@ unittest("AnalysisTests") { "CGSCCPassManagerTest.cpp", "CallGraphTest.cpp", "CaptureTrackingTest.cpp", + "ConstraintSystemTest.cpp", "DDGTest.cpp", "DivergenceAnalysisTest.cpp", "DomTreeUpdaterTest.cpp", "FunctionPropertiesAnalysisTest.cpp", "GlobalsModRefTest.cpp", + "IRSimilarityIdentifierTest.cpp", "IVDescriptorsTest.cpp", "LazyCallGraphTest.cpp", "LoadsTest.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn index 2cf9a4e05c2dd..fe5ee15605c0b 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn @@ -13,10 +13,13 @@ unittest("CodeGenTests") { "//llvm/lib/Support", "//llvm/lib/Target", "//llvm/lib/Target:TargetsToBuild", + "//llvm/lib/Testing/Support", ] sources = [ "AArch64SelectionDAGTest.cpp", + "AsmPrinterDwarfTest.cpp", "DIEHashTest.cpp", + "DIETest.cpp", "LexicalScopesTest.cpp", "LowLevelTypeTest.cpp", "MachineInstrBundleIteratorTest.cpp", @@ -25,6 +28,7 @@ unittest("CodeGenTests") { "PassManagerTest.cpp", "ScalableVectorMVTsTest.cpp", "TargetOptionsTest.cpp", + "TestAsmPrinter.cpp", "TypeTraitsTest.cpp", ] has_custom_main = true diff --git a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn index f47e5a996b336..2aee1db5086ec 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn @@ -90,6 +90,7 @@ unittest("SupportTests") { "YAMLIOTest.cpp", "YAMLParserTest.cpp", "formatted_raw_ostream_test.cpp", + "raw_fd_stream_test.cpp", "raw_ostream_test.cpp", "raw_pwrite_stream_test.cpp", "raw_sha1_ostream_test.cpp", diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn index 4559926899c9f..bd1382d4def7d 100644 --- a/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn @@ -1,7 +1,6 @@ executable("llvm-tblgen") { deps = [ "//llvm/include/llvm/Config:llvm-config", - "//llvm/lib/MC", "//llvm/lib/Support", "//llvm/lib/TableGen", "//llvm/utils/TableGen/GlobalISel", diff --git a/mlir/docs/CAPI.md b/mlir/docs/CAPI.md index 2ec25d15747c7..e71dee0917744 100644 --- a/mlir/docs/CAPI.md +++ b/mlir/docs/CAPI.md @@ -97,37 +97,32 @@ as follows. its first argument is `Y`, and it is the responsibility of the caller to ensure it is indeed the case. -### Returning String References +### Auxiliary Types + +#### `StringRef` Numerous MLIR functions return instances of `StringRef` to refer to a non-owning segment of a string. This segment may or may not be null-terminated. In C API, -these functions take an additional callback argument of type -`MlirStringCallback` (pointer to a function with signature `void (*)(const char -*, intptr_t, void *)`) and a pointer to user-defined data. This callback is -invoked with a pointer to the string segment, its size and is forwarded the -user-defined data. 
The caller is in charge of managing the string segment
-according to its memory model: for strings owned by the object (e.g., string
-attributes), the caller can store the pointer and the size and use them directly
-as long as the parent object is live or copy the string to a new location with a
-null terminator if expected; for generated strings (e.g., in printing), the
-caller is expected to copy the string segment if it intends to use it later.
-
-**Note:** this interface may be revised in the near future.
-
-### Conversion To String and Printing
-
-IR objects can be converted to a string representation, for example for
-printing, using `mlirXPrint(MlirX, MlirStringCallback, void *)` functions. These
-functions accept take arguments a callback with signature `void (*)(const char
-*, intptr_t, void *)` and a pointer to user-defined data. They call the callback
-and supply it with chunks of the string representation, provided as a pointer to
-the first character and a length, and forward the user-defined data unmodified.
-It is up to the caller to allocate memory if the string representation must be
-stored and perform the copy. There is no guarantee that the pointer supplied to
-the callback points to a null-terminated string, the size argument should be
-used to find the end of the string. The callback may be called multiple times
-with consecutive chunks of the string representation (the printing itself is
-buffered).
+these are represented as instances of the `MlirStringRef` structure that
+contains a pointer to the first character of the string fragment (`data`) and
+the fragment length (`length`). Note that the fragment is _not necessarily_
+null-terminated; the `length` field must be used to identify the last
+character. `MlirStringRef` is a non-owning reference: the caller is in charge
+of performing the copy or of ensuring that the pointee outlives all uses of
+`MlirStringRef`.
+
+### Printing
+
+IR objects can be printed using `mlirXPrint(MlirX, MlirStringCallback, void *)`
+functions. These functions take as arguments a callback with signature `void
+(*)(const char *, intptr_t, void *)` and a pointer to user-defined data. They
+call the callback and supply it with chunks of the string representation,
+provided as a pointer to the first character and a length, and forward the
+user-defined data unmodified. It is up to the caller to allocate memory if the
+string representation must be stored and perform the copy. There is no guarantee
+that the pointer supplied to the callback points to a null-terminated string;
+the size argument should be used to find the end of the string. The callback may
+be called multiple times with consecutive chunks of the string representation
+(the printing itself is buffered).

*Rationale*: this approach allows the caller to have full control of the
allocation and avoid unnecessary allocation and copying inside the printer.
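+For illustration, here is a minimal sketch of collecting the printed form into
+a `std::string` from C++. It assumes the `MlirOperation` instantiation of the
+printing functions, `mlirOperationPrint`; any other `mlirXPrint` function works
+the same way:
+
+```c++
+#include <string>
+#include "mlir-c/IR.h"
+
+// Callback invoked once per chunk; `userData` is the std::string accumulator.
+static void appendChunk(const char *chunk, intptr_t length, void *userData) {
+  static_cast<std::string *>(userData)->append(chunk, length);
+}
+
+std::string printToString(MlirOperation op) {
+  std::string result;
+  // The chunks are consecutive, so appending them reconstructs the full text.
+  mlirOperationPrint(op, appendChunk, &result);
+  return result;
+}
+```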
diff --git a/mlir/docs/PassManagement.md b/mlir/docs/PassManagement.md
index 92ca92218219c..6e577db4501c1 100644
--- a/mlir/docs/PassManagement.md
+++ b/mlir/docs/PassManagement.md
@@ -104,6 +104,15 @@ struct MyOperationPass : public OperationPass {
};
```

+### Dependent Dialects
+
+Dialects must be loaded in the MLIRContext before entities from these dialects
+(operations, types, attributes, ...) can be created, and this loading must
+happen before the multi-threaded pass pipeline starts executing. To this end,
+a pass that can create an entity from a dialect that isn't already loaded must
+express this by overriding the `getDependentDialects()` method and declaring
+the list of such dialects explicitly.
+
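+A minimal sketch of such an override (the pass and the dialect chosen here are
+only an illustration; a real pass would list whichever dialects it can
+introduce):
+
+```c++
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Pass/Pass.h"
+
+struct MyLoweringPass
+    : public mlir::PassWrapper<MyLoweringPass,
+                               mlir::OperationPass<mlir::ModuleOp>> {
+  // This pass may create LLVM dialect operations, which are not guaranteed to
+  // be loaded yet, so declare the dialect before the pipeline starts.
+  void getDependentDialects(mlir::DialectRegistry &registry) const override {
+    registry.insert<mlir::LLVM::LLVMDialect>();
+  }
+  void runOnOperation() override {
+    // ... rewrite the module, creating LLVM dialect entities ...
+  }
+};
+```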
## Analysis Management

An important concept, along with transformation passes, are analyses. These are
@@ -684,6 +693,8 @@ It contains the following fields:
* description
  - A longer, more detailed description of the pass. This is used when
    generating pass documentation.
+* dependentDialects
+  - A list of strings that are the Dialect classes this pass can introduce.
* constructor
  - A piece of C++ code used to create a default instance of the pass.
* options
diff --git a/mlir/docs/Tutorials/UnderstandingTheIRStructure.md b/mlir/docs/Tutorials/UnderstandingTheIRStructure.md
new file mode 100644
index 0000000000000..8b4f7724741fa
--- /dev/null
+++ b/mlir/docs/Tutorials/UnderstandingTheIRStructure.md
@@ -0,0 +1,287 @@
+# Understanding the IR Structure
+
+The MLIR Language Reference describes the
+[High Level Structure](../LangRef/#high-level-structure); this document
+illustrates this structure through examples and introduces, at the same time,
+the C++ APIs involved in manipulating it.
+
+We will implement a [pass](../PassManagement/#operation-pass) that traverses any
+MLIR input and prints the entities inside the IR. A pass (or, in general, almost
+any piece of IR) is always rooted at an operation. Most of the time the
+top-level operation is a `ModuleOp`; the MLIR `PassManager` is in fact limited
+to operating on a top-level `ModuleOp`. As such, a pass starts with an
+operation, and so will our traversal:
+
+```
+  void runOnOperation() override {
+    Operation *op = getOperation();
+    resetIndent();
+    printOperation(op);
+  }
+```
+
+## Traversing the IR Nesting
+
+The IR is recursively nested: an `Operation` can have one or multiple nested
+`Region`s, each of which is actually a list of `Block`s, each of which itself
+wraps a list of `Operation`s. Our traversal will follow this structure with
+three methods: `printOperation()`, `printRegion()`, and `printBlock()`.
+
+The first method inspects the properties of an operation, before iterating on
+the nested regions and printing them individually:
+
+```c++
+  void printOperation(Operation *op) {
+    // Print the operation itself and some of its properties
+    printIndent() << "visiting op: '" << op->getName() << "' with "
+                  << op->getNumOperands() << " operands and "
+                  << op->getNumResults() << " results\n";
+    // Print the operation attributes
+    if (!op->getAttrs().empty()) {
+      printIndent() << op->getAttrs().size() << " attributes:\n";
+      for (NamedAttribute attr : op->getAttrs())
+        printIndent() << " - '" << attr.first << "' : '" << attr.second
+                      << "'\n";
+    }
+
+    // Recurse into each of the regions attached to the operation.
+    printIndent() << " " << op->getNumRegions() << " nested regions:\n";
+    auto indent = pushIndent();
+    for (Region &region : op->getRegions())
+      printRegion(region);
+  }
+```
+
+A `Region` does not hold anything other than a list of `Block`s:
+
+```c++
+  void printRegion(Region &region) {
+    // A region does not hold anything by itself other than a list of blocks.
+    printIndent() << "Region with " << region.getBlocks().size()
+                  << " blocks:\n";
+    auto indent = pushIndent();
+    for (Block &block : region.getBlocks())
+      printBlock(block);
+  }
+```
+
+Finally, a `Block` has a list of arguments, and holds a list of `Operation`s:
+
+```c++
+  void printBlock(Block &block) {
+    // Print the block's intrinsic properties (basically: its argument list)
+    printIndent()
+        << "Block with " << block.getNumArguments() << " arguments, "
+        << block.getNumSuccessors()
+        << " successors, and "
+        // Note, this `.size()` is traversing a linked-list and is O(n).
+        << block.getOperations().size() << " operations\n";
+
+    // A block's main role is to hold a list of Operations: let's recurse into
+    // printing each operation.
+    auto indent = pushIndent();
+    for (Operation &op : block.getOperations())
+      printOperation(&op);
+  }
+```
+
+The code for the pass is available
+[here in the repo](https://github.com/llvm/llvm-project/blob/master/mlir/test/lib/IR/TestPrintNesting.cpp)
+and can be exercised with `mlir-opt -test-print-nesting`.
+
+### Example
+
+The pass introduced in the previous section can be applied to the following IR
+with `mlir-opt -test-print-nesting -allow-unregistered-dialect
+llvm-project/mlir/test/IR/print-ir-nesting.mlir`:
+
+```mlir
+"module"() ( {
+  %0:4 = "dialect.op1"() {"attribute name" = 42 : i32} : () -> (i1, i16, i32, i64)
+  "dialect.op2"() ( {
+    "dialect.innerop1"(%0#0, %0#1) : (i1, i16) -> ()
+  }, {
+    "dialect.innerop2"() : () -> ()
+    "dialect.innerop3"(%0#0, %0#2, %0#3)[^bb1, ^bb2] : (i1, i32, i64) -> ()
+  ^bb1(%1: i32):  // pred: ^bb0
+    "dialect.innerop4"() : () -> ()
+    "dialect.innerop5"() : () -> ()
+  ^bb2(%2: i64):  // pred: ^bb0
+    "dialect.innerop6"() : () -> ()
+    "dialect.innerop7"() : () -> ()
+  }) {"other attribute" = 42 : i64} : () -> ()
+  "module_terminator"() : () -> ()
+}) : () -> ()
```
+
+And will yield the following output:
+
+```
+visiting op: 'module' with 0 operands and 0 results
+ 1 nested regions:
+  Region with 1 blocks:
+    Block with 0 arguments, 0 successors, and 3 operations
+      visiting op: 'dialect.op1' with 0 operands and 4 results
+      1 attributes:
+       - 'attribute name' : '42 : i32'
+       0 nested regions:
+      visiting op: 'dialect.op2' with 0 operands and 0 results
+       2 nested regions:
+        Region with 1 blocks:
+          Block with 0 arguments, 0 successors, and 1 operations
+            visiting op: 'dialect.innerop1' with 2 operands and 0 results
+             0 nested regions:
+        Region with 3 blocks:
+          Block with 0 arguments, 2 successors, and 2 operations
+            visiting op: 'dialect.innerop2' with 0 operands and 0 results
+             0 nested regions:
+            visiting op: 'dialect.innerop3' with 3 operands and 0 results
+             0 nested regions:
+          Block with 1 arguments, 0 successors, and 2 operations
+            visiting op: 'dialect.innerop4' with 0 operands and 0 results
+             0 nested regions:
+            visiting op: 'dialect.innerop5' with 0 operands and 0 results
+             0 nested regions:
+          Block with 1 arguments, 0 successors, and 2 operations
+            visiting op: 'dialect.innerop6' with 0 operands and 0 results
+             0 nested regions:
+            visiting op: 'dialect.innerop7' with 0 operands and 0 results
+             0 nested regions:
+      visiting op: 'module_terminator' with 0 operands and 0 results
+       0 nested regions:
+```
+
+## Other IR Traversal Methods
+
+In many cases, unwrapping the recursive structure of the IR is cumbersome and
+you may be interested in using other helpers.
+
+### Filtered iterator: `getOps<OpTy>()`
+
+For example the `Block` class exposes a convenient templated method
+`getOps<OpTy>()` that provides a filtered iterator.
Here is an example:
+
+```c++
+  auto varOps = entryBlock.getOps<spirv::GlobalVariableOp>();
+  for (spirv::GlobalVariableOp gvOp : varOps) {
+    // process each GlobalVariable Operation in the block.
+    ...
+  }
+```
+
+Similarly, the `Region` class exposes the same `getOps` method, which will
+iterate over all the blocks in the region.
+
+### Walkers
+
+`getOps<OpTy>()` is useful for iterating over some Operations immediately
+listed inside a single block (or a single region); however, it is frequently
+desirable to traverse the IR in a nested fashion. To this end, MLIR exposes the
+`walk()` helper on `Operation`, `Block`, and `Region`. This helper takes a
+single argument: a callback method that will be invoked for every operation
+recursively nested under the provided entity.
+
+```c++
+  // Recursively traverse all the regions and blocks nested inside the function
+  // and apply the callback on every single operation in post-order.
+  getFunction().walk([&](mlir::Operation *op) {
+    // process Operation `op`.
+  });
+```
+
+The provided callback can be specialized to filter on a particular type of
+Operation; for example, the following will apply the callback only to `LinalgOp`
+operations nested inside the function:
+
+```c++
+  getFunction().walk([](LinalgOp linalgOp) {
+    // process LinalgOp `linalgOp`.
+  });
+```
+
+Finally, the callback can optionally stop the walk by returning a
+`WalkResult::interrupt()` value. For example, the following walk will find all
+`AllocOp` nested inside the function and interrupt the traversal if one of them
+does not satisfy a criterion:
+
+```c++
+  WalkResult result = getFunction().walk([&](AllocOp allocOp) {
+    if (!isValid(allocOp))
+      return WalkResult::interrupt();
+    return WalkResult::advance();
+  });
+  if (result.wasInterrupted())
+    // One alloc wasn't matching.
+    ...
+```
+
+## Traversing the def-use chains
+
+Another relationship in the IR is the one that links a `Value` with its users.
+As defined in the
+[language reference](https://mlir.llvm.org/docs/LangRef/#high-level-structure),
+each Value is either a `BlockArgument` or the result of exactly one `Operation`
+(an `Operation` can have multiple results, each of them being a separate
+`Value`). The users of a `Value` are `Operation`s, through their arguments: each
+`Operation` argument references a single `Value`.
+
+Here is a code sample that inspects the operands of an `Operation` and prints
+some information about them:
+
+```c++
+  // Print information about the producer of each of the operands.
+  for (Value operand : op->getOperands()) {
+    if (Operation *producer = operand.getDefiningOp()) {
+      llvm::outs() << "  - Operand produced by operation '"
+                   << producer->getName() << "'\n";
+    } else {
+      // If there is no defining op, the Value is necessarily a Block
+      // argument.
+      auto blockArg = operand.cast<BlockArgument>();
+      llvm::outs() << "  - Operand produced by Block argument, number "
+                   << blockArg.getArgNumber() << "\n";
+    }
+  }
+```
+
+Similarly, the following code sample iterates through the result `Value`s
+produced by an `Operation` and, for each result, iterates over the users of
+that result and prints information about them:
+
+```c++
+  // Print information about the users of each of the results.
+  llvm::outs() << "Has " << op->getNumResults() << " results:\n";
+  for (auto indexedResult : llvm::enumerate(op->getResults())) {
+    Value result = indexedResult.value();
+    llvm::outs() << "  - Result " << indexedResult.index();
+    if (result.use_empty()) {
+      llvm::outs() << " has no uses\n";
+      continue;
+    }
+    if (result.hasOneUse()) {
+      llvm::outs() << " has a single use: ";
+    } else {
+      llvm::outs() << " has "
+                   << std::distance(result.getUses().begin(),
+                                    result.getUses().end())
+                   << " uses:\n";
+    }
+    for (Operation *userOp : result.getUsers()) {
+      llvm::outs() << "    - " << userOp->getName() << "\n";
+    }
+  }
+```
+
+The code illustrating this pass is available
+[here in the repo](https://github.com/llvm/llvm-project/blob/master/mlir/test/lib/IR/TestPrintDefUse.cpp)
+and can be exercised with `mlir-opt -test-print-defuse`.
+
+The chaining of `Value`s and their uses can be viewed as follows:
+
+![Def-Use Chains](/includes/img/DefUseChains.svg)
+
+The uses of a `Value` (`OpOperand` or `BlockOperand`) are also chained in a
+doubly linked list, which is particularly useful when replacing all uses of a
+`Value` with a new one ("RAUW"):
+
+![Use List](/includes/img/Use-list.svg)
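+For instance, `Value::replaceAllUsesWith()` performs RAUW by walking this
+use-list. A minimal sketch, shown next to the equivalent manual loop (the loop
+is illustrative only; the single call is the idiomatic form):
+
+```c++
+#include "mlir/IR/Value.h"
+#include "llvm/ADT/STLExtras.h"
+
+void replaceAll(mlir::Value from, mlir::Value to) {
+  // Idiomatic form: one call redirects every use of `from` to `to`.
+  from.replaceAllUsesWith(to);
+}
+
+void replaceAllManually(mlir::Value from, mlir::Value to) {
+  // Equivalent loop over the use-list. `make_early_inc_range` is needed
+  // because redirecting an OpOperand unlinks it from the list being iterated.
+  for (mlir::OpOperand &use : llvm::make_early_inc_range(from.getUses()))
+    use.set(to);
+}
+```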
diff --git a/mlir/docs/includes/img/DefUseChains.svg b/mlir/docs/includes/img/DefUseChains.svg
new file mode 100644
index 0000000000000..2d5b75246772a
--- /dev/null
+++ b/mlir/docs/includes/img/DefUseChains.svg
@@ -0,0 +1 @@
+ \ No newline at end of file
diff --git a/mlir/docs/includes/img/Use-list.svg b/mlir/docs/includes/img/Use-list.svg
new file mode 100644
index 0000000000000..4840619f06741
--- /dev/null
+++ b/mlir/docs/includes/img/Use-list.svg
@@ -0,0 +1 @@
+ \ No newline at end of file
diff --git a/mlir/examples/standalone/include/Standalone/StandaloneDialect.h b/mlir/examples/standalone/include/Standalone/StandaloneDialect.h
index ac1ac86a178e4..d3eb24cc308df 100644
--- a/mlir/examples/standalone/include/Standalone/StandaloneDialect.h
+++ b/mlir/examples/standalone/include/Standalone/StandaloneDialect.h
@@ -11,12 +11,6 @@
#include "mlir/IR/Dialect.h"

-namespace mlir {
-namespace standalone {
-
#include "Standalone/StandaloneOpsDialect.h.inc"

-} // namespace standalone
-} // namespace mlir
-
#endif // STANDALONE_STANDALONEDIALECT_H
diff --git a/mlir/examples/standalone/include/Standalone/StandaloneDialect.td b/mlir/examples/standalone/include/Standalone/StandaloneDialect.td
index 403a83a712b15..a7fd789376e22 100644
--- a/mlir/examples/standalone/include/Standalone/StandaloneDialect.td
+++ b/mlir/examples/standalone/include/Standalone/StandaloneDialect.td
@@ -23,7 +23,7 @@ def Standalone_Dialect : Dialect {
    illustrate the basic setup required to develop MLIR-based tools without
    working inside of the LLVM source tree.
}]; - let cppNamespace = "standalone"; + let cppNamespace = "::mlir::standalone"; } //===----------------------------------------------------------------------===// diff --git a/mlir/examples/standalone/include/Standalone/StandaloneOps.h b/mlir/examples/standalone/include/Standalone/StandaloneOps.h index 18b02aff856de..5a8c5d1040e62 100644 --- a/mlir/examples/standalone/include/Standalone/StandaloneOps.h +++ b/mlir/examples/standalone/include/Standalone/StandaloneOps.h @@ -13,13 +13,7 @@ #include "mlir/IR/OpDefinition.h" #include "mlir/Interfaces/SideEffectInterfaces.h" -namespace mlir { -namespace standalone { - #define GET_OP_CLASSES #include "Standalone/StandaloneOps.h.inc" -} // namespace standalone -} // namespace mlir - #endif // STANDALONE_STANDALONEOPS_H diff --git a/mlir/examples/standalone/lib/Standalone/StandaloneOps.cpp b/mlir/examples/standalone/lib/Standalone/StandaloneOps.cpp index f15bf02b36af7..497eb98705d83 100644 --- a/mlir/examples/standalone/lib/Standalone/StandaloneOps.cpp +++ b/mlir/examples/standalone/lib/Standalone/StandaloneOps.cpp @@ -10,9 +10,5 @@ #include "Standalone/StandaloneDialect.h" #include "mlir/IR/OpImplementation.h" -namespace mlir { -namespace standalone { #define GET_OP_CLASSES #include "Standalone/StandaloneOps.cpp.inc" -} // namespace standalone -} // namespace mlir diff --git a/mlir/examples/toy/Ch2/include/toy/Dialect.h b/mlir/examples/toy/Ch2/include/toy/Dialect.h index 4ddc63c2b4dc8..8bcad903c5387 100644 --- a/mlir/examples/toy/Ch2/include/toy/Dialect.h +++ b/mlir/examples/toy/Ch2/include/toy/Dialect.h @@ -34,12 +34,12 @@ class ToyDialect : public mlir::Dialect { static llvm::StringRef getDialectNamespace() { return "toy"; } }; +} // end namespace toy +} // end namespace mlir + /// Include the auto-generated header file containing the declarations of the /// toy operations. #define GET_OP_CLASSES #include "toy/Ops.h.inc" -} // end namespace toy -} // end namespace mlir - #endif // MLIR_TUTORIAL_TOY_DIALECT_H_ diff --git a/mlir/examples/toy/Ch2/include/toy/Ops.td b/mlir/examples/toy/Ch2/include/toy/Ops.td index 4a56edb57b3ec..db01e226384b1 100644 --- a/mlir/examples/toy/Ch2/include/toy/Ops.td +++ b/mlir/examples/toy/Ch2/include/toy/Ops.td @@ -20,7 +20,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" // can define our operations. def Toy_Dialect : Dialect { let name = "toy"; - let cppNamespace = "toy"; + let cppNamespace = "::mlir::toy"; } // Base class for toy dialect operations. This operation inherits from the base diff --git a/mlir/examples/toy/Ch3/include/toy/Dialect.h b/mlir/examples/toy/Ch3/include/toy/Dialect.h index 4ddc63c2b4dc8..8bcad903c5387 100644 --- a/mlir/examples/toy/Ch3/include/toy/Dialect.h +++ b/mlir/examples/toy/Ch3/include/toy/Dialect.h @@ -34,12 +34,12 @@ class ToyDialect : public mlir::Dialect { static llvm::StringRef getDialectNamespace() { return "toy"; } }; +} // end namespace toy +} // end namespace mlir + /// Include the auto-generated header file containing the declarations of the /// toy operations. #define GET_OP_CLASSES #include "toy/Ops.h.inc" -} // end namespace toy -} // end namespace mlir - #endif // MLIR_TUTORIAL_TOY_DIALECT_H_ diff --git a/mlir/examples/toy/Ch3/include/toy/Ops.td b/mlir/examples/toy/Ch3/include/toy/Ops.td index f7320ebc1d12d..d889b81bef0a4 100644 --- a/mlir/examples/toy/Ch3/include/toy/Ops.td +++ b/mlir/examples/toy/Ch3/include/toy/Ops.td @@ -19,7 +19,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" // can define our operations. 
def Toy_Dialect : Dialect { let name = "toy"; - let cppNamespace = "toy"; + let cppNamespace = "::mlir::toy"; } // Base class for toy dialect operations. This operation inherits from the base diff --git a/mlir/examples/toy/Ch4/include/toy/Dialect.h b/mlir/examples/toy/Ch4/include/toy/Dialect.h index b1a38ec60a0cf..0853347408925 100644 --- a/mlir/examples/toy/Ch4/include/toy/Dialect.h +++ b/mlir/examples/toy/Ch4/include/toy/Dialect.h @@ -36,12 +36,12 @@ class ToyDialect : public mlir::Dialect { static llvm::StringRef getDialectNamespace() { return "toy"; } }; +} // end namespace toy +} // end namespace mlir + /// Include the auto-generated header file containing the declarations of the /// toy operations. #define GET_OP_CLASSES #include "toy/Ops.h.inc" -} // end namespace toy -} // end namespace mlir - #endif // MLIR_TUTORIAL_TOY_DIALECT_H_ diff --git a/mlir/examples/toy/Ch4/include/toy/Ops.td b/mlir/examples/toy/Ch4/include/toy/Ops.td index 48c08a6a9369c..2ce4692e63f28 100644 --- a/mlir/examples/toy/Ch4/include/toy/Ops.td +++ b/mlir/examples/toy/Ch4/include/toy/Ops.td @@ -21,7 +21,7 @@ include "toy/ShapeInferenceInterface.td" // can define our operations. def Toy_Dialect : Dialect { let name = "toy"; - let cppNamespace = "toy"; + let cppNamespace = "::mlir::toy"; } // Base class for toy dialect operations. This operation inherits from the base diff --git a/mlir/examples/toy/Ch5/include/toy/Dialect.h b/mlir/examples/toy/Ch5/include/toy/Dialect.h index b1a38ec60a0cf..0853347408925 100644 --- a/mlir/examples/toy/Ch5/include/toy/Dialect.h +++ b/mlir/examples/toy/Ch5/include/toy/Dialect.h @@ -36,12 +36,12 @@ class ToyDialect : public mlir::Dialect { static llvm::StringRef getDialectNamespace() { return "toy"; } }; +} // end namespace toy +} // end namespace mlir + /// Include the auto-generated header file containing the declarations of the /// toy operations. #define GET_OP_CLASSES #include "toy/Ops.h.inc" -} // end namespace toy -} // end namespace mlir - #endif // MLIR_TUTORIAL_TOY_DIALECT_H_ diff --git a/mlir/examples/toy/Ch5/include/toy/Ops.td b/mlir/examples/toy/Ch5/include/toy/Ops.td index 210513f22fec1..2a746bb2d800a 100644 --- a/mlir/examples/toy/Ch5/include/toy/Ops.td +++ b/mlir/examples/toy/Ch5/include/toy/Ops.td @@ -21,7 +21,7 @@ include "toy/ShapeInferenceInterface.td" // can define our operations. def Toy_Dialect : Dialect { let name = "toy"; - let cppNamespace = "toy"; + let cppNamespace = "::mlir::toy"; } // Base class for toy dialect operations. This operation inherits from the base diff --git a/mlir/examples/toy/Ch6/include/toy/Dialect.h b/mlir/examples/toy/Ch6/include/toy/Dialect.h index b1a38ec60a0cf..0853347408925 100644 --- a/mlir/examples/toy/Ch6/include/toy/Dialect.h +++ b/mlir/examples/toy/Ch6/include/toy/Dialect.h @@ -36,12 +36,12 @@ class ToyDialect : public mlir::Dialect { static llvm::StringRef getDialectNamespace() { return "toy"; } }; +} // end namespace toy +} // end namespace mlir + /// Include the auto-generated header file containing the declarations of the /// toy operations. #define GET_OP_CLASSES #include "toy/Ops.h.inc" -} // end namespace toy -} // end namespace mlir - #endif // MLIR_TUTORIAL_TOY_DIALECT_H_ diff --git a/mlir/examples/toy/Ch6/include/toy/Ops.td b/mlir/examples/toy/Ch6/include/toy/Ops.td index a92f597fd178b..d9a612d00fe9c 100644 --- a/mlir/examples/toy/Ch6/include/toy/Ops.td +++ b/mlir/examples/toy/Ch6/include/toy/Ops.td @@ -21,7 +21,7 @@ include "toy/ShapeInferenceInterface.td" // can define our operations. 
def Toy_Dialect : Dialect {
  let name = "toy";
-  let cppNamespace = "toy";
+  let cppNamespace = "::mlir::toy";
}

// Base class for toy dialect operations. This operation inherits from the base
diff --git a/mlir/examples/toy/Ch7/include/toy/Dialect.h b/mlir/examples/toy/Ch7/include/toy/Dialect.h
index 4eceb422efa63..fb2927834779b 100644
--- a/mlir/examples/toy/Ch7/include/toy/Dialect.h
+++ b/mlir/examples/toy/Ch7/include/toy/Dialect.h
@@ -50,6 +50,9 @@ class ToyDialect : public mlir::Dialect {
  static llvm::StringRef getDialectNamespace() { return "toy"; }
};

+} // end namespace toy
+} // end namespace mlir
+
//===----------------------------------------------------------------------===//
// Toy Operations
//===----------------------------------------------------------------------===//
@@ -59,6 +62,9 @@ class ToyDialect : public mlir::Dialect {
#define GET_OP_CLASSES
#include "toy/Ops.h.inc"

+namespace mlir {
+namespace toy {
+
//===----------------------------------------------------------------------===//
// Toy Types
//===----------------------------------------------------------------------===//
diff --git a/mlir/examples/toy/Ch7/include/toy/Ops.td b/mlir/examples/toy/Ch7/include/toy/Ops.td
index ab0cf9dbb0ff6..dc9472c569a9f 100644
--- a/mlir/examples/toy/Ch7/include/toy/Ops.td
+++ b/mlir/examples/toy/Ch7/include/toy/Ops.td
@@ -21,7 +21,7 @@ include "toy/ShapeInferenceInterface.td"
// can define our operations.
def Toy_Dialect : Dialect {
  let name = "toy";
-  let cppNamespace = "toy";
+  let cppNamespace = "::mlir::toy";
}

// Base class for toy dialect operations. This operation inherits from the base
diff --git a/mlir/include/mlir-c/AffineMap.h b/mlir/include/mlir-c/AffineMap.h
index bef13fd0bfa84..a5d99185eaf40 100644
--- a/mlir/include/mlir-c/AffineMap.h
+++ b/mlir/include/mlir-c/AffineMap.h
@@ -18,6 +18,116 @@ extern "C" {

DEFINE_C_API_STRUCT(MlirAffineMap, const void);

+/** Gets the context that the given affine map was created with. */
+MlirContext mlirAffineMapGetContext(MlirAffineMap affineMap);
+
+/** Checks whether an affine map is null. */
+inline int mlirAffineMapIsNull(MlirAffineMap affineMap) {
+  return !affineMap.ptr;
+}
+
+/** Checks if two affine maps are equal. */
+int mlirAffineMapEqual(MlirAffineMap a1, MlirAffineMap a2);
+
+/** Prints an affine map by sending chunks of the string representation and
+ * forwarding `userData` to `callback`. Note that the callback may be called
+ * several times with consecutive chunks of the string. */
+void mlirAffineMapPrint(MlirAffineMap affineMap, MlirStringCallback callback,
+                        void *userData);
+
+/** Prints the affine map to the standard error stream. */
+void mlirAffineMapDump(MlirAffineMap affineMap);
+
+/** Creates a zero result affine map with no dimensions or symbols in the
+ * context. The affine map is owned by the context. */
+MlirAffineMap mlirAffineMapEmptyGet(MlirContext ctx);
+
+/** Creates a zero result affine map of the given dimensions and symbols in the
+ * context. The affine map is owned by the context. */
+MlirAffineMap mlirAffineMapGet(MlirContext ctx, intptr_t dimCount,
+                               intptr_t symbolCount);
+
+/** Creates a single constant result affine map in the context. The affine map
+ * is owned by the context. */
+MlirAffineMap mlirAffineMapConstantGet(MlirContext ctx, int64_t val);
+
+/** Creates an identity affine map with 'numDims' dimensions in the context.
+ * The affine map is owned by the context.
*/
+MlirAffineMap mlirAffineMapMultiDimIdentityGet(MlirContext ctx,
+                                               intptr_t numDims);
+
+/** Creates an identity affine map on the most minor dimensions in the context.
+ * The affine map is owned by the context. The function asserts that the number
+ * of dimensions is greater than or equal to the number of results. */
+MlirAffineMap mlirAffineMapMinorIdentityGet(MlirContext ctx, intptr_t dims,
+                                            intptr_t results);
+
+/** Creates an affine map with a permutation expression and its size in the
+ * context. The permutation expression is a non-empty vector of integers.
+ * The elements of the permutation vector must be contiguous from 0 and cannot
+ * be repeated (e.g. `[1,2,0]` is a valid permutation; `[2,0]` or `[1,1,2]`
+ * are invalid permutations). The affine map is owned by the context. */
+MlirAffineMap mlirAffineMapPermutationGet(MlirContext ctx, intptr_t size,
+                                          unsigned *permutation);
+
+/** Checks whether the given affine map is an identity affine map. The function
+ * asserts that the number of dimensions is greater than or equal to the number
+ * of results. */
+int mlirAffineMapIsIdentity(MlirAffineMap affineMap);
+
+/** Checks whether the given affine map is a minor identity affine map. */
+int mlirAffineMapIsMinorIdentity(MlirAffineMap affineMap);
+
+/** Checks whether the given affine map is an empty affine map. */
+int mlirAffineMapIsEmpty(MlirAffineMap affineMap);
+
+/** Checks whether the given affine map is a single result constant affine
+ * map. */
+int mlirAffineMapIsSingleConstant(MlirAffineMap affineMap);
+
+/** Returns the constant result of the given affine map. The function asserts
+ * that the map has a single constant result. */
+int64_t mlirAffineMapGetSingleConstantResult(MlirAffineMap affineMap);
+
+/** Returns the number of dimensions of the given affine map. */
+intptr_t mlirAffineMapGetNumDims(MlirAffineMap affineMap);
+
+/** Returns the number of symbols of the given affine map. */
+intptr_t mlirAffineMapGetNumSymbols(MlirAffineMap affineMap);
+
+/** Returns the number of results of the given affine map. */
+intptr_t mlirAffineMapGetNumResults(MlirAffineMap affineMap);
+
+/** Returns the number of inputs (dimensions + symbols) of the given affine
+ * map. */
+intptr_t mlirAffineMapGetNumInputs(MlirAffineMap affineMap);
+
+/** Checks whether the given affine map represents a subset of a symbol-less
+ * permutation map. */
+int mlirAffineMapIsProjectedPermutation(MlirAffineMap affineMap);
+
+/** Checks whether the given affine map represents a symbol-less permutation
+ * map. */
+int mlirAffineMapIsPermutation(MlirAffineMap affineMap);
+
+/** Returns the affine map consisting of the `resultPos` subset. */
+MlirAffineMap mlirAffineMapGetSubMap(MlirAffineMap affineMap, intptr_t size,
+                                     intptr_t *resultPos);
+
+/** Returns the affine map consisting of the most major `numResults` results.
+ * Returns the null AffineMap if `numResults` is equal to zero.
+ * Returns the `affineMap` if `numResults` is greater than or equal to the
+ * number of results of the given affine map. */
+MlirAffineMap mlirAffineMapGetMajorSubMap(MlirAffineMap affineMap,
+                                          intptr_t numResults);
+
+/** Returns the affine map consisting of the most minor `numResults` results.
+ * Returns the null AffineMap if `numResults` is equal to zero.
+ * Returns the `affineMap` if `numResults` is greater than or equal to the
+ * number of results of the given affine map.
*/
+MlirAffineMap mlirAffineMapGetMinorSubMap(MlirAffineMap affineMap,
+                                          intptr_t numResults);
+
#ifdef __cplusplus
}
#endif
diff --git a/mlir/include/mlir-c/StandardAttributes.h b/mlir/include/mlir-c/StandardAttributes.h
index ab8d837aeeb8b..2ea2ba7a2d4fa 100644
--- a/mlir/include/mlir-c/StandardAttributes.h
+++ b/mlir/include/mlir-c/StandardAttributes.h
@@ -16,6 +16,7 @@
#include "mlir-c/AffineMap.h"
#include "mlir-c/IR.h"
+#include "mlir-c/Support.h"

#ifdef __cplusplus
extern "C" {
@@ -152,13 +153,9 @@ MlirAttribute mlirOpaqueAttrGet(MlirContext ctx, const char *dialectNamespace,
 * is associated. The namespace string is owned by the context. */
const char *mlirOpaqueAttrGetDialectNamespace(MlirAttribute attr);

-/** Calls the provided callback with the opaque byte data stored in the given
- * opaque attribute. The callback is invoked once, and the data it receives is
- * not necessarily null terminated. The data remains live as long as the context
- * in which the attribute lives. */
-/* TODO: consider exposing StringRef and using it instead of the callback. */
-void mlirOpaqueAttrGetData(MlirAttribute attr, MlirStringCallback callback,
-                           void *userData);
+/** Returns the raw data as a string reference. The data remains live as long as
+ * the context in which the attribute lives. */
+MlirStringRef mlirOpaqueAttrGetData(MlirAttribute attr);

/*============================================================================*/
/* String attribute. */
/*============================================================================*/
@@ -178,13 +175,9 @@ MlirAttribute mlirStringAttrGet(MlirContext ctx, intptr_t length,
MlirAttribute mlirStringAttrTypedGet(MlirType type, intptr_t length,
                                     const char *data);

-/** Calls the provided callback with the string stored in the given string
- * attribute. The callback is invoked once, and the data it receives is not
- * necessarily null terminated. The data remains live as long as the context in
- * which the attribute lives. */
-/* TODO: consider exposing StringRef and using it instead of the callback. */
-void mlirStringAttrGetValue(MlirAttribute attr, MlirStringCallback callback,
-                            void *userData);
+/** Returns the attribute value as a string reference. The data remains live as
+ * long as the context in which the attribute lives. */
+MlirStringRef mlirStringAttrGetValue(MlirAttribute attr);

/*============================================================================*/
/* SymbolRef attribute. */
/*============================================================================*/
@@ -201,23 +194,13 @@ MlirAttribute mlirSymbolRefAttrGet(MlirContext ctx, intptr_t length,
                                   const char *symbol, intptr_t numReferences,
                                   MlirAttribute *references);

-/** Calls the provided callback with the string containing the root referenced
- * symbol. The callback is invoked once, and the data it receives is not
- * necessarily null terminated. The data remains live as long as the context in
- * which the attribute lives. */
-/* TODO: consider exposing StringRef and using it instead of the callback. */
-void mlirSymbolRefAttrGetRootReference(MlirAttribute attr,
-                                       MlirStringCallback callback,
-                                       void *userData);
-
-/** Calls the provided callback with the string containing the leaf referenced
- * symbol. The callback is invoked once, and the data it receives is not
- * necessarily null terminated. The data remains live as long as the context in
- * which the attribute lives. */
-/* TODO: consider exposing StringRef and using it instead of the callback. */
-void mlirSymbolRefAttrGetLeafReference(MlirAttribute attr,
-                                       MlirStringCallback callback,
-                                       void *userData);
+/** Returns the string reference to the root referenced symbol.
The data remains + * live as long as the context in which the attribute lives. */ +MlirStringRef mlirSymbolRefAttrGetRootReference(MlirAttribute attr); + +/** Returns the stirng reference to the leaf referenced symbol. The data remains + * live as long as the context in which the attribute lives. */ +MlirStringRef mlirSymbolRefAttrGetLeafReference(MlirAttribute attr); /** Returns the number of references nested in the given symbol reference * attribute. */ @@ -240,14 +223,9 @@ int mlirAttributeIsAFlatSymbolRef(MlirAttribute attr); MlirAttribute mlirFlatSymbolRefAttrGet(MlirContext ctx, intptr_t length, const char *symbol); -/** Calls the provided callback with the string containing the referenced - * symbol. The callback is invoked once, and the data it receives is not - * necessarily null terminated. The data remains live as long as the context in - * which the attribute lives. */ -/* TODO: consider exposing StringRef and using it instead of the callback. */ -void mlirFloatSymbolRefAttrGetValue(MlirAttribute attr, - MlirStringCallback callback, - void *userData); +/** Returns the referenced symbol as a string reference. The data remains live + * as long as the context in which the attribute lives. */ +MlirStringRef mlirFlatSymbolRefAttrGetValue(MlirAttribute attr); /*============================================================================*/ /* Type attribute. */ @@ -383,10 +361,7 @@ int64_t mlirDenseElementsAttrGetInt64SplatValue(MlirAttribute attr); uint64_t mlirDenseElementsAttrGetUInt64SplatValue(MlirAttribute attr); float mlirDenseElementsAttrGetFloatSplatValue(MlirAttribute attr); double mlirDenseElementsAttrGetDoubleSplatValue(MlirAttribute attr); -/* TODO: consider exposing StringRef and using it instead of the callback. */ -void mlirDenseElementsAttrGetStringSplatValue(MlirAttribute attr, - MlirStringCallback callback, - void *userData); +MlirStringRef mlirDenseElementsAttrGetStringSplatValue(MlirAttribute attr); /** Returns the pos-th value (flat contiguous indexing) of a specific type * contained by the given dense elements attribute. */ @@ -397,10 +372,8 @@ int64_t mlirDenseElementsAttrGetInt64Value(MlirAttribute attr, intptr_t pos); uint64_t mlirDenseElementsAttrGetUInt64Value(MlirAttribute attr, intptr_t pos); float mlirDenseElementsAttrGetFloatValue(MlirAttribute attr, intptr_t pos); double mlirDenseElementsAttrGetDoubleValue(MlirAttribute attr, intptr_t pos); -/* TODO: consider exposing StringRef and using it instead of the callback. */ -void mlirDenseElementsAttrGetStringValue(MlirAttribute attr, intptr_t pos, - MlirStringCallback callback, - void *userData); +MlirStringRef mlirDenseElementsAttrGetStringValue(MlirAttribute attr, + intptr_t pos); /*============================================================================*/ /* Opaque elements attribute. */ diff --git a/mlir/include/mlir-c/Support.h b/mlir/include/mlir-c/Support.h new file mode 100644 index 0000000000000..1039c68c09bf0 --- /dev/null +++ b/mlir/include/mlir-c/Support.h @@ -0,0 +1,57 @@ +/*===-- mlir-c/Support.h - Helpers for C API to Core MLIR ---------*- C -*-===*\ +|* *| +|* Part of the LLVM Project, under the Apache License v2.0 with LLVM *| +|* Exceptions. *| +|* See https://llvm.org/LICENSE.txt for license information. *| +|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception *| +|* *| +|*===----------------------------------------------------------------------===*| +|* *| +|* This header declares the auxiliary data structures used in C APIs to core *| +|* MLIR functionality. 
*| +|* *| +\*===----------------------------------------------------------------------===*/ + +#ifndef MLIR_C_SUPPORT_H +#define MLIR_C_SUPPORT_H + +#include <stddef.h> +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================*/ +/* MlirStringRef. */ +/*============================================================================*/ + +/** A pointer to a sized fragment of a string, not necessarily null-terminated. + * Does not own the underlying string. This is equivalent to llvm::StringRef. + */ +struct MlirStringRef { + const char *data; /**< Pointer to the first character. */ + size_t length; /**< Length of the fragment. */ +}; +typedef struct MlirStringRef MlirStringRef; + +/** Constructs a string reference from the pointer and length. The pointer need + * not point to a null-terminated string. + */ +inline MlirStringRef mlirStringRefCreate(const char *str, size_t length) { + MlirStringRef result; + result.data = str; + result.length = length; + return result; +} + +/** Constructs a string reference from a null-terminated C string. Prefer + * mlirStringRefCreate if the length of the string is known. + */ +MlirStringRef mlirStringRefCreateFromCString(const char *str); + +#ifdef __cplusplus +} +#endif + +#endif // MLIR_C_SUPPORT_H diff --git a/mlir/include/mlir/Analysis/AffineStructures.h b/mlir/include/mlir/Analysis/AffineStructures.h index e7b10c37825bd..d64a24e713d13 100644 --- a/mlir/include/mlir/Analysis/AffineStructures.h +++ b/mlir/include/mlir/Analysis/AffineStructures.h @@ -307,6 +307,9 @@ class FlatAffineConstraints { /// otherwise. bool containsId(Value id) const; + /// Swap the posA^th identifier with the posB^th identifier. + void swapId(unsigned posA, unsigned posB); + // Add identifiers of the specified kind - specified positions are relative to // the kind of identifier. The coefficient column corresponding to the added // identifier is initialized to zero. 'id' is the Value corresponding to the diff --git a/mlir/include/mlir/CAPI/Support.h b/mlir/include/mlir/CAPI/Support.h new file mode 100644 index 0000000000000..0c2b069906657 --- /dev/null +++ b/mlir/include/mlir/CAPI/Support.h @@ -0,0 +1,31 @@ +//===- Support.h - C API Helpers Implementation -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains definitions for converting MLIR C++ objects into helper +// C structures for the purposes of the C API. This file should not be included +// from C++ code other than the C API implementation, nor from C code. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CAPI_SUPPORT_H +#define MLIR_CAPI_SUPPORT_H + +#include "mlir-c/Support.h" +#include "llvm/ADT/StringRef.h" + +/// Converts a StringRef into its MLIR C API equivalent. +inline MlirStringRef wrap(llvm::StringRef ref) { + return mlirStringRefCreate(ref.data(), ref.size()); +} + +/// Creates a StringRef out of its MLIR C API equivalent.
+inline llvm::StringRef unwrap(MlirStringRef ref) { + return llvm::StringRef(ref.data, ref.length); +} + +#endif // MLIR_CAPI_SUPPORT_H diff --git a/mlir/include/mlir/CAPI/Utils.h b/mlir/include/mlir/CAPI/Utils.h new file mode 100644 index 0000000000000..022f09df6a5de --- /dev/null +++ b/mlir/include/mlir/CAPI/Utils.h @@ -0,0 +1,48 @@ +//===- Utils.h - C API General Utilities ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines general utilities for the C API. This file should not be +// included from C++ code other than the C API implementation, nor from C code. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CAPI_UTILS_H +#define MLIR_CAPI_UTILS_H + +#include "llvm/Support/raw_ostream.h" + +/* ========================================================================== */ +/* Printing helper. */ +/* ========================================================================== */ + +namespace mlir { +namespace detail { +/// A simple raw ostream subclass that forwards write_impl calls to the +/// user-supplied callback together with opaque user-supplied data. +class CallbackOstream : public llvm::raw_ostream { +public: + CallbackOstream(std::function<void(const char *, intptr_t, void *)> callback, + void *opaqueData) + : callback(callback), opaqueData(opaqueData), pos(0u) {} + + void write_impl(const char *ptr, size_t size) override { + callback(ptr, size, opaqueData); + pos += size; + } + + uint64_t current_pos() const override { return pos; } + +private: + std::function<void(const char *, intptr_t, void *)> callback; + void *opaqueData; + uint64_t pos; +}; +} // end namespace detail +} // end namespace mlir + +#endif // MLIR_CAPI_UTILS_H diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index d4b478dbf4ed0..dae59c9e792e0 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -350,7 +350,7 @@ def ConvertVectorToLLVM : Pass<"convert-vector-to-llvm", "ModuleOp"> { "bool", /*default=*/"false", "Allows llvm to reassociate floating-point reductions for speed">, Option<"enableIndexOptimizations", "enable-index-optimizations", - "bool", /*default=*/"false", + "bool", /*default=*/"true", "Allows compiler to assume indices fit in 32-bit if that yields faster code"> ]; } diff --git a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h index 63ffd78373825..ab047a08f404c 100644 --- a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h +++ b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h @@ -34,6 +34,7 @@ class UnrankedMemRefType; namespace LLVM { class LLVMDialect; class LLVMType; +class LLVMPointerType; } // namespace LLVM /// Callback to convert function argument types. It converts a MemRef function @@ -281,8 +282,8 @@ class MemRefDescriptor : public StructBuilder { void setConstantStride(OpBuilder &builder, Location loc, unsigned pos, uint64_t stride); - /// Returns the (LLVM) type this descriptor points to. - LLVM::LLVMType getElementType(); + /// Returns the (LLVM) pointer type this descriptor contains.
+ LLVM::LLVMPointerType getElementPtrType(); /// Builds IR populating a MemRef descriptor structure from a list of /// individual values composing that descriptor, in the following order: diff --git a/mlir/include/mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h b/mlir/include/mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h index 81ffa63281357..1a6fe7d166d05 100644 --- a/mlir/include/mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h +++ b/mlir/include/mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h @@ -22,7 +22,7 @@ class OperationPass; /// ConvertVectorToLLVM pass in include/mlir/Conversion/Passes.td struct LowerVectorToLLVMOptions { bool reassociateFPReductions = false; - bool enableIndexOptimizations = false; + bool enableIndexOptimizations = true; LowerVectorToLLVMOptions &setReassociateFPReductions(bool b) { reassociateFPReductions = b; return *this; @@ -42,8 +42,7 @@ void populateVectorToLLVMMatrixConversionPatterns( /// Collect a set of patterns to convert from the Vector dialect to LLVM. void populateVectorToLLVMConversionPatterns( LLVMTypeConverter &converter, OwningRewritePatternList &patterns, - bool reassociateFPReductions = false, - bool enableIndexOptimizations = false); + bool reassociateFPReductions = false, bool enableIndexOptimizations = true); /// Create a pass to convert vector operations to the LLVMIR dialect. std::unique_ptr<OperationPass<ModuleOp>> createConvertVectorToLLVMPass( diff --git a/mlir/include/mlir/Dialect/AVX512/AVX512.td b/mlir/include/mlir/Dialect/AVX512/AVX512.td index e1ed35c50e875..eee24ce1d5d54 100644 --- a/mlir/include/mlir/Dialect/AVX512/AVX512.td +++ b/mlir/include/mlir/Dialect/AVX512/AVX512.td @@ -21,7 +21,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" def AVX512_Dialect : Dialect { let name = "avx512"; - let cppNamespace = "avx512"; + let cppNamespace = "::mlir::avx512"; } //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/AVX512/AVX512Dialect.h b/mlir/include/mlir/Dialect/AVX512/AVX512Dialect.h index 544fb7c2a495f..aae3dbdf179fb 100644 --- a/mlir/include/mlir/Dialect/AVX512/AVX512Dialect.h +++ b/mlir/include/mlir/Dialect/AVX512/AVX512Dialect.h @@ -17,15 +17,9 @@ #include "mlir/IR/OpDefinition.h" #include "mlir/Interfaces/SideEffectInterfaces.h" -namespace mlir { -namespace avx512 { +#include "mlir/Dialect/AVX512/AVX512Dialect.h.inc" #define GET_OP_CLASSES #include "mlir/Dialect/AVX512/AVX512.h.inc" -#include "mlir/Dialect/AVX512/AVX512Dialect.h.inc" - -} // namespace avx512 -} // namespace mlir - #endif // MLIR_DIALECT_AVX512_AVX512DIALECT_H_ diff --git a/mlir/include/mlir/Dialect/AVX512/CMakeLists.txt b/mlir/include/mlir/Dialect/AVX512/CMakeLists.txt index bc57372689b28..3c14238be1bbe 100644 --- a/mlir/include/mlir/Dialect/AVX512/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/AVX512/CMakeLists.txt @@ -1 +1,2 @@ -add_mlir_dialect(AVX512 avx512 AVX512) +add_mlir_dialect(AVX512 avx512) +add_mlir_doc(AVX512 -gen-op-doc AVX512 Dialects/) diff --git a/mlir/include/mlir/Dialect/Affine/EDSC/Builders.h b/mlir/include/mlir/Dialect/Affine/EDSC/Builders.h index 96191e01296a5..d99f29f3b5ba9 100644 --- a/mlir/include/mlir/Dialect/Affine/EDSC/Builders.h +++ b/mlir/include/mlir/Dialect/Affine/EDSC/Builders.h @@ -47,6 +47,18 @@ void affineLoopNestBuilder( void affineLoopBuilder(ValueRange lbs, ValueRange ubs, int64_t step, function_ref<void(Value)> bodyBuilderFn = nullptr); +/// Creates a single affine "for" loop, iterating from max(lbs) to min(ubs) with
Uses the OpBuilder and Location stored in ScopedContext and +/// assumes they are non-null. "iterArgs" is used to specify the initial values +/// of the result affine "for" might yield. The optional "bodyBuilderFn" +/// callback is called to construct the body of the loop and is passed the +/// induction variable and the iteration arguments. The function is expected to +/// use the builder and location stored in ScopedContext at the moment of the +/// call. The function will create the affine terminator op in case "iterArgs" +/// is empty and "bodyBuilderFn" is not present. +void affineLoopBuilder( + ValueRange lbs, ValueRange ubs, int64_t step, ValueRange iterArgs, + function_ref bodyBuilderFn = nullptr); namespace op { Value operator+(Value lhs, Value rhs); diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td index 480e1717c5884..88c4a6fda7f4d 100644 --- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td +++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td @@ -174,30 +174,74 @@ def AffineForOp : Affine_Op<"for", return } ``` + `affine.for` can also operate on loop-carried variables and return the final + values after loop termination. The initial values of the variables are + passed as additional SSA operands to the "affine.for" following the 2 loop + control values lower bound, upper bound. The operation region has equivalent + arguments for each variable representing the value of the variable at the + current iteration. + + The region must terminate with an `affine.yield` that passes all the current + iteration variables to the next iteration, or to the `affine.for` result, if + at the last iteration. + + `affine.for` results hold the final values after the last iteration. + For example, to sum-reduce a memref: + + ```mlir + func @reduce(%buffer: memref<1024xf32>) -> (f32) { + // Initial sum set to 0. + %sum_0 = constant 0.0 : f32 + // iter_args binds initial values to the loop's region arguments. + %sum = affine.for %i = 0 to 10 step 2 + iter_args(%sum_iter = %sum_0) -> (f32) { + %t = affine.load %buffer[%i] : memref<1024xf32> + %sum_next = addf %sum_iter, %t : f32 + // Yield current iteration sum to next iteration %sum_iter or to %sum + // if final iteration. + affine.yield %sum_next : f32 + } + return %sum : f32 + } + ``` + If the `affine.for` defines any values, a yield terminator must be + explicitly present. The number and types of the "affine.for" results must + match the initial values in the `iter_args` binding and the yield operands. }]; let arguments = (ins Variadic); + let results = (outs Variadic:$results); let regions = (region SizedRegion<1>:$region); let skipDefaultBuilders = 1; let builders = [ OpBuilder<"OpBuilder &builder, OperationState &result, " "int64_t lowerBound, int64_t upperBound, int64_t step = 1, " - "function_ref bodyBuilder " - " = nullptr">, + "ValueRange iterArgs = llvm::None, function_ref bodyBuilder = nullptr">, OpBuilder<"OpBuilder &builder, OperationState &result, " "ValueRange lbOperands, AffineMap lbMap, " "ValueRange ubOperands, AffineMap ubMap, " - "int64_t step = 1, " - "function_ref bodyBuilder " - " = nullptr"> + "int64_t step = 1, ValueRange iterArgs = llvm::None, " + "function_ref " + "bodyBuilder = nullptr"> ]; let extraClassDeclaration = [{ + /// Defining the function type we use for building the body of affine.for. 
+ using BodyBuilderFn = + function_ref<void(OpBuilder &, Location, Value, ValueRange)>; + static StringRef getStepAttrName() { return "step"; } static StringRef getLowerBoundAttrName() { return "lower_bound"; } static StringRef getUpperBoundAttrName() { return "upper_bound"; } Value getInductionVar() { return getBody()->getArgument(0); } + Block::BlockArgListType getRegionIterArgs() { + return getBody()->getArguments().drop_front(); + } + Operation::operand_range getIterOperands() { + return getOperands().drop_front(getNumControlOperands()); + } // TODO: provide iterators for the lower and upper bound operands // if the current access via getLowerBound(), getUpperBound() is too slow. @@ -251,6 +295,17 @@ def AffineForOp : Affine_Op<"for", IntegerAttr::get(IndexType::get(context), step)); } + /// Returns the number of region arguments for loop-carried values. + unsigned getNumRegionIterArgs() { + return getBody()->getNumArguments() - 1; + } + + /// Number of operands controlling the loop: lb and ub. + unsigned getNumControlOperands() { return getOperation()->getNumOperands() - getNumIterOperands(); } + + /// Get the number of loop-carried values. + unsigned getNumIterOperands(); + /// Returns true if the lower bound is constant. bool hasConstantLowerBound(); /// Returns true if the upper bound is constant. @@ -540,7 +595,7 @@ def AffineMaxOp : AffineMinMaxOpBase<"max", [NoSideEffect]> { }]; } -def AffineParallelOp : Affine_Op<"parallel", +def AffineParallelOp : Affine_Op<"parallel", [ImplicitAffineTerminator, RecursiveSideEffects, DeclareOpInterfaceMethods<LoopLikeOpInterface>]> { let summary = "multi-index parallel band operation"; @@ -569,7 +624,7 @@ def AffineParallelOp : Affine_Op<"parallel", Note: Calling AffineParallelOp::build will create the required region and block, and insert the required terminator if it is trivial (i.e. no values - are yielded). Parsing will also create the required region, block, and + are yielded). Parsing will also create the required region, block, and terminator, even when they are missing from the textual representation. Example (3x3 valid convolution): diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h index db1c3bfead94f..580fbf53ae4f2 100644 --- a/mlir/include/mlir/Dialect/Affine/Passes.h +++ b/mlir/include/mlir/Dialect/Affine/Passes.h @@ -61,7 +61,8 @@ std::unique_ptr<OperationPass<FuncOp>> createLoopTilingPass(); /// and no callback is provided, anything passed from the command-line (if at /// all) or the default unroll factor is used (LoopUnroll::kDefaultUnrollFactor). std::unique_ptr<OperationPass<FuncOp>> createLoopUnrollPass( - int unrollFactor = -1, bool unrollFull = false, + int unrollFactor = -1, bool unrollUpToFactor = false, + bool unrollFull = false, const std::function<unsigned(AffineForOp)> &getUnrollFactor = nullptr); /// Creates a loop unroll jam pass to unroll jam by the specified factor. A
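
To make the new unroll-up-to behavior above concrete, here is a minimal consumer-side sketch. Only the `createLoopUnrollPass` call reflects this patch; the pass-manager plumbing, the include set, and the `addUnrollUpTo` helper name are standard-MLIR assumptions of this revision, not part of the change:

```c++
// Hypothetical usage sketch of the extended createLoopUnrollPass signature.
// FuncOp lived in mlir/IR/Function.h at this revision.
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/IR/Function.h"
#include "mlir/Pass/PassManager.h"

void addUnrollUpTo(mlir::PassManager &pm) {
  // With unrollUpToFactor=true, a loop whose constant trip count is smaller
  // than the requested factor may still be unrolled (up to that factor),
  // per the new unroll-up-to-factor option described in Passes.td below.
  pm.addNestedPass<mlir::FuncOp>(mlir::createLoopUnrollPass(
      /*unrollFactor=*/8, /*unrollUpToFactor=*/true, /*unrollFull=*/false));
}
```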
A diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td index 0e7f3e43661ef..4359ea0fa0a2c 100644 --- a/mlir/include/mlir/Dialect/Affine/Passes.td +++ b/mlir/include/mlir/Dialect/Affine/Passes.td @@ -71,6 +71,8 @@ def AffineLoopUnroll : FunctionPass<"affine-loop-unroll"> { let options = [ Option<"unrollFactor", "unroll-factor", "unsigned", /*default=*/"4", "Use this unroll factor for all loops being unrolled">, + Option<"unrollUpToFactor", "unroll-up-to-factor", "bool", + /*default=*/"false", "Allow unrolling up to the factor specified">, Option<"unrollFull", "unroll-full", "bool", /*default=*/"false", "Fully unroll loops">, Option<"numRepetitions", "unroll-num-reps", "unsigned", /*default=*/"1", diff --git a/mlir/include/mlir/Dialect/GPU/GPUBase.td b/mlir/include/mlir/Dialect/GPU/GPUBase.td index 32e0952a15b41..5641d60b0e285 100644 --- a/mlir/include/mlir/Dialect/GPU/GPUBase.td +++ b/mlir/include/mlir/Dialect/GPU/GPUBase.td @@ -21,6 +21,7 @@ include "mlir/IR/OpBase.td" def GPU_Dialect : Dialect { let name = "gpu"; + let cppNamespace = "::mlir::gpu"; let hasOperationAttrVerify = 1; let extraClassDeclaration = [{ diff --git a/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/GPUDialect.h index 35daee29aa6af..b55b0c8a3396a 100644 --- a/mlir/include/mlir/Dialect/GPU/GPUDialect.h +++ b/mlir/include/mlir/Dialect/GPU/GPUDialect.h @@ -34,12 +34,13 @@ struct KernelDim3 { Value z; }; +} // end namespace gpu +} // end namespace mlir + #include "mlir/Dialect/GPU/GPUOpsDialect.h.inc" #define GET_OP_CLASSES #include "mlir/Dialect/GPU/GPUOps.h.inc" -} // end namespace gpu -} // end namespace mlir #endif // MLIR_DIALECT_GPU_GPUDIALECT_H diff --git a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h index 298ec0c803f0f..8bce2fd0ad2bb 100644 --- a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h +++ b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h @@ -27,8 +27,11 @@ struct LogicalResult; class Operation; class Region; +} // namespace mlir + #include "mlir/Dialect/GPU/ParallelLoopMapperAttr.h.inc" +namespace mlir { namespace scf { class ParallelOp; } diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAVX512.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAVX512.td index 12668c4da41be..fcc90a2a801ed 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAVX512.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAVX512.td @@ -21,7 +21,7 @@ include "mlir/Dialect/LLVMIR/LLVMOpBase.td" def LLVMAVX512_Dialect : Dialect { let name = "llvm_avx512"; - let cppNamespace = "LLVM"; + let cppNamespace = "::mlir::LLVM"; } //----------------------------------------------------------------------------// diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAVX512Dialect.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMAVX512Dialect.h index 27b98fd189107..c028fda514fe0 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAVX512Dialect.h +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAVX512Dialect.h @@ -16,15 +16,9 @@ #include "mlir/IR/Dialect.h" #include "mlir/IR/OpDefinition.h" -namespace mlir { -namespace LLVM { - #define GET_OP_CLASSES #include "mlir/Dialect/LLVMIR/LLVMAVX512.h.inc" #include "mlir/Dialect/LLVMIR/LLVMAVX512Dialect.h.inc" -} // namespace LLVM -} // namespace mlir - #endif // MLIR_DIALECT_LLVMIR_LLVMAVX512DIALECT_H_ diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h index 2f465f07a97e4..5c16f33e9fc06 100644 --- 
a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h @@ -49,18 +49,23 @@ struct LLVMTypeStorage; struct LLVMDialectImpl; } // namespace detail +} // namespace LLVM +} // namespace mlir + ///// Ops ///// #define GET_OP_CLASSES #include "mlir/Dialect/LLVMIR/LLVMOps.h.inc" #include "mlir/Dialect/LLVMIR/LLVMOpsDialect.h.inc" +namespace mlir { +namespace LLVM { /// Create an LLVM global containing the string "value" in the module /// surrounding the insertion point of the builder. Obtain the address of that /// global and use it to compute the address of the first character in the /// string (operations inserted at the builder insertion point). Value createGlobalString(Location loc, OpBuilder &builder, StringRef name, - StringRef value, LLVM::Linkage linkage); + StringRef value, Linkage linkage); /// LLVM requires some operations to be inside of a Module operation. This /// function confirms that the Operation has the desired properties. diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td index 10755a436115f..a6be8ef6d8bae 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td @@ -23,7 +23,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" def LLVM_Dialect : Dialect { let name = "llvm"; - let cppNamespace = "LLVM"; + let cppNamespace = "::mlir::LLVM"; /// FIXME: at the moment this is a dependency of the translation to LLVM IR, /// not really one of this dialect per-se. diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index b5bf4ac779727..626bc4b889892 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -59,7 +59,7 @@ def LLVM_VoidResultTypeOpBuilder : OpBuilder< "OpBuilder &builder, OperationState &result, Type resultType, " "ValueRange operands, ArrayRef<NamedAttribute> attributes = {}", [{ - auto llvmType = resultType.dyn_cast<LLVM::LLVMType>(); (void)llvmType; + auto llvmType = resultType.dyn_cast<LLVMType>(); (void)llvmType; assert(llvmType && "result must be an LLVM type"); assert(llvmType.isVoidTy() && "for zero-result operands, only 'void' is accepted as result type"); @@ -301,7 +301,7 @@ def LLVM_LoadOp : "unsigned alignment = 0, bool isVolatile = false, " "bool isNonTemporal = false", [{ - auto type = addr.getType().cast<LLVM::LLVMType>().getPointerElementTy(); + auto type = addr.getType().cast<LLVMType>().getPointerElementTy(); build(b, result, type, addr, alignment, isVolatile, isNonTemporal); }]>, OpBuilder< @@ -494,8 +494,8 @@ def LLVM_ShuffleVectorOp "OpBuilder &b, OperationState &result, Value v1, Value v2, " "ArrayAttr mask, ArrayRef<NamedAttribute> attrs = {}">]; let verifier = [{ - auto wrappedVectorType1 = v1().getType().cast<LLVM::LLVMType>(); - auto wrappedVectorType2 = v2().getType().cast<LLVM::LLVMType>(); + auto wrappedVectorType1 = v1().getType().cast<LLVMType>(); + auto wrappedVectorType2 = v2().getType().cast<LLVMType>(); if (!wrappedVectorType2.isVectorTy()) return emitOpError("expected LLVM IR Dialect vector type for operand #2"); if (wrappedVectorType1.getVectorElementType() != @@ -770,7 +770,7 @@ def LLVM_LLVMFuncOp let builders = [ OpBuilder<"OpBuilder &builder, OperationState &result, StringRef name, " - "LLVMType type, LLVM::Linkage linkage = LLVM::Linkage::External, " + "LLVMType type, Linkage linkage = Linkage::External, " "ArrayRef<NamedAttribute> attrs = {}, " "ArrayRef<MutableDictionaryAttr> argAttrs = {}"> ]; diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h index
9cc5314bdb901..fff82e3b9f4f4 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h @@ -19,16 +19,10 @@ #include "mlir/IR/OpDefinition.h" #include "mlir/Interfaces/SideEffectInterfaces.h" -namespace mlir { -namespace NVVM { - ///// Ops ///// #define GET_OP_CLASSES #include "mlir/Dialect/LLVMIR/NVVMOps.h.inc" #include "mlir/Dialect/LLVMIR/NVVMOpsDialect.h.inc" -} // namespace NVVM -} // namespace mlir - #endif /* MLIR_DIALECT_LLVMIR_NVVMDIALECT_H_ */ diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 7d47e5012ac9a..5f72ad35a6701 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -22,7 +22,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" def NVVM_Dialect : Dialect { let name = "nvvm"; - let cppNamespace = "NVVM"; + let cppNamespace = "::mlir::NVVM"; let dependentDialects = ["LLVM::LLVMDialect"]; } diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h b/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h index eb40373c3f117..b00b8ac0b125a 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h @@ -27,16 +27,10 @@ #include "mlir/IR/OpDefinition.h" #include "mlir/Interfaces/SideEffectInterfaces.h" -namespace mlir { -namespace ROCDL { - ///// Ops ///// #define GET_OP_CLASSES #include "mlir/Dialect/LLVMIR/ROCDLOps.h.inc" #include "mlir/Dialect/LLVMIR/ROCDLOpsDialect.h.inc" -} // namespace ROCDL -} // namespace mlir - #endif /* MLIR_DIALECT_LLVMIR_ROCDLDIALECT_H_ */ diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index f85c4f02899b4..c6d2ded073e63 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -22,7 +22,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" def ROCDL_Dialect : Dialect { let name = "rocdl"; - let cppNamespace = "ROCDL"; + let cppNamespace = "::mlir::ROCDL"; let dependentDialects = ["LLVM::LLVMDialect"]; } diff --git a/mlir/include/mlir/Dialect/Linalg/EDSC/Intrinsics.h b/mlir/include/mlir/Dialect/Linalg/EDSC/Intrinsics.h index 399c49d1e5721..d842069f65705 100644 --- a/mlir/include/mlir/Dialect/Linalg/EDSC/Intrinsics.h +++ b/mlir/include/mlir/Dialect/Linalg/EDSC/Intrinsics.h @@ -20,6 +20,7 @@ using linalg_dot = OperationBuilder<linalg::DotOp>; using linalg_fill = OperationBuilder<linalg::FillOp>; using linalg_matmul = OperationBuilder<linalg::MatmulOp>; using linalg_matvec = OperationBuilder<linalg::MatvecOp>; +using linalg_vecmat = OperationBuilder<linalg::VecmatOp>; using linalg_range = ValueBuilder<linalg::RangeOp>; using linalg_reshape = ValueBuilder<linalg::ReshapeOp>; using linalg_slice = ValueBuilder<linalg::SliceOp>; diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td index 7955345f69668..8ac82b768ad3f 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td @@ -31,6 +31,7 @@ def Linalg_Dialect : Dialect { are also available and should be read first before going into the details of the op semantics. }]; + let cppNamespace = "::mlir::linalg"; } // Whether a type is a RangeType.
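
The `cppNamespace` changes repeated across the dialect TableGen files above all follow one pattern: the namespace is now spelled fully qualified with a leading `::`, so the generated `.h.inc` files open and close `::mlir::NVVM` (and friends) themselves, which is why the hand-written `namespace mlir { namespace NVVM { ... } }` wrappers around the includes are deleted. A minimal consumer-side sketch, using the existing `nvvm.barrier0` op; the `emitBarrier` helper is illustrative, not part of the patch:

```c++
// Sketch: NVVMOps.h.inc now emits its classes inside ::mlir::NVVM on its own
// (driven by cppNamespace), so the dialect header is included at the top
// level rather than inside namespace blocks.
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/IR/Builders.h"

void emitBarrier(mlir::OpBuilder &builder, mlir::Location loc) {
  // Fully qualifying the namespace in TableGen also avoids accidental
  // double-nesting if a generated header is pulled in from another namespace;
  // op class names are unchanged for users of the dialect.
  builder.create<mlir::NVVM::Barrier0Op>(loc);
}
```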
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOpsSpec.tc b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOpsSpec.tc index 27d4330a54d5f..765e045e9e77c 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOpsSpec.tc +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOpsSpec.tc @@ -8,6 +8,11 @@ def matvec(A: f32(M, N), y: f32(N)) -> (x: f32(M)) { x(m) = std_addf(std_mulf(A(m, n), y(n))); } +ods_def<VecmatOp>: +def vecmat(y: f32(M), A: f32(M, N)) -> (x: f32(N)) { + x(n) = std_addf(std_mulf(y(m), A(m, n))); +} + ods_def<DotOp>: def dot(A: f32(M), B: f32(M)) -> (C: f32()) { C() = std_addf(std_mulf(A(m), B(m))); @@ -20,52 +25,50 @@ def batch_matmul(A: f32(Batch, M, K), B: f32(Batch, K, N)) -> (C: f32(Batch, M, ods_def<ConvWOp>: def conv_1d(I: f32(W), K: f32(KW)) -> (O: f32(W)) { - O(w) = std_addf(O(w), std_mulf(I(w + kw), K(kw))); + O(w) = std_addf(std_mulf(I(w + kw), K(kw))); } ods_def<ConvNWCOp>: def conv_1d_nwc(I: f32(N, W, C), K: f32(F, KW, C)) -> (O: f32(N, W, F)) { - O(n, w, f) = std_addf(O(n, w, f), - std_mulf(I(n, w + kw, c), K(f, kw, c))); + O(n, w, f) = std_addf(std_mulf(I(n, w + kw, c), K(f, kw, c))); } ods_def<ConvNCWOp>: def conv_1d_ncw(I: f32(N, C, W), K: f32(F, C, KW)) -> (O: f32(N, F, W)) { - O(n, f, w) = std_addf(O(n, f, w), - std_mulf(I(n, c, w + kw), K(f, c, kw))); + O(n, f, w) = std_addf(std_mulf(I(n, c, w + kw), K(f, c, kw))); } ods_def<ConvHWOp>: def conv_2d(I: f32(H, W), K: f32(KH, KW)) -> (O: f32(H, W)) { - O(h, w) = std_addf(O(h, w), std_mulf(I(h + kh, w + kw), K(kh, kw))); + O(h, w) = std_addf(std_mulf(I(h + kh, w + kw), K(kh, kw))); } ods_def<ConvNHWCOp>: def conv_2d_nhwc(I: f32(N, H, W, C), K: f32(F, KH, KW, C)) -> (O: f32(N, H, W, F)) { - O(n, h, w, f) = std_addf(O(n, h, w, f), - std_mulf(I(n, h + kh, w + kw, c), K(f, kh, kw, c))); + O(n, h, w, f) = std_addf(std_mulf( + I(n, h + kh, w + kw, c), K(f, kh, kw, c))); } ods_def<ConvNCHWOp>: def conv_2d_nchw(I: f32(N, C, H, W), K: f32(F, C, KH, KW)) -> (O: f32(N, F, H, W)) { - O(n, f, h, w) = std_addf(O(n, f, h, w), - std_mulf(I(n, c, h + kh, w + kw), K(f, c, kh, kw))); + O(n, f, h, w) = std_addf(std_mulf( + I(n, c, h + kh, w + kw), K(f, c, kh, kw))); } ods_def<ConvDHWOp>: def conv_3d(I: f32(D, H, W), K: f32(KD, KH, KW)) -> (O: f32(D, H, W)) { - O(d, h, w) = std_addf(O(d, h, w), - std_mulf(I(d + kd, h + kh, w + kw), K(kd, kh, kw))); + O(d, h, w) = std_addf(std_mulf( + I(d + kd, h + kh, w + kw), K(kd, kh, kw))); } ods_def<ConvNDHWCOp>: def conv_3d_ndhwc(I: f32(N, D, H, W, C), K: f32(F, KD, KH, KW, C)) -> (O: f32(N, D, H, W, F)) { - O(n, d, h, w, f) = std_addf(O(n, d, h, w, f), - std_mulf(I(n, d + kd, h + kh, w + kw, c), K(f, kd, kh, kw, c))); + O(n, d, h, w, f) = std_addf(std_mulf( + I(n, d + kd, h + kh, w + kw, c), K(f, kd, kh, kw, c))); } ods_def<ConvNCDHWOp>: def conv_3d_ncdhw(I: f32(N, C, D, H, W), K: f32(F, C, KD, KH, KW)) -> (O: f32(N, F, D, H, W)) { - O(n, f, d, h, w) = std_addf(O(n, f, d, h, w), - std_mulf(I(n, c, d + kd, h + kh, w + kw), K(f, c, kd, kh, kw))); -} \ No newline at end of file + O(n, f, d, h, w) = std_addf(std_mulf( + I(n, c, d + kd, h + kh, w + kw), K(f, c, kd, kh, kw))); +} diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h index 21bff4185abf8..09fc11bc49175 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h @@ -85,6 +85,9 @@ AffineMap extractOrIdentityMap(Optional<AffineMap> maybeMap, unsigned rank, SmallVector<AffineExpr, 4> concat(ArrayRef<AffineExpr> a, ArrayRef<AffineExpr> b); +} // namespace linalg +} // namespace mlir + #include
"mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterfaces.h.inc" #define GET_OP_CLASSES @@ -93,7 +96,5 @@ SmallVector concat(ArrayRef a, #define GET_OP_CLASSES #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.h.inc" -} // namespace linalg -} // namespace mlir #endif // MLIR_DIALECT_LINALG_LINALGOPS_H_ diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td index 1366e920039bf..a7855e6327b20 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td @@ -300,7 +300,7 @@ def Linalg_TransposeOp : Linalg_Op<"transpose", [NoSideEffect]>, Example: ```mlir - %1 = linalg.transpose %0 (i, j) -> (j, i) : memref + %1 = linalg.transpose %0 (i, j) -> (j, i) : memref to memref ``` }]; @@ -308,13 +308,7 @@ def Linalg_TransposeOp : Linalg_Op<"transpose", [NoSideEffect]>, "OpBuilder &b, OperationState &result, Value view, " "AffineMapAttr permutation, ArrayRef attrs = {}">]; - let verifier = [{ - if (!permutation().isPermutation()) - return emitOpError("expected a permutation map"); - if (permutation().getNumDims() != getShapedType().getRank()) - return emitOpError("expected a permutation map of same rank as the view"); - return success(); - }]; + let verifier = [{ return ::verify(*this); }]; let extraClassDeclaration = [{ static StringRef getPermutationAttrName() { return "permutation"; } diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td index e003fd15d0b1e..41beab0590085 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -130,21 +130,22 @@ def CopyOp : LinalgStructured_Op<"copy", [ let extraClassDeclaration = libraryCallName # [{ // Rank-polymorphic. // filling_value -> O(ivs) with parallel iterators. - llvm::Optional> referenceIterators() { - unsigned nPar = input().getType().cast().getRank(); - return SmallVector(nPar, getParallelIteratorTypeName()); + ArrayAttr iterator_types() { + unsigned nPar = getInputShapedType(0).getRank(); + return Builder(getContext()).getStrArrayAttr( + SmallVector(nPar, getParallelIteratorTypeName())); } // I(input_perm(ivs)) -> O(output_perm(ivs)) - llvm::Optional> referenceIndexingMaps() { + ArrayAttr indexing_maps() { MLIRContext *context = getContext(); auto maybeInputMap = inputPermutation(); auto maybeOutputMap = outputPermutation(); unsigned inputRank = getInputShapedType(0).getRank(); unsigned outputRank = getOutputShapedType(0).getRank(); - return SmallVector{ + return Builder(getContext()).getAffineMapArrayAttr({ extractOrIdentityMap(maybeInputMap, inputRank, context), - extractOrIdentityMap(maybeOutputMap, outputRank, context)}; + extractOrIdentityMap(maybeOutputMap, outputRank, context)}); } Value getSource() { return input();} @@ -163,16 +164,17 @@ def FillOp : LinalgStructured_Op<"fill", [NInputs<0>, NOutputs<1>]> { let extraClassDeclaration = libraryCallName # [{ // Rank-polymorphic. // filling_value -> O(ivs) with parallel iterators. 
- llvm::Optional<SmallVector<StringRef, 8>> referenceIterators() { - unsigned nPar = output().getType().cast<ShapedType>().getRank(); - return SmallVector<StringRef, 8>(nPar, getParallelIteratorTypeName()); + ArrayAttr iterator_types() { + unsigned nPar = getOutputShapedType(0).getRank(); + return Builder(getContext()).getStrArrayAttr( + SmallVector<StringRef, 8>(nPar, getParallelIteratorTypeName())); } - llvm::Optional<SmallVector<AffineMap, 8>> referenceIndexingMaps() { + ArrayAttr indexing_maps() { MLIRContext *context = getContext(); // filling_value -> O(ivs) - return SmallVector<AffineMap, 8>{ - extractOrIdentityMap(llvm::None, getNumParallelLoops(), context)}; + return Builder(getContext()).getAffineMapArrayAttr({ + extractOrIdentityMap(llvm::None, getNumParallelLoops(), context)}); } }]; @@ -295,7 +297,7 @@ def ConvOp : PoolingBase_Op<"conv", [NInputs<2>, NOutputs<1>]> { getNumOutputFeatureDimensions(); } - llvm::Optional<SmallVector<StringRef, 8>> referenceIterators() { + ArrayAttr iterator_types() { // Outer parallel loops are always the number of output dimensions; i.e. // [b, xs, q] in the TF notation above. unsigned nPar = getOutputShapedType(0).getRank(); @@ -310,7 +312,7 @@ def ConvOp : PoolingBase_Op<"conv", [NInputs<2>, NOutputs<1>]> { iters.reserve(nPar + nRed + nWin); iters.append(nRed, getReductionIteratorTypeName()); iters.append(nWin, getWindowIteratorTypeName()); - return iters; + return Builder(getContext()).getStrArrayAttr(iters); } // F(z0, ..., zN-1, q, k) * @@ -318,7 +320,7 @@ def ConvOp : PoolingBase_Op<"conv", [NInputs<2>, NOutputs<1>]> { // -> O(b, x0, ..., xN-1, k) // for N equal to `nWindow`. If there is no padding attribute, it will be // ignored. - llvm::Optional<SmallVector<AffineMap, 8>> referenceIndexingMaps() { + ArrayAttr indexing_maps() { MLIRContext *context = getContext(); auto nWin = getNumWindowLoops(); assert(nWin > 0 && "expected at least one window dimension"); @@ -343,7 +345,7 @@ def ConvOp : PoolingBase_Op<"conv", [NInputs<2>, NOutputs<1>]> { auto zs = makeAffineDimExprs(nWin, idx, context); // Construct the weightedSum expression. auto ws = weightedPoolingInputIndex(*this, xs, zs); - return SmallVector<AffineMap, 8>{ + return Builder(getContext()).getAffineMapArrayAttr({ // filter[z[0], ..., z[N-1], q, k] AffineMap::get(idx, 0, concat(concat(zs, qs), ks), context), // input[b, @@ -353,7 +355,7 @@ def ConvOp : PoolingBase_Op<"conv", [NInputs<2>, NOutputs<1>]> { // q] AffineMap::get(idx, 0, concat(concat(bs, ws), qs), context), // output[b, x[0], ..., x[N-1], k] - AffineMap::get(idx, 0, concat(concat(bs, xs), ks), context)}; + AffineMap::get(idx, 0, concat(concat(bs, xs), ks), context)}); } }]; @@ -384,7 +386,7 @@ class SingleInputPoolingBase_Op OptionalAttr<I64ElementsAttr>:$padding); let extraClassDeclaration = commonUtils# [{ - llvm::Optional<SmallVector<StringRef, 8>> referenceIterators() { + ArrayAttr iterator_types() { // Outer parallel loops are always the number of output dimensions. unsigned nPar = getOutputShapedType(0).getRank(); // There are as many window loops as output dimensions. @@ -392,10 +394,10 @@ class SingleInputPoolingBase_Op SmallVector<StringRef, 8> iters(nPar, getParallelIteratorTypeName()); iters.reserve(nPar + nWin); iters.append(nWin, getWindowIteratorTypeName()); - return iters; + return Builder(getContext()).getStrArrayAttr(iters); } - llvm::Optional<SmallVector<AffineMap, 8>> referenceIndexingMaps() { + ArrayAttr indexing_maps() { MLIRContext *context = getContext(); auto nPar = getNumParallelLoops(); auto nWin = getNumWindowLoops(); @@ -406,14 +408,13 @@ class SingleInputPoolingBase_Op // Construct the weightedSum expression.
auto inputDims = weightedPoolingInputIndex(*this, outputDims, windowDims); - return SmallVector{ + return Builder(getContext()).getAffineMapArrayAttr({ // input AffineMap::get(idx, 0, inputDims, context), // windowDims AffineMap::get(idx, 0, windowDims, context), // output - AffineMap::get(idx, 0, outputDims, context) - }; + AffineMap::get(idx, 0, outputDims, context)}); } }]; @@ -485,16 +486,6 @@ class GenericOpBase : LinalgStructuredBase_Op> referenceIterators() { - llvm_unreachable( - "No such thing as reference iterator types for a generic op."); - } - - llvm::Optional> referenceIndexingMaps() { - llvm_unreachable( - "No such thing as reference indexing maps for a generic op."); - } - llvm::Optional getSymbolSource() { auto ss = symbol_source(); return ss.hasValue() ? @@ -807,8 +798,6 @@ def IndexedGenericOp : GenericOpBase<"indexed_generic"> { // Named Linalg ops, implemented as a declarative configurations of generic ops. //===----------------------------------------------------------------------===// -def NamedStructuredOpTraits : NativeOpTrait<"linalg::NamedStructuredOpTraits">; - class LinalgNamedStructured_Op props> : LinalgStructuredBase_Op { string spec = ?; diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterface.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterface.td index 82882b083b2d8..0e8216cc4268f 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterface.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterface.td @@ -18,173 +18,492 @@ include "mlir/Dialect/Linalg/IR/LinalgBase.td" // The linalg 'LinalgStructuredInterface' provides access to the 'LinalgOp' // interface. def LinalgStructuredInterface : OpInterface<"LinalgOp"> { + let cppNamespace = "::mlir::linalg"; let methods = [ //===------------------------------------------------------------------===// // Loop types handling. //===------------------------------------------------------------------===// InterfaceMethod< - "Return the number of parallel loops within the current operation.", - "unsigned", "getNumParallelLoops" + /*desc=*/[{ + Return the number of parallel loops within the current operation. + }], + /*retTy=*/"unsigned", + /*methodName=*/"getNumParallelLoops", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return getNumIterators(getParallelIteratorTypeName(), + $_op.iterator_types()); + }] >, InterfaceMethod< - "Return the number of reduction loops within the current operation.", - "unsigned", "getNumReductionLoops" + /*desc=*/[{ + Return the number of reduction loops within the current operation. + }], + /*retTy=*/"unsigned", + /*methodName=*/"getNumReductionLoops", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return getNumIterators(getReductionIteratorTypeName(), + $_op.iterator_types()); + }] >, InterfaceMethod< - "Return the number of window loops within the current operation.", - "unsigned", "getNumWindowLoops" + /*desc=*/[{ + Return the number of window loops within the current operation. + }], + /*retTy=*/"unsigned", + /*methodName=*/"getNumWindowLoops", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return getNumIterators(getWindowIteratorTypeName(), + $_op.iterator_types()); + }] >, InterfaceMethod< - "Return the number of loops within the current operation.", - "unsigned", "getNumLoops">, - + /*desc=*/[{ + Return the total number of loops within the current operation. 
+ }], + /*retTy=*/"unsigned", + /*methodName=*/"getNumLoops", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return getNumIterators($_op.iterator_types()); + }] + >, InterfaceMethod< - [{Returns true if the current operation has only one loop and it's a - reduction loop}], - "bool", "hasSingleReductionLoop">, - + /*desc=*/[{ + Returns true if the current operation has only one loop and it's a + reduction loop. + }], + /*retTy=*/"bool", + /*methodName=*/"hasSingleReductionLoop", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + auto iters = $_op.iterator_types(); + return iters.size() == 1 && + getNumIterators(getReductionIteratorTypeName(), iters) == 1; + }]>, //===------------------------------------------------------------------===// - // Input arguments handling. + // Num input/output arguments handling. //===------------------------------------------------------------------===// + // These special methods must be defined by each op that wants to implement + // the LinalgStructuredInterface. For now, this is either: + // - inherited statically by using the NInputs or + // NOutputs traits. + // - derived from args_in/args_out attributes (for linalg.generic and + // linalg.indexed_generic ops). + InterfaceMethod< + /*desc=*/[{ + Return the number of inputs from the current operation. + }], + /*retTy=*/"unsigned", + /*methodName=*/"getNumInputs" + >, InterfaceMethod< - "Return the number of inputs from the current operation.", - "unsigned", "getNumInputs" + /*desc=*/[{ + Return the number of outputs from the current operation. + }], + /*retTy=*/"unsigned", + /*methodName=*/"getNumOutputs" >, - InterfaceMethod<"Return the input view at the given index.", - "Value", "getInput", (ins "unsigned":$i) + //===------------------------------------------------------------------===// + // Input arguments handling. + //===------------------------------------------------------------------===// + InterfaceMethod< + /*desc=*/[{ + Return the `i`-th input value. + The `i^th` input argument is always the `i^th` operand regardless of + whether we have tensors or buffers. + }], + /*retTy=*/"Value", + /*methodName=*/"getInput", + /*args=*/(ins "unsigned":$i), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + assert(i < $_op.getNumInputs()); + return this->getOperation()->getOperand(i); + }] >, - InterfaceMethod<[{ + InterfaceMethod< + /*desc=*/[{ Return the index of the given input value `v`, or `None` if the value is not an input. }], - "llvm::Optional", "getIndexOfInput", (ins "Value":$v) + /*retTy=*/"llvm::Optional", + /*methodName=*/"getIndexOfInput", + /*args=*/(ins "Value":$value), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + auto it = llvm::find(getInputs(), value); + if (it != getInputs().end()) + return it - getInputs().begin(); + return llvm::None; + }] >, InterfaceMethod< - "Return the input operands from the current operation.", - "Operation::operand_range", "getInputs" - >, - InterfaceMethod<[{ + /*desc=*/[{ Return the `i`-th input shaped type, irrespective of buffer or tensor type. - }], "ShapedType", "getInputShapedType", (ins "unsigned":$i)>, - InterfaceMethod<[{ + }], + /*retTy=*/"ShapedType", + /*methodName=*/"getInputShapedType", + /*args=*/(ins "unsigned":$i), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return getInput(i).getType().template cast(); + }] + >, + InterfaceMethod< + /*desc=*/[{ + Return the input operands from the current operation. 
+ }], + /*retTy=*/"Operation::operand_range", + /*methodName=*/"getInputs", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + auto range = this->getOperation()->getOperands(); + return {range.begin(), range.begin() + $_op.getNumInputs()}; + }] + >, + InterfaceMethod< + /*desc=*/[{ Return the subset of input operands that are of ranked tensor type. - }], "SmallVector", "getInputTensorTypes">, + }], + /*retTy=*/"SmallVector", + /*methodName=*/"getInputTensorTypes" , + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + SmallVector res; + for (Type type : getInputs().getTypes()) + if (auto t = type.template dyn_cast()) + res.push_back(t); + return res; + }] + >, //===------------------------------------------------------------------===// // Output arguments handling. //===------------------------------------------------------------------===// InterfaceMethod< - "Return the number of outputs from the current operation.", - "unsigned", "getNumOutputs" - >, - InterfaceMethod<"Return the output buffer at the given index.", - "Value", "getOutputBuffer", (ins "unsigned":$i) + /*desc=*/[{ + Return the output buffer at the given index, asserts that this is a + buffer operand and not a tensor result. + The `i^th` output argument is an operand (resp. a return value) iff it + is a value of buffer type (resp. a return value of tensor type). + }], + /*retTy=*/"Value", + /*methodName=*/"getOutputBuffer", + /*args=*/(ins "unsigned":$i), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + // Output buffers are passed as output buffer operands (side-effecting). + // Output tensors are results. + // The union of the 2 are all the outputs and we want to ensure i does + // not overflow the buffer operands. + assert(i + this->getOperation()->getNumResults() < $_op.getNumOutputs() + && "overflowing output buffer index"); + return this->getOperation()->getOperand($_op.getNumInputs() + i); + }] >, - InterfaceMethod<[{ + InterfaceMethod< + /*desc=*/[{ Return the index of the given buffer value, or `None` if the value is not part of the output buffers. }], - "llvm::Optional", "getIndexOfOutputBuffer", (ins "Value":$view) + /*retTy=*/"llvm::Optional", + /*methodName=*/"getIndexOfOutputBuffer", + /*args=*/(ins "Value":$value), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + auto it = llvm::find(getOutputBuffers(), value); + if (it != getOutputBuffers().end()) + return it - getOutputBuffers().begin(); + return llvm::None; + }] >, - InterfaceMethod<[{ + InterfaceMethod< + /*desc=*/[{ Return the type of the output buffer at the given index. - }], "MemRefType", "getOutputBufferType", (ins "unsigned":$i)>, - InterfaceMethod<[{ + }], + /*retTy=*/"MemRefType", + /*methodName=*/"getOutputBufferType", + /*args=*/(ins "unsigned":$i), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return getOutputBuffer(i).getType().template cast(); + }]>, + InterfaceMethod< + /*desc=*/[{ Return the `i`-th output shaped type, irrespective of buffer or tensor type. - }], "ShapedType", "getOutputShapedType", (ins "unsigned":$i)>, - InterfaceMethod<[{ + }], + /*retTy=*/"ShapedType", + /*methodName=*/"getOutputShapedType", + /*args=*/(ins "unsigned":$i), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return getShapedType(i + $_op.getNumInputs()); + }]>, + InterfaceMethod< + /*desc=*/[{ Return the results that are of ranked tensor type. 
- }], "SmallVector", "getOutputTensorTypes">, + }], + /*retTy=*/"SmallVector", + /*methodName=*/"getOutputTensorTypes", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + SmallVector res; + for (Type type : this->getOperation()->getResults().getTypes()) + res.push_back(type.template cast()); + return res; + }]>, InterfaceMethod< - "Return the output buffers (operands) from the current operation.", - "Operation::operand_range", "getOutputBuffers" + /*desc=*/[{ + Return the output buffers (operands) from the current operation. + }], + /*retTy=*/"Operation::operand_range", + /*methodName=*/"getOutputBuffers", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + auto range = this->getOperation()->getOperands(); + return {range.begin() + $_op.getNumInputs(), + range.begin() + getNumInputsAndOutputBuffers()}; + }] >, //===------------------------------------------------------------------===// // Input and Output arguments handling. //===------------------------------------------------------------------===// InterfaceMethod< - "Return one single buffer at position `$i`.", - "Value", "getBuffer", (ins "unsigned":$i) + /*desc=*/[{ + Return one single buffer at position `$i`. + }], + /*retTy=*/"Value", + /*methodName=*/"getBuffer", + /*args=*/(ins "unsigned":$i), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + assert(i < getNumInputsAndOutputBuffers() && "overflowing buffers index"); + return this->getOperation()->getOperand(i); + }] >, InterfaceMethod< - "Return the number of inputs and outputs, irrespective of their buffer " - "or tensor type.", - "unsigned", "getNumInputsAndOutputs" + /*desc=*/[{ + Return the number of inputs and outputs, irrespective of their buffer or + tensor type. + }], + /*retTy=*/"unsigned", + /*methodName=*/"getNumInputsAndOutputs", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return $_op.getNumInputs() + $_op.getNumOutputs(); + }] >, InterfaceMethod< - "Return the number of inputs, irrespective of their buffer or tensor " - "type, and output buffers", - "unsigned", "getNumInputsAndOutputBuffers" + /*desc=*/[{ + Return the number of inputs, irrespective of their buffer or tensor type + and output buffers + }], + /*retTy=*/"unsigned", + /*methodName=*/"getNumInputsAndOutputBuffers", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return $_op.getNumInputs() + $_op.getNumOutputs() - + this->getOperation()->getNumResults(); + }] >, InterfaceMethod< - "Return the range over inputs (irrespective of type) and output buffers.", - "Operation::operand_range", "getInputsAndOutputBuffers" + /*desc=*/[{ + Return the range over inputs (irrespective of type) and output buffers. + }], + /*retTy=*/"Operation::operand_range", + /*methodName=*/"getInputsAndOutputBuffers", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + auto range = this->getOperation()->getOperands(); + return {range.begin(), range.begin() + getNumInputsAndOutputBuffers()}; + }] >, InterfaceMethod< - "Return the shaped types for all the inputs and outputs", - "SmallVector", "getInputOutputShapedTypes" + /*desc=*/[{ + Return the `i`-th shaped type, there are 3 cases: + 1. if `i < $_op.getNumInputs()` then return `getInputShapedType(i)`; + otherwise + 2. if `i < getNumInputsAndOutputBuffers()` then return the + `getOutputBufferType(i - $_op.getNumInputs())`; otherwise + 3. return the `i - getNumInputsAndOutputBuffers()` result type. 
+ }], + /*retTy=*/"ShapedType", + /*methodName=*/"getShapedType", + /*args=*/(ins "unsigned":$i), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + if (i < $_op.getNumInputs()) + return getInputShapedType(i); + if (i < getNumInputsAndOutputBuffers()) + return getOutputBufferType(i - $_op.getNumInputs()); + return getOutputTensorTypes()[i - getNumInputsAndOutputBuffers()]; + }]>, + InterfaceMethod< + /*desc=*/[{ + Return the shaped types for all the inputs and outputs + }], + /*retTy=*/"SmallVector", + /*methodName=*/"getInputOutputShapedTypes", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + SmallVector inputOutputTypes( + this->getOperation()->operand_type_begin(), + this->getOperation()->operand_type_end()); + inputOutputTypes.append(this->getOperation()->result_type_begin(), + this->getOperation()->result_type_end()); + return llvm::to_vector<4>( + llvm::map_range(inputOutputTypes, [](Type type) -> ShapedType { + return type.cast(); + })); + }] >, //===------------------------------------------------------------------===// // Other interface methods. //===------------------------------------------------------------------===// InterfaceMethod< - "Return the reference iterators for this named op (if any are " - "specified). These reference iterators are used to specify the default " - "behavior of the op. Typically this would be a static method but in " - "order to allow rank-polymorphic ops, this needs to be per object " - "instance. Named ops must define referenceIterators, even if empty for " - "the 0-D case. Generic ops on the other hand have a None " - "`referenceIterators`", - "llvm::Optional>", "referenceIterators" + /*desc=*/[{ + Return the iterator types attribute within the current operation. + }], + /*retTy=*/"ArrayAttr", + /*methodName=*/"iterator_types", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return $_op.iterator_types(); + }] >, InterfaceMethod< - "Return the reference indexing maps for this named op (if any are " - "specified). Typically this would be a static method but in order to " - "allow rank-polymorphic ops, this needs to be per object instance. Named " - "ops must define referenceIterators, even if empty for the 0-D case. " - "Generic ops on the other hand have a None `referenceIndexingMaps`", - "llvm::Optional>", "referenceIndexingMaps" + /*desc=*/[{ + Return the indexing maps attribute within the current operation. + }], + /*retTy=*/"ArrayAttr", + /*methodName=*/"indexing_maps" >, InterfaceMethod< - "Return the iterator types attribute within the current operation.", - "ArrayAttr", "iterator_types" + /*desc=*/[{ + Return the indexing maps within the current operation. + }], + /*retTy=*/"SmallVector", + /*methodName=*/"getIndexingMaps", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return llvm::to_vector<4>( + llvm::map_range($_op.indexing_maps(), + [](Attribute attr) -> AffineMap { + return attr.cast().getValue(); + })); + }] >, InterfaceMethod< - "Return the indexing maps attribute within the current operation.", - "ArrayAttr", "indexing_maps" + /*desc=*/[{ + Return the input or output indexing map at index `i`. 
+    }],
+    /*retTy=*/"AffineMap",
+    /*methodName=*/"getIndexingMap",
+    /*args=*/(ins "unsigned":$i),
+    /*methodBody=*/"",
+    /*defaultImplementation=*/[{
+      assert(i < getNumInputsAndOutputs());
+      return $_op.indexing_maps()
+          .getValue()[i]
+          .template cast<AffineMapAttr>()
+          .getValue();
+    }]
   >,
   InterfaceMethod<
-    "Return the indexing maps within the current operation.",
-    "SmallVector<AffineMap, 4>", "getIndexingMaps"
-  >,
-  InterfaceMethod<"Return the input or output indexing map at index `i`.",
-    "AffineMap", "getIndexingMap", (ins "unsigned":$i)
-  >,
-  InterfaceMethod<"Return the input indexing map at index `i`.",
-    "AffineMap", "getInputIndexingMap", (ins "unsigned":$i)
+    /*desc=*/[{
+      Return the input indexing map at index `i`.
+    }],
+    /*retTy=*/"AffineMap",
+    /*methodName=*/"getInputIndexingMap",
+    /*args=*/(ins "unsigned":$i),
+    /*methodBody=*/"",
+    /*defaultImplementation=*/[{
+      assert(i < $_op.getNumInputs());
+      return $_op.indexing_maps()
+          .getValue()[i]
+          .template cast<AffineMapAttr>()
+          .getValue();
+    }]
   >,
-  InterfaceMethod<"Return the output indexing map at index `i`.",
-    "AffineMap", "getOutputIndexingMap", (ins "unsigned":$i)
+  InterfaceMethod<
+    /*desc=*/[{
+      Return the output indexing map at index `i`.
+    }],
+    /*retTy=*/"AffineMap",
+    /*methodName=*/"getOutputIndexingMap",
+    /*args=*/(ins "unsigned":$i),
+    /*methodBody=*/"",
+    /*defaultImplementation=*/[{
+      assert(i < $_op.getNumOutputs());
+      return $_op.indexing_maps()
+          .getValue()[i + $_op.getNumInputs()]
+          .template cast<AffineMapAttr>()
+          .getValue();
+    }]
   >,
-  InterfaceMethod<[{
+  InterfaceMethod<
+    /*desc=*/[{
       Return whether the op has only MemRef input and outputs.
-    }], "bool", "hasBufferSemantics">,
-  InterfaceMethod<[{
+    }],
+    /*retTy=*/"bool",
+    /*methodName=*/"hasBufferSemantics",
+    /*args=*/(ins),
+    /*methodBody=*/"",
+    /*defaultImplementation=*/[{
+      return this->getOperation()->getNumResults() == 0 &&
+             llvm::all_of(getInputs(),
+                          [](Value v) { return v.getType().isa<MemRefType>(); });
+    }]
+  >,
+  InterfaceMethod<
+    /*desc=*/[{
       Return whether the op has only RankedTensor input and outputs.
-    }], "bool", "hasTensorSemantics">,
+    }],
+    /*retTy=*/"bool",
+    /*methodName=*/"hasTensorSemantics",
+    /*args=*/(ins),
+    /*methodBody=*/"",
+    /*defaultImplementation=*/[{
+      auto isTensorType = [](Value v) {
+        return v.getType().isa<RankedTensorType>();
+      };
+      return llvm::all_of(getInputs(), isTensorType) &&
+             llvm::all_of(this->getOperation()->getResults(), isTensorType);
+    }]
+  >,
   //===------------------------------------------------------------------===//
   // Other static interface methods.
   //===------------------------------------------------------------------===//
-  StaticInterfaceMethod<[{
+  StaticInterfaceMethod<
+    /*desc=*/[{
       Create an operation of the current type with the given location,
       operands, and attributes.
     }],
-    "Operation *", "create",
+    /*retTy=*/"Operation *",
+    /*methodName=*/"create",
     (ins "OpBuilder &":$builder, "Location":$loc, "ValueRange":$operands,
          "ArrayRef<NamedAttribute>":$attributes), [{
@@ -192,11 +511,13 @@ def LinalgStructuredInterface : OpInterface<"LinalgOp"> {
         attributes);
     }]
   >,
-  InterfaceMethod<[{
+  InterfaceMethod<
+    /*desc=*/[{
       Clone the current operation with the given location and operands. This
       is used to abstract away the optional underlying region creation.
     }],
-    "Operation *", "clone",
+    /*retTy=*/"Operation *",
+    /*methodName=*/"clone",
     (ins "OpBuilder &":$b, "Location":$loc, "ValueRange":$operands), [{
       BlockAndValueMapping map;
       unsigned numRegions = $_op.getOperation()->getNumRegions();
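[Editor's note] The hunks above move what used to be C++ trait methods onto the `LinalgOp` interface itself. A minimal sketch of client code against the new interface, assuming only the method names declared above (the surrounding helper is illustrative, not part of this patch):

```c++
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"

using namespace mlir;

// Queries an op through the refactored interface; every method used here is
// one of the InterfaceMethods declared in the diff above.
static void inspect(linalg::LinalgOp linalgOp) {
  // Materialized from the `indexing_maps` attribute by getIndexingMaps().
  SmallVector<AffineMap, 4> maps = linalgOp.getIndexingMaps();
  (void)maps;

  if (linalgOp.hasBufferSemantics()) {
    // With buffer semantics there are no tensor results, so every input and
    // output is addressable as a buffer.
    for (unsigned i = 0, e = linalgOp.getNumInputsAndOutputBuffers(); i != e;
         ++i)
      (void)linalgOp.getBuffer(i);
  }
}
```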
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgTraits.h b/mlir/include/mlir/Dialect/Linalg/IR/LinalgTraits.h
index 8dda7d0a1445f..c4790ca617f11 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgTraits.h
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgTraits.h
@@ -49,8 +49,8 @@ template <unsigned N> class NOutputs {
   };
 };
 
-/// This class provides the API for structured ops that are known to operate on
-/// buffers or tensors. This trait must be used in conjunction with an op
+/// This class provides a verifier for structured ops that are known to operate
+/// on buffers or tensors. This trait must be used in conjunction with an op
 /// definition or a trait that provides the methods `getNumInputs` and
 /// `getNumOutputs`. Use as a trait as follows:
 ///
@@ -59,324 +59,18 @@ template <unsigned N> class NOutputs {
 template <typename ConcreteType>
 class StructuredOpTraits
     : public OpTrait::TraitBase<ConcreteType, StructuredOpTraits> {
-private:
-  /// Return the number of inputs, irrespective of their buffer or tensor type.
-  /// For internal use only.
-  unsigned nInputs() {
-    return cast<ConcreteType>(this->getOperation()).getNumInputs();
-  }
-  /// Return the number of outputs, irrespective of their buffer or tensor type.
-  /// For internal use only.
-  unsigned nOutputs() {
-    return cast<ConcreteType>(this->getOperation()).getNumOutputs();
-  }
-
 public:
-  //==========================================================================//
-  // Loop types handling.
-  //==========================================================================//
-  unsigned getNumParallelLoops() {
-    return getNumIterators(
-        getParallelIteratorTypeName(),
-        cast<ConcreteType>(this->getOperation()).iterator_types());
-  }
-  unsigned getNumReductionLoops() {
-    return getNumIterators(
-        getReductionIteratorTypeName(),
-        cast<ConcreteType>(this->getOperation()).iterator_types());
-  }
-  unsigned getNumWindowLoops() {
-    return getNumIterators(
-        getWindowIteratorTypeName(),
-        cast<ConcreteType>(this->getOperation()).iterator_types());
-  }
-  unsigned getNumLoops() {
-    return getNumIterators(
-        cast<ConcreteType>(this->getOperation()).iterator_types());
-  }
-
-  bool hasSingleReductionLoop() {
-    auto iterators = cast<ConcreteType>(this->getOperation()).iterator_types();
-    return iterators.size() == 1 &&
-           getNumIterators(getReductionIteratorTypeName(), iterators);
-  }
-
-  //==========================================================================//
-  // Input arguments handling.
-  //==========================================================================//
-  // The `i^th` input argument is always the `i^th` operand regardless of
-  // whether we have tensors or buffers.
-  //
-  /// Return the `i`-th input value.
-  Value getInput(unsigned i) {
-    assert(i < nInputs());
-    return this->getOperation()->getOperand(i);
-  }
-  /// Return the index of `value` in the list of inputs if found, llvm::None
-  /// otherwise.
-  Optional<unsigned> getIndexOfInput(Value value) {
-    auto it = llvm::find(getInputs(), value);
-    if (it != getInputs().end())
-      return it - getInputs().begin();
-    return llvm::None;
-  }
-  /// Return the `i`-th input shaped type, irrespective of buffer or tensor
-  /// type.
-  ShapedType getInputShapedType(unsigned i) {
-    return getInput(i).getType().template cast<ShapedType>();
-  }
-  /// Return the range over inputs.
-  Operation::operand_range getInputs() {
-    auto range = this->getOperation()->getOperands();
-    return {range.begin(), range.begin() + nInputs()};
-  }
-  /// Query the subset of input operands that are of ranked tensor type.
-  SmallVector<RankedTensorType, 4> getInputTensorTypes() {
-    SmallVector<RankedTensorType, 4> res;
-    for (Type type : getInputs().getTypes())
-      if (auto t = type.template dyn_cast<RankedTensorType>())
-        res.push_back(t);
-    return res;
-  }
-
-  //==========================================================================//
-  // Output arguments handling.
-  //==========================================================================//
-  // The `i^th` output argument is an operand (resp. a return value) iff it is
-  // a value of buffer type (resp. a return value of tensor type).
-
-  /// Return the `i`-th output, asserts that this is a buffer operand and not
-  /// a tensor result.
-  Value getOutputBuffer(unsigned i) {
-    assert(i + this->getOperation()->getNumResults() < nOutputs() &&
-           "overflowing output buffer index");
-    return this->getOperation()->getOperand(nInputs() + i);
-  }
-  /// Return the index of `value` in the list of output buffers if found,
-  /// llvm::None otherwise.
-  Optional<unsigned> getIndexOfOutputBuffer(Value value) {
-    auto it = llvm::find(getOutputBuffers(), value);
-    if (it != getOutputBuffers().end())
-      return it - getOutputBuffers().begin();
-    return llvm::None;
-  }
-  /// Return the `i`-th output buffer type.
-  MemRefType getOutputBufferType(unsigned i) {
-    return getOutputBuffer(i).getType().template cast<MemRefType>();
-  }
-  /// Return the `i`-th output shaped type, irrespective of buffer of tensor
-  /// type.
-  ShapedType getOutputShapedType(unsigned i) {
-    return getShapedType(i + nInputs());
-  }
-  /// Query the subset of results that are of ranked tensor type.
-  SmallVector<RankedTensorType, 4> getOutputTensorTypes() {
-    SmallVector<RankedTensorType, 4> res;
-    for (Type type : this->getOperation()->getResults().getTypes())
-      res.push_back(type.template cast<RankedTensorType>());
-    return res;
-  }
-  /// Return the range over outputs.
-  Operation::operand_range getOutputBuffers() {
-    auto range = this->getOperation()->getOperands();
-    return {range.begin() + nInputs(),
-            range.begin() + getNumInputsAndOutputBuffers()};
-  }
-
-  //==========================================================================//
-  // Input and Output arguments handling.
-  //==========================================================================//
-  Value getBuffer(unsigned i) {
-    assert(i < getNumInputsAndOutputBuffers() && "overflowing buffers index");
-    return this->getOperation()->getOperand(i);
-  }
-  /// Return the number of inputs and outputs, irrespective of their buffer or
-  /// tensor type.
-  unsigned getNumInputsAndOutputs() { return nInputs() + nOutputs(); }
-  /// Return the number of inputs, irrespective of their buffer or tensor type,
-  /// and output buffers.
-  unsigned getNumInputsAndOutputBuffers() {
-    assert(this->getOperation()->getNumResults() <= nOutputs());
-    return nInputs() + nOutputs() - this->getOperation()->getNumResults();
-  }
-  /// Return the range over inputs (irrespective of type) and output buffers.
-  Operation::operand_range getInputsAndOutputBuffers() {
-    auto range = this->getOperation()->getOperands();
-    return {range.begin(), range.begin() + getNumInputsAndOutputBuffers()};
-  }
-  /// Return the `i`-th shaped type, there are 3 cases:
-  ///   1. if `i < nInputs()` then return `getInputShapedType(i)`; otherwise
-  ///   2. if `i < getNumInputsAndOutputBuffers()` then return the
-  ///      `getOutputBufferType(i - nInputs())`; otherwise
-  ///   3. return the `i - getNumInputsAndOutputBuffers()` result type.
-  ShapedType getShapedType(unsigned i) {
-    if (i < nInputs())
-      return getInputShapedType(i);
-    if (i < getNumInputsAndOutputBuffers())
-      return getOutputBufferType(i - nInputs()).template cast<ShapedType>();
-    return getOutputTensorTypes()[i - getNumInputsAndOutputBuffers()]
-        .template cast<ShapedType>();
-  }
-  /// Return the shaped types for all the inputs and outputs
-  SmallVector<ShapedType, 4> getInputOutputShapedTypes() {
-    SmallVector<Type, 4> inputOutputTypes(
-        this->getOperation()->operand_type_begin(),
-        this->getOperation()->operand_type_end());
-    inputOutputTypes.append(this->getOperation()->result_type_begin(),
-                            this->getOperation()->result_type_end());
-    return llvm::to_vector<4>(
-        llvm::map_range(inputOutputTypes, [](Type type) -> ShapedType {
-          return type.cast<ShapedType>();
-        }));
-  }
-
-  //==========================================================================//
-  // Other interface methods.
-  //==========================================================================//
-
-  // Get or build the iterator_types ArrayAttr.
-  ArrayAttr iterator_types() {
-    // Return the attribute if it is present.
-    if (auto attr = this->getOperation()->getAttr("iterator_types"))
-      return attr.template cast<ArrayAttr>();
-
-    // If not, form the attribute using the reference iterator types for the
-    // ConcreteType.
-    auto maybeReferenceIteratorTypes =
-        cast<ConcreteType>(this->getOperation()).referenceIterators();
-
-    // If there is no reference, this must be a generic op.
-    // TODO: Traits are used to define ops. Split into cpp to avoid cyclic
-    // dependency.
-    auto name = this->getOperation()->getName().getStringRef();
-    if (!maybeReferenceIteratorTypes && name != "generic" &&
-        name != "indexed_generic") {
-      this->getOperation()->dump();
-      llvm_unreachable("Op missing referenceIterators");
-    }
-
-    // If we have a reference, build the reference attribute and set it in the
-    // op before returning.
-    auto *ctx = this->getOperation()->getContext();
-    auto attrRange = llvm::map_range(*maybeReferenceIteratorTypes,
-                                     [ctx](StringRef str) -> Attribute {
-                                       return StringAttr::get(str, ctx);
-                                     });
-    auto attr = ArrayAttr::get(llvm::to_vector<4>(attrRange), ctx);
-    // TODO: Need to memoize this. Can't just store as an attribute atm as it
-    // will impact parser, printer and tests.
-    // this->getOperation()->setAttr("iterator_types", attr);
-    return attr;
-  }
-
-  // Get or build the indexing_maps ArrayAttr.
-  ArrayAttr indexing_maps() {
-    // Return the attribute if it is present.
-    if (auto attr = this->getOperation()->getAttr("indexing_maps"))
-      return attr.template cast<ArrayAttr>();
-
-    // If not, form the attribute using the reference indexing map for the
-    // ConcreteType.
-    auto maybeReferenceIndexingMaps =
-        cast<ConcreteType>(this->getOperation()).referenceIndexingMaps();
-
-    // If there is no reference, this must be a generic op.
-    auto name = this->getOperation()->getName().getStringRef();
-    if (!maybeReferenceIndexingMaps && name != "generic" &&
-        name != "indexed_generic") {
-      this->getOperation()->dump();
-      llvm_unreachable("Op missing referenceIndexingMaps");
-    }
-
-    // If we have a reference, build the reference attribute and set it in the
-    // op before returning.
-    auto *ctx = this->getOperation()->getContext();
-    auto attrRange =
-        llvm::map_range(*maybeReferenceIndexingMaps, [ctx](AffineMap map) {
-          // 0-D corner case because there is no such thing as a concrete empty
-          // map type.
-          if (!map)
-            map = AffineMap::get(0, 0, getAffineConstantExpr(0, ctx));
-          return AffineMapAttr::get(map);
-        });
-    SmallVector<Attribute, 4> attrs{attrRange.begin(), attrRange.end()};
-    auto attr = ArrayAttr::get(attrs, ctx);
-    // TODO: Need to memoize this. Can't just store as an attribute atm as it
-    // will impact parser, printer and tests.
-    // this->getOperation()->setAttr("indexing_maps", attr);
-    return attr;
-  }
-
-  SmallVector<AffineMap, 4> getIndexingMaps() {
-    return llvm::to_vector<4>(
-        llvm::map_range(indexing_maps(), [](Attribute attr) -> AffineMap {
-          return attr.cast<AffineMapAttr>().getValue();
-        }));
-  }
-
-  AffineMap getIndexingMap(unsigned i) {
-    assert(i < getNumInputsAndOutputs());
-    return indexing_maps()
-        .getValue()[i]
-        .template cast<AffineMapAttr>()
-        .getValue();
-  }
-
-  AffineMap getInputIndexingMap(unsigned i) {
-    assert(i < nInputs());
-    return indexing_maps()
-        .getValue()[i]
-        .template cast<AffineMapAttr>()
-        .getValue();
-  }
-
-  AffineMap getOutputIndexingMap(unsigned i) {
-    assert(i < nOutputs());
-    return indexing_maps()
-        .getValue()[i + nInputs()]
-        .template cast<AffineMapAttr>()
-        .getValue();
-  }
-
-  /// Query whether the op has only buffer inputs and no returns.
-  bool hasBufferSemantics() {
-    return this->getOperation()->getNumResults() == 0 &&
-           llvm::all_of(getInputs(),
-                        [](Value v) { return v.getType().isa<MemRefType>(); });
-  }
-
-  /// Query whether the op has only tensor inputs and outputs.
-  bool hasTensorSemantics() {
-    auto isTensorType = [](Value v) {
-      return v.getType().isa<RankedTensorType>();
-    };
-    return llvm::all_of(getInputs(), isTensorType) &&
-           llvm::all_of(this->getOperation()->getResults(), isTensorType);
-  }
-
-  //==========================================================================//
-  // Other static interface methods.
-  //==========================================================================//
   static LogicalResult verifyTrait(Operation *op) {
+    ConcreteType concreteOp = cast<ConcreteType>(op);
     auto nOperands = cast<ConcreteType>(op).getNumInputsAndOutputBuffers();
     if (failed(OpTrait::impl::verifyAtLeastNOperands(op, nOperands)))
       return failure();
+    if (op->getNumResults() > concreteOp.getNumOutputs())
+      return op->emitError("unexpected #results > #outputs");
     return success();
   }
 };
 
-/// This class provides the API for named Linalg StructuredOps.
-template <typename ConcreteType>
-class NamedStructuredOpTraits
-    : public OpTrait::TraitBase<ConcreteType, NamedStructuredOpTraits> {
-public:
-  static SmallVector<StringRef, 8> referenceIterators(TypeRange inputTypes,
-                                                      TypeRange outputTypes);
-
-  static SmallVector<AffineMap, 8> referenceIndexingMaps(TypeRange inputTypes,
-                                                         TypeRange outputTypes);
-};
-
 } // namespace linalg
 } // namespace OpTrait
 } // namespace mlir
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgTypes.h b/mlir/include/mlir/Dialect/Linalg/IR/LinalgTypes.h
index 18b2c3aaa53d1..a4e32b9263e8c 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgTypes.h
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgTypes.h
@@ -12,11 +12,12 @@
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/Types.h"
 
+#include "mlir/Dialect/Linalg/IR/LinalgOpsDialect.h.inc"
+
 namespace mlir {
 class MLIRContext;
 namespace linalg {
-#include "mlir/Dialect/Linalg/IR/LinalgOpsDialect.h.inc"
 
 /// A RangeType represents a minimal range abstraction (min, max, step).
 /// It is constructed by calling the linalg.range op with three values index of
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index f438b6587c8bc..a34ea00fdf5df 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -30,6 +30,10 @@ struct TiledLinalgOp {
   SmallVector<Operation *, 8> loops;
 };
 
+/// Populates patterns for vectorization of all ConvN-D ops.
+void populateConvVectorizationPatterns(
+    MLIRContext *context, SmallVectorImpl<OwningRewritePatternList> &patterns);
+
 /// Performs standalone tiling of a single LinalgOp by `tileSizes`.
 /// and permute the loop nest according to `interchangeVector`
 /// The permutation is expressed as a list of integers that specify
@@ -309,6 +313,13 @@ struct LinalgTilingPattern : public LinalgBaseTilingPattern {
                       PatternBenefit benefit = 1)
       : LinalgBaseTilingPattern(OpTy::getOperationName(), context, options,
                                 marker, benefit) {}
+  LogicalResult matchAndRewrite(Operation *op,
+                                PatternRewriter &rewriter) const override {
+    if (failed(LinalgBaseTilingPattern::matchAndRewrite(op, rewriter)))
+      return failure();
+    rewriter.eraseOp(op);
+    return success();
+  }
 };
 
 ///
@@ -411,7 +422,8 @@ enum class LinalgLoweringType {
   AffineLoops = 2,
   ParallelLoops = 3
 };
-template <typename OpTy> struct LinalgLoweringPattern : public RewritePattern {
+template <typename OpTy>
+struct LinalgLoweringPattern : public RewritePattern {
   LinalgLoweringPattern(MLIRContext *context, LinalgLoweringType loweringType,
                         LinalgMarker marker = LinalgMarker(),
                         PatternBenefit benefit = 1)
@@ -531,6 +543,58 @@ struct AffineMinSCFCanonicalizationPattern
                                 PatternRewriter &rewriter) const override;
 };
 
+/// Converts Convolution op into vector contraction.
+///
+/// Conversion expects ConvOp to have dimensions marked in the *mask* as
+/// false of size 1. This ensures that the ConvOp can be lowered to vector
+/// contraction of dimensions marked in the *mask* as true.
+///
+/// A good example is ConvNHWCOp which is 2D Conv op with channels as the last
+/// dimension. For this op we contract last 3 dimensions.
+/// The initial op definition looks like this:
+/// ```
+/// linalg.conv_2d_nhwc %arg0, %arg1, %arg2 :
+///   (memref<1x3x3x3xf32>, memref<1x3x3x3xf32>, memref<?x?x?x?xf32>)
+/// ```
+/// This op can be expressed as a dot product between %arg0 (input) and
+/// %arg1 (kernel) which is written into first entry of %arg2 (output). This is
+/// the ConvOp this pass expects and converts into:
+/// ```
+/// #map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+/// #map1 = affine_map<(d0, d1, d2) -> ()>
+/// .....
+/// %0 = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %c0_f32
+///   : memref<1x3x3x3xf32>, vector<3x3x3xf32>
+/// %1 = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %c0_f32
+///   : memref<1x3x3x3xf32>, vector<3x3x3xf32>
+/// %2 = vector.contract {indexing_maps = [#map0, #map0, #map1],
+///   iterator_types = ["reduction", "reduction", "reduction"]} %0, %1,
+///   %c0_f32 : vector<3x3x3xf32>, vector<3x3x3xf32> into f32
+/// store %2, %arg2[%c0, %c0, %c0, %c0] : memref<?x?x?x?xf32>
+/// ```
+/// where first 2 operations read input and kernel memory buffers into vectors.
+/// Subsequently, they are contracted together and the result is written to
+/// the first entry of the output buffer.
+template <typename ConvOp, int N>
+class ConvOpVectorization : public OpRewritePattern<ConvOp> {
+  using OpRewritePattern<ConvOp>::OpRewritePattern;
+  SmallVector<bool, 4> mask;
+
+public:
+  ConvOpVectorization(MLIRContext *context, SmallVector<bool, 4> msk)
+      : OpRewritePattern<ConvOp>(context) {
+    assert(msk.size() == N && "Mask size does not match rank");
+    this->mask = msk;
+  }
+
+  LogicalResult matchAndRewrite(ConvOp minOp,
+                                PatternRewriter &rewriter) const override;
+
+  // TODO: Make these pass arguments.
+  static const int tileSize = 3;
+  static const int noTile = 1;
+};
+
 //===----------------------------------------------------------------------===//
 // Support for staged pattern application.
 //===----------------------------------------------------------------------===//
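[Editor's note] A sketch of how the declarations above compose, assuming the usual greedy-driver workflow of this era; the helper function and the driver call are illustrative assumptions, not APIs added by this patch:

```c++
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

using namespace mlir;

// populateConvVectorizationPatterns fills one OwningRewritePatternList per
// ConvN-D op; each list can then be applied with the greedy rewrite driver.
static void vectorizeConvs(FuncOp func) {
  SmallVector<OwningRewritePatternList, 4> patterns;
  linalg::populateConvVectorizationPatterns(func.getContext(), patterns);
  for (OwningRewritePatternList &p : patterns)
    (void)applyPatternsAndFoldGreedily(func, p);
}
```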
diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
index beef1a70096e6..c0c59bda1894f 100644
--- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
@@ -94,42 +94,22 @@ Operation *fuseTensorOps(PatternRewriter &rewriter, Operation *consumer,
                          unsigned consumerIdx,
                          OperationFolder *folder = nullptr);
 
-/// Returns the linearized list of all view dimensions in a linalgOp. Applying
+/// Returns the linearized list of all view dimensions in a `linalgOp`. Applying
 /// the inverse, concatenated loopToOperandRangeMaps to this list allows the
 /// derivation of loop ranges for any linalgOp.
-template <typename ConcreteOp>
-SmallVector<Value, 8> getViewSizes(OpBuilder &builder, ConcreteOp linalgOp) {
-  auto loc = linalgOp.getLoc();
-  SmallVector<Value, 8> res;
-  SmallVector<unsigned, 4> ranks;
-  for (auto v : linalgOp.getInputsAndOutputBuffers()) {
-    MemRefType t = v.getType().template cast<MemRefType>();
-    ranks.push_back(t.getRank());
-    for (unsigned i = 0; i < t.getRank(); ++i)
-      res.push_back(builder.create<DimOp>(loc, v, i));
-  }
-
-  auto attr = linalgOp.template getAttrOfType<IntegerAttr>("symbol_source");
-  if (attr) {
-    // Find the correct position for inserting values for symbols.
-    unsigned numSymb = ranks[attr.getInt()], symbolsPos = 0;
-    for (unsigned idx = 0; idx < attr.getInt(); idx++)
-      symbolsPos += ranks[idx];
-
-    // Append the end of the value list that corresponds to the
-    // values mapping to symbols. Since inside concatinated map symbols are
-    // repeated we have to repeat the sizes as well.
-
-    // Reserve is mandatory to avoid a potential undefined behavior with
-    // pushing back to smallvector from itself.
-    res.reserve(res.size() + ranks.size() * numSymb);
-    for (unsigned idx = 0, s = ranks.size(); idx < s; ++idx)
-      for (unsigned idx2 = 0; idx2 < numSymb; ++idx2)
-        res.push_back(res[symbolsPos + idx2]);
-  }
-  return res;
+SmallVector<Value, 8> getViewSizes(OpBuilder &builder, LinalgOp linalgOp);
+
+template <typename ConcreteOpTy>
+SmallVector<Value, 8> getViewSizes(OpBuilder &builder, ConcreteOpTy linalgOp) {
+  return getViewSizes(builder, cast<LinalgOp>(linalgOp.getOperation()));
 }
 
+/// Returns the loop ranges of the `linalgOp`. Applies the inverse of the
+/// concatenated indexing maps to the result of `getViewSizes`. Returns None if
+/// the bounds computation fails.
+Optional<SmallVector<SubViewOp::Range, 4>>
+getLoopRanges(OpBuilder &builder, LinalgOp linalgOp,
+              OperationFolder *folder = nullptr);
+
 /// Returns the values obtained by applying `map` to the list of values.
 /// When non-null, the optional pointer `folder` is used to call into the
 /// `createAndFold` builder method. If `folder` is null, the regular `create`
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
index 8f5e1daf9aebc..40700e6d1b736 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
@@ -16,15 +16,14 @@
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/OpDefinition.h"
 
+#include "mlir/Dialect/OpenACC/OpenACCOpsDialect.h.inc"
 #include "mlir/Dialect/OpenACC/OpenACCOpsEnums.h.inc"
 
-namespace mlir {
-namespace acc {
-
 #define GET_OP_CLASSES
 #include "mlir/Dialect/OpenACC/OpenACCOps.h.inc"
 
-#include "mlir/Dialect/OpenACC/OpenACCOpsDialect.h.inc"
+namespace mlir {
+namespace acc {
 
 /// Enumeration used to encode the execution mapping on a loop construct.
 /// They refer directly to the OpenACC 3.0 standard:
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index 30d6f435b75fa..3fa26f932bd9e 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -24,7 +24,7 @@ def OpenACC_Dialect : Dialect {
     This dialect models the construct from the OpenACC 3.0 directive language.
   }];
 
-  let cppNamespace = "acc";
+  let cppNamespace = "::mlir::acc";
 }
 
 // Base class for OpenACC dialect ops.
@@ -36,7 +36,7 @@ class OpenACC_Op<string mnemonic, list<OpTrait> traits = []> :
   let parser = [{ return ::parse$cppClass(parser, result); }];
 }
 
-// Reduction operation enumeration
+// Reduction operation enumeration.
 def OpenACC_ReductionOpAdd : StrEnumAttrCase<"redop_add">;
 def OpenACC_ReductionOpMul : StrEnumAttrCase<"redop_mul">;
 def OpenACC_ReductionOpMax : StrEnumAttrCase<"redop_max">;
@@ -60,10 +60,22 @@ def OpenACC_ReductionOpAttr : StrEnumAttr<"ReductionOpAttr",
   let cppNamespace = "::mlir::acc";
 }
 
+// Type used in operation below.
+def IntOrIndex : AnyTypeOf<[AnyInteger, Index]>;
+
 //===----------------------------------------------------------------------===//
 // 2.5.1 parallel Construct
 //===----------------------------------------------------------------------===//
 
+// Parallel op default enumeration
+def OpenACC_DefaultNone : StrEnumAttrCase<"none">;
+def OpenACC_DefaultPresent : StrEnumAttrCase<"present">;
+def OpenACC_DefaultAttr : StrEnumAttr<"DefaultAttr",
+    "default attribute value for parallel op",
+    [OpenACC_DefaultNone, OpenACC_DefaultPresent]> {
+  let cppNamespace = "::mlir::acc";
+}
+
 def OpenACC_ParallelOp : OpenACC_Op<"parallel",
     [AttrSizedOperandSegments]> {
 
   let summary = "parallel construct";
@@ -81,25 +93,29 @@ def OpenACC_ParallelOp : OpenACC_Op<"parallel",
   ```
   }];
 
-  let arguments = (ins Optional<Index>:$async,
-                       Variadic<Index>:$waitOperands,
-                       Optional<Index>:$numGangs,
-                       Optional<Index>:$numWorkers,
-                       Optional<Index>:$vectorLength,
+  let arguments = (ins Optional<IntOrIndex>:$async,
+                       Variadic<IntOrIndex>:$waitOperands,
+                       Optional<IntOrIndex>:$numGangs,
+                       Optional<IntOrIndex>:$numWorkers,
+                       Optional<IntOrIndex>:$vectorLength,
                        Optional<I1>:$ifCond,
                        Optional<I1>:$selfCond,
                        OptionalAttr<OpenACC_ReductionOpAttr>:$reductionOp,
                        Variadic<AnyType>:$reductionOperands,
                        Variadic<AnyType>:$copyOperands,
                        Variadic<AnyType>:$copyinOperands,
+                       Variadic<AnyType>:$copyinReadonlyOperands,
                        Variadic<AnyType>:$copyoutOperands,
+                       Variadic<AnyType>:$copyoutZeroOperands,
                        Variadic<AnyType>:$createOperands,
+                       Variadic<AnyType>:$createZeroOperands,
                        Variadic<AnyType>:$noCreateOperands,
                        Variadic<AnyType>:$presentOperands,
                        Variadic<AnyType>:$devicePtrOperands,
                        Variadic<AnyType>:$attachOperands,
                        Variadic<AnyType>:$gangPrivateOperands,
-                       Variadic<AnyType>:$gangFirstPrivateOperands);
+                       Variadic<AnyType>:$gangFirstPrivateOperands,
+                       OptionalAttr<OpenACC_DefaultAttr>:$defaultAttr);
 
   let regions = (region AnyRegion:$region);
 
@@ -114,8 +130,11 @@ def OpenACC_ParallelOp : OpenACC_Op<"parallel",
     static StringRef getReductionKeyword() { return "reduction"; }
     static StringRef getCopyKeyword() { return "copy"; }
     static StringRef getCopyinKeyword() { return "copyin"; }
+    static StringRef getCopyinReadonlyKeyword() { return "copyin_readonly"; }
     static StringRef getCopyoutKeyword() { return "copyout"; }
+    static StringRef getCopyoutZeroKeyword() { return "copyout_zero"; }
    static StringRef getCreateKeyword() { return "create"; }
+    static StringRef getCreateZeroKeyword() { return "create_zero"; }
     static StringRef getNoCreateKeyword() { return "no_create"; }
     static StringRef getPresentKeyword() { return "present"; }
     static StringRef getDevicePtrKeyword() { return "deviceptr"; }
@@ -200,7 +219,8 @@ def OpenACC_TerminatorOp : OpenACC_Op<"terminator", [Terminator]> {
 //===----------------------------------------------------------------------===//
 
 def OpenACC_LoopOp : OpenACC_Op<"loop",
-    [AttrSizedOperandSegments]> {
+    [AttrSizedOperandSegments,
+     SingleBlockImplicitTerminator<"acc::YieldOp">]> {
 
   let summary = "loop construct";
 
   let description = [{
@@ -228,13 +248,14 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
                        Optional<IntOrIndex>:$gangStatic,
                        Optional<IntOrIndex>:$workerNum,
                        Optional<IntOrIndex>:$vectorLength,
-                       UnitAttr:$loopSeq,
-                       UnitAttr:$loopIndependent,
-                       UnitAttr:$loopAuto,
+                       UnitAttr:$seq,
+                       UnitAttr:$independent,
+                       UnitAttr:$auto_,
                        Variadic<IntOrIndex>:$tileOperands,
                        Variadic<AnyType>:$privateOperands,
                        OptionalAttr<OpenACC_ReductionOpAttr>:$reductionOp,
-                       Variadic<AnyType>:$reductionOperands);
+                       Variadic<AnyType>:$reductionOperands,
+                       DefaultValuedAttr<I64Attr, "0">:$exec_mapping);
 
   let results = (outs Variadic<AnyType>:$results);
 
@@ -256,7 +277,7 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
     static StringRef getReductionKeyword() { return "reduction"; }
   }];
 
-  let verifier = ?;
+  let verifier = [{ return ::verifyLoopOp(*this); }];
 }
 
 // Yield operation for the acc.loop and acc.parallel operations.
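[Editor's note] The new `verifier` hook above points at a free function defined in the op's .cpp file, which is not part of this header diff. A hypothetical sketch of its shape, only to illustrate what the renamed unit attributes make checkable; the real `verifyLoopOp` upstream may enforce different or additional invariants:

```c++
#include "mlir/Dialect/OpenACC/OpenACC.h"

using namespace mlir;

// Illustrative only: seq, independent and auto are mutually exclusive on a
// loop construct per OpenACC 3.0, which a verifier of this shape can reject.
static LogicalResult verifyLoopOp(acc::LoopOp loopOp) {
  unsigned parallelism = loopOp.seq() + loopOp.independent() + loopOp.auto_();
  if (parallelism > 1)
    return loopOp.emitError(
        "at most one of seq, independent and auto may be present");
  return success();
}
```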
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h b/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h
index 8f0bb93e1043e..0715b9ddd394c 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h
@@ -16,16 +16,10 @@
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/OpDefinition.h"
 
+#include "mlir/Dialect/OpenMP/OpenMPOpsDialect.h.inc"
 #include "mlir/Dialect/OpenMP/OpenMPOpsEnums.h.inc"
 
-namespace mlir {
-namespace omp {
-
 #define GET_OP_CLASSES
 #include "mlir/Dialect/OpenMP/OpenMPOps.h.inc"
 
-#include "mlir/Dialect/OpenMP/OpenMPOpsDialect.h.inc"
-
-} // namespace omp
-} // namespace mlir
-
 #endif // MLIR_DIALECT_OPENMP_OPENMPDIALECT_H_
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index eb92745d6fa5e..3ac7f2c5dda53 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -19,7 +19,7 @@ include "mlir/Dialect/OpenMP/OmpCommon.td"
 
 def OpenMP_Dialect : Dialect {
   let name = "omp";
-  let cppNamespace = "omp";
+  let cppNamespace = "::mlir::omp";
 }
 
 class OpenMP_Op<string mnemonic, list<OpTrait> traits = []> :
diff --git a/mlir/include/mlir/Dialect/PDL/IR/PDL.h b/mlir/include/mlir/Dialect/PDL/IR/PDL.h
index 64dbf8f74399f..14136021d26ce 100644
--- a/mlir/include/mlir/Dialect/PDL/IR/PDL.h
+++ b/mlir/include/mlir/Dialect/PDL/IR/PDL.h
@@ -19,8 +19,6 @@
 #include "mlir/IR/SymbolTable.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 
-namespace mlir {
-namespace pdl {
 //===----------------------------------------------------------------------===//
 // PDL Dialect
 //===----------------------------------------------------------------------===//
@@ -34,7 +32,5 @@ namespace pdl {
 #define GET_OP_CLASSES
 #include "mlir/Dialect/PDL/IR/PDLOps.h.inc"
 
-} // end namespace pdl
-} // end namespace mlir
 
 #endif // MLIR_DIALECT_PDL_IR_PDL_H_
diff --git a/mlir/include/mlir/Dialect/PDL/IR/PDLBase.td b/mlir/include/mlir/Dialect/PDL/IR/PDLBase.td
index 9802bf9431572..b372e594e2e73 100644
--- a/mlir/include/mlir/Dialect/PDL/IR/PDLBase.td
+++ b/mlir/include/mlir/Dialect/PDL/IR/PDLBase.td
@@ -63,7 +63,7 @@ def PDL_Dialect : Dialect {
   }];
 
   let name = "pdl";
-  let cppNamespace = "mlir::pdl";
+  let cppNamespace = "::mlir::pdl";
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/PDLInterp/IR/PDLInterp.h b/mlir/include/mlir/Dialect/PDLInterp/IR/PDLInterp.h
index 6d895679b3d65..07c7f84c80784 100644
--- a/mlir/include/mlir/Dialect/PDLInterp/IR/PDLInterp.h
+++ b/mlir/include/mlir/Dialect/PDLInterp/IR/PDLInterp.h
@@ -18,8 +18,6 @@
 #include "mlir/Interfaces/InferTypeOpInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 
-namespace mlir {
-namespace pdl_interp {
 //===----------------------------------------------------------------------===//
 // PDLInterp Dialect
 //===----------------------------------------------------------------------===//
@@ -33,7 +31,4 @@ namespace pdl_interp {
 #define GET_OP_CLASSES
 #include "mlir/Dialect/PDLInterp/IR/PDLInterpOps.h.inc"
 
-} // end namespace pdl_interp
-} // end namespace mlir
-
 #endif // MLIR_DIALECT_PDLINTERP_IR_PDLINTERP_H_
diff --git a/mlir/include/mlir/Dialect/PDLInterp/IR/PDLInterpOps.td b/mlir/include/mlir/Dialect/PDLInterp/IR/PDLInterpOps.td
index 58a2032a21825..e95162bb65806 100644
--- a/mlir/include/mlir/Dialect/PDLInterp/IR/PDLInterpOps.td
+++ b/mlir/include/mlir/Dialect/PDLInterp/IR/PDLInterpOps.td
@@ -34,7 +34,7 @@ def PDLInterp_Dialect : Dialect {
   }];
 
   let name = "pdl_interp";
= "pdl_interp"; - let cppNamespace = "mlir::pdl_interp"; + let cppNamespace = "::mlir::pdl_interp"; let dependentDialects = ["pdl::PDLDialect"]; } diff --git a/mlir/include/mlir/Dialect/Quant/QuantOps.h b/mlir/include/mlir/Dialect/Quant/QuantOps.h index 234a2b44c6f6b..00a6032a2fea0 100644 --- a/mlir/include/mlir/Dialect/Quant/QuantOps.h +++ b/mlir/include/mlir/Dialect/Quant/QuantOps.h @@ -18,15 +18,9 @@ #include "mlir/Interfaces/SideEffectInterfaces.h" #include "llvm/Support/MathExtras.h" -namespace mlir { -namespace quant { - #include "mlir/Dialect/Quant/QuantOpsDialect.h.inc" #define GET_OP_CLASSES #include "mlir/Dialect/Quant/QuantOps.h.inc" -} // namespace quant -} // namespace mlir - #endif // MLIR_DIALECT_QUANT_QUANTOPS_H_ diff --git a/mlir/include/mlir/Dialect/Quant/QuantOpsBase.td b/mlir/include/mlir/Dialect/Quant/QuantOpsBase.td index aa7c311e20a3f..10339fcbcf5d8 100644 --- a/mlir/include/mlir/Dialect/Quant/QuantOpsBase.td +++ b/mlir/include/mlir/Dialect/Quant/QuantOpsBase.td @@ -17,6 +17,7 @@ include "mlir/IR/OpBase.td" def Quantization_Dialect : Dialect { let name = "quant"; + let cppNamespace = "::mlir::quant"; } //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/SCF/SCF.h b/mlir/include/mlir/Dialect/SCF/SCF.h index 3974b58cbfbba..55c8cbf5fa744 100644 --- a/mlir/include/mlir/Dialect/SCF/SCF.h +++ b/mlir/include/mlir/Dialect/SCF/SCF.h @@ -23,14 +23,18 @@ namespace mlir { namespace scf { - void buildTerminatedBody(OpBuilder &builder, Location loc); +} // namespace scf +} // namespace mlir #include "mlir/Dialect/SCF/SCFOpsDialect.h.inc" #define GET_OP_CLASSES #include "mlir/Dialect/SCF/SCFOps.h.inc" +namespace mlir { +namespace scf { + // Insert `loop.yield` at the end of the only region's only block if it // does not have a terminator already. If a new `loop.yield` is inserted, // the location is specified by `loc`. If the region is empty, insert a new diff --git a/mlir/include/mlir/Dialect/SCF/SCFOps.td b/mlir/include/mlir/Dialect/SCF/SCFOps.td index 59ba50fbe2322..179b4d773a3a4 100644 --- a/mlir/include/mlir/Dialect/SCF/SCFOps.td +++ b/mlir/include/mlir/Dialect/SCF/SCFOps.td @@ -19,7 +19,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" def SCF_Dialect : Dialect { let name = "scf"; - let cppNamespace = "scf"; + let cppNamespace = "::mlir::scf"; } // Base class for SCF dialect ops. diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVAttributes.h b/mlir/include/mlir/Dialect/SPIRV/SPIRVAttributes.h index b1909b3675535..a743fa9c30d98 100644 --- a/mlir/include/mlir/Dialect/SPIRV/SPIRVAttributes.h +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVAttributes.h @@ -17,10 +17,10 @@ #include "mlir/IR/Attributes.h" #include "mlir/Support/LLVM.h" -namespace mlir { // Pull in SPIR-V attribute definitions for target and ABI. #include "mlir/Dialect/SPIRV/TargetAndABI.h.inc" +namespace mlir { namespace spirv { enum class Capability : uint32_t; enum class Extension; diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td index 21f926a1500c5..83150dad514db 100644 --- a/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td @@ -45,7 +45,7 @@ def SPIRV_Dialect : Dialect { high-level designs and implementation structures of the SPIR-V dialect. 
   }];
 
-  let cppNamespace = "spirv";
+  let cppNamespace = "::mlir::spirv";
   let hasConstantMaterializer = 1;
   let hasOperationAttrVerify = 1;
   let hasRegionArgAttrVerify = 1;
@@ -226,21 +226,24 @@ class Capability<list<I32EnumAttrCase> capabilities> : Availability {
   let instance = "ref";
 }
 
+class SPIRVOpInterface<string name> : OpInterface<name> {
+  let cppNamespace = "::mlir::spirv";
+}
 // TODO: the following interfaces definitions are duplicating with the above.
 // Remove them once we are able to support dialect-specific contents in ODS.
-def QueryMinVersionInterface : OpInterface<"QueryMinVersionInterface"> {
+def QueryMinVersionInterface : SPIRVOpInterface<"QueryMinVersionInterface"> {
   let methods = [InterfaceMethod<"", "::mlir::spirv::Version", "getMinVersion">];
 }
-def QueryMaxVersionInterface : OpInterface<"QueryMaxVersionInterface"> {
+def QueryMaxVersionInterface : SPIRVOpInterface<"QueryMaxVersionInterface"> {
   let methods = [InterfaceMethod<"", "::mlir::spirv::Version", "getMaxVersion">];
 }
-def QueryExtensionInterface : OpInterface<"QueryExtensionInterface"> {
+def QueryExtensionInterface : SPIRVOpInterface<"QueryExtensionInterface"> {
   let methods = [InterfaceMethod<
       "",
      "::llvm::SmallVector<::llvm::ArrayRef<::mlir::spirv::Extension>, 1>",
       "getExtensions">];
 }
-def QueryCapabilityInterface : OpInterface<"QueryCapabilityInterface"> {
+def QueryCapabilityInterface : SPIRVOpInterface<"QueryCapabilityInterface"> {
   let methods = [InterfaceMethod<
       "",
       "::llvm::SmallVector<::llvm::ArrayRef<::mlir::spirv::Capability>, 1>",
@@ -3253,6 +3256,7 @@ def SPV_OC_OpGroupBroadcast : I32EnumAttrCase<"OpGroupBroadcast", 263
 def SPV_OC_OpNoLine : I32EnumAttrCase<"OpNoLine", 317>;
 def SPV_OC_OpModuleProcessed : I32EnumAttrCase<"OpModuleProcessed", 330>;
 def SPV_OC_OpGroupNonUniformElect : I32EnumAttrCase<"OpGroupNonUniformElect", 333>;
+def SPV_OC_OpGroupNonUniformBroadcast : I32EnumAttrCase<"OpGroupNonUniformBroadcast", 337>;
 def SPV_OC_OpGroupNonUniformBallot : I32EnumAttrCase<"OpGroupNonUniformBallot", 339>;
 def SPV_OC_OpGroupNonUniformIAdd : I32EnumAttrCase<"OpGroupNonUniformIAdd", 349>;
 def SPV_OC_OpGroupNonUniformFAdd : I32EnumAttrCase<"OpGroupNonUniformFAdd", 350>;
@@ -3320,16 +3324,16 @@ def SPV_OpcodeAttr :
       SPV_OC_OpBranch, SPV_OC_OpBranchConditional, SPV_OC_OpReturn,
       SPV_OC_OpReturnValue, SPV_OC_OpUnreachable, SPV_OC_OpGroupBroadcast,
       SPV_OC_OpNoLine, SPV_OC_OpModuleProcessed, SPV_OC_OpGroupNonUniformElect,
-      SPV_OC_OpGroupNonUniformBallot, SPV_OC_OpGroupNonUniformIAdd,
-      SPV_OC_OpGroupNonUniformFAdd, SPV_OC_OpGroupNonUniformIMul,
-      SPV_OC_OpGroupNonUniformFMul, SPV_OC_OpGroupNonUniformSMin,
-      SPV_OC_OpGroupNonUniformUMin, SPV_OC_OpGroupNonUniformFMin,
-      SPV_OC_OpGroupNonUniformSMax, SPV_OC_OpGroupNonUniformUMax,
-      SPV_OC_OpGroupNonUniformFMax, SPV_OC_OpSubgroupBallotKHR,
-      SPV_OC_OpTypeCooperativeMatrixNV, SPV_OC_OpCooperativeMatrixLoadNV,
-      SPV_OC_OpCooperativeMatrixStoreNV, SPV_OC_OpCooperativeMatrixMulAddNV,
-      SPV_OC_OpCooperativeMatrixLengthNV, SPV_OC_OpSubgroupBlockReadINTEL,
-      SPV_OC_OpSubgroupBlockWriteINTEL
+      SPV_OC_OpGroupNonUniformBroadcast, SPV_OC_OpGroupNonUniformBallot,
+      SPV_OC_OpGroupNonUniformIAdd, SPV_OC_OpGroupNonUniformFAdd,
+      SPV_OC_OpGroupNonUniformIMul, SPV_OC_OpGroupNonUniformFMul,
+      SPV_OC_OpGroupNonUniformSMin, SPV_OC_OpGroupNonUniformUMin,
+      SPV_OC_OpGroupNonUniformFMin, SPV_OC_OpGroupNonUniformSMax,
+      SPV_OC_OpGroupNonUniformUMax, SPV_OC_OpGroupNonUniformFMax,
+      SPV_OC_OpSubgroupBallotKHR, SPV_OC_OpTypeCooperativeMatrixNV,
+      SPV_OC_OpCooperativeMatrixLoadNV, SPV_OC_OpCooperativeMatrixStoreNV,
+      SPV_OC_OpCooperativeMatrixMulAddNV, SPV_OC_OpCooperativeMatrixLengthNV,
+      SPV_OC_OpSubgroupBlockReadINTEL, SPV_OC_OpSubgroupBlockWriteINTEL
     ]>;
 
 // End opcode section. Generated from SPIR-V spec; DO NOT MODIFY!
diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVCastOps.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVCastOps.td
index c67c8d5e45423..0e595984dde4d 100644
--- a/mlir/include/mlir/Dialect/SPIRV/SPIRVCastOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVCastOps.td
@@ -122,6 +122,8 @@ def SPV_ConvertFToSOp : SPV_CastOp<"ConvertFToS", SPV_Integer, SPV_Float, []> {
     %3 = spv.ConvertFToS %2 : vector<3xf32> to vector<3xi32>
     ```
   }];
+
+  let verifier = [{ return verifyCastOp(this->getOperation(), false, true); }];
 }
 
 // -----
@@ -155,6 +157,8 @@ def SPV_ConvertFToUOp : SPV_CastOp<"ConvertFToU", SPV_Integer, SPV_Float, []> {
     %3 = spv.ConvertFToU %2 : vector<3xf32> to vector<3xi32>
     ```
   }];
+
+  let verifier = [{ return verifyCastOp(this->getOperation(), false, true); }];
 }
 
 // -----
@@ -186,6 +190,8 @@ def SPV_ConvertSToFOp : SPV_CastOp<"ConvertSToF", SPV_Float, SPV_Integer, []> {
     %3 = spv.ConvertSToF %2 : vector<3xi32> to vector<3xf32>
     ```
   }];
+
+  let verifier = [{ return verifyCastOp(this->getOperation(), false, true); }];
 }
 
 // -----
@@ -217,6 +223,8 @@ def SPV_ConvertUToFOp : SPV_CastOp<"ConvertUToF", SPV_Float, SPV_Integer, []> {
     %3 = spv.ConvertUToF %2 : vector<3xi32> to vector<3xf32>
     ```
   }];
+
+  let verifier = [{ return verifyCastOp(this->getOperation(), false, true); }];
 }
 
 // -----
diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVDialect.h b/mlir/include/mlir/Dialect/SPIRV/SPIRVDialect.h
index 2cffebec60ea6..1b37abb937644 100644
--- a/mlir/include/mlir/Dialect/SPIRV/SPIRVDialect.h
+++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVDialect.h
@@ -20,9 +20,9 @@ namespace spirv {
 
 enum class Decoration : uint32_t;
 
-#include "mlir/Dialect/SPIRV/SPIRVOpsDialect.h.inc"
-
 } // end namespace spirv
 } // end namespace mlir
 
+#include "mlir/Dialect/SPIRV/SPIRVOpsDialect.h.inc"
+
 #endif // MLIR_DIALECT_SPIRV_SPIRVDIALECT_H_
diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVNonUniformOps.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVNonUniformOps.td
index 34be336bb2a56..da3da3050efce 100644
--- a/mlir/include/mlir/Dialect/SPIRV/SPIRVNonUniformOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVNonUniformOps.td
@@ -105,6 +105,77 @@ def SPV_GroupNonUniformBallotOp : SPV_Op<"GroupNonUniformBallot", []> {
 
 // -----
 
+def SPV_GroupNonUniformBroadcastOp : SPV_Op<"GroupNonUniformBroadcast",
+    [NoSideEffect, AllTypesMatch<["value", "result"]>]> {
+  let summary = [{
+    Return the Value of the invocation identified by the id Id to all active
+    invocations in the group.
+  }];
+
+  let description = [{
+    Result Type must be a scalar or vector of floating-point type, integer
+    type, or Boolean type.
+
+    Execution must be Workgroup or Subgroup Scope.
+
+    The type of Value must be the same as Result Type.
+
+    Id must be a scalar of integer type, whose Signedness operand is 0.
+
+    Before version 1.5, Id must come from a constant instruction. Starting
+    with version 1.5, Id must be dynamically uniform.
+
+    The resulting value is undefined if Id is an inactive invocation, or is
+    greater than or equal to the size of the group.
+
+    <!-- End of AutoGen section -->
+
+    ```
+    scope ::= `"Workgroup"` | `"Subgroup"`
+    integer-float-scalar-vector-type ::= integer-type | float-type |
+      `vector<` integer-literal `x` integer-type `>` |
+      `vector<` integer-literal `x` float-type `>`
+    group-non-uniform-broadcast-op ::= ssa-id `=`
+      `spv.GroupNonUniformBroadcast` scope ssa_use,
+      ssa_use `:` integer-float-scalar-vector-type `,` integer-type
+    ```
+
+    #### Example:
+
+    ```mlir
+    %scalar_value = ... : f32
+    %vector_value = ... : vector<4xf32>
+    %id = ... : i32
+    %0 = spv.GroupNonUniformBroadcast "Subgroup" %scalar_value, %id : f32, i32
+    %1 = spv.GroupNonUniformBroadcast "Workgroup" %vector_value, %id :
+      vector<4xf32>, i32
+    ```
+  }];
+
+  let availability = [
+    MinVersion<SPV_V_1_3>,
+    MaxVersion<SPV_V_1_5>,
+    Extension<[]>,
+    Capability<[SPV_C_GroupNonUniformBallot]>
+  ];
+
+  let arguments = (ins
+    SPV_ScopeAttr:$execution_scope,
+    SPV_Type:$value,
+    SPV_Integer:$id
+  );
+
+  let results = (outs
+    SPV_Type:$result
+  );
+
+  let assemblyFormat = [{
+    $execution_scope operands attr-dict `:` type($value) `,` type($id)
+  }];
+}
+
+// -----
+
 def SPV_GroupNonUniformElectOp : SPV_Op<"GroupNonUniformElect", []> {
   let summary = [{
     Result is true only in the active invocation with the lowest id in the
@@ -368,8 +439,8 @@ def SPV_GroupNonUniformFMulOp :
 def SPV_GroupNonUniformIAddOp :
     SPV_GroupNonUniformArithmeticOp<"GroupNonUniformIAdd", SPV_Integer, []> {
   let summary = [{
-    An integer add group operation of all Value operands contributed active
-    by invocations in the group.
+    An integer add group operation of all Value operands contributed by
+    active invocations in the group.
  }];
 
   let description = [{
diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVOps.h b/mlir/include/mlir/Dialect/SPIRV/SPIRVOps.h
index 01a2c6081643a..61568df03dcd8 100644
--- a/mlir/include/mlir/Dialect/SPIRV/SPIRVOps.h
+++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVOps.h
@@ -28,11 +28,15 @@ class VerCapExtAttr;
 // TableGen'erated operation interfaces for querying versions, extensions, and
 // capabilities.
 #include "mlir/Dialect/SPIRV/SPIRVAvailability.h.inc"
+} // namespace spirv
+} // namespace mlir
 
 // TablenGen'erated operation declarations.
 #define GET_OP_CLASSES
 #include "mlir/Dialect/SPIRV/SPIRVOps.h.inc"
 
+namespace mlir {
+namespace spirv {
 // TableGen'erated helper functions.
 //
 // Get the name used in the Op to refer to an enum value of the given
diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVTypes.h b/mlir/include/mlir/Dialect/SPIRV/SPIRVTypes.h
index 2d224effdee35..43fb708c7908d 100644
--- a/mlir/include/mlir/Dialect/SPIRV/SPIRVTypes.h
+++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVTypes.h
@@ -77,25 +77,25 @@ class SPIRVType : public Type {
   /// The extension requirements for each type are following the
   /// ((Extension::A OR Extension::B) AND (Extension::C OR Extension::D))
   /// convention.
-  using ExtensionArrayRefVector = SmallVectorImpl<ArrayRef<spirv::Extension>>;
+  using ExtensionArrayRefVector = SmallVectorImpl<ArrayRef<Extension>>;
 
   /// Appends to `extensions` the extensions needed for this type to appear in
   /// the given `storage` class. This method does not guarantee the uniqueness
   /// of extensions; the same extension may be appended multiple times.
   void getExtensions(ExtensionArrayRefVector &extensions,
-                     Optional<spirv::StorageClass> storage = llvm::None);
+                     Optional<StorageClass> storage = llvm::None);
 
   /// The capability requirements for each type are following the
   /// ((Capability::A OR Extension::B) AND (Capability::C OR Capability::D))
   /// convention.
-  using CapabilityArrayRefVector = SmallVectorImpl<ArrayRef<spirv::Capability>>;
+  using CapabilityArrayRefVector = SmallVectorImpl<ArrayRef<Capability>>;
 
   /// Appends to `capabilities` the capabilities needed for this type to appear
   /// in the given `storage` class. This method does not guarantee the
   /// uniqueness of capabilities; the same capability may be appended multiple
   /// times.
   void getCapabilities(CapabilityArrayRefVector &capabilities,
-                       Optional<spirv::StorageClass> storage = llvm::None);
+                       Optional<StorageClass> storage = llvm::None);
 
   /// Returns the size in bytes for each type. If no size can be calculated,
   /// returns `llvm::None`. Note that if the type has explicit layout, it is
@@ -116,9 +116,9 @@ class ScalarType : public SPIRVType {
   static bool isValid(IntegerType);
 
   void getExtensions(SPIRVType::ExtensionArrayRefVector &extensions,
-                     Optional<spirv::StorageClass> storage = llvm::None);
+                     Optional<StorageClass> storage = llvm::None);
   void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities,
-                       Optional<spirv::StorageClass> storage = llvm::None);
+                       Optional<StorageClass> storage = llvm::None);
 
   Optional<int64_t> getSizeInBytes();
 };
@@ -144,9 +144,9 @@ class CompositeType : public SPIRVType {
   bool hasCompileTimeKnownNumElements() const;
 
   void getExtensions(SPIRVType::ExtensionArrayRefVector &extensions,
-                     Optional<spirv::StorageClass> storage = llvm::None);
+                     Optional<StorageClass> storage = llvm::None);
   void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities,
-                       Optional<spirv::StorageClass> storage = llvm::None);
+                       Optional<StorageClass> storage = llvm::None);
 
   Optional<int64_t> getSizeInBytes();
 };
@@ -172,9 +172,9 @@ class ArrayType : public Type::TypeBase<ArrayType, CompositeType,
   void getExtensions(SPIRVType::ExtensionArrayRefVector &extensions,
-                     Optional<spirv::StorageClass> storage = llvm::None);
+                     Optional<StorageClass> storage = llvm::None);
   void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities,
-                       Optional<spirv::StorageClass> storage = llvm::None);
+                       Optional<StorageClass> storage = llvm::None);
 
   /// Returns the array size in bytes. Since array type may have an explicit
   /// stride declaration (in bytes), we also include it in the calculation.
@@ -215,9 +215,9 @@ class ImageType
   // TODO: Add support for Access qualifier
 
   void getExtensions(SPIRVType::ExtensionArrayRefVector &extensions,
-                     Optional<spirv::StorageClass> storage = llvm::None);
+                     Optional<StorageClass> storage = llvm::None);
   void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities,
-                       Optional<spirv::StorageClass> storage = llvm::None);
+                       Optional<StorageClass> storage = llvm::None);
 };
 
 // SPIR-V pointer type
@@ -233,9 +233,9 @@ class PointerType : public Type::TypeBase<PointerType, SPIRVType,
   void getExtensions(SPIRVType::ExtensionArrayRefVector &extensions,
-                     Optional<spirv::StorageClass> storage = llvm::None);
+                     Optional<StorageClass> storage = llvm::None);
   void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities,
-                       Optional<spirv::StorageClass> storage = llvm::None);
+                       Optional<StorageClass> storage = llvm::None);
 };
 
 // SPIR-V run-time array type
@@ -257,9 +257,9 @@ class RuntimeArrayType
   unsigned getArrayStride() const;
 
   void getExtensions(SPIRVType::ExtensionArrayRefVector &extensions,
-                     Optional<spirv::StorageClass> storage = llvm::None);
+                     Optional<StorageClass> storage = llvm::None);
   void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities,
-                       Optional<spirv::StorageClass> storage = llvm::None);
+                       Optional<StorageClass> storage = llvm::None);
 };
 
 // SPIR-V struct type
@@ -335,21 +335,21 @@ class StructType : public Type::TypeBase<StructType, CompositeType,
                             &memberDecorations) const;
-  // Returns in `decorationsInfo` all the spirv::Decorations (apart from
-  // Offset) associated with the `i`-th member of the StructType.
+  // Returns in `decorationsInfo` all the Decorations (apart from Offset)
+  // associated with the `i`-th member of the StructType.
   void getMemberDecorations(unsigned i,
                             SmallVectorImpl<MemberDecorationInfo>
                                 &decorationsInfo) const;
 
   void getExtensions(SPIRVType::ExtensionArrayRefVector &extensions,
-                     Optional<spirv::StorageClass> storage = llvm::None);
+                     Optional<StorageClass> storage = llvm::None);
   void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities,
-                       Optional<spirv::StorageClass> storage = llvm::None);
+                       Optional<StorageClass> storage = llvm::None);
 };
 
 llvm::hash_code
@@ -362,21 +362,21 @@ class CooperativeMatrixNVType
 public:
   using Base::Base;
 
-  static CooperativeMatrixNVType get(Type elementType, spirv::Scope scope,
+  static CooperativeMatrixNVType get(Type elementType, Scope scope,
                                      unsigned rows, unsigned columns);
   Type getElementType() const;
 
   /// Return the scope of the cooperative matrix.
-  spirv::Scope getScope() const;
+  Scope getScope() const;
   /// return the number of rows of the matrix.
   unsigned getRows() const;
   /// return the number of columns of the matrix.
   unsigned getColumns() const;
 
   void getExtensions(SPIRVType::ExtensionArrayRefVector &extensions,
-                     Optional<spirv::StorageClass> storage = llvm::None);
+                     Optional<StorageClass> storage = llvm::None);
   void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities,
-                       Optional<spirv::StorageClass> storage = llvm::None);
+                       Optional<StorageClass> storage = llvm::None);
 };
 
 // SPIR-V matrix type
@@ -412,9 +412,9 @@ class MatrixType : public Type::TypeBase<MatrixType, CompositeType,
   void getExtensions(SPIRVType::ExtensionArrayRefVector &extensions,
-                     Optional<spirv::StorageClass> storage = llvm::None);
+                     Optional<StorageClass> storage = llvm::None);
   void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities,
-                       Optional<spirv::StorageClass> storage = llvm::None);
+                       Optional<StorageClass> storage = llvm::None);
 };
 
 } // end namespace spirv
diff --git a/mlir/include/mlir/Dialect/Shape/IR/Shape.h b/mlir/include/mlir/Dialect/Shape/IR/Shape.h
index cc601bdedaca6..f40d6154544ae 100644
--- a/mlir/include/mlir/Dialect/Shape/IR/Shape.h
+++ b/mlir/include/mlir/Dialect/Shape/IR/Shape.h
@@ -67,12 +67,12 @@ class WitnessType : public Type::TypeBase<WitnessType, Type, TypeStorage> {
   using Base::Base;
 };
 
+} // namespace shape
+} // namespace mlir
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/Shape/IR/ShapeOps.h.inc"
 
 #include "mlir/Dialect/Shape/IR/ShapeOpsDialect.h.inc"
 
-} // namespace shape
-} // namespace mlir
-
 #endif // MLIR_SHAPE_IR_SHAPE_H
diff --git a/mlir/include/mlir/Dialect/Shape/IR/ShapeBase.td b/mlir/include/mlir/Dialect/Shape/IR/ShapeBase.td
index 754dfcd6452f3..b038819bca3d1 100644
--- a/mlir/include/mlir/Dialect/Shape/IR/ShapeBase.td
+++ b/mlir/include/mlir/Dialect/Shape/IR/ShapeBase.td
@@ -36,7 +36,7 @@ def ShapeDialect : Dialect {
     concatting etc. on how to combine them).
   }];
 
-  let cppNamespace = "shape";
+  let cppNamespace = "::mlir::shape";
 
   let hasConstantMaterializer = 1;
 }
diff --git a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
index 2e8f032370399..ed89ce36fb8a7 100644
--- a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
+++ b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
@@ -738,5 +738,27 @@ def Shape_ConstWitnessOp : Shape_Op<"const_witness", [ConstantLike, NoSideEffect
   let hasFolder = 1;
 }
 
+def Shape_CstrRequireOp : Shape_Op<"cstr_require", []> {
+  let summary = "Represents a runtime assertion that an i1 is `true`";
+  let description = [{
+    Represents a runtime assertion that an i1 is true. It returns a
+    !shape.witness to order this assertion.
+
+    For simplicity, prefer using other cstr_* ops if they are available for a
+    given constraint.
+
+    Example:
+    ```mlir
+    %bool = ...
+    %w0 = shape.cstr_require %bool // Passing if `%bool` is true.
+    ```
+  }];
+
+  let arguments = (ins I1:$pred);
+  let results = (outs Shape_WitnessType:$result);
+
+  let assemblyFormat = "$pred attr-dict";
+
+  let hasFolder = 1;
+}
 
 #endif // SHAPE_OPS
diff --git a/mlir/include/mlir/Dialect/StandardOps/EDSC/Builders.h b/mlir/include/mlir/Dialect/StandardOps/EDSC/Builders.h
index 36df24f60c704..ffb3ba30b699a 100644
--- a/mlir/include/mlir/Dialect/StandardOps/EDSC/Builders.h
+++ b/mlir/include/mlir/Dialect/StandardOps/EDSC/Builders.h
@@ -20,10 +20,10 @@ namespace edsc {
 class BoundsCapture {
 public:
   unsigned rank() const { return lbs.size(); }
-  Value lb(unsigned idx) { return lbs[idx]; }
-  Value ub(unsigned idx) { return ubs[idx]; }
-  int64_t step(unsigned idx) { return steps[idx]; }
-  std::tuple<Value, Value, int64_t> range(unsigned idx) {
+  Value lb(unsigned idx) const { return lbs[idx]; }
+  Value ub(unsigned idx) const { return ubs[idx]; }
+  int64_t step(unsigned idx) const { return steps[idx]; }
+  std::tuple<Value, Value, int64_t> range(unsigned idx) const {
     return std::make_tuple(lbs[idx], ubs[idx], steps[idx]);
   }
   void swapRanges(unsigned i, unsigned j) {
@@ -34,9 +34,9 @@ class BoundsCapture {
     std::swap(steps[i], steps[j]);
   }
 
-  ArrayRef<Value> getLbs() { return lbs; }
-  ArrayRef<Value> getUbs() { return ubs; }
-  ArrayRef<int64_t> getSteps() { return steps; }
+  ArrayRef<Value> getLbs() const { return lbs; }
+  ArrayRef<Value> getUbs() const { return ubs; }
+  ArrayRef<int64_t> getSteps() const { return steps; }
 
 protected:
   SmallVector<Value, 8> lbs;
@@ -52,8 +52,6 @@ class BoundsCapture {
 class MemRefBoundsCapture : public BoundsCapture {
 public:
   explicit MemRefBoundsCapture(Value v);
-  MemRefBoundsCapture(const MemRefBoundsCapture &) = default;
-  MemRefBoundsCapture &operator=(const MemRefBoundsCapture &) = default;
 
   unsigned fastestVarying() const { return rank() - 1; }
 
@@ -69,8 +67,6 @@ class VectorBoundsCapture : public BoundsCapture {
 public:
   explicit VectorBoundsCapture(Value v);
   explicit VectorBoundsCapture(VectorType t);
-  VectorBoundsCapture(const VectorBoundsCapture &) = default;
-  VectorBoundsCapture &operator=(const VectorBoundsCapture &) = default;
 
 private:
   Value base;
diff --git a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td
index f326ae5578650..2113dfeb4c089 100644
--- a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td
+++ b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td
@@ -1504,6 +1504,15 @@ def DynamicTensorFromElementsOp : Std_Op<"dynamic_tensor_from_elements",
   let arguments = (ins Variadic<Index>:$dynamicExtents);
   let results = (outs AnyRankedTensor:$result);
   let regions = (region SizedRegion<1>:$body);
+
+  let builders = [
+    // Build op and populate its body per callback function.
+    OpBuilder<"OpBuilder &b, OperationState &result, Type resultTy, "
+              "ValueRange dynamicExtents, "
+              "function_ref<void(OpBuilder &, Location, ValueRange)>">,
+  ];
+
+  let hasCanonicalizer = 1;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1604,8 +1613,13 @@ def ExtractElementOp : Std_Op<"extract_element",
 // TensorFromElementsOp
 //===----------------------------------------------------------------------===//
 
-def TensorFromElementsOp : Std_Op<"tensor_from_elements",
-    [NoSideEffect, SameOperandsAndResultElementType]> {
+def TensorFromElementsOp : Std_Op<"tensor_from_elements", [
+    NoSideEffect,
+    TypesMatchWith<"operand types match result element type",
+                   "result", "elements", "SmallVector<Type, 2>("
+                   "$_self.cast<ShapedType>().getDimSize(0), "
+                   "$_self.cast<ShapedType>().getElementType())">
+  ]> {
   string summary = "tensor from elements operation.";
   string description = [{
     Create a 1D tensor from a range of same-type arguments.
@@ -1618,17 +1632,20 @@ def TensorFromElementsOp : Std_Op<"tensor_from_elements",
   }];
 
   let arguments = (ins Variadic<AnyType>:$elements);
-  let results = (outs AnyTensor:$result);
+  let results = (outs 1DTensorOf<[AnyType]>:$result);
+
+  let assemblyFormat = "$elements attr-dict `:` type($result)";
+
+  // This op is fully verified by its traits.
+  let verifier = ?;
 
   let skipDefaultBuilders = 1;
-  let builders = [OpBuilder<
-    "OpBuilder &builder, OperationState &result, ValueRange elements", [{
-      assert(!elements.empty() && "expected at least one element");
-      result.addOperands(elements);
-      result.addTypes(
-          RankedTensorType::get({static_cast<int64_t>(elements.size())},
-                                *elements.getTypes().begin()));
-    }]>];
+  let builders = [
+    OpBuilder<"OpBuilder &b, OperationState &result, Type elementType,"
+              "ValueRange elements">,
+    // Special case builder for when `elements` has size >=1.
+    OpBuilder<"OpBuilder &b, OperationState &result, ValueRange elements">
+  ];
 
   let hasCanonicalizer = 1;
 }
@@ -2428,10 +2445,10 @@ def SignExtendIOp : Std_Op<"sexti",
 
 def SIToFPOp : CastOp<"sitofp">, Arguments<(ins AnyType:$in)> {
   let summary = "cast from integer type to floating-point";
   let description = [{
-    Cast from a value interpreted as signed integer to the corresponding
-    floating-point value. If the value cannot be exactly represented, it is
-    rounded using the default rounding mode. Only scalars are currently
-    supported.
+    Cast from a value interpreted as signed or vector of signed integers to the
+    corresponding floating-point scalar or vector value. If the value cannot be
+    exactly represented, it is rounded using the default rounding mode. Scalars
+    and vector types are currently supported.
   }];
 
   let extraClassDeclaration = [{
@@ -2980,6 +2997,8 @@ def TensorCastOp : CastOp<"tensor_cast"> {
     /// The result of a tensor_cast is always a tensor.
     TensorType getType() { return getResult().getType().cast<TensorType>(); }
   }];
+
+  let hasCanonicalizer = 1;
 }
 
 //===----------------------------------------------------------------------===//
@@ -3109,10 +3128,10 @@ def TruncateIOp : Std_Op<"trunci", [NoSideEffect, SameOperandsAndResultShape]> {
 
 def UIToFPOp : CastOp<"uitofp">, Arguments<(ins AnyType:$in)> {
   let summary = "cast from unsigned integer type to floating-point";
   let description = [{
-    Cast from a value interpreted as unsigned integer to the corresponding
-    floating-point value. If the value cannot be exactly represented, it is
-    rounded using the default rounding mode. Only scalars are currently
-    supported.
+    Cast from a value interpreted as unsigned integer or vector of unsigned
+    integers to the corresponding scalar or vector floating-point value. If the
+    value cannot be exactly represented, it is rounded using the default
+    rounding mode. Scalars and vector types are currently supported.
   }];
 
   let extraClassDeclaration = [{
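[Editor's note] A small usage sketch of the reworked `tensor_from_elements` builders above. The helper name is an assumption for illustration; the point is that passing `elementType` explicitly makes the empty `elements` case, which the removed inline builder asserted on, expressible:

```c++
#include "mlir/Dialect/StandardOps/IR/Ops.h"

using namespace mlir;

// Builds a 1-D tensor from `elements`; works even when `elements` is empty
// because the element type no longer has to be inferred from the first value.
static Value buildFromElements(OpBuilder &b, Location loc, Type elementType,
                               ValueRange elements) {
  return b.create<TensorFromElementsOp>(loc, elementType, elements);
}
```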
diff --git a/mlir/include/mlir/Dialect/Vector/VectorOps.h b/mlir/include/mlir/Dialect/Vector/VectorOps.h
index 562e07f98774d..2354cc6abd890 100644
--- a/mlir/include/mlir/Dialect/Vector/VectorOps.h
+++ b/mlir/include/mlir/Dialect/Vector/VectorOps.h
@@ -128,13 +128,11 @@ namespace impl {
 AffineMap getTransferMinorIdentityMap(MemRefType memRefType,
                                       VectorType vectorType);
 } // namespace impl
+} // end namespace vector
+} // end namespace mlir
 
 #define GET_OP_CLASSES
 #include "mlir/Dialect/Vector/VectorOps.h.inc"
-
 #include "mlir/Dialect/Vector/VectorOpsDialect.h.inc"
 
-} // end namespace vector
-} // end namespace mlir
-
 #endif // MLIR_DIALECT_VECTOR_VECTOROPS_H
diff --git a/mlir/include/mlir/Dialect/Vector/VectorOps.td b/mlir/include/mlir/Dialect/Vector/VectorOps.td
index dceb850ad929c..04aa18cfd6482 100644
--- a/mlir/include/mlir/Dialect/Vector/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/VectorOps.td
@@ -19,7 +19,7 @@ include "mlir/Interfaces/VectorInterfaces.td"
 
 def Vector_Dialect : Dialect {
   let name = "vector";
-  let cppNamespace = "vector";
+  let cppNamespace = "::mlir::vector";
 
   let hasConstantMaterializer = 1;
 }
@@ -270,6 +270,7 @@ def Vector_BroadcastOp :
     }
   }];
   let assemblyFormat = "$source attr-dict `:` type($source) `to` type($vector)";
+  let hasFolder = 1;
 }
 
 def Vector_ShuffleOp :
diff --git a/mlir/include/mlir/IR/AttributeSupport.h b/mlir/include/mlir/IR/AttributeSupport.h
index 35084a20493f5..c0e3a0bb9b26e 100644
--- a/mlir/include/mlir/IR/AttributeSupport.h
+++ b/mlir/include/mlir/IR/AttributeSupport.h
@@ -16,6 +16,7 @@
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/StorageUniquerSupport.h"
 #include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/Twine.h"
 
 namespace mlir {
 class MLIRContext;
@@ -142,6 +143,14 @@ class AttributeUniquer {
   static typename std::enable_if_t<
       !std::is_same<typename T::ImplType, AttributeStorage>::value, T>
   get(MLIRContext *ctx, Args &&...args) {
+#ifndef NDEBUG
+    if (!ctx->getAttributeUniquer().isParametricStorageInitialized(
+            T::getTypeID()))
+      llvm::report_fatal_error(llvm::Twine("can't create Attribute '") +
+                               llvm::getTypeName<T>() +
+                               "' because storage uniquer isn't initialized: "
+                               "the dialect was likely not loaded.");
+#endif
     return ctx->getAttributeUniquer().get<typename T::ImplType>(
         [ctx](AttributeStorage *storage) {
           initializeAttributeStorage(storage, ctx, T::getTypeID());
@@ -153,6 +162,14 @@ class AttributeUniquer {
   static typename std::enable_if_t<
       std::is_same<typename T::ImplType, AttributeStorage>::value, T>
   get(MLIRContext *ctx) {
+#ifndef NDEBUG
+    if (!ctx->getAttributeUniquer().isSingletonStorageInitialized(
+            T::getTypeID()))
+      llvm::report_fatal_error(llvm::Twine("can't create Attribute '") +
+                               llvm::getTypeName<T>() +
+                               "' because storage uniquer isn't initialized: "
+                               "the dialect was likely not loaded.");
+#endif
     return ctx->getAttributeUniquer().get<T>(T::getTypeID());
   }
diff --git a/mlir/include/mlir/IR/BlockSupport.h b/mlir/include/mlir/IR/BlockSupport.h
index f3dd6140420e4..fc16effbba70d 100644
--- a/mlir/include/mlir/IR/BlockSupport.h
+++ b/mlir/include/mlir/IR/BlockSupport.h
@@ -75,6 +75,47 @@ class SuccessorRange final
   friend RangeBaseT;
 };
 
+//===----------------------------------------------------------------------===//
+// BlockRange
+//===----------------------------------------------------------------------===//
+
+/// This class provides an abstraction over the different types of ranges over
+/// Blocks. In many cases, this prevents the need to explicitly materialize a
+/// SmallVector/std::vector.
+/// This class should be used in places that are not
+/// suitable for a more derived type (e.g. ArrayRef) or a template range
+/// parameter.
+class BlockRange final
+    : public llvm::detail::indexed_accessor_range_base<
+          BlockRange, llvm::PointerUnion<BlockOperand *, Block *const *>,
+          Block *, Block *, Block *> {
+public:
+  using RangeBaseT::RangeBaseT;
+  BlockRange(ArrayRef<Block *> blocks = llvm::None);
+  BlockRange(SuccessorRange successors);
+  template <typename Arg,
+            typename = typename std::enable_if_t<
+                std::is_constructible<ArrayRef<Block *>, Arg>::value>>
+  BlockRange(Arg &&arg)
+      : BlockRange(ArrayRef<Block *>(std::forward<Arg>(arg))) {}
+  BlockRange(std::initializer_list<Block *> blocks)
+      : BlockRange(ArrayRef<Block *>(blocks)) {}
+
+private:
+  /// The owner of the range is either:
+  /// * A pointer to the first element of an array of block operands.
+  /// * A pointer to the first element of an array of Block *.
+  using OwnerT = llvm::PointerUnion<BlockOperand *, Block *const *>;
+
+  /// See `llvm::detail::indexed_accessor_range_base` for details.
+  static OwnerT offset_base(OwnerT object, ptrdiff_t index);
+
+  /// See `llvm::detail::indexed_accessor_range_base` for details.
+  static Block *dereference_iterator(OwnerT object, ptrdiff_t index);
+
+  /// Allow access to `offset_base` and `dereference_iterator`.
+  friend RangeBaseT;
+};
+
 //===----------------------------------------------------------------------===//
 // Operation Iterators
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/IR/Builders.h b/mlir/include/mlir/IR/Builders.h
index 0c30869752ea3..ccf11489add07 100644
--- a/mlir/include/mlir/IR/Builders.h
+++ b/mlir/include/mlir/IR/Builders.h
@@ -333,7 +333,7 @@ class OpBuilder : public Builder {
   /// defining operation. This will cause subsequent insertions to go right
   /// after it. Otherwise, value is a BlockArgument. Sets the insertion point
   /// to the start of its block.
-  void setInsertionPointAfter(Value val) {
+  void setInsertionPointAfterValue(Value val) {
     if (Operation *op = val.getDefiningOp()) {
       setInsertionPointAfter(op);
     } else {
diff --git a/mlir/include/mlir/IR/OpBase.td b/mlir/include/mlir/IR/OpBase.td
index b0f08e93666a3..ec0e229ae627d 100644
--- a/mlir/include/mlir/IR/OpBase.td
+++ b/mlir/include/mlir/IR/OpBase.td
@@ -1443,7 +1443,7 @@ class StructFieldAttr<string thisName, Attr thisType> {
 // Structured attribute that wraps a DictionaryAttr and provides both a
 // validation method and set of accessors for a fixed set of fields. This is
 // useful when representing data that would normally be in a structure.
 class StructAttr<string name, Dialect dialect, list<StructFieldAttr> attributes>
-    : DictionaryAttrBase<CPred<"$_self.isa<DictionaryAttr>()">,
+    : DictionaryAttrBase<CPred<"$_self.isa<::mlir::DictionaryAttr>()">,
                          "DictionaryAttr with field(s): " #
@@ -1459,7 +1459,7 @@ class StructAttr
   list<StructFieldAttr> fields = attributes;
@@ -1672,7 +1672,7 @@ class OpTrait;
 // purpose to wrap around C++ symbol string with this class is to make
 // traits specified for ops in TableGen less alien and more integrated.
 class NativeOpTrait<string prop> : OpTrait {
-  string trait = "OpTrait::" # prop;
+  string trait = "::mlir::OpTrait::" # prop;
 }
 
 // ParamNativeOpTrait corresponds to the template-parameterized traits in the
@@ -1687,7 +1687,7 @@ class ParamNativeOpTrait<string prop, string params>
 // affects op definition generator internals, like how op builders and
 // operand/attribute/result getters are generated.
 class GenInternalOpTrait<string prop> : OpTrait {
-  string trait = "OpTrait::" # prop;
+  string trait = "::mlir::OpTrait::" # prop;
 }
 
 // PredOpTrait is an op trait implemented by way of a predicate on the op.
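A quick sketch of how the new BlockRange from BlockSupport.h above is meant to be consumed. This is an assumed illustration rather than code from the patch; `recordSuccessors` and `demo` are hypothetical, but the constructors exercised are exactly the ones declared above.

    // One BlockRange-taking API accepts successor lists from several sources
    // without materializing an intermediate SmallVector.
    static void recordSuccessors(mlir::BlockRange successors) {
      for (mlir::Block *block : successors)
        (void)block; // e.g. verify or collect each successor
    }

    static void demo(mlir::Block *a, mlir::Block *b, mlir::Operation *op) {
      llvm::SmallVector<mlir::Block *, 2> vec = {a, b};
      recordSuccessors(vec);                 // from a SmallVector
      recordSuccessors({a, b});              // from an initializer list
      recordSuccessors(op->getSuccessors()); // from a SuccessorRange
    }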
diff --git a/mlir/include/mlir/IR/Operation.h b/mlir/include/mlir/IR/Operation.h
index 5f5e9017ae512..6de7677dbf052 100644
--- a/mlir/include/mlir/IR/Operation.h
+++ b/mlir/include/mlir/IR/Operation.h
@@ -32,25 +32,25 @@ class Operation final
 public:
   /// Create a new Operation with the specific fields.
   static Operation *create(Location location, OperationName name,
-                           ArrayRef<Type> resultTypes, ArrayRef<Value> operands,
+                           TypeRange resultTypes, ValueRange operands,
                            ArrayRef<NamedAttribute> attributes,
-                           ArrayRef<Block *> successors, unsigned numRegions);
+                           BlockRange successors, unsigned numRegions);
 
   /// Overload of create that takes an existing MutableDictionaryAttr to avoid
   /// unnecessarily uniquing a list of attributes.
   static Operation *create(Location location, OperationName name,
-                           ArrayRef<Type> resultTypes, ArrayRef<Value> operands,
+                           TypeRange resultTypes, ValueRange operands,
                            MutableDictionaryAttr attributes,
-                           ArrayRef<Block *> successors, unsigned numRegions);
+                           BlockRange successors, unsigned numRegions);
 
   /// Create a new Operation from the fields stored in `state`.
   static Operation *create(const OperationState &state);
 
   /// Create a new Operation with the specific fields.
   static Operation *create(Location location, OperationName name,
-                           ArrayRef<Type> resultTypes, ArrayRef<Value> operands,
+                           TypeRange resultTypes, ValueRange operands,
                            MutableDictionaryAttr attributes,
-                           ArrayRef<Block *> successors = {},
+                           BlockRange successors = {},
                            RegionRange regions = {});
 
   /// The name of an operation is the key identifier for it.
@@ -633,7 +633,7 @@ class Operation final
   bool hasValidOrder() { return orderIndex != kInvalidOrderIdx; }
 
 private:
-  Operation(Location location, OperationName name, ArrayRef<Type> resultTypes,
+  Operation(Location location, OperationName name, TypeRange resultTypes,
             unsigned numSuccessors, unsigned numRegions,
             const MutableDictionaryAttr &attributes, bool hasOperandStorage);
diff --git a/mlir/include/mlir/IR/OperationSupport.h b/mlir/include/mlir/IR/OperationSupport.h
index 7fce4b808d2e4..11e85f20af445 100644
--- a/mlir/include/mlir/IR/OperationSupport.h
+++ b/mlir/include/mlir/IR/OperationSupport.h
@@ -29,6 +29,7 @@
 namespace mlir {
 class Block;
+class BlockRange;
 class Dialect;
 class Operation;
 struct OperationState;
@@ -42,7 +43,6 @@ class Pattern;
 class Region;
 class ResultRange;
 class RewritePattern;
-class SuccessorRange;
 class Type;
 class Value;
 class ValueRange;
@@ -394,12 +394,8 @@ struct OperationState {
     attributes.append(newAttributes);
   }
 
-  /// Add an array of successors.
-  void addSuccessors(ArrayRef<Block *> newSuccessors) {
-    successors.append(newSuccessors.begin(), newSuccessors.end());
-  }
   void addSuccessors(Block *successor) { successors.push_back(successor); }
-  void addSuccessors(SuccessorRange newSuccessors);
+  void addSuccessors(BlockRange newSuccessors);
 
   /// Create a region that should be attached to the operation. These regions
   /// can be filled in immediately without waiting for Operation to be
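With the Operation.h/OperationSupport.h changes above, successor lists no longer need to be copied into a temporary array before op creation. A minimal sketch under that assumption; the `setSuccessors` helper is hypothetical:

    // Both forms below resolve to the new addSuccessors(BlockRange) overload.
    static void setSuccessors(mlir::OperationState &state, mlir::Block *thenBB,
                              mlir::Block *elseBB, mlir::Operation *reference) {
      state.addSuccessors({thenBB, elseBB});              // initializer list
      // ...or forward another op's successors directly:
      // state.addSuccessors(reference->getSuccessors()); // SuccessorRange
    }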
diff --git a/mlir/include/mlir/IR/TypeSupport.h b/mlir/include/mlir/IR/TypeSupport.h
index ace5eaa733454..c1de589579154 100644
--- a/mlir/include/mlir/IR/TypeSupport.h
+++ b/mlir/include/mlir/IR/TypeSupport.h
@@ -15,6 +15,7 @@
 
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/StorageUniquerSupport.h"
+#include "llvm/ADT/Twine.h"
 
 namespace mlir {
 class Dialect;
@@ -126,6 +127,13 @@ struct TypeUniquer {
   static typename std::enable_if_t<
       !std::is_same<typename T::ImplType, TypeStorage>::value, T>
   get(MLIRContext *ctx, Args &&...args) {
+#ifndef NDEBUG
+    if (!ctx->getTypeUniquer().isParametricStorageInitialized(T::getTypeID()))
+      llvm::report_fatal_error(llvm::Twine("can't create type '") +
+                               llvm::getTypeName<T>() +
+                               "' because storage uniquer isn't initialized: "
+                               "the dialect was likely not loaded.");
+#endif
     return ctx->getTypeUniquer().get<typename T::ImplType>(
         [&](TypeStorage *storage) {
           storage->initialize(AbstractType::lookup(T::getTypeID(), ctx));
@@ -137,6 +145,13 @@ struct TypeUniquer {
   static typename std::enable_if_t<
       std::is_same<typename T::ImplType, TypeStorage>::value, T>
   get(MLIRContext *ctx) {
+#ifndef NDEBUG
+    if (!ctx->getTypeUniquer().isSingletonStorageInitialized(T::getTypeID()))
+      llvm::report_fatal_error(llvm::Twine("can't create type '") +
+                               llvm::getTypeName<T>() +
+                               "' because storage uniquer isn't initialized: "
+                               "the dialect was likely not loaded.");
+#endif
     return ctx->getTypeUniquer().get<T>(T::getTypeID());
   }
diff --git a/mlir/include/mlir/Interfaces/SideEffectInterfaces.td b/mlir/include/mlir/Interfaces/SideEffectInterfaces.td
index 1ee623b613659..0f189fa8164ba 100644
--- a/mlir/include/mlir/Interfaces/SideEffectInterfaces.td
+++ b/mlir/include/mlir/Interfaces/SideEffectInterfaces.td
@@ -51,7 +51,7 @@ class EffectOpInterfaceBase<string name, string baseEffect>
       Collects all of the operation's effects into `effects`.
     }],
     "void", "getEffects",
-    (ins "SmallVectorImpl<SideEffects::EffectInstance<"
+    (ins "SmallVectorImpl<::mlir::SideEffects::EffectInstance<"
          # baseEffect # ">> &":$effects)
   >,
   InterfaceMethod<[{
@@ -59,7 +59,7 @@ class EffectOpInterfaceBase<string name, string baseEffect>
     }],
     "void", "getEffectsOnValue",
     (ins "Value":$value,
-         "SmallVectorImpl<SideEffects::EffectInstance<"
+         "SmallVectorImpl<::mlir::SideEffects::EffectInstance<"
          # baseEffect # ">> &":$effects), [{
       $_op.getEffects(effects);
       llvm::erase_if(effects, [&](auto &it) {
@@ -73,7 +73,7 @@ class EffectOpInterfaceBase<string name, string baseEffect>
     }],
     "void", "getEffectsOnResource",
     (ins "SideEffects::Resource *":$resource,
-         "SmallVectorImpl<SideEffects::EffectInstance<"
+         "SmallVectorImpl<::mlir::SideEffects::EffectInstance<"
          # baseEffect # ">> &":$effects), [{
       $_op.getEffects(effects);
       llvm::erase_if(effects, [&](auto &it) {
@@ -87,7 +87,7 @@ class EffectOpInterfaceBase<string name, string baseEffect>
     /// Collect all of the effect instances that correspond to the given
     /// `Effect` and place them in 'effects'.
     template <typename Effect> void getEffects(
-        SmallVectorImpl<SideEffects::EffectInstance<}] # baseEffect # [{>> &effects) {
+        SmallVectorImpl<::mlir::SideEffects::EffectInstance<}] # baseEffect # [{>> &effects) {
       getEffects(effects);
       llvm::erase_if(effects, [&](auto &it) {
@@ -115,7 +115,7 @@ class EffectOpInterfaceBase<string name, string baseEffect>
     /// Returns true if this operation has no effects.
     bool hasNoEffect() {
-      SmallVector<SideEffects::EffectInstance<}] # baseEffect # [{>, 4> effects;
+      SmallVector<::mlir::SideEffects::EffectInstance<}] # baseEffect # [{>, 4> effects;
       getEffects(effects);
       return effects.empty();
     }
@@ -124,7 +124,7 @@ class EffectOpInterfaceBase<string name, string baseEffect>
     static bool hasNoEffect(Operation *op) {
       if (auto interface = dyn_cast<}] # name # [{>(op))
        return interface.hasNoEffect();
-      return op->hasTrait<OpTrait::HasRecursiveSideEffects>();
+      return op->hasTrait<::mlir::OpTrait::HasRecursiveSideEffects>();
     }
   }];
@@ -178,7 +178,7 @@ class SideEffectsTraitBase
 def MemoryEffectsOpInterface
     : EffectOpInterfaceBase<"MemoryEffectsOpInterface",
-                            "MemoryEffects::Effect"> {
+                            "::mlir::MemoryEffects::Effect"> {
   let description = [{
     An interface used to query information about the memory effects applied by
    an operation.
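The NDEBUG-only checks added to TypeSupport.h and AttributeSupport.h above turn a previously obscure failure mode into a clear diagnostic. A sketch of the situation they catch; `MyDialect` and `MyType` are hypothetical stand-ins for a real dialect and one of its types:

    void buildType(mlir::MLIRContext &context) {
      // Without this load, a debug build now aborts with:
      //   can't create type '...MyType' because storage uniquer isn't
      //   initialized: the dialect was likely not loaded.
      context.loadDialect<MyDialect>();     // hypothetical dialect
      mlir::Type t = MyType::get(&context); // hypothetical type
      (void)t;
    }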
diff --git a/mlir/include/mlir/Support/StorageUniquer.h b/mlir/include/mlir/Support/StorageUniquer.h
index eb04688be1902..d0a6170805bfd 100644
--- a/mlir/include/mlir/Support/StorageUniquer.h
+++ b/mlir/include/mlir/Support/StorageUniquer.h
@@ -210,6 +210,16 @@ class StorageUniquer {
     return get(TypeID::get());
   }
 
+  /// Test if there is a singleton storage uniquer initialized for the
+  /// provided TypeID. This is only useful for debugging/diagnostic purposes:
+  /// the uniquer is initialized when a dialect is loaded.
+  bool isSingletonStorageInitialized(TypeID id);
+
+  /// Test if there is a parametric storage uniquer initialized for the
+  /// provided TypeID. This is only useful for debugging/diagnostic purposes:
+  /// the uniquer is initialized when a dialect is loaded.
+  bool isParametricStorageInitialized(TypeID id);
+
   /// Changes the mutable component of 'storage' by forwarding the trailing
   /// arguments to the 'mutate' function of the derived class.
   template
diff --git a/mlir/include/mlir/TableGen/Dialect.h b/mlir/include/mlir/TableGen/Dialect.h
index 623d614d26d38..ee86a2504b3c9 100644
--- a/mlir/include/mlir/TableGen/Dialect.h
+++ b/mlir/include/mlir/TableGen/Dialect.h
@@ -67,11 +67,13 @@ class Dialect {
   // underlying record.
   bool operator==(const Dialect &other) const;
 
+  bool operator!=(const Dialect &other) const { return !(*this == other); }
+
   // Compares two dialects by comparing the names of the dialects.
   bool operator<(const Dialect &other) const;
 
   // Returns whether the dialect is defined.
-  operator bool() const { return def != nullptr; }
+  explicit operator bool() const { return def != nullptr; }
 
 private:
   const llvm::Record *def;
diff --git a/mlir/include/mlir/TableGen/OpClass.h b/mlir/include/mlir/TableGen/OpClass.h
index 1ac5b1692625f..a82b9dd879769 100644
--- a/mlir/include/mlir/TableGen/OpClass.h
+++ b/mlir/include/mlir/TableGen/OpClass.h
@@ -24,35 +24,190 @@
 #define MLIR_TABLEGEN_OPCLASS_H_
 
 #include "mlir/Support/LLVM.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSet.h"
+#include "llvm/Support/raw_ostream.h"
+#include <set>
 #include <string>
 
 namespace mlir {
 namespace tblgen {
 class FmtObjectBase;
 
+// Class for holding a single parameter of an op's method for C++ code
+// emission.
+class OpMethodParameter {
+public:
+  // Properties (qualifiers) for the parameter.
+  enum Property {
+    PP_None = 0x0,
+    PP_Optional = 0x1,
+  };
+
+  OpMethodParameter(StringRef type, StringRef name, StringRef defaultValue = "",
+                    Property properties = PP_None)
+      : type(type), name(name), defaultValue(defaultValue),
+        properties(properties) {}
+
+  OpMethodParameter(StringRef type, StringRef name, Property property)
+      : OpMethodParameter(type, name, "", property) {}
+
+  // Writes the parameter as a part of a method declaration to `os`.
+  void writeDeclTo(raw_ostream &os) const { writeTo(os, /*emitDefault=*/true); }
+
+  // Writes the parameter as a part of a method definition to `os`.
+  void writeDefTo(raw_ostream &os) const { writeTo(os, /*emitDefault=*/false); }
+
+  const std::string &getType() const { return type; }
+  bool hasDefaultValue() const { return !defaultValue.empty(); }
+
+private:
+  void writeTo(raw_ostream &os, bool emitDefault) const;
+
+  std::string type;
+  std::string name;
+  std::string defaultValue;
+  Property properties;
+};
+
+// Base class for holding parameters of an op's method for C++ code emission.
+class OpMethodParameters {
+public:
+  // Discriminator for LLVM-style RTTI.
+  enum ParamsKind {
+    // The separate type and name of each parameter are not known.
+    PK_Unresolved,
+    // Each parameter is resolved to a type and name.
+    PK_Resolved,
+  };
+
+  OpMethodParameters(ParamsKind kind) : kind(kind) {}
+  virtual ~OpMethodParameters() {}
+
+  // LLVM-style RTTI support.
+  ParamsKind getKind() const { return kind; }
+
+  // Writes the parameters as a part of a method declaration to `os`.
+  virtual void writeDeclTo(raw_ostream &os) const = 0;
+
+  // Writes the parameters as a part of a method definition to `os`.
+  virtual void writeDefTo(raw_ostream &os) const = 0;
+
+  // Factory methods to create the correct type of `OpMethodParameters`
+  // object based on the arguments.
+  static std::unique_ptr<OpMethodParameters> create();
+
+  static std::unique_ptr<OpMethodParameters> create(StringRef params);
+
+  static std::unique_ptr<OpMethodParameters>
+  create(llvm::SmallVectorImpl<OpMethodParameter> &&params);
+
+  static std::unique_ptr<OpMethodParameters>
+  create(StringRef type, StringRef name, StringRef defaultValue = "");
+
+private:
+  const ParamsKind kind;
+};
+
+// Class for holding unresolved parameters.
+class OpMethodUnresolvedParameters : public OpMethodParameters {
+public:
+  OpMethodUnresolvedParameters(StringRef params)
+      : OpMethodParameters(PK_Unresolved), parameters(params) {}
+
+  // Writes the parameters as a part of a method declaration to the given `os`.
+  void writeDeclTo(raw_ostream &os) const override;
+
+  // Writes the parameters as a part of a method definition to the given `os`.
+  void writeDefTo(raw_ostream &os) const override;
+
+  // LLVM-style RTTI support.
+  static bool classof(const OpMethodParameters *params) {
+    return params->getKind() == PK_Unresolved;
+  }
+
+private:
+  std::string parameters;
+};
+
+// Class for holding resolved parameters.
+class OpMethodResolvedParameters : public OpMethodParameters {
+public:
+  OpMethodResolvedParameters() : OpMethodParameters(PK_Resolved) {}
+
+  OpMethodResolvedParameters(llvm::SmallVectorImpl<OpMethodParameter> &&params)
+      : OpMethodParameters(PK_Resolved) {
+    for (OpMethodParameter &param : params)
+      parameters.emplace_back(std::move(param));
+  }
+
+  OpMethodResolvedParameters(StringRef type, StringRef name,
+                             StringRef defaultValue)
+      : OpMethodParameters(PK_Resolved) {
+    parameters.emplace_back(type, name, defaultValue);
+  }
+
+  // Returns the number of parameters.
+  size_t getNumParameters() const { return parameters.size(); }
+
+  // Returns if this method makes the `other` method redundant. Note that this
+  // is more than just finding conflicting methods. This method determines if
+  // the two sets of parameters are conflicting and, if so, returns true if
+  // this method has a more general set of parameters that can replace all
+  // possible calls to the `other` method.
+  bool makesRedundant(const OpMethodResolvedParameters &other) const;
+
+  // Writes the parameters as a part of a method declaration to the given `os`.
+  void writeDeclTo(raw_ostream &os) const override;
+
+  // Writes the parameters as a part of a method definition to the given `os`.
+  void writeDefTo(raw_ostream &os) const override;
+
+  // LLVM-style RTTI support.
+  static bool classof(const OpMethodParameters *params) {
+    return params->getKind() == PK_Resolved;
+  }
+
+private:
+  llvm::SmallVector<OpMethodParameter, 4> parameters;
+};
+
 // Class for holding the signature of an op's method for C++ code emission
 class OpMethodSignature {
 public:
-  OpMethodSignature(StringRef retType, StringRef name, StringRef params);
+  template <typename... Args>
+  OpMethodSignature(StringRef retType, StringRef name, Args &&...args)
+      : returnType(retType), methodName(name),
+        parameters(OpMethodParameters::create(std::forward<Args>(args)...)) {}
+  OpMethodSignature(OpMethodSignature &&) = default;
+
+  // Returns if a method with this signature makes a method with `other`
+  // signature redundant. Only supports resolved parameters.
+  bool makesRedundant(const OpMethodSignature &other) const;
+
+  // Returns the number of parameters (for resolved parameters).
+  size_t getNumParameters() const {
+    return cast<OpMethodResolvedParameters>(parameters.get())
+        ->getNumParameters();
+  }
+
+  // Returns the name of the method.
+  StringRef getName() const { return methodName; }
 
   // Writes the signature as a method declaration to the given `os`.
   void writeDeclTo(raw_ostream &os) const;
+
   // Writes the signature as the start of a method definition to the given
   // `os`. `namePrefix` is the prefix to be prepended to the method name
   // (typically namespaces for qualifying the method definition).
   void writeDefTo(raw_ostream &os, StringRef namePrefix) const;
 
 private:
-  // Returns true if the given C++ `type` ends with '&' or '*', or is empty.
-  static bool elideSpaceAfterType(StringRef type);
-
   std::string returnType;
   std::string methodName;
-  std::string parameters;
+  std::unique_ptr<OpMethodParameters> parameters;
 };
 
 // Class for holding the body of an op's method for C++ code emission
@@ -79,13 +234,22 @@ class OpMethod {
   // querying properties.
   enum Property {
     MP_None = 0x0,
-    MP_Static = 0x1,      // Static method
-    MP_Constructor = 0x2, // Constructor
-    MP_Private = 0x4,     // Private method
+    MP_Static = 0x1,
+    MP_Constructor = 0x2,
+    MP_Private = 0x4,
+    MP_Declaration = 0x8,
+    MP_StaticDeclaration = MP_Static | MP_Declaration,
   };
 
-  OpMethod(StringRef retType, StringRef name, StringRef params,
-           Property property, bool declOnly);
+  template <typename... Args>
+  OpMethod(StringRef retType, StringRef name, Property property, unsigned id,
+           Args &&...args)
+      : properties(property),
+        methodSignature(retType, name, std::forward<Args>(args)...),
+        methodBody(properties & MP_Declaration), id(id) {}
+
+  OpMethod(OpMethod &&) = default;
+  virtual ~OpMethod() = default;
 
   OpMethodBody &body() { return methodBody; }
 
@@ -96,8 +260,20 @@ class OpMethod {
   // Returns true if this is a private method.
   bool isPrivate() const { return properties & MP_Private; }
 
+  // Returns the name of this method.
+  StringRef getName() const { return methodSignature.getName(); }
+
+  // Returns the ID for this method.
+  unsigned getID() const { return id; }
+
+  // Returns if this method makes the `other` method redundant.
+  bool makesRedundant(const OpMethod &other) const {
+    return methodSignature.makesRedundant(other.methodSignature);
+  }
+
   // Writes the method as a declaration to the given `os`.
   virtual void writeDeclTo(raw_ostream &os) const;
+
   // Writes the method as a definition to the given `os`. `namePrefix` is the
   // prefix to be prepended to the method name (typically namespaces for
   // qualifying the method definition).
@@ -105,18 +281,18 @@ class OpMethod {
 
 protected:
   Property properties;
-  // Whether this method only contains a declaration.
-  bool isDeclOnly;
   OpMethodSignature methodSignature;
   OpMethodBody methodBody;
+  const unsigned id;
 };
 
 // Class for holding an op's constructor method for C++ code emission.
 class OpConstructor : public OpMethod {
 public:
-  OpConstructor(StringRef retType, StringRef name, StringRef params,
-                Property property, bool declOnly)
-      : OpMethod(retType, name, params, property, declOnly){};
+  template <typename... Args>
+  OpConstructor(StringRef className, Property property, unsigned id,
+                Args &&...args)
+      : OpMethod("", className, property, id, std::forward<Args>(args)...) {}
 
   // Add member initializer to constructor initializing `name` with `value`.
   void addMemberInitializer(StringRef name, StringRef value);
@@ -137,12 +313,33 @@ class Class {
 public:
   explicit Class(StringRef name);
 
-  // Creates a new method in this class.
-  OpMethod &newMethod(StringRef retType, StringRef name, StringRef params = "",
-                      OpMethod::Property = OpMethod::MP_None,
-                      bool declOnly = false);
-
-  OpConstructor &newConstructor(StringRef params = "", bool declOnly = false);
+  // Adds a new method to this class and prunes redundant methods. Returns
+  // null if the method was not added (because an existing method would make
+  // it redundant), else returns a pointer to the added method. Note that this
+  // call may also delete existing methods that are made redundant by adding
+  // this method to the class.
+  template <typename... Args>
+  OpMethod *addMethodAndPrune(StringRef retType, StringRef name,
+                              OpMethod::Property properties, Args &&...args) {
+    auto newMethod = std::make_unique<OpMethod>(
+        retType, name, properties, nextMethodID++, std::forward<Args>(args)...);
+    return addMethodAndPrune(methods, std::move(newMethod));
+  }
+
+  template <typename... Args>
+  OpMethod *addMethodAndPrune(StringRef retType, StringRef name,
+                              Args &&...args) {
+    return addMethodAndPrune(retType, name, OpMethod::MP_None,
+                             std::forward<Args>(args)...);
+  }
+
+  template <typename... Args>
+  OpConstructor *addConstructorAndPrune(Args &&...args) {
+    auto newConstructor = std::make_unique<OpConstructor>(
+        getClassName(), OpMethod::MP_Constructor, nextMethodID++,
+        std::forward<Args>(args)...);
+    return addMethodAndPrune(constructors, std::move(newConstructor));
+  }
 
   // Creates a new field in this class.
   void newField(StringRef type, StringRef name, StringRef defaultValue = "");
@@ -156,9 +353,63 @@ class Class {
   StringRef getClassName() const { return className; }
 
 protected:
+  // Get a list of all the methods to emit, filtering out hidden ones.
+  void forAllMethods(llvm::function_ref<void(const OpMethod &)> func) const {
+    using ConsRef = const std::unique_ptr<OpConstructor> &;
+    using MethodRef = const std::unique_ptr<OpMethod> &;
+    llvm::for_each(constructors, [&](ConsRef ptr) { func(*ptr); });
+    llvm::for_each(methods, [&](MethodRef ptr) { func(*ptr); });
+  }
+
+  // For deterministic code generation, keep methods sorted in the order in
+  // which they were generated.
+  template <typename MethodTy>
+  struct MethodCompare {
+    bool operator()(const std::unique_ptr<MethodTy> &x,
+                    const std::unique_ptr<MethodTy> &y) const {
+      return x->getID() < y->getID();
+    }
+  };
+
+  template <typename MethodTy>
+  using MethodSet =
+      std::set<std::unique_ptr<MethodTy>, MethodCompare<MethodTy>>;
+
+  template <typename MethodTy>
+  MethodTy *addMethodAndPrune(MethodSet<MethodTy> &set,
+                              std::unique_ptr<MethodTy> &&newMethod) {
+    // Check if the new method will be made redundant by existing methods.
+    for (auto &method : set)
+      if (method->makesRedundant(*newMethod))
+        return nullptr;
+
+    // We can add this method to the set. Prune any existing methods that will
+    // be made redundant by adding this new method. Note that the redundant
+    // check between two methods is more than a conflict check. makesRedundant()
+    // below will check if the new method conflicts with an existing method
+    // and, if so, returns true if the new method makes the existing method
+    // redundant because all calls to the existing method can be subsumed by
+    // the new method. So makesRedundant() does a combined job of finding
+    // conflicts and deciding which of the two conflicting methods survives.
+    //
+    // Note: llvm::erase_if does not work with sets of std::unique_ptr, so
+    // doing it manually here.
+    for (auto it = set.begin(), end = set.end(); it != end;) {
+      if (newMethod->makesRedundant(*(it->get())))
+        it = set.erase(it);
+      else
+        ++it;
+    }
+
+    MethodTy *ret = newMethod.get();
+    set.insert(std::move(newMethod));
+    return ret;
+  }
+
   std::string className;
-  SmallVector<OpConstructor, 2> constructors;
-  SmallVector<OpMethod, 8> methods;
+  MethodSet<OpConstructor> constructors;
+  MethodSet<OpMethod> methods;
+  unsigned nextMethodID = 0;
   SmallVector<std::string, 4> fields;
 };
diff --git a/mlir/include/mlir/TableGen/Operator.h b/mlir/include/mlir/TableGen/Operator.h
index d7fac87af0be2..34c5506503644 100644
--- a/mlir/include/mlir/TableGen/Operator.h
+++ b/mlir/include/mlir/TableGen/Operator.h
@@ -242,6 +242,17 @@ class Operator {
   // debugging purposes.
   void print(llvm::raw_ostream &os) const;
 
+  // A helper RAII class to emit nested namespaces for this op.
+  class NamespaceEmitter {
+  public:
+    NamespaceEmitter(raw_ostream &os, Operator &op);
+    ~NamespaceEmitter();
+
+  private:
+    raw_ostream &os;
+    SmallVector<StringRef, 2> namespaces;
+  };
+
   // Return whether all the result types are known.
   bool allResultTypesKnown() const { return allResultsHaveKnownTypes; };
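A short sketch of the intended use of the new Operator::NamespaceEmitter above. This is assumed usage inside an op-definition generator, not code from the patch; `emitOpForwardDecl` is hypothetical:

    // Opens the op dialect's nested C++ namespaces on construction and closes
    // them again on destruction.
    static void emitOpForwardDecl(llvm::raw_ostream &os,
                                  mlir::tblgen::Operator &op) {
      mlir::tblgen::Operator::NamespaceEmitter ns(os, op); // "namespace vector {"
      os << "class " << op.getCppClassName() << ";\n";
    } // `ns` is destroyed here and emits the matching "} // namespace ..." lines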
diff --git a/mlir/include/mlir/Transforms/BufferPlacement.h b/mlir/include/mlir/Transforms/BufferPlacement.h
index b3db7794fd971..8d3e476928b75 100644
--- a/mlir/include/mlir/Transforms/BufferPlacement.h
+++ b/mlir/include/mlir/Transforms/BufferPlacement.h
@@ -24,34 +24,6 @@
 namespace mlir {
 
-/// Prepares a buffer placement phase. It can place (user-defined) alloc
-/// nodes. This simplifies the integration of the actual buffer-placement
-/// pass. Sample usage:
-///   BufferAssignmentPlacer baHelper(regionOp);
-///   -> determine alloc positions
-///   auto allocPosition = baHelper.computeAllocPosition(value);
-///   -> place alloc
-///   allocBuilder.setInsertionPoint(positions.getAllocPosition());
-///
-/// Note: this class is intended to be used during legalization. In order
-/// to move alloc and dealloc nodes into the right places you can use the
-/// createBufferPlacementPass() function.
-class BufferAssignmentPlacer {
-public:
-  /// Creates a new assignment builder.
-  explicit BufferAssignmentPlacer(Operation *op);
-
-  /// Returns the operation this analysis was constructed from.
-  Operation *getOperation() const { return operation; }
-
-  /// Computes the actual position to place allocs for the given result.
-  OpBuilder::InsertPoint computeAllocPosition(OpResult result);
-
-private:
-  /// The operation this analysis was constructed from.
-  Operation *operation;
-};
-
 /// A helper type converter class for using inside Buffer Assignment operation
 /// conversion patterns. The default constructor keeps all the types intact
 /// except for the ranked-tensor types, which are converted to memref types.
@@ -157,31 +129,20 @@ class BufferAssignmentTypeConverter : public TypeConverter {
   SmallVector<DecomposeTypeConversionCallFn, 2> decomposeTypeConversions;
 };
 
-/// Helper conversion pattern that encapsulates a BufferAssignmentPlacer
-/// instance. Sample usage:
-///   class CustomConversionPattern : public
-///       BufferAssignmentOpConversionPattern<SourceOp>
-///   {
-///     ... matchAndRewrite(...) {
-///       -> Access stored BufferAssignmentPlacer
-///       bufferAssignment->computeAllocPosition(resultOp);
-///     }
-///   };
+/// Helper conversion pattern that encapsulates a BufferAssignmentTypeConverter
+/// instance.
 template <typename SourceOp>
 class BufferAssignmentOpConversionPattern
     : public OpConversionPattern<SourceOp> {
 public:
   explicit BufferAssignmentOpConversionPattern(
-      MLIRContext *context, BufferAssignmentPlacer *bufferAssignment = nullptr,
-      BufferAssignmentTypeConverter *converter = nullptr,
+      MLIRContext *context, BufferAssignmentTypeConverter *converter,
       PatternBenefit benefit = 1)
-      : OpConversionPattern<SourceOp>(context, benefit),
-        bufferAssignment(bufferAssignment), converter(converter) {
+      : OpConversionPattern<SourceOp>(context, benefit), converter(converter) {
     assert(converter && "The type converter has not been defined");
   }
 
 protected:
-  BufferAssignmentPlacer *bufferAssignment;
   BufferAssignmentTypeConverter *converter;
 };
 
@@ -197,7 +158,7 @@ class BufferAssignmentFuncOpConverter
   /// Performs the actual signature rewriting step.
   LogicalResult
   matchAndRewrite(mlir::FuncOp, ArrayRef<Value>,
-                  ConversionPatternRewriter &) const;
+                  ConversionPatternRewriter &) const override;
 };
 
 /// Rewrites the `ReturnOp` to conform with the changed function signature.
@@ -274,7 +235,7 @@ class BufferAssignmentCallOpConverter
   /// Performs the actual rewriting step.
   LogicalResult
   matchAndRewrite(CallOp, ArrayRef<Value>,
-                  ConversionPatternRewriter &) const;
+                  ConversionPatternRewriter &) const override;
 };
 
 /// Populates `patterns` with the conversion patterns of buffer
@@ -282,8 +243,7 @@ class BufferAssignmentCallOpConverter
 template <typename ReturnOpSourceTy, typename ReturnOpTargetTy,
           typename CopyOpTy>
 static void populateWithBufferAssignmentOpConversionPatterns(
-    MLIRContext *context, BufferAssignmentPlacer *placer,
-    BufferAssignmentTypeConverter *converter,
+    MLIRContext *context, BufferAssignmentTypeConverter *converter,
     OwningRewritePatternList *patterns) {
   // clang-format off
   patterns->insert<
     BufferAssignmentCallOpConverter,
     BufferAssignmentFuncOpConverter,
     BufferAssignmentReturnOpConverter
-  >(context, placer, converter);
+  >(context, converter);
   // clang-format on
 }
 } // end namespace mlir
diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h
index 5a0d46f5ba575..aaff786fbe2f7 100644
--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Transforms/LoopUtils.h
@@ -88,16 +88,28 @@ LLVM_NODISCARD
 LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef<uint64_t> shifts,
                                   bool unrollPrologueEpilogue = false);
 
+/// Identify valid and profitable bands of loops to tile. This is currently
+/// just a temporary placeholder to test the mechanics of tiled code
+/// generation. Returns all maximal outermost perfect loop nests to tile.
+void getTileableBands(FuncOp f,
+                      std::vector<SmallVector<AffineForOp, 6>> *bands);
+
 /// Tiles the specified band of perfectly nested loops creating tile-space loops
-/// and intra-tile loops. A band is a contiguous set of loops. `tiledNest` when
-/// non-null is set to the loops of the tiled nest from outermost to innermost.
-/// Loops in `input` are erased when the tiling is successful.
+/// and intra-tile loops. A band is a contiguous set of loops.
 LLVM_NODISCARD
 LogicalResult
 tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
                     ArrayRef<unsigned> tileSizes,
                     SmallVectorImpl<AffineForOp> *tiledNest = nullptr);
 
+/// Tiles the specified band of perfectly nested loops creating tile-space
+/// loops and intra-tile loops, using SSA values as tiling parameters. A band
+/// is a contiguous set of loops.
+LLVM_NODISCARD
+LogicalResult tilePerfectlyNestedParametric(
+    MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
+    SmallVectorImpl<AffineForOp> *tiledNest = nullptr);
+
 /// Performs loop interchange on 'forOpA' and 'forOpB'. Requires that 'forOpA'
 /// and 'forOpB' are part of a perfectly nested sequence of loops.
 void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB);
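The two LoopUtils.h additions above compose naturally: getTileableBands collects the bands, and tilePerfectlyNestedParametric tiles each one with SSA values as tile sizes, so tile shapes can be chosen at runtime. A sketch under those assumptions; the `tileAllBandsParametrically` helper is hypothetical:

    #include "mlir/Dialect/Affine/IR/AffineOps.h"
    #include "mlir/Transforms/LoopUtils.h"

    static void tileAllBandsParametrically(
        mlir::FuncOp func, llvm::ArrayRef<mlir::Value> tileSizes) {
      std::vector<llvm::SmallVector<mlir::AffineForOp, 6>> bands;
      mlir::getTileableBands(func, &bands);
      for (llvm::SmallVector<mlir::AffineForOp, 6> &band : bands) {
        llvm::SmallVector<mlir::AffineForOp, 6> tiledNest;
        if (mlir::failed(mlir::tilePerfectlyNestedParametric(band, tileSizes,
                                                             &tiledNest)))
          continue; // validity checks failed; leave this band untouched
      }
    }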
diff --git a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-call.mlir b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-call.mlir
new file mode 100644
index 0000000000000..8f3c6df79f904
--- /dev/null
+++ b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-call.mlir
@@ -0,0 +1,61 @@
+// RUN: mlir-opt %s -convert-linalg-to-loops -convert-linalg-to-llvm -convert-std-to-llvm | \
+// RUN: mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=4" -convert-linalg-to-loops \
+// RUN:   -convert-linalg-to-llvm -convert-std-to-llvm | \
+// RUN: mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+// RUN: mlir-opt %s -test-conv-vectorization -convert-linalg-to-llvm | \
+// RUN: mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=4" \
+// RUN:   -test-conv-vectorization -convert-linalg-to-llvm | \
+// RUN: mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+func @print_memref_f32(memref<*xf32>)
+
+// Creates and returns a 1-D buffer of size %s1 filled with the value %f
+func @alloc_1d_filled_f32(%s1 : index, %f : f32) -> memref<?xf32> {
+  %buf = alloc(%s1) : memref<?xf32>
+  linalg.fill(%buf, %f) : memref<?xf32>, f32
+  return %buf : memref<?xf32>
+}
+
+func @conv_1d(%arg0: memref<?xf32>, %arg1: memref<?xf32>, %arg2: memref<?xf32>) {
+  linalg.conv_1d %arg0, %arg1, %arg2 : (memref<?xf32>, memref<?xf32>, memref<?xf32>)
+  return
+}
+
+func @main() {
+  %c3 = constant 3 : index
+  %c6 = constant 6 : index
+  %c8 = constant 8 : index
+  %f10 = constant 10.00000e+00 : f32
+  %val = constant 2.00000e+00 : f32
+  %zero = constant 0.00000e+00 : f32
+
+  %filter1D = call @alloc_1d_filled_f32(%c3, %val) : (index, f32) -> (memref<?xf32>)
+  %in1D = call @alloc_1d_filled_f32(%c8, %val) : (index, f32) -> (memref<?xf32>)
+  %out1D = call @alloc_1d_filled_f32(%c6, %zero) : (index, f32) -> (memref<?xf32>)
+
+  store %f10, %in1D[%c3] : memref<?xf32>
+  call @conv_1d(%in1D, %filter1D, %out1D) : (memref<?xf32>, memref<?xf32>, memref<?xf32>) -> ()
+  %out1D_ = memref_cast %out1D : memref<?xf32> to memref<*xf32>
+  call @print_memref_f32(%out1D_): (memref<*xf32>) -> ()
+
+  dealloc %filter1D : memref<?xf32>
+  dealloc %in1D : memref<?xf32>
+  dealloc %out1D : memref<?xf32>
+  return
+}
+
+// CHECK: Unranked Memref {{.*}}
+// CHECK-NEXT: [12, 28, 28, 28, 12, 12]
diff --git a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-ncw-call.mlir b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-ncw-call.mlir
new file mode 100644
index 0000000000000..46634a7e5921c
--- /dev/null
+++ b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-ncw-call.mlir
@@ -0,0 +1,67 @@
+// RUN: mlir-opt %s -convert-linalg-to-loops -convert-linalg-to-llvm -convert-std-to-llvm | \
+// RUN: mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
+//
RUN: | FileCheck %s + +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,0,4" -convert-linalg-to-loops \ +// RUN: -convert-linalg-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +// RUN: mlir-opt %s -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,0,4" \ +// RUN: -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func @print_memref_f32(memref<*xf32>) + +// Creates and returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f +func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> memref { + %buf = alloc(%s1, %s2, %s3) : memref + linalg.fill(%buf, %f) : memref, f32 + return %buf : memref +} + +func @conv_1d_ncw(%arg0: memref, %arg1: memref, %arg2: memref) { + linalg.conv_1d_ncw %arg0, %arg1, %arg2 : (memref, memref, memref) + return +} + +func @main() { + %c0 = constant 0 : index + %c1 = constant 1 : index + %c3 = constant 3 : index + %c6 = constant 6 : index + %c8 = constant 8 : index + %f10 = constant 10.00000e+00 : f32 + %val = constant 2.00000e+00 : f32 + %zero = constant 0.00000e+00 : f32 + + %filter1D_ncw = call @alloc_3d_filled_f32(%c1, %c1, %c3, %val) : (index, index, index, f32) -> (memref) + %in1D_ncw = call @alloc_3d_filled_f32(%c1, %c1, %c8, %val) : (index, index, index, f32) -> (memref) + %out1D_ncw = call @alloc_3d_filled_f32(%c1, %c1, %c6, %zero) : (index, index, index, f32) -> (memref) + + store %f10, %in1D_ncw[%c0, %c0, %c3] : memref + call @conv_1d_ncw(%in1D_ncw, %filter1D_ncw, %out1D_ncw) : (memref, memref, memref) -> () + %out1D_ncw_ = memref_cast %out1D_ncw : memref to memref<*xf32> + call @print_memref_f32(%out1D_ncw_): (memref<*xf32>) -> () + + dealloc %filter1D_ncw : memref + dealloc %in1D_ncw : memref + dealloc %out1D_ncw : memref + return +} + +// CHECK: Unranked Memref {{.*}} +// CHECK-NEXT: [ +// CHECK-SAME: [ +// CHECK-SAME: [12, 28, 28, 28, 12, 12] +// CHECK-SAME: ] +// CHECK-SAME: ] diff --git a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-nwc-call.mlir b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-nwc-call.mlir new file mode 100644 index 0000000000000..a6aeb30fc153b --- /dev/null +++ b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-nwc-call.mlir @@ -0,0 +1,78 @@ +// RUN: mlir-opt %s -convert-linalg-to-loops -convert-linalg-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,4" -convert-linalg-to-loops \ +// RUN: -convert-linalg-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +// RUN: mlir-opt %s -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + 
+// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,4" \ +// RUN: -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func @print_memref_f32(memref<*xf32>) + +// Creates and returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f +func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> memref { + %buf = alloc(%s1, %s2, %s3) : memref + linalg.fill(%buf, %f) : memref, f32 + return %buf : memref +} + +func @conv_1d_nwc(%arg0: memref, %arg1: memref, %arg2: memref) { + linalg.conv_1d_nwc %arg0, %arg1, %arg2 : (memref, memref, memref) + return +} + +func @main() { + %c0 = constant 0 : index + %c1 = constant 1 : index + %c3 = constant 3 : index + %c6 = constant 6 : index + %c8 = constant 8 : index + %f10 = constant 10.00000e+00 : f32 + %val = constant 2.00000e+00 : f32 + %zero = constant 0.00000e+00 : f32 + + %filter1D_nwc = call @alloc_3d_filled_f32(%c1, %c3, %c1, %val) : (index, index, index, f32) -> (memref) + %in1D_nwc = call @alloc_3d_filled_f32(%c3, %c8, %c1, %val) : (index, index, index, f32) -> (memref) + %out1D_nwc = call @alloc_3d_filled_f32(%c3, %c6, %c1, %zero) : (index, index, index, f32) -> (memref) + + store %f10, %in1D_nwc[%c0, %c3, %c0] : memref + call @conv_1d_nwc(%in1D_nwc, %filter1D_nwc, %out1D_nwc) : (memref, memref, memref) -> () + %out1D_nwc_ = memref_cast %out1D_nwc : memref to memref<*xf32> + call @print_memref_f32(%out1D_nwc_): (memref<*xf32>) -> () + + dealloc %filter1D_nwc : memref + dealloc %in1D_nwc : memref + dealloc %out1D_nwc : memref + return +} + +// CHECK: Unranked Memref {{.*}} +// CHECK-NEXT: [ +// CHECK-SAME: [ +// CHECK-SAME: [12], +// CHECK-COUNT-3: [28], +// CHECK-NEXT: [12], +// CHECK-NEXT: [12] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-5: [12], +// CHECK-NEXT: [12] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-5: [12], +// CHECK-NEXT: [12] +// CHECK-SAME: ] +// CHECK-SAME: ] diff --git a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-call.mlir b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-call.mlir new file mode 100644 index 0000000000000..819d95ef5da0c --- /dev/null +++ b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-call.mlir @@ -0,0 +1,66 @@ +// RUN: mlir-opt %s -convert-linalg-to-loops -convert-linalg-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,2" -convert-linalg-to-loops \ +// RUN: -convert-linalg-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +// RUN: mlir-opt %s -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,2" \ +// RUN: -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func @print_memref_f32(memref<*xf32>) + +// Creates and returns a 2-D buffer of size 
(%s1, %s2) filled with the value %f +func @alloc_2d_filled_f32(%s1 : index, %s2 : index, %f : f32) -> memref { + %buf = alloc(%s1, %s2) : memref + linalg.fill(%buf, %f) : memref, f32 + return %buf : memref +} + +func @conv_2d(%arg0: memref, %arg1: memref, %arg2: memref) { + linalg.conv_2d %arg0, %arg1, %arg2 : (memref, memref, memref) + return +} + +func @main() { + %c0 = constant 0 : index + %c1 = constant 1 : index + %c3 = constant 3 : index + %c6 = constant 6 : index + %c8 = constant 8 : index + %f10 = constant 10.00000e+00 : f32 + %val = constant 2.00000e+00 : f32 + %zero = constant 0.00000e+00 : f32 + + %filter2D = call @alloc_2d_filled_f32(%c3, %c3, %val) : (index, index, f32) -> (memref) + %in2D = call @alloc_2d_filled_f32(%c8, %c8, %val) : (index, index, f32) -> (memref) + %out2D = call @alloc_2d_filled_f32(%c6, %c6, %zero) : (index, index, f32) -> (memref) + + store %f10, %in2D[%c0, %c3] : memref + call @conv_2d(%in2D, %filter2D, %out2D) : (memref, memref, memref) -> () + %out2D_ = memref_cast %out2D : memref to memref<*xf32> + call @print_memref_f32(%out2D_): (memref<*xf32>) -> () + + dealloc %filter2D : memref + dealloc %in2D : memref + dealloc %out2D : memref + return +} + +// CHECK: Unranked Memref {{.*}} +// CHECK-NEXT: [ +// CHECK-SAME: [36, 52, 52, 52, 36, 36], +// CHECK-COUNT-5: [36, 36, 36, 36, 36, 36] +// CHECK-SAME: ] diff --git a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-nchw-call.mlir b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-nchw-call.mlir new file mode 100644 index 0000000000000..fb0e70861864b --- /dev/null +++ b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-nchw-call.mlir @@ -0,0 +1,80 @@ +// RUN: mlir-opt %s -convert-linalg-to-loops -convert-linalg-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,0,4,4" -convert-linalg-to-loops \ +// RUN: -convert-linalg-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +// RUN: mlir-opt %s -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,0,4,4" \ +// RUN: -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func @print_memref_f32(memref<*xf32>) + +// Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f +func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> memref { + %buf = alloc(%s1, %s2, %s3, %s4) : memref + linalg.fill(%buf, %f) : memref, f32 + return %buf : memref +} + +func @conv_2d_nchw(%arg0: memref, %arg1: memref, %arg2: memref) { + linalg.conv_2d_nchw %arg0, %arg1, %arg2 : (memref, memref, memref) + return +} + +func @main() { + %c0 = constant 0 : index + %c1 = constant 1 : index + %c3 = constant 3 : index + %c6 = constant 6 : index + %c8 = constant 8 : index + %f10 = constant 10.00000e+00 : f32 + %val = constant 2.00000e+00 : f32 + %zero = constant 0.00000e+00 : f32 + + 
%filter2D_nchw = call @alloc_4d_filled_f32(%c1, %c1, %c3, %c3, %val) : (index, index, index, index, f32) -> (memref) + %in2D_nchw = call @alloc_4d_filled_f32(%c3, %c1, %c8, %c8, %val) : (index, index, index, index, f32) -> (memref) + %out2D_nchw = call @alloc_4d_filled_f32(%c3, %c1, %c6, %c6, %zero) : (index, index, index, index, f32) -> (memref) + + store %f10, %in2D_nchw[%c0, %c0, %c0, %c3] : memref + call @conv_2d_nchw(%in2D_nchw, %filter2D_nchw, %out2D_nchw) : (memref, memref, memref) -> () + %out2D_nchw_ = memref_cast %out2D_nchw : memref to memref<*xf32> + call @print_memref_f32(%out2D_nchw_): (memref<*xf32>) -> () + + dealloc %filter2D_nchw : memref + dealloc %in2D_nchw : memref + dealloc %out2D_nchw : memref + return +} + +// CHECK: Unranked Memref {{.*}} +// CHECK-NEXT: [ +// CHECK-SAME: [ +// CHECK-SAME: [ +// CHECK-SAME: [36, 52, 52, 52, 36, 36], +// CHECK-COUNT-5: [36, 36, 36, 36, 36, 36] +// CHECK-SAME: ] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-SAME: [ +// CHECK-COUNT-6: [36, 36, 36, 36, 36, 36] +// CHECK-SAME: ] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-SAME: [ +// CHECK-COUNT-6: [36, 36, 36, 36, 36, 36] +// CHECK-SAME: ] +// CHECK-SAME: ] +// CHECK-SAME: ] diff --git a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-nhwc-call.mlir b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-nhwc-call.mlir new file mode 100644 index 0000000000000..5888eec7d67a4 --- /dev/null +++ b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-nhwc-call.mlir @@ -0,0 +1,126 @@ +// RUN: mlir-opt %s -convert-linalg-to-loops -convert-linalg-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,3,3,2" -convert-linalg-to-loops \ +// RUN: -convert-linalg-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +// RUN: mlir-opt %s -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,3,3,2" \ +// RUN: -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func @print_memref_f32(memref<*xf32>) + +// Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f +func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> memref { + %buf = alloc(%s1, %s2, %s3, %s4) : memref + linalg.fill(%buf, %f) : memref, f32 + return %buf : memref +} + +func @conv_2d_nhwc(%arg0: memref, %arg1: memref, %arg2: memref) { + linalg.conv_2d_nhwc %arg0, %arg1, %arg2 : (memref, memref, memref) + return +} + +func @main() { + %c0 = constant 0 : index + %c1 = constant 1 : index + %c3 = constant 3 : index + %c6 = constant 6 : index + %c8 = constant 8 : index + %f10 = constant 10.00000e+00 : f32 + %val = constant 2.00000e+00 : f32 + %zero = constant 0.00000e+00 : f32 + + %filter2D_nhwc = call @alloc_4d_filled_f32(%c1, %c3, %c3, %c3, %val) :(index, index, index, index, f32) -> (memref) + %in2D_nhwc = call @alloc_4d_filled_f32(%c3, %c8, 
%c8, %c3, %val) : (index, index, index, index, f32) -> (memref) + %out2D_nhwc = call @alloc_4d_filled_f32(%c3, %c6, %c6, %c1, %zero) : (index, index, index, index, f32) -> (memref) + + store %f10, %in2D_nhwc[%c0, %c0, %c3, %c0] : memref + call @conv_2d_nhwc(%in2D_nhwc, %filter2D_nhwc, %out2D_nhwc) : (memref, memref, memref) -> () + %out2D_nhwc_ = memref_cast %out2D_nhwc : memref to memref<*xf32> + call @print_memref_f32(%out2D_nhwc_): (memref<*xf32>) -> () + + dealloc %filter2D_nhwc : memref + dealloc %in2D_nhwc : memref + dealloc %out2D_nhwc : memref + return +} + +// CHECK: Unranked Memref {{.*}} +// CHECK-NEXT: [ +// CHECK-SAME: [ +// CHECK-SAME: [ +// CHECK-SAME: [108], +// CHECK-COUNT-3: [124], +// CHECK-COUNT-2: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-SAME: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-SAME: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ] +// CHECK-SAME: ] +// CHECK-SAME: ] diff --git a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-call.mlir b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-call.mlir new file mode 100644 index 0000000000000..f0ca37f86fcd0 --- /dev/null +++ b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-call.mlir @@ -0,0 +1,83 @@ +// RUN: mlir-opt %s -convert-linalg-to-loops -convert-linalg-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,2,2" -convert-linalg-to-loops \ +// RUN: -convert-linalg-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +// RUN: mlir-opt %s -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,2,2" \ +// RUN: -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func @print_memref_f32(memref<*xf32>) + +// Creates and returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f +func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> memref { + %buf = alloc(%s1, 
%s2, %s3) : memref + linalg.fill(%buf, %f) : memref, f32 + return %buf : memref +} + +func @conv_3d(%arg0: memref, %arg1: memref, %arg2: memref) { + linalg.conv_3d %arg0, %arg1, %arg2 : (memref, memref, memref) + return +} + +func @main() { + %c0 = constant 0 : index + %c1 = constant 1 : index + %c3 = constant 3 : index + %c6 = constant 6 : index + %c8 = constant 8 : index + %f10 = constant 10.00000e+00 : f32 + %val = constant 2.00000e+00 : f32 + %zero = constant 0.00000e+00 : f32 + + %filter3D = call @alloc_3d_filled_f32(%c3, %c3, %c3, %val) : (index, index, index, f32) -> (memref) + %in3D = call @alloc_3d_filled_f32(%c8, %c8, %c8, %val) : (index, index, index, f32) -> (memref) + %out3D = call @alloc_3d_filled_f32(%c6, %c6, %c6, %zero) : (index, index, index, f32) -> (memref) + + store %f10, %in3D[%c0, %c0, %c3] : memref + call @conv_3d(%in3D, %filter3D, %out3D) : (memref, memref, memref) -> () + %out3D_ = memref_cast %out3D : memref to memref<*xf32> + call @print_memref_f32(%out3D_): (memref<*xf32>) -> () + + dealloc %filter3D : memref + dealloc %in3D : memref + dealloc %out3D : memref + return +} + +// CHECK: Unranked Memref {{.*}} +// CHECK-NEXT: [ +// CHECK-SAME: [ +// CHECK-SAME: [108, 124, 124, 124, 108, 108], +// CHECK-COUNT-5: [108, 108, 108, 108, 108, 108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108, 108, 108, 108, 108, 108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108, 108, 108, 108, 108, 108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108, 108, 108, 108, 108, 108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108, 108, 108, 108, 108, 108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108, 108, 108, 108, 108, 108] +// CHECK-SAME: ] +// CHECK-SAME: ] diff --git a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-ncdhw-call.mlir b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-ncdhw-call.mlir new file mode 100644 index 0000000000000..a56a260b9cd8a --- /dev/null +++ b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-ncdhw-call.mlir @@ -0,0 +1,87 @@ +// RUN: mlir-opt %s -convert-linalg-to-loops -convert-linalg-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,0,5,5,5" -convert-linalg-to-loops \ +// RUN: -convert-linalg-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +// RUN: mlir-opt %s -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,0,5,5,5" \ +// RUN: -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func @print_memref_f32(memref<*xf32>) + +// Creates and returns 5-D buffer of size (%s1, %s2, %s3, %s4, %s5) filled with the value %f +func @alloc_5d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %s5 : index, %f : f32) -> memref { + %buf = alloc(%s1, %s2, %s3, %s4, %s5) : memref + linalg.fill(%buf, %f) : memref, f32 + return %buf : memref +} + 
+func @conv_3d_ncdhw(%arg0: memref<?x?x?x?x?xf32>, %arg1: memref<?x?x?x?x?xf32>, %arg2: memref<?x?x?x?x?xf32>) {
+  linalg.conv_3d_ncdhw %arg0, %arg1, %arg2 : (memref<?x?x?x?x?xf32>, memref<?x?x?x?x?xf32>, memref<?x?x?x?x?xf32>)
+  return
+}
+
+func @main() {
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %c3 = constant 3 : index
+  %c6 = constant 6 : index
+  %c8 = constant 8 : index
+  %f10 = constant 10.00000e+00 : f32
+  %val = constant 2.00000e+00 : f32
+  %zero = constant 0.00000e+00 : f32
+
+  %filter3D_ncdhw = call @alloc_5d_filled_f32(%c1, %c1, %c3, %c3, %c3, %val) : (index, index, index, index, index, f32) -> (memref<?x?x?x?x?xf32>)
+  %in3D_ncdhw = call @alloc_5d_filled_f32(%c1, %c1, %c8, %c8, %c8, %val) : (index, index, index, index, index, f32) -> (memref<?x?x?x?x?xf32>)
+  %out3D_ncdhw = call @alloc_5d_filled_f32(%c1, %c1, %c6, %c6, %c6, %zero) : (index, index, index, index, index, f32) -> (memref<?x?x?x?x?xf32>)
+
+  store %f10, %in3D_ncdhw[%c0, %c0, %c0, %c0, %c3] : memref<?x?x?x?x?xf32>
+  call @conv_3d_ncdhw(%in3D_ncdhw, %filter3D_ncdhw, %out3D_ncdhw) : (memref<?x?x?x?x?xf32>, memref<?x?x?x?x?xf32>, memref<?x?x?x?x?xf32>) -> ()
+  %out3D_ncdhw_ = memref_cast %out3D_ncdhw : memref<?x?x?x?x?xf32> to memref<*xf32>
+  call @print_memref_f32(%out3D_ncdhw_): (memref<*xf32>) -> ()
+
+  dealloc %filter3D_ncdhw : memref<?x?x?x?x?xf32>
+  dealloc %in3D_ncdhw : memref<?x?x?x?x?xf32>
+  dealloc %out3D_ncdhw : memref<?x?x?x?x?xf32>
+  return
+}
+
+// CHECK: Unranked Memref {{.*}}
+// CHECK-NEXT: [
+// CHECK-SAME: [
+// CHECK-SAME: [
+// CHECK-SAME: [
+// CHECK-SAME: [108, 124, 124, 124, 108, 108],
+// CHECK-COUNT-5: [108, 108, 108, 108, 108, 108]
+// CHECK-SAME: ],
+// CHECK-NEXT: [
+// CHECK-COUNT-6: [108, 108, 108, 108, 108, 108]
+// CHECK-SAME: ],
+// CHECK-NEXT: [
+// CHECK-COUNT-6: [108, 108, 108, 108, 108, 108]
+// CHECK-SAME: ],
+// CHECK-NEXT: [
+// CHECK-COUNT-6: [108, 108, 108, 108, 108, 108]
+// CHECK-SAME: ],
+// CHECK-NEXT: [
+// CHECK-COUNT-6: [108, 108, 108, 108, 108, 108]
+// CHECK-SAME: ],
+// CHECK-NEXT: [
+// CHECK-COUNT-6: [108, 108, 108, 108, 108, 108]
+// CHECK-SAME: ]
+// CHECK-SAME: ]
+// CHECK-SAME: ]
+// CHECK-SAME: ]
diff --git a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-ndhwc-call.mlir b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-ndhwc-call.mlir
new file mode 100644
index 0000000000000..37fc6453e5dd0
--- /dev/null
+++ b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-ndhwc-call.mlir
@@ -0,0 +1,189 @@
+// RUN: mlir-opt %s -convert-linalg-to-loops -convert-linalg-to-llvm -convert-std-to-llvm | \
+// RUN: mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,5,5,5" -convert-linalg-to-loops \
+// RUN:   -convert-linalg-to-llvm -convert-std-to-llvm | \
+// RUN: mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+// RUN: mlir-opt %s -test-conv-vectorization -convert-linalg-to-llvm | \
+// RUN: mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,5,5,5" \
+// RUN:   -test-conv-vectorization -convert-linalg-to-llvm | \
+// RUN: mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+func @print_memref_f32(memref<*xf32>)
+
+// Creates and returns a 5-D buffer of size (%s1, %s2, %s3, %s4, %s5) filled with the value %f
+func @alloc_5d_filled_f32(%s1 : index, %s2 : index, %s3 : index,
%s4 : index, %s5 : index, %f : f32) -> memref { + %buf = alloc(%s1, %s2, %s3, %s4, %s5) : memref + linalg.fill(%buf, %f) : memref, f32 + return %buf : memref +} + +func @conv_3d_ndhwc(%arg0: memref, %arg1: memref, %arg2: memref) { + linalg.conv_3d_ndhwc %arg0, %arg1, %arg2 : (memref, memref, memref) + return +} + + +func @main() { + %c0 = constant 0 : index + %c1 = constant 1 : index + %c3 = constant 3 : index + %c6 = constant 6 : index + %c8 = constant 8 : index + %f10 = constant 10.00000e+00 : f32 + %val = constant 2.00000e+00 : f32 + %zero = constant 0.00000e+00 : f32 + + %filter3D_ndhwc = call @alloc_5d_filled_f32(%c1, %c3, %c3, %c3, %c1, %val) : (index, index, index, index, index, f32) -> (memref) + %in3D_ndhwc = call @alloc_5d_filled_f32(%c1, %c8, %c8, %c8, %c1, %val) : (index, index, index, index, index, f32) -> (memref) + %out3D_ndhwc = call @alloc_5d_filled_f32(%c1, %c6, %c6, %c6, %c1, %zero) : (index, index, index, index, index, f32) -> (memref) + + store %f10, %in3D_ndhwc[%c0, %c0, %c0, %c3, %c0] : memref + call @conv_3d_ndhwc(%in3D_ndhwc, %filter3D_ndhwc, %out3D_ndhwc) : (memref, memref, memref) -> () + %out3D_ndhwc_ = memref_cast %out3D_ndhwc : memref to memref<*xf32> + call @print_memref_f32(%out3D_ndhwc_): (memref<*xf32>) -> () + + dealloc %filter3D_ndhwc : memref + dealloc %in3D_ndhwc : memref + dealloc %out3D_ndhwc : memref + return +} + +// CHECK: Unranked Memref {{.*}} +// CHECK-NEXT: [ +// CHECK-SAME: [ +// CHECK-SAME: [ +// CHECK-SAME: [ +// CHECK-SAME: [108], +// CHECK-COUNT-3: [124], +// CHECK-COUNT-2: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-SAME: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-SAME: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-SAME: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-SAME: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ], +// CHECK-NEXT: [ +// CHECK-COUNT-6: [108] +// CHECK-SAME: ] +// CHECK-SAME: ], +// CHECK-NEXT: [ 
+// CHECK-SAME: [
+// CHECK-COUNT-6: [108]
+// CHECK-SAME: ],
+// CHECK-NEXT: [
+// CHECK-COUNT-6: [108]
+// CHECK-SAME: ],
+// CHECK-NEXT: [
+// CHECK-COUNT-6: [108]
+// CHECK-SAME: ],
+// CHECK-NEXT: [
+// CHECK-COUNT-6: [108]
+// CHECK-SAME: ],
+// CHECK-NEXT: [
+// CHECK-COUNT-6: [108]
+// CHECK-SAME: ],
+// CHECK-NEXT: [
+// CHECK-COUNT-6: [108]
+// CHECK-SAME: ]
+// CHECK-SAME: ]
+// CHECK-SAME: ]
+// CHECK-SAME: ]
diff --git a/mlir/integration_test/Dialect/Vector/CPU/test-transfer-to-loops.mlir b/mlir/integration_test/Dialect/Vector/CPU/test-transfer-to-loops.mlir
index 8d965779dfc6d..38cbabc329989 100644
--- a/mlir/integration_test/Dialect/Vector/CPU/test-transfer-to-loops.mlir
+++ b/mlir/integration_test/Dialect/Vector/CPU/test-transfer-to-loops.mlir
@@ -4,6 +4,7 @@
 // RUN: FileCheck %s

 #map0 = affine_map<(d0, d1) -> (d1, d0)>
+#map1 = affine_map<(d0, d1) -> (d1)>

 func @print_memref_f32(memref<*xf32>)

@@ -29,6 +30,7 @@ func @main() {
   %c0 = constant 0 : index
   %c1 = constant 1 : index
   %c2 = constant 2 : index
+  %c3 = constant 3 : index
   %c6 = constant 6 : index
   %cst = constant -4.2e+01 : f32
   %0 = call @alloc_2d_filled_f32(%c6, %c6) : (index, index) -> memref<?x?xf32>
@@ -76,6 +78,28 @@ func @main() {
   // CHECK-SAME: ( 205, 305, 405, 505, 504 ),
   // CHECK-SAME: ( 105, 205, 305, 405, 505 ) )

+  %3 = vector.transfer_read %0[%c2, %c3], %cst : memref<?x?xf32>, vector<5x5xf32>
+  vector.print %3 : vector<5x5xf32>
+  // New 5x5 block rooted @{2, 3} in memory.
+  // CHECK-NEXT: ( ( 403, 503, 502, -42, -42 ),
+  // CHECK-SAME: ( 404, 504, 503, -42, -42 ),
+  // CHECK-SAME: ( 405, 505, 504, -42, -42 ),
+  // CHECK-SAME: ( 305, 405, 505, -42, -42 ),
+  // CHECK-SAME: ( -42, -42, -42, -42, -42 ) )
+
+  %4 = vector.transfer_read %0[%c2, %c3], %cst {permutation_map = #map0} : memref<?x?xf32>, vector<5x5xf32>
+  vector.print %4 : vector<5x5xf32>
+  // Transposed 5x5 block rooted @{2, 3} in memory.
+  // CHECK-NEXT: ( ( 403, 404, 405, 305, -42 ),
+  // CHECK-SAME: ( 503, 504, 505, 405, -42 ),
+  // CHECK-SAME: ( 502, 503, 504, 505, -42 ),
+  // CHECK-SAME: ( -42, -42, -42, -42, -42 ),
+  // CHECK-SAME: ( -42, -42, -42, -42, -42 ) )
+
+  %5 = vector.transfer_read %0[%c2, %c3], %cst {permutation_map = #map1} : memref<?x?xf32>, vector<5xf32>
+  vector.print %5 : vector<5xf32>
+  // CHECK-NEXT: ( 403, 503, 502, -42, -42 )
+
   dealloc %0 : memref<?x?xf32>
   return
 }
diff --git a/mlir/lib/Analysis/AffineStructures.cpp b/mlir/lib/Analysis/AffineStructures.cpp
index 546dfa4ba7db2..5b7f4d4982d02 100644
--- a/mlir/lib/Analysis/AffineStructures.cpp
+++ b/mlir/lib/Analysis/AffineStructures.cpp
@@ -366,23 +366,6 @@ areIdsUnique(const FlatAffineConstraints &cst) {
   return true;
 }

-// Swap the posA^th identifier with the posB^th identifier.
-static void swapId(FlatAffineConstraints *A, unsigned posA, unsigned posB) {
-  assert(posA < A->getNumIds() && "invalid position A");
-  assert(posB < A->getNumIds() && "invalid position B");
-
-  if (posA == posB)
-    return;
-
-  for (unsigned r = 0, e = A->getNumInequalities(); r < e; r++) {
-    std::swap(A->atIneq(r, posA), A->atIneq(r, posB));
-  }
-  for (unsigned r = 0, e = A->getNumEqualities(); r < e; r++) {
-    std::swap(A->atEq(r, posA), A->atEq(r, posB));
-  }
-  std::swap(A->getId(posA), A->getId(posB));
-}
-
 /// Merge and align the identifiers of A and B starting at 'offset', so that
 /// both constraint systems get the union of the contained identifiers that is
 /// dimension-wise and symbol-wise unique; both constraint systems are updated
@@ -429,7 +412,7 @@ static void mergeAndAlignIds(unsigned offset, FlatAffineConstraints *A,
       assert(loc >= offset && "A's dim appears in B's aligned range");
       assert(loc < B->getNumDimIds() &&
              "A's dim appears in B's non-dim position");
-      swapId(B, d, loc);
+      B->swapId(d, loc);
     } else {
       B->addDimId(d);
       B->setIdValue(d, aDimValue);
@@ -451,7 +434,7 @@ static void mergeAndAlignIds(unsigned offset, FlatAffineConstraints *A,
     if (B->findId(aSymValue, &loc)) {
       assert(loc >= B->getNumDimIds() && loc < B->getNumDimAndSymbolIds() &&
              "A's symbol appears in B's non-symbol position");
-      swapId(B, s, loc);
+      B->swapId(s, loc);
     } else {
       B->addSymbolId(s - B->getNumDimIds());
       B->setIdValue(s, aSymValue);
@@ -619,7 +602,7 @@ LogicalResult FlatAffineConstraints::composeMatchingMap(AffineMap other) {
 static void turnDimIntoSymbol(FlatAffineConstraints *cst, Value id) {
   unsigned pos;
   if (cst->findId(id, &pos) && pos < cst->getNumDimIds()) {
-    swapId(cst, pos, cst->getNumDimIds() - 1);
+    cst->swapId(pos, cst->getNumDimIds() - 1);
     cst->setDimSymbolSeparation(cst->getNumSymbolIds() + 1);
   }
 }
@@ -629,7 +612,7 @@ static void turnSymbolIntoDim(FlatAffineConstraints *cst, Value id) {
   unsigned pos;
   if (cst->findId(id, &pos) && pos >= cst->getNumDimIds() &&
       pos < cst->getNumDimAndSymbolIds()) {
-    swapId(cst, pos, cst->getNumDimIds());
+    cst->swapId(pos, cst->getNumDimIds());
     cst->setDimSymbolSeparation(cst->getNumSymbolIds() - 1);
   }
 }
@@ -1964,6 +1947,20 @@ bool FlatAffineConstraints::containsId(Value id) const {
   });
 }

+void FlatAffineConstraints::swapId(unsigned posA, unsigned posB) {
+  assert(posA < getNumIds() && "invalid position A");
+  assert(posB < getNumIds() && "invalid position B");
+
+  if (posA == posB)
+    return;
+
+  for (unsigned r = 0, e = getNumInequalities(); r < e; r++)
+    std::swap(atIneq(r, posA), atIneq(r, posB));
+  for (unsigned r = 0, e = getNumEqualities(); r < e; r++)
+    std::swap(atEq(r, posA), atEq(r, posB));
+  std::swap(getId(posA), getId(posB));
+}
+
 void FlatAffineConstraints::setDimSymbolSeparation(unsigned newSymbolCount) {
   assert(newSymbolCount <= numDims + numSymbols &&
          "invalid separation position");
diff --git a/mlir/lib/Analysis/SliceAnalysis.cpp b/mlir/lib/Analysis/SliceAnalysis.cpp
index 8f5f87ba620ee..120d4e4a91372 100644
--- a/mlir/lib/Analysis/SliceAnalysis.cpp
+++ b/mlir/lib/Analysis/SliceAnalysis.cpp
@@ -12,6 +12,7 @@
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/IR/Function.h"
 #include "mlir/IR/Operation.h"
@@ -84,7 +85,8 @@ static void getBackwardSliceImpl(Operation *op,
   if (!op)
     return;

-  assert((op->getNumRegions() == 0 || isa<AffineForOp, scf::ForOp>(op)) &&
+  assert((op->getNumRegions() == 0 ||
+          isa<AffineForOp, scf::ForOp, linalg::LinalgOp>(op)) &&
          "unexpected generic
op with regions"); // Evaluate whether we should keep this def. diff --git a/mlir/lib/Bindings/Python/IRModules.cpp b/mlir/lib/Bindings/Python/IRModules.cpp index bf1235a77d08c..527c530518cac 100644 --- a/mlir/lib/Bindings/Python/IRModules.cpp +++ b/mlir/lib/Bindings/Python/IRModules.cpp @@ -285,10 +285,8 @@ class PyStringAttribute : public PyConcreteAttribute { c.def_property_readonly( "value", [](PyStringAttribute &self) { - PySinglePartStringAccumulator accum; - mlirStringAttrGetValue(self.attr, accum.getCallback(), - accum.getUserData()); - return accum.takeValue(); + MlirStringRef stringRef = mlirStringAttrGetValue(self.attr); + return py::str(stringRef.data, stringRef.length); }, "Returns the value of the string attribute"); } diff --git a/mlir/lib/CAPI/IR/AffineMap.cpp b/mlir/lib/CAPI/IR/AffineMap.cpp index d80d9e20486a0..6a87c269a4216 100644 --- a/mlir/lib/CAPI/IR/AffineMap.cpp +++ b/mlir/lib/CAPI/IR/AffineMap.cpp @@ -9,7 +9,119 @@ #include "mlir-c/AffineMap.h" #include "mlir-c/IR.h" #include "mlir/CAPI/AffineMap.h" +#include "mlir/CAPI/IR.h" +#include "mlir/CAPI/Utils.h" #include "mlir/IR/AffineMap.h" -// This is a placeholder for affine map bindings. The file is here to serve as a -// compilation unit that includes the headers. +// TODO: expose the C API related to `AffineExpr` and mutable affine map. + +using namespace mlir; + +MlirContext mlirAffineMapGetContext(MlirAffineMap affineMap) { + return wrap(unwrap(affineMap).getContext()); +} + +int mlirAffineMapEqual(MlirAffineMap a1, MlirAffineMap a2) { + return unwrap(a1) == unwrap(a2); +} + +void mlirAffineMapPrint(MlirAffineMap affineMap, MlirStringCallback callback, + void *userData) { + mlir::detail::CallbackOstream stream(callback, userData); + unwrap(affineMap).print(stream); + stream.flush(); +} + +void mlirAffineMapDump(MlirAffineMap affineMap) { unwrap(affineMap).dump(); } + +MlirAffineMap mlirAffineMapEmptyGet(MlirContext ctx) { + return wrap(AffineMap::get(unwrap(ctx))); +} + +MlirAffineMap mlirAffineMapGet(MlirContext ctx, intptr_t dimCount, + intptr_t symbolCount) { + return wrap(AffineMap::get(dimCount, symbolCount, unwrap(ctx))); +} + +MlirAffineMap mlirAffineMapConstantGet(MlirContext ctx, int64_t val) { + return wrap(AffineMap::getConstantMap(val, unwrap(ctx))); +} + +MlirAffineMap mlirAffineMapMultiDimIdentityGet(MlirContext ctx, + intptr_t numDims) { + return wrap(AffineMap::getMultiDimIdentityMap(numDims, unwrap(ctx))); +} + +MlirAffineMap mlirAffineMapMinorIdentityGet(MlirContext ctx, intptr_t dims, + intptr_t results) { + return wrap(AffineMap::getMinorIdentityMap(dims, results, unwrap(ctx))); +} + +MlirAffineMap mlirAffineMapPermutationGet(MlirContext ctx, intptr_t size, + unsigned *permutation) { + return wrap(AffineMap::getPermutationMap( + llvm::makeArrayRef(permutation, static_cast(size)), unwrap(ctx))); +} + +int mlirAffineMapIsIdentity(MlirAffineMap affineMap) { + return unwrap(affineMap).isIdentity(); +} + +int mlirAffineMapIsMinorIdentity(MlirAffineMap affineMap) { + return unwrap(affineMap).isMinorIdentity(); +} + +int mlirAffineMapIsEmpty(MlirAffineMap affineMap) { + return unwrap(affineMap).isEmpty(); +} + +int mlirAffineMapIsSingleConstant(MlirAffineMap affineMap) { + return unwrap(affineMap).isSingleConstant(); +} + +int64_t mlirAffineMapGetSingleConstantResult(MlirAffineMap affineMap) { + return unwrap(affineMap).getSingleConstantResult(); +} + +intptr_t mlirAffineMapGetNumDims(MlirAffineMap affineMap) { + return unwrap(affineMap).getNumDims(); +} + +intptr_t 
mlirAffineMapGetNumSymbols(MlirAffineMap affineMap) { + return unwrap(affineMap).getNumSymbols(); +} + +intptr_t mlirAffineMapGetNumResults(MlirAffineMap affineMap) { + return unwrap(affineMap).getNumResults(); +} + +intptr_t mlirAffineMapGetNumInputs(MlirAffineMap affineMap) { + return unwrap(affineMap).getNumInputs(); +} + +int mlirAffineMapIsProjectedPermutation(MlirAffineMap affineMap) { + return unwrap(affineMap).isProjectedPermutation(); +} + +int mlirAffineMapIsPermutation(MlirAffineMap affineMap) { + return unwrap(affineMap).isPermutation(); +} + +MlirAffineMap mlirAffineMapGetSubMap(MlirAffineMap affineMap, intptr_t size, + intptr_t *resultPos) { + SmallVector pos; + pos.reserve(size); + for (intptr_t i = 0; i < size; ++i) + pos.push_back(static_cast(resultPos[i])); + return wrap(unwrap(affineMap).getSubMap(pos)); +} + +MlirAffineMap mlirAffineMapGetMajorSubMap(MlirAffineMap affineMap, + intptr_t numResults) { + return wrap(unwrap(affineMap).getMajorSubMap(numResults)); +} + +MlirAffineMap mlirAffineMapGetMinorSubMap(MlirAffineMap affineMap, + intptr_t numResults) { + return wrap(unwrap(affineMap).getMinorSubMap(numResults)); +} diff --git a/mlir/lib/CAPI/IR/CMakeLists.txt b/mlir/lib/CAPI/IR/CMakeLists.txt index 3e2e3d6a22d82..4158a4c96efd0 100644 --- a/mlir/lib/CAPI/IR/CMakeLists.txt +++ b/mlir/lib/CAPI/IR/CMakeLists.txt @@ -4,6 +4,7 @@ add_mlir_library(MLIRCAPIIR IR.cpp StandardAttributes.cpp StandardTypes.cpp + Support.cpp EXCLUDE_FROM_LIBMLIR diff --git a/mlir/lib/CAPI/IR/IR.cpp b/mlir/lib/CAPI/IR/IR.cpp index 2a008a2114d67..8611d6537371a 100644 --- a/mlir/lib/CAPI/IR/IR.cpp +++ b/mlir/lib/CAPI/IR/IR.cpp @@ -9,43 +9,16 @@ #include "mlir-c/IR.h" #include "mlir/CAPI/IR.h" +#include "mlir/CAPI/Utils.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Dialect.h" #include "mlir/IR/Module.h" #include "mlir/IR/Operation.h" #include "mlir/IR/Types.h" #include "mlir/Parser.h" -#include "llvm/Support/raw_ostream.h" using namespace mlir; -/* ========================================================================== */ -/* Printing helper. */ -/* ========================================================================== */ - -namespace { -/// A simple raw ostream subclass that forwards write_impl calls to the -/// user-supplied callback together with opaque user-supplied data. -class CallbackOstream : public llvm::raw_ostream { -public: - CallbackOstream(std::function callback, - void *opaqueData) - : callback(callback), opaqueData(opaqueData), pos(0u) {} - - void write_impl(const char *ptr, size_t size) override { - callback(ptr, size, opaqueData); - pos += size; - } - - uint64_t current_pos() const override { return pos; } - -private: - std::function callback; - void *opaqueData; - uint64_t pos; -}; -} // end namespace - /* ========================================================================== */ /* Context API. 
*/ /* ========================================================================== */ @@ -77,7 +50,7 @@ MlirLocation mlirLocationUnknownGet(MlirContext context) { void mlirLocationPrint(MlirLocation location, MlirStringCallback callback, void *userData) { - CallbackOstream stream(callback, userData); + detail::CallbackOstream stream(callback, userData); unwrap(location).print(stream); stream.flush(); } @@ -244,7 +217,7 @@ MlirAttribute mlirOperationGetAttributeByName(MlirOperation op, void mlirOperationPrint(MlirOperation op, MlirStringCallback callback, void *userData) { - CallbackOstream stream(callback, userData); + detail::CallbackOstream stream(callback, userData); unwrap(op)->print(stream); stream.flush(); } @@ -326,7 +299,7 @@ MlirValue mlirBlockGetArgument(MlirBlock block, intptr_t pos) { void mlirBlockPrint(MlirBlock block, MlirStringCallback callback, void *userData) { - CallbackOstream stream(callback, userData); + detail::CallbackOstream stream(callback, userData); unwrap(block)->print(stream); stream.flush(); } @@ -341,7 +314,7 @@ MlirType mlirValueGetType(MlirValue value) { void mlirValuePrint(MlirValue value, MlirStringCallback callback, void *userData) { - CallbackOstream stream(callback, userData); + detail::CallbackOstream stream(callback, userData); unwrap(value).print(stream); stream.flush(); } @@ -361,7 +334,7 @@ MlirContext mlirTypeGetContext(MlirType type) { int mlirTypeEqual(MlirType t1, MlirType t2) { return unwrap(t1) == unwrap(t2); } void mlirTypePrint(MlirType type, MlirStringCallback callback, void *userData) { - CallbackOstream stream(callback, userData); + detail::CallbackOstream stream(callback, userData); unwrap(type).print(stream); stream.flush(); } @@ -382,7 +355,7 @@ int mlirAttributeEqual(MlirAttribute a1, MlirAttribute a2) { void mlirAttributePrint(MlirAttribute attr, MlirStringCallback callback, void *userData) { - CallbackOstream stream(callback, userData); + detail::CallbackOstream stream(callback, userData); unwrap(attr).print(stream); stream.flush(); } diff --git a/mlir/lib/CAPI/IR/StandardAttributes.cpp b/mlir/lib/CAPI/IR/StandardAttributes.cpp index cade603132dcf..77d5fcb8b33c2 100644 --- a/mlir/lib/CAPI/IR/StandardAttributes.cpp +++ b/mlir/lib/CAPI/IR/StandardAttributes.cpp @@ -9,6 +9,7 @@ #include "mlir-c/StandardAttributes.h" #include "mlir/CAPI/AffineMap.h" #include "mlir/CAPI/IR.h" +#include "mlir/CAPI/Support.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/StandardTypes.h" @@ -165,10 +166,8 @@ const char *mlirOpaqueAttrGetDialectNamespace(MlirAttribute attr) { return unwrap(attr).cast().getDialectNamespace().c_str(); } -void mlirOpaqueAttrGetData(MlirAttribute attr, MlirStringCallback callback, - void *userData) { - StringRef data = unwrap(attr).cast().getAttrData(); - callback(data.data(), static_cast(data.size()), userData); +MlirStringRef mlirOpaqueAttrGetData(MlirAttribute attr) { + return wrap(unwrap(attr).cast().getAttrData()); } /*============================================================================*/ @@ -189,10 +188,8 @@ MlirAttribute mlirStringAttrTypedGet(MlirType type, intptr_t length, return wrap(StringAttr::get(StringRef(data, length), unwrap(type))); } -void mlirStringAttrGetValue(MlirAttribute attr, MlirStringCallback callback, - void *userData) { - StringRef data = unwrap(attr).cast().getValue(); - callback(data.data(), static_cast(data.size()), userData); +MlirStringRef mlirStringAttrGetValue(MlirAttribute attr) { + return wrap(unwrap(attr).cast().getValue()); } 
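With these hunks the string-valued getters hand back an MlirStringRef directly instead of driving an MlirStringCallback. A minimal caller-side sketch, assuming `attr` holds a string attribute obtained elsewhere; the `printStringAttr` helper is hypothetical, not part of the patch:

#include "mlir-c/StandardAttributes.h"
#include <stdio.h>

/* MlirStringRef is a (data, length) pair and is not guaranteed to be
 * NUL-terminated, so the explicit length must be passed to printf. */
static void printStringAttr(MlirAttribute attr) {
  MlirStringRef ref = mlirStringAttrGetValue(attr);
  printf("%.*s\n", (int)ref.length, ref.data);
}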
/*============================================================================*/
@@ -213,18 +210,12 @@ MlirAttribute mlirSymbolRefAttrGet(MlirContext ctx, intptr_t length,
   return wrap(SymbolRefAttr::get(StringRef(symbol, length), refs, unwrap(ctx)));
 }

-void mlirSymbolRefAttrGetRootReference(MlirAttribute attr,
-                                       MlirStringCallback callback,
-                                       void *userData) {
-  StringRef ref = unwrap(attr).cast<SymbolRefAttr>().getRootReference();
-  callback(ref.data(), ref.size(), userData);
+MlirStringRef mlirSymbolRefAttrGetRootReference(MlirAttribute attr) {
+  return wrap(unwrap(attr).cast<SymbolRefAttr>().getRootReference());
 }

-void mlirSymbolRefAttrGetLeafReference(MlirAttribute attr,
-                                       MlirStringCallback callback,
-                                       void *userData) {
-  StringRef ref = unwrap(attr).cast<SymbolRefAttr>().getLeafReference();
-  callback(ref.data(), ref.size(), userData);
+MlirStringRef mlirSymbolRefAttrGetLeafReference(MlirAttribute attr) {
+  return wrap(unwrap(attr).cast<SymbolRefAttr>().getLeafReference());
 }

 intptr_t mlirSymbolRefAttrGetNumNestedReferences(MlirAttribute attr) {
@@ -250,11 +241,8 @@ MlirAttribute mlirFlatSymbolRefAttrGet(MlirContext ctx, intptr_t length,
   return wrap(FlatSymbolRefAttr::get(StringRef(symbol, length), unwrap(ctx)));
 }

-void mlirFloatSymbolRefAttrGetValue(MlirAttribute attr,
-                                    MlirStringCallback callback,
-                                    void *userData) {
-  StringRef symbol = unwrap(attr).cast<FlatSymbolRefAttr>().getValue();
-  callback(symbol.data(), symbol.size(), userData);
+MlirStringRef mlirFlatSymbolRefAttrGetValue(MlirAttribute attr) {
+  return wrap(unwrap(attr).cast<FlatSymbolRefAttr>().getValue());
 }

/*============================================================================*/
@@ -477,12 +465,9 @@ float mlirDenseElementsAttrGetFloatSplatValue(MlirAttribute attr) {
 double mlirDenseElementsAttrGetDoubleSplatValue(MlirAttribute attr) {
   return unwrap(attr).cast<DenseElementsAttr>().getSplatValue<double>();
 }
-void mlirDenseElementsAttrGetStringSplatValue(MlirAttribute attr,
-                                              MlirStringCallback callback,
-                                              void *userData) {
-  StringRef str =
-      unwrap(attr).cast<DenseElementsAttr>().getSplatValue<StringRef>();
-  callback(str.data(), str.size(), userData);
+MlirStringRef mlirDenseElementsAttrGetStringSplatValue(MlirAttribute attr) {
+  return wrap(
+      unwrap(attr).cast<DenseElementsAttr>().getSplatValue<StringRef>());
 }

//===----------------------------------------------------------------------===//
@@ -518,13 +503,11 @@ double mlirDenseElementsAttrGetDoubleValue(MlirAttribute attr, intptr_t pos) {
   return *(unwrap(attr).cast<DenseElementsAttr>().getValues<double>().begin() +
            pos);
 }
-void mlirDenseElementsAttrGetStringValue(MlirAttribute attr, intptr_t pos,
-                                         MlirStringCallback callback,
-                                         void *userData) {
-  StringRef str =
+MlirStringRef mlirDenseElementsAttrGetStringValue(MlirAttribute attr,
+                                                  intptr_t pos) {
+  return wrap(
       *(unwrap(attr).cast<DenseElementsAttr>().getValues<StringRef>().begin() +
-        pos);
-  callback(str.data(), str.size(), userData);
+        pos));
 }

/*============================================================================*/
diff --git a/mlir/lib/CAPI/IR/Support.cpp b/mlir/lib/CAPI/IR/Support.cpp
new file mode 100644
index 0000000000000..e4b409906297d
--- /dev/null
+++ b/mlir/lib/CAPI/IR/Support.cpp
@@ -0,0 +1,15 @@
+//===- Support.cpp - Helpers for C interface to MLIR API ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir-c/Support.h"
+
+#include <cstring>
+
+MlirStringRef mlirStringRefCreateFromCString(const char *str) {
+  return mlirStringRefCreate(str, strlen(str));
+}
diff --git a/mlir/lib/Conversion/LinalgToStandard/LinalgToStandard.cpp b/mlir/lib/Conversion/LinalgToStandard/LinalgToStandard.cpp
index d56dffdd0dc17..93b7764a6a773 100644
--- a/mlir/lib/Conversion/LinalgToStandard/LinalgToStandard.cpp
+++ b/mlir/lib/Conversion/LinalgToStandard/LinalgToStandard.cpp
@@ -244,6 +244,7 @@ void mlir::populateLinalgToStandardConversionPatterns(
       LinalgOpConversion,
       LinalgOpConversion,
       LinalgOpConversion,
+      LinalgOpConversion,
       LinalgOpConversion,
       LinalgOpConversion,
       LinalgOpConversion,
diff --git a/mlir/lib/Conversion/ShapeToStandard/ShapeToStandard.cpp b/mlir/lib/Conversion/ShapeToStandard/ShapeToStandard.cpp
index 8c917e08f942c..0a6953842a149 100644
--- a/mlir/lib/Conversion/ShapeToStandard/ShapeToStandard.cpp
+++ b/mlir/lib/Conversion/ShapeToStandard/ShapeToStandard.cpp
@@ -182,8 +182,9 @@ LogicalResult ConstShapeOpConverter::matchAndRewrite(
     extentOperands.push_back(
         rewriter.create<ConstantIndexOp>(loc, extent.getLimitedValue()));
   }
-  Value tensor = rewriter.create<TensorFromElementsOp>(loc, extentOperands);
   Type indexTy = rewriter.getIndexType();
+  Value tensor =
+      rewriter.create<TensorFromElementsOp>(loc, indexTy, extentOperands);
   Type resultTy = RankedTensorType::get({ShapedType::kDynamicSize}, indexTy);
   rewriter.replaceOpWithNewOp<TensorCastOp>(op, tensor, resultTy);
   return success();
@@ -422,6 +423,7 @@ LogicalResult ShapeOfOpConversion::matchAndRewrite(
     return failure();

   // For ranked tensor arguments, lower to `tensor_from_elements`.
+  auto loc = op.getLoc();
   ShapeOfOp::Adaptor transformed(operands);
   Value tensor = transformed.arg();
   Type tensorTy = tensor.getType();
@@ -431,7 +433,6 @@ LogicalResult ShapeOfOpConversion::matchAndRewrite(
     SmallVector<Value, 8> extentValues;
     RankedTensorType rankedTensorTy = tensorTy.cast<RankedTensorType>();
     int64_t rank = rankedTensorTy.getRank();
-    auto loc = op.getLoc();
     for (int64_t i = 0; i < rank; i++) {
       if (rankedTensorTy.isDynamicDim(i)) {
         Value extent = rewriter.create<DimOp>(loc, tensor, i);
@@ -444,33 +445,24 @@ LogicalResult ShapeOfOpConversion::matchAndRewrite(
     }

     // Materialize extent tensor.
-    Value staticExtentTensor =
-        rewriter.create<TensorFromElementsOp>(loc, extentValues);
+    Value staticExtentTensor = rewriter.create<TensorFromElementsOp>(
+        loc, rewriter.getIndexType(), extentValues);
     rewriter.replaceOpWithNewOp<TensorCastOp>(op, staticExtentTensor,
                                               op.getType());
     return success();
   }

-  // Allocate stack memory.
-  auto loc = op.getLoc();
+  // Lower to `dynamic_tensor_from_elements` otherwise.
+  auto *ctx = rewriter.getContext();
   Value rank = rewriter.create<RankOp>(loc, tensor);
-  Type indexTy = rewriter.getIndexType();
-  Type memTy = MemRefType::get({ShapedType::kDynamicSize}, indexTy);
-  Value mem = rewriter.create<AllocaOp>(loc, memTy, ValueRange{rank});
-
-  // Copy shape extents to stack-allocated memory.
- Value zero = rewriter.create(loc, 0); - Value one = rewriter.create(loc, 1); - rewriter.create( - loc, zero, rank, one, llvm::None, - [&](OpBuilder &b, Location loc, Value iv, ValueRange args) { - Value dim = rewriter.create(loc, tensor, iv); - rewriter.create(loc, dim, mem, ValueRange{iv}); - rewriter.create(loc); + rewriter.replaceOpWithNewOp( + op, getExtentTensorType(ctx), ValueRange{rank}, + [&](OpBuilder &b, Location loc, ValueRange args) { + Value dim = args.front(); + Value extent = b.create(loc, tensor, dim); + b.create(loc, extent); }); - // Load extents to tensor value. - rewriter.replaceOpWithNewOp(op.getOperation(), mem); return success(); } diff --git a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp index 55a926ef1423d..814a2550015d8 100644 --- a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp +++ b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp @@ -642,9 +642,11 @@ void MemRefDescriptor::setConstantStride(OpBuilder &builder, Location loc, createIndexAttrConstant(builder, loc, indexType, stride)); } -LLVM::LLVMType MemRefDescriptor::getElementType() { - return value.getType().cast().getStructElementType( - kAlignedPtrPosInMemRefDescriptor); +LLVM::LLVMPointerType MemRefDescriptor::getElementPtrType() { + return value.getType() + .cast() + .getStructElementType(kAlignedPtrPosInMemRefDescriptor) + .cast(); } /// Creates a MemRef descriptor structure from a list of individual values @@ -894,7 +896,7 @@ Value ConvertToLLVMPattern::getStridedElementPtr( Value ConvertToLLVMPattern::getDataPtr( Location loc, MemRefType type, Value memRefDesc, ValueRange indices, ConversionPatternRewriter &rewriter) const { - LLVM::LLVMType ptrType = MemRefDescriptor(memRefDesc).getElementType(); + LLVM::LLVMType ptrType = MemRefDescriptor(memRefDesc).getElementPtrType(); int64_t offset; SmallVector strides; auto successStrides = getStridesAndOffset(type, strides, offset); @@ -1110,6 +1112,8 @@ struct FuncOpConversionBase : public ConvertOpToLLVMPattern { TypeConverter::SignatureConversion result(funcOp.getNumArguments()); auto llvmType = typeConverter.convertFunctionSignature( funcOp.getType(), varargsAttr && varargsAttr.getValue(), result); + if (!llvmType) + return nullptr; // Propagate argument attributes to all converted arguments obtained after // converting a given original argument. 
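The dyn_cast to dyn_cast_or_null switch in the next hunk pairs with the new null check above: convertFunctionSignature/convertType can now fail and hand back a null Type, and dyn_cast asserts on a null input while dyn_cast_or_null folds the null case into the failure path. A self-contained sketch of that distinction using LLVM-style RTTI on a toy hierarchy (the Ty/LLVMTy names are invented for illustration):

#include "llvm/Support/Casting.h"
#include <cstdio>

struct Ty {
  enum Kind { LLVMKind, OtherKind } kind;
  Ty(Kind k) : kind(k) {}
};
struct LLVMTy : Ty {
  LLVMTy() : Ty(LLVMKind) {}
  static bool classof(const Ty *t) { return t->kind == LLVMKind; }
};

int main() {
  Ty *failed = nullptr; // models a type conversion that returned null
  // llvm::dyn_cast<LLVMTy>(failed) would assert on the null input;
  // dyn_cast_or_null simply yields null, letting the caller bail out.
  if (llvm::dyn_cast_or_null<LLVMTy>(failed) == nullptr)
    std::puts("conversion failed; propagate the empty result");
  return 0;
}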
@@ -3386,7 +3390,7 @@ Type LLVMTypeConverter::packFunctionResults(ArrayRef types) { SmallVector resultTypes; resultTypes.reserve(types.size()); for (auto t : types) { - auto converted = convertType(t).dyn_cast(); + auto converted = convertType(t).dyn_cast_or_null(); if (!converted) return {}; resultTypes.push_back(converted); diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index a43bec855ff0a..73fd3285ec974 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -198,7 +198,7 @@ static LogicalResult getBasePtr(ConversionPatternRewriter &rewriter, Value base; if (failed(getBase(rewriter, loc, memref, memRefType, base))) return failure(); - auto pType = MemRefDescriptor(memref).getElementType(); + auto pType = MemRefDescriptor(memref).getElementPtrType(); ptr = rewriter.create(loc, pType, base); return success(); } @@ -225,7 +225,7 @@ static LogicalResult getIndexedPtrs(ConversionPatternRewriter &rewriter, Value base; if (failed(getBase(rewriter, loc, memref, memRefType, base))) return failure(); - auto pType = MemRefDescriptor(memref).getElementType(); + auto pType = MemRefDescriptor(memref).getElementPtrType(); auto ptrsType = LLVM::LLVMType::getVectorTy(pType, vType.getDimSize(0)); ptrs = rewriter.create(loc, ptrsType, base, indices); return success(); @@ -1096,7 +1096,7 @@ static bool isContiguous(MemRefType memRefType, SmallVectorImpl &strides) { int64_t offset; auto successStrides = getStridesAndOffset(memRefType, strides, offset); - bool isContiguous = (strides.back() == 1); + bool isContiguous = strides.empty() || strides.back() == 1; if (isContiguous) { auto sizes = memRefType.getShape(); for (int index = 0, e = strides.size() - 2; index < e; ++index) { @@ -1151,7 +1151,7 @@ class VectorTypeCastOpConversion : public ConvertToLLVMPattern { // Create descriptor. auto desc = MemRefDescriptor::undef(rewriter, loc, llvmTargetDescriptorTy); - Type llvmTargetElementTy = desc.getElementType(); + Type llvmTargetElementTy = desc.getElementPtrType(); // Set allocated ptr. Value allocated = sourceMemRef.allocatedPtr(rewriter, loc); allocated = diff --git a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp index 8f7d43829846b..c0d283d7af451 100644 --- a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp +++ b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp @@ -108,17 +108,10 @@ class NDTransferOpHelper { private: /// Creates the loop nest on the "major" dimensions and calls the /// `loopBodyBuilder` lambda in the context of the loop nest. - template - void emitLoops(Lambda loopBodyBuilder); - - /// Operate within the body of `emitLoops` to: - /// 1. Compute the indexings `majorIvs + majorOffsets` and save them in - /// `majorIvsPlusOffsets`. - /// 2. Return a boolean that determines whether the first `majorIvs.rank()` - /// dimensions `majorIvs + majorOffsets` are all within `memrefBounds`. - Value emitInBoundsCondition(ValueRange majorIvs, ValueRange majorOffsets, - MemRefBoundsCapture &memrefBounds, - SmallVectorImpl &majorIvsPlusOffsets); + void + emitLoops(llvm::function_ref + loopBodyBuilder); /// Common state to lower vector transfer ops. 
PatternRewriter &rewriter; @@ -140,8 +133,10 @@ class NDTransferOpHelper { }; template -template -void NDTransferOpHelper::emitLoops(Lambda loopBodyBuilder) { +void NDTransferOpHelper::emitLoops( + llvm::function_ref + loopBodyBuilder) { /// Loop nest operates on the major dimensions MemRefBoundsCapture memrefBoundsCapture(xferOp.memref()); @@ -196,11 +191,16 @@ static Value onTheFlyFoldSLT(Value v, Value ub) { return slt(v, ub); } -template -Value NDTransferOpHelper::emitInBoundsCondition( - ValueRange majorIvs, ValueRange majorOffsets, - MemRefBoundsCapture &memrefBounds, - SmallVectorImpl &majorIvsPlusOffsets) { +/// 1. Compute the indexings `majorIvs + majorOffsets` and save them in +/// `majorIvsPlusOffsets`. +/// 2. Return a value of i1 that determines whether the first `majorIvs.rank()` +/// dimensions `majorIvs + majorOffsets` are all within `memrefBounds`. +static Value +emitInBoundsCondition(PatternRewriter &rewriter, + VectorTransferOpInterface xferOp, unsigned leadingRank, + ValueRange majorIvs, ValueRange majorOffsets, + const MemRefBoundsCapture &memrefBounds, + SmallVectorImpl &majorIvsPlusOffsets) { Value inBoundsCondition; majorIvsPlusOffsets.reserve(majorIvs.size()); unsigned idx = 0; @@ -246,7 +246,7 @@ LogicalResult NDTransferOpHelper::doReplace() { emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets, ValueRange majorOffsets, ValueRange minorOffsets, - MemRefBoundsCapture &memrefBounds) { + const MemRefBoundsCapture &memrefBounds) { /// Lambda to load 1-D vector in the current loop ivs + offset context. auto load1DVector = [&](ValueRange majorIvsPlusOffsets) -> Value { SmallVector indexing; @@ -271,7 +271,8 @@ LogicalResult NDTransferOpHelper::doReplace() { // context. SmallVector majorIvsPlusOffsets; Value inBoundsCondition = emitInBoundsCondition( - majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets); + rewriter, cast(xferOp.getOperation()), + leadingRank, majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets); if (inBoundsCondition) { // 2. If the condition is not null, we need an IfOp, which may yield @@ -344,7 +345,7 @@ LogicalResult NDTransferOpHelper::doReplace() { emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets, ValueRange majorOffsets, ValueRange minorOffsets, - MemRefBoundsCapture &memrefBounds) { + const MemRefBoundsCapture &memrefBounds) { // Lower to 1-D vector_transfer_write and let recursion handle it. auto emitTransferWrite = [&](ValueRange majorIvsPlusOffsets) { SmallVector indexing; @@ -374,7 +375,8 @@ LogicalResult NDTransferOpHelper::doReplace() { // context. SmallVector majorIvsPlusOffsets; Value inBoundsCondition = emitInBoundsCondition( - majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets); + rewriter, cast(xferOp.getOperation()), + leadingRank, majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets); if (inBoundsCondition) { // 2.a. If the condition is not null, we need an IfOp, to write @@ -424,120 +426,90 @@ static int computeCoalescedIndex(TransferOpTy transfer) { return coalescedIdx; } -/// Emits remote memory accesses that are clipped to the boundaries of the -/// MemRef. template -static SmallVector -clip(TransferOpTy transfer, MemRefBoundsCapture &bounds, ArrayRef ivs) { - using namespace mlir::edsc; - - Value zero(std_constant_index(0)), one(std_constant_index(1)); - SmallVector memRefAccess(transfer.indices()); - SmallVector clippedScalarAccessExprs(memRefAccess.size()); - // Indices accessing to remote memory are clipped and their expressions are - // returned in clippedScalarAccessExprs. 
- for (unsigned memRefDim = 0; memRefDim < clippedScalarAccessExprs.size(); +VectorTransferRewriter::VectorTransferRewriter( + VectorTransferToSCFOptions options, MLIRContext *context) + : RewritePattern(TransferOpTy::getOperationName(), 1, context), + options(options) {} + +/// Used for staging the transfer in a local buffer. +template +MemRefType VectorTransferRewriter::tmpMemRefType( + TransferOpTy transfer) const { + auto vectorType = transfer.getVectorType(); + return MemRefType::get(vectorType.getShape().drop_back(), + VectorType::get(vectorType.getShape().take_back(), + vectorType.getElementType()), + {}, 0); +} + +static void emitWithBoundsChecks( + PatternRewriter &rewriter, VectorTransferOpInterface transfer, + ValueRange ivs, const MemRefBoundsCapture &memRefBoundsCapture, + function_ref)> inBoundsFun, + function_ref)> outOfBoundsFun = nullptr) { + // Permute the incoming indices according to the permutation map. + SmallVector indices = + linalg::applyMapToValues(rewriter, transfer.getLoc(), + transfer.permutation_map(), transfer.indices()); + + // Generate a bounds check if necessary. + SmallVector majorIvsPlusOffsets; + Value inBoundsCondition = + emitInBoundsCondition(rewriter, transfer, 0, ivs, indices, + memRefBoundsCapture, majorIvsPlusOffsets); + + // Apply the permutation map to the ivs. The permutation map may not use all + // the inputs. + SmallVector scalarAccessExprs(transfer.indices().size()); + for (unsigned memRefDim = 0; memRefDim < transfer.indices().size(); ++memRefDim) { // Linear search on a small number of entries. int loopIndex = -1; auto exprs = transfer.permutation_map().getResults(); for (auto en : llvm::enumerate(exprs)) { auto expr = en.value(); - auto dim = expr.template dyn_cast(); + auto dim = expr.dyn_cast(); // Sanity check. - assert( - (dim || expr.template cast().getValue() == 0) && - "Expected dim or 0 in permutationMap"); + assert((dim || expr.cast().getValue() == 0) && + "Expected dim or 0 in permutationMap"); if (dim && memRefDim == dim.getPosition()) { loopIndex = en.index(); break; } } - // We cannot distinguish atm between unrolled dimensions that implement - // the "always full" tile abstraction and need clipping from the other - // ones. So we conservatively clip everything. using namespace edsc::op; - auto N = bounds.ub(memRefDim); - auto i = memRefAccess[memRefDim]; - if (loopIndex < 0) { - auto N_minus_1 = N - one; - auto select_1 = std_select(slt(i, N), i, N_minus_1); - clippedScalarAccessExprs[memRefDim] = - std_select(slt(i, zero), zero, select_1); - } else { - auto ii = ivs[loopIndex]; - auto i_plus_ii = i + ii; - auto N_minus_1 = N - one; - auto select_1 = std_select(slt(i_plus_ii, N), i_plus_ii, N_minus_1); - clippedScalarAccessExprs[memRefDim] = - std_select(slt(i_plus_ii, zero), zero, select_1); - } + auto i = transfer.indices()[memRefDim]; + scalarAccessExprs[memRefDim] = loopIndex < 0 ? i : i + ivs[loopIndex]; } - return clippedScalarAccessExprs; + if (inBoundsCondition) + conditionBuilder( + /* scf.if */ inBoundsCondition, // { + [&] { inBoundsFun(scalarAccessExprs); }, + // } else { + outOfBoundsFun ? [&] { outOfBoundsFun(scalarAccessExprs); } + : function_ref() + // } + ); + else + inBoundsFun(scalarAccessExprs); } namespace mlir { -template -VectorTransferRewriter::VectorTransferRewriter( - VectorTransferToSCFOptions options, MLIRContext *context) - : RewritePattern(TransferOpTy::getOperationName(), 1, context), - options(options) {} - -/// Used for staging the transfer in a local buffer. 
-template -MemRefType VectorTransferRewriter::tmpMemRefType( - TransferOpTy transfer) const { - auto vectorType = transfer.getVectorType(); - return MemRefType::get(vectorType.getShape().drop_back(), - VectorType::get(vectorType.getShape().take_back(), - vectorType.getElementType()), - {}, 0); -} - /// Lowers TransferReadOp into a combination of: /// 1. local memory allocation; /// 2. perfect loop nest over: /// a. scalar load from local buffers (viewed as a scalar memref); -/// a. scalar store to original memref (with clipping). +/// a. scalar store to original memref (with padding). /// 3. vector_load from local buffer (viewed as a memref<1 x vector>); /// 4. local memory deallocation. /// /// Lowers the data transfer part of a TransferReadOp while ensuring no /// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by -/// clipping. This means that a given value in memory can be read multiple -/// times and concurrently. -/// -/// Important notes about clipping and "full-tiles only" abstraction: -/// ================================================================= -/// When using clipping for dealing with boundary conditions, the same edge -/// value will appear multiple times (a.k.a edge padding). This is fine if the -/// subsequent vector operations are all data-parallel but **is generally -/// incorrect** in the presence of reductions or extract operations. -/// -/// More generally, clipping is a scalar abstraction that is expected to work -/// fine as a baseline for CPUs and GPUs but not for vector_load and DMAs. -/// To deal with real vector_load and DMAs, a "padded allocation + view" -/// abstraction with the ability to read out-of-memref-bounds (but still within -/// the allocated region) is necessary. -/// -/// Whether using scalar loops or vector_load/DMAs to perform the transfer, -/// junk values will be materialized in the vectors and generally need to be -/// filtered out and replaced by the "neutral element". This neutral element is -/// op-dependent so, in the future, we expect to create a vector filter and -/// apply it to a splatted constant vector with the proper neutral element at -/// each ssa-use. This filtering is not necessary for pure data-parallel -/// operations. -/// -/// In the case of vector_store/DMAs, Read-Modify-Write will be required, which -/// also have concurrency implications. Note that by using clipped scalar stores -/// in the presence of data-parallel only operations, we generate code that -/// writes the same value multiple time on the edge locations. -/// -/// TODO: implement alternatives to clipping. -/// TODO: support non-data-parallel operations. +/// padding. /// Performs the rewrite. template <> @@ -584,24 +556,31 @@ LogicalResult VectorTransferRewriter::matchAndRewrite( steps.push_back(std_constant_index(step)); // 2. Emit alloc-copy-load-dealloc. + MLIRContext *ctx = op->getContext(); Value tmp = setAllocAtFunctionEntry(tmpMemRefType(transfer), transfer); StdIndexedValue local(tmp); - Value vec = vector_type_cast(tmp); loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) { - auto ivs = llvm::to_vector<8>(loopIvs); + auto ivsStorage = llvm::to_vector<8>(loopIvs); // Swap the ivs which will reorder memory accesses. if (coalescedIdx >= 0) - std::swap(ivs.back(), ivs[coalescedIdx]); - // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist). 
- SmallVector indices = clip(transfer, memRefBoundsCapture, ivs); - ArrayRef indicesRef(indices), ivsRef(ivs); - Value pos = - std_index_cast(IntegerType::get(32, op->getContext()), ivsRef.back()); - Value vector = vector_insert_element(remote(indicesRef), - local(ivsRef.drop_back()), pos); - local(ivsRef.drop_back()) = vector; + std::swap(ivsStorage.back(), ivsStorage[coalescedIdx]); + + ArrayRef ivs(ivsStorage); + Value pos = std_index_cast(IntegerType::get(32, ctx), ivs.back()); + Value inVector = local(ivs.drop_back()); + auto loadValue = [&](ArrayRef indices) { + Value vector = vector_insert_element(remote(indices), inVector, pos); + local(ivs.drop_back()) = vector; + }; + auto loadPadding = [&](ArrayRef) { + Value vector = vector_insert_element(transfer.padding(), inVector, pos); + local(ivs.drop_back()) = vector; + }; + emitWithBoundsChecks( + rewriter, cast(transfer.getOperation()), ivs, + memRefBoundsCapture, loadValue, loadPadding); }); - Value vectorValue = std_load(vec); + Value vectorValue = std_load(vector_type_cast(tmp)); // 3. Propagate. rewriter.replaceOp(op, vectorValue); @@ -613,19 +592,11 @@ LogicalResult VectorTransferRewriter::matchAndRewrite( /// 2. vector_store to local buffer (viewed as a memref<1 x vector>); /// 3. perfect loop nest over: /// a. scalar load from local buffers (viewed as a scalar memref); -/// a. scalar store to original memref (with clipping). +/// a. scalar store to original memref (if in bounds). /// 4. local memory deallocation. /// /// More specifically, lowers the data transfer part while ensuring no -/// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by -/// clipping. This means that a given value in memory can be written to multiple -/// times and concurrently. -/// -/// See `Important notes about clipping and full-tiles only abstraction` in the -/// description of `readClipped` above. -/// -/// TODO: implement alternatives to clipping. -/// TODO: support non-data-parallel operations. +/// out-of-bounds accesses are possible. template <> LogicalResult VectorTransferRewriter::matchAndRewrite( Operation *op, PatternRewriter &rewriter) const { @@ -675,17 +646,21 @@ LogicalResult VectorTransferRewriter::matchAndRewrite( Value vec = vector_type_cast(tmp); std_store(vectorValue, vec); loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) { - auto ivs = llvm::to_vector<8>(loopIvs); - // Swap the ivs which will reorder memory accesses. + auto ivsStorage = llvm::to_vector<8>(loopIvs); + // Swap the ivsStorage which will reorder memory accesses. if (coalescedIdx >= 0) - std::swap(ivs.back(), ivs[coalescedIdx]); - // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist). - SmallVector indices = clip(transfer, memRefBoundsCapture, ivs); - ArrayRef indicesRef(indices), ivsRef(ivs); + std::swap(ivsStorage.back(), ivsStorage[coalescedIdx]); + + ArrayRef ivs(ivsStorage); Value pos = - std_index_cast(IntegerType::get(32, op->getContext()), ivsRef.back()); - Value scalar = vector_extract_element(local(ivsRef.drop_back()), pos); - remote(indices) = scalar; + std_index_cast(IntegerType::get(32, op->getContext()), ivs.back()); + auto storeValue = [&](ArrayRef indices) { + Value scalar = vector_extract_element(local(ivs.drop_back()), pos); + remote(indices) = scalar; + }; + emitWithBoundsChecks( + rewriter, cast(transfer.getOperation()), ivs, + memRefBoundsCapture, storeValue); }); // 3. Erase. 
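The user-visible effect of the VectorToSCF rewrite above: an out-of-bounds element of a vector transfer now reads the op's padding value instead of a clipped, edge-repeated in-memory value. A stand-alone scalar analogue of the two policies (helper names and values are invented; -42 mirrors the padding constant in the updated integration test):

#include <algorithm>
#include <cstdio>
#include <vector>

// Old behavior: clamp the index into bounds, repeating the edge element.
static float readClipped(const std::vector<float> &mem, long i) {
  long last = static_cast<long>(mem.size()) - 1;
  return mem[std::clamp(i, 0L, last)];
}

// New behavior: materialize the padding value for out-of-bounds reads.
static float readPadded(const std::vector<float> &mem, long i, float pad) {
  return (i >= 0 && i < static_cast<long>(mem.size())) ? mem[i] : pad;
}

int main() {
  std::vector<float> mem{1.0f, 2.0f, 3.0f};
  std::printf("clipped: %g, padded: %g\n", readClipped(mem, 5),
              readPadded(mem, 5, -42.0f));
  return 0;
}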
diff --git a/mlir/lib/Dialect/AVX512/IR/AVX512Dialect.cpp b/mlir/lib/Dialect/AVX512/IR/AVX512Dialect.cpp index 3595970c38f25..697f00864b15b 100644 --- a/mlir/lib/Dialect/AVX512/IR/AVX512Dialect.cpp +++ b/mlir/lib/Dialect/AVX512/IR/AVX512Dialect.cpp @@ -25,10 +25,5 @@ void avx512::AVX512Dialect::initialize() { >(); } -namespace mlir { -namespace avx512 { #define GET_OP_CLASSES #include "mlir/Dialect/AVX512/AVX512.cpp.inc" -} // namespace avx512 -} // namespace mlir - diff --git a/mlir/lib/Dialect/Affine/EDSC/Builders.cpp b/mlir/lib/Dialect/Affine/EDSC/Builders.cpp index a96ba970afde7..11926d26368be 100644 --- a/mlir/lib/Dialect/Affine/EDSC/Builders.cpp +++ b/mlir/lib/Dialect/Affine/EDSC/Builders.cpp @@ -47,8 +47,9 @@ void mlir::edsc::affineLoopBuilder(ValueRange lbs, ValueRange ubs, int64_t step, // updating the scoped context. builder.create( loc, lbs, builder.getMultiDimIdentityMap(lbs.size()), ubs, - builder.getMultiDimIdentityMap(ubs.size()), step, - [&](OpBuilder &nestedBuilder, Location nestedLoc, Value iv) { + builder.getMultiDimIdentityMap(ubs.size()), step, llvm::None, + [&](OpBuilder &nestedBuilder, Location nestedLoc, Value iv, + ValueRange itrArgs) { if (bodyBuilderFn) { ScopedContext nestedContext(nestedBuilder, nestedLoc); OpBuilder::InsertionGuard guard(nestedBuilder); @@ -58,6 +59,30 @@ void mlir::edsc::affineLoopBuilder(ValueRange lbs, ValueRange ubs, int64_t step, }); } +void mlir::edsc::affineLoopBuilder( + ValueRange lbs, ValueRange ubs, int64_t step, ValueRange iterArgs, + function_ref bodyBuilderFn) { + // Fetch the builder and location. + assert(ScopedContext::getContext() && "EDSC ScopedContext not set up"); + OpBuilder &builder = ScopedContext::getBuilderRef(); + Location loc = ScopedContext::getLocation(); + + // Create the actual loop and call the body builder, if provided, after + // updating the scoped context. + builder.create( + loc, lbs, builder.getMultiDimIdentityMap(lbs.size()), ubs, + builder.getMultiDimIdentityMap(ubs.size()), step, iterArgs, + [&](OpBuilder &nestedBuilder, Location nestedLoc, Value iv, + ValueRange itrArgs) { + if (bodyBuilderFn) { + ScopedContext nestedContext(nestedBuilder, nestedLoc); + OpBuilder::InsertionGuard guard(nestedBuilder); + bodyBuilderFn(iv, itrArgs); + } else if (itrArgs.empty()) + nestedBuilder.create(nestedLoc); + }); +} + static std::pair categorizeValueByAffineType(MLIRContext *context, Value val, unsigned &numDims, unsigned &numSymbols) { diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index f3473859e88c9..440875db39181 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -1173,10 +1173,12 @@ LogicalResult AffineDmaWaitOp::fold(ArrayRef cstOperands, // AffineForOp //===----------------------------------------------------------------------===// -void AffineForOp::build( - OpBuilder &builder, OperationState &result, ValueRange lbOperands, - AffineMap lbMap, ValueRange ubOperands, AffineMap ubMap, int64_t step, - function_ref bodyBuilder) { +/// 'bodyBuilder' is used to build the body of affine.for. If iterArgs and +/// bodyBuilder are empty/null, we include default terminator op. 
+void AffineForOp::build(OpBuilder &builder, OperationState &result, + ValueRange lbOperands, AffineMap lbMap, + ValueRange ubOperands, AffineMap ubMap, int64_t step, + ValueRange iterArgs, BodyBuilderFn bodyBuilder) { assert(((!lbMap && lbOperands.empty()) || lbOperands.size() == lbMap.getNumInputs()) && "lower bound operand count does not match the affine map"); @@ -1185,6 +1187,9 @@ void AffineForOp::build( "upper bound operand count does not match the affine map"); assert(step > 0 && "step has to be a positive integer constant"); + for (Value val : iterArgs) + result.addTypes(val.getType()); + // Add an attribute for the step. result.addAttribute(getStepAttrName(), builder.getIntegerAttr(builder.getIndexType(), step)); @@ -1197,56 +1202,75 @@ void AffineForOp::build( result.addAttribute(getUpperBoundAttrName(), AffineMapAttr::get(ubMap)); result.addOperands(ubOperands); + result.addOperands(iterArgs); // Create a region and a block for the body. The argument of the region is // the loop induction variable. Region *bodyRegion = result.addRegion(); - Block *body = new Block; - Value inductionVar = body->addArgument(IndexType::get(builder.getContext())); - bodyRegion->push_back(body); - if (bodyBuilder) { - OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToStart(body); - bodyBuilder(builder, result.location, inductionVar); - } else { + bodyRegion->push_back(new Block); + Block &bodyBlock = bodyRegion->front(); + Value inductionVar = bodyBlock.addArgument(builder.getIndexType()); + for (Value val : iterArgs) + bodyBlock.addArgument(val.getType()); + + // Create the default terminator if the builder is not provided and if the + // iteration arguments are not provided. Otherwise, leave this to the caller + // because we don't know which values to return from the loop. + if (iterArgs.empty() && !bodyBuilder) { ensureTerminator(*bodyRegion, builder, result.location); + } else if (bodyBuilder) { + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(&bodyBlock); + bodyBuilder(builder, result.location, inductionVar, + bodyBlock.getArguments().drop_front()); } } -void AffineForOp::build( - OpBuilder &builder, OperationState &result, int64_t lb, int64_t ub, - int64_t step, - function_ref bodyBuilder) { +void AffineForOp::build(OpBuilder &builder, OperationState &result, int64_t lb, + int64_t ub, int64_t step, ValueRange iterArgs, + BodyBuilderFn bodyBuilder) { auto lbMap = AffineMap::getConstantMap(lb, builder.getContext()); auto ubMap = AffineMap::getConstantMap(ub, builder.getContext()); - return build(builder, result, {}, lbMap, {}, ubMap, step, bodyBuilder); + return build(builder, result, {}, lbMap, {}, ubMap, step, iterArgs, + bodyBuilder); } static LogicalResult verify(AffineForOp op) { // Check that the body defines as single block argument for the induction // variable. auto *body = op.getBody(); - if (body->getNumArguments() != 1 || !body->getArgument(0).getType().isIndex()) + if (body->getNumArguments() == 0 || !body->getArgument(0).getType().isIndex()) return op.emitOpError( "expected body to have a single index argument for the " "induction variable"); - // Verify that there are enough operands for the bounds. 
- AffineMap lowerBoundMap = op.getLowerBoundMap(), - upperBoundMap = op.getUpperBoundMap(); - if (op.getNumOperands() != - (lowerBoundMap.getNumInputs() + upperBoundMap.getNumInputs())) - return op.emitOpError( - "operand count must match with affine map dimension and symbol count"); - // Verify that the bound operands are valid dimension/symbols. /// Lower bound. - if (failed(verifyDimAndSymbolIdentifiers(op, op.getLowerBoundOperands(), - op.getLowerBoundMap().getNumDims()))) - return failure(); + if (op.getLowerBoundMap().getNumInputs() > 0) + if (failed( + verifyDimAndSymbolIdentifiers(op, op.getLowerBoundOperands(), + op.getLowerBoundMap().getNumDims()))) + return failure(); /// Upper bound. - if (failed(verifyDimAndSymbolIdentifiers(op, op.getUpperBoundOperands(), - op.getUpperBoundMap().getNumDims()))) - return failure(); + if (op.getUpperBoundMap().getNumInputs() > 0) + if (failed( + verifyDimAndSymbolIdentifiers(op, op.getUpperBoundOperands(), + op.getUpperBoundMap().getNumDims()))) + return failure(); + + unsigned opNumResults = op.getNumResults(); + if (opNumResults == 0) + return success(); + + // If ForOp defines values, check that the number and types of the defined + // values match ForOp initial iter operands and backedge basic block + // arguments. + if (op.getNumIterOperands() != opNumResults) + return op.emitOpError( + "mismatch between the number of loop-carried values and results"); + if (op.getNumRegionIterArgs() != opNumResults) + return op.emitOpError( + "mismatch between the number of basic block args and results"); + return success(); } @@ -1375,9 +1399,34 @@ static ParseResult parseAffineForOp(OpAsmParser &parser, "expected step to be representable as a positive signed integer"); } + // Parse the optional initial iteration arguments. + SmallVector regionArgs, operands; + SmallVector argTypes; + regionArgs.push_back(inductionVariable); + + if (succeeded(parser.parseOptionalKeyword("iter_args"))) { + // Parse assignment list and results type list. + if (parser.parseAssignmentList(regionArgs, operands) || + parser.parseArrowTypeList(result.types)) + return failure(); + // Resolve input operands. + for (auto operandType : llvm::zip(operands, result.types)) + if (parser.resolveOperand(std::get<0>(operandType), + std::get<1>(operandType), result.operands)) + return failure(); + } + // Induction variable. + Type indexType = builder.getIndexType(); + argTypes.push_back(indexType); + // Loop carried variables. + argTypes.append(result.types.begin(), result.types.end()); // Parse the body region. 
Region *body = result.addRegion(); - if (parser.parseRegion(*body, inductionVariable, builder.getIndexType())) + if (regionArgs.size() != argTypes.size()) + return parser.emitError( + parser.getNameLoc(), + "mismatch between the number of loop-carried values and results"); + if (parser.parseRegion(*body, regionArgs, argTypes)) return failure(); AffineForOp::ensureTerminator(*body, builder, result.location); @@ -1427,6 +1476,13 @@ static void printBound(AffineMapAttr boundMap, map.getNumDims(), p); } +unsigned AffineForOp::getNumIterOperands() { + AffineMap lbMap = getLowerBoundMapAttr().getValue(); + AffineMap ubMap = getUpperBoundMapAttr().getValue(); + + return getNumOperands() - lbMap.getNumInputs() - ubMap.getNumInputs(); +} + static void print(OpAsmPrinter &p, AffineForOp op) { p << op.getOperationName() << ' '; p.printOperand(op.getBody()->getArgument(0)); @@ -1437,9 +1493,22 @@ static void print(OpAsmPrinter &p, AffineForOp op) { if (op.getStep() != 1) p << " step " << op.getStep(); + + bool printBlockTerminators = false; + if (op.getNumIterOperands() > 0) { + p << " iter_args("; + auto regionArgs = op.getRegionIterArgs(); + auto operands = op.getIterOperands(); + + llvm::interleaveComma(llvm::zip(regionArgs, operands), p, [&](auto it) { + p << std::get<0>(it) << " = " << std::get<1>(it); + }); + p << ") -> (" << op.getResultTypes() << ")"; + printBlockTerminators = true; + } + p.printRegion(op.region(), - /*printEntryBlockArgs=*/false, - /*printBlockTerminators=*/false); + /*printEntryBlockArgs=*/false, printBlockTerminators); p.printOptionalAttrDict(op.getAttrs(), /*elidedAttrs=*/{op.getLowerBoundAttrName(), op.getUpperBoundAttrName(), @@ -1555,8 +1624,8 @@ AffineBound AffineForOp::getLowerBound() { AffineBound AffineForOp::getUpperBound() { auto lbMap = getLowerBoundMap(); auto ubMap = getUpperBoundMap(); - return AffineBound(AffineForOp(*this), lbMap.getNumInputs(), getNumOperands(), - ubMap); + return AffineBound(AffineForOp(*this), lbMap.getNumInputs(), + lbMap.getNumInputs() + ubMap.getNumInputs(), ubMap); } void AffineForOp::setLowerBound(ValueRange lbOperands, AffineMap map) { @@ -1567,6 +1636,8 @@ void AffineForOp::setLowerBound(ValueRange lbOperands, AffineMap map) { auto ubOperands = getUpperBoundOperands(); newOperands.append(ubOperands.begin(), ubOperands.end()); + auto iterOperands = getIterOperands(); + newOperands.append(iterOperands.begin(), iterOperands.end()); getOperation()->setOperands(newOperands); setAttr(getLowerBoundAttrName(), AffineMapAttr::get(map)); @@ -1578,6 +1649,8 @@ void AffineForOp::setUpperBound(ValueRange ubOperands, AffineMap map) { SmallVector newOperands(getLowerBoundOperands()); newOperands.append(ubOperands.begin(), ubOperands.end()); + auto iterOperands = getIterOperands(); + newOperands.append(iterOperands.begin(), iterOperands.end()); getOperation()->setOperands(newOperands); setAttr(getUpperBoundAttrName(), AffineMapAttr::get(map)); @@ -1630,7 +1703,9 @@ AffineForOp::operand_range AffineForOp::getLowerBoundOperands() { } AffineForOp::operand_range AffineForOp::getUpperBoundOperands() { - return {operand_begin() + getLowerBoundMap().getNumInputs(), operand_end()}; + return {operand_begin() + getLowerBoundMap().getNumInputs(), + operand_begin() + getLowerBoundMap().getNumInputs() + + getUpperBoundMap().getNumInputs()}; } bool AffineForOp::matchingBoundOperandList() { @@ -1710,8 +1785,8 @@ static void buildAffineLoopNestImpl( ivs.reserve(lbs.size()); for (unsigned i = 0, e = lbs.size(); i < e; ++i) { // Callback for creating the loop body, 
always creates the terminator. - auto loopBody = [&](OpBuilder &nestedBuilder, Location nestedLoc, - Value iv) { + auto loopBody = [&](OpBuilder &nestedBuilder, Location nestedLoc, Value iv, + ValueRange iterArgs) { ivs.push_back(iv); // In the innermost loop, call the body builder. if (i == e - 1 && bodyBuilderFn) { @@ -1729,16 +1804,19 @@ static void buildAffineLoopNestImpl( } /// Creates an affine loop from the bounds known to be constants. -static AffineForOp buildAffineLoopFromConstants( - OpBuilder &builder, Location loc, int64_t lb, int64_t ub, int64_t step, - function_ref bodyBuilderFn) { - return builder.create(loc, lb, ub, step, bodyBuilderFn); +static AffineForOp +buildAffineLoopFromConstants(OpBuilder &builder, Location loc, int64_t lb, + int64_t ub, int64_t step, + AffineForOp::BodyBuilderFn bodyBuilderFn) { + return builder.create(loc, lb, ub, step, /*iterArgs=*/llvm::None, + bodyBuilderFn); } /// Creates an affine loop from the bounds that may or may not be constants. -static AffineForOp buildAffineLoopFromValues( - OpBuilder &builder, Location loc, Value lb, Value ub, int64_t step, - function_ref bodyBuilderFn) { +static AffineForOp +buildAffineLoopFromValues(OpBuilder &builder, Location loc, Value lb, Value ub, + int64_t step, + AffineForOp::BodyBuilderFn bodyBuilderFn) { auto lbConst = lb.getDefiningOp(); auto ubConst = ub.getDefiningOp(); if (lbConst && ubConst) @@ -1747,7 +1825,7 @@ static AffineForOp buildAffineLoopFromValues( bodyBuilderFn); return builder.create(loc, lb, builder.getDimIdentityMap(), ub, builder.getDimIdentityMap(), step, - bodyBuilderFn); + /*iterArgs=*/llvm::None, bodyBuilderFn); } void mlir::buildAffineLoopNest( diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp index 5bded917978a7..56469482c7632 100644 --- a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp @@ -61,278 +61,6 @@ std::unique_ptr> mlir::createLoopTilingPass() { return std::make_unique(); } -// Move the loop body of AffineForOp 'src' from 'src' into the specified -// location in destination's body, ignoring the terminator. -static inline void moveLoopBody(AffineForOp src, AffineForOp dest, - Block::iterator loc) { - auto &insts = src.getBody()->getOperations(); - dest.getBody()->getOperations().splice(loc, insts, insts.begin(), - std::prev(insts.end())); -} - -// Move the loop body of AffineForOp 'src' from 'src' to the start of dest's -// body. -static inline void moveLoopBody(AffineForOp src, AffineForOp dest) { - moveLoopBody(src, dest, dest.getBody()->begin()); -} - -/// Constructs and sets new loop bounds after tiling for the case of -/// hyper-rectangular index sets, where the bounds of one dimension do not -/// depend on other dimensions. Bounds of each dimension can thus be treated -/// independently, and deriving the new bounds is much simpler and faster -/// than for the case of tiling arbitrary polyhedral shapes. -static void -constructTiledIndexSetHyperRect(MutableArrayRef origLoops, - MutableArrayRef newLoops, - ArrayRef tileSizes) { - assert(!origLoops.empty()); - assert(origLoops.size() == tileSizes.size()); - - OpBuilder b(origLoops[0].getOperation()); - unsigned width = origLoops.size(); - - // Bounds for tile space loops. 
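// For intuition, a sketch of the loop structure this produces (illustrative
// IR, not taken from the patch; tile size 32, constant trip count 256):
//
//   affine.for %i = 0 to 256 step 32 {
//     affine.for %ii = affine_map<(d0) -> (d0)>(%i)
//         to affine_map<(d0) -> (d0 + 32)>(%i) {
//       ...
//     }
//   }
//
// When the tile size does not evenly divide the trip count, the intra-tile
// upper bound instead becomes a min over `d0 + 32` and the original upper
// bound, as constructed below.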
- for (unsigned i = 0; i < width; i++) { - OperandRange newLbOperands = origLoops[i].getLowerBoundOperands(); - OperandRange newUbOperands = origLoops[i].getUpperBoundOperands(); - newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap()); - newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap()); - newLoops[i].setStep(tileSizes[i]); - } - // Bounds for intra-tile loops. - for (unsigned i = 0; i < width; i++) { - int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]); - auto mayBeConstantCount = getConstantTripCount(origLoops[i]); - // The lower bound is just the tile-space loop. - AffineMap lbMap = b.getDimIdentityMap(); - newLoops[width + i].setLowerBound( - /*operands=*/newLoops[i].getInductionVar(), lbMap); - - // Set the upper bound. - if (mayBeConstantCount && mayBeConstantCount.getValue() < tileSizes[i]) { - // Trip count is less than the tile size: upper bound is lower bound + - // trip count. - auto ubMap = b.getSingleDimShiftAffineMap(mayBeConstantCount.getValue()); - newLoops[width + i].setUpperBound( - /*operands=*/newLoops[i].getInductionVar(), ubMap); - } else if (largestDiv % tileSizes[i] != 0) { - // Intra-tile loop ii goes from i to min(i + tileSize, ub_i). - // Construct the upper bound map; the operands are the original operands - // with 'i' (tile-space loop) appended to it. The new upper bound map is - // the original one with an additional expression i + tileSize appended. - - // Add dim operands from original upper bound. - SmallVector ubOperands; - auto ub = origLoops[i].getUpperBound(); - ubOperands.reserve(ub.getNumOperands() + 1); - auto origUbMap = ub.getMap(); - for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j) - ubOperands.push_back(ub.getOperand(j)); - - // Add dim operand for new loop upper bound. - ubOperands.push_back(newLoops[i].getInductionVar()); - - // Add symbol operands from original upper bound. - for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j) - ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j)); - - SmallVector boundExprs; - boundExprs.reserve(1 + origUbMap.getNumResults()); - auto dim = b.getAffineDimExpr(origUbMap.getNumDims()); - // The new upper bound map is the original one with an additional - // expression i + tileSize appended. - boundExprs.push_back(dim + tileSizes[i]); - boundExprs.append(origUbMap.getResults().begin(), - origUbMap.getResults().end()); - auto ubMap = - AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(), - boundExprs, b.getContext()); - newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap); - } else { - // No need of the min expression. - auto dim = b.getAffineDimExpr(0); - auto ubMap = AffineMap::get(1, 0, dim + tileSizes[i]); - newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap); - } - } -} - -/// This function checks whether hyper-rectangular loop tiling of the nest -/// represented by `origLoops` is valid. The validity condition is from Irigoin -/// and Triolet, which states that two tiles cannot depend on each other. We -/// simplify such condition to just checking whether there is any negative -/// dependence direction, since we have the prior knowledge that the tiling -/// results will be hyper-rectangles, which are scheduled in the -/// lexicographically increasing order on the vector of loop indices. This -/// function will return failure when any dependence component is negative along -/// any of `origLoops`. 
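// As an illustrative case (hypothetical IR), a nest such as
//
//   affine.for %i = 1 to 100 {
//     affine.for %j = 0 to 99 {
//       %v = affine.load %A[%i - 1, %j + 1] : memref<100x100xf32>
//       affine.store %v, %A[%i, %j] : memref<100x100xf32>
//     }
//   }
//
// carries a dependence whose component along %j is negative, which is the
// kind of dependence this legality check screens for.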
-static LogicalResult -checkTilingLegality(MutableArrayRef origLoops) { - assert(!origLoops.empty() && "no original loops provided"); - - // We first find out all dependences we intend to check. - SmallVector loadAndStoreOps; - origLoops[0].getOperation()->walk([&](Operation *op) { - if (isa(op)) - loadAndStoreOps.push_back(op); - }); - - unsigned numOps = loadAndStoreOps.size(); - unsigned numLoops = origLoops.size(); - FlatAffineConstraints dependenceConstraints; - for (unsigned d = 1; d <= numLoops + 1; ++d) { - for (unsigned i = 0; i < numOps; ++i) { - Operation *srcOp = loadAndStoreOps[i]; - MemRefAccess srcAccess(srcOp); - for (unsigned j = 0; j < numOps; ++j) { - Operation *dstOp = loadAndStoreOps[j]; - MemRefAccess dstAccess(dstOp); - - SmallVector depComps; - dependenceConstraints.reset(); - DependenceResult result = checkMemrefAccessDependence( - srcAccess, dstAccess, d, &dependenceConstraints, &depComps); - - // Skip if there is no dependence in this case. - if (!hasDependence(result)) - continue; - - // Check whether there is any negative direction vector in the - // dependence components found above, which means that dependence is - // violated by the default hyper-rect tiling method. - LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated " - "for dependence at depth: " - << Twine(d) << " between:\n";); - LLVM_DEBUG(srcAccess.opInst->dump();); - LLVM_DEBUG(dstAccess.opInst->dump();); - for (unsigned k = 0, e = depComps.size(); k < e; k++) { - DependenceComponent depComp = depComps[k]; - if (depComp.lb.hasValue() && depComp.ub.hasValue() && - depComp.lb.getValue() < depComp.ub.getValue() && - depComp.ub.getValue() < 0) { - LLVM_DEBUG(llvm::dbgs() - << "Dependence component lb = " - << Twine(depComp.lb.getValue()) - << " ub = " << Twine(depComp.ub.getValue()) - << " is negative at depth: " << Twine(d) - << " and thus violates the legality rule.\n"); - return failure(); - } - } - } - } - } - - return success(); -} -/// Tiles the specified band of perfectly nested loops creating tile-space loops -/// and intra-tile loops. A band is a contiguous set of loops. -// TODO: handle non hyper-rectangular spaces. -LogicalResult -mlir::tilePerfectlyNested(MutableArrayRef input, - ArrayRef tileSizes, - SmallVectorImpl *tiledNest) { - // Check if the supplied for op's are all successively nested. - assert(!input.empty() && "no loops in input band"); - assert(input.size() == tileSizes.size() && "Too few/many tile sizes"); - - assert(isPerfectlyNested(input) && "input loops not perfectly nested"); - - auto origLoops = input; - - // Perform tiling legality test. - if (failed(checkTilingLegality(origLoops))) - origLoops[0].emitRemark("tiled code is illegal due to dependences"); - - AffineForOp rootAffineForOp = origLoops[0]; - auto loc = rootAffineForOp.getLoc(); - // Note that width is at least one since band isn't empty. - unsigned width = input.size(); - - SmallVector tiledLoops(2 * width); - - // The outermost among the loops as we add more.. - auto *topLoop = rootAffineForOp.getOperation(); - AffineForOp innermostPointLoop; - - // Add intra-tile (or point) loops. - for (unsigned i = 0; i < width; i++) { - OpBuilder b(topLoop); - // Loop bounds will be set later. 
- auto pointLoop = b.create(loc, 0, 0); - pointLoop.getBody()->getOperations().splice( - pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(), - topLoop); - tiledLoops[2 * width - 1 - i] = pointLoop; - topLoop = pointLoop.getOperation(); - if (i == 0) - innermostPointLoop = pointLoop; - } - - // Add tile space loops; - for (unsigned i = width; i < 2 * width; i++) { - OpBuilder b(topLoop); - // Loop bounds will be set later. - auto tileSpaceLoop = b.create(loc, 0, 0); - tileSpaceLoop.getBody()->getOperations().splice( - tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(), - topLoop); - tiledLoops[2 * width - i - 1] = tileSpaceLoop; - topLoop = tileSpaceLoop.getOperation(); - } - - // Move the loop body of the original nest to the new one. - moveLoopBody(origLoops.back(), innermostPointLoop); - - SmallVector origLoopIVs; - extractForInductionVars(input, &origLoopIVs); - - FlatAffineConstraints cst; - SmallVector ops; - ops.reserve(input.size()); - for (AffineForOp forOp : input) - ops.push_back(forOp); - getIndexSet(ops, &cst); - if (!cst.isHyperRectangular(0, width)) { - rootAffineForOp.emitError("tiled code generation unimplemented for the " - "non-hyperrectangular case"); - return failure(); - } - - constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes); - - // Replace original IVs with intra-tile loop IVs. - for (unsigned i = 0; i < width; i++) - origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar()); - - // Erase the old loop nest. - rootAffineForOp.erase(); - - if (tiledNest) - *tiledNest = std::move(tiledLoops); - - return success(); -} - -// Identify valid and profitable bands of loops to tile. This is currently just -// a temporary placeholder to test the mechanics of tiled code generation. -// Returns all maximal outermost perfect loop nests to tile. -static void getTileableBands(FuncOp f, - std::vector> *bands) { - // Get maximal perfect nest of 'affine.for' insts starting from root - // (inclusive). - auto getMaximalPerfectLoopNest = [&](AffineForOp root) { - SmallVector band; - getPerfectlyNestedLoops(band, root); - bands->push_back(band); - }; - - for (auto &block : f) - for (auto &op : block) - if (auto forOp = dyn_cast(op)) - getMaximalPerfectLoopNest(forOp); -} - /// Reduces each tile size to the largest divisor of the corresponding trip /// count (if the trip count is known). static void adjustToDivisorsOfTripCounts(ArrayRef band, @@ -340,7 +68,7 @@ static void adjustToDivisorsOfTripCounts(ArrayRef band, assert(band.size() == tileSizes->size() && "invalid tile size count"); for (unsigned i = 0, e = band.size(); i < e; i++) { unsigned &tSizeAdjusted = (*tileSizes)[i]; - auto mayConst = getConstantTripCount(band[i]); + Optional mayConst = getConstantTripCount(band[i]); if (!mayConst) continue; // Adjust the tile size to largest factor of the trip count less than @@ -379,14 +107,14 @@ void LoopTiling::getTileSizes(ArrayRef band, tileSizes->resize(band.size()); // The first loop in the band. - auto rootForOp = band[0]; + AffineForOp rootForOp = band[0]; (void)rootForOp; // Obtain memory footprint and set tile sizes so that a tile fits in // the cache size. This is an approximation with the assumption that the // footprint increases with the tile size linearly in that dimension (i.e., // assumes one-to-one access function). - auto fp = getMemoryFootprintBytes(band[0], 0); + Optional fp = getMemoryFootprintBytes(band[0], 0); if (!fp) { // Fill with default tile sizes if footprint is unknown. 
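// A worked example for adjustToDivisorsOfTripCounts above (hypothetical
// numbers): with a constant trip count of 100 and a requested tile size of
// 32, the divisors of 100 not exceeding 32 are 1, 2, 4, 5, 10, 20 and 25, so
// the tile size is adjusted down to 25 and the tile-space loop then covers
// the iteration space exactly.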
std::fill(tileSizes->begin(), tileSizes->end(), @@ -445,7 +173,7 @@ void LoopTiling::runOnFunction() { getTileSizes(band, &tileSizes); if (llvm::DebugFlag) { auto diag = band[0].emitRemark("using tile sizes ["); - for (auto tSize : tileSizes) + for (unsigned tSize : tileSizes) diag << tSize << ' '; diag << "]\n"; } diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp index edb21384080f4..26669967ff329 100644 --- a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp @@ -9,7 +9,6 @@ // This file implements loop unrolling. // //===----------------------------------------------------------------------===// - #include "PassDetail.h" #include "mlir/Analysis/LoopAnalysis.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" @@ -45,11 +44,13 @@ struct LoopUnroll : public AffineLoopUnrollBase { : AffineLoopUnrollBase(other), getUnrollFactor(other.getUnrollFactor) {} explicit LoopUnroll( - Optional unrollFactor = None, bool unrollFull = false, + Optional unrollFactor = None, bool unrollUpToFactor = false, + bool unrollFull = false, const std::function &getUnrollFactor = nullptr) : getUnrollFactor(getUnrollFactor) { if (unrollFactor) this->unrollFactor = *unrollFactor; + this->unrollUpToFactor = unrollUpToFactor; this->unrollFull = unrollFull; } @@ -126,13 +127,15 @@ LogicalResult LoopUnroll::runOnAffineForOp(AffineForOp forOp) { if (unrollFull) return loopUnrollFull(forOp); // Otherwise, unroll by the given unroll factor. + if (unrollUpToFactor) + return loopUnrollUpToFactor(forOp, unrollFactor); return loopUnrollByFactor(forOp, unrollFactor); } std::unique_ptr> mlir::createLoopUnrollPass( - int unrollFactor, bool unrollFull, + int unrollFactor, bool unrollUpToFactor, bool unrollFull, const std::function &getUnrollFactor) { return std::make_unique( - unrollFactor == -1 ? None : Optional(unrollFactor), unrollFull, - getUnrollFactor); + unrollFactor == -1 ? None : Optional(unrollFactor), + unrollUpToFactor, unrollFull, getUnrollFactor); } diff --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp index 1de7b8957711a..ee52fe44830c4 100644 --- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp @@ -945,7 +945,7 @@ static bool isUniformDefinition(Value value, /// vectorization strategy in 'state'. static Value vectorizeUniform(Value value, VectorizationState *state) { OpBuilder builder(value.getContext()); - builder.setInsertionPointAfter(value); + builder.setInsertionPointAfterValue(value); auto vectorTy = getVectorType(value.getType(), state->strategy); auto bcast = builder.create(value.getLoc(), vectorTy, value); diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index 58f9480c37be0..7dc74f21e2fbf 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -777,10 +777,5 @@ static void print(OpAsmPrinter &p, GPUModuleOp op) { /*printBlockTerminators=*/false); } -// Namespace avoids ambiguous ReturnOpAdaptor. 
-namespace mlir { -namespace gpu { #define GET_OP_CLASSES #include "mlir/Dialect/GPU/GPUOps.cpp.inc" -} // namespace gpu -} // namespace mlir diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp index b42929039a974..b953bad676276 100644 --- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp @@ -23,10 +23,9 @@ using namespace mlir; using namespace mlir::gpu; using namespace mlir::scf; +#include "mlir/Dialect/GPU/ParallelLoopMapperAttr.cpp.inc" #include "mlir/Dialect/GPU/ParallelLoopMapperEnums.cpp.inc" namespace mlir { - -#include "mlir/Dialect/GPU/ParallelLoopMapperAttr.cpp.inc" namespace gpu { StringRef getMappingAttrName() { return "mapping"; } diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMAVX512Dialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMAVX512Dialect.cpp index 9f7e66b0ae0a9..512234cc87646 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMAVX512Dialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMAVX512Dialect.cpp @@ -27,9 +27,5 @@ void LLVM::LLVMAVX512Dialect::initialize() { >(); } -namespace mlir { -namespace LLVM { #define GET_OP_CLASSES #include "mlir/Dialect/LLVMIR/LLVMAVX512.cpp.inc" -} // namespace LLVM -} // namespace mlir diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index cc809b581c843..e13a83854b1e3 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -16,7 +16,6 @@ #include "mlir/Dialect/LLVMIR/NVVMDialect.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/Builders.h" #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Operation.h" @@ -146,10 +145,5 @@ void NVVMDialect::initialize() { allowUnknownOperations(); } -namespace mlir { -namespace NVVM { #define GET_OP_CLASSES #include "mlir/Dialect/LLVMIR/NVVMOps.cpp.inc" -} // namespace NVVM -} // namespace mlir - diff --git a/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp index 70c3558638e6a..afdd9537c6792 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp @@ -91,10 +91,5 @@ void ROCDLDialect::initialize() { allowUnknownOperations(); } -namespace mlir { -namespace ROCDL { #define GET_OP_CLASSES #include "mlir/Dialect/LLVMIR/ROCDLOps.cpp.inc" -} // namespace ROCDL -} // namespace mlir - diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index c9b05f89f30b1..efe2e45f78ea9 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -260,13 +260,14 @@ static LogicalResult verifyGenericOp(GenericOpType op) { if (failed(BlockArgsVerifier::verify(op, region.front()))) return failure(); - auto attr = op.template getAttrOfType("symbol_source"); - int64_t targetRank = 0; - if (attr) { - unsigned index = attr.getInt(); + auto symbolSourceAttr = + op.template getAttrOfType("symbol_source"); + int64_t expectedNumSymbols = 0; + if (symbolSourceAttr) { + unsigned index = symbolSourceAttr.getInt(); if (index >= op.getNumOperands()) return op.emitOpError("symbol_source index out of range"); - targetRank = op.getShapedType(index).getRank(); + expectedNumSymbols = op.getShapedType(index).getRank(); } SmallVector indexingMaps; @@ -278,9 +279,9 @@ static LogicalResult verifyGenericOp(GenericOpType op) { auto view = (idx < nInputViews) ? 
op.getInputShapedType(idx) : op.getOutputShapedType(idx - nInputViews); - if (m.getNumSymbols() != targetRank) + if (m.getNumSymbols() != expectedNumSymbols) return op.emitOpError("expected the number of symbols in indexing_map #") - << idx << " to match target rank"; + << idx << " to match rank of operand `symbol_source`"; if (m.getNumDims() != nLoops) return op.emitOpError("expected indexing_map #") @@ -846,13 +847,9 @@ Value SliceOp::getViewSource() { return view(); } //===----------------------------------------------------------------------===// // TransposeOp //===----------------------------------------------------------------------===// -void mlir::linalg::TransposeOp::build(OpBuilder &b, OperationState &result, - Value view, AffineMapAttr permutation, - ArrayRef attrs) { - auto permutationMap = permutation.getValue(); - assert(permutationMap); - auto memRefType = view.getType().cast(); +static MemRefType inferTransposeResultType(MemRefType memRefType, + AffineMap permutationMap) { auto rank = memRefType.getRank(); auto originalSizes = memRefType.getShape(); // Compute permuted sizes. @@ -867,11 +864,21 @@ void mlir::linalg::TransposeOp::build(OpBuilder &b, OperationState &result, auto res = getStridesAndOffset(memRefType, strides, offset); assert(succeeded(res) && strides.size() == static_cast(rank)); (void)res; - auto map = makeStridedLinearLayoutMap(strides, offset, b.getContext()); + auto map = + makeStridedLinearLayoutMap(strides, offset, memRefType.getContext()); map = permutationMap ? map.compose(permutationMap) : map; + return MemRefType::Builder(memRefType).setShape(sizes).setAffineMaps(map); +} + +void mlir::linalg::TransposeOp::build(OpBuilder &b, OperationState &result, + Value view, AffineMapAttr permutation, + ArrayRef attrs) { + auto permutationMap = permutation.getValue(); + assert(permutationMap); + + auto memRefType = view.getType().cast(); // Compute result type. 
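// For instance (an illustrative case, assuming the default strided layout):
// transposing
//
//   memref<3x4xf32, affine_map<(d0, d1) -> (d0 * 4 + d1)>>
//
// with the permutation (d0, d1) -> (d1, d0) yields
//
//   memref<4x3xf32, affine_map<(d0, d1) -> (d0 + d1 * 4)>>
//
// i.e. the shape is permuted while the composed layout map keeps addressing
// the original underlying buffer.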
- MemRefType resultType = - MemRefType::Builder(memRefType).setShape(sizes).setAffineMaps(map); + MemRefType resultType = inferTransposeResultType(memRefType, permutationMap); build(b, result, resultType, view, attrs); result.addAttribute(TransposeOp::getPermutationAttrName(), permutation); @@ -881,19 +888,20 @@ static void print(OpAsmPrinter &p, TransposeOp op) { p << op.getOperationName() << " " << op.view() << " " << op.permutation(); p.printOptionalAttrDict(op.getAttrs(), {TransposeOp::getPermutationAttrName()}); - p << " : " << op.view().getType(); + p << " : " << op.view().getType() << " to " << op.getType(); } static ParseResult parseTransposeOp(OpAsmParser &parser, OperationState &result) { OpAsmParser::OperandType view; AffineMap permutation; - MemRefType type; + MemRefType srcType, dstType; if (parser.parseOperand(view) || parser.parseAffineMap(permutation) || parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonType(type) || - parser.resolveOperand(view, type, result.operands) || - parser.addTypeToList(type, result.types)) + parser.parseColonType(srcType) || + parser.resolveOperand(view, srcType, result.operands) || + parser.parseKeywordType("to", dstType) || + parser.addTypeToList(dstType, result.types)) return failure(); result.addAttribute(TransposeOp::getPermutationAttrName(), @@ -901,6 +909,21 @@ static ParseResult parseTransposeOp(OpAsmParser &parser, return success(); } +static LogicalResult verify(TransposeOp op) { + if (!op.permutation().isPermutation()) + return op.emitOpError("expected a permutation map"); + if (op.permutation().getNumDims() != op.getShapedType().getRank()) + return op.emitOpError( + "expected a permutation map of same rank as the view"); + + auto srcType = op.view().getType().cast(); + auto dstType = op.getType().cast(); + if (dstType != inferTransposeResultType(srcType, op.permutation())) + return op.emitOpError("output type ") + << dstType << " does not match transposed input type " << srcType; + return success(); +} + //===----------------------------------------------------------------------===// // YieldOp //===----------------------------------------------------------------------===// @@ -1073,9 +1096,6 @@ static LogicalResult verify(PoolingSumOp op) { return verifySingleInputPoolingOp(op); } -namespace mlir { -namespace linalg { - #include "mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterfaces.cpp.inc" #define GET_OP_CLASSES @@ -1084,9 +1104,6 @@ namespace linalg { #define GET_OP_CLASSES #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc" -} // namespace linalg -} // namespace mlir - AffineMap mlir::linalg::extractOrIdentityMap(Optional maybeMap, unsigned rank, MLIRContext *context) { @@ -1224,15 +1241,9 @@ void buildNamedStructuredOpRegionAndAttributes(Builder &builder, mlir::edsc::ScopedContext scope(opBuilder, builder.getUnknownLoc()); NamedStructuredOpType::regionBuilder(*body); - auto indexingMaps = builder.getAffineMapArrayAttr( - NamedStructuredOpType::referenceIndexingMaps(operandTypes, - tensorResultTypes)); - result.addAttribute(getIndexingMapsAttrName(), indexingMaps); + // indexing_maps is an auto-generated method. - auto iterators = - builder.getStrArrayAttr(NamedStructuredOpType::referenceIterators( - operandTypes, tensorResultTypes)); - result.addAttribute(getIteratorTypesAttrName(), iterators); + // iterator_types is an auto-generated method. 
} template @@ -1350,6 +1361,7 @@ CANONICALIZERS_AND_FOLDERS(BatchMatmulOp) CANONICALIZERS_AND_FOLDERS(DotOp) CANONICALIZERS_AND_FOLDERS(MatmulOp) CANONICALIZERS_AND_FOLDERS(MatvecOp) +CANONICALIZERS_AND_FOLDERS(VecmatOp) CANONICALIZERS_AND_FOLDERS(ConvWOp) CANONICALIZERS_AND_FOLDERS(ConvNWCOp) CANONICALIZERS_AND_FOLDERS(ConvNCWOp) diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgTypes.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgTypes.cpp index b8bffd35f5a12..abc82f300f633 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgTypes.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgTypes.cpp @@ -17,6 +17,7 @@ #include "mlir/IR/StandardTypes.h" #include "mlir/Parser.h" #include "mlir/Support/LLVM.h" +#include "mlir/Transforms/InliningUtils.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/raw_ostream.h" @@ -24,6 +25,38 @@ using namespace mlir; using namespace mlir::linalg; +//===----------------------------------------------------------------------===// +// LinalgDialect Dialect Interfaces +//===----------------------------------------------------------------------===// + +namespace { + +struct LinalgInlinerInterface : public DialectInlinerInterface { + using DialectInlinerInterface::DialectInlinerInterface; + + // We don't have any special restrictions on what can be inlined into + // destination regions (e.g. while/conditional bodies). Always allow it. + bool isLegalToInline(Region *dest, Region *src, + BlockAndValueMapping &valueMapping) const final { + return true; + } + // Operations in Linalg dialect are always legal to inline. + bool isLegalToInline(Operation *, Region *, + BlockAndValueMapping &) const final { + return true; + } + // Handle the given inlined terminator by replacing it with a new operation + // as necessary. Required when the region has only one block. + void handleTerminator(Operation *op, + ArrayRef valuesToRepl) const final {} +}; + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// LinalgDialect +//===----------------------------------------------------------------------===// + void mlir::linalg::LinalgDialect::initialize() { addTypes(); addOperations< @@ -34,7 +67,9 @@ void mlir::linalg::LinalgDialect::initialize() { #define GET_OP_LIST #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc" >(); + addInterfaces(); } + Type mlir::linalg::LinalgDialect::parseType(DialectAsmParser &parser) const { // Parse the main keyword for the type. 
StringRef keyword; diff --git a/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp b/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp index d4d1d108be71a..d3c90ffab06fd 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp @@ -679,6 +679,8 @@ static Optional linalgOpToLoopsImplSwitch(Operation *op, return linalgOpToLoopsImpl(op, builder); if (isa(op)) return linalgOpToLoopsImpl(op, builder); + if (isa(op)) + return linalgOpToLoopsImpl(op, builder); if (isa(op)) return linalgOpToLoopsImpl(op, builder); if (isa(op)) diff --git a/mlir/lib/Dialect/Linalg/Transforms/TensorsToBuffers.cpp b/mlir/lib/Dialect/Linalg/Transforms/TensorsToBuffers.cpp index 89a01f9ca6292..6af0067c8928c 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/TensorsToBuffers.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/TensorsToBuffers.cpp @@ -51,11 +51,6 @@ class GenericOpConverter return rewriter.notifyMatchFailure( op, "dynamic shapes not currently supported"); auto memrefType = MemRefType::get(type.getShape(), type.getElementType()); - - // Compute alloc position and insert a custom allocation node. - OpBuilder::InsertionGuard guard(rewriter); - rewriter.restoreInsertionPoint( - bufferAssignment->computeAllocPosition(result)); auto alloc = rewriter.create(loc, memrefType); newArgs.push_back(alloc); newResults.push_back(alloc); @@ -99,13 +94,12 @@ class GenericOpConverter /// Populate the given list with patterns to convert Linalg operations on /// tensors to buffers. static void populateConvertLinalgOnTensorsToBuffersPattern( - MLIRContext *context, BufferAssignmentPlacer *placer, - BufferAssignmentTypeConverter *converter, + MLIRContext *context, BufferAssignmentTypeConverter *converter, OwningRewritePatternList *patterns) { populateWithBufferAssignmentOpConversionPatterns< - mlir::ReturnOp, mlir::ReturnOp, linalg::CopyOp>(context, placer, - converter, patterns); - patterns->insert(context, placer, converter); + mlir::ReturnOp, mlir::ReturnOp, linalg::CopyOp>(context, converter, + patterns); + patterns->insert(context, converter); } /// Converts Linalg operations that work on tensor-type operands or results to @@ -119,6 +113,8 @@ struct ConvertLinalgOnTensorsToBuffers // Mark all Standard operations legal. target.addLegalDialect(); + target.addLegalOp(); + target.addLegalOp(); // Mark all Linalg operations illegal as long as they work on tensors. auto isLegalOperation = [&](Operation *op) { @@ -144,16 +140,11 @@ struct ConvertLinalgOnTensorsToBuffers converter.setResultConversionKind( BufferAssignmentTypeConverter::AppendToArgumentsList); - // Walk over all the functions to apply buffer assignment. 
- getOperation().walk([&](FuncOp function) -> WalkResult { - OwningRewritePatternList patterns; - BufferAssignmentPlacer placer(function); - populateConvertLinalgOnTensorsToBuffersPattern(&context, &placer, - &converter, &patterns); - - // Applying full conversion - return applyFullConversion(function, target, patterns); - }); + OwningRewritePatternList patterns; + populateConvertLinalgOnTensorsToBuffersPattern(&context, &converter, + &patterns); + if (failed(applyFullConversion(this->getOperation(), target, patterns))) + this->signalPassFailure(); } }; } // end anonymous namespace diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp index afac3d5f5f9a4..c1aad620fe08a 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -126,8 +126,6 @@ LogicalResult mlir::linalg::LinalgBaseTilingPattern::matchAndRewrite( // New marker if specified. marker.replaceLinalgMarker(rewriter, res->op.getOperation()); - - rewriter.eraseOp(op); return success(); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index ada89f1c82b5c..9a225dd81c79c 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -69,7 +69,7 @@ static bool hasMultiplyAddBody(Region &r) { static LogicalResult isContraction(Operation *op) { // TODO: interface for named ops. if (isa(op)) + linalg::VecmatOp, linalg::DotOp>(op)) return success(); auto genericOp = dyn_cast(op); @@ -367,3 +367,141 @@ LogicalResult LinalgCopyVTWForwardingPattern::matchAndRewrite( return success(); } + +template +LogicalResult ConvOpVectorization::matchAndRewrite( + ConvOp op, PatternRewriter &rewriter) const { + Location loc = op.getLoc(); + MLIRContext *context = op.getContext(); + edsc::ScopedContext scope(rewriter, loc); + + ShapedType inShapeType = op.getInputShapedType(0); + ShapedType kShapeType = op.getInputShapedType(1); + + ArrayRef inShape = inShapeType.getShape(); + ArrayRef kShape = kShapeType.getShape(); + + if (!inShapeType.hasStaticShape() || !kShapeType.hasStaticShape()) + return failure(); + + SmallVector mapping; + // Fail to apply when the size of a non-vectorized dimension is not 1, or + // when the size of a vectorized dimension is not tileSize.
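// For example (hypothetical shapes): with N = 3, tileSize = 4 and
// mask = {false, true, true}, the pattern applies only when both the input
// and the kernel have shape <1x4x4>; a shape such as <2x4x4> is rejected
// because its non-vectorized leading dimension is not 1.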
+ for (unsigned i = 0; i < N; i++) { + if (!mask[i] && (inShape[i] != 1 || kShape[i] != 1)) + return failure(); + if (mask[i] && (inShape[i] != tileSize || kShape[i] != tileSize)) + return failure(); + + if (mask[i]) + mapping.push_back(getAffineDimExpr(i, context)); + } + + Value input = op.getInput(0); + Value kernel = op.getInput(1); + Value output = op.getOutputBuffer(0); + + unsigned rank = inShapeType.getRank(); + unsigned numDims = mapping.size(); + Type elemType = inShapeType.getElementType(); + + auto map = AffineMap::get(rank, 0, mapping, context); + SmallVector zeros(rank, std_constant_index(0)); + auto vecType = + VectorType::get(SmallVector(numDims, tileSize), elemType); + + auto inputVec = vector_transfer_read(vecType, input, zeros, map); + auto kernelVec = vector_transfer_read(vecType, kernel, zeros, map); + + auto acc = std_constant(elemType, rewriter.getZeroAttr(elemType)); + + std::array indexingMaps{ + AffineMap::getMultiDimIdentityMap(numDims, context), + AffineMap::getMultiDimIdentityMap(numDims, context), + AffineMap::get(numDims, 0, {}, context)}; + + std::vector iteratorTypes(numDims, "reduction"); + + auto result = rewriter.create( + loc, inputVec, kernelVec, acc, + rewriter.getAffineMapArrayAttr(indexingMaps), + rewriter.getStrArrayAttr(iteratorTypes)); + + rewriter.create(loc, result, output, ValueRange(zeros)); + rewriter.eraseOp(op); + return success(); +} + +using ConvOpConst = ConvOpVectorization; + +/// Inserts tiling, promotion and vectorization pattern for ConvOp +/// conversion into corresponding pattern lists. +template +static void +populateVectorizationPatterns(OwningRewritePatternList &tilingPatterns, + OwningRewritePatternList &promotionPatterns, + OwningRewritePatternList &vectorizationPatterns, + ArrayRef tileSizes, + MLIRContext *context) { + constexpr static StringRef kTiledMarker = "TILED"; + constexpr static StringRef kPromotedMarker = "PROMOTED"; + tilingPatterns.insert>( + context, LinalgTilingOptions().setTileSizes(tileSizes), + LinalgMarker({}, Identifier::get(kTiledMarker, context))); + + promotionPatterns.insert>( + context, LinalgPromotionOptions().setUseFullTileBuffersByDefault(true), + LinalgMarker(Identifier::get(kTiledMarker, context), + Identifier::get(kPromotedMarker, context))); + + SmallVector mask(N); + int offset = tileSizes.size() - N; + std::transform(tileSizes.begin() + offset, tileSizes.end(), mask.begin(), + [](int64_t i) -> bool { return i != ConvOpConst::noTile; }); + + vectorizationPatterns.insert>(context, mask); +} + +void mlir::linalg::populateConvVectorizationPatterns( + MLIRContext *context, SmallVectorImpl &patterns) { + const int64_t tileSize = ConvOpConst::tileSize; + const int64_t noTile = ConvOpConst::noTile; + auto makeTileSizes = [&](unsigned numNoTile, unsigned numTile) { + SmallVector result(numNoTile, noTile); + result.append(numTile, tileSize); + return result; + }; + + OwningRewritePatternList tiling, promotion, vectorization; + populateVectorizationPatterns( + tiling, promotion, vectorization, + makeTileSizes(/*numNoTile=*/1, /*numTile*/ 1), context); + + populateVectorizationPatterns(tiling, promotion, vectorization, + makeTileSizes(3, 2), context); + + populateVectorizationPatterns(tiling, promotion, vectorization, + makeTileSizes(3, 2), context); + + populateVectorizationPatterns(tiling, promotion, vectorization, + makeTileSizes(2, 2), context); + + populateVectorizationPatterns(tiling, promotion, vectorization, + makeTileSizes(4, 3), context); + + populateVectorizationPatterns(tiling, promotion, 
vectorization, + makeTileSizes(4, 3), context); + + populateVectorizationPatterns(tiling, promotion, vectorization, + makeTileSizes(3, 3), context); + + populateVectorizationPatterns( + tiling, promotion, vectorization, makeTileSizes(5, 4), context); + + populateVectorizationPatterns( + tiling, promotion, vectorization, makeTileSizes(5, 4), context); + + patterns.push_back(std::move(tiling)); + patterns.push_back(std::move(promotion)); + patterns.push_back(std::move(vectorization)); +} diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp index cf14555aa63fc..585b00189964d 100644 --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -147,6 +147,50 @@ static void unpackRanges(ArrayRef ranges, namespace mlir { namespace linalg { +/// Return the linearized list of all view dimensions in a linalgOp. +SmallVector getViewSizes(OpBuilder &builder, LinalgOp linalgOp) { + auto loc = linalgOp.getLoc(); + SmallVector res; + SmallVector ranks; + for (auto v : linalgOp.getInputsAndOutputBuffers()) { + MemRefType t = v.getType().template cast(); + ranks.push_back(t.getRank()); + for (unsigned i = 0; i < t.getRank(); ++i) + res.push_back(builder.create(loc, v, i)); + } + + auto attr = linalgOp.template getAttrOfType("symbol_source"); + if (attr) { + // Find the correct position for inserting values for symbols. + unsigned numSymb = ranks[attr.getInt()], symbolsPos = 0; + for (unsigned idx = 0; idx < attr.getInt(); idx++) + symbolsPos += ranks[idx]; + + // Append, at the end of the value list, the values that correspond to + // the map symbols. Since symbols are repeated inside the concatenated + // map, the sizes have to be repeated as well. + + // Reserving is mandatory to avoid undefined behavior when pushing back + // into a SmallVector from itself. + res.reserve(res.size() + ranks.size() * numSymb); + for (unsigned idx = 0, s = ranks.size(); idx < s; ++idx) + for (unsigned idx2 = 0; idx2 < numSymb; ++idx2) + res.push_back(res[symbolsPos + idx2]); + } + return res; +} + +Optional> +getLoopRanges(OpBuilder &builder, LinalgOp linalgOp, OperationFolder *folder) { + SmallVector viewSizes = getViewSizes(builder, linalgOp); + AffineMap invertedMap = + inversePermutation(concatAffineMaps(linalgOp.getIndexingMaps())); + if (!invertedMap) + return {}; + return applyMapToValues(builder, linalgOp.getLoc(), invertedMap, viewSizes, + folder); +} +
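// A small worked example for getViewSizes above (hypothetical ranks): two
// memref operands of ranks [2, 3] first produce res = [s0, s1, t0, t1, t2].
// With symbol_source = 1, numSymb is 3 and symbolsPos is 2, so the sizes
// t0, t1, t2 are appended once per operand map, yielding
// [s0, s1, t0, t1, t2, t0, t1, t2, t0, t1, t2].
/// Specialization to build an scf "for" nest.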
template <> void GenerateLoopNest::doit( diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index b5dfa2c133585..3cae3c8feb8fa 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -101,6 +101,22 @@ static ParseResult parseOptionalOperand(OpAsmParser &parser, StringRef keyword, return success(); } +static OptionalParseResult parseOptionalOperandAndType(OpAsmParser &parser, + StringRef keyword, + OperationState &result) { + OpAsmParser::OperandType operand; + Type type; + if (succeeded(parser.parseOptionalKeyword(keyword))) { + if (parser.parseLParen() || parser.parseOperand(operand) || + parser.parseColonType(type) || + parser.resolveOperand(operand, type, result.operands) || + parser.parseRParen()) + return failure(); + return success(); + } + return llvm::None; +} + //===----------------------------------------------------------------------===// // ParallelOp //===----------------------------------------------------------------------===// @@ -116,8 +132,11 @@ static ParseResult parseOptionalOperand(OpAsmParser &parser, StringRef keyword, /// `reduction` `(` value-list `)`? /// `copy` `(` value-list `)`? /// `copyin` `(` value-list `)`? +/// `copyin_readonly` `(` value-list `)`? /// `copyout` `(` value-list `)`? +/// `copyout_zero` `(` value-list `)`? /// `create` `(` value-list `)`? +/// `create_zero` `(` value-list `)`? /// `no_create` `(` value-list `)`? /// `present` `(` value-list `)`? /// `deviceptr` `(` value-list `)`? @@ -129,43 +148,50 @@ static ParseResult parseParallelOp(OpAsmParser &parser, OperationState &result) { Builder &builder = parser.getBuilder(); SmallVector privateOperands, - firstprivateOperands, createOperands, copyOperands, copyinOperands, - copyoutOperands, noCreateOperands, presentOperands, devicePtrOperands, - attachOperands, waitOperands, reductionOperands; - SmallVector operandTypes; - OpAsmParser::OperandType async, numGangs, numWorkers, vectorLength, ifCond, - selfCond; - bool hasAsync = false, hasNumGangs = false, hasNumWorkers = false; - bool hasVectorLength = false, hasIfCond = false, hasSelfCond = false; + firstprivateOperands, copyOperands, copyinOperands, + copyinReadonlyOperands, copyoutOperands, copyoutZeroOperands, + createOperands, createZeroOperands, noCreateOperands, presentOperands, + devicePtrOperands, attachOperands, waitOperands, reductionOperands; + SmallVector waitOperandTypes, reductionOperandTypes, + copyOperandTypes, copyinOperandTypes, copyinReadonlyOperandTypes, + copyoutOperandTypes, copyoutZeroOperandTypes, createOperandTypes, + createZeroOperandTypes, noCreateOperandTypes, presentOperandTypes, + deviceptrOperandTypes, attachOperandTypes, privateOperandTypes, + firstprivateOperandTypes; - Type indexType = builder.getIndexType(); + SmallVector operandTypes; + OpAsmParser::OperandType ifCond, selfCond; + bool hasIfCond = false, hasSelfCond = false; + OptionalParseResult async, numGangs, numWorkers, vectorLength; Type i1Type = builder.getI1Type(); // async()? - if (failed(parseOptionalOperand(parser, ParallelOp::getAsyncKeyword(), async, - indexType, hasAsync, result))) + async = parseOptionalOperandAndType(parser, ParallelOp::getAsyncKeyword(), + result); + if (async.hasValue() && failed(*async)) return failure(); // wait()? if (failed(parseOperandList(parser, ParallelOp::getWaitKeyword(), - waitOperands, operandTypes, result))) + waitOperands, waitOperandTypes, result))) return failure(); // num_gangs(value)? 
- if (failed(parseOptionalOperand(parser, ParallelOp::getNumGangsKeyword(), - numGangs, indexType, hasNumGangs, result))) + numGangs = parseOptionalOperandAndType( + parser, ParallelOp::getNumGangsKeyword(), result); + if (numGangs.hasValue() && failed(*numGangs)) return failure(); // num_workers(value)? - if (failed(parseOptionalOperand(parser, ParallelOp::getNumWorkersKeyword(), - numWorkers, indexType, hasNumWorkers, - result))) + numWorkers = parseOptionalOperandAndType( + parser, ParallelOp::getNumWorkersKeyword(), result); + if (numWorkers.hasValue() && failed(*numWorkers)) return failure(); // vector_length(value)? - if (failed(parseOptionalOperand(parser, ParallelOp::getVectorLengthKeyword(), - vectorLength, indexType, hasVectorLength, - result))) + vectorLength = parseOptionalOperandAndType( + parser, ParallelOp::getVectorLengthKeyword(), result); + if (vectorLength.hasValue() && failed(*vectorLength)) return failure(); // if()? @@ -180,83 +206,108 @@ static ParseResult parseParallelOp(OpAsmParser &parser, // reduction()? if (failed(parseOperandList(parser, ParallelOp::getReductionKeyword(), - reductionOperands, operandTypes, result))) + reductionOperands, reductionOperandTypes, + result))) return failure(); // copy()? if (failed(parseOperandList(parser, ParallelOp::getCopyKeyword(), - copyOperands, operandTypes, result))) + copyOperands, copyOperandTypes, result))) return failure(); // copyin()? if (failed(parseOperandList(parser, ParallelOp::getCopyinKeyword(), - copyinOperands, operandTypes, result))) + copyinOperands, copyinOperandTypes, result))) + return failure(); + + // copyin_readonly()? + if (failed(parseOperandList(parser, ParallelOp::getCopyinReadonlyKeyword(), + copyinReadonlyOperands, + copyinReadonlyOperandTypes, result))) return failure(); // copyout()? if (failed(parseOperandList(parser, ParallelOp::getCopyoutKeyword(), - copyoutOperands, operandTypes, result))) + copyoutOperands, copyoutOperandTypes, result))) + return failure(); + + // copyout_zero()? + if (failed(parseOperandList(parser, ParallelOp::getCopyoutZeroKeyword(), + copyoutZeroOperands, copyoutZeroOperandTypes, + result))) return failure(); // create()? if (failed(parseOperandList(parser, ParallelOp::getCreateKeyword(), - createOperands, operandTypes, result))) + createOperands, createOperandTypes, result))) + return failure(); + + // create_zero()? + if (failed(parseOperandList(parser, ParallelOp::getCreateZeroKeyword(), + createZeroOperands, createZeroOperandTypes, + result))) return failure(); // no_create()? if (failed(parseOperandList(parser, ParallelOp::getNoCreateKeyword(), - noCreateOperands, operandTypes, result))) + noCreateOperands, noCreateOperandTypes, result))) return failure(); // present()? if (failed(parseOperandList(parser, ParallelOp::getPresentKeyword(), - presentOperands, operandTypes, result))) + presentOperands, presentOperandTypes, result))) return failure(); // deviceptr()? if (failed(parseOperandList(parser, ParallelOp::getDevicePtrKeyword(), - devicePtrOperands, operandTypes, result))) + devicePtrOperands, deviceptrOperandTypes, + result))) return failure(); // attach()? if (failed(parseOperandList(parser, ParallelOp::getAttachKeyword(), - attachOperands, operandTypes, result))) + attachOperands, attachOperandTypes, result))) return failure(); // private()? if (failed(parseOperandList(parser, ParallelOp::getPrivateKeyword(), - privateOperands, operandTypes, result))) + privateOperands, privateOperandTypes, result))) return failure(); // firstprivate()? 
if (failed(parseOperandList(parser, ParallelOp::getFirstPrivateKeyword(), - firstprivateOperands, operandTypes, result))) + firstprivateOperands, firstprivateOperandTypes, + result))) return failure(); // Parallel op region if (failed(parseRegions(parser, result))) return failure(); - result.addAttribute(ParallelOp::getOperandSegmentSizeAttr(), - builder.getI32VectorAttr( - {static_cast(hasAsync ? 1 : 0), - static_cast(waitOperands.size()), - static_cast(hasNumGangs ? 1 : 0), - static_cast(hasNumWorkers ? 1 : 0), - static_cast(hasVectorLength ? 1 : 0), - static_cast(hasIfCond ? 1 : 0), - static_cast(hasSelfCond ? 1 : 0), - static_cast(reductionOperands.size()), - static_cast(copyOperands.size()), - static_cast(copyinOperands.size()), - static_cast(copyoutOperands.size()), - static_cast(createOperands.size()), - static_cast(noCreateOperands.size()), - static_cast(presentOperands.size()), - static_cast(devicePtrOperands.size()), - static_cast(attachOperands.size()), - static_cast(privateOperands.size()), - static_cast(firstprivateOperands.size())})); + result.addAttribute( + ParallelOp::getOperandSegmentSizeAttr(), + builder.getI32VectorAttr( + {static_cast(async.hasValue() ? 1 : 0), + static_cast(waitOperands.size()), + static_cast(numGangs.hasValue() ? 1 : 0), + static_cast(numWorkers.hasValue() ? 1 : 0), + static_cast(vectorLength.hasValue() ? 1 : 0), + static_cast(hasIfCond ? 1 : 0), + static_cast(hasSelfCond ? 1 : 0), + static_cast(reductionOperands.size()), + static_cast(copyOperands.size()), + static_cast(copyinOperands.size()), + static_cast(copyinReadonlyOperands.size()), + static_cast(copyoutOperands.size()), + static_cast(copyoutZeroOperands.size()), + static_cast(createOperands.size()), + static_cast(createZeroOperands.size()), + static_cast(noCreateOperands.size()), + static_cast(presentOperands.size()), + static_cast(devicePtrOperands.size()), + static_cast(attachOperands.size()), + static_cast(privateOperands.size()), + static_cast(firstprivateOperands.size())})); // Additional attributes if (failed(parser.parseOptionalAttrDictWithKeyword(result.attributes))) @@ -269,21 +320,27 @@ static void print(OpAsmPrinter &printer, ParallelOp &op) { printer << ParallelOp::getOperationName(); // async()? - if (auto async = op.async()) - printer << " " << ParallelOp::getAsyncKeyword() << "(" << async << ")"; + if (Value async = op.async()) + printer << " " << ParallelOp::getAsyncKeyword() << "(" << async << ": " + << async.getType() << ")"; // wait()? printOperandList(op.waitOperands(), ParallelOp::getWaitKeyword(), printer); // num_gangs()? - if (auto numGangs = op.numGangs()) + if (Value numGangs = op.numGangs()) printer << " " << ParallelOp::getNumGangsKeyword() << "(" << numGangs - << ")"; + << ": " << numGangs.getType() << ")"; // num_workers()? - if (auto numWorkers = op.numWorkers()) + if (Value numWorkers = op.numWorkers()) printer << " " << ParallelOp::getNumWorkersKeyword() << "(" << numWorkers - << ")"; + << ": " << numWorkers.getType() << ")"; + + // vector_length()? + if (Value vectorLength = op.vectorLength()) + printer << " " << ParallelOp::getVectorLengthKeyword() << "(" + << vectorLength << ": " << vectorLength.getType() << ")"; // if()? if (Value ifCond = op.ifCond()) @@ -304,14 +361,26 @@ static void print(OpAsmPrinter &printer, ParallelOp &op) { printOperandList(op.copyinOperands(), ParallelOp::getCopyinKeyword(), printer); + // copyin_readonly()? + printOperandList(op.copyinReadonlyOperands(), + ParallelOp::getCopyinReadonlyKeyword(), printer); + // copyout()? 
printOperandList(op.copyoutOperands(), ParallelOp::getCopyoutKeyword(), printer); + // copyout_zero()? + printOperandList(op.copyoutZeroOperands(), + ParallelOp::getCopyoutZeroKeyword(), printer); + // create()? printOperandList(op.createOperands(), ParallelOp::getCreateKeyword(), printer); + // create_zero()? + printOperandList(op.createZeroOperands(), ParallelOp::getCreateZeroKeyword(), + printer); + // no_create()? printOperandList(op.noCreateOperands(), ParallelOp::getNoCreateKeyword(), printer); @@ -482,7 +551,7 @@ static void print(OpAsmPrinter &printer, DataOp &op) { /// region attr-dict? static ParseResult parseLoopOp(OpAsmParser &parser, OperationState &result) { Builder &builder = parser.getBuilder(); - unsigned executionMapping = 0; + unsigned executionMapping = OpenACCExecMapping::NONE; SmallVector operandTypes; SmallVector privateOperands, reductionOperands; SmallVector tileOperands; @@ -562,7 +631,7 @@ static ParseResult parseLoopOp(OpAsmParser &parser, OperationState &result) { reductionOperands, operandTypes, result))) return failure(); - if (executionMapping != 0) + if (executionMapping != acc::OpenACCExecMapping::NONE) result.addAttribute(LoopOp::getExecutionMappingAttrName(), builder.getI64IntegerAttr(executionMapping)); @@ -592,13 +661,7 @@ static ParseResult parseLoopOp(OpAsmParser &parser, OperationState &result) { static void print(OpAsmPrinter &printer, LoopOp &op) { printer << LoopOp::getOperationName(); - unsigned execMapping = - (op.getAttrOfType(LoopOp::getExecutionMappingAttrName()) != - nullptr) - ? op.getAttrOfType(LoopOp::getExecutionMappingAttrName()) - .getInt() - : 0; - + unsigned execMapping = op.exec_mapping(); if (execMapping & OpenACCExecMapping::GANG) { printer << " " << LoopOp::getGangKeyword(); Value gangNum = op.gangNum(); @@ -656,5 +719,31 @@ static void print(OpAsmPrinter &printer, LoopOp &op) { LoopOp::getOperandSegmentSizeAttr()}); } +static LogicalResult verifyLoopOp(acc::LoopOp loopOp) { + // The auto, independent and seq attributes are mutually exclusive. + if ((loopOp.auto_() && (loopOp.independent() || loopOp.seq())) || + (loopOp.independent() && loopOp.seq())) { + loopOp.emitError("only one of " + acc::LoopOp::getAutoAttrName() + ", " + + acc::LoopOp::getIndependentAttrName() + ", " + + acc::LoopOp::getSeqAttrName() + + " can be present at the same time"); + return failure(); + } + + // Gang, worker and vector are incompatible with seq. + if (loopOp.seq() && loopOp.exec_mapping() != OpenACCExecMapping::NONE) { + loopOp.emitError("gang, worker or vector cannot appear with the seq attr"); + return failure(); + } +
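// As an illustration (syntax approximate, hypothetical IR), the checks
// above reject a loop such as
//
//   acc.loop gang {
//     ...
//   } attributes {seq}
//
// because a gang/worker/vector execution mapping cannot be combined with the
// seq attribute, just as auto, independent and seq cannot be combined with
// one another.
+ // Check that the body region is non-empty.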
+ if (loopOp.region().empty()) { + loopOp.emitError("expected non-empty body"); + return failure(); + } + + return success(); +} + #define GET_OP_CLASSES #include "mlir/Dialect/OpenACC/OpenACCOps.cpp.inc" diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 217588289e851..ec47177df84ce 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -271,9 +271,5 @@ static ParseResult parseParallelOp(OpAsmParser &parser, return success(); } -namespace mlir { -namespace omp { #define GET_OP_CLASSES #include "mlir/Dialect/OpenMP/OpenMPOps.cpp.inc" -} // namespace omp -} // namespace mlir diff --git a/mlir/lib/Dialect/PDL/IR/PDL.cpp b/mlir/lib/Dialect/PDL/IR/PDL.cpp index 082229b6b3944..a0b9c969becf6 100644 --- a/mlir/lib/Dialect/PDL/IR/PDL.cpp +++ b/mlir/lib/Dialect/PDL/IR/PDL.cpp @@ -454,11 +454,5 @@ static LogicalResult verify(TypeOp op) { // TableGen'd op method definitions //===----------------------------------------------------------------------===// -namespace mlir { -namespace pdl { - #define GET_OP_CLASSES #include "mlir/Dialect/PDL/IR/PDLOps.cpp.inc" - -} // end namespace pdl -} // end namespace mlir diff --git a/mlir/lib/Dialect/SCF/SCF.cpp b/mlir/lib/Dialect/SCF/SCF.cpp index 498246315d642..e36ffc2e6b815 100644 --- a/mlir/lib/Dialect/SCF/SCF.cpp +++ b/mlir/lib/Dialect/SCF/SCF.cpp @@ -899,9 +899,5 @@ static void print(OpAsmPrinter &p, scf::YieldOp op) { // TableGen'd op method definitions //===----------------------------------------------------------------------===// -namespace mlir { -namespace scf { #define GET_OP_CLASSES #include "mlir/Dialect/SCF/SCFOps.cpp.inc" -} // namespace scf -} // namespace mlir diff --git a/mlir/lib/Dialect/SPIRV/SPIRVAttributes.cpp b/mlir/lib/Dialect/SPIRV/SPIRVAttributes.cpp index c2bf4840ddc84..6773862a8cd73 100644 --- a/mlir/lib/Dialect/SPIRV/SPIRVAttributes.cpp +++ b/mlir/lib/Dialect/SPIRV/SPIRVAttributes.cpp @@ -16,9 +16,10 @@ using namespace mlir; // DictionaryDict derived attributes //===----------------------------------------------------------------------===// -namespace mlir { #include "mlir/Dialect/SPIRV/TargetAndABI.cpp.inc" +namespace mlir { + //===----------------------------------------------------------------------===// // Attribute storage classes //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp b/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp index 339f588541f6e..a01177132b27b 100644 --- a/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp +++ b/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp @@ -16,6 +16,7 @@ #include "mlir/Dialect/SPIRV/SPIRVAttributes.h" #include "mlir/Dialect/SPIRV/SPIRVDialect.h" #include "mlir/Dialect/SPIRV/SPIRVTypes.h" +#include "mlir/Dialect/SPIRV/TargetAndABI.h" #include "mlir/IR/Builders.h" #include "mlir/IR/Function.h" #include "mlir/IR/FunctionImplementation.h" @@ -305,7 +306,12 @@ static void printSourceMemoryAccessAttribute( } static LogicalResult verifyCastOp(Operation *op, - bool requireSameBitWidth = true) { + bool requireSameBitWidth = true, + bool skipBitWidthCheck = false) { + // Some CastOps have no limit on bit widths for result and operand types.
+ if (skipBitWidthCheck) + return success(); + Type operandType = op->getOperand(0).getType(); Type resultType = op->getResult(0).getType(); @@ -2038,6 +2044,32 @@ static LogicalResult verify(spirv::GroupNonUniformBallotOp ballotOp) { return success(); } +//===----------------------------------------------------------------------===// +// spv.GroupNonUniformBroadcast +//===----------------------------------------------------------------------===// + +static LogicalResult verify(spirv::GroupNonUniformBroadcastOp broadcastOp) { + spirv::Scope scope = broadcastOp.execution_scope(); + if (scope != spirv::Scope::Workgroup && scope != spirv::Scope::Subgroup) + return broadcastOp.emitOpError( + "execution scope must be 'Workgroup' or 'Subgroup'"); + + // SPIR-V spec: "Before version 1.5, Id must come from a + // constant instruction." + auto targetEnv = spirv::getDefaultTargetEnv(broadcastOp.getContext()); + if (auto spirvModule = broadcastOp.getParentOfType<spirv::ModuleOp>()) + targetEnv = spirv::lookupTargetEnvOrDefault(spirvModule); + + if (targetEnv.getVersion() < spirv::Version::V_1_5) { + auto *idOp = broadcastOp.id().getDefiningOp(); + if (!idOp || !isa<spirv::ConstantOp, spirv::ReferenceOfOp>(idOp)) // for spec constant + return broadcastOp.emitOpError("id must be the result of a constant op"); + } + + return success(); +} + //===----------------------------------------------------------------------===// // spv.SubgroupBlockReadINTEL //===----------------------------------------------------------------------===// @@ -3261,11 +3293,15 @@ namespace spirv { // TableGen'erated operation interfaces for querying versions, extensions, and // capabilities. #include "mlir/Dialect/SPIRV/SPIRVAvailability.cpp.inc" +} // namespace spirv +} // namespace mlir // TableGen'erated operation definitions. #define GET_OP_CLASSES #include "mlir/Dialect/SPIRV/SPIRVOps.cpp.inc" +namespace mlir { +namespace spirv { // TableGen'erated operation availability interface implementations. #include "mlir/Dialect/SPIRV/SPIRVOpAvailabilityImpl.inc" diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp index bcfaa896f63d2..70621295e39cf 100644 --- a/mlir/lib/Dialect/Shape/IR/Shape.cpp +++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp @@ -399,46 +399,6 @@ LogicalResult getShapeVec(Value input, SmallVectorImpl<int64_t> &shapeValues) { return failure(); } } - -// For shapes that were created by some operations, we can obtain partial -// information on the shapes and sometimes determine if they will be -// broadcastable with that. -struct CstrBroadcastablePartialInfo - : public OpRewritePattern<CstrBroadcastableOp> { - using OpRewritePattern<CstrBroadcastableOp>::OpRewritePattern; - - LogicalResult matchAndRewrite(CstrBroadcastableOp op, - PatternRewriter &rewriter) const override { - SmallVector lhsShape, rhsShape; - if (failed(getShapeVec(op.lhs(), lhsShape))) - return failure(); - if (failed(getShapeVec(op.rhs(), rhsShape))) - return failure(); - if (!OpTrait::util::staticallyKnownBroadcastable(lhsShape, rhsShape)) - return failure(); - - rewriter.replaceOpWithNewOp<ConstWitnessOp>(op.getOperation(), true); - return success(); - } -}; - -// Scalars are always broadcastable.
-struct CstrBroadcastableScalar : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(CstrBroadcastableOp op, - PatternRewriter &rewriter) const override { - SmallVector shape; - if (failed(getShapeVec(op.lhs(), shape)) || shape.size() > 0) - return failure(); - if (failed(getShapeVec(op.rhs(), shape)) || shape.size() > 0) - return failure(); - - rewriter.replaceOpWithNewOp(op.getOperation(), true); - return success(); - } -}; - } // namespace void CstrBroadcastableOp::getCanonicalizationPatterns( @@ -446,8 +406,7 @@ void CstrBroadcastableOp::getCanonicalizationPatterns( // Canonicalization patterns have overlap with the considerations during // folding in case additional shape information is inferred at some point that // does not result in folding. - patterns.insert(context); + patterns.insert(context); } OpFoldResult CstrBroadcastableOp::fold(ArrayRef operands) { @@ -531,6 +490,14 @@ void ConstSizeOp::getAsmResultNames( OpFoldResult ConstWitnessOp::fold(ArrayRef) { return passingAttr(); } +//===----------------------------------------------------------------------===// +// CstrRequireOp +//===----------------------------------------------------------------------===// + +OpFoldResult CstrRequireOp::fold(ArrayRef operands) { + return operands[0]; +} + //===----------------------------------------------------------------------===// // ShapeEqOp //===----------------------------------------------------------------------===// @@ -938,11 +905,5 @@ static void print(OpAsmPrinter &p, ReduceOp op) { p.printOptionalAttrDict(op.getAttrs()); } -namespace mlir { -namespace shape { - #define GET_OP_CLASSES #include "mlir/Dialect/Shape/IR/ShapeOps.cpp.inc" - -} // namespace shape -} // namespace mlir diff --git a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp index 65f8b83d9a718..c0dc87210a3f1 100644 --- a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp +++ b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp @@ -11,6 +11,7 @@ #include "mlir/Dialect/CommonFolders.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" +#include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Builders.h" #include "mlir/IR/Function.h" #include "mlir/IR/Matchers.h" @@ -217,6 +218,26 @@ static LogicalResult foldMemRefCast(Operation *op) { return success(folded); } +//===----------------------------------------------------------------------===// +// Common cast compatibility check for vector types. +//===----------------------------------------------------------------------===// + +/// This method checks for cast compatibility of vector types. +/// If 'a' and 'b' are vector types, and they are cast compatible, +/// it calls the 'areElementsCastCompatible' function to check for +/// element cast compatibility. +/// Returns 'true' if the vector types are cast compatible, and 'false' +/// otherwise. 
+static bool areVectorCastSimpleCompatible( + Type a, Type b, function_ref areElementsCastCompatible) { + if (auto va = a.dyn_cast()) + if (auto vb = b.dyn_cast()) + return va.getShape().equals(vb.getShape()) && + areElementsCastCompatible(va.getElementType(), + vb.getElementType()); + return false; +} + //===----------------------------------------------------------------------===// // AddFOp //===----------------------------------------------------------------------===// @@ -1694,6 +1715,117 @@ static LogicalResult verify(DynamicTensorFromElementsOp op) { return success(); } +void DynamicTensorFromElementsOp::build( + OpBuilder &b, OperationState &result, Type resultTy, + ValueRange dynamicExtents, + function_ref bodyBuilder) { + build(b, result, resultTy, dynamicExtents); + + // Build and populate body. + OpBuilder::InsertionGuard guard(b); + Region *bodyRegion = result.regions.front().get(); + auto rank = resultTy.cast().getRank(); + SmallVector argumentTypes(rank, b.getIndexType()); + Block *bodyBlock = + b.createBlock(bodyRegion, bodyRegion->end(), argumentTypes); + bodyBuilder(b, result.location, bodyBlock->getArguments()); +} + +namespace { + +/// Canonicalizes dynamic_tensor_from_elements operations with a constant +/// operand into the equivalent operation with the operand expressed in the +/// result type, instead. We also insert a type cast to make sure that the +/// resulting IR is still well-typed. +struct StaticDynamicTensorFromElements + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(DynamicTensorFromElementsOp tensorFromElements, + PatternRewriter &rewriter) const final { + auto resultType = + tensorFromElements.getResult().getType().cast(); + + if (resultType.hasStaticShape()) + return failure(); + + SmallVector newOperands; + SmallVector newShape; + auto operandsIt = tensorFromElements.dynamicExtents().begin(); + + for (int64_t dim : resultType.getShape()) { + if (dim != RankedTensorType::kDynamicSize) { + newShape.push_back(dim); + continue; + } + APInt index; + if (!matchPattern(*operandsIt, m_ConstantInt(&index))) { + newShape.push_back(RankedTensorType::kDynamicSize); + newOperands.push_back(*operandsIt++); + continue; + } + newShape.push_back(index.getSExtValue()); + operandsIt++; + } + + if (newOperands.size() == tensorFromElements.dynamicExtents().size()) + return failure(); + + auto loc = tensorFromElements.getLoc(); + auto newOp = rewriter.create( + loc, RankedTensorType::get(newShape, resultType.getElementType()), + newOperands); + rewriter.inlineRegionBefore(tensorFromElements.body(), newOp.body(), + newOp.body().begin()); + rewriter.replaceOpWithNewOp(tensorFromElements, resultType, + newOp); + return success(); + } +}; + +/// Canonicalizes the pattern of the form +/// +/// %tensor = dynamic_tensor_from_elements %x { +/// ^bb0(%arg0: index): // no predecessors +/// +/// yield %1 : index +/// } : tensor +/// %extracted_element = extract_element %tensor[%c0] : tensor +/// +/// to just with %arg0 replaced by %c0. We only do this if the +/// dynamic_tensor_from_elements operation has no side-effects. 
+struct ExtractElementFromDynamicTensorFromElements + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(ExtractElementOp extract, + PatternRewriter &rewriter) const final { + auto tensorFromElements = + extract.aggregate().getDefiningOp(); + if (!tensorFromElements || !wouldOpBeTriviallyDead(tensorFromElements)) + return failure(); + + BlockAndValueMapping mapping; + Block *body = tensorFromElements.getBody(); + mapping.map(body->getArguments(), extract.indices()); + for (auto &op : body->without_terminator()) + rewriter.clone(op, mapping); + + auto yield = cast(body->getTerminator()); + + rewriter.replaceOp(extract, mapping.lookupOrDefault(yield.value())); + return success(); + } +}; + +} // namespace + +void DynamicTensorFromElementsOp::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + results.insert(context); +} + //===----------------------------------------------------------------------===// // ExtractElementOp //===----------------------------------------------------------------------===// @@ -1740,42 +1872,18 @@ OpFoldResult ExtractElementOp::fold(ArrayRef operands) { // TensorFromElementsOp //===----------------------------------------------------------------------===// -static ParseResult parseTensorFromElementsOp(OpAsmParser &parser, - OperationState &result) { - SmallVector elementsOperands; - Type resultType; - if (parser.parseLParen() || parser.parseOperandList(elementsOperands) || - parser.parseRParen() || parser.parseOptionalAttrDict(result.attributes) || - parser.parseColon() || parser.parseType(resultType)) - return failure(); - - if (parser.resolveOperands(elementsOperands, - resultType.cast().getElementType(), - result.operands)) - return failure(); - - result.addTypes(resultType); - return success(); -} - -static void print(OpAsmPrinter &p, TensorFromElementsOp op) { - p << "tensor_from_elements(" << op.elements() << ')'; - p.printOptionalAttrDict(op.getAttrs()); - p << " : " << op.result().getType(); +void TensorFromElementsOp::build(OpBuilder &builder, OperationState &result, + Type elementType, ValueRange elements) { + Type resultTy = RankedTensorType::get({static_cast(elements.size())}, + elementType); + result.addOperands(elements); + result.addTypes(resultTy); } -static LogicalResult verify(TensorFromElementsOp op) { - auto resultTensorType = op.result().getType().dyn_cast(); - if (!resultTensorType) - return op.emitOpError("expected result type to be a ranked tensor"); - - int64_t elementsCount = static_cast(op.elements().size()); - if (resultTensorType.getRank() != 1 || - resultTensorType.getShape().front() != elementsCount) - return op.emitOpError() - << "expected result type to be a 1D tensor with " << elementsCount - << (elementsCount == 1 ? 
" element" : " elements"); - return success(); +void TensorFromElementsOp::build(OpBuilder &builder, OperationState &result, + ValueRange elements) { + assert(!elements.empty() && "expected at least one element"); + build(builder, result, elements.front().getType(), elements); } namespace { @@ -1795,16 +1903,16 @@ struct ExtractElementFromTensorFromElements if (extract.indices().size() != 1) return failure(); - auto tensor_from_elements = dyn_cast_or_null( + auto tensorFromElements = dyn_cast_or_null( extract.aggregate().getDefiningOp()); - if (tensor_from_elements == nullptr) + if (tensorFromElements == nullptr) return failure(); APInt index; if (!matchPattern(*extract.indices().begin(), m_ConstantInt(&index))) return failure(); rewriter.replaceOp(extract, - tensor_from_elements.getOperand(index.getZExtValue())); + tensorFromElements.getOperand(index.getZExtValue())); return success(); } }; @@ -1824,11 +1932,7 @@ bool FPExtOp::areCastCompatible(Type a, Type b) { if (auto fa = a.dyn_cast()) if (auto fb = b.dyn_cast()) return fa.getWidth() < fb.getWidth(); - if (auto va = a.dyn_cast()) - if (auto vb = b.dyn_cast()) - return va.getShape().equals(vb.getShape()) && - areCastCompatible(va.getElementType(), vb.getElementType()); - return false; + return areVectorCastSimpleCompatible(a, b, areCastCompatible); } //===----------------------------------------------------------------------===// @@ -1836,7 +1940,9 @@ bool FPExtOp::areCastCompatible(Type a, Type b) { //===----------------------------------------------------------------------===// bool FPToSIOp::areCastCompatible(Type a, Type b) { - return a.isa() && b.isSignlessInteger(); + if (a.isa() && b.isSignlessInteger()) + return true; + return areVectorCastSimpleCompatible(a, b, areCastCompatible); } //===----------------------------------------------------------------------===// @@ -1844,7 +1950,9 @@ bool FPToSIOp::areCastCompatible(Type a, Type b) { //===----------------------------------------------------------------------===// bool FPToUIOp::areCastCompatible(Type a, Type b) { - return a.isa() && b.isSignlessInteger(); + if (a.isa() && b.isSignlessInteger()) + return true; + return areVectorCastSimpleCompatible(a, b, areCastCompatible); } //===----------------------------------------------------------------------===// @@ -1855,11 +1963,7 @@ bool FPTruncOp::areCastCompatible(Type a, Type b) { if (auto fa = a.dyn_cast()) if (auto fb = b.dyn_cast()) return fa.getWidth() > fb.getWidth(); - if (auto va = a.dyn_cast()) - if (auto vb = b.dyn_cast()) - return va.getShape().equals(vb.getShape()) && - areCastCompatible(va.getElementType(), vb.getElementType()); - return false; + return areVectorCastSimpleCompatible(a, b, areCastCompatible); } //===----------------------------------------------------------------------===// @@ -2299,7 +2403,9 @@ OpFoldResult SignedRemIOp::fold(ArrayRef operands) { // sitofp is applicable from integer types to float types. bool SIToFPOp::areCastCompatible(Type a, Type b) { - return a.isSignlessInteger() && b.isa(); + if (a.isSignlessInteger() && b.isa()) + return true; + return areVectorCastSimpleCompatible(a, b, areCastCompatible); } //===----------------------------------------------------------------------===// @@ -2379,7 +2485,9 @@ OpFoldResult SubIOp::fold(ArrayRef operands) { // uitofp is applicable from integer types to float types. 
bool UIToFPOp::areCastCompatible(Type a, Type b) { - return a.isSignlessInteger() && b.isa(); + if (a.isSignlessInteger() && b.isa()) + return true; + return areVectorCastSimpleCompatible(a, b, areCastCompatible); } //===----------------------------------------------------------------------===// @@ -3055,6 +3163,87 @@ OpFoldResult TensorCastOp::fold(ArrayRef operands) { return impl::foldCastOp(*this); } +/// Compute a TensorType that has the joined shape knowledge of the two +/// given TensorTypes. The element types need to match. +static TensorType joinShapes(TensorType one, TensorType two) { + assert(one.getElementType() == two.getElementType()); + + if (!one.hasRank()) + return two; + if (!two.hasRank()) + return one; + + int64_t rank = one.getRank(); + if (rank != two.getRank()) + return {}; + + SmallVector join; + join.reserve(rank); + for (int64_t i = 0; i < rank; ++i) { + if (one.isDynamicDim(i)) { + join.push_back(two.getDimSize(i)); + continue; + } + if (two.isDynamicDim(i)) { + join.push_back(one.getDimSize(i)); + continue; + } + if (one.getDimSize(i) != two.getDimSize(i)) + return {}; + join.push_back(one.getDimSize(i)); + } + return RankedTensorType::get(join, one.getElementType()); +} + +namespace { + +/// Replaces chains of two tensor_cast operations by a single tensor_cast +/// operation if doing so does not remove runtime constraints. +struct ChainedTensorCast : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TensorCastOp tensorCast, + PatternRewriter &rewriter) const final { + auto tensorCastOperand = + tensorCast.getOperand().getDefiningOp(); + + if (!tensorCastOperand) + return failure(); + + auto sourceType = + tensorCastOperand.getOperand().getType().cast(); + auto intermediateType = tensorCastOperand.getType().cast(); + auto resultType = tensorCast.getType().cast(); + + // We can remove the intermediate cast if joining all three produces the + // same result as just joining the source and result shapes. + auto firstJoin = + joinShapes(joinShapes(sourceType, intermediateType), resultType); + + // The join might not exist if the cast sequence would fail at runtime. + if (!firstJoin) + return failure(); + + // The newJoin always exists if the above join exists, it might just contain + // less information. If so, we cannot drop the intermediate cast, as doing + // so would remove runtime checks. 
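Both joins computed here can be modeled in isolation. A minimal standalone sketch of the same lattice-style join, assuming a plain `std::vector<long>` shape with `-1` as the dynamic-size marker in place of `RankedTensorType` (the `Shape` and `kDynamic` names are illustrative, not from the patch); the asserts at the end are exactly the `firstJoin != newJoin` comparison the pattern performs next:

    #include <cassert>
    #include <optional>
    #include <vector>

    constexpr long kDynamic = -1; // stand-in for RankedTensorType::kDynamicSize
    using Shape = std::vector<long>;

    // Mirrors joinShapes above: merge the static knowledge of two shapes,
    // or return std::nullopt when their static sizes conflict.
    std::optional<Shape> join(const Shape &a, const Shape &b) {
      if (a.size() != b.size())
        return std::nullopt;
      Shape result;
      for (size_t i = 0; i < a.size(); ++i) {
        if (a[i] == kDynamic)
          result.push_back(b[i]);
        else if (b[i] == kDynamic || a[i] == b[i])
          result.push_back(a[i]);
        else
          return std::nullopt; // conflicting static dimensions
      }
      return result;
    }

    int main() {
      // tensor<?x?> -> tensor<4x?> -> tensor<4x4>: chained and direct joins
      // agree on {4, 4}, so the middle cast adds no runtime check and can go.
      assert(*join(*join({kDynamic, kDynamic}, {4, kDynamic}), {4, 4}) ==
             (Shape{4, 4}));
      assert(*join({kDynamic, kDynamic}, {4, 4}) == (Shape{4, 4}));
      // tensor<?> -> tensor<4> -> tensor<?>: the chained join {4} is stricter
      // than the direct join {?}; the middle cast carries a runtime size
      // check, so the pattern must bail out rather than drop it.
      assert(*join(*join({kDynamic}, {4}), {kDynamic}) == (Shape{4}));
      assert(*join({kDynamic}, {kDynamic}) == (Shape{kDynamic}));
    }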
+ auto newJoin = joinShapes(sourceType, resultType); + if (firstJoin != newJoin) + return failure(); + + rewriter.replaceOpWithNewOp(tensorCast, resultType, + tensorCastOperand.getOperand()); + return success(); + } +}; + +} // namespace + +void TensorCastOp::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + results.insert(context); +} + //===----------------------------------------------------------------------===// // Helpers for Tensor[Load|Store]Op //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Vector/VectorOps.cpp b/mlir/lib/Dialect/Vector/VectorOps.cpp index d00e56297532c..c2cfaa54e4485 100644 --- a/mlir/lib/Dialect/Vector/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/VectorOps.cpp @@ -929,6 +929,17 @@ static LogicalResult verify(BroadcastOp op) { return success(); } +OpFoldResult BroadcastOp::fold(ArrayRef operands) { + if (!operands[0]) + return {}; + auto vectorType = getVectorType(); + if (operands[0].getType().isIntOrIndexOrFloat()) + return DenseElementsAttr::get(vectorType, operands[0]); + if (auto attr = operands[0].dyn_cast()) + return DenseElementsAttr::get(vectorType, attr.getSplatValue()); + return {}; +} + //===----------------------------------------------------------------------===// // ShuffleOp //===----------------------------------------------------------------------===// @@ -2688,11 +2699,5 @@ void mlir::vector::populateVectorToVectorCanonicalizationPatterns( TransposeFolder>(context); } -namespace mlir { -namespace vector { - #define GET_OP_CLASSES #include "mlir/Dialect/Vector/VectorOps.cpp.inc" - -} // namespace vector -} // namespace mlir diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp index 3deb7b477bea4..602138d3ada7c 100644 --- a/mlir/lib/IR/AsmPrinter.cpp +++ b/mlir/lib/IR/AsmPrinter.cpp @@ -2359,16 +2359,18 @@ void Value::print(raw_ostream &os) { if (auto *op = getDefiningOp()) return op->print(os); // TODO: Improve this. - assert(isa()); - os << "\n"; + BlockArgument arg = this->cast(); + os << " of type '" << arg.getType() + << "' at index: " << arg.getArgNumber() << '\n'; } void Value::print(raw_ostream &os, AsmState &state) { if (auto *op = getDefiningOp()) return op->print(os, state); // TODO: Improve this. 
- assert(isa()); - os << "\n"; + BlockArgument arg = this->cast(); + os << " of type '" << arg.getType() + << "' at index: " << arg.getArgNumber() << '\n'; } void Value::dump() { diff --git a/mlir/lib/IR/Block.cpp b/mlir/lib/IR/Block.cpp index 71f368c49776e..e039b41ae4b77 100644 --- a/mlir/lib/IR/Block.cpp +++ b/mlir/lib/IR/Block.cpp @@ -282,7 +282,7 @@ unsigned PredecessorIterator::getSuccessorIndex() const { } //===----------------------------------------------------------------------===// -// Successors +// SuccessorRange //===----------------------------------------------------------------------===// SuccessorRange::SuccessorRange(Block *block) : SuccessorRange(nullptr, 0) { @@ -295,3 +295,29 @@ SuccessorRange::SuccessorRange(Operation *term) : SuccessorRange(nullptr, 0) { if ((count = term->getNumSuccessors())) base = term->getBlockOperands().data(); } + +//===----------------------------------------------------------------------===// +// BlockRange +//===----------------------------------------------------------------------===// + +BlockRange::BlockRange(ArrayRef blocks) : BlockRange(nullptr, 0) { + if ((count = blocks.size())) + base = blocks.data(); +} + +BlockRange::BlockRange(SuccessorRange successors) + : BlockRange(successors.begin().getBase(), successors.size()) {} + +/// See `llvm::detail::indexed_accessor_range_base` for details. +BlockRange::OwnerT BlockRange::offset_base(OwnerT object, ptrdiff_t index) { + if (auto *operand = object.dyn_cast()) + return {operand + index}; + return {object.dyn_cast() + index}; +} + +/// See `llvm::detail::indexed_accessor_range_base` for details. +Block *BlockRange::dereference_iterator(OwnerT object, ptrdiff_t index) { + if (const auto *operand = object.dyn_cast()) + return operand[index].get(); + return object.dyn_cast()[index]; +} diff --git a/mlir/lib/IR/Operation.cpp b/mlir/lib/IR/Operation.cpp index b8f9e6c9fdfc4..f531a6097c257 100644 --- a/mlir/lib/IR/Operation.cpp +++ b/mlir/lib/IR/Operation.cpp @@ -71,29 +71,24 @@ OperationName OperationName::getFromOpaquePointer(void *pointer) { /// Create a new Operation with the specific fields. Operation *Operation::create(Location location, OperationName name, - ArrayRef resultTypes, - ArrayRef operands, + TypeRange resultTypes, ValueRange operands, ArrayRef attributes, - ArrayRef successors, - unsigned numRegions) { + BlockRange successors, unsigned numRegions) { return create(location, name, resultTypes, operands, MutableDictionaryAttr(attributes), successors, numRegions); } /// Create a new Operation from operation state. Operation *Operation::create(const OperationState &state) { - return Operation::create(state.location, state.name, state.types, - state.operands, state.attributes, state.successors, - state.regions); + return create(state.location, state.name, state.types, state.operands, + state.attributes, state.successors, state.regions); } /// Create a new Operation with the specific fields. 
Operation *Operation::create(Location location, OperationName name, - ArrayRef resultTypes, - ArrayRef operands, + TypeRange resultTypes, ValueRange operands, MutableDictionaryAttr attributes, - ArrayRef successors, - RegionRange regions) { + BlockRange successors, RegionRange regions) { unsigned numRegions = regions.size(); Operation *op = create(location, name, resultTypes, operands, attributes, successors, numRegions); @@ -106,11 +101,9 @@ Operation *Operation::create(Location location, OperationName name, /// Overload of create that takes an existing MutableDictionaryAttr to avoid /// unnecessarily uniquing a list of attributes. Operation *Operation::create(Location location, OperationName name, - ArrayRef resultTypes, - ArrayRef operands, + TypeRange resultTypes, ValueRange operands, MutableDictionaryAttr attributes, - ArrayRef successors, - unsigned numRegions) { + BlockRange successors, unsigned numRegions) { // We only need to allocate additional memory for a subset of results. unsigned numTrailingResults = OpResult::getNumTrailing(resultTypes.size()); unsigned numInlineResults = OpResult::getNumInline(resultTypes.size()); @@ -167,7 +160,7 @@ Operation *Operation::create(Location location, OperationName name, } Operation::Operation(Location location, OperationName name, - ArrayRef resultTypes, unsigned numSuccessors, + TypeRange resultTypes, unsigned numSuccessors, unsigned numRegions, const MutableDictionaryAttr &attributes, bool hasOperandStorage) @@ -611,8 +604,8 @@ Operation *Operation::cloneWithoutRegions(BlockAndValueMapping &mapper) { successors.push_back(mapper.lookupOrDefault(successor)); // Create the new operation. - auto *newOp = Operation::create(getLoc(), getName(), getResultTypes(), - operands, attrs, successors, getNumRegions()); + auto *newOp = create(getLoc(), getName(), getResultTypes(), operands, attrs, + successors, getNumRegions()); // Remember the mapping of any results. for (unsigned i = 0, e = getNumResults(); i != e; ++i) diff --git a/mlir/lib/IR/OperationSupport.cpp b/mlir/lib/IR/OperationSupport.cpp index ab84f4e8cf178..69aea3bfcf198 100644 --- a/mlir/lib/IR/OperationSupport.cpp +++ b/mlir/lib/IR/OperationSupport.cpp @@ -186,7 +186,7 @@ void OperationState::addOperands(ValueRange newOperands) { operands.append(newOperands.begin(), newOperands.end()); } -void OperationState::addSuccessors(SuccessorRange newSuccessors) { +void OperationState::addSuccessors(BlockRange newSuccessors) { successors.append(newSuccessors.begin(), newSuccessors.end()); } diff --git a/mlir/lib/IR/PatternMatch.cpp b/mlir/lib/IR/PatternMatch.cpp index a26bc63ed89d0..d1da8d1d8f263 100644 --- a/mlir/lib/IR/PatternMatch.cpp +++ b/mlir/lib/IR/PatternMatch.cpp @@ -10,9 +10,12 @@ #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Operation.h" #include "mlir/IR/Value.h" +#include "llvm/Support/Debug.h" using namespace mlir; +#define DEBUG_TYPE "pattern-match" + PatternBenefit::PatternBenefit(unsigned benefit) : representation(benefit) { assert(representation == benefit && benefit != ImpossibleToMatchSentinel && "This pattern match benefit is too large to represent"); @@ -207,8 +210,14 @@ void PatternApplicator::applyCostModel(CostModel model) { anyOpPatterns.clear(); for (const auto &pat : owningPatternList) { // If the pattern is always impossible to match, just ignore it. 
- if (pat->getBenefit().isImpossibleToMatch()) + if (pat->getBenefit().isImpossibleToMatch()) { + LLVM_DEBUG({ + llvm::dbgs() + << "Ignoring pattern '" << pat->getRootKind() + << "' because it is impossible to match (by pattern benefit)\n"; + }); continue; + } if (Optional opName = pat->getRootKind()) patterns[*opName].push_back(pat.get()); else @@ -223,8 +232,14 @@ void PatternApplicator::applyCostModel(CostModel model) { auto processPatternList = [&](SmallVectorImpl &list) { // Special case for one pattern in the list, which is the most common case. if (list.size() == 1) { - if (model(*list.front()).isImpossibleToMatch()) + if (model(*list.front()).isImpossibleToMatch()) { + LLVM_DEBUG({ + llvm::dbgs() << "Ignoring pattern '" << list.front()->getRootKind() + << "' because it is impossible to match or cannot lead " + "to legal IR (by cost model)\n"; + }); list.clear(); + } return; } @@ -236,8 +251,14 @@ void PatternApplicator::applyCostModel(CostModel model) { // Sort patterns with highest benefit first, and remove those that are // impossible to match. std::stable_sort(list.begin(), list.end(), cmp); - while (!list.empty() && benefits[list.back()].isImpossibleToMatch()) + while (!list.empty() && benefits[list.back()].isImpossibleToMatch()) { + LLVM_DEBUG({ + llvm::dbgs() << "Ignoring pattern '" << list.back()->getRootKind() + << "' because it is impossible to match or cannot lead to " + "legal IR (by cost model)\n"; + }); list.pop_back(); + } }; for (auto &it : patterns) processPatternList(it.second); diff --git a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp index fc79c820165d4..498486281c770 100644 --- a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp +++ b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp @@ -103,13 +103,13 @@ static LogicalResult verifyTypesAlongAllEdges( if (sourceNo) diag << "Region #" << sourceNo.getValue(); else - diag << op->getName(); + diag << "parent operands"; diag << " to "; if (succRegionNo) diag << "Region #" << succRegionNo.getValue(); else - diag << op->getName(); + diag << "parent results"; return diag; }; @@ -117,10 +117,9 @@ static LogicalResult verifyTypesAlongAllEdges( TypeRange succInputsTypes = succ.getSuccessorInputs().getTypes(); if (sourceTypes.size() != succInputsTypes.size()) { InFlightDiagnostic diag = op->emitOpError(" region control flow edge "); - return printEdgeName(diag) - << " has " << sourceTypes.size() - << " source operands, but target successor needs " - << succInputsTypes.size(); + return printEdgeName(diag) << ": source has " << sourceTypes.size() + << " operands, but target successor needs " + << succInputsTypes.size(); } for (auto typesIdx : @@ -130,8 +129,8 @@ static LogicalResult verifyTypesAlongAllEdges( if (sourceType != inputType) { InFlightDiagnostic diag = op->emitOpError(" along control flow edge "); return printEdgeName(diag) - << " source #" << typesIdx.index() << " type " << sourceType - << " should match input #" << typesIdx.index() << " type " + << ": source type #" << typesIdx.index() << " " << sourceType + << " should match input type #" << typesIdx.index() << " " << inputType; } } diff --git a/mlir/lib/Support/StorageUniquer.cpp b/mlir/lib/Support/StorageUniquer.cpp index 73578b5c91acf..a3e296e99e738 100644 --- a/mlir/lib/Support/StorageUniquer.cpp +++ b/mlir/lib/Support/StorageUniquer.cpp @@ -89,6 +89,9 @@ struct StorageUniquerImpl { // Parametric Storage //===--------------------------------------------------------------------===// + /// Check if an instance of a 
parametric storage class exists. + bool hasParametricStorage(TypeID id) { return parametricUniquers.count(id); } + /// Get or create an instance of a parametric type. BaseStorage * getOrCreate(TypeID id, unsigned hashValue, @@ -176,6 +179,9 @@ struct StorageUniquerImpl { return singletonInstance; } + /// Check if an instance of a singleton storage class exists. + bool hasSingleton(TypeID id) { return singletonInstances.count(id); } + //===--------------------------------------------------------------------===// // Instance Storage //===--------------------------------------------------------------------===// @@ -227,6 +233,16 @@ auto StorageUniquer::getSingletonImpl(TypeID id) -> BaseStorage * { return impl->getSingleton(id); } +/// Test if the singleton storage is initialized. +bool StorageUniquer::isSingletonStorageInitialized(TypeID id) { + return impl->hasSingleton(id); +} + +/// Test if the parametric storage is initialized. +bool StorageUniquer::isParametricStorageInitialized(TypeID id) { + return impl->hasParametricStorage(id); +} + /// Implementation for registering an instance of a derived type with default /// storage. void StorageUniquer::registerSingletonImpl( diff --git a/mlir/lib/TableGen/Attribute.cpp b/mlir/lib/TableGen/Attribute.cpp index e489174a38d91..f34d9c00b4388 100644 --- a/mlir/lib/TableGen/Attribute.cpp +++ b/mlir/lib/TableGen/Attribute.cpp @@ -126,7 +126,12 @@ StringRef Attribute::getDerivedCodeBody() const { } Dialect Attribute::getDialect() const { - return Dialect(def->getValueAsDef("dialect")); + const llvm::RecordVal *record = def->getValue("dialect"); + if (record && record->getValue()) { + if (DefInit *init = dyn_cast<DefInit>(record->getValue())) + return Dialect(init->getDef()); + } + return Dialect(nullptr); } ConstantAttr::ConstantAttr(const DefInit *init) : def(init->getDef()) { @@ -255,7 +260,7 @@ StringRef StructAttr::getStructClassName() const { } StringRef StructAttr::getCppNamespace() const { - Dialect dialect(def->getValueAsDef("structDialect")); + Dialect dialect(def->getValueAsDef("dialect")); return dialect.getCppNamespace(); } diff --git a/mlir/lib/TableGen/Dialect.cpp b/mlir/lib/TableGen/Dialect.cpp index 2b5f7e534ecc7..c17180c204833 100644 --- a/mlir/lib/TableGen/Dialect.cpp +++ b/mlir/lib/TableGen/Dialect.cpp @@ -16,6 +16,8 @@ using namespace mlir; using namespace mlir::tblgen; Dialect::Dialect(const llvm::Record *def) : def(def) { + if (def == nullptr) + return; for (StringRef dialect : def->getValueAsListOfStrings("dependentDialects")) dependentDialects.push_back(dialect); } diff --git a/mlir/lib/TableGen/OpClass.cpp b/mlir/lib/TableGen/OpClass.cpp index 47c520c28394b..ceb4f5ae82a39 100644 --- a/mlir/lib/TableGen/OpClass.cpp +++ b/mlir/lib/TableGen/OpClass.cpp @@ -9,50 +9,157 @@ #include "mlir/TableGen/OpClass.h" #include "mlir/TableGen/Format.h" +#include "llvm/ADT/Sequence.h" #include "llvm/ADT/Twine.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include <memory> + +#define DEBUG_TYPE "mlir-tblgen-opclass" using namespace mlir; using namespace mlir::tblgen; +namespace { + +// Returns the space to be emitted after the given C++ `type`: "" if `type` +// ends with '&' or '*', or is empty; otherwise " ". +StringRef getSpaceAfterType(StringRef type) { + return (type.empty() || type.endswith("&") || type.endswith("*")) ?
"" : " "; +} + +} // namespace + //===----------------------------------------------------------------------===// -// OpMethodSignature definitions +// OpMethodParameter definitions //===----------------------------------------------------------------------===// -OpMethodSignature::OpMethodSignature(StringRef retType, StringRef name, - StringRef params) - : returnType(retType), methodName(name), parameters(params) {} +void OpMethodParameter::writeTo(raw_ostream &os, bool emitDefault) const { + if (properties & PP_Optional) + os << "/*optional*/"; + os << type << getSpaceAfterType(type) << name; + if (emitDefault && !defaultValue.empty()) + os << " = " << defaultValue; +} -void OpMethodSignature::writeDeclTo(raw_ostream &os) const { - os << returnType << (elideSpaceAfterType(returnType) ? "" : " ") << methodName - << "(" << parameters << ")"; +//===----------------------------------------------------------------------===// +// OpMethodParameters definitions +//===----------------------------------------------------------------------===// + +// Factory methods to construct the correct type of `OpMethodParameters` +// object based on the arguments. +std::unique_ptr OpMethodParameters::create() { + return std::make_unique(); } -void OpMethodSignature::writeDefTo(raw_ostream &os, - StringRef namePrefix) const { +std::unique_ptr +OpMethodParameters::create(StringRef params) { + return std::make_unique(params); +} + +std::unique_ptr +OpMethodParameters::create(llvm::SmallVectorImpl &¶ms) { + return std::make_unique(std::move(params)); +} + +std::unique_ptr +OpMethodParameters::create(StringRef type, StringRef name, + StringRef defaultValue) { + return std::make_unique(type, name, defaultValue); +} + +//===----------------------------------------------------------------------===// +// OpMethodUnresolvedParameters definitions +//===----------------------------------------------------------------------===// +void OpMethodUnresolvedParameters::writeDeclTo(raw_ostream &os) const { + os << parameters; +} + +void OpMethodUnresolvedParameters::writeDefTo(raw_ostream &os) const { // We need to remove the default values for parameters in method definition. // TODO: We are using '=' and ',' as delimiters for parameter // initializers. This is incorrect for initializer list with more than one // element. Change to a more robust approach. - auto removeParamDefaultValue = [](StringRef params) { - std::string result; - std::pair parts; - while (!params.empty()) { - parts = params.split("="); - result.append(result.empty() ? "" : ", "); - result += parts.first; - params = parts.second.split(",").second; - } - return result; - }; + llvm::SmallVector tokens; + StringRef params = parameters; + while (!params.empty()) { + std::pair parts = params.split("="); + tokens.push_back(parts.first); + params = parts.second.split(',').second; + } + llvm::interleaveComma(tokens, os, [&](StringRef token) { os << token; }); +} + +//===----------------------------------------------------------------------===// +// OpMethodResolvedParameters definitions +//===----------------------------------------------------------------------===// - os << returnType << (elideSpaceAfterType(returnType) ? "" : " ") << namePrefix - << (namePrefix.empty() ? "" : "::") << methodName << "(" - << removeParamDefaultValue(parameters) << ")"; +// Returns true if a method with these parameters makes a method with parameters +// `other` redundant. 
This should return true only if all possible calls to the +// other method can be replaced by calls to this method. +bool OpMethodResolvedParameters::makesRedundant( + const OpMethodResolvedParameters &other) const { + const size_t otherNumParams = other.getNumParameters(); + const size_t thisNumParams = getNumParameters(); + + // All calls to the other method can be replaced by calls to this method only + // if this method has at least as many arguments as the other, and + // the common arguments have the same type. + if (thisNumParams < otherNumParams) + return false; + for (int idx : llvm::seq<int>(0, otherNumParams)) + if (parameters[idx].getType() != other.parameters[idx].getType()) + return false; + + // If all the common arguments have the same type, we can elide the other + // method if this method has the same number of arguments as the other or the + // first argument after the common ones has a default value (and by C++ + // requirement, all the later ones will also have a default value). + return thisNumParams == otherNumParams || + parameters[otherNumParams].hasDefaultValue(); } -bool OpMethodSignature::elideSpaceAfterType(StringRef type) { - return type.empty() || type.endswith("&") || type.endswith("*"); +void OpMethodResolvedParameters::writeDeclTo(raw_ostream &os) const { + llvm::interleaveComma(parameters, os, [&](const OpMethodParameter &param) { + param.writeDeclTo(os); + }); +} + +void OpMethodResolvedParameters::writeDefTo(raw_ostream &os) const { + llvm::interleaveComma(parameters, os, [&](const OpMethodParameter &param) { + param.writeDefTo(os); + }); +} + +//===----------------------------------------------------------------------===// +// OpMethodSignature definitions +//===----------------------------------------------------------------------===// + +// Returns true if a method with this signature makes a method with `other` +// signature redundant. Only supports resolved parameters. +bool OpMethodSignature::makesRedundant(const OpMethodSignature &other) const { + if (methodName != other.methodName) + return false; + auto *resolvedThis = dyn_cast<OpMethodResolvedParameters>(parameters.get()); + auto *resolvedOther = + dyn_cast<OpMethodResolvedParameters>(other.parameters.get()); + if (resolvedThis && resolvedOther) + return resolvedThis->makesRedundant(*resolvedOther); + return false; +} + +void OpMethodSignature::writeDeclTo(raw_ostream &os) const { + os << returnType << getSpaceAfterType(returnType) << methodName << "("; + parameters->writeDeclTo(os); + os << ")"; +} + +void OpMethodSignature::writeDefTo(raw_ostream &os, + StringRef namePrefix) const { + os << returnType << getSpaceAfterType(returnType) << namePrefix + << (namePrefix.empty() ? "" : "::") << methodName << "("; + parameters->writeDefTo(os); + os << ")"; } //===----------------------------------------------------------------------===// @@ -90,10 +197,6 @@ void OpMethodBody::writeTo(raw_ostream &os) const { // OpMethod definitions //===----------------------------------------------------------------------===// -OpMethod::OpMethod(StringRef retType, StringRef name, StringRef params, - OpMethod::Property property, bool declOnly) - : properties(property), isDeclOnly(declOnly), - methodSignature(retType, name, params), methodBody(declOnly) {} void OpMethod::writeDeclTo(raw_ostream &os) const { os.indent(2); if (isStatic()) @@ -103,9 +206,9 @@ void OpMethod::writeDeclTo(raw_ostream &os) const { } void OpMethod::writeDefTo(raw_ostream &os, StringRef namePrefix) const { - if (isDeclOnly) + // Do not write definition if the method is decl only.
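(Stepping back to the makesRedundant logic above: the elision rule can be shown with a small standalone model. The `Param` struct and the sample signatures below are hypothetical, not from the patch.)

    #include <string>
    #include <vector>

    struct Param {
      std::string type;
      bool hasDefault;
    };

    // Toy version of OpMethodResolvedParameters::makesRedundant: `self` makes
    // `other` redundant when the leading parameter types match and the first
    // extra parameter of `self`, if any, carries a default value.
    bool makesRedundant(const std::vector<Param> &self,
                        const std::vector<Param> &other) {
      if (self.size() < other.size())
        return false;
      for (size_t i = 0; i < other.size(); ++i)
        if (self[i].type != other[i].type)
          return false;
      return self.size() == other.size() || self[other.size()].hasDefault;
    }

    int main() {
      // build(b, state, attrs = {}) makes build(b, state) redundant: every
      // call site of the shorter overload also resolves against the longer.
      std::vector<Param> longer = {{"OpBuilder &", false},
                                   {"OperationState &", false},
                                   {"ArrayRef<NamedAttribute>", true}};
      std::vector<Param> shorter = {{"OpBuilder &", false},
                                    {"OperationState &", false}};
      return makesRedundant(longer, shorter) ? 0 : 1;
    }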
+ if (properties & MP_Declaration) return; - methodSignature.writeDefTo(os, namePrefix); os << " {\n"; methodBody.writeTo(os); @@ -122,7 +225,8 @@ void OpConstructor::addMemberInitializer(StringRef name, StringRef value) { } void OpConstructor::writeDefTo(raw_ostream &os, StringRef namePrefix) const { - if (isDeclOnly) + // Do not write definition if the method is decl only. + if (properties & MP_Declaration) return; methodSignature.writeDefTo(os, namePrefix); @@ -137,18 +241,6 @@ void OpConstructor::writeDefTo(raw_ostream &os, StringRef namePrefix) const { Class::Class(StringRef name) : className(name) {} -OpMethod &Class::newMethod(StringRef retType, StringRef name, StringRef params, - OpMethod::Property property, bool declOnly) { - methods.emplace_back(retType, name, params, property, declOnly); - return methods.back(); -} - -OpConstructor &Class::newConstructor(StringRef params, bool declOnly) { - constructors.emplace_back("", getClassName(), params, - OpMethod::MP_Constructor, declOnly); - return constructors.back(); -} - void Class::newField(StringRef type, StringRef name, StringRef defaultValue) { std::string varName = formatv("{0} {1}", type, name).str(); std::string field = defaultValue.empty() @@ -156,43 +248,42 @@ void Class::newField(StringRef type, StringRef name, StringRef defaultValue) { : formatv("{0} = {1}", varName, defaultValue).str(); fields.push_back(std::move(field)); } - void Class::writeDeclTo(raw_ostream &os) const { bool hasPrivateMethod = false; os << "class " << className << " {\n"; os << "public:\n"; - for (const auto &method : - llvm::concat(constructors, methods)) { + + forAllMethods([&](const OpMethod &method) { if (!method.isPrivate()) { method.writeDeclTo(os); os << '\n'; } else { hasPrivateMethod = true; } - } + }); + os << '\n'; os << "private:\n"; if (hasPrivateMethod) { - for (const auto &method : - llvm::concat(constructors, methods)) { + forAllMethods([&](const OpMethod &method) { if (method.isPrivate()) { method.writeDeclTo(os); os << '\n'; } - } + }); os << '\n'; } + for (const auto &field : fields) os.indent(2) << field << ";\n"; os << "};\n"; } void Class::writeDefTo(raw_ostream &os) const { - for (const auto &method : - llvm::concat(constructors, methods)) { + forAllMethods([&](const OpMethod &method) { method.writeDefTo(os, className); os << "\n\n"; - } + }); } //===----------------------------------------------------------------------===// @@ -217,14 +308,14 @@ void OpClass::writeDeclTo(raw_ostream &os) const { os << " using Adaptor = " << className << "Adaptor;\n"; bool hasPrivateMethod = false; - for (const auto &method : methods) { + forAllMethods([&](const OpMethod &method) { if (!method.isPrivate()) { method.writeDeclTo(os); os << "\n"; } else { hasPrivateMethod = true; } - } + }); // TODO: Add line control markers to make errors easier to debug. 
if (!extraClassDeclaration.empty()) @@ -232,12 +323,12 @@ void OpClass::writeDeclTo(raw_ostream &os) const { if (hasPrivateMethod) { os << "\nprivate:\n"; - for (const auto &method : methods) { + forAllMethods([&](const OpMethod &method) { if (method.isPrivate()) { method.writeDeclTo(os); os << "\n"; } - } + }); } os << "};\n"; diff --git a/mlir/lib/TableGen/Operator.cpp b/mlir/lib/TableGen/Operator.cpp index 0586cd837e073..24dffa36e13ee 100644 --- a/mlir/lib/TableGen/Operator.cpp +++ b/mlir/lib/TableGen/Operator.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FormatVariadic.h" @@ -278,7 +279,7 @@ void Operator::populateTypeInferenceInfo( // Skip cases currently being custom generated. // TODO: Remove special cases. - if (getTrait("OpTrait::SameOperandsAndResultType")) + if (getTrait("::mlir::OpTrait::SameOperandsAndResultType")) return; // We create equivalence classes of argument/result types where arguments @@ -565,6 +566,21 @@ void Operator::print(llvm::raw_ostream &os) const { } } +Operator::NamespaceEmitter::NamespaceEmitter(raw_ostream &os, Operator &op) + : os(os) { + auto dialect = op.getDialect(); + if (!dialect) + return; + llvm::SplitString(dialect.getCppNamespace(), namespaces, "::"); + for (StringRef ns : namespaces) + os << "namespace " << ns << " {\n"; +} + +Operator::NamespaceEmitter::~NamespaceEmitter() { + for (StringRef ns : llvm::reverse(namespaces)) + os << "} // namespace " << ns << "\n"; +} + auto Operator::VariableDecoratorIterator::unwrap(llvm::Init *init) -> VariableDecorator { return VariableDecorator(cast(init)->getDef()); diff --git a/mlir/lib/Transforms/BufferPlacement.cpp b/mlir/lib/Transforms/BufferPlacement.cpp index 1ab3e7e2e48dc..9f2c254f91e51 100644 --- a/mlir/lib/Transforms/BufferPlacement.cpp +++ b/mlir/lib/Transforms/BufferPlacement.cpp @@ -48,11 +48,10 @@ // will be freed in the end. // // TODO: -// The current implementation does not support loops and the resulting code will -// be invalid with respect to program semantics. The only thing that is -// currently missing is a high-level loop analysis that allows us to move allocs -// and deallocs outside of the loop blocks. Furthermore, it doesn't also accept -// functions which return buffers already. +// The current implementation does not support explicit-control-flow loops and +// the resulting code will be invalid with respect to program semantics. +// However, structured control-flow loops are fully supported. Furthermore, it +// doesn't accept functions which return buffers already. // //===----------------------------------------------------------------------===// @@ -77,6 +76,22 @@ static void walkReturnOperations(Region *region, const FuncT &func) { } } +/// Wrapper for the actual `RegionBranchOpInterface.getSuccessorRegions` +/// function that initializes the required `operandAttributes` array. +static void getSuccessorRegions(RegionBranchOpInterface regionInterface, + llvm::Optional index, + SmallVectorImpl &successors) { + // Create a list of null attributes for each operand to comply with the + // `getSuccessorRegions` interface definition that requires a single + // attribute per operand. + SmallVector operandAttributes( + regionInterface.getOperation()->getNumOperands()); + + // Get all successor regions using the temporarily allocated + // `operandAttributes`. 
+ regionInterface.getSuccessorRegions(index, operandAttributes, successors); +} + namespace { //===----------------------------------------------------------------------===// // BufferPlacementAliasAnalysis //===----------------------------------------------------------------------===// @@ -166,16 +181,10 @@ class BufferPlacementAliasAnalysis { // Query the RegionBranchOpInterface to find potential successor regions. op->walk([&](RegionBranchOpInterface regionInterface) { - // Create an empty attribute for each operand to comply with the - // `getSuccessorRegions` interface definition that requires a single - // attribute per operand. - SmallVector operandAttributes( - regionInterface.getOperation()->getNumOperands()); - // Extract all entry regions and wire all initial entry successor inputs. SmallVector entrySuccessors; - regionInterface.getSuccessorRegions(/*index=*/llvm::None, - operandAttributes, entrySuccessors); + getSuccessorRegions(regionInterface, /*index=*/llvm::None, + entrySuccessors); for (RegionSuccessor &entrySuccessor : entrySuccessors) { // Wire the entry region's successor arguments with the initial // successor inputs. @@ -191,8 +200,8 @@ class BufferPlacementAliasAnalysis { // Iterate over all successor region entries that are reachable from the // current region. SmallVector successorRegions; - regionInterface.getSuccessorRegions( - region.getRegionNumber(), operandAttributes, successorRegions); + getSuccessorRegions(regionInterface, region.getRegionNumber(), + successorRegions); for (RegionSuccessor &successorRegion : successorRegions) { // Iterate over all immediate terminator operations and wire the // successor inputs with the operands of each terminator. @@ -209,6 +218,83 @@ class BufferPlacementAliasAnalysis { ValueMapT aliases; }; +//===----------------------------------------------------------------------===// +// Backedges +//===----------------------------------------------------------------------===// + +/// A straightforward program analysis which detects loop backedges induced by +/// explicit control flow. +class Backedges { +public: + using BlockSetT = SmallPtrSet; + using BackedgeSetT = llvm::DenseSet<std::pair<Block *, Block *>>; + +public: + /// Constructs a new backedges analysis using the op provided. + Backedges(Operation *op) { recurse(op, op->getBlock()); } + + /// Returns the number of backedges formed by explicit control flow. + size_t size() const { return edgeSet.size(); } + + /// Returns the start iterator to loop over all backedges. + BackedgeSetT::const_iterator begin() const { return edgeSet.begin(); } + + /// Returns the end iterator to loop over all backedges. + BackedgeSetT::const_iterator end() const { return edgeSet.end(); } + +private: + /// Enters the current block and inserts a backedge into the `edgeSet` if we + /// have already visited the current block. The inserted edge links the given + /// `predecessor` with the `current` block. + bool enter(Block &current, Block *predecessor) { + bool inserted = visited.insert(&current).second; + if (!inserted) + edgeSet.insert(std::make_pair(predecessor, &current)); + return inserted; + } + + /// Leaves the current block. + void exit(Block &current) { visited.erase(&current); } + + /// Recurses into the given operation while taking all attached regions into + /// account. + void recurse(Operation *op, Block *predecessor) { + Block *current = op->getBlock(); + // If the current op implements the `BranchOpInterface`, there can be + // cycles in the scope of all successor blocks.
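The enter/exit pair above is a classic DFS backedge detector: an edge is a backedge exactly when it re-enters a block that is still on the traversal stack, which is what the recursion that follows implements. A self-contained sketch of the same idea over a plain adjacency list (toy graph types, not MLIR blocks):

    #include <cstdio>
    #include <set>
    #include <utility>
    #include <vector>

    using Graph = std::vector<std::vector<int>>;

    // Mirrors Backedges::enter/exit: edge pred->node is a backedge iff node
    // is already on the current DFS stack when pred reaches it.
    void dfs(const Graph &g, int node, std::set<int> &onStack,
             std::set<std::pair<int, int>> &backedges, int pred) {
      if (!onStack.insert(node).second) {
        backedges.insert({pred, node}); // re-entered an in-progress node
        return;
      }
      for (int succ : g[node])
        dfs(g, succ, onStack, backedges, node);
      onStack.erase(node); // corresponds to exit()
    }

    int main() {
      // 0 -> 1 -> 2 -> 1 forms a loop; 2 -> 1 is the only backedge.
      Graph g = {{1}, {2}, {1}};
      std::set<int> onStack;
      std::set<std::pair<int, int>> backedges;
      dfs(g, 0, onStack, backedges, /*pred=*/-1);
      for (const auto &e : backedges)
        std::printf("backedge %d -> %d\n", e.first, e.second);
    }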
+ if (isa<BranchOpInterface>(op)) { + for (Block *succ : current->getSuccessors()) + recurse(*succ, current); + } + // Recurse into all distinct regions and check for explicit control-flow + // loops. + for (Region &region : op->getRegions()) + recurse(region.front(), current); + } + + /// Recurses into explicit control-flow structures that are given by + /// the successor relation defined on the block level. + void recurse(Block &block, Block *predecessor) { + // Try to enter the current block. If this is not possible, we are + // currently processing this block and can safely return here. + if (!enter(block, predecessor)) + return; + + // Recurse into all operations and successor blocks. + for (auto &op : block.getOperations()) + recurse(&op, predecessor); + + // Leave the current block. + exit(block); + } + + /// Stores all blocks that are currently visited and on the processing stack. + BlockSetT visited; + + /// Stores all backedges in the format (source, target). + BackedgeSetT edgeSet; +}; + //===----------------------------------------------------------------------===// // BufferPlacement //===----------------------------------------------------------------------===// @@ -357,9 +443,14 @@ class BufferPlacement { for (Value value : it->second) { if (valuesToFree.count(value) > 0) continue; - // Check whether we have to free this particular block argument. - if (!dominators.dominates(definingBlock, value.getParentBlock())) { - toProcess.emplace_back(value, value.getParentBlock()); + Block *parentBlock = value.getParentBlock(); + // Check whether we have to free this particular block argument or + // generic value. We have to free the current alias if it is either + // defined in a non-dominated block or it is defined in the same block + // but the current value is not dominated by the source value. + if (!dominators.dominates(definingBlock, parentBlock) || + (definingBlock == parentBlock && value.isa<BlockArgument>())) { + toProcess.emplace_back(value, parentBlock); valuesToFree.insert(value); } else if (visitedValues.insert(std::make_tuple(value, definingBlock)) .second) @@ -431,22 +522,42 @@ class BufferPlacement { // argument belongs to the first block in a region and the parent operation // implements the RegionBranchOpInterface. Region *argRegion = block->getParent(); + Operation *parentOp = argRegion->getParentOp(); RegionBranchOpInterface regionInterface; if (!argRegion || &argRegion->front() != block || - !(regionInterface = - dyn_cast<RegionBranchOpInterface>(argRegion->getParentOp()))) + !(regionInterface = dyn_cast<RegionBranchOpInterface>(parentOp))) return; introduceCopiesForRegionSuccessors( - regionInterface, argRegion->getParentOp()->getRegions(), + regionInterface, argRegion->getParentOp()->getRegions(), blockArg, [&](RegionSuccessor &successorRegion) { // Find a predecessor of our argRegion. return successorRegion.getSuccessor() == argRegion; - }, - [&](RegionSuccessor &successorRegion) { - // The operand index will be the argument number. - return blockArg.getArgNumber(); }); + + // Check whether the block argument belongs to an entry region of the + // parent operation. In this case, we have to introduce an additional copy + // for the buffer that is passed to the argument.
+ SmallVector successorRegions; + getSuccessorRegions(regionInterface, llvm::None, successorRegions); + auto *it = + llvm::find_if(successorRegions, [&](RegionSuccessor &successorRegion) { + return successorRegion.getSuccessor() == argRegion; + }); + if (it == successorRegions.end()) + return; + + // Determine the actual operand to introduce a copy for and rewire the + // operand to point to the copy instead. + Value operand = + regionInterface.getSuccessorEntryOperands(argRegion->getRegionNumber()) + [llvm::find(it->getSuccessorInputs(), blockArg).getIndex()]; + Value copy = introduceBufferCopy(operand, parentOp); + + auto op = llvm::find(parentOp->getOperands(), operand); + assert(op != parentOp->getOperands().end() && + "parentOp does not contain operand"); + parentOp->setOperand(op.getIndex(), copy); } /// Introduces temporary allocs in front of all associated nested-region @@ -455,42 +566,34 @@ class BufferPlacement { // Get the actual result index in the scope of the parent terminator. Operation *operation = value.getDefiningOp(); auto regionInterface = cast(operation); - introduceCopiesForRegionSuccessors( - regionInterface, operation->getRegions(), - [&](RegionSuccessor &successorRegion) { - // Determine whether this region has a successor entry that leaves - // this region by returning to its parent operation. - return !successorRegion.getSuccessor(); - }, - [&](RegionSuccessor &successorRegion) { - // Find the associated success input index. - return llvm::find(successorRegion.getSuccessorInputs(), value) - .getIndex(); - }); + // Filter successors that return to the parent operation. + auto regionPredicate = [&](RegionSuccessor &successorRegion) { + // If the RegionSuccessor has no associated successor, it will return to + // its parent operation. + return !successorRegion.getSuccessor(); + }; + // Introduce a copy for all region "results" that are returned to the parent + // operation. This is required since the parent's result value has been + // considered critical. Therefore, the algorithm assumes that a copy of a + // previously allocated buffer is returned by the operation (like in the + // case of a block argument). + introduceCopiesForRegionSuccessors(regionInterface, operation->getRegions(), + value, regionPredicate); } /// Introduces buffer copies for all terminators in the given regions. The /// regionPredicate is applied to every successor region in order to restrict - /// the copies to specific regions. Thereby, the operandProvider is invoked - /// for each matching region successor and determines the operand index that - /// requires a buffer copy. - template - void - introduceCopiesForRegionSuccessors(RegionBranchOpInterface regionInterface, - MutableArrayRef regions, - const TPredicate ®ionPredicate, - const TOperandProvider &operandProvider) { - // Create an empty attribute for each operand to comply with the - // `getSuccessorRegions` interface definition that requires a single - // attribute per operand. - SmallVector operandAttributes( - regionInterface.getOperation()->getNumOperands()); + /// the copies to specific regions. + template + void introduceCopiesForRegionSuccessors( + RegionBranchOpInterface regionInterface, MutableArrayRef regions, + Value argValue, const TPredicate ®ionPredicate) { for (Region ®ion : regions) { // Query the regionInterface to get all successor regions of the current // one. 
SmallVector successorRegions; - regionInterface.getSuccessorRegions(region.getRegionNumber(), - operandAttributes, successorRegions); + getSuccessorRegions(regionInterface, region.getRegionNumber(), + successorRegions); // Try to find a matching region successor. RegionSuccessor *regionSuccessor = llvm::find_if(successorRegions, regionPredicate); @@ -498,7 +601,9 @@ class BufferPlacement { continue; // Get the operand index in the context of the current successor input // bindings. - auto operandIndex = operandProvider(*regionSuccessor); + size_t operandIndex = + llvm::find(regionSuccessor->getSuccessorInputs(), argValue) + .getIndex(); // Iterate over all immediate terminator operations to introduce // new buffer allocations. Thereby, the appropriate terminator operand @@ -518,6 +623,16 @@ class BufferPlacement { /// its content into the newly allocated buffer. The terminator operation is /// used to insert the alloc and copy operations at the right places. Value introduceBufferCopy(Value sourceValue, Operation *terminator) { + // Avoid multiple copies of the same source value. This can happen in the + // presence of loops when a branch acts as a backedge while also having + // another successor that returns to its parent operation. Note that + // copying copied buffers can introduce memory leaks since the invariant of + // BufferPlacement assumes that a buffer will be copied only once into a + // temporary buffer. Hence, the construction of copy chains introduces + // additional allocations that are not tracked automatically by the + // algorithm. + if (copiedValues.contains(sourceValue)) + return sourceValue; // Create a new alloc at the current location of the terminator. auto memRefType = sourceValue.getType().cast<MemRefType>(); OpBuilder builder(terminator); @@ -541,6 +656,8 @@ class BufferPlacement { // allocation to the new one. builder.create(terminator->getLoc(), sourceValue, alloc); + // Remember the copy of the original source value. + copiedValues.insert(alloc); return alloc; } @@ -652,6 +769,9 @@ class BufferPlacement { /// Maps allocation nodes to their associated blocks. AllocEntryList allocs; + // Stores already copied allocations to avoid additional copies of copies. + ValueSetT copiedValues; + /// The underlying liveness analysis to compute fine grained information /// about alloc and dealloc positions. Liveness liveness; @@ -673,6 +793,14 @@ class BufferPlacement { struct BufferPlacementPass : BufferPlacementBase<BufferPlacementPass> { void runOnFunction() override { + // Ensure that only supported loops are present. + Backedges backedges(getFunction()); + if (backedges.size()) { + getFunction().emitError( + "Only structured control-flow loops are supported."); + return; + } + // Place all required alloc, copy and dealloc nodes. BufferPlacement placement(getFunction()); placement.place(); @@ -681,20 +809,6 @@ struct BufferPlacementPass : BufferPlacementBase { } // end anonymous namespace -//===----------------------------------------------------------------------===// -// BufferAssignmentPlacer -//===----------------------------------------------------------------------===// - -/// Creates a new assignment placer. -BufferAssignmentPlacer::BufferAssignmentPlacer(Operation *op) : operation(op) {} - -/// Computes the actual position to place allocs for the given value.
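The copiedValues set threaded through introduceBufferCopy above effectively memoizes the copy per source buffer so copy chains cannot form; a tiny standalone model of that invariant (std::string stands in for mlir::Value, the names are illustrative only):

    #include <set>
    #include <string>

    using Value = std::string;

    struct CopyInserter {
      std::set<Value> copiedValues;
      int nextId = 0;

      // Mirrors introduceBufferCopy's guard: a value that is itself a copy we
      // created is returned unchanged instead of being copied again.
      Value introduceBufferCopy(const Value &source) {
        if (copiedValues.count(source))
          return source;
        Value copy = source + "_copy" + std::to_string(nextId++);
        copiedValues.insert(copy);
        return copy;
      }
    };

    int main() {
      CopyInserter inserter;
      Value first = inserter.introduceBufferCopy("buf");  // "buf_copy0"
      Value second = inserter.introduceBufferCopy(first); // still "buf_copy0"
      return first == second ? 0 : 1;
    }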
-OpBuilder::InsertPoint -BufferAssignmentPlacer::computeAllocPosition(OpResult result) { - Operation *owner = result.getOwner(); - return OpBuilder::InsertPoint(owner->getBlock(), Block::iterator(owner)); -} - //===----------------------------------------------------------------------===// // BufferAssignmentTypeConverter //===----------------------------------------------------------------------===// @@ -891,9 +1005,6 @@ LogicalResult BufferAssignmentCallOpConverter::matchAndRewrite( resultMapping.addMapping(newResultTypes.size() - 1); } else { // kind = BufferAssignmentTypeConverter::AppendToArgumentsList - OpBuilder::InsertionGuard guard(rewriter); - rewriter.restoreInsertionPoint( - bufferAssignment->computeAllocPosition(result.value())); MemRefType memref = converted.dyn_cast(); if (!memref) return callOp.emitError("Cannot allocate for a non-Memref type"); diff --git a/mlir/lib/Transforms/CopyRemoval.cpp b/mlir/lib/Transforms/CopyRemoval.cpp index ccfd02630ac28..c5a8da6329568 100644 --- a/mlir/lib/Transforms/CopyRemoval.cpp +++ b/mlir/lib/Transforms/CopyRemoval.cpp @@ -30,16 +30,35 @@ class CopyRemovalPass : public PassWrapper> { reuseCopySourceAsTarget(copyOp); reuseCopyTargetAsSource(copyOp); }); + for (std::pair &pair : replaceList) + pair.first.replaceAllUsesWith(pair.second); for (Operation *op : eraseList) op->erase(); } private: /// List of operations that need to be removed. - DenseSet eraseList; + llvm::SmallPtrSet eraseList; + + /// List of values that need to be replaced with their counterparts. + llvm::SmallDenseSet, 4> replaceList; + + /// Returns the allocation operation for `value` in `block` if it exists. + /// nullptr otherwise. + Operation *getAllocationOpInBlock(Value value, Block *block) { + assert(block && "Block cannot be null"); + Operation *op = value.getDefiningOp(); + if (op && op->getBlock() == block) { + auto effects = dyn_cast(op); + if (effects && effects.hasEffect()) + return op; + } + return nullptr; + } /// Returns the deallocation operation for `value` in `block` if it exists. - Operation *getDeallocationInBlock(Value value, Block *block) { + /// nullptr otherwise. 
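+  /// A deallocation is identified by scanning the users of `value` for an
+  /// operation located in `block` that frees it.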
+ Operation *getDeallocationOpInBlock(Value value, Block *block) { assert(block && "Block cannot be null"); auto valueUsers = value.getUsers(); auto it = llvm::find_if(valueUsers, [&](Operation *op) { @@ -119,9 +138,10 @@ class CopyRemovalPass : public PassWrapper> { Value to = copyOp.getTarget(); Operation *copy = copyOp.getOperation(); + Block *copyBlock = copy->getBlock(); Operation *fromDefiningOp = from.getDefiningOp(); - Operation *fromFreeingOp = getDeallocationInBlock(from, copy->getBlock()); - Operation *toDefiningOp = to.getDefiningOp(); + Operation *fromFreeingOp = getDeallocationOpInBlock(from, copyBlock); + Operation *toDefiningOp = getAllocationOpInBlock(to, copyBlock); if (!fromDefiningOp || !fromFreeingOp || !toDefiningOp || !areOpsInTheSameBlock({fromFreeingOp, toDefiningOp, copy}) || hasUsersBetween(to, toDefiningOp, copy) || @@ -129,7 +149,7 @@ class CopyRemovalPass : public PassWrapper> { hasMemoryEffectOpBetween(copy, fromFreeingOp)) return; - to.replaceAllUsesWith(from); + replaceList.insert({to, from}); eraseList.insert(copy); eraseList.insert(toDefiningOp); eraseList.insert(fromFreeingOp); @@ -169,8 +189,9 @@ class CopyRemovalPass : public PassWrapper> { Value to = copyOp.getTarget(); Operation *copy = copyOp.getOperation(); - Operation *fromDefiningOp = from.getDefiningOp(); - Operation *fromFreeingOp = getDeallocationInBlock(from, copy->getBlock()); + Block *copyBlock = copy->getBlock(); + Operation *fromDefiningOp = getAllocationOpInBlock(from, copyBlock); + Operation *fromFreeingOp = getDeallocationOpInBlock(from, copyBlock); if (!fromDefiningOp || !fromFreeingOp || !areOpsInTheSameBlock({fromFreeingOp, fromDefiningOp, copy}) || hasUsersBetween(to, fromDefiningOp, copy) || @@ -178,7 +199,7 @@ class CopyRemovalPass : public PassWrapper> { hasMemoryEffectOpBetween(copy, fromFreeingOp)) return; - from.replaceAllUsesWith(to); + replaceList.insert({from, to}); eraseList.insert(copy); eraseList.insert(fromDefiningOp); eraseList.insert(fromFreeingOp); diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp index db6a071367d6c..cf79e267fb8ad 100644 --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -418,10 +418,559 @@ LogicalResult mlir::affineForOpBodySkew(AffineForOp forOp, return success(); } -// Collect perfectly nested loops starting from `rootForOps`. Loops are -// perfectly nested if each loop is the first and only non-terminator operation -// in the parent loop. Collect at most `maxLoops` loops and append them to -// `forOps`. +/// Checks the legality of tiling of a hyper-rectangular loop nest by simply +/// checking if there is a 'negative' dependence in the memrefs present in +/// the loop nest. If yes then tiling is invalid. +static bool +checkTilingLegalityImpl(MutableArrayRef origLoops) { + assert(!origLoops.empty() && "no original loops provided"); + + // We first find out all dependences we intend to check. 
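+  // For intuition (hypothetical example): a two-dimensional nest containing
+  //   A[i][j] = A[i - 1][j + 1]
+  // has the dependence vector (1, -1); the negative second component makes
+  // rectangular tiles depend on each other, so the check below must fail.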
+  SmallVector loadAndStoreOps;
+  origLoops[0].getOperation()->walk([&](Operation *op) {
+    if (isa(op))
+      loadAndStoreOps.push_back(op);
+  });
+
+  unsigned numOps = loadAndStoreOps.size();
+  unsigned numLoops = origLoops.size();
+  FlatAffineConstraints dependenceConstraints;
+  for (unsigned d = 1; d <= numLoops + 1; ++d) {
+    for (unsigned i = 0; i < numOps; ++i) {
+      Operation *srcOp = loadAndStoreOps[i];
+      MemRefAccess srcAccess(srcOp);
+      for (unsigned j = 0; j < numOps; ++j) {
+        Operation *dstOp = loadAndStoreOps[j];
+        MemRefAccess dstAccess(dstOp);
+
+        SmallVector depComps;
+        dependenceConstraints.reset();
+        DependenceResult result = checkMemrefAccessDependence(
+            srcAccess, dstAccess, d, &dependenceConstraints, &depComps);
+
+        // Skip if there is no dependence in this case.
+        if (!hasDependence(result))
+          continue;
+
+        // Check whether there is any negative direction vector in the
+        // dependence components found above, which means that the dependence
+        // is violated by the default hyper-rectangular tiling method.
+        LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated "
+                                   "for dependence at depth: "
+                                << Twine(d) << " between:\n";);
+        LLVM_DEBUG(srcAccess.opInst->dump(););
+        LLVM_DEBUG(dstAccess.opInst->dump(););
+        for (unsigned k = 0, e = depComps.size(); k < e; k++) {
+          DependenceComponent depComp = depComps[k];
+          if (depComp.lb.hasValue() && depComp.ub.hasValue() &&
+              depComp.lb.getValue() < depComp.ub.getValue() &&
+              depComp.ub.getValue() < 0) {
+            LLVM_DEBUG(llvm::dbgs()
+                       << "Dependence component lb = "
+                       << Twine(depComp.lb.getValue())
+                       << " ub = " << Twine(depComp.ub.getValue())
+                       << " is negative at depth: " << Twine(d)
+                       << " and thus violates the legality rule.\n");
+            return false;
+          }
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+/// Checks whether hyper-rectangular loop tiling of the nest represented by
+/// `origLoops` is valid. The validity condition is from Irigoin and Triolet,
+/// which states that two tiles cannot depend on each other. We simplify this
+/// condition to just checking whether there is any negative dependence
+/// direction, since we have the prior knowledge that the tiling results will
+/// be hyper-rectangles, which are scheduled in the lexicographically
+/// increasing order on the vector of loop indices. This function will return
+/// failure when any dependence component is negative along any of `origLoops`.
+LogicalResult
+checkTilingLegality(MutableArrayRef origLoops) {
+  return success(checkTilingLegalityImpl(origLoops));
+}
+
+/// Check if the input data is valid and whether the tiled code will be legal.
+template
+void performPreTilingChecks(MutableArrayRef input,
+                            ArrayRef tileSizes) {
+  // Check if the supplied for ops are all successively nested.
+  assert(!input.empty() && "no loops in input band");
+  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
+
+  assert(isPerfectlyNested(input) && "input loops not perfectly nested");
+
+  // Perform tiling legality test.
+  if (failed(checkTilingLegality(input)))
+    input[0].emitRemark("tiled code is illegal due to dependences");
+}
+
+/// Move the loop body of AffineForOp 'src' from 'src' into the specified
+/// location in destination's body, ignoring the terminator.
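+/// (The splice deliberately stops at std::prev(ops.end()) so that the
+/// terminator stays with 'src' and 'src' remains structurally valid.)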
+static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
+                             Block::iterator loc) {
+  auto &ops = src.getBody()->getOperations();
+  dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
+                                         std::prev(ops.end()));
+}
+
+/// Move the loop body of AffineForOp 'src' from 'src' to the start of dest's
+/// body.
+void moveLoopBody(AffineForOp src, AffineForOp dest) {
+  moveLoopBodyImpl(src, dest, dest.getBody()->begin());
+}
+
+/// Constructs the tiled loop nest without setting the loop bounds, and moves
+/// the body of the original loop nest into the tiled loop nest.
+void constructTiledLoopNest(MutableArrayRef origLoops,
+                            AffineForOp rootAffineForOp, unsigned width,
+                            MutableArrayRef tiledLoops) {
+  Location loc = rootAffineForOp.getLoc();
+
+  // The outermost loop among those added so far; updated as more loops are
+  // created above it.
+  Operation *topLoop = rootAffineForOp.getOperation();
+  AffineForOp innermostPointLoop;
+
+  // Add intra-tile (or point) loops.
+  for (unsigned i = 0; i < width; i++) {
+    OpBuilder b(topLoop);
+    // Loop bounds will be set later.
+    AffineForOp pointLoop = b.create(loc, 0, 0);
+    pointLoop.getBody()->getOperations().splice(
+        pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
+        topLoop);
+    tiledLoops[2 * width - 1 - i] = pointLoop;
+    topLoop = pointLoop.getOperation();
+    if (i == 0)
+      innermostPointLoop = pointLoop;
+  }
+
+  // Add tile space loops.
+  for (unsigned i = width; i < 2 * width; i++) {
+    OpBuilder b(topLoop);
+    // Loop bounds will be set later.
+    AffineForOp tileSpaceLoop = b.create(loc, 0, 0);
+    tileSpaceLoop.getBody()->getOperations().splice(
+        tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
+        topLoop);
+    tiledLoops[2 * width - i - 1] = tileSpaceLoop;
+    topLoop = tileSpaceLoop.getOperation();
+  }
+
+  // Move the loop body of the original nest to the new one.
+  moveLoopBody(origLoops.back(), innermostPointLoop);
+}
+
+/// Checks whether a loop nest is hyper-rectangular.
+LogicalResult checkIfHyperRectangular(MutableArrayRef input,
+                                      AffineForOp rootAffineForOp,
+                                      unsigned width) {
+  FlatAffineConstraints cst;
+  SmallVector ops(input.begin(), input.end());
+  getIndexSet(ops, &cst);
+  if (!cst.isHyperRectangular(0, width)) {
+    rootAffineForOp.emitError("tiled code generation unimplemented for the "
+                              "non-hyperrectangular case");
+    return failure();
+  }
+  return success();
+}
+
+/// Set lower and upper bounds of intra-tile loops for parametric tiling.
+// TODO: Handle non-constant lower bounds.
+static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
+                                         AffineForOp newInterTileLoop,
+                                         AffineForOp newIntraTileLoop,
+                                         Value tileSize) {
+  // The lower bound for the intra-tile loop is represented by the affine map
+  // (%i, %t0) -> ((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound
+  // for the intra-tile loop is represented by the affine map
+  // (%i, %t0) -> ((%i - %origlb) * %t0 + %t0 * %origLoopStep + %origlb),
+  // where %i is the loop IV of the corresponding inter-tile loop, %t0 is the
+  // corresponding tiling parameter, %origlb is the lower bound, and
+  // %origLoopStep is the loop step of the corresponding inter-tile loop.
+
+  assert(origLoop.hasConstantLowerBound() &&
+         "expected input loops to have constant lower bound.");
+
+  // Get lower bound of original loop as an affine expression.
+  AffineExpr origLowerBoundExpr;
+  origLowerBoundExpr =
+      b.getAffineConstantExpr(origLoop.getConstantLowerBound());
+
+  // Add dim operands from original lower/upper bound.
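+  // The operand lists built below are laid out as (original dims..., new IV
+  // dim, original symbols..., tile size symbol), matching the dim/symbol
+  // numbering used by the bound expressions constructed afterwards.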
+  SmallVector lbOperands, ubOperands;
+  AffineBound lb = origLoop.getLowerBound();
+  AffineBound ub = origLoop.getUpperBound();
+  lbOperands.reserve(lb.getNumOperands() + 2);
+  ubOperands.reserve(ub.getNumOperands() + 2);
+  AffineMap origLbMap = lb.getMap();
+  AffineMap origUbMap = ub.getMap();
+  for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
+    lbOperands.push_back(lb.getOperand(j));
+  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
+    ubOperands.push_back(ub.getOperand(j));
+
+  // Add a new dim operand in lb/ubOperands corresponding to the origLoop
+  // IV.
+  lbOperands.push_back(newInterTileLoop.getInductionVar());
+  ubOperands.push_back(newInterTileLoop.getInductionVar());
+
+  // Get the loop IV as an affine expression for the lower/upper bound. The
+  // size of lb/ubOperands is guaranteed to be at least one.
+  AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
+  AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
+
+  // Add symbol operands from original lower/upper bound.
+  for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
+    lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
+  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
+    ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
+
+  // Add a new symbol operand which is the tile size for this loop.
+  lbOperands.push_back(tileSize);
+  ubOperands.push_back(tileSize);
+
+  SmallVector lbBoundExprs;
+  SmallVector ubBoundExprs;
+  lbBoundExprs.reserve(origLbMap.getNumResults());
+  ubBoundExprs.reserve(origUbMap.getNumResults());
+
+  // Get the tiling parameter as an affine expression for lb/ub.
+  AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
+  AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
+
+  // Insert the lb expression:
+  // ((inter-tile loop IV - origlb) * tilingParameter) + origlb.
+  lbBoundExprs.push_back(
+      ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
+      origLowerBoundExpr);
+
+  // Get the origLoopStep as an affine expression.
+  AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStep());
+
+  // Insert the ub expression:
+  // ((inter-tile loop IV - origlb) * tilingParameter) +
+  // (tilingParameter * origLoopStep) + origlb.
+  ubBoundExprs.push_back(
+      ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
+      (ubTileParameter * origLoopStep) + origLowerBoundExpr);
+
+  ubBoundExprs.append(origUbMap.getResults().begin(),
+                      origUbMap.getResults().end());
+
+  AffineMap lbMap =
+      AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
+                     lbBoundExprs, b.getContext());
+  newIntraTileLoop.setLowerBound(lbOperands, lbMap);
+
+  AffineMap ubMap =
+      AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
+                     ubBoundExprs, b.getContext());
+  newIntraTileLoop.setUpperBound(ubOperands, ubMap);
+
+  // The original loop step must be preserved.
+  newIntraTileLoop.setStep(origLoop.getStep());
+}
+
+/// Set lower and upper bounds of inter-tile loops for parametric tiling.
+// TODO: Handle non-constant lower bounds.
+static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
+                                         AffineForOp newLoop, Value tileSize) {
+  OperandRange newLbOperands = origLoop.getLowerBoundOperands();
+
+  // The lower bounds for inter-tile loops are the same as the corresponding
+  // lower bounds of the original loops.
+  newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());
+
+  // The new upper bound map for inter-tile loops, assuming constant lower
+  // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
+  // originalLowerBound), tilingParameter), where the tiling parameter is the
+  // respective tile size for that loop. For example, if the original ubmap
+  // was ()->(1024), the new map will be
+  // ()[s0] -> (lb + (1024 - lb) ceildiv s0), where s0 is the tiling
+  // parameter. Therefore a new symbol operand is inserted in the map and the
+  // result expression is overwritten.
+
+  assert(origLoop.hasConstantLowerBound() &&
+         "expected input loops to have constant lower bound.");
+
+  // Get lower bound of original loop as an affine expression.
+  AffineExpr origLowerBoundExpr;
+  origLowerBoundExpr =
+      b.getAffineConstantExpr(origLoop.getConstantLowerBound());
+
+  // Add dim operands from original upper bound.
+  SmallVector ubOperands;
+  AffineBound ub = origLoop.getUpperBound();
+  ubOperands.reserve(ub.getNumOperands() + 1);
+  AffineMap origUbMap = ub.getMap();
+  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
+    ubOperands.push_back(ub.getOperand(j));
+
+  // Add symbol operands from original upper bound.
+  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
+    ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
+
+  // Add a new symbol operand which is the tile size for this loop.
+  ubOperands.push_back(tileSize);
+
+  // Get the tiling parameter as an affine expression.
+  AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
+
+  SmallVector boundExprs;
+  boundExprs.reserve(origUbMap.getNumResults());
+  int64_t origUpperBound;
+  AffineExpr origUpperBoundExpr;
+
+  // If the upper bound for the original loop is constant, then the constant
+  // can be obtained as an affine expression straight away.
+  if (origLoop.hasConstantUpperBound()) {
+    origUpperBound = origLoop.getConstantUpperBound();
+
+    // Get the original constant upper bound as an affine expression.
+    origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);
+
+    // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
+    // originalLowerBound), tilingParameter).
+    boundExprs.push_back(
+        origLowerBoundExpr +
+        (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
+  } else {
+    // If the upper bound for the original loop is not constant, then two
+    // cases are possible, although their handling is the same:
+    // 1) The ubmap has only one result expression. For example,
+    //    affine.for %i = 5 to %ub
+    //
+    // A symbol operand is added which represents the tiling parameter. The
+    // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
+    // where 's0' is the original upper bound and 's1' is the tiling
+    // parameter.
+    // 2) The ubMap has more than one result expression. For example,
+    //    #map0 = affine_map<()[s0, s1] -> (s0, s1)>
+    //    affine.for %i = 5 to min #map0()[%s0, %s1]
+    //
+    // A symbol operand is added which represents the tiling parameter. The
+    // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
+    // (s1 - 5) ceildiv s2 + 5), where s2 is the tiling parameter.
+
+    // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
+    // originalLowerBound), tilingParameter).
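+    // Each result of the original upper-bound map is rewritten independently
+    // below; since affine.for takes the min over the results of its upper
+    // bound map, the min semantics carry over to the rewritten tile counts.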
+    for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
+      boundExprs.push_back(
+          origLowerBoundExpr +
+          (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
+  }
+
+  AffineMap ubMap =
+      AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
+                     boundExprs, b.getContext());
+  newLoop.setUpperBound(ubOperands, ubMap);
+
+  // The original loop step must be preserved.
+  newLoop.setStep(origLoop.getStep());
+}
+
+/// Constructs and sets new loop bounds after tiling for the case of
+/// hyper-rectangular index sets, where the bounds of one dimension do not
+/// depend on other dimensions and the tiling parameters are captured from SSA
+/// values. Bounds of each dimension can thus be treated independently,
+/// and deriving the new bounds is much simpler and faster than for the case
+/// of tiling arbitrary polyhedral shapes.
+static void constructParametricallyTiledIndexSetHyperRect(
+    MutableArrayRef origLoops,
+    MutableArrayRef newLoops, ArrayRef tileSizes) {
+  assert(!origLoops.empty() && "expected at least one loop in band");
+  assert(origLoops.size() == tileSizes.size() &&
+         "expected tiling parameter for each loop in band.");
+
+  OpBuilder b(origLoops[0].getOperation());
+  unsigned width = origLoops.size();
+
+  // Set bounds for tile space loops.
+  for (unsigned i = 0; i < width; ++i) {
+    setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
+  }
+
+  // Set bounds for intra-tile loops.
+  for (unsigned i = 0; i < width; ++i) {
+    setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
+                                 newLoops[i + width], tileSizes[i]);
+  }
+}
+
+/// Constructs and sets new loop bounds after tiling for the case of
+/// hyper-rectangular index sets, where the bounds of one dimension do not
+/// depend on other dimensions. Bounds of each dimension can thus be treated
+/// independently, and deriving the new bounds is much simpler and faster
+/// than for the case of tiling arbitrary polyhedral shapes.
+static void
+constructTiledIndexSetHyperRect(MutableArrayRef origLoops,
+                                MutableArrayRef newLoops,
+                                ArrayRef tileSizes) {
+  assert(!origLoops.empty());
+  assert(origLoops.size() == tileSizes.size());
+
+  OpBuilder b(origLoops[0].getOperation());
+  unsigned width = origLoops.size();
+
+  // Bounds for tile space loops.
+  for (unsigned i = 0; i < width; i++) {
+    OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
+    OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
+    newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
+    newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
+    newLoops[i].setStep(tileSizes[i]);
+  }
+  // Bounds for intra-tile loops.
+  for (unsigned i = 0; i < width; i++) {
+    int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
+    Optional mayBeConstantCount = getConstantTripCount(origLoops[i]);
+    // The lower bound is just the tile-space loop.
+    AffineMap lbMap = b.getDimIdentityMap();
+    newLoops[width + i].setLowerBound(
+        /*operands=*/newLoops[i].getInductionVar(), lbMap);
+
+    // Set the upper bound.
+    if (mayBeConstantCount && mayBeConstantCount.getValue() < tileSizes[i]) {
+      // The trip count is less than the tile size: the upper bound is the
+      // lower bound + the trip count.
+      AffineMap ubMap =
+          b.getSingleDimShiftAffineMap(mayBeConstantCount.getValue());
+      newLoops[width + i].setUpperBound(
+          /*operands=*/newLoops[i].getInductionVar(), ubMap);
+    } else if (largestDiv % tileSizes[i] != 0) {
+      // The intra-tile loop ii goes from i to min(i + tileSize, ub_i).
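+      // E.g., with a (hypothetical) tile size of 32, this yields
+      //   affine.for %ii = %i to min(%i + 32, %ub_i)
+      // so the last, partial tile is clamped to the original upper bound.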
+      // Construct the upper bound map; the operands are the original operands
+      // with 'i' (the tile-space loop) appended to them. The new upper bound
+      // map is the original one with an additional expression i + tileSize
+      // appended.
+
+      // Add dim operands from the original upper bound.
+      SmallVector ubOperands;
+      AffineBound ub = origLoops[i].getUpperBound();
+      ubOperands.reserve(ub.getNumOperands() + 1);
+      AffineMap origUbMap = ub.getMap();
+      for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
+        ubOperands.push_back(ub.getOperand(j));
+
+      // Add a dim operand for the new loop upper bound.
+      ubOperands.push_back(newLoops[i].getInductionVar());
+
+      // Add symbol operands from the original upper bound.
+      for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
+        ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
+
+      SmallVector boundExprs;
+      boundExprs.reserve(1 + origUbMap.getNumResults());
+      AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
+      // The new upper bound map is the original one with an additional
+      // expression i + tileSize appended.
+      boundExprs.push_back(dim + tileSizes[i]);
+      boundExprs.append(origUbMap.getResults().begin(),
+                        origUbMap.getResults().end());
+      AffineMap ubMap =
+          AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
+                         boundExprs, b.getContext());
+      newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
+    } else {
+      // No need for the min expression.
+      AffineExpr dim = b.getAffineDimExpr(0);
+      AffineMap ubMap = AffineMap::get(1, 0, dim + tileSizes[i]);
+      newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
+    }
+  }
+}
+
+/// Tiles the specified band of perfectly nested loops, creating tile-space
+/// loops and intra-tile loops. A band is a contiguous set of loops.
+// TODO: handle non-hyper-rectangular spaces.
+LogicalResult
+mlir::tilePerfectlyNested(MutableArrayRef input,
+                          ArrayRef tileSizes,
+                          SmallVectorImpl *tiledNest) {
+  performPreTilingChecks(input, tileSizes);
+
+  MutableArrayRef origLoops = input;
+  AffineForOp rootAffineForOp = origLoops[0];
+  // Note that width is at least one since the band isn't empty.
+  unsigned width = input.size();
+  SmallVector tiledLoops(2 * width);
+
+  // Construct a tiled loop nest without setting the loop bounds. Bounds are
+  // set later.
+  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
+
+  SmallVector origLoopIVs;
+  extractForInductionVars(input, &origLoopIVs);
+
+  if (failed(checkIfHyperRectangular(input, rootAffineForOp, width)))
+    return failure();
+
+  // Set loop bounds for the tiled loop nest.
+  constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
+
+  // Replace original IVs with intra-tile loop IVs.
+  for (unsigned i = 0; i < width; i++)
+    origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
+
+  // Erase the old loop nest.
+  rootAffineForOp.erase();
+
+  if (tiledNest)
+    *tiledNest = std::move(tiledLoops);
+
+  return success();
+}
+
+/// Tiles the specified band of perfectly nested loops, creating tile-space
+/// loops and intra-tile loops, using SSA values as tiling parameters. A band
+/// is a contiguous set of loops.
+// TODO: handle non-hyper-rectangular spaces.
+LogicalResult
+mlir::tilePerfectlyNestedParametric(MutableArrayRef input,
+                                    ArrayRef tileSizes,
+                                    SmallVectorImpl *tiledNest) {
+  performPreTilingChecks(input, tileSizes);
+
+  MutableArrayRef origLoops = input;
+  AffineForOp rootAffineForOp = origLoops[0];
+  // Note that width is at least one since the band isn't empty.
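+  // For a band of width w, tiledLoops will hold 2 * w loops: the w outer
+  // tile-space loops followed by the w inner intra-tile (point) loops.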
+  unsigned width = input.size();
+  SmallVector tiledLoops(2 * width);
+
+  // Construct a tiled loop nest without setting the loop bounds. Bounds are
+  // set later.
+  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
+
+  SmallVector origLoopIVs;
+  extractForInductionVars(input, &origLoopIVs);
+
+  if (failed(checkIfHyperRectangular(input, rootAffineForOp, width)))
+    return failure();
+
+  // Set loop bounds for the tiled loop nest.
+  constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
+                                                tileSizes);
+
+  // Replace original IVs with intra-tile loop IVs.
+  for (unsigned i = 0; i < width; i++)
+    origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
+
+  // Erase the old loop nest.
+  rootAffineForOp.erase();
+
+  if (tiledNest)
+    *tiledNest = std::move(tiledLoops);
+
+  return success();
+}
+
+/// Collect perfectly nested loops starting from `rootForOp`. Loops are
+/// perfectly nested if each loop is the first and only non-terminator
+/// operation in the parent loop. Collect at most `maxLoops` loops and append
+/// them to `forOps`.
 template
 static void getPerfectlyNestedLoopsImpl(
     SmallVectorImpl &forOps, T rootForOp,
@@ -452,6 +1001,20 @@ void mlir::getPerfectlyNestedLoops(SmallVectorImpl &nestedLoops,
   getPerfectlyNestedLoopsImpl(nestedLoops, root);
 }

+/// Identify valid and profitable bands of loops to tile. This is currently
+/// just a temporary placeholder to test the mechanics of tiled code
+/// generation. Returns all maximal outermost perfect loop nests to tile.
+void mlir::getTileableBands(FuncOp f,
+                            std::vector> *bands) {
+  // Get the maximal perfect nest of 'affine.for' ops starting from the root
+  // (inclusive).
+  for (AffineForOp forOp : f.getOps()) {
+    SmallVector band;
+    getPerfectlyNestedLoops(band, forOp);
+    bands->push_back(band);
+  }
+}
+
+/// Unrolls this loop completely.
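+/// (Complete unrolling is only possible when the trip count is a known
+/// constant; otherwise the transformation fails and the loop is unchanged.)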
LogicalResult mlir::loopUnrollFull(AffineForOp forOp) { Optional mayBeConstantTripCount = getConstantTripCount(forOp); @@ -469,7 +1032,6 @@ LogicalResult mlir::loopUnrollFull(AffineForOp forOp) { LogicalResult mlir::loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor) { Optional mayBeConstantTripCount = getConstantTripCount(forOp); - if (mayBeConstantTripCount.hasValue() && mayBeConstantTripCount.getValue() < unrollFactor) return loopUnrollByFactor(forOp, mayBeConstantTripCount.getValue()); diff --git a/mlir/test/CAPI/ir.c b/mlir/test/CAPI/ir.c index 0a8ebae4e19e0..01b007e717835 100644 --- a/mlir/test/CAPI/ir.c +++ b/mlir/test/CAPI/ir.c @@ -10,6 +10,7 @@ /* RUN: mlir-capi-ir-test 2>&1 | FileCheck %s */ +#include "mlir-c/AffineMap.h" #include "mlir-c/IR.h" #include "mlir-c/Registration.h" #include "mlir-c/StandardAttributes.h" @@ -408,31 +409,36 @@ int printStandardAttributes(MlirContext ctx) { mlirAttributeDump(boolean); const char data[] = "abcdefghijklmnopqestuvwxyz"; - char buffer[10]; MlirAttribute opaque = mlirOpaqueAttrGet(ctx, "std", 3, data, mlirNoneTypeGet(ctx)); if (!mlirAttributeIsAOpaque(opaque) || strcmp("std", mlirOpaqueAttrGetDialectNamespace(opaque))) return 4; - mlirOpaqueAttrGetData(opaque, callbackSetFixedLengthString, buffer); - if (buffer[0] != 'a' || buffer[1] != 'b' || buffer[2] != 'c') + + MlirStringRef opaqueData = mlirOpaqueAttrGetData(opaque); + if (opaqueData.length != 3 || + strncmp(data, opaqueData.data, opaqueData.length)) return 5; mlirAttributeDump(opaque); MlirAttribute string = mlirStringAttrGet(ctx, 2, data + 3); if (!mlirAttributeIsAString(string)) return 6; - mlirStringAttrGetValue(string, callbackSetFixedLengthString, buffer); - if (buffer[0] != 'd' || buffer[1] != 'e') + + MlirStringRef stringValue = mlirStringAttrGetValue(string); + if (stringValue.length != 2 || + strncmp(data + 3, stringValue.data, stringValue.length)) return 7; mlirAttributeDump(string); MlirAttribute flatSymbolRef = mlirFlatSymbolRefAttrGet(ctx, 3, data + 5); if (!mlirAttributeIsAFlatSymbolRef(flatSymbolRef)) return 8; - mlirFloatSymbolRefAttrGetValue(flatSymbolRef, callbackSetFixedLengthString, - buffer); - if (buffer[0] != 'f' || buffer[1] != 'g' || buffer[2] != 'h') + + MlirStringRef flatSymbolRefValue = + mlirFlatSymbolRefAttrGetValue(flatSymbolRef); + if (flatSymbolRefValue.length != 3 || + strncmp(data + 5, flatSymbolRefValue.data, flatSymbolRefValue.length)) return 9; mlirAttributeDump(flatSymbolRef); @@ -445,12 +451,13 @@ int printStandardAttributes(MlirContext ctx) { !mlirAttributeEqual(mlirSymbolRefAttrGetNestedReference(symbolRef, 1), flatSymbolRef)) return 10; - mlirSymbolRefAttrGetLeafReference(symbolRef, callbackSetFixedLengthString, - buffer); - mlirSymbolRefAttrGetRootReference(symbolRef, callbackSetFixedLengthString, - buffer + 3); - if (buffer[0] != 'f' || buffer[1] != 'g' || buffer[2] != 'h' || - buffer[3] != 'i' || buffer[4] != 'j') + + MlirStringRef symbolRefLeaf = mlirSymbolRefAttrGetLeafReference(symbolRef); + MlirStringRef symbolRefRoot = mlirSymbolRefAttrGetRootReference(symbolRef); + if (symbolRefLeaf.length != 3 || + strncmp(data + 5, symbolRefLeaf.data, symbolRefLeaf.length) || + symbolRefRoot.length != 2 || + strncmp(data + 8, symbolRefRoot.data, symbolRefRoot.length)) return 11; mlirAttributeDump(symbolRef); @@ -587,6 +594,121 @@ int printStandardAttributes(MlirContext ctx) { return 0; } +int printAffineMap(MlirContext ctx) { + MlirAffineMap emptyAffineMap = mlirAffineMapEmptyGet(ctx); + MlirAffineMap affineMap = mlirAffineMapGet(ctx, 3, 
2); + MlirAffineMap constAffineMap = mlirAffineMapConstantGet(ctx, 2); + MlirAffineMap multiDimIdentityAffineMap = + mlirAffineMapMultiDimIdentityGet(ctx, 3); + MlirAffineMap minorIdentityAffineMap = + mlirAffineMapMinorIdentityGet(ctx, 3, 2); + unsigned permutation[] = {1, 2, 0}; + MlirAffineMap permutationAffineMap = mlirAffineMapPermutationGet( + ctx, sizeof(permutation) / sizeof(unsigned), permutation); + + mlirAffineMapDump(emptyAffineMap); + mlirAffineMapDump(affineMap); + mlirAffineMapDump(constAffineMap); + mlirAffineMapDump(multiDimIdentityAffineMap); + mlirAffineMapDump(minorIdentityAffineMap); + mlirAffineMapDump(permutationAffineMap); + + if (!mlirAffineMapIsIdentity(emptyAffineMap) || + mlirAffineMapIsIdentity(affineMap) || + mlirAffineMapIsIdentity(constAffineMap) || + !mlirAffineMapIsIdentity(multiDimIdentityAffineMap) || + mlirAffineMapIsIdentity(minorIdentityAffineMap) || + mlirAffineMapIsIdentity(permutationAffineMap)) + return 1; + + if (!mlirAffineMapIsMinorIdentity(emptyAffineMap) || + mlirAffineMapIsMinorIdentity(affineMap) || + !mlirAffineMapIsMinorIdentity(multiDimIdentityAffineMap) || + !mlirAffineMapIsMinorIdentity(minorIdentityAffineMap) || + mlirAffineMapIsMinorIdentity(permutationAffineMap)) + return 2; + + if (!mlirAffineMapIsEmpty(emptyAffineMap) || + mlirAffineMapIsEmpty(affineMap) || + mlirAffineMapIsEmpty(constAffineMap) || + mlirAffineMapIsEmpty(multiDimIdentityAffineMap) || + mlirAffineMapIsEmpty(minorIdentityAffineMap) || + mlirAffineMapIsEmpty(permutationAffineMap)) + return 3; + + if (mlirAffineMapIsSingleConstant(emptyAffineMap) || + mlirAffineMapIsSingleConstant(affineMap) || + !mlirAffineMapIsSingleConstant(constAffineMap) || + mlirAffineMapIsSingleConstant(multiDimIdentityAffineMap) || + mlirAffineMapIsSingleConstant(minorIdentityAffineMap) || + mlirAffineMapIsSingleConstant(permutationAffineMap)) + return 4; + + if (mlirAffineMapGetSingleConstantResult(constAffineMap) != 2) + return 5; + + if (mlirAffineMapGetNumDims(emptyAffineMap) != 0 || + mlirAffineMapGetNumDims(affineMap) != 3 || + mlirAffineMapGetNumDims(constAffineMap) != 0 || + mlirAffineMapGetNumDims(multiDimIdentityAffineMap) != 3 || + mlirAffineMapGetNumDims(minorIdentityAffineMap) != 3 || + mlirAffineMapGetNumDims(permutationAffineMap) != 3) + return 6; + + if (mlirAffineMapGetNumSymbols(emptyAffineMap) != 0 || + mlirAffineMapGetNumSymbols(affineMap) != 2 || + mlirAffineMapGetNumSymbols(constAffineMap) != 0 || + mlirAffineMapGetNumSymbols(multiDimIdentityAffineMap) != 0 || + mlirAffineMapGetNumSymbols(minorIdentityAffineMap) != 0 || + mlirAffineMapGetNumSymbols(permutationAffineMap) != 0) + return 7; + + if (mlirAffineMapGetNumResults(emptyAffineMap) != 0 || + mlirAffineMapGetNumResults(affineMap) != 0 || + mlirAffineMapGetNumResults(constAffineMap) != 1 || + mlirAffineMapGetNumResults(multiDimIdentityAffineMap) != 3 || + mlirAffineMapGetNumResults(minorIdentityAffineMap) != 2 || + mlirAffineMapGetNumResults(permutationAffineMap) != 3) + return 8; + + if (mlirAffineMapGetNumInputs(emptyAffineMap) != 0 || + mlirAffineMapGetNumInputs(affineMap) != 5 || + mlirAffineMapGetNumInputs(constAffineMap) != 0 || + mlirAffineMapGetNumInputs(multiDimIdentityAffineMap) != 3 || + mlirAffineMapGetNumInputs(minorIdentityAffineMap) != 3 || + mlirAffineMapGetNumInputs(permutationAffineMap) != 3) + return 9; + + if (!mlirAffineMapIsProjectedPermutation(emptyAffineMap) || + !mlirAffineMapIsPermutation(emptyAffineMap) || + mlirAffineMapIsProjectedPermutation(affineMap) || + 
mlirAffineMapIsPermutation(affineMap) || + mlirAffineMapIsProjectedPermutation(constAffineMap) || + mlirAffineMapIsPermutation(constAffineMap) || + !mlirAffineMapIsProjectedPermutation(multiDimIdentityAffineMap) || + !mlirAffineMapIsPermutation(multiDimIdentityAffineMap) || + !mlirAffineMapIsProjectedPermutation(minorIdentityAffineMap) || + mlirAffineMapIsPermutation(minorIdentityAffineMap) || + !mlirAffineMapIsProjectedPermutation(permutationAffineMap) || + !mlirAffineMapIsPermutation(permutationAffineMap)) + return 10; + + intptr_t sub[] = {1}; + + MlirAffineMap subMap = mlirAffineMapGetSubMap( + multiDimIdentityAffineMap, sizeof(sub) / sizeof(intptr_t), sub); + MlirAffineMap majorSubMap = + mlirAffineMapGetMajorSubMap(multiDimIdentityAffineMap, 1); + MlirAffineMap minorSubMap = + mlirAffineMapGetMinorSubMap(multiDimIdentityAffineMap, 1); + + mlirAffineMapDump(subMap); + mlirAffineMapDump(majorSubMap); + mlirAffineMapDump(minorSubMap); + + return 0; +} + int main() { MlirContext ctx = mlirContextCreate(); mlirRegisterAllDialects(ctx); @@ -698,6 +820,23 @@ int main() { errcode = printStandardAttributes(ctx); fprintf(stderr, "%d\n", errcode); + // clang-format off + // CHECK-LABEL: @affineMap + // CHECK: () -> () + // CHECK: (d0, d1, d2)[s0, s1] -> () + // CHECK: () -> (2) + // CHECK: (d0, d1, d2) -> (d0, d1, d2) + // CHECK: (d0, d1, d2) -> (d1, d2) + // CHECK: (d0, d1, d2) -> (d1, d2, d0) + // CHECK: (d0, d1, d2) -> (d1) + // CHECK: (d0, d1, d2) -> (d0) + // CHECK: (d0, d1, d2) -> (d2) + // CHECK: 0 + // clang-format on + fprintf(stderr, "@affineMap\n"); + errcode = printAffineMap(ctx); + fprintf(stderr, "%d\n", errcode); + mlirContextDestroy(ctx); return 0; diff --git a/mlir/test/Conversion/LinalgToVector/linalg-to-vector.mlir b/mlir/test/Conversion/LinalgToVector/linalg-to-vector.mlir new file mode 100644 index 0000000000000..c2e8a31eb443c --- /dev/null +++ b/mlir/test/Conversion/LinalgToVector/linalg-to-vector.mlir @@ -0,0 +1,52 @@ +// RUN: mlir-opt %s -test-conv-vectorization --cse | FileCheck %s + +// CHECK-DAG: #[[$map0:.*]] = affine_map<(d0)[s0] -> (1, -d0 + s0)> +// CHECK-DAG: #[[$map1:.*]] = affine_map<(d0)[s0] -> (d0 + s0)> +// CHECK-DAG: #[[$map2:.*]] = affine_map<(d0, d1) -> (d0 + d1)> +// CHECK-DAG: #[[$map3:.*]] = affine_map<(d0, d1)[s0] -> (3, -d0 - d1 + s0)> +// CHECK-DAG: #[[$map4:.*]] = affine_map<(d0)[s0] -> (3, -d0 + s0)> +// CHECK-DAG: #[[$map5:.*]] = affine_map<(d0) -> (d0)> + +func @conv_1d(%arg0: memref, %arg1: memref, %arg2: memref) { + linalg.conv_1d %arg0, %arg1, %arg2 : (memref, memref, memref) + return +} + +// CHECK-LABEL: @conv_1d +// CHECK-SAME: %[[arg0:[a-zA-Z0-9]+]]: memref +// CHECK-SAME: %[[arg1:[a-zA-Z0-9]+]]: memref +// CHECK-SAME: %[[arg2:[a-zA-Z0-9]+]]: memref +// CHECK: %[[v1:.*]] = dim %[[arg2]], %[[c0]] : memref +// CHECK: %[[v2:.*]] = dim %[[arg0]], %[[c0]] : memref +// CHECK: %[[v3:.*]] = alloc(%[[c12]]) : memref +// CHECK: %[[v4:.*]] = alloc(%[[c12]]) : memref +// CHECK: %[[v5:.*]] = alloc(%[[c4]]) : memref +// CHECK: %[[v6:.*]] = std.view %[[v3]][%[[c0]]][] : memref to memref<3xf32> +// CHECK: %[[v7:.*]] = std.view %[[v4]][%[[c0]]][] : memref to memref<3xf32> +// CHECK: %[[v8:.*]] = std.view %[[v5]][%[[c0]]][] : memref to memref<1xf32> +// CHECK: scf.for %[[arg3:.*]] = %[[c0]] to %[[v1]] step %[[c1]] { +// CHECK: %[[v9:.*]] = affine.min #[[$map0]](%[[arg3]])[%[[v1]]] +// CHECK: %[[v10:.*]] = subview %[[arg2]][%[[arg3]]] [%[[v9]]] [1] : memref to memref +// CHECK: %[[v11:.*]] = subview %[[v8]][0] [%[[v9]]] [1] : memref<1xf32> to memref +// 
CHECK: scf.for %[[arg4:.*]] = %[[c0]] to %[[v0]] step %[[c3]] { +// CHECK: %[[v12:.*]] = affine.apply #[[$map2]](%[[arg3]], %[[arg4]]) +// CHECK: %[[v13:.*]] = affine.min #[[$map3]](%[[arg3]], %[[arg4]])[%[[v2]]] +// CHECK: %[[v14:.*]] = subview %arg0[%12] [%13] [1] : memref to memref +// CHECK: %[[v15:.*]] = affine.min #[[$map4]](%arg4)[%0] +// CHECK: %[[v16:.*]] = subview %[[arg1]][%[[arg4]]] [%[[v15]]] [1] : memref to memref +// CHECK: %[[v17:.*]] = subview %[[v6]][0] [%[[v13]]] [1] : memref<3xf32> to memref +// CHECK: %[[v19:.*]] = vector.transfer_read %[[v6]][%[[c0]]], %[[cst]] {masked = [false]} : memref<3xf32>, vector<3xf32> +// CHECK: %[[v20:.*]] = vector.transfer_read %[[v7]][%[[c0]]], %[[cst]] {masked = [false]} : memref<3xf32>, vector<3xf32> +// CHECK: %[[v21:.*]] = mulf %[[v19]], %[[v20]] : vector<3xf32> +// CHECK: %[[v22:.*]] = vector.reduction "add", %[[v21]], %[[cst]] : vector<3xf32> into f32 +// CHECK: store %[[v22]], %[[v8]][%[[c0]]] : memref<1xf32> +// CHECK: scf.for %[[arg5:.*]] = %[[c0]] to %[[v9]] step %[[c1]] { +// CHECK: %[[v23:.*]] = load %[[v11]][%[[arg5]]] : memref +// CHECK: store %[[v23]], %[[v10]][%[[arg5]]] : memref diff --git a/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir b/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir index bf8e74e5143ed..01ba6abcc6c4e 100644 --- a/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir +++ b/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir @@ -94,7 +94,7 @@ func @const_shape() -> tensor { // CHECK: %[[C1:.*]] = constant 1 : index // CHECK: %[[C2:.*]] = constant 2 : index // CHECK: %[[C3:.*]] = constant 3 : index - // CHECK: %[[TENSOR3:.*]] = tensor_from_elements(%[[C1]], %[[C2]], %[[C3]]) + // CHECK: %[[TENSOR3:.*]] = tensor_from_elements %[[C1]], %[[C2]], %[[C3]] // CHECK: %[[RESULT:.*]] = tensor_cast %[[TENSOR3]] : tensor<3xindex> to tensor // CHECK: return %[[RESULT]] : tensor %shape = shape.const_shape [1, 2, 3] : tensor @@ -103,6 +103,19 @@ func @const_shape() -> tensor { // ----- +// Lower `const_shape` in the case of rank 0. +// CHECK-LABEL: func @const_shape_zero_elements +// CHECK-SAME: () -> tensor +func @const_shape_zero_elements() -> tensor { + // CHECK: %[[TENSOR:.*]] = tensor_from_elements : tensor<0xindex> + // CHECK: %[[RESULT:.*]] = tensor_cast %[[TENSOR]] : tensor<0xindex> to tensor + // CHECK: return %[[RESULT]] : tensor + %shape = shape.const_shape [] : tensor + return %shape : tensor +} + +// ----- + // Lower `any` to its first operand. 
// CHECK-LABEL: @any_of_three // CHECK-SAME: (%[[A:.*]]: tensor, %[[B:.*]]: tensor, %[[C:.*]]: tensor) -> tensor @@ -191,14 +204,11 @@ func @shape_of(%arg : tensor<*xf32>) { // CHECK-SAME: (%[[ARG:.*]]: tensor<*xf32>) func @shape_of_unranked(%arg : tensor<*xf32>) { // CHECK: %[[RANK:.*]] = rank %[[ARG]] : tensor<*xf32> - // CHECK: %[[SHAPE_MEM:.*]] = alloca(%[[RANK]]) : memref - // CHECK: %[[C0:.*]] = constant 0 : index - // CHECK: %[[C1:.*]] = constant 1 : index - // CHECK: scf.for %[[I:.*]] = %[[C0]] to %[[RANK]] step %[[C1]] { - // CHECK: %[[DIM:.]] = dim %[[ARG]], %[[I]] : tensor<*xf32> - // CHECK: store %[[DIM]], %[[SHAPE_MEM]][%[[I]]] : memref - // CHECK: } - // CHECK: %[[SHAPE:.*]] = tensor_load %[[SHAPE_MEM]] : memref + // CHECK: %[[SHAPE:.*]] = dynamic_tensor_from_elements %[[RANK]] { + // CHECK: ^bb0(%[[I:.*]]: index): + // CHECK: %[[EXTENT:.*]] = dim %[[ARG]], %[[I]] : tensor<*xf32> + // CHECK: yield %[[EXTENT]] : index + // CHECK: } : tensor %shape = shape.shape_of %arg : tensor<*xf32> -> tensor return } @@ -223,13 +233,24 @@ func @shape_of_stat(%arg : tensor<1x2x3xf32>) { // CHECK-DAG: %[[C1:.*]] = constant 1 : index // CHECK-DAG: %[[C2:.*]] = constant 2 : index // CHECK-DAG: %[[C3:.*]] = constant 3 : index - // CHECK-DAG: %[[SHAPE_UNCASTED:.*]] = tensor_from_elements(%[[C1]], %[[C2]], %[[C3]]) : tensor<3xindex> + // CHECK-DAG: %[[SHAPE_UNCASTED:.*]] = tensor_from_elements %[[C1]], %[[C2]], %[[C3]] : tensor<3xindex> %shape = shape.shape_of %arg : tensor<1x2x3xf32> -> tensor return } // ----- +// Lower `shape_of` for 0-D tensor. +// CHECK-LABEL: @shape_of_zero_d +// CHECK-SAME: (%[[ARG:.*]]: tensor) +func @shape_of_zero_d(%arg : tensor) { + // CHECK-DAG: %[[SHAPE_UNCASTED:.*]] = tensor_from_elements : tensor<0xindex> + %shape = shape.shape_of %arg : tensor -> tensor + return +} + +// ----- + // Lower `shape_of` for dynamically shaped tensor. // CHECK-LABEL: @shape_of_dyn // CHECK-SAME: (%[[ARG:.*]]: tensor<1x5x?xf32>) @@ -238,7 +259,7 @@ func @shape_of_dyn(%arg : tensor<1x5x?xf32>) { // CHECK-DAG: %[[C5:.*]] = constant 5 : index // CHECK-DAG: %[[C2:.*]] = constant 2 : index // CHECK-DAG: %[[DYN_DIM:.*]] = dim %[[ARG]], %[[C2]] : tensor<1x5x?xf32> - // CHECK-DAG: %[[SHAPE_UNCASTED:.*]] = tensor_from_elements(%[[C1]], %[[C5]], %[[DYN_DIM]]) : tensor<3xindex> + // CHECK-DAG: %[[SHAPE_UNCASTED:.*]] = tensor_from_elements %[[C1]], %[[C5]], %[[DYN_DIM]] : tensor<3xindex> %shape = shape.shape_of %arg : tensor<1x5x?xf32> -> tensor return } diff --git a/mlir/test/Conversion/StandardToLLVM/convert-to-llvmir.mlir b/mlir/test/Conversion/StandardToLLVM/convert-to-llvmir.mlir index 62be4783e364b..bb0363b1cba52 100644 --- a/mlir/test/Conversion/StandardToLLVM/convert-to-llvmir.mlir +++ b/mlir/test/Conversion/StandardToLLVM/convert-to-llvmir.mlir @@ -594,6 +594,24 @@ func @sitofp(%arg0 : i32, %arg1 : i64) { return } +// Checking conversion of integer vectors to floating point vector types. 
+// CHECK-LABEL: @sitofp_vector +func @sitofp_vector(%arg0 : vector<2xi16>, %arg1 : vector<2xi32>, %arg2 : vector<2xi64>) { +// CHECK-NEXT: = llvm.sitofp {{.*}} : !llvm.vec<2 x i16> to !llvm.vec<2 x float> + %0 = sitofp %arg0: vector<2xi16> to vector<2xf32> +// CHECK-NEXT: = llvm.sitofp {{.*}} : !llvm.vec<2 x i16> to !llvm.vec<2 x double> + %1 = sitofp %arg0: vector<2xi16> to vector<2xf64> +// CHECK-NEXT: = llvm.sitofp {{.*}} : !llvm.vec<2 x i32> to !llvm.vec<2 x float> + %2 = sitofp %arg1: vector<2xi32> to vector<2xf32> +// CHECK-NEXT: = llvm.sitofp {{.*}} : !llvm.vec<2 x i32> to !llvm.vec<2 x double> + %3 = sitofp %arg1: vector<2xi32> to vector<2xf64> +// CHECK-NEXT: = llvm.sitofp {{.*}} : !llvm.vec<2 x i64> to !llvm.vec<2 x float> + %4 = sitofp %arg2: vector<2xi64> to vector<2xf32> +// CHECK-NEXT: = llvm.sitofp {{.*}} : !llvm.vec<2 x i64> to !llvm.vec<2 x double> + %5 = sitofp %arg2: vector<2xi64> to vector<2xf64> + return +} + // Checking conversion of unsigned integer types to floating point. // CHECK-LABEL: @uitofp func @uitofp(%arg0 : i32, %arg1 : i64) { @@ -646,6 +664,24 @@ func @fptosi(%arg0 : f32, %arg1 : f64) { return } +// Checking conversion of floating point vectors to integer vector types. +// CHECK-LABEL: @fptosi_vector +func @fptosi_vector(%arg0 : vector<2xf16>, %arg1 : vector<2xf32>, %arg2 : vector<2xf64>) { +// CHECK-NEXT: = llvm.fptosi {{.*}} : !llvm.vec<2 x half> to !llvm.vec<2 x i32> + %0 = fptosi %arg0: vector<2xf16> to vector<2xi32> +// CHECK-NEXT: = llvm.fptosi {{.*}} : !llvm.vec<2 x half> to !llvm.vec<2 x i64> + %1 = fptosi %arg0: vector<2xf16> to vector<2xi64> +// CHECK-NEXT: = llvm.fptosi {{.*}} : !llvm.vec<2 x float> to !llvm.vec<2 x i32> + %2 = fptosi %arg1: vector<2xf32> to vector<2xi32> +// CHECK-NEXT: = llvm.fptosi {{.*}} : !llvm.vec<2 x float> to !llvm.vec<2 x i64> + %3 = fptosi %arg1: vector<2xf32> to vector<2xi64> +// CHECK-NEXT: = llvm.fptosi {{.*}} : !llvm.vec<2 x double> to !llvm.vec<2 x i32> + %4 = fptosi %arg2: vector<2xf64> to vector<2xi32> +// CHECK-NEXT: = llvm.fptosi {{.*}} : !llvm.vec<2 x double> to !llvm.vec<2 x i64> + %5 = fptosi %arg2: vector<2xf64> to vector<2xi64> + return +} + // Checking conversion of floating point to integer types. // CHECK-LABEL: @fptoui func @fptoui(%arg0 : f32, %arg1 : f64) { @@ -660,6 +696,41 @@ func @fptoui(%arg0 : f32, %arg1 : f64) { return } +// Checking conversion of floating point vectors to integer vector types. +// CHECK-LABEL: @fptoui_vector +func @fptoui_vector(%arg0 : vector<2xf16>, %arg1 : vector<2xf32>, %arg2 : vector<2xf64>) { +// CHECK-NEXT: = llvm.fptoui {{.*}} : !llvm.vec<2 x half> to !llvm.vec<2 x i32> + %0 = fptoui %arg0: vector<2xf16> to vector<2xi32> +// CHECK-NEXT: = llvm.fptoui {{.*}} : !llvm.vec<2 x half> to !llvm.vec<2 x i64> + %1 = fptoui %arg0: vector<2xf16> to vector<2xi64> +// CHECK-NEXT: = llvm.fptoui {{.*}} : !llvm.vec<2 x float> to !llvm.vec<2 x i32> + %2 = fptoui %arg1: vector<2xf32> to vector<2xi32> +// CHECK-NEXT: = llvm.fptoui {{.*}} : !llvm.vec<2 x float> to !llvm.vec<2 x i64> + %3 = fptoui %arg1: vector<2xf32> to vector<2xi64> +// CHECK-NEXT: = llvm.fptoui {{.*}} : !llvm.vec<2 x double> to !llvm.vec<2 x i32> + %4 = fptoui %arg2: vector<2xf64> to vector<2xi32> +// CHECK-NEXT: = llvm.fptoui {{.*}} : !llvm.vec<2 x double> to !llvm.vec<2 x i64> + %5 = fptoui %arg2: vector<2xf64> to vector<2xi64> + return +} + +// Checking conversion of integer vectors to floating point vector types. 
+// CHECK-LABEL: @uitofp_vector +func @uitofp_vector(%arg0 : vector<2xi16>, %arg1 : vector<2xi32>, %arg2 : vector<2xi64>) { +// CHECK-NEXT: = llvm.uitofp {{.*}} : !llvm.vec<2 x i16> to !llvm.vec<2 x float> + %0 = uitofp %arg0: vector<2xi16> to vector<2xf32> +// CHECK-NEXT: = llvm.uitofp {{.*}} : !llvm.vec<2 x i16> to !llvm.vec<2 x double> + %1 = uitofp %arg0: vector<2xi16> to vector<2xf64> +// CHECK-NEXT: = llvm.uitofp {{.*}} : !llvm.vec<2 x i32> to !llvm.vec<2 x float> + %2 = uitofp %arg1: vector<2xi32> to vector<2xf32> +// CHECK-NEXT: = llvm.uitofp {{.*}} : !llvm.vec<2 x i32> to !llvm.vec<2 x double> + %3 = uitofp %arg1: vector<2xi32> to vector<2xf64> +// CHECK-NEXT: = llvm.uitofp {{.*}} : !llvm.vec<2 x i64> to !llvm.vec<2 x float> + %4 = uitofp %arg2: vector<2xi64> to vector<2xf32> +// CHECK-NEXT: = llvm.uitofp {{.*}} : !llvm.vec<2 x i64> to !llvm.vec<2 x double> + %5 = uitofp %arg2: vector<2xi64> to vector<2xf64> + return +} // Checking conversion of integer types to floating point. // CHECK-LABEL: @fptrunc diff --git a/mlir/test/Conversion/StandardToLLVM/invalid.mlir b/mlir/test/Conversion/StandardToLLVM/invalid.mlir index 469bb9753ec49..40acf4bc9d49b 100644 --- a/mlir/test/Conversion/StandardToLLVM/invalid.mlir +++ b/mlir/test/Conversion/StandardToLLVM/invalid.mlir @@ -29,3 +29,12 @@ func @mlir_cast_to_llvm_vec(%0 : vector<1x1xf32>) -> !llvm.vec<1 x float> { %1 = llvm.mlir.cast %0 : vector<1x1xf32> to !llvm.vec<1 x float> return %1 : !llvm.vec<1 x float> } + +// ----- + +// Should not crash on unsupported types in function signatures. +func @unsupported_signature() -> tensor<10 x i32> + +// ----- + +func @partially_supported_signature() -> (vector<10 x i32>, tensor<10 x i32>) diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir index e0800c2fd2272..42336b8e9b70e 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir @@ -755,34 +755,36 @@ func @transfer_read_1d(%A : memref, %base: index) -> vector<17xf32> { // 2. Create a vector with linear indices [ 0 .. vector_length - 1 ]. // CHECK: %[[linearIndex:.*]] = llvm.mlir.constant(dense // CHECK-SAME: <[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]> : -// CHECK-SAME: vector<17xi64>) : !llvm.vec<17 x i64> +// CHECK-SAME: vector<17xi32>) : !llvm.vec<17 x i32> // // 3. Create offsetVector = [ offset + 0 .. offset + vector_length - 1 ]. 
-// CHECK: %[[offsetVec:.*]] = llvm.mlir.undef : !llvm.vec<17 x i64> +// CHECK: %[[otrunc:.*]] = llvm.trunc %[[BASE]] : !llvm.i64 to !llvm.i32 +// CHECK: %[[offsetVec:.*]] = llvm.mlir.undef : !llvm.vec<17 x i32> // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32 -// CHECK: %[[offsetVec2:.*]] = llvm.insertelement %[[BASE]], %[[offsetVec]][%[[c0]] : -// CHECK-SAME: !llvm.i32] : !llvm.vec<17 x i64> +// CHECK: %[[offsetVec2:.*]] = llvm.insertelement %[[otrunc]], %[[offsetVec]][%[[c0]] : +// CHECK-SAME: !llvm.i32] : !llvm.vec<17 x i32> // CHECK: %[[offsetVec3:.*]] = llvm.shufflevector %[[offsetVec2]], %{{.*}} [ // CHECK-SAME: 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, // CHECK-SAME: 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, // CHECK-SAME: 0 : i32, 0 : i32, 0 : i32] : -// CHECK-SAME: !llvm.vec<17 x i64>, !llvm.vec<17 x i64> +// CHECK-SAME: !llvm.vec<17 x i32>, !llvm.vec<17 x i32> // CHECK: %[[offsetVec4:.*]] = llvm.add %[[offsetVec3]], %[[linearIndex]] : -// CHECK-SAME: !llvm.vec<17 x i64> +// CHECK-SAME: !llvm.vec<17 x i32> // // 4. Let dim the memref dimension, compute the vector comparison mask: // [ offset + 0 .. offset + vector_length - 1 ] < [ dim .. dim ] -// CHECK: %[[dimVec:.*]] = llvm.mlir.undef : !llvm.vec<17 x i64> +// CHECK: %[[dtrunc:.*]] = llvm.trunc %[[DIM]] : !llvm.i64 to !llvm.i32 +// CHECK: %[[dimVec:.*]] = llvm.mlir.undef : !llvm.vec<17 x i32> // CHECK: %[[c01:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32 -// CHECK: %[[dimVec2:.*]] = llvm.insertelement %[[DIM]], %[[dimVec]][%[[c01]] : -// CHECK-SAME: !llvm.i32] : !llvm.vec<17 x i64> +// CHECK: %[[dimVec2:.*]] = llvm.insertelement %[[dtrunc]], %[[dimVec]][%[[c01]] : +// CHECK-SAME: !llvm.i32] : !llvm.vec<17 x i32> // CHECK: %[[dimVec3:.*]] = llvm.shufflevector %[[dimVec2]], %{{.*}} [ // CHECK-SAME: 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, // CHECK-SAME: 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, // CHECK-SAME: 0 : i32, 0 : i32, 0 : i32] : -// CHECK-SAME: !llvm.vec<17 x i64>, !llvm.vec<17 x i64> +// CHECK-SAME: !llvm.vec<17 x i32>, !llvm.vec<17 x i32> // CHECK: %[[mask:.*]] = llvm.icmp "slt" %[[offsetVec4]], %[[dimVec3]] : -// CHECK-SAME: !llvm.vec<17 x i64> +// CHECK-SAME: !llvm.vec<17 x i32> // // 5. Rewrite as a masked read. // CHECK: %[[PASS_THROUGH:.*]] = llvm.mlir.constant(dense<7.000000e+00> : @@ -801,13 +803,13 @@ func @transfer_read_1d(%A : memref, %base: index) -> vector<17xf32> { // 2. Create a vector with linear indices [ 0 .. vector_length - 1 ]. // CHECK: %[[linearIndex_b:.*]] = llvm.mlir.constant(dense // CHECK-SAME: <[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]> : -// CHECK-SAME: vector<17xi64>) : !llvm.vec<17 x i64> +// CHECK-SAME: vector<17xi32>) : !llvm.vec<17 x i32> // // 3. Create offsetVector = [ offset + 0 .. offset + vector_length - 1 ]. // CHECK: llvm.shufflevector {{.*}} [0 : i32, 0 : i32, 0 : i32, 0 : i32, // CHECK-SAME: 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, // CHECK-SAME: 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32] : -// CHECK-SAME: !llvm.vec<17 x i64>, !llvm.vec<17 x i64> +// CHECK-SAME: !llvm.vec<17 x i32>, !llvm.vec<17 x i32> // CHECK: llvm.add // // 4. 
Let dim the memref dimension, compute the vector comparison mask: @@ -815,8 +817,8 @@ func @transfer_read_1d(%A : memref, %base: index) -> vector<17xf32> { // CHECK: llvm.shufflevector {{.*}} [0 : i32, 0 : i32, 0 : i32, 0 : i32, // CHECK-SAME: 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, // CHECK-SAME: 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32] : -// CHECK-SAME: !llvm.vec<17 x i64>, !llvm.vec<17 x i64> -// CHECK: %[[mask_b:.*]] = llvm.icmp "slt" {{.*}} : !llvm.vec<17 x i64> +// CHECK-SAME: !llvm.vec<17 x i32>, !llvm.vec<17 x i32> +// CHECK: %[[mask_b:.*]] = llvm.icmp "slt" {{.*}} : !llvm.vec<17 x i32> // // 5. Rewrite as a masked write. // CHECK: llvm.intr.masked.store %[[loaded]], %[[vecPtr_b]], %[[mask_b]] @@ -836,28 +838,29 @@ func @transfer_read_2d_to_1d(%A : memref, %base0: index, %base1: index) // CHECK-SAME: !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // // Create offsetVector = [ offset + 0 .. offset + vector_length - 1 ]. -// CHECK: %[[offsetVec:.*]] = llvm.mlir.undef : !llvm.vec<17 x i64> +// CHECK: %[[trunc:.*]] = llvm.trunc %[[BASE_1]] : !llvm.i64 to !llvm.i32 +// CHECK: %[[offsetVec:.*]] = llvm.mlir.undef : !llvm.vec<17 x i32> // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32 -// Here we check we properly use %BASE_1 -// CHECK: %[[offsetVec2:.*]] = llvm.insertelement %[[BASE_1]], %[[offsetVec]][%[[c0]] : -// CHECK-SAME: !llvm.i32] : !llvm.vec<17 x i64> +// CHECK: %[[offsetVec2:.*]] = llvm.insertelement %[[trunc]], %[[offsetVec]][%[[c0]] : +// CHECK-SAME: !llvm.i32] : !llvm.vec<17 x i32> // CHECK: %[[offsetVec3:.*]] = llvm.shufflevector %[[offsetVec2]], %{{.*}} [ // CHECK-SAME: 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, // CHECK-SAME: 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, // CHECK-SAME: 0 : i32, 0 : i32, 0 : i32] : +// CHECK-SAME: !llvm.vec<17 x i32>, !llvm.vec<17 x i32> // // Let dim the memref dimension, compute the vector comparison mask: // [ offset + 0 .. offset + vector_length - 1 ] < [ dim .. 
dim ] -// Here we check we properly use %DIM[1] -// CHECK: %[[dimVec:.*]] = llvm.mlir.undef : !llvm.vec<17 x i64> +// CHECK: %[[dimtrunc:.*]] = llvm.trunc %[[DIM]] : !llvm.i64 to !llvm.i32 +// CHECK: %[[dimVec:.*]] = llvm.mlir.undef : !llvm.vec<17 x i32> // CHECK: %[[c01:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32 -// CHECK: %[[dimVec2:.*]] = llvm.insertelement %[[DIM]], %[[dimVec]][%[[c01]] : -// CHECK-SAME: !llvm.i32] : !llvm.vec<17 x i64> +// CHECK: %[[dimVec2:.*]] = llvm.insertelement %[[dimtrunc]], %[[dimVec]][%[[c01]] : +// CHECK-SAME: !llvm.i32] : !llvm.vec<17 x i32> // CHECK: %[[dimVec3:.*]] = llvm.shufflevector %[[dimVec2]], %{{.*}} [ // CHECK-SAME: 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, // CHECK-SAME: 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, 0 : i32, // CHECK-SAME: 0 : i32, 0 : i32, 0 : i32] : -// CHECK-SAME: !llvm.vec<17 x i64>, !llvm.vec<17 x i64> +// CHECK-SAME: !llvm.vec<17 x i32>, !llvm.vec<17 x i32> func @transfer_read_1d_non_zero_addrspace(%A : memref, %base: index) -> vector<17xf32> { %f7 = constant 7.0: f32 diff --git a/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir b/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir index 240925baf3d8c..ef1b2e995053c 100644 --- a/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir +++ b/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir @@ -15,11 +15,13 @@ func @materialize_read_1d() { %ip3 = affine.apply affine_map<(d0) -> (d0 + 3)> (%i1) %f4 = vector.transfer_read %A[%i0, %ip3], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> // Both accesses in the load must be clipped otherwise %i1 + 2 and %i1 + 3 will go out of bounds. - // CHECK: {{.*}} = select - // CHECK: %[[FILTERED1:.*]] = select - // CHECK: {{.*}} = select - // CHECK: %[[FILTERED2:.*]] = select - // CHECK: %{{.*}} = load {{.*}}[%[[FILTERED1]], %[[FILTERED2]]] : memref<7x42xf32> + // CHECK: scf.if + // CHECK-NEXT: load + // CHECK-NEXT: vector.insertelement + // CHECK-NEXT: store + // CHECK-NEXT: else + // CHECK-NEXT: vector.insertelement + // CHECK-NEXT: store } } return @@ -53,7 +55,6 @@ func @materialize_read_1d_partially_specialized(%dyn1 : index, %dyn2 : index, %d // ----- // CHECK: #[[$ADD:map[0-9]+]] = affine_map<(d0, d1) -> (d0 + d1)> -// CHECK: #[[$SUB:map[0-9]+]] = affine_map<()[s0] -> (s0 - 1)> // CHECK-LABEL: func @materialize_read(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) { func @materialize_read(%M: index, %N: index, %O: index, %P: index) { @@ -72,37 +73,18 @@ func @materialize_read(%M: index, %N: index, %O: index, %P: index) { // CHECK-NEXT: scf.for %[[I4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] { // CHECK-NEXT: scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] { // CHECK-NEXT: scf.for %[[I6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] { - // CHECK-NEXT: {{.*}} = affine.apply #[[$ADD]](%[[I0]], %[[I4]]) - // CHECK-NEXT: {{.*}} = affine.apply #[[$SUB]]()[%{{.*}}] - // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}} : index - // CHECK-NEXT: {{.*}} = select - // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, %[[C0]] : index - // CHECK-NEXT: %[[L0:.*]] = select - // - // CHECK-NEXT: {{.*}} = affine.apply #[[$SUB]]()[%{{.*}}] - // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}} : index - // CHECK-NEXT: {{.*}} = select - // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, %[[C0]] : index - // CHECK-NEXT: %[[L1:.*]] = select - // - // CHECK-NEXT: {{.*}} = affine.apply #[[$SUB]]()[%{{.*}}] - // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}} : index - // CHECK-NEXT: {{.*}} = select - // CHECK-NEXT: 
{{.*}} = cmpi "slt", {{.*}}, %[[C0]] : index - // CHECK-NEXT: %[[L2:.*]] = select - // - // CHECK-NEXT: {{.*}} = affine.apply #[[$ADD]](%[[I3]], %[[I6]]) - // CHECK-NEXT: {{.*}} = affine.apply #[[$SUB]]()[%{{.*}}] - // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}} : index - // CHECK-NEXT: {{.*}} = select - // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, %[[C0]] : index - // CHECK-NEXT: %[[L3:.*]] = select - // CHECK-NEXT: %[[VIDX:.*]] = index_cast %[[I4]] - // - // CHECK-DAG: %[[SCAL:.*]] = load %{{.*}}[%[[L0]], %[[L1]], %[[L2]], %[[L3]]] : memref - // CHECK-DAG: %[[VEC:.*]] = load %[[ALLOC]][%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>> - // CHECK-NEXT: %[[RVEC:.*]] = vector.insertelement %[[SCAL]], %[[VEC]][%[[VIDX]] : i32] : vector<3xf32> - // CHECK-NEXT: store %[[RVEC]], %[[ALLOC]][%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>> + // CHECK: %[[VIDX:.*]] = index_cast %[[I4]] + // CHECK: %[[VEC:.*]] = load %[[ALLOC]][%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>> + // CHECK: %[[L0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I4]]) + // CHECK: %[[L3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I6]]) + // CHECK-NEXT: scf.if + // CHECK-NEXT: %[[SCAL:.*]] = load %{{.*}}[%[[L0]], %[[I1]], %[[I2]], %[[L3]]] : memref + // CHECK-NEXT: %[[RVEC:.*]] = vector.insertelement %[[SCAL]], %[[VEC]][%[[VIDX]] : i32] : vector<3xf32> + // CHECK-NEXT: store %[[RVEC]], %[[ALLOC]][%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>> + // CHECK-NEXT: } else { + // CHECK-NEXT: %[[CVEC:.*]] = vector.insertelement + // CHECK-NEXT: store %[[CVEC]], %[[ALLOC]][%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>> + // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: } @@ -132,7 +114,6 @@ func @materialize_read(%M: index, %N: index, %O: index, %P: index) { // ----- // CHECK: #[[$ADD:map[0-9]+]] = affine_map<(d0, d1) -> (d0 + d1)> -// CHECK: #[[$SUB:map[0-9]+]] = affine_map<()[s0] -> (s0 - 1)> // CHECK-LABEL:func @materialize_write(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) { func @materialize_write(%M: index, %N: index, %O: index, %P: index) { @@ -153,37 +134,15 @@ func @materialize_write(%M: index, %N: index, %O: index, %P: index) { // CHECK-NEXT: scf.for %[[I4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] { // CHECK-NEXT: scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] { // CHECK-NEXT: scf.for %[[I6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] { - // CHECK-NEXT: {{.*}} = affine.apply #[[$ADD]](%[[I0]], %[[I4]]) - // CHECK-NEXT: {{.*}} = affine.apply #[[$SUB]]()[%{{.*}}] - // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, {{.*}} : index - // CHECK-NEXT: {{.*}} = select {{.*}}, {{.*}}, {{.*}} : index - // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, %[[C0]] : index - // CHECK-NEXT: %[[S0:.*]] = select {{.*}}, %[[C0]], {{.*}} : index - // - // CHECK-NEXT: {{.*}} = affine.apply #[[$ADD]](%[[I1]], %[[I5]]) - // CHECK-NEXT: {{.*}} = affine.apply #[[$SUB]]()[%{{.*}}] - // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, {{.*}} : index - // CHECK-NEXT: {{.*}} = select {{.*}}, {{.*}}, {{.*}} : index - // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, %[[C0]] : index - // CHECK-NEXT: %[[S1:.*]] = select {{.*}}, %[[C0]], {{.*}} : index - // - // CHECK-NEXT: {{.*}} = affine.apply #[[$SUB]]()[%{{.*}}] - // CHECK-NEXT: {{.*}} = cmpi "slt", %[[I2]], %{{.*}} : index - // CHECK-NEXT: {{.*}} = select {{.*}}, %[[I2]], {{.*}} : index - // CHECK-NEXT: {{.*}} = cmpi "slt", %[[I2]], %[[C0]] : index - // CHECK-NEXT: %[[S2:.*]] = select {{.*}}, %[[C0]], {{.*}} : index - // - // CHECK-NEXT: {{.*}} = affine.apply #[[$ADD]](%[[I3]], %[[I6]]) - // CHECK-NEXT: {{.*}} = 
affine.apply #[[$SUB]]()[%{{.*}}] - // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, {{.*}} : index - // CHECK-NEXT: {{.*}} = select {{.*}}, {{.*}}, {{.*}} : index - // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, %[[C0]] : index - // CHECK-NEXT: %[[S3:.*]] = select {{.*}}, %[[C0]], {{.*}} : index - // CHECK-NEXT: %[[VIDX:.*]] = index_cast %[[I4]] - // - // CHECK-NEXT: %[[VEC:.*]] = load {{.*}}[%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>> - // CHECK-NEXT: %[[SCAL:.*]] = vector.extractelement %[[VEC]][%[[VIDX]] : i32] : vector<3xf32> - // CHECK-NEXT: store %[[SCAL]], {{.*}}[%[[S0]], %[[S1]], %[[S2]], %[[S3]]] : memref + // CHECK: %[[VIDX:.*]] = index_cast %[[I4]] + // CHECK: %[[S0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I4]]) + // CHECK: %[[S1:.*]] = affine.apply #[[$ADD]](%[[I1]], %[[I5]]) + // CHECK: %[[S3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I6]]) + // CHECK-NEXT: scf.if + // CHECK-NEXT: %[[VEC:.*]] = load {{.*}}[%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>> + // CHECK-NEXT: %[[SCAL:.*]] = vector.extractelement %[[VEC]][%[[VIDX]] : i32] : vector<3xf32> + // CHECK: store %[[SCAL]], {{.*}}[%[[S0]], %[[S1]], %[[I2]], %[[S3]]] : memref + // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: } diff --git a/mlir/test/Dialect/Affine/invalid.mlir b/mlir/test/Dialect/Affine/invalid.mlir index 4d7c9c23edb6c..c38a78060dc64 100644 --- a/mlir/test/Dialect/Affine/invalid.mlir +++ b/mlir/test/Dialect/Affine/invalid.mlir @@ -379,3 +379,14 @@ func @affine_if_with_else_region_args(%N: index) { return } +// ----- + +func @affine_for_iter_args_mismatch(%buffer: memref<1024xf32>) -> f32 { + %sum_0 = constant 0.0 : f32 + // expected-error@+1 {{mismatch between the number of loop-carried values and results}} + %res = affine.for %i = 0 to 10 step 2 iter_args(%sum_iter = %sum_0) -> (f32, f32) { + %t = affine.load %buffer[%i] : memref<1024xf32> + affine.yield %t : f32 + } + return %res : f32 +} diff --git a/mlir/test/Dialect/Affine/loop-tiling-parametric.mlir b/mlir/test/Dialect/Affine/loop-tiling-parametric.mlir new file mode 100644 index 0000000000000..5e9bc4a884c2d --- /dev/null +++ b/mlir/test/Dialect/Affine/loop-tiling-parametric.mlir @@ -0,0 +1,275 @@ +// RUN: mlir-opt %s -split-input-file -test-affine-parametric-tile | FileCheck %s +// Test cases for the utility introduced to tile affine for loops using +// SSA values as tiling parameters (tile sizes). The tile sizes are expected +// to be passed as input arguments (before any other argument) to the function +// enclosing the loop nest. Currently hyper-rectangular loop nests with constant +// lower bounds are supported.
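+//
+// For illustration, tiling a single loop such as
+//
+//   affine.for %i = 0 to 256 { "test.foo"(%i) : (index) -> () }
+//
+// by one tile-size argument %t0 is expected to yield an inter-tile loop over
+// tile indices plus an intra-tile loop bounded by %t0, roughly of this shape
+// (the names %ii and %t0 and the inline map spellings are illustrative):
+//
+//   affine.for %ii = 0 to affine_map<()[s0] -> (256 ceildiv s0)>()[%t0] {
+//     affine.for %i = affine_map<(d0)[s0] -> (d0 * s0)>(%ii)[%t0]
+//         to min affine_map<(d0)[s0] -> (d0 * s0 + s0, 256)>(%ii)[%t0] {
+//       "test.foo"(%i) : (index) -> ()
+//     }
+//   }
+//
+// This is only a sketch of the expected shape; the exact maps produced are
+// matched by the CHECK-DAG lines of each test case below.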
+ +// ----- + +// CHECK-DAG: [[LBI:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 256)> +// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 512)> +// CHECK-DAG: [[UBI2:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 1024)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (256 ceildiv s0)> +// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0] -> (512 ceildiv s0)> +// CHECK-DAG: [[UBO2:#map[0-9]+]] = affine_map<()[s0] -> (1024 ceildiv s0)> + +// CHECK: func @loop_tiling_3d([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index) +// CHECK-NEXT: affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]] +// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO1]](){{.*}}[[ARG1]] +// CHECK-NEXT: affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO2]](){{.*}}[[ARG2]] +// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG0]] +// CHECK-NEXT: affine.for %[[J:.*]] = [[LBI]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} to min [[UBI1]]{{.*}}[[ARG4]]{{.*}}[[ARG1]] +// CHECK-NEXT: affine.for %[[K:.*]] = [[LBI]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} to min [[UBI2]]{{.*}}[[ARG5]]{{.*}}[[ARG2]] +// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) +func @loop_tiling_3d(%t0 : index, %t1 : index, %t2 : index) { + affine.for %i = 0 to 256 { + affine.for %j = 0 to 512 { + affine.for %k = 0 to 1024 { + "test.foo"(%i, %j, %k) : (index, index, index) -> () + } + } + } + return +} + +// ----- + +// CHECK-DAG: [[LBI:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0 * 4, 256)> +// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0 * 3, 512)> +// CHECK-DAG: [[UBI2:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0 * 2, 1024)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (256 ceildiv s0)> +// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0] -> (512 ceildiv s0)> +// CHECK-DAG: [[UBO2:#map[0-9]+]] = affine_map<()[s0] -> (1024 ceildiv s0)> + +// CHECK: func @loop_tiling_non_unit_step([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index) +// CHECK-NEXT: affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]]{{.*}}step 4 +// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO1]](){{.*}}[[ARG1]]{{.*}} step 3 +// CHECK-NEXT: affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO2]](){{.*}}[[ARG2]]{{.*}} step 2 +// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} step 4 +// CHECK-NEXT: affine.for %[[J:.*]] = [[LBI]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} to min [[UBI1]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} step 3 +// CHECK-NEXT: affine.for %[[K:.*]] = [[LBI]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} to min [[UBI2]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} step 2 +// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) +func @loop_tiling_non_unit_step(%t0: index, %t1: index, %t2: index){ + affine.for %i = 0 to 256 step 4 { + affine.for %j = 0 to 512 step 3 { + affine.for %k = 0 to 1024 step 2 { + "test.foo"(%i, %j, %k) : (index, index, index) -> () + } + } + } + return +} + +// ----- + +// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1, s2] -> (d0 * s2 + s2, s0, 4096 floordiv s1)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1, s2] -> (s0 ceildiv s2, (4096 floordiv 
s1) ceildiv s2)> + +// CHECK: func @tile_loop_with_div_in_upper_bound([[ARG0:%arg[0-9]+]]: index, %{{.*}}: memref, %{{.*}}: index, %{{.*}}: index) +#ub = affine_map<()[s0, s1] -> (s0, 4096 floordiv s1)> +func @tile_loop_with_div_in_upper_bound(%t5 : index, %A : memref, %L : index, %U : index) { + %c0 = constant 0 : index + %M = dim %A, %c0 : memref + affine.for %i = 0 to min #ub()[%M, %U] { + addi %i, %i : index + } + // CHECK: affine.for [[ARG1:%arg[0-9]+]] = 0 to min [[UBO0]]()[%{{.*}}, %{{.*}}, [[ARG0]]] + // CHECK-NEXT: affine.for %[[I:.*]] = [[LBI0]]([[ARG1]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}})[{{.*}}, {{.*}}, [[ARG0]]] + // CHECK-NEXT: addi %[[I]], %[[I]] + return +} + +// ----- + +// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1, s2] -> (d0 * s2 + s2 * 4, s0, 4096 floordiv s1)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1, s2] -> (s0 ceildiv s2, (4096 floordiv s1) ceildiv s2)> + +// CHECK: func @tile_loop_with_div_in_upper_bound_non_unit_step([[ARG0:%arg[0-9]+]]: index, %{{.*}}: memref, %{{.*}}: index, %{{.*}}: index) +#ub = affine_map<()[s0, s1] -> (s0, 4096 floordiv s1)> +func @tile_loop_with_div_in_upper_bound_non_unit_step(%t5 : index, %A : memref, %L : index, %U : index) { + %c0 = constant 0 : index + %M = dim %A, %c0 : memref + affine.for %i = 0 to min #ub()[%M, %U] step 4 { + addi %i, %i : index + } + // CHECK: affine.for [[ARG1:%arg[0-9]+]] = 0 to min [[UBO0]]()[%{{.*}}, %{{.*}}, [[ARG0]]]{{.*}} step 4{{.*}} + // CHECK-NEXT: affine.for %[[I:.*]] = [[LBI0]]([[ARG1]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}})[{{.*}}, {{.*}}, [[ARG0]]]{{.*}} step 4{{.*}} + // CHECK-NEXT: addi %[[I]], %[[I]] + return +} + +// ----- + +// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> ((d0 - 8) * s0 + 8)> +// CHECK-DAG: [[UBI2:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 - 8) * s1 + s1 * 4 + 8, s0 + 16)> +// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 - 8) * s1 + s1 + 8, s0 + 16)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> ((d0 - 8) * s0 + s0 + 8, 256)> +// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0, s1] -> ((s0 + 8) ceildiv s1 + 8)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (248 ceildiv s0 + 8)> + +// CHECK: func @tile_loop_with_non_zero_lb([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index, %{{.*}}: index) +// CHECK-NEXT: affine.for [[ARG3:%arg[0-9+]]] = 8 to [[UBO0]]{{.*}}[[ARG0]]{{.*}} +// CHECK-NEXT: affine.for [[ARG4:%arg[0-9+]]] = 8 to [[UBO1]]{{.*}}[[ARG1]]{{.*}} +// CHECK-NEXT: affine.for [[ARG5:%arg[0-9+]]] = 8 to [[UBO1]]{{.*}}[[ARG2]]{{.*}} step 4 +// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI0]]([[ARG3]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]([[ARG3]]){{.*}}[[ARG0]]{{.*}} +// CHECK-NEXT: affine.for %[[J:.*]] = [[LBI0]]([[ARG4]]){{.*}}[[ARG1]]{{.*}} to min [[UBI1]]([[ARG4]]){{.*}}[[ARG1]]{{.*}} +// CHECK-NEXT: affine.for %[[K:.*]] = [[LBI0]]([[ARG5]]){{.*}}[[ARG2]]{{.*}} to min [[UBI2]]([[ARG5]]){{.*}}[[ARG2]]{{.*}}step 4{{.*}} +// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) : (index, index, index) -> () +#ubi = affine_map<()[s0] -> (s0 + 16)> +func @tile_loop_with_non_zero_lb(%t0: index, %t1: index, %t2: index, %U: index){ + affine.for %i = 8 to 256 { + affine.for %j = 8 to #ubi()[%U] { + affine.for %k = 8 to #ubi()[%U] step 4 { + "test.foo"(%i, %j, %k) : (index, index, index) -> () + } + } + } + return +} + +// ----- + +// CHECK-DAG: [[LBI:#map[0-9]+]] = affine_map<(d0)[s0] -> 
(d0 * s0)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 256)> +// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0 + s0, 250)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0] -> (256 ceildiv s0)> +// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<()[s0] -> (250 ceildiv s0)> + +// CHECK: func @simple_matmul([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index{{.*}}) +// CHECK-NEXT: affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]]{{.*}} +// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG1]]{{.*}} +// CHECK-NEXT: affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO1]](){{.*}}[[ARG2]]{{.*}} +// CHECK-NEXT: affine.for %[[I:.*]] = [[LBI]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG0]]{{.*}} +// CHECK-NEXT: affine.for %[[J:.*]] = [[LBI]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} to min [[UBI0]]{{.*}}[[ARG4]]{{.*}}[[ARG1]]{{.*}} +// CHECK-NEXT: affine.for %[[K:.*]] = [[LBI]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} to min [[UBI1]]{{.*}}[[ARG5]]{{.*}}[[ARG2]]{{.*}} +// CHECK-NEXT: affine.load %{{.*}}[%[[I]], %[[K]]] +// CHECK-NEXT: affine.load %{{.*}}[%[[K]], %[[J]]] +// CHECK-NEXT: affine.load %{{.*}}[%[[I]], %[[J]]] +// CHECK-NEXT: mulf %{{.*}} +// CHECK-NEXT: addf %{{.*}} +// CHECK-NEXT: affine.store %{{.*}}[%[[I]], %[[J]]] +func @simple_matmul(%t6 : index, %t7 : index, %t8 : index, %arg0: memref<256x256xvector<64xf32>>, %arg1: memref<256x256xvector<64xf32>>, %arg2: memref<256x256xvector<64xf32>>) -> memref<256x256xvector<64xf32>> { + affine.for %i = 0 to 256 { + affine.for %j = 0 to 256 { + affine.for %k = 0 to 250 { + %l = affine.load %arg0[%i, %k] : memref<256x256xvector<64xf32>> + %r = affine.load %arg1[%k, %j] : memref<256x256xvector<64xf32>> + %o = affine.load %arg2[%i, %j] : memref<256x256xvector<64xf32>> + %m = mulf %l, %r : vector<64xf32> + %a = addf %o, %m : vector<64xf32> + affine.store %a, %arg2[%i, %j] : memref<256x256xvector<64xf32>> + } + } + } + return %arg2 : memref<256x256xvector<64xf32>> +} + +// ----- + +// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s1, s0)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)> + +// CHECK: func @tile_with_symbolic_loop_upper_bounds([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index{{.*}}){{.*}} +// CHECK: affine.for [[ARG2:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG0]]{{.*}} +// CHECK-NEXT: affine.for [[ARG3:%arg[0-9]+]] = 0 to [[UBO0]](){{.*}}[[ARG1]]{{.*}} +// CHECK-NEXT: affine.for %[[I0:.*]] = [[LBI0]]{{.*}}[[ARG2]]{{.*}}[[ARG0]]{{.*}} to min [[UBI0]]{{.*}}[[ARG2]]{{.*}}[[ARG0]]{{.*}} +// CHECK-NEXT: affine.for %[[I1:.*]] = [[LBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG1]]{{.*}} to min [[UBI0]]{{.*}}[[ARG3]]{{.*}}[[ARG1]]{{.*}} +// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%[[I0]], %[[I1]]] : memref +// CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} { +// CHECK-NEXT: affine.load %{{.*}}%[[I0]], %[[I2]] +// CHECK-NEXT: affine.load %{{.*}}%[[I2]], %[[I1]] +// CHECK-NEXT: mulf +// CHECK-NEXT: affine.load %{{.*}}%[[I0]], %[[I1]] +// CHECK-NEXT: addf +// CHECK-NEXT: affine.store %{{.*}}%[[I0]], %[[I1]] +func @tile_with_symbolic_loop_upper_bounds(%t9 : index, %t10: index, %arg0: memref, %arg1: memref, %arg2: memref) { + %cst = constant 0.000000e+00 : f32 + %c0 = constant 0 : index + %0 = dim %arg0, %c0 : memref + affine.for %i0 = 0 to %0 { + affine.for %i1 = 0 to %0 { + affine.store %cst, %arg2[%i0, %i1] : 
memref + affine.for %i2 = 0 to %0 { + %1 = affine.load %arg0[%i0, %i2] : memref + %2 = affine.load %arg1[%i2, %i1] : memref + %3 = mulf %1, %2 : f32 + %4 = affine.load %arg2[%i0, %i1] : memref + %5 = addf %4, %3 : f32 + affine.store %5, %arg2[%i0, %i1] : memref + } + } + } + return +} + +// ----- + +// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0)[s0, s1, s2] -> (d0 * s2 + s2, s0 + s1)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<()[s0, s1, s2] -> ((s0 + s1) ceildiv s2)> + +// CHECK: func @tile_with_loop_upper_bounds_in_two_symbols([[ARG0:%arg[0-9]+]]: index{{.*}}){{.*}} +func @tile_with_loop_upper_bounds_in_two_symbols(%t11 : index, %arg0: memref, %limit: index) { + %c0 = constant 0 : index + %dim0 = dim %arg0, %c0 : memref + affine.for %i0 = 0 to affine_map<()[s0, s1] -> (s0 + s1)> ()[%dim0, %limit] { + %v0 = affine.load %arg0[%i0] : memref + } + // CHECK: affine.for [[ARG1:%arg[0-9]+]] = 0 to [[UBO0]]()[%{{.*}}, %{{.*}}, [[ARG0]]] + // CHECK-NEXT: affine.for %[[I:.*]] = [[LBI0]]([[ARG1]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]([[ARG1]])[{{.*}}, {{.*}}, [[ARG0]]] + // CHECK-NEXT: affine.load %{{.*}}[%[[I]]] + return +} + +// ----- + +// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1, d0 + s0 + 4)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1, d0 + s0 + 2)> +// CHECK-DAG: [[LBO0:#map[0-9]+]] = affine_map<() -> (0)> +// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 4) ceildiv s1)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 2) ceildiv s1)> + +// CHECK: func @tile_with_upper_bounds_in_dimensions_and_symbols([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index, [[ARG3:%arg[0-9]+]]: index{{.*}}){{.*}} +// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO0]]({{.*}}){{.*}}[[ARG0]] +// CHECK-NEXT: affine.for [[ARG5:%arg[0-9]+]] = 0 to [[UBO1]]({{.*}}){{.*}}[[ARG1]] +// CHECK-NEXT: affine.for {{.*}} = [[LBI0]]([[ARG4]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}}, [[ARG4]]){{.*}}[[ARG0]]{{.*}} +// CHECK-NEXT: affine.for {{.*}} = [[LBI0]]([[ARG5]]){{.*}}[[ARG1]]{{.*}} to min [[UBI1]]({{.*}}, [[ARG5]]){{.*}}[[ARG1]]{{.*}} +func @tile_with_upper_bounds_in_dimensions_and_symbols(%t12 : index, %t13 :index, %M: index, %N: index, %K: index) { + affine.for %i = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 2)>(%M)[%K] { + affine.for %j = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 4)>(%N)[%K] { + "test.foo" () : () -> () + } + } + return +} + +// ----- + +// CHECK-DAG: [[LBI0:#map[0-9]+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: [[UBI1:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1 * 4, d0 + s0 + 4)> +// CHECK-DAG: [[UBI0:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 * s1 + s1 * 2, d0 + s0 + 2)> +// CHECK-DAG: [[LBO0:#map[0-9]+]] = affine_map<() -> (0)> +// CHECK-DAG: [[UBO1:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 4) ceildiv s1)> +// CHECK-DAG: [[UBO0:#map[0-9]+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0 + 2) ceildiv s1)> + +// CHECK: func @tile_with_upper_bounds_in_dimensions_and_symbols_non_unit_steps +// CHECK-SAME: ([[ARG0:%arg[0-9]+]]: index, [[ARG1:%arg[0-9]+]]: index, [[ARG2:%arg[0-9]+]]: index, [[ARG3:%arg[0-9]+]]: index{{.*}}){{.*}} +// CHECK-NEXT: affine.for [[ARG4:%arg[0-9]+]] = 0 to [[UBO0]]({{.*}}){{.*}}[[ARG0]]{{.*}} step 2{{.*}} +// CHECK-NEXT: affine.for 
[[ARG5:%arg[0-9]+]] = 0 to [[UBO1]]({{.*}}){{.*}}[[ARG1]]{{.*}} step 4{{.*}} +// CHECK-NEXT: affine.for {{.*}} = [[LBI0]]([[ARG4]]){{.*}}[[ARG0]]{{.*}} to min [[UBI0]]({{.*}}, [[ARG4]]){{.*}}[[ARG0]]{{.*}} step 2{{.*}} +// CHECK-NEXT: affine.for {{.*}} = [[LBI0]]([[ARG5]]){{.*}}[[ARG1]]{{.*}} to min [[UBI1]]({{.*}}, [[ARG5]]){{.*}}[[ARG1]]{{.*}} step 4{{.*}} +func @tile_with_upper_bounds_in_dimensions_and_symbols_non_unit_steps(%t12 : index, %t13 : index, %M: index, %N : index, %K: index) { + affine.for %i = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 2)>(%M)[%K] step 2 { + affine.for %j = 0 to affine_map<(d0)[s0] -> (d0 + s0 + 4)>(%N)[%K] step 4 { + "test.foo" () : () -> () + } + } + return +} diff --git a/mlir/test/Dialect/Affine/ops.mlir b/mlir/test/Dialect/Affine/ops.mlir index cd60869106485..627104bae976b 100644 --- a/mlir/test/Dialect/Affine/ops.mlir +++ b/mlir/test/Dialect/Affine/ops.mlir @@ -184,3 +184,53 @@ func @affine_if() -> f32 { // CHECK: return %[[OUT]] : f32 return %0 : f32 } + +// ----- + +// Test affine.for with yield values. + +#set = affine_set<(d0): (d0 - 10 >= 0)> + +// CHECK-LABEL: func @yield_loop +func @yield_loop(%buffer: memref<1024xf32>) -> f32 { + %sum_init_0 = constant 0.0 : f32 + %res = affine.for %i = 0 to 10 step 2 iter_args(%sum_iter = %sum_init_0) -> f32 { + %t = affine.load %buffer[%i] : memref<1024xf32> + %sum_next = affine.if #set(%i) -> (f32) { + %new_sum = addf %sum_iter, %t : f32 + affine.yield %new_sum : f32 + } else { + affine.yield %sum_iter : f32 + } + affine.yield %sum_next : f32 + } + return %res : f32 +} +// CHECK: %[[const_0:.*]] = constant 0.000000e+00 : f32 +// CHECK-NEXT: %[[output:.*]] = affine.for %{{.*}} = 0 to 10 step 2 iter_args(%{{.*}} = %[[const_0]]) -> (f32) { +// CHECK: affine.if #set0(%{{.*}}) -> f32 { +// CHECK: affine.yield %{{.*}} : f32 +// CHECK-NEXT: } else { +// CHECK-NEXT: affine.yield %{{.*}} : f32 +// CHECK-NEXT: } +// CHECK-NEXT: affine.yield %{{.*}} : f32 +// CHECK-NEXT: } +// CHECK-NEXT: return %[[output]] : f32 + +// CHECK-LABEL: func @affine_for_multiple_yield +func @affine_for_multiple_yield(%buffer: memref<1024xf32>) -> (f32, f32) { + %init_0 = constant 0.0 : f32 + %res1, %res2 = affine.for %i = 0 to 10 step 2 iter_args(%iter_arg1 = %init_0, %iter_arg2 = %init_0) -> (f32, f32) { + %t = affine.load %buffer[%i] : memref<1024xf32> + %ret1 = addf %t, %iter_arg1 : f32 + %ret2 = addf %t, %iter_arg2 : f32 + affine.yield %ret1, %ret2 : f32, f32 + } + return %res1, %res2 : f32, f32 +} +// CHECK: %[[const_0:.*]] = constant 0.000000e+00 : f32 +// CHECK-NEXT: %[[output:[0-9]+]]:2 = affine.for %{{.*}} = 0 to 10 step 2 iter_args(%[[iter_arg1:.*]] = %[[const_0]], %[[iter_arg2:.*]] = %[[const_0]]) -> (f32, f32) { +// CHECK: %[[res1:.*]] = addf %{{.*}}, %[[iter_arg1]] : f32 +// CHECK-NEXT: %[[res2:.*]] = addf %{{.*}}, %[[iter_arg2]] : f32 +// CHECK-NEXT: affine.yield %[[res1]], %[[res2]] : f32, f32 +// CHECK-NEXT: } diff --git a/mlir/test/Dialect/Linalg/inlining.mlir b/mlir/test/Dialect/Linalg/inlining.mlir new file mode 100644 index 0000000000000..1e5af263eb832 --- /dev/null +++ b/mlir/test/Dialect/Linalg/inlining.mlir @@ -0,0 +1,31 @@ +// RUN: mlir-opt %s -inline | FileCheck %s + +// These tests verify that regions with operations from the Linalg dialect +// can be inlined.
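+//
+// For illustration: after -inline, the call in @inline_into below is
+// expected to be replaced by the body of @inlined_fn, leaving roughly the
+// following (a sketch, assuming 1-D dynamic memrefs of f32 as suggested by
+// the 1-D indexing maps in #trait and the f32 block arguments):
+//
+//   func @inline_into(%arg0: memref<?xf32>) {
+//     linalg.generic #trait %arg0, %arg0 {
+//     ^bb(%0 : f32, %1 : f32) :
+//       linalg.yield %0 : f32
+//     } : memref<?xf32>, memref<?xf32>
+//     return
+//   }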
+ +#accesses = [ + affine_map<(i) -> (i)>, + affine_map<(i) -> (i)> +] + +#trait = { + args_in = 1, + args_out = 1, + indexing_maps = #accesses, + iterator_types = ["parallel"] +} + +func @inline_into(%arg0: memref) { + // CHECK: linalg.generic + call @inlined_fn(%arg0) : (memref) -> () + return +} + +func @inlined_fn(%arg0: memref) { + // CHECK: linalg.generic + linalg.generic #trait %arg0, %arg0 { + ^bb(%0 : f32, %1 : f32) : + linalg.yield %0 : f32 + } : memref, memref + return +} diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir index ca59ecd387ec3..3774aed7ad1f0 100644 --- a/mlir/test/Dialect/Linalg/invalid.mlir +++ b/mlir/test/Dialect/Linalg/invalid.mlir @@ -35,14 +35,21 @@ func @store_number_of_indices(%v : memref) { func @transpose_not_permutation(%v : memref(off + M * i + j)>>) { // expected-error @+1 {{expected a permutation map}} - linalg.transpose %v (i, j) -> (i, i) : memref(off + M * i + j)>> + linalg.transpose %v (i, j) -> (i, i) : memref(off + M * i + j)>> to memref(off + M * i + j)>> } // ----- func @transpose_bad_rank(%v : memref(off + M * i + j)>>) { // expected-error @+1 {{expected a permutation map of same rank as the view}} - linalg.transpose %v (i) -> (i) : memref(off + M * i + j)>> + linalg.transpose %v (i) -> (i) : memref(off + M * i + j)>> to memref(off + M * i + j)>> +} + +// ----- + +func @transpose_wrong_type(%v : memref(off + M * i + j)>>) { + // expected-error @+1 {{output type 'memref (d0 * s1 + s0 + d1)>>' does not match transposed input type 'memref (d0 * s1 + s0 + d1)>>'}} + linalg.transpose %v (i, j) -> (j, i) : memref(off + M * i + j)>> to memref(off + M * i + j)>> } // ----- @@ -106,7 +113,7 @@ func @generic_mismatched_num_returns(%arg0: memref) { // ----- func @generic_symbol_in_map(%arg0: memref) { - // expected-error @+1 {{expected the number of symbols in indexing_map #0 to match target rank}} + // expected-error @+1 {{expected the number of symbols in indexing_map #0 to match rank of operand `symbol_source`}} linalg.generic { args_in = 0, args_out = 1, @@ -507,3 +514,20 @@ func @named_ops(%a3: memref, %b3: memref, %c3: memref, memref, memref) -> () return } + +// ----- + +func @generic(%arg0: tensor) { + // expected-error @+1 {{unexpected #results > #outputs}} + linalg.generic { + args_in = 1, + args_out = 1, + indexing_maps = [ affine_map<(i) -> (i)> ], + iterator_types = ["parallel"] + } %arg0 { + ^bb(%0: i4) : + %1 = std.addi %0, %0: i4 + linalg.yield %1, %1: i4, i4 + } : tensor -> (tensor, tensor) + return +} diff --git a/mlir/test/Dialect/Linalg/llvm.mlir b/mlir/test/Dialect/Linalg/llvm.mlir index 02693e5d1be46..c8031824d6307 100644 --- a/mlir/test/Dialect/Linalg/llvm.mlir +++ b/mlir/test/Dialect/Linalg/llvm.mlir @@ -70,7 +70,7 @@ func @slice_with_range_and_index(%arg0: memref, ptr, i64, array<1 x i64>, array<1 x i64>)> func @transpose(%arg0: memref) { - %0 = linalg.transpose %arg0 (i, j, k) -> (k, i, j) : memref + %0 = linalg.transpose %arg0 (i, j, k) -> (k, i, j) : memref to memref (d2 * s1 + s0 + d0 * s2 + d1)>> return } // CHECK-LABEL: func @transpose diff --git a/mlir/test/Dialect/Linalg/loops.mlir b/mlir/test/Dialect/Linalg/loops.mlir index 6af53a2b8d222..1e10e036ee2d7 100644 --- a/mlir/test/Dialect/Linalg/loops.mlir +++ b/mlir/test/Dialect/Linalg/loops.mlir @@ -1318,14 +1318,15 @@ func @conv1d_no_symbols(%in : memref, %filter : memref, %out : mem // CHECKPARALLEL: %[[c1:.*]] = constant 1 : index // CHECKPARALLEL: %[[dim0:.*]] = dim %[[arg1]], %[[c0]] : memref // CHECKPARALLEL: %[[dim1:.*]] = dim 
%[[arg2]], %[[c0]] : memref -// CHECKPARALLEL: scf.parallel (%[[b:.*]], %[[m:.*]]) = (%[[c0]], %[[c0]]) to (%[[dim1]], %[[dim0]]) step (%[[c1]], %[[c1]]) { -// CHECKPARALLEL: %[[aff:.*]] = affine.apply #[[$stride1Dilation1]](%[[b]], %[[m]]) -// CHECKPARALLEL: %[[vb:.*]] = load %[[arg0]][%[[aff]]] : memref -// CHECKPARALLEL: %[[va:.*]] = load %[[arg1]][%[[m]]] : memref -// CHECKPARALLEL: %[[vc:.*]] = load %[[arg2]][%[[b]]] : memref -// CHECKPARALLEL: %[[inc:.*]] = mulf %[[vb]], %[[va]] : f32 -// CHECKPARALLEL: %[[res:.*]] = addf %[[vc]], %[[inc]] : f32 -// CHECKPARALLEL: store %[[res]], %[[arg2]][%[[b]]] : memref +// CHECKPARALLEL: scf.parallel (%[[b:.*]]) = (%[[c0]]) to (%[[dim1]]) step (%[[c1]]) { +// CHECKPARALLEL: scf.for %[[m:.*]] = %[[c0]] to %[[dim0]] step %[[c1]] { +// CHECKPARALLEL: %[[aff:.*]] = affine.apply #[[$stride1Dilation1]](%[[b]], %[[m]]) +// CHECKPARALLEL: %[[vb:.*]] = load %[[arg0]][%[[aff]]] : memref +// CHECKPARALLEL: %[[va:.*]] = load %[[arg1]][%[[m]]] : memref +// CHECKPARALLEL: %[[vc:.*]] = load %[[arg2]][%[[b]]] : memref +// CHECKPARALLEL: %[[inc:.*]] = mulf %[[vb]], %[[va]] : f32 +// CHECKPARALLEL: %[[res:.*]] = addf %[[vc]], %[[inc]] : f32 +// CHECKPARALLEL: store %[[res]], %[[arg2]][%[[b]]] : memref func @conv2d_no_symbols(%in : memref, %filter : memref, %out : memref) -> () { @@ -1367,15 +1368,17 @@ func @conv2d_no_symbols(%in : memref, %filter : memref, %out : // CHECKPARALLEL: %[[dim1:.*]] = dim %[[arg1]], %[[c1]] : memref // CHECKPARALLEL: %[[dim2:.*]] = dim %[[arg2]], %[[c0]] : memref // CHECKPARALLEL: %[[dim3:.*]] = dim %[[arg2]], %[[c1]] : memref -// CHECKPARALLEL: scf.parallel (%[[arg3:.*]], %[[arg4:.*]], %[[arg5:.*]], %[[arg6:.*]]) = (%[[c0]], %[[c0]], %[[c0]], %[[c0]]) to (%[[dim2]], %[[dim3]], %[[dim0]], %[[dim1]]) step (%[[c1]], %[[c1]], %[[c1]], %[[c1]]) { -// CHECKPARALLEL: %[[aff:.*]] = affine.apply #[[$stride1Dilation1]](%[[arg3]], %[[arg5]]) -// CHECKPARALLEL: %[[aff2:.*]] = affine.apply #[[$stride1Dilation1]](%[[arg4]], %[[arg6]]) -// CHECKPARALLEL: %[[vb:.*]] = load %[[arg0]][%[[aff]], %[[aff2]]] : memref -// CHECKPARALLEL: %[[va:.*]] = load %[[arg1]][%[[arg5]], %[[arg6]]] : memref -// CHECKPARALLEL: %[[vc:.*]] = load %[[arg2]][%[[arg3]], %[[arg4]]] : memref -// CHECKPARALLEL: %[[inc:.*]] = mulf %[[vb]], %[[va]] : f32 -// CHECKPARALLEL: %[[res:.*]] = addf %[[vc]], %[[inc]] : f32 -// CHECKPARALLEL: store %[[res]], %[[arg2]][%[[arg3]], %[[arg4]]] : memref +// CHECKPARALLEL: scf.parallel (%[[arg3:.*]], %[[arg4:.*]]) = (%[[c0]], %[[c0]]) to (%[[dim2]], %[[dim3]]) step (%[[c1]], %[[c1]]) { +// CHECKPARALLEL: scf.for %[[arg5:.*]] = %[[c0]] to %[[dim0]] step %[[c1]] { +// CHECKPARALLEL: scf.for %[[arg6:.*]] = %[[c0]] to %[[dim1]] step %[[c1]] { +// CHECKPARALLEL: %[[aff:.*]] = affine.apply #[[$stride1Dilation1]](%[[arg3]], %[[arg5]]) +// CHECKPARALLEL: %[[aff2:.*]] = affine.apply #[[$stride1Dilation1]](%[[arg4]], %[[arg6]]) +// CHECKPARALLEL: %[[vb:.*]] = load %[[arg0]][%[[aff]], %[[aff2]]] : memref +// CHECKPARALLEL: %[[va:.*]] = load %[[arg1]][%[[arg5]], %[[arg6]]] : memref +// CHECKPARALLEL: %[[vc:.*]] = load %[[arg2]][%[[arg3]], %[[arg4]]] : memref +// CHECKPARALLEL: %[[inc:.*]] = mulf %[[vb]], %[[va]] : f32 +// CHECKPARALLEL: %[[res:.*]] = addf %[[vc]], %[[inc]] : f32 +// CHECKPARALLEL: store %[[res]], %[[arg2]][%[[arg3]], %[[arg4]]] : memref func @conv3d_no_symbols(%in : memref, %filter : memref, %out : memref) -> () { @@ -1427,13 +1430,16 @@ func @conv3d_no_symbols(%in : memref, %filter : memref, %o // CHECKPARALLEL: %[[dim3:.*]] = dim 
%[[arg2]], %[[c0]] : memref // CHECKPARALLEL: %[[dim4:.*]] = dim %[[arg2]], %[[c1]] : memref // CHECKPARALLEL: %[[dim5:.*]] = dim %[[arg2]], %[[c2]] : memref -// CHECKPARALLEL: scf.parallel (%[[arg3:.*]], %[[arg4:.*]], %[[arg5:.*]], %[[arg6:.*]], %[[arg7:.*]], %[[arg8:.*]]) = (%[[c0]], %[[c0]], %[[c0]], %[[c0]], %[[c0]], %[[c0]]) to (%[[dim3]], %[[dim4]], %[[dim5]], %[[dim0]], %[[dim1]], %[[dim2]]) step (%[[c1]], %[[c1]], %[[c1]], %[[c1]], %[[c1]], %[[c1]]) { -// CHECKPARALLEL: %[[aff:.*]] = affine.apply #[[$stride1Dilation1]](%[[arg3]], %[[arg6]]) -// CHECKPARALLEL: %[[aff2:.*]] = affine.apply #[[$stride1Dilation1]](%[[arg4]], %[[arg7]]) -// CHECKPARALLEL: %[[aff3:.*]] = affine.apply #[[$stride1Dilation1]](%[[arg5]], %[[arg8]]) -// CHECKPARALLEL: %[[vb:.*]] = load %[[arg0]][%[[aff]], %[[aff2]], %[[aff3]]] : memref -// CHECKPARALLEL: %[[va:.*]] = load %[[arg1]][%[[arg6]], %[[arg7]], %[[arg8]]] : memref -// CHECKPARALLEL: %[[vc:.*]] = load %[[arg2]][%[[arg3]], %[[arg4]], %[[arg5]]] : memref -// CHECKPARALLEL: %[[inc:.*]] = mulf %[[vb]], %[[va]] : f32 -// CHECKPARALLEL: %[[res:.*]] = addf %[[vc]], %[[inc]] : f32 -// CHECKPARALLEL: store %[[res]], %[[arg2]][%[[arg3]], %[[arg4]], %[[arg5]]] : memref +// CHECKPARALLEL: scf.parallel (%[[arg3:.*]], %[[arg4:.*]], %[[arg5:.*]]) = (%[[c0]], %[[c0]], %[[c0]]) to (%[[dim3]], %[[dim4]], %[[dim5]]) step (%[[c1]], %[[c1]], %[[c1]]) { +// CHECKPARALLEL: scf.for %[[arg6:.*]] = %[[c0]] to %[[dim0]] step %[[c1]] { +// CHECKPARALLEL: scf.for %[[arg7:.*]] = %[[c0]] to %[[dim1]] step %[[c1]] { +// CHECKPARALLEL: scf.for %[[arg8:.*]] = %[[c0]] to %[[dim2]] step %[[c1]] { +// CHECKPARALLEL: %[[aff:.*]] = affine.apply #[[$stride1Dilation1]](%[[arg3]], %[[arg6]]) +// CHECKPARALLEL: %[[aff2:.*]] = affine.apply #[[$stride1Dilation1]](%[[arg4]], %[[arg7]]) +// CHECKPARALLEL: %[[aff3:.*]] = affine.apply #[[$stride1Dilation1]](%[[arg5]], %[[arg8]]) +// CHECKPARALLEL: %[[vb:.*]] = load %[[arg0]][%[[aff]], %[[aff2]], %[[aff3]]] : memref +// CHECKPARALLEL: %[[va:.*]] = load %[[arg1]][%[[arg6]], %[[arg7]], %[[arg8]]] : memref +// CHECKPARALLEL: %[[vc:.*]] = load %[[arg2]][%[[arg3]], %[[arg4]], %[[arg5]]] : memref +// CHECKPARALLEL: %[[inc:.*]] = mulf %[[vb]], %[[va]] : f32 +// CHECKPARALLEL: %[[res:.*]] = addf %[[vc]], %[[inc]] : f32 +// CHECKPARALLEL: store %[[res]], %[[arg2]][%[[arg3]], %[[arg4]], %[[arg5]]] : memref diff --git a/mlir/test/Dialect/Linalg/roundtrip.mlir b/mlir/test/Dialect/Linalg/roundtrip.mlir index 2696643246972..404c978fa61bb 100644 --- a/mlir/test/Dialect/Linalg/roundtrip.mlir +++ b/mlir/test/Dialect/Linalg/roundtrip.mlir @@ -123,14 +123,15 @@ func @fill_view(%arg0: memref, %arg1: f32) { // ----- // CHECK-DAG: #[[$strided3D:.*]] = affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2 + d2)> +// CHECK-DAG: #[[$strided3DT:.*]] = affine_map<(d0, d1, d2)[s0, s1, s2] -> (d2 * s1 + s0 + d1 * s2 + d0)> func @transpose(%arg0: memref) { - %0 = linalg.transpose %arg0 (i, j, k) -> (k, j, i) : memref + %0 = linalg.transpose %arg0 (i, j, k) -> (k, j, i) : memref to memref (d2 * s1 + s0 + d1 * s2 + d0)>> return } // CHECK-LABEL: func @transpose // CHECK: linalg.transpose %{{.*}} ([[i:.*]], [[j:.*]], [[k:.*]]) -> ([[k]], [[j]], [[i]]) : -// CHECK-SAME: memref +// CHECK-SAME: memref to memref // ----- diff --git a/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir b/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir index 83e9461d66cc9..683aeb2413182 100644 --- a/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir 
+++ b/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir @@ -13,13 +13,8 @@ func @matmul(%A: memref<1584x1584xf32, offset: 0, strides: [1584, 1]>, } // CHECK-LABEL:func @matmul -// CHECK: vector.broadcast {{.*}} : f32 to vector<8x16xf32> // CHECK: store {{.*}}[] : memref> -// -// CHECK: vector.broadcast {{.*}} : f32 to vector<16x12xf32> // CHECK: store {{.*}}[] : memref> -// -// CHECK: vector.broadcast {{.*}} : f32 to vector<8x12xf32> // CHECK: store {{.*}}[] : memref> // // CHECK: linalg.copy diff --git a/mlir/test/Dialect/OpenACC/invalid.mlir b/mlir/test/Dialect/OpenACC/invalid.mlir new file mode 100644 index 0000000000000..61a13211ba262 --- /dev/null +++ b/mlir/test/Dialect/OpenACC/invalid.mlir @@ -0,0 +1,70 @@ +// RUN: mlir-opt -split-input-file -verify-diagnostics %s + +// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +acc.loop gang { + "some.op"() : () -> () + acc.yield +} attributes {seq} + +// ----- + +// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +acc.loop worker { + "some.op"() : () -> () + acc.yield +} attributes {seq} + +// ----- + +// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +acc.loop vector { + "some.op"() : () -> () + acc.yield +} attributes {seq} + +// ----- + +// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +acc.loop gang worker { + "some.op"() : () -> () + acc.yield +} attributes {seq} + +// ----- + +// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +acc.loop gang vector { + "some.op"() : () -> () + acc.yield +} attributes {seq} + +// ----- + +// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +acc.loop worker vector { + "some.op"() : () -> () + acc.yield +} attributes {seq} + +// ----- + +// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +acc.loop gang worker vector { + "some.op"() : () -> () + acc.yield +} attributes {seq} + +// ----- + +// expected-error@+1 {{expected non-empty body.}} +acc.loop { +} + +// ----- + +// expected-error@+1 {{only one of auto, independent, seq can be present at the same time}} +acc.loop { + acc.yield +} attributes {auto_, seq} + +// ----- diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir index 6cdba227d5dab..196949839db47 100644 --- a/mlir/test/Dialect/OpenACC/ops.mlir +++ b/mlir/test/Dialect/OpenACC/ops.mlir @@ -1,15 +1,16 @@ -// RUN: mlir-opt %s | FileCheck %s +// RUN: mlir-opt -allow-unregistered-dialect %s | FileCheck %s // Verify the printed output can be parsed. -// RUN: mlir-opt %s | mlir-opt | FileCheck %s +// RUN: mlir-opt -allow-unregistered-dialect %s | mlir-opt -allow-unregistered-dialect | FileCheck %s // Verify the generic form can be parsed. 
-// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s +// RUN: mlir-opt -allow-unregistered-dialect -mlir-print-op-generic %s | mlir-opt -allow-unregistered-dialect | FileCheck %s func @compute1(%A: memref<10x10xf32>, %B: memref<10x10xf32>, %C: memref<10x10xf32>) -> memref<10x10xf32> { %c0 = constant 0 : index %c10 = constant 10 : index %c1 = constant 1 : index + %async = constant 1 : i64 - acc.parallel async(%c1) { + acc.parallel async(%async: i64) { acc.loop gang vector { scf.for %arg3 = %c0 to %c10 step %c1 { scf.for %arg4 = %c0 to %c10 step %c1 { @@ -35,7 +36,8 @@ func @compute1(%A: memref<10x10xf32>, %B: memref<10x10xf32>, %C: memref<10x10xf3 // CHECK-NEXT: %{{.*}} = constant 0 : index // CHECK-NEXT: %{{.*}} = constant 10 : index // CHECK-NEXT: %{{.*}} = constant 1 : index -// CHECK-NEXT: acc.parallel async(%{{.*}}) { +// CHECK-NEXT: [[ASYNC:%.*]] = constant 1 : i64 +// CHECK-NEXT: acc.parallel async([[ASYNC]]: i64) { // CHECK-NEXT: acc.loop gang vector { // CHECK-NEXT: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { // CHECK-NEXT: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { @@ -113,9 +115,11 @@ func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10xf32>, %lb = constant 0 : index %st = constant 1 : index %c10 = constant 10 : index + %numGangs = constant 10 : i64 + %numWorkers = constant 10 : i64 acc.data present(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10xf32>, %d: memref<10xf32>) { - acc.parallel num_gangs(%c10) num_workers(%c10) private(%c : memref<10xf32>) { + acc.parallel num_gangs(%numGangs: i64) num_workers(%numWorkers: i64) private(%c : memref<10xf32>) { acc.loop gang { scf.for %x = %lb to %c10 step %st { acc.loop worker { @@ -154,8 +158,10 @@ func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10xf32>, // CHECK-NEXT: [[C0:%.*]] = constant 0 : index // CHECK-NEXT: [[C1:%.*]] = constant 1 : index // CHECK-NEXT: [[C10:%.*]] = constant 10 : index +// CHECK-NEXT: [[NUMGANG:%.*]] = constant 10 : i64 +// CHECK-NEXT: [[NUMWORKERS:%.*]] = constant 10 : i64 // CHECK-NEXT: acc.data present(%{{.*}}: memref<10x10xf32>, %{{.*}}: memref<10x10xf32>, %{{.*}}: memref<10xf32>, %{{.*}}: memref<10xf32>) { -// CHECK-NEXT: acc.parallel num_gangs([[C10]]) num_workers([[C10]]) private([[ARG2]]: memref<10xf32>) { +// CHECK-NEXT: acc.parallel num_gangs([[NUMGANG]]: i64) num_workers([[NUMWORKERS]]: i64) private([[ARG2]]: memref<10xf32>) { // CHECK-NEXT: acc.loop gang { // CHECK-NEXT: scf.for %{{.*}} = [[C0]] to [[C10]] step [[C1]] { // CHECK-NEXT: acc.loop worker { @@ -186,27 +192,43 @@ func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10xf32>, // CHECK-NEXT: return %{{.*}} : memref<10xf32> // CHECK-NEXT: } -func @testop() -> () { +func @testop(%a: memref<10xf32>) -> () { %workerNum = constant 1 : i64 %vectorLength = constant 128 : i64 %gangNum = constant 8 : i64 %gangStatic = constant 2 : i64 %tileSize = constant 2 : i64 acc.loop gang worker vector { + "some.op"() : () -> () + acc.yield } acc.loop gang(num: %gangNum) { + "some.op"() : () -> () + acc.yield } acc.loop gang(static: %gangStatic) { + "some.op"() : () -> () + acc.yield } acc.loop worker(%workerNum) { + "some.op"() : () -> () + acc.yield } acc.loop vector(%vectorLength) { + "some.op"() : () -> () + acc.yield } acc.loop gang(num: %gangNum) worker vector { + "some.op"() : () -> () + acc.yield } acc.loop gang(num: %gangNum, static: %gangStatic) worker(%workerNum) vector(%vectorLength) { + "some.op"() : () -> () + acc.yield } acc.loop 
tile(%tileSize : i64, %tileSize : i64) { + "some.op"() : () -> () + acc.yield } return } @@ -217,18 +239,151 @@ func @testop() -> () { // CHECK-NEXT: [[GANGSTATIC:%.*]] = constant 2 : i64 // CHECK-NEXT: [[TILESIZE:%.*]] = constant 2 : i64 // CHECK-NEXT: acc.loop gang worker vector { +// CHECK-NEXT: "some.op"() : () -> () +// CHECK-NEXT: acc.yield // CHECK-NEXT: } // CHECK-NEXT: acc.loop gang(num: [[GANGNUM]]) { +// CHECK-NEXT: "some.op"() : () -> () +// CHECK-NEXT: acc.yield // CHECK-NEXT: } // CHECK-NEXT: acc.loop gang(static: [[GANGSTATIC]]) { +// CHECK-NEXT: "some.op"() : () -> () +// CHECK-NEXT: acc.yield // CHECK-NEXT: } // CHECK-NEXT: acc.loop worker([[WORKERNUM]]) { +// CHECK-NEXT: "some.op"() : () -> () +// CHECK-NEXT: acc.yield // CHECK-NEXT: } // CHECK-NEXT: acc.loop vector([[VECTORLENGTH]]) { +// CHECK-NEXT: "some.op"() : () -> () +// CHECK-NEXT: acc.yield // CHECK-NEXT: } // CHECK-NEXT: acc.loop gang(num: [[GANGNUM]]) worker vector { +// CHECK-NEXT: "some.op"() : () -> () +// CHECK-NEXT: acc.yield // CHECK-NEXT: } // CHECK-NEXT: acc.loop gang(num: [[GANGNUM]], static: [[GANGSTATIC]]) worker([[WORKERNUM]]) vector([[VECTORLENGTH]]) { +// CHECK-NEXT: "some.op"() : () -> () +// CHECK-NEXT: acc.yield // CHECK-NEXT: } // CHECK-NEXT: acc.loop tile([[TILESIZE]]: i64, [[TILESIZE]]: i64) { +// CHECK-NEXT: "some.op"() : () -> () +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } + + +func @testparallelop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x10xf32>) -> () { + %i64value = constant 1 : i64 + %i32value = constant 1 : i32 + %idxValue = constant 1 : index + acc.parallel async(%i64value: i64) { + } + acc.parallel async(%i32value: i32) { + } + acc.parallel async(%idxValue: index) { + } + acc.parallel wait(%i64value: i64) { + } + acc.parallel wait(%i32value: i32) { + } + acc.parallel wait(%idxValue: index) { + } + acc.parallel wait(%i64value: i64, %i32value: i32, %idxValue: index) { + } + acc.parallel num_gangs(%i64value: i64) { + } + acc.parallel num_gangs(%i32value: i32) { + } + acc.parallel num_gangs(%idxValue: index) { + } + acc.parallel num_workers(%i64value: i64) { + } + acc.parallel num_workers(%i32value: i32) { + } + acc.parallel num_workers(%idxValue: index) { + } + acc.parallel vector_length(%i64value: i64) { + } + acc.parallel vector_length(%i32value: i32) { + } + acc.parallel vector_length(%idxValue: index) { + } + acc.parallel copyin(%a: memref<10xf32>, %b: memref<10xf32>) { + } + acc.parallel copyin_readonly(%a: memref<10xf32>, %b: memref<10xf32>) { + } + acc.parallel copyin(%a: memref<10xf32>) copyout_zero(%b: memref<10xf32>, %c: memref<10x10xf32>) { + } + acc.parallel copyout(%b: memref<10xf32>, %c: memref<10x10xf32>) create(%a: memref<10xf32>) { + } + acc.parallel copyout_zero(%b: memref<10xf32>, %c: memref<10x10xf32>) create_zero(%a: memref<10xf32>) { + } + acc.parallel no_create(%a: memref<10xf32>) present(%b: memref<10xf32>, %c: memref<10x10xf32>) { + } + acc.parallel deviceptr(%a: memref<10xf32>) attach(%b: memref<10xf32>, %c: memref<10x10xf32>) { + } + acc.parallel private(%a: memref<10xf32>, %c: memref<10x10xf32>) firstprivate(%b: memref<10xf32>) { + } + acc.parallel { + } attributes {defaultAttr = "none"} + acc.parallel { + } attributes {defaultAttr = "present"} + return +} + +// CHECK: func @testparallelop([[ARGA:%.*]]: memref<10xf32>, [[ARGB:%.*]]: memref<10xf32>, [[ARGC:%.*]]: memref<10x10xf32>) { +// CHECK: [[I64VALUE:%.*]] = constant 1 : i64 +// CHECK: [[I32VALUE:%.*]] = constant 1 : i32 +// CHECK: [[IDXVALUE:%.*]] = constant 1 : index +// CHECK: 
acc.parallel async([[I64VALUE]]: i64) { +// CHECK-NEXT: } +// CHECK: acc.parallel async([[I32VALUE]]: i32) { +// CHECK-NEXT: } +// CHECK: acc.parallel async([[IDXVALUE]]: index) { +// CHECK-NEXT: } +// CHECK: acc.parallel wait([[I64VALUE]]: i64) { +// CHECK-NEXT: } +// CHECK: acc.parallel wait([[I32VALUE]]: i32) { +// CHECK-NEXT: } +// CHECK: acc.parallel wait([[IDXVALUE]]: index) { +// CHECK-NEXT: } +// CHECK: acc.parallel wait([[I64VALUE]]: i64, [[I32VALUE]]: i32, [[IDXVALUE]]: index) { +// CHECK-NEXT: } +// CHECK: acc.parallel num_gangs([[I64VALUE]]: i64) { +// CHECK-NEXT: } +// CHECK: acc.parallel num_gangs([[I32VALUE]]: i32) { +// CHECK-NEXT: } +// CHECK: acc.parallel num_gangs([[IDXVALUE]]: index) { +// CHECK-NEXT: } +// CHECK: acc.parallel num_workers([[I64VALUE]]: i64) { +// CHECK-NEXT: } +// CHECK: acc.parallel num_workers([[I32VALUE]]: i32) { +// CHECK-NEXT: } +// CHECK: acc.parallel num_workers([[IDXVALUE]]: index) { +// CHECK-NEXT: } +// CHECK: acc.parallel vector_length([[I64VALUE]]: i64) { +// CHECK-NEXT: } +// CHECK: acc.parallel vector_length([[I32VALUE]]: i32) { +// CHECK-NEXT: } +// CHECK: acc.parallel vector_length([[IDXVALUE]]: index) { +// CHECK-NEXT: } +// CHECK: acc.parallel copyin([[ARGA]]: memref<10xf32>, [[ARGB]]: memref<10xf32>) { +// CHECK-NEXT: } +// CHECK: acc.parallel copyin_readonly([[ARGA]]: memref<10xf32>, [[ARGB]]: memref<10xf32>) { +// CHECK-NEXT: } +// CHECK: acc.parallel copyin([[ARGA]]: memref<10xf32>) copyout_zero([[ARGB]]: memref<10xf32>, [[ARGC]]: memref<10x10xf32>) { +// CHECK-NEXT: } +// CHECK: acc.parallel copyout([[ARGB]]: memref<10xf32>, [[ARGC]]: memref<10x10xf32>) create([[ARGA]]: memref<10xf32>) { +// CHECK-NEXT: } +// CHECK: acc.parallel copyout_zero([[ARGB]]: memref<10xf32>, [[ARGC]]: memref<10x10xf32>) create_zero([[ARGA]]: memref<10xf32>) { +// CHECK-NEXT: } +// CHECK: acc.parallel no_create([[ARGA]]: memref<10xf32>) present([[ARGB]]: memref<10xf32>, [[ARGC]]: memref<10x10xf32>) { +// CHECK-NEXT: } +// CHECK: acc.parallel deviceptr([[ARGA]]: memref<10xf32>) attach([[ARGB]]: memref<10xf32>, [[ARGC]]: memref<10x10xf32>) { +// CHECK-NEXT: } +// CHECK: acc.parallel private([[ARGA]]: memref<10xf32>, [[ARGC]]: memref<10x10xf32>) firstprivate([[ARGB]]: memref<10xf32>) { // CHECK-NEXT: } +// CHECK: acc.parallel { +// CHECK-NEXT: } attributes {defaultAttr = "none"} +// CHECK: acc.parallel { +// CHECK-NEXT: } attributes {defaultAttr = "present"} diff --git a/mlir/test/Dialect/SCF/invalid.mlir b/mlir/test/Dialect/SCF/invalid.mlir index 517e8855c97b8..06b902da781ca 100644 --- a/mlir/test/Dialect/SCF/invalid.mlir +++ b/mlir/test/Dialect/SCF/invalid.mlir @@ -325,7 +325,7 @@ func @reduceReturn_not_inside_reduce(%arg0 : f32) { func @std_if_incorrect_yield(%arg0: i1, %arg1: f32) { - // expected-error@+1 {{region control flow edge from Region #0 to scf.if has 1 source operands, but target successor needs 2}} + // expected-error@+1 {{region control flow edge from Region #0 to parent results: source has 1 operands, but target successor needs 2}} %x, %y = scf.if %arg0 -> (f32, f32) { %0 = addf %arg1, %arg1 : f32 scf.yield %0 : f32 @@ -401,7 +401,7 @@ func @std_for_operands_mismatch_3(%arg0 : index, %arg1 : index, %arg2 : index) { func @std_for_operands_mismatch_4(%arg0 : index, %arg1 : index, %arg2 : index) { %s0 = constant 0.0 : f32 %t0 = constant 1.0 : f32 - // expected-error @+1 {{along control flow edge from Region #0 to Region #0 source #1 type 'i32' should match input #1 type 'f32'}} + // expected-error @+1 {{along control flow edge from Region #0 to 
Region #0: source type #1 'i32' should match input type #1 'f32'}} %result1:2 = scf.for %i0 = %arg0 to %arg1 step %arg2 iter_args(%si = %s0, %ti = %t0) -> (f32, f32) { %sn = addf %si, %si : f32 diff --git a/mlir/test/Dialect/SCF/loop-unroll.mlir b/mlir/test/Dialect/SCF/loop-unroll.mlir index 775188bf0ed99..0b6e178ed0aab 100644 --- a/mlir/test/Dialect/SCF/loop-unroll.mlir +++ b/mlir/test/Dialect/SCF/loop-unroll.mlir @@ -2,6 +2,7 @@ // RUN: mlir-opt %s -test-loop-unrolling='unroll-factor=3' | FileCheck %s --check-prefix UNROLL-BY-3 // RUN: mlir-opt %s -test-loop-unrolling='unroll-factor=2 loop-depth=0' | FileCheck %s --check-prefix UNROLL-OUTER-BY-2 // RUN: mlir-opt %s -test-loop-unrolling='unroll-factor=2 loop-depth=1' | FileCheck %s --check-prefix UNROLL-INNER-BY-2 +// RUN: mlir-opt %s --affine-loop-unroll='unroll-factor=6 unroll-up-to-factor=true' | FileCheck %s --check-prefix UNROLL-UP-TO func @dynamic_loop_unroll(%arg0 : index, %arg1 : index, %arg2 : index, %arg3: memref) { @@ -248,3 +249,24 @@ func @static_loop_unroll_by_3_promote_epilogue(%arg0 : memref) { // UNROLL-BY-3-NEXT: } // UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[C9]]] : memref // UNROLL-BY-3-NEXT: return + +// Test unroll-up-to functionality. +func @static_loop_unroll_up_to_factor(%arg0 : memref) { + %0 = constant 7.0 : f32 + %lb = constant 0 : index + %ub = constant 2 : index + affine.for %i0 = %lb to %ub { + affine.store %0, %arg0[%i0] : memref + } + return +} +// UNROLL-UP-TO-LABEL: func @static_loop_unroll_up_to_factor +// UNROLL-UP-TO-SAME: %[[MEM:.*0]]: memref +// +// UNROLL-UP-TO-DAG: %[[C0:.*]] = constant 0 : index +// UNROLL-UP-TO-DAG: %[[C2:.*]] = constant 2 : index +// UNROLL-UP-TO-NEXT: %[[V0:.*]] = affine.apply {{.*}} +// UNROLL-UP-TO-NEXT: store %{{.*}}, %[[MEM]][%[[V0]]] : memref +// UNROLL-UP-TO-NEXT: %[[V1:.*]] = affine.apply {{.*}} +// UNROLL-UP-TO-NEXT: affine.store %{{.*}}, %[[MEM]][%[[V1]]] : memref +// UNROLL-UP-TO-NEXT: return \ No newline at end of file diff --git a/mlir/test/Dialect/SPIRV/Serialization/cast-ops.mlir b/mlir/test/Dialect/SPIRV/Serialization/cast-ops.mlir index 76bac23e6f8ff..e04ac316f8736 100644 --- a/mlir/test/Dialect/SPIRV/Serialization/cast-ops.mlir +++ b/mlir/test/Dialect/SPIRV/Serialization/cast-ops.mlir @@ -20,21 +20,41 @@ spv.module Logical GLSL450 requires #spv.vce { %0 = spv.ConvertFToS %arg0 : f32 to i32 spv.ReturnValue %0 : i32 } + spv.func @convert_f64_to_s32(%arg0 : f64) -> i32 "None" { + // CHECK: {{%.*}} = spv.ConvertFToS {{%.*}} : f64 to i32 + %0 = spv.ConvertFToS %arg0 : f64 to i32 + spv.ReturnValue %0 : i32 + } spv.func @convert_f_to_u(%arg0 : f32) -> i32 "None" { // CHECK: {{%.*}} = spv.ConvertFToU {{%.*}} : f32 to i32 %0 = spv.ConvertFToU %arg0 : f32 to i32 spv.ReturnValue %0 : i32 } + spv.func @convert_f64_to_u32(%arg0 : f64) -> i32 "None" { + // CHECK: {{%.*}} = spv.ConvertFToU {{%.*}} : f64 to i32 + %0 = spv.ConvertFToU %arg0 : f64 to i32 + spv.ReturnValue %0 : i32 + } spv.func @convert_s_to_f(%arg0 : i32) -> f32 "None" { // CHECK: {{%.*}} = spv.ConvertSToF {{%.*}} : i32 to f32 %0 = spv.ConvertSToF %arg0 : i32 to f32 spv.ReturnValue %0 : f32 } + spv.func @convert_s64_to_f32(%arg0 : i64) -> f32 "None" { + // CHECK: {{%.*}} = spv.ConvertSToF {{%.*}} : i64 to f32 + %0 = spv.ConvertSToF %arg0 : i64 to f32 + spv.ReturnValue %0 : f32 + } spv.func @convert_u_to_f(%arg0 : i32) -> f32 "None" { // CHECK: {{%.*}} = spv.ConvertUToF {{%.*}} : i32 to f32 %0 = spv.ConvertUToF %arg0 : i32 to f32 spv.ReturnValue %0 : f32 } + spv.func @convert_u64_to_f32(%arg0 : i64) -> f32 
"None" { + // CHECK: {{%.*}} = spv.ConvertUToF {{%.*}} : i64 to f32 + %0 = spv.ConvertUToF %arg0 : i64 to f32 + spv.ReturnValue %0 : f32 + } spv.func @f_convert(%arg0 : f32) -> f64 "None" { // CHECK: {{%.*}} = spv.FConvert {{%.*}} : f32 to f64 %0 = spv.FConvert %arg0 : f32 to f64 diff --git a/mlir/test/Dialect/SPIRV/Serialization/non-uniform-ops.mlir b/mlir/test/Dialect/SPIRV/Serialization/non-uniform-ops.mlir index ab714dfbaa008..f7b8f6cfc1858 100644 --- a/mlir/test/Dialect/SPIRV/Serialization/non-uniform-ops.mlir +++ b/mlir/test/Dialect/SPIRV/Serialization/non-uniform-ops.mlir @@ -8,6 +8,14 @@ spv.module Logical GLSL450 requires #spv.vce { spv.ReturnValue %0: vector<4xi32> } + // CHECK-LABEL: @group_non_uniform_broadcast + spv.func @group_non_uniform_broadcast(%value: f32) -> f32 "None" { + %one = spv.constant 1 : i32 + // CHECK: spv.GroupNonUniformBroadcast "Subgroup" %{{.*}}, %{{.*}} : f32, i32 + %0 = spv.GroupNonUniformBroadcast "Subgroup" %value, %one : f32, i32 + spv.ReturnValue %0: f32 + } + // CHECK-LABEL: @group_non_uniform_elect spv.func @group_non_uniform_elect() -> i1 "None" { // CHECK: %{{.+}} = spv.GroupNonUniformElect "Workgroup" : i1 diff --git a/mlir/test/Dialect/SPIRV/non-uniform-ops.mlir b/mlir/test/Dialect/SPIRV/non-uniform-ops.mlir index 86c3c2886a4fe..5839ee7c56276 100644 --- a/mlir/test/Dialect/SPIRV/non-uniform-ops.mlir +++ b/mlir/test/Dialect/SPIRV/non-uniform-ops.mlir @@ -28,6 +28,45 @@ func @group_non_uniform_ballot(%predicate: i1) -> vector<4xsi32> { // ----- +//===----------------------------------------------------------------------===// +// spv.NonUniformGroupBroadcast +//===----------------------------------------------------------------------===// + +func @group_non_uniform_broadcast_scalar(%value: f32) -> f32 { + %one = spv.constant 1 : i32 + // CHECK: spv.GroupNonUniformBroadcast "Workgroup" %{{.*}}, %{{.*}} : f32, i32 + %0 = spv.GroupNonUniformBroadcast "Workgroup" %value, %one : f32, i32 + return %0: f32 +} + +// ----- + +func @group_non_uniform_broadcast_vector(%value: vector<4xf32>) -> vector<4xf32> { + %one = spv.constant 1 : i32 + // CHECK: spv.GroupNonUniformBroadcast "Subgroup" %{{.*}}, %{{.*}} : vector<4xf32>, i32 + %0 = spv.GroupNonUniformBroadcast "Subgroup" %value, %one : vector<4xf32>, i32 + return %0: vector<4xf32> +} + +// ----- + +func @group_non_uniform_broadcast_negative_scope(%value: f32, %localid: i32 ) -> f32 { + %one = spv.constant 1 : i32 + // expected-error @+1 {{execution scope must be 'Workgroup' or 'Subgroup'}} + %0 = spv.GroupNonUniformBroadcast "Device" %value, %one : f32, i32 + return %0: f32 +} + +// ----- + +func @group_non_uniform_broadcast_negative_non_const(%value: f32, %localid: i32) -> f32 { + // expected-error @+1 {{id must be the result of a constant op}} + %0 = spv.GroupNonUniformBroadcast "Subgroup" %value, %localid : f32, i32 + return %0: f32 +} + +// ----- + //===----------------------------------------------------------------------===// // spv.GroupNonUniformElect //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/SPIRV/ops.mlir b/mlir/test/Dialect/SPIRV/ops.mlir index c91a81fe239c4..fe845ae572fa3 100644 --- a/mlir/test/Dialect/SPIRV/ops.mlir +++ b/mlir/test/Dialect/SPIRV/ops.mlir @@ -335,6 +335,22 @@ func @convert_f_to_s_scalar(%arg0 : f32) -> i32 { // ----- +func @convert_f64_to_s32_scalar(%arg0 : f64) -> i32 { + // CHECK: {{%.*}} = spv.ConvertFToS {{%.*}} : f64 to i32 + %0 = spv.ConvertFToS %arg0 : f64 to i32 + spv.ReturnValue %0 : i32 +} + +// 
----- + +func @convert_f_to_s_vector(%arg0 : vector<3xf32>) -> vector<3xi32> { + // CHECK: {{%.*}} = spv.ConvertFToS {{%.*}} : vector<3xf32> to vector<3xi32> + %0 = spv.ConvertFToS %arg0 : vector<3xf32> to vector<3xi32> + spv.ReturnValue %0 : vector<3xi32> +} + +// ----- + //===----------------------------------------------------------------------===// // spv.ConvertFToU //===----------------------------------------------------------------------===// @@ -347,6 +363,14 @@ func @convert_f_to_u_scalar(%arg0 : f32) -> i32 { // ----- +func @convert_f64_to_u32_scalar(%arg0 : f64) -> i32 { + // CHECK: {{%.*}} = spv.ConvertFToU {{%.*}} : f64 to i32 + %0 = spv.ConvertFToU %arg0 : f64 to i32 + spv.ReturnValue %0 : i32 +} + +// ----- + func @convert_f_to_u_vector(%arg0 : vector<3xf32>) -> vector<3xi32> { // CHECK: {{%.*}} = spv.ConvertFToU {{%.*}} : vector<3xf32> to vector<3xi32> %0 = spv.ConvertFToU %arg0 : vector<3xf32> to vector<3xi32> @@ -363,14 +387,6 @@ func @convert_f_to_u_coopmatrix(%arg0 : !spv.coopmatrix<8x16xf32, Subgroup>) { // ----- -func @convert_f_to_u_scalar_invalid(%arg0 : f16) -> i32 { - // expected-error @+1 {{expected the same bit widths for operand type and result type, but provided 'f16' and 'i32'}} - %0 = spv.ConvertFToU %arg0 : f16 to i32 - spv.ReturnValue %0 : i32 -} - -// ----- - //===----------------------------------------------------------------------===// // spv.ConvertSToF //===----------------------------------------------------------------------===// @@ -383,6 +399,22 @@ func @convert_s_to_f_scalar(%arg0 : i32) -> f32 { // ----- +func @convert_s64_to_f32_scalar(%arg0 : i64) -> f32 { + // CHECK: {{%.*}} = spv.ConvertSToF {{%.*}} : i64 to f32 + %0 = spv.ConvertSToF %arg0 : i64 to f32 + spv.ReturnValue %0 : f32 +} + +// ----- + +func @convert_s_to_f_vector(%arg0 : vector<3xi32>) -> vector<3xf32> { + // CHECK: {{%.*}} = spv.ConvertSToF {{%.*}} : vector<3xi32> to vector<3xf32> + %0 = spv.ConvertSToF %arg0 : vector<3xi32> to vector<3xf32> + spv.ReturnValue %0 : vector<3xf32> +} + +// ----- + //===----------------------------------------------------------------------===// // spv.ConvertUToF //===----------------------------------------------------------------------===// @@ -395,6 +427,22 @@ func @convert_u_to_f_scalar(%arg0 : i32) -> f32 { // ----- +func @convert_u64_to_f32_scalar(%arg0 : i64) -> f32 { + // CHECK: {{%.*}} = spv.ConvertUToF {{%.*}} : i64 to f32 + %0 = spv.ConvertUToF %arg0 : i64 to f32 + spv.ReturnValue %0 : f32 +} + +// ----- + +func @convert_u_to_f_vector(%arg0 : vector<3xi32>) -> vector<3xf32> { + // CHECK: {{%.*}} = spv.ConvertUToF {{%.*}} : vector<3xi32> to vector<3xf32> + %0 = spv.ConvertUToF %arg0 : vector<3xi32> to vector<3xf32> + spv.ReturnValue %0 : vector<3xf32> +} + +// ----- + //===----------------------------------------------------------------------===// // spv.FConvert //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Shape/canonicalize.mlir b/mlir/test/Dialect/Shape/canonicalize.mlir index 670d207a5f474..9c45f254ba6d6 100644 --- a/mlir/test/Dialect/Shape/canonicalize.mlir +++ b/mlir/test/Dialect/Shape/canonicalize.mlir @@ -386,7 +386,31 @@ func @f(%arg0: !shape.shape, %arg1: !shape.shape) { } // ----- +// cstr_require with constant can be folded +// CHECK-LABEL: func @cstr_require_fold +func @cstr_require_fold() { + // CHECK-NEXT: shape.const_witness true + // CHECK-NEXT: consume.witness + // CHECK-NEXT: return + %true = constant true + %0 = shape.cstr_require %true + 
"consume.witness"(%0) : (!shape.witness) -> () + return +} + +// ----- +// cstr_require without constant cannot be folded +// CHECK-LABEL: func @cstr_require_no_fold +func @cstr_require_no_fold(%arg0: i1) { + // CHECK-NEXT: shape.cstr_require + // CHECK-NEXT: consume.witness + // CHECK-NEXT: return + %0 = shape.cstr_require %arg0 + "consume.witness"(%0) : (!shape.witness) -> () + return +} +// ----- // assuming_all with known passing witnesses can be folded // CHECK-LABEL: func @f func @f() { diff --git a/mlir/test/Dialect/Shape/ops.mlir b/mlir/test/Dialect/Shape/ops.mlir index 58f2a61841e22..1a431d2dbd2f3 100644 --- a/mlir/test/Dialect/Shape/ops.mlir +++ b/mlir/test/Dialect/Shape/ops.mlir @@ -100,12 +100,14 @@ func @test_shape_of(%arg0: tensor) -> tensor { func @test_constraints() { %0 = shape.const_shape [] : !shape.shape %1 = shape.const_shape [1, 2, 3] : !shape.shape + %true = constant true %w0 = shape.cstr_broadcastable %0, %1 : !shape.shape, !shape.shape %w1 = shape.cstr_eq %0, %1 %w2 = shape.const_witness true %w3 = shape.const_witness false - %w4 = shape.assuming_all %w0, %w1, %w2, %w3 - shape.assuming %w4 -> !shape.shape { + %w4 = shape.cstr_require %true + %w_all = shape.assuming_all %w0, %w1, %w2, %w3, %w4 + shape.assuming %w_all -> !shape.shape { %2 = "shape.any"(%0, %1) : (!shape.shape, !shape.shape) -> !shape.shape shape.assuming_yield %2 : !shape.shape } diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index 1b1362f948841..9c36f7684baf9 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -385,3 +385,28 @@ func @bitcast_folding(%I1: vector<4x8xf32>, %I2: vector<2xi32>) -> (vector<4x8xf %2 = vector.bitcast %1 : vector<4xi16> to vector<2xi32> return %0, %2 : vector<4x8xf32>, vector<2xi32> } + +// ----- + +// CHECK-LABEL: broadcast_folding1 +// CHECK: %[[CST:.*]] = constant dense<42> : vector<4xi32> +// CHECK-NOT: vector.broadcast +// CHECK: return %[[CST]] +func @broadcast_folding1() -> vector<4xi32> { + %0 = constant 42 : i32 + %1 = vector.broadcast %0 : i32 to vector<4xi32> + return %1 : vector<4xi32> +} + +// ----- + +// CHECK-LABEL: @broadcast_folding2 +// CHECK: %[[CST:.*]] = constant dense<42> : vector<4x16xi32> +// CHECK-NOT: vector.broadcast +// CHECK: return %[[CST]] +func @broadcast_folding2() -> vector<4x16xi32> { + %0 = constant 42 : i32 + %1 = vector.broadcast %0 : i32 to vector<16xi32> + %2 = vector.broadcast %1 : vector<16xi32> to vector<4x16xi32> + return %2 : vector<4x16xi32> +} diff --git a/mlir/test/EDSC/builder-api-test.cpp b/mlir/test/EDSC/builder-api-test.cpp index 4695090dacb52..ec22dd04dc4ab 100644 --- a/mlir/test/EDSC/builder-api-test.cpp +++ b/mlir/test/EDSC/builder-api-test.cpp @@ -177,6 +177,38 @@ TEST_FUNC(builder_max_min_for) { f.erase(); } +TEST_FUNC(builder_affine_for_iter_args) { + auto indexType = IndexType::get(&globalContext()); + auto f = makeFunction("builder_affine_for_iter_args", {}, + {indexType, indexType, indexType}); + + OpBuilder builder(f.getBody()); + ScopedContext scope(builder, f.getLoc()); + Value i, lb_1(f.getArgument(0)), ub_1(f.getArgument(1)), + ub_2(f.getArgument(2)); + Value c32(std_constant_int(32, 32)); + Value c42(std_constant_int(42, 32)); + using namespace edsc::op; + affineLoopBuilder( + lb_1, {ub_1, ub_2}, 2, {c32, c42}, [&](Value iv, ValueRange args) { + Value sum(args[0] + args[1]); + builder.create(f.getLoc(), ValueRange({args[1], sum})); + }); + + // clang-format off + // CHECK-LABEL: func 
@builder_affine_for_iter_args + // CHECK: (%[[lb_1:.*]]: index, %[[ub_1:.*]]: index, %[[ub_2:.*]]: index) { + // CHECK-NEXT: %[[c32:.*]] = constant 32 : i32 + // CHECK-NEXT: %[[c42:.*]] = constant 42 : i32 + // CHECK-NEXT: %{{.*}} = affine.for %{{.*}} = affine_map<(d0) -> (d0)>(%{{.*}}) to min affine_map<(d0, d1) -> (d0, d1)>(%[[ub_1]], %[[ub_2]]) step 2 iter_args(%[[iarg_1:.*]] = %[[c32]], %[[iarg_2:.*]] = %[[c42]]) -> (i32, i32) { + // CHECK-NEXT: %[[sum:.*]] = addi %[[iarg_1]], %[[iarg_2]] : i32 + // CHECK-NEXT: affine.yield %[[iarg_2]], %[[sum]] : i32, i32 + // CHECK-NEXT: } + // clang-format on + f.print(llvm::outs()); + f.erase(); +} + TEST_FUNC(builder_block_append) { using namespace edsc::op; auto f = makeFunction("builder_blocks"); diff --git a/mlir/test/IR/core-ops.mlir b/mlir/test/IR/core-ops.mlir index 69e974bc41734..f182936c87032 100644 --- a/mlir/test/IR/core-ops.mlir +++ b/mlir/test/IR/core-ops.mlir @@ -661,17 +661,20 @@ func @extract_element(%arg0: tensor<*xi32>, %arg1 : tensor<4x4xf32>) -> i32 { // CHECK-LABEL: func @tensor_from_elements() { func @tensor_from_elements() { %c0 = "std.constant"() {value = 0: index} : () -> index - // CHECK: %0 = tensor_from_elements(%c0) : tensor<1xindex> - %0 = tensor_from_elements(%c0) : tensor<1xindex> + // CHECK: %0 = tensor_from_elements %c0 : tensor<1xindex> + %0 = tensor_from_elements %c0 : tensor<1xindex> %c1 = "std.constant"() {value = 1: index} : () -> index - // CHECK: %1 = tensor_from_elements(%c0, %c1) : tensor<2xindex> - %1 = tensor_from_elements(%c0, %c1) : tensor<2xindex> + // CHECK: %1 = tensor_from_elements %c0, %c1 : tensor<2xindex> + %1 = tensor_from_elements %c0, %c1 : tensor<2xindex> %c0_f32 = "std.constant"() {value = 0.0: f32} : () -> f32 // CHECK: [[C0_F32:%.*]] = constant - // CHECK: %2 = tensor_from_elements([[C0_F32]]) : tensor<1xf32> - %2 = tensor_from_elements(%c0_f32) : tensor<1xf32> + // CHECK: %2 = tensor_from_elements [[C0_F32]] : tensor<1xf32> + %2 = tensor_from_elements %c0_f32 : tensor<1xf32> + + // CHECK: tensor_from_elements : tensor<0xindex> + %3 = tensor_from_elements : tensor<0xindex> return } diff --git a/mlir/test/IR/invalid-ops.mlir b/mlir/test/IR/invalid-ops.mlir index 55739119aa26d..e02dbca494df6 100644 --- a/mlir/test/IR/invalid-ops.mlir +++ b/mlir/test/IR/invalid-ops.mlir @@ -595,18 +595,18 @@ func @extract_element_tensor_too_few_indices(%t : tensor<2x3xf32>, %i : index) { // ----- func @tensor_from_elements_wrong_result_type() { - // expected-error@+2 {{expected result type to be a ranked tensor}} + // expected-error@+2 {{'result' must be 1D tensor of any type values, but got 'tensor<*xi32>'}} %c0 = constant 0 : i32 - %0 = tensor_from_elements(%c0) : tensor<*xi32> + %0 = tensor_from_elements %c0 : tensor<*xi32> return } // ----- func @tensor_from_elements_wrong_elements_count() { - // expected-error@+2 {{expected result type to be a 1D tensor with 1 element}} + // expected-error@+2 {{1 operands present, but expected 2}} %c0 = constant 0 : index - %0 = tensor_from_elements(%c0) : tensor<2xindex> + %0 = tensor_from_elements %c0 : tensor<2xindex> return } diff --git a/mlir/test/IR/print-ir-defuse.mlir b/mlir/test/IR/print-ir-defuse.mlir new file mode 100644 index 0000000000000..78c5804119250 --- /dev/null +++ b/mlir/test/IR/print-ir-defuse.mlir @@ -0,0 +1,31 @@ +// RUN: mlir-opt -test-print-defuse -allow-unregistered-dialect %s | FileCheck %s + +// CHECK: Visiting op 'dialect.op1' with 0 operands: +// CHECK: Has 4 results: +// CHECK: - Result 0 has a single use: - dialect.op2 +// CHECK: - Result 
1 has no uses +// CHECK: - Result 2 has 2 uses: +// CHECK: - dialect.innerop1 +// CHECK: - dialect.op2 +// CHECK: - Result 3 has no uses +// CHECK: Visiting op 'dialect.op2' with 2 operands: +// CHECK: - Operand produced by operation 'dialect.op1' +// CHECK: - Operand produced by operation 'dialect.op1' +// CHECK: Has 0 results: +// CHECK: Visiting op 'dialect.innerop1' with 2 operands: +// CHECK: - Operand produced by Block argument, number 0 +// CHECK: - Operand produced by operation 'dialect.op1' +// CHECK: Has 0 results: +// CHECK: Visiting op 'dialect.op3' with 0 operands: +// CHECK: Has 0 results: +// CHECK: Visiting op 'module_terminator' with 0 operands: +// CHECK: Has 0 results: +// CHECK: Visiting op 'module' with 0 operands: +// CHECK: Has 0 results: + +%results:4 = "dialect.op1"() : () -> (i1, i16, i32, i64) +"dialect.op2"(%results#0, %results#2) : (i1, i32) -> () +"dialect.op3"() ({ + ^bb0(%arg0 : i1): + "dialect.innerop1"(%arg0, %results#2) : (i1, i32) -> () +}) : () -> () diff --git a/mlir/test/IR/print-ir-nesting.mlir b/mlir/test/IR/print-ir-nesting.mlir new file mode 100644 index 0000000000000..4682753947550 --- /dev/null +++ b/mlir/test/IR/print-ir-nesting.mlir @@ -0,0 +1,57 @@ +// RUN: mlir-opt -test-print-nesting -allow-unregistered-dialect %s | FileCheck %s + +// CHECK: visiting op: 'module' with 0 operands and 0 results +// CHECK: 1 nested regions: +// CHECK: Region with 1 blocks: +// CHECK: Block with 0 arguments, 0 successors, and 3 operations +module { + + +// CHECK: visiting op: 'dialect.op1' with 0 operands and 4 results +// CHECK: 1 attributes: +// CHECK: - 'attribute name' : '42 : i32' +// CHECK: 0 nested regions: + %results:4 = "dialect.op1"() { "attribute name" = 42 : i32 } : () -> (i1, i16, i32, i64) + + +// CHECK: visiting op: 'dialect.op2' with 0 operands and 0 results +// CHECK: 2 nested regions: + "dialect.op2"() ({ + +// CHECK: Region with 1 blocks: +// CHECK: Block with 0 arguments, 0 successors, and 1 operations +// CHECK: visiting op: 'dialect.innerop1' with 2 operands and 0 results +// CHECK: 0 nested regions: + "dialect.innerop1"(%results#0, %results#1) : (i1, i16) -> () + +// CHECK: Region with 3 blocks: + },{ + +// CHECK: Block with 0 arguments, 2 successors, and 2 operations +// CHECK: visiting op: 'dialect.innerop2' with 0 operands and 0 results +// CHECK: 0 nested regions: + "dialect.innerop2"() : () -> () +// CHECK: visiting op: 'dialect.innerop3' with 3 operands and 0 results +// CHECK: 0 nested regions: + "dialect.innerop3"(%results#0, %results#2, %results#3)[^bb1, ^bb2] : (i1, i32, i64) -> () +// CHECK: Block with 1 arguments, 0 successors, and 2 operations + ^bb1(%arg1 : i32): +// CHECK: visiting op: 'dialect.innerop4' with 0 operands and 0 results +// CHECK: 0 nested regions: + "dialect.innerop4"() : () -> () +// CHECK: visiting op: 'dialect.innerop5' with 0 operands and 0 results +// CHECK: 0 nested regions: + "dialect.innerop5"() : () -> () +// CHECK: Block with 1 arguments, 0 successors, and 2 operations + ^bb2(%arg2 : i64): +// CHECK: visiting op: 'dialect.innerop6' with 0 operands and 0 results +// CHECK: 0 nested regions: + "dialect.innerop6"() : () -> () +// CHECK: visiting op: 'dialect.innerop7' with 0 operands and 0 results +// CHECK: 0 nested regions: + "dialect.innerop7"() : () -> () + }) : () -> () + +// CHECK: visiting op: 'module_terminator' with 0 operands and 0 results + +} // module diff --git a/mlir/test/IR/slice.mlir b/mlir/test/IR/slice.mlir new file mode 100644 index 0000000000000..731f3872f67dd --- /dev/null +++ 
b/mlir/test/IR/slice.mlir
@@ -0,0 +1,33 @@
+// RUN: mlir-opt -slice-analysis-test %s | FileCheck %s
+
+func @slicing_linalg_op(%arg0 : index, %arg1 : index, %arg2 : index) {
+  %a = alloc(%arg0, %arg2) : memref<?x?xf32>
+  %b = alloc(%arg2, %arg1) : memref<?x?xf32>
+  %c = alloc(%arg0, %arg1) : memref<?x?xf32>
+  %d = alloc(%arg0, %arg1) : memref<?x?xf32>
+  linalg.matmul %a, %b, %c : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>)
+  linalg.matmul %a, %b, %d : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>)
+  dealloc %c : memref<?x?xf32>
+  dealloc %b : memref<?x?xf32>
+  dealloc %a : memref<?x?xf32>
+  dealloc %d : memref<?x?xf32>
+  return
+}
+
+// CHECK-LABEL: func @slicing_linalg_op__backward_slice__0
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: index
+// CHECK-DAG: %[[A:.+]] = alloc(%[[ARG0]], %[[ARG2]]) : memref<?x?xf32>
+// CHECK-DAG: %[[B:.+]] = alloc(%[[ARG2]], %[[ARG1]]) : memref<?x?xf32>
+// CHECK-DAG: %[[C:.+]] = alloc(%[[ARG0]], %[[ARG1]]) : memref<?x?xf32>
+// CHECK: return
+
+// CHECK-LABEL: func @slicing_linalg_op__backward_slice__1
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: index
+// CHECK-DAG: %[[A:.+]] = alloc(%[[ARG0]], %[[ARG2]]) : memref<?x?xf32>
+// CHECK-DAG: %[[B:.+]] = alloc(%[[ARG2]], %[[ARG1]]) : memref<?x?xf32>
+// CHECK-DAG: %[[C:.+]] = alloc(%[[ARG0]], %[[ARG1]]) : memref<?x?xf32>
+// CHECK: return
diff --git a/mlir/test/Transforms/buffer-placement.mlir b/mlir/test/Transforms/buffer-placement.mlir
index e1ed2c4309c3d..dc9ff44bf4838 100644
--- a/mlir/test/Transforms/buffer-placement.mlir
+++ b/mlir/test/Transforms/buffer-placement.mlir
@@ -1125,3 +1125,295 @@ func @nestedRegionControlFlowAlloca(
 // CHECK: %[[ALLOCA:.*]] = alloca(%arg0, %arg1)
 // CHECK-NEXT: scf.yield %[[ALLOC0]]
 // CHECK: return %[[ALLOC1]]
+
+// -----
+
+// Test Case: structured control-flow loop using a nested alloc.
+// The alloc position of %3 will not be changed, but the iteration argument
+// %iterBuf has to be freed before yielding %3 to avoid memory leaks.
+
+// CHECK-LABEL: func @loop_alloc
+func @loop_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = alloc() : memref<2xf32>
+    scf.yield %3 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: dealloc %[[ALLOC0]]
+// CHECK-NEXT: %[[ALLOC1:.*]] = alloc()
+// CHECK: linalg.copy(%arg3, %[[ALLOC1]])
+// CHECK: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args(%[[IALLOC:.*]] = %[[ALLOC1]]
+// CHECK: cmpi
+// CHECK: dealloc %[[IALLOC]]
+// CHECK: %[[ALLOC3:.*]] = alloc()
+// CHECK: %[[ALLOC4:.*]] = alloc()
+// CHECK: linalg.copy(%[[ALLOC3]], %[[ALLOC4]])
+// CHECK: dealloc %[[ALLOC3]]
+// CHECK: scf.yield %[[ALLOC4]]
+// CHECK: }
+// CHECK: linalg.copy(%[[ALLOC2]], %arg4)
+// CHECK-NEXT: dealloc %[[ALLOC2]]
+
+// -----
+
+// Test Case: structured control-flow loop with a nested if operation.
+// The loop yields buffers that have been defined outside of the loop and the
+// backedges only use the iteration arguments (or one of their aliases).
+// Therefore, we do not have to (and are not allowed to) free any buffers
+// that are passed via the backedges.
+
+// CHECK-LABEL: func @loop_nested_if_no_alloc
+func @loop_nested_if_no_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = scf.if %2 -> (memref<2xf32>) {
+      scf.yield %0 : memref<2xf32>
+    } else {
+      scf.yield %iterBuf : memref<2xf32>
+    }
+    scf.yield %3 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: %[[ALLOC1:.*]] = scf.for {{.*}} iter_args(%[[IALLOC:.*]] =
+// CHECK: %[[ALLOC2:.*]] = scf.if
+// CHECK: scf.yield %[[ALLOC0]]
+// CHECK: scf.yield %[[IALLOC]]
+// CHECK: scf.yield %[[ALLOC2]]
+// CHECK: linalg.copy(%[[ALLOC1]], %arg4)
+// CHECK: dealloc %[[ALLOC0]]
+
+// -----
+
+// Test Case: structured control-flow loop with a nested if operation using
+// a deeply nested buffer allocation.
+// Since the innermost allocation happens in a divergent branch, we have to
+// introduce additional copies for the nested if operation. Since the loop's
+// yield operation "returns" %3, it will return a newly allocated buffer.
+// Therefore, we have to free the iteration argument %iterBuf before
+// "returning" %3.
+
+// CHECK-LABEL: func @loop_nested_if_alloc
+func @loop_nested_if_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>) -> memref<2xf32> {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = scf.if %2 -> (memref<2xf32>) {
+      %4 = alloc() : memref<2xf32>
+      scf.yield %4 : memref<2xf32>
+    } else {
+      scf.yield %0 : memref<2xf32>
+    }
+    scf.yield %3 : memref<2xf32>
+  }
+  return %1 : memref<2xf32>
+}
+
+// CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK: %[[ALLOC1:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%arg3, %[[ALLOC1]])
+// CHECK-NEXT: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args(%[[IALLOC:.*]] = %[[ALLOC1]]
+// CHECK: dealloc %[[IALLOC]]
+// CHECK: %[[ALLOC3:.*]] = scf.if
+
+// CHECK: %[[ALLOC4:.*]] = alloc()
+// CHECK-NEXT: %[[ALLOC5:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC4]], %[[ALLOC5]])
+// CHECK-NEXT: dealloc %[[ALLOC4]]
+// CHECK-NEXT: scf.yield %[[ALLOC5]]
+
+// CHECK: %[[ALLOC6:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC0]], %[[ALLOC6]])
+// CHECK-NEXT: scf.yield %[[ALLOC6]]
+
+// CHECK: %[[ALLOC7:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC3:.*]], %[[ALLOC7]])
+// CHECK-NEXT: dealloc %[[ALLOC3]]
+// CHECK-NEXT: scf.yield %[[ALLOC7]]
+
+// CHECK: dealloc %[[ALLOC0]]
+// CHECK-NEXT: return %[[ALLOC2]]
+
+// -----
+
+// Test Case: several nested structured control-flow loops with a deeply nested
+// buffer allocation inside an if operation.
+// The behavior is the same as in loop_nested_if_alloc: we have to insert
+// deallocations before each yield in all loops recursively.
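The rewriting idiom these comments describe (copy the buffer that is about to be yielded, free the buffer the loop owns, and yield the fresh copy) reduces to a small local step around each yield. A hedged C++ sketch of that idiom follows; it is illustrative only, not the actual BufferPlacement implementation, and the helper name and standalone setting are assumed:

#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"

using namespace mlir;

// Rewrite `scf.yield %buf` into:
//   %copy = alloc() ; linalg.copy(%buf, %copy) ; dealloc %buf
//   scf.yield %copy
// so every iteration receives a buffer it is allowed to free.
static void copyOutBeforeYield(scf::YieldOp yield, unsigned resultIdx) {
  OpBuilder b(yield);                       // insert right before the yield
  Location loc = yield.getLoc();
  Value buf = yield.getOperand(resultIdx);
  auto type = buf.getType().cast<MemRefType>();
  Value copy = b.create<AllocOp>(loc, type);
  b.create<linalg::CopyOp>(loc, buf, copy); // snapshot the current contents
  b.create<DeallocOp>(loc, buf);            // free the buffer we own here
  yield.getOperation()->setOperand(resultIdx, copy);
}

Keeping the copy/dealloc insertion local to the yield is what lets the transformation compose across arbitrarily nested scf.for/scf.if regions, as the next test exercises.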
+ +// CHECK-LABEL: func @loop_nested_alloc +func @loop_nested_alloc( + %lb: index, + %ub: index, + %step: index, + %buf: memref<2xf32>, + %res: memref<2xf32>) { + %0 = alloc() : memref<2xf32> + %1 = scf.for %i = %lb to %ub step %step + iter_args(%iterBuf = %buf) -> memref<2xf32> { + %2 = scf.for %i2 = %lb to %ub step %step + iter_args(%iterBuf2 = %iterBuf) -> memref<2xf32> { + %3 = scf.for %i3 = %lb to %ub step %step + iter_args(%iterBuf3 = %iterBuf2) -> memref<2xf32> { + %4 = alloc() : memref<2xf32> + %5 = cmpi "eq", %i, %ub : index + %6 = scf.if %5 -> (memref<2xf32>) { + %7 = alloc() : memref<2xf32> + scf.yield %7 : memref<2xf32> + } else { + scf.yield %iterBuf3 : memref<2xf32> + } + scf.yield %6 : memref<2xf32> + } + scf.yield %3 : memref<2xf32> + } + scf.yield %2 : memref<2xf32> + } + "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// CHECK: %[[ALLOC0:.*]] = alloc() +// CHECK-NEXT: dealloc %[[ALLOC0]] +// CHECK-NEXT: %[[ALLOC1:.*]] = alloc() +// CHECK-NEXT: linalg.copy(%arg3, %[[ALLOC1]]) +// CHECK-NEXT: %[[VAL_7:.*]] = scf.for {{.*}} iter_args(%[[IALLOC0:.*]] = %[[ALLOC1]]) +// CHECK: %[[ALLOC2:.*]] = alloc() +// CHECK-NEXT: linalg.copy(%[[IALLOC0]], %[[ALLOC2]]) +// CHECK-NEXT: dealloc %[[IALLOC0]] +// CHECK-NEXT: %[[ALLOC3:.*]] = scf.for {{.*}} iter_args(%[[IALLOC1:.*]] = %[[ALLOC2]]) +// CHECK: %[[ALLOC5:.*]] = alloc() +// CHECK-NEXT: linalg.copy(%[[IALLOC1]], %[[ALLOC5]]) +// CHECK-NEXT: dealloc %[[IALLOC1]] + +// CHECK: %[[ALLOC6:.*]] = scf.for {{.*}} iter_args(%[[IALLOC2:.*]] = %[[ALLOC5]]) +// CHECK: %[[ALLOC8:.*]] = alloc() +// CHECK-NEXT: dealloc %[[ALLOC8]] +// CHECK: %[[ALLOC9:.*]] = scf.if + +// CHECK: %[[ALLOC11:.*]] = alloc() +// CHECK-NEXT: %[[ALLOC12:.*]] = alloc() +// CHECK-NEXT: linalg.copy(%[[ALLOC11]], %[[ALLOC12]]) +// CHECK-NEXT: dealloc %[[ALLOC11]] +// CHECK-NEXT: scf.yield %[[ALLOC12]] + +// CHECK: %[[ALLOC13:.*]] = alloc() +// CHECK-NEXT: linalg.copy(%[[IALLOC2]], %[[ALLOC13]]) +// CHECK-NEXT: scf.yield %[[ALLOC13]] + +// CHECK: dealloc %[[IALLOC2]] +// CHECK-NEXT: %[[ALLOC10:.*]] = alloc() +// CHECK-NEXT: linalg.copy(%[[ALLOC9]], %[[ALLOC10]]) +// CHECK-NEXT: dealloc %[[ALLOC9]] +// CHECK-NEXT: scf.yield %[[ALLOC10]] + +// CHECK: %[[ALLOC7:.*]] = alloc() +// CHECK-NEXT: linalg.copy(%[[ALLOC6]], %[[ALLOC7]]) +// CHECK-NEXT: dealloc %[[ALLOC6]] +// CHECK-NEXT: scf.yield %[[ALLOC7]] + +// CHECK: %[[ALLOC4:.*]] = alloc() +// CHECK-NEXT: linalg.copy(%[[ALLOC3]], %[[ALLOC4]]) +// CHECK-NEXT: dealloc %[[ALLOC3]] +// CHECK-NEXT: scf.yield %[[ALLOC4]] + +// CHECK: linalg.copy(%[[VAL_7]], %arg4) +// CHECK-NEXT: dealloc %[[VAL_7]] + +// ----- + +// Test Case: explicit control-flow loop with a dynamically allocated buffer. +// The BufferPlacement transformation should fail on this explicit +// control-flow loop since they are not supported. 
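Rejecting such functions requires noticing that their blocks form a cycle through br/cond_br rather than a structured scf loop. A minimal sketch of one way to detect that (a hypothetical helper; the actual pass may detect it differently, for example via a loop analysis):

#include "mlir/IR/Block.h"
#include "mlir/IR/Region.h"
#include "llvm/ADT/DenseSet.h"
#include <functional>

using namespace mlir;

// Return true if the region's CFG contains a back edge, i.e. an explicit
// control-flow loop built from branches instead of scf.for/scf.while.
static bool hasUnstructuredLoop(Region &region) {
  llvm::DenseSet<Block *> onStack, visited;
  std::function<bool(Block *)> dfs = [&](Block *block) {
    if (onStack.count(block))
      return true;                   // reached a block already on this path
    if (!visited.insert(block).second)
      return false;                  // fully processed earlier
    onStack.insert(block);
    for (Block *succ : block->getSuccessors())
      if (dfs(succ))
        return true;
    onStack.erase(block);
    return false;
  };
  return !region.empty() && dfs(&region.front());
}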
+ +// CHECK-LABEL: func @loop_dynalloc +func @loop_dynalloc( + %arg0 : i32, + %arg1 : i32, + %arg2: memref, + %arg3: memref) { + %const0 = constant 0 : i32 + br ^loopHeader(%const0, %arg2 : i32, memref) + +^loopHeader(%i : i32, %buff : memref): + %lessThan = cmpi "slt", %i, %arg1 : i32 + cond_br %lessThan, + ^loopBody(%i, %buff : i32, memref), + ^exit(%buff : memref) + +^loopBody(%val : i32, %buff2: memref): + %const1 = constant 1 : i32 + %inc = addi %val, %const1 : i32 + %size = std.index_cast %inc : i32 to index + %alloc1 = alloc(%size) : memref + br ^loopHeader(%inc, %alloc1 : i32, memref) + +^exit(%buff3 : memref): + "linalg.copy"(%buff3, %arg3) : (memref, memref) -> () + return +} + +// expected-error@+1 {{Structured control-flow loops are supported only}} + +// ----- + +// Test Case: explicit control-flow loop with a dynamically allocated buffer. +// The BufferPlacement transformation should fail on this explicit +// control-flow loop since they are not supported. + +// CHECK-LABEL: func @do_loop_alloc +func @do_loop_alloc( + %arg0 : i32, + %arg1 : i32, + %arg2: memref<2xf32>, + %arg3: memref<2xf32>) { + %const0 = constant 0 : i32 + br ^loopBody(%const0, %arg2 : i32, memref<2xf32>) + +^loopBody(%val : i32, %buff2: memref<2xf32>): + %const1 = constant 1 : i32 + %inc = addi %val, %const1 : i32 + %alloc1 = alloc() : memref<2xf32> + br ^loopHeader(%inc, %alloc1 : i32, memref<2xf32>) + +^loopHeader(%i : i32, %buff : memref<2xf32>): + %lessThan = cmpi "slt", %i, %arg1 : i32 + cond_br %lessThan, + ^loopBody(%i, %buff : i32, memref<2xf32>), + ^exit(%buff : memref<2xf32>) + +^exit(%buff3 : memref<2xf32>): + "linalg.copy"(%buff3, %arg3) : (memref<2xf32>, memref<2xf32>) -> () + return +} + +// expected-error@+1 {{Structured control-flow loops are supported only}} diff --git a/mlir/test/Transforms/canonicalize.mlir b/mlir/test/Transforms/canonicalize.mlir index 7333446c6e5d9..3603c473a1fd7 100644 --- a/mlir/test/Transforms/canonicalize.mlir +++ b/mlir/test/Transforms/canonicalize.mlir @@ -981,8 +981,132 @@ func @memref_cast_folding_subview_static(%V: memref<16x16xf32>, %a: index, %b: i func @extract_element_from_tensor_from_elements(%element : index) -> index { // CHECK-SAME: ([[ARG:%.*]]: index) %c0 = constant 0 : index - %tensor = tensor_from_elements(%element) : tensor<1xindex> + %tensor = tensor_from_elements %element : tensor<1xindex> %extracted_element = extract_element %tensor[%c0] : tensor<1xindex> // CHECK: [[ARG]] : index return %extracted_element : index } + +// ----- + +// CHECK-LABEL: func @extract_element_from_dynamic_tensor_from_elements +// CHECK-SAME: %[[IDX:.*]]: index, %[[TENSOR:.*]]: tensor<*xf32> +func @extract_element_from_dynamic_tensor_from_elements(%idx: index, %tensor: tensor<*xf32>) -> index { + %size = rank %tensor : tensor<*xf32> + // CHECK-NEXT: %[[RES:.*]] = dim %[[TENSOR]], %[[IDX]] + %0 = dynamic_tensor_from_elements %size { + ^bb0(%arg0: index): + %1 = dim %tensor, %arg0 : tensor<*xf32> + yield %1 : index + } : tensor + %1 = extract_element %0[%idx] : tensor + // CHECK-NEXT: return %[[RES]] + return %1 : index +} + +// ----- + +// CHECK-LABEL: func @extract_element_from_dynamic_tensor_from_elements_2d +// CHECK-SAME: %[[IDX0:.*]]: index, %[[IDX1:.*]]: index, %[[TENSOR:.*]]: tensor<*xf32> +func @extract_element_from_dynamic_tensor_from_elements_2d(%idx0: index, %idx1: index, %tensor: tensor<*xf32>) -> index { + %size = rank %tensor : tensor<*xf32> + // CHECK-NEXT: %[[DIM0:.*]] = dim %[[TENSOR]], %[[IDX0]] + // CHECK-NEXT: %[[DIM1:.*]] = dim %[[TENSOR]], %[[IDX1]] 
+ // CHECK-NEXT: %[[RES:.*]] = addi %[[DIM0]], %[[DIM1]] + %0 = dynamic_tensor_from_elements %size, %size { + ^bb0(%arg0: index, %arg1: index): + %1 = dim %tensor, %arg0 : tensor<*xf32> + %2 = dim %tensor, %arg1 : tensor<*xf32> + %3 = addi %1, %2 : index + yield %3 : index + } : tensor + %4 = extract_element %0[%idx0, %idx1] : tensor + // CHECK-NEXT: return %[[RES]] + return %4 : index +} + +// ----- + +// CHECK-LABEL: func @extract_element_from_dynamic_tensor_from_elements_sideeffects +// CHECK-SAME: %[[IDX:.*]]: index +func @extract_element_from_dynamic_tensor_from_elements_sideeffects(%idx: index, %tensor: tensor<*xf32>) -> index { + %size = rank %tensor : tensor<*xf32> + %mem = alloc(%size) : memref + // CHECK: %[[DTENSOR:.*]] = dynamic_tensor_from_elements + %0 = dynamic_tensor_from_elements %size { + ^bb0(%arg0: index): + %1 = dim %tensor, %arg0 : tensor<*xf32> + store %1, %mem[%arg0] : memref + yield %1 : index + } : tensor + // CHECK: %[[RES:.*]] = extract_element %[[DTENSOR]][%[[IDX]]] + %1 = extract_element %0[%idx] : tensor + // CHECK-NEXT: return %[[RES]] + return %1 : index +} + +// ----- + +// CHECK-LABEL: @static_dynamic_tensor_from_elements +// CHECK-SAME: %[[SIZE1:.*]]: index, %[[SIZE4:.*]]: index) +func @static_dynamic_tensor_from_elements(%size1: index, %size4: index) -> tensor<3x?x?x7x?xindex> { + %c5 = constant 5 : index + // CHECK: dynamic_tensor_from_elements %[[SIZE1]], %[[SIZE4]] + %0 = dynamic_tensor_from_elements %size1, %c5, %size4 { + ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index): + %1 = constant 32 : index + yield %1 : index + // CHECK: : tensor<3x?x5x7x?xindex> + } : tensor<3x?x?x7x?xindex> + // CHECK: tensor_cast %{{.*}} : tensor<3x?x5x7x?xindex> to tensor<3x?x?x7x?xindex> + return %0 : tensor<3x?x?x7x?xindex> +} + +// ----- + +// CHECK-LABEL: @tensor_cast_chain_ok +// CHECK-SAME: %[[IN:.*]]: tensor<*xi32> +func @tensor_cast_chain_ok(%input: tensor<*xi32>) -> tensor<4x8xi32> { + // CHECK-NEXT: %[[RES:.*]] = tensor_cast %[[IN]] : tensor<*xi32> to tensor<4x8xi32> + %0 = tensor_cast %input : tensor<*xi32> to tensor<4x?xi32> + %1 = tensor_cast %0 : tensor<4x?xi32> to tensor<4x8xi32> + // CHECK-NEXT: return %[[RES]] + return %1 : tensor<4x8xi32> +} + +// ----- + +// CHECK-LABEL: @tensor_cast_chain_regain +// CHECK-SAME: %[[IN:.*]]: tensor<4xi32> +func @tensor_cast_chain_regain(%input: tensor<4xi32>) -> tensor<4xi32> { + %0 = tensor_cast %input : tensor<4xi32> to tensor + %1 = tensor_cast %0 : tensor to tensor<4xi32> + // CHECK-NEXT: return %[[IN]] + return %1 : tensor<4xi32> +} + +// ----- + +// CHECK-LABEL: @tensor_cast_chain_keep +// CHECK-SAME: %[[IN:.*]]: tensor +func @tensor_cast_chain_keep(%input: tensor) -> tensor { + // CHECK-NEXT: %[[C1:.*]] = tensor_cast %[[IN]] + %0 = tensor_cast %input : tensor to tensor<4x?xi32> + // CHECK-NEXT: %[[C2:.*]] = tensor_cast %[[C1]] + %1 = tensor_cast %0 : tensor<4x?xi32> to tensor + // CHECK-NEXT: return %[[C2]] + return %1 : tensor +} + +// ----- + +// CHECK-LABEL: @tensor_cast_chain_invalid +// CHECK-SAME: %[[IN:.*]]: tensor<4x8xi32> +func @tensor_cast_chain_invalid(%input: tensor<4x8xi32>) -> tensor<8x4xi32> { + // CHECK-NEXT: %[[C1:.*]] = tensor_cast %[[IN]] + %0 = tensor_cast %input : tensor<4x8xi32> to tensor + // CHECK-NEXT: %[[C2:.*]] = tensor_cast %[[C1]] + %1 = tensor_cast %0 : tensor to tensor<8x4xi32> + // CHECK-NEXT: return %[[C2]] + return %1 : tensor<8x4xi32> +} diff --git a/mlir/test/Transforms/copy-removal.mlir b/mlir/test/Transforms/copy-removal.mlir index 
f750dabb18a04..a0d1193b77d58 100644 --- a/mlir/test/Transforms/copy-removal.mlir +++ b/mlir/test/Transforms/copy-removal.mlir @@ -283,3 +283,67 @@ func @test_ReuseCopyTargetAsSource(%arg0: memref<2xf32>){ dealloc %temp : memref<2xf32> return } + +// ----- + +// The only redundant copy is linalg.copy(%4, %5) + +// CHECK-LABEL: func @loop_alloc +func @loop_alloc(%arg0: index, %arg1: index, %arg2: index, %arg3: memref<2xf32>, %arg4: memref<2xf32>) { + // CHECK: %{{.*}} = alloc() + %0 = alloc() : memref<2xf32> + dealloc %0 : memref<2xf32> + // CHECK: %{{.*}} = alloc() + %1 = alloc() : memref<2xf32> + // CHECK: linalg.copy + linalg.copy(%arg3, %1) : memref<2xf32>, memref<2xf32> + %2 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args(%arg6 = %1) -> (memref<2xf32>) { + %3 = cmpi "eq", %arg5, %arg1 : index + // CHECK: dealloc + dealloc %arg6 : memref<2xf32> + // CHECK: %[[PERCENT4:.*]] = alloc() + %4 = alloc() : memref<2xf32> + // CHECK-NOT: alloc + // CHECK-NOT: linalg.copy + // CHECK-NOT: dealloc + %5 = alloc() : memref<2xf32> + linalg.copy(%4, %5) : memref<2xf32>, memref<2xf32> + dealloc %4 : memref<2xf32> + // CHECK: %[[PERCENT6:.*]] = alloc() + %6 = alloc() : memref<2xf32> + // CHECK: linalg.copy(%[[PERCENT4]], %[[PERCENT6]]) + linalg.copy(%5, %6) : memref<2xf32>, memref<2xf32> + scf.yield %6 : memref<2xf32> + } + // CHECK: linalg.copy + linalg.copy(%2, %arg4) : memref<2xf32>, memref<2xf32> + dealloc %2 : memref<2xf32> + return +} + +// ----- + +// The linalg.copy operation can be removed in addition to alloc and dealloc +// operations. All uses of %0 is then replaced with %arg2. + +// CHECK-LABEL: func @check_with_affine_dialect +func @check_with_affine_dialect(%arg0: memref<4xf32>, %arg1: memref<4xf32>, %arg2: memref<4xf32>) { + // CHECK-SAME: (%[[ARG0:.*]]: memref<4xf32>, %[[ARG1:.*]]: memref<4xf32>, %[[RES:.*]]: memref<4xf32>) + // CHECK-NOT: alloc + %0 = alloc() : memref<4xf32> + affine.for %arg3 = 0 to 4 { + %5 = affine.load %arg0[%arg3] : memref<4xf32> + %6 = affine.load %arg1[%arg3] : memref<4xf32> + %7 = cmpf "ogt", %5, %6 : f32 + // CHECK: %[[SELECT_RES:.*]] = select + %8 = select %7, %5, %6 : f32 + // CHECK-NEXT: affine.store %[[SELECT_RES]], %[[RES]] + affine.store %8, %0[%arg3] : memref<4xf32> + } + // CHECK-NOT: linalg.copy + // CHECK-NOT: dealloc + "linalg.copy"(%0, %arg2) : (memref<4xf32>, memref<4xf32>) -> () + dealloc %0 : memref<4xf32> + //CHECK: return + return +} diff --git a/mlir/test/lib/Dialect/Test/TestDialect.h b/mlir/test/lib/Dialect/Test/TestDialect.h index 34fc1a9534e8d..09f84d1ac1339 100644 --- a/mlir/test/lib/Dialect/Test/TestDialect.h +++ b/mlir/test/lib/Dialect/Test/TestDialect.h @@ -29,7 +29,6 @@ #include "TestOpEnums.h.inc" -namespace mlir { #include "TestOpStructs.h.inc" #include "TestOpsDialect.h.inc" @@ -37,8 +36,8 @@ namespace mlir { #define GET_OP_CLASSES #include "TestOps.h.inc" +namespace mlir { void registerTestDialect(DialectRegistry ®istry); - } // end namespace mlir #endif // MLIR_TESTDIALECT_H diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index f03c953396a4a..9ae36ed1710c0 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -22,7 +22,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" def Test_Dialect : Dialect { let name = "test"; - let cppNamespace = ""; + let cppNamespace = "::mlir"; let hasOperationAttrVerify = 1; let hasRegionArgAttrVerify = 1; let hasRegionResultAttrVerify = 1; diff --git a/mlir/test/lib/IR/CMakeLists.txt 
b/mlir/test/lib/IR/CMakeLists.txt
index f77b26e5ca184..a42f90bb92689 100644
--- a/mlir/test/lib/IR/CMakeLists.txt
+++ b/mlir/test/lib/IR/CMakeLists.txt
@@ -3,7 +3,10 @@ add_mlir_library(MLIRTestIR
   TestFunc.cpp
   TestInterfaces.cpp
   TestMatchers.cpp
+  TestPrintDefUse.cpp
+  TestPrintNesting.cpp
   TestSideEffects.cpp
+  TestSlicing.cpp
   TestSymbolUses.cpp
   TestTypes.cpp
diff --git a/mlir/test/lib/IR/TestPrintDefUse.cpp b/mlir/test/lib/IR/TestPrintDefUse.cpp
new file mode 100644
index 0000000000000..3153a148477a9
--- /dev/null
+++ b/mlir/test/lib/IR/TestPrintDefUse.cpp
@@ -0,0 +1,71 @@
+//===- TestPrintDefUse.cpp - Passes to illustrate the IR def-use chains ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/IR/Function.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+
+namespace {
+/// This pass illustrates the IR def-use chains through printing.
+struct TestPrintDefUsePass
+    : public PassWrapper<TestPrintDefUsePass, OperationPass<>> {
+  void runOnOperation() override {
+    // Recursively traverse the IR nested under the current operation and print
+    // every operation together with its operands and users.
+    getOperation()->walk([](Operation *op) {
+      llvm::outs() << "Visiting op '" << op->getName() << "' with "
+                   << op->getNumOperands() << " operands:\n";
+
+      // Print information about the producer of each of the operands.
+      for (Value operand : op->getOperands()) {
+        if (Operation *producer = operand.getDefiningOp()) {
+          llvm::outs() << "  - Operand produced by operation '"
+                       << producer->getName() << "'\n";
+        } else {
+          // If there is no defining op, the Value is necessarily a Block
+          // argument.
+          auto blockArg = operand.cast<BlockArgument>();
+          llvm::outs() << "  - Operand produced by Block argument, number "
+                       << blockArg.getArgNumber() << "\n";
+        }
+      }
+
+      // Print information about the users of each of the results.
+      llvm::outs() << "Has " << op->getNumResults() << " results:\n";
+      for (auto indexedResult : llvm::enumerate(op->getResults())) {
+        Value result = indexedResult.value();
+        llvm::outs() << "  - Result " << indexedResult.index();
+        if (result.use_empty()) {
+          llvm::outs() << " has no uses\n";
+          continue;
+        }
+        if (result.hasOneUse()) {
+          llvm::outs() << " has a single use: ";
+        } else {
+          llvm::outs() << " has "
+                       << std::distance(result.getUses().begin(),
+                                        result.getUses().end())
+                       << " uses:\n";
+        }
+        for (Operation *userOp : result.getUsers()) {
+          llvm::outs() << "    - " << userOp->getName() << "\n";
+        }
+      }
+    });
+  }
+};
+} // end anonymous namespace
+
+namespace mlir {
+void registerTestPrintDefUsePass() {
+  PassRegistration<TestPrintDefUsePass>("test-print-defuse",
+                                        "Test various printing.");
+}
+} // namespace mlir
diff --git a/mlir/test/lib/IR/TestPrintNesting.cpp b/mlir/test/lib/IR/TestPrintNesting.cpp
new file mode 100644
index 0000000000000..825d241740fda
--- /dev/null
+++ b/mlir/test/lib/IR/TestPrintNesting.cpp
@@ -0,0 +1,96 @@
+//===- TestPrintNesting.cpp - Passes to illustrate the IR nesting ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Function.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; + +namespace { +/// This pass illustrates the IR nesting through printing. +struct TestPrintNestingPass + : public PassWrapper> { + // Entry point for the pass. + void runOnOperation() override { + Operation *op = getOperation(); + resetIndent(); + printOperation(op); + } + + /// The three methods below are mutually recursive and follow the nesting of + /// the IR: operation->region->block->operation->... + + void printOperation(Operation *op) { + // Print the operation itself and some of its properties + printIndent() << "visiting op: '" << op->getName() << "' with " + << op->getNumOperands() << " operands and " + << op->getNumResults() << " results\n"; + // Print the operation attributes + if (!op->getAttrs().empty()) { + printIndent() << op->getAttrs().size() << " attributes:\n"; + for (NamedAttribute attr : op->getAttrs()) + printIndent() << " - '" << attr.first << "' : '" << attr.second + << "'\n"; + } + + // Recurse into each of the regions attached to the operation. + printIndent() << " " << op->getNumRegions() << " nested regions:\n"; + auto indent = pushIndent(); + for (Region ®ion : op->getRegions()) + printRegion(region); + } + + void printRegion(Region ®ion) { + // A region does not hold anything by itself other than a list of blocks. + printIndent() << "Region with " << region.getBlocks().size() + << " blocks:\n"; + auto indent = pushIndent(); + for (Block &block : region.getBlocks()) + printBlock(block); + } + + void printBlock(Block &block) { + // Print the block intrinsics properties (basically: argument list) + printIndent() + << "Block with " << block.getNumArguments() << " arguments, " + << block.getNumSuccessors() + << " successors, and " + // Note, this `.size()` is traversing a linked-list and is O(n). + << block.getOperations().size() << " operations\n"; + + // Block main role is to hold a list of Operations: let's recurse. + auto indent = pushIndent(); + for (Operation &op : block.getOperations()) + printOperation(&op); + } + + /// Manages the indentation as we traverse the IR nesting. + int indent; + struct IdentRAII { + int &indent; + IdentRAII(int &indent) : indent(indent) {} + ~IdentRAII() { --indent; } + }; + void resetIndent() { indent = 0; } + IdentRAII pushIndent() { return IdentRAII(++indent); } + + llvm::raw_ostream &printIndent() { + for (int i = 0; i < indent; ++i) + llvm::outs() << " "; + return llvm::outs(); + } +}; +} // end anonymous namespace + +namespace mlir { +void registerTestPrintNestingPass() { + PassRegistration("test-print-nesting", + "Test various printing."); +} +} // namespace mlir diff --git a/mlir/test/lib/IR/TestSlicing.cpp b/mlir/test/lib/IR/TestSlicing.cpp new file mode 100644 index 0000000000000..a95b2f84cfcf5 --- /dev/null +++ b/mlir/test/lib/IR/TestSlicing.cpp @@ -0,0 +1,81 @@ +//===- TestSlicing.cpp - Testing slice functionality ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple testing pass for slicing. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Analysis/SliceAnalysis.h" +#include "mlir/Dialect/Linalg/IR/LinalgOps.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/BlockAndValueMapping.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/Module.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" + +using namespace mlir; + +/// Create a function with the same signature as the parent function of `op` +/// with name being the function name and a `suffix`. +static LogicalResult createBackwardSliceFunction(Operation *op, + StringRef suffix) { + FuncOp parentFuncOp = op->getParentOfType(); + OpBuilder builder(parentFuncOp); + Location loc = op->getLoc(); + std::string clonedFuncOpName = parentFuncOp.getName().str() + suffix.str(); + FuncOp clonedFuncOp = + builder.create(loc, clonedFuncOpName, parentFuncOp.getType()); + BlockAndValueMapping mapper; + builder.setInsertionPointToEnd(clonedFuncOp.addEntryBlock()); + for (auto arg : enumerate(parentFuncOp.getArguments())) + mapper.map(arg.value(), clonedFuncOp.getArgument(arg.index())); + llvm::SetVector slice; + getBackwardSlice(op, &slice); + for (Operation *slicedOp : slice) + builder.clone(*slicedOp, mapper); + builder.create(loc); + return success(); +} + +namespace { +/// Pass to test slice generated from slice analysis. +struct SliceAnalysisTestPass + : public PassWrapper> { + void runOnOperation() override; + SliceAnalysisTestPass() = default; + SliceAnalysisTestPass(const SliceAnalysisTestPass &) {} +}; +} // namespace + +void SliceAnalysisTestPass::runOnOperation() { + ModuleOp module = getOperation(); + auto funcOps = module.getOps(); + unsigned opNum = 0; + for (auto funcOp : funcOps) { + // TODO: For now this is just looking for Linalg ops. It can be generalized + // to look for other ops using flags. + funcOp.walk([&](Operation *op) { + if (!isa(op)) + return WalkResult::advance(); + std::string append = + std::string("__backward_slice__") + std::to_string(opNum); + createBackwardSliceFunction(op, append); + opNum++; + return WalkResult::advance(); + }); + } +} + +namespace mlir { +void registerSliceAnalysisTestPass() { + PassRegistration pass( + "slice-analysis-test", "Test Slice analysis functionality."); +} +} // namespace mlir diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt index de894467d63d4..99424f1c9c065 100644 --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -1,10 +1,12 @@ # Exclude tests from libMLIR.so add_mlir_library(MLIRTestTransforms TestAllReduceLowering.cpp + TestAffineLoopParametricTiling.cpp TestBufferPlacement.cpp TestExpandTanh.cpp TestCallGraph.cpp TestConstantFold.cpp + TestConvVectorization.cpp TestConvertCallOp.cpp TestConvertGPUKernelToCubin.cpp TestConvertGPUKernelToHsaco.cpp diff --git a/mlir/test/lib/Transforms/TestAffineLoopParametricTiling.cpp b/mlir/test/lib/Transforms/TestAffineLoopParametricTiling.cpp new file mode 100644 index 0000000000000..5d369e62ae435 --- /dev/null +++ b/mlir/test/lib/Transforms/TestAffineLoopParametricTiling.cpp @@ -0,0 +1,90 @@ +//= TestAffineLoopParametricTiling.cpp -- Parametric Affine loop tiling pass =// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a test pass to test parametric tiling of perfectly
+// nested affine for loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/Transforms/LoopUtils.h"
+
+using namespace mlir;
+
+#define DEBUG_TYPE "test-affine-parametric-tile"
+
+namespace {
+
+struct TestAffineLoopParametricTiling
+    : public PassWrapper<TestAffineLoopParametricTiling, FunctionPass> {
+  void runOnFunction() override;
+};
+} // end anonymous namespace
+
+/// Checks if the function enclosing the loop nest has any arguments passed to
+/// it, which can be used as tiling parameters. Assumes that at least 'n'
+/// arguments are passed, where 'n' is the number of loops in the loop nest.
+static void checkIfTilingParametersExist(ArrayRef<AffineForOp> band) {
+  assert(!band.empty() && "no loops in input band");
+  AffineForOp topLoop = band[0];
+
+  if (FuncOp funcOp = dyn_cast<FuncOp>(topLoop.getParentOp()))
+    assert(funcOp.getNumArguments() >= band.size() && "Too few tile sizes");
+}
+
+/// Captures tiling parameters, which are expected to be passed as arguments
+/// to the function enclosing the loop nest. Also checks if the required
+/// parameters are of index type. This approach is temporary for testing
+/// purposes.
+static void getTilingParameters(ArrayRef<AffineForOp> band,
+                                SmallVectorImpl<Value> &tilingParameters) {
+  AffineForOp topLoop = band[0];
+  Region *funcOpRegion = topLoop.getParentRegion();
+  unsigned nestDepth = band.size();
+
+  for (BlockArgument blockArgument :
+       funcOpRegion->getArguments().take_front(nestDepth)) {
+    if (blockArgument.getArgNumber() < nestDepth) {
+      assert(blockArgument.getType().isIndex() &&
+             "expected tiling parameters to be of index type.");
+      tilingParameters.push_back(blockArgument);
+    }
+  }
+}
+
+void TestAffineLoopParametricTiling::runOnFunction() {
+  // Bands of loops to tile.
+  std::vector<SmallVector<AffineForOp, 6>> bands;
+  getTileableBands(getFunction(), &bands);
+
+  // Tile each band.
+  for (SmallVectorImpl<AffineForOp> &band : bands) {
+    // Capture the tiling parameters from the arguments to the function
+    // enclosing this loop nest.
+    SmallVector<AffineForOp, 6> tiledNest;
+    SmallVector<Value, 6> tilingParameters;
+    // Check if tiling parameters are present.
+    checkIfTilingParametersExist(band);
+
+    // Get function arguments as tiling parameters.
+    getTilingParameters(band, tilingParameters);
+
+    if (failed(
+            tilePerfectlyNestedParametric(band, tilingParameters, &tiledNest)))
+      return signalPassFailure();
+  }
+}
+
+namespace mlir {
+void registerTestAffineLoopParametricTilingPass() {
+  PassRegistration<TestAffineLoopParametricTiling>(
+      "test-affine-parametric-tile",
+      "Tile affine loops using SSA values as tile sizes");
+}
+} // namespace mlir
diff --git a/mlir/test/lib/Transforms/TestBufferPlacement.cpp b/mlir/test/lib/Transforms/TestBufferPlacement.cpp
index 14b72b9fc92a0..c338f0f37c4ea 100644
--- a/mlir/test/lib/Transforms/TestBufferPlacement.cpp
+++ b/mlir/test/lib/Transforms/TestBufferPlacement.cpp
@@ -65,11 +65,6 @@ struct TestBufferPlacementPreparationPass
           op, "dynamic shapes not currently supported");
       auto memrefType = MemRefType::get(type.getShape(), type.getElementType());
-
-    // Compute alloc position and insert a custom allocation node.
- OpBuilder::InsertionGuard guard(rewriter); - rewriter.restoreInsertionPoint( - bufferAssignment->computeAllocPosition(result)); auto alloc = rewriter.create(loc, memrefType); newArgs.push_back(alloc); newResults.push_back(alloc); @@ -110,13 +105,12 @@ struct TestBufferPlacementPreparationPass }; void populateTensorLinalgToBufferLinalgConversionPattern( - MLIRContext *context, BufferAssignmentPlacer *placer, - BufferAssignmentTypeConverter *converter, + MLIRContext *context, BufferAssignmentTypeConverter *converter, OwningRewritePatternList *patterns) { populateWithBufferAssignmentOpConversionPatterns< - mlir::ReturnOp, mlir::ReturnOp, linalg::CopyOp>(context, placer, - converter, patterns); - patterns->insert(context, placer, converter); + mlir::ReturnOp, mlir::ReturnOp, linalg::CopyOp>(context, converter, + patterns); + patterns->insert(context, converter); } void getDependentDialects(DialectRegistry ®istry) const override { @@ -133,6 +127,8 @@ struct TestBufferPlacementPreparationPass target.addLegalDialect(); target.addLegalOp(); target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); // Mark all Linalg operations illegal as long as they work on tensors. auto isLegalOperation = [&](Operation *op) { @@ -191,16 +187,11 @@ struct TestBufferPlacementPreparationPass return success(); }); - // Walk over all the functions to apply buffer assignment. - this->getOperation().walk([&](FuncOp function) -> WalkResult { - OwningRewritePatternList patterns; - BufferAssignmentPlacer placer(function); - populateTensorLinalgToBufferLinalgConversionPattern( - &context, &placer, &converter, &patterns); - - // Applying full conversion - return applyFullConversion(function, target, patterns); - }); + OwningRewritePatternList patterns; + populateTensorLinalgToBufferLinalgConversionPattern(&context, &converter, + &patterns); + if (failed(applyFullConversion(this->getOperation(), target, patterns))) + this->signalPassFailure(); }; }; } // end anonymous namespace diff --git a/mlir/test/lib/Transforms/TestConvVectorization.cpp b/mlir/test/lib/Transforms/TestConvVectorization.cpp new file mode 100644 index 0000000000000..c90d8058de329 --- /dev/null +++ b/mlir/test/lib/Transforms/TestConvVectorization.cpp @@ -0,0 +1,116 @@ +//===- TestConvVectorization.cpp - Vectorization of Conv ops --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/VectorToSCF/VectorToSCF.h" +#include "mlir/Dialect/Linalg/Passes.h" +#include "mlir/Dialect/Linalg/Transforms/Hoisting.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include "mlir/Dialect/Vector/VectorTransforms.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/LoopUtils.h" +#include "mlir/Transforms/Passes.h" + +using namespace mlir; +using namespace vector; + +namespace { +/// A pass converting MLIR Linalg ops into Vector ops. 
+class TestConvVectorization + : public PassWrapper> { + void runOnOperation() override; + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + } +}; +} // namespace + +void TestConvVectorization::runOnOperation() { + MLIRContext *context = &getContext(); + ModuleOp module = getOperation(); + + ConversionTarget target(*context); + target.addLegalDialect(); + target.addLegalOp(); + target.addLegalOp(); + + SmallVector stage1Patterns; + linalg::populateConvVectorizationPatterns(context, stage1Patterns); + + OwningRewritePatternList stage2Patterns = + linalg::getLinalgTilingCanonicalizationPatterns(context); + stage2Patterns.insert(context); + + auto stage3Transforms = [](Operation *op) { + PassManager pm(op->getContext()); + pm.addPass(createLoopInvariantCodeMotionPass()); + if (failed(pm.run(cast(op)))) + llvm_unreachable("Unexpected failure in cleanup pass pipeline."); + op->walk([](FuncOp func) { + promoteSingleIterationLoops(func); + linalg::hoistViewAllocOps(func); + linalg::hoistRedundantVectorTransfers(func); + }); + return success(); + }; + + linalg::applyStagedPatterns(module, stage1Patterns, stage2Patterns, + stage3Transforms); + + //===--------------------------------------------------------------------===// + // Post staged patterns transforms + //===--------------------------------------------------------------------===// + + VectorTransformsOptions vectorTransformsOptions{ + VectorContractLowering::Dot, VectorTransposeLowering::EltWise}; + + OwningRewritePatternList vectorTransferPatterns; + // Pattern is not applied because rank-reducing vector transfer is not yet + // supported as can be seen in splitFullAndPartialTransferPrecondition, + // VectorTransforms.cpp + vectorTransferPatterns.insert( + context, vectorTransformsOptions); + applyPatternsAndFoldGreedily(module, vectorTransferPatterns); + + // Programmatic controlled lowering of linalg.copy and linalg.fill. + PassManager pm(context); + pm.addPass(createConvertLinalgToLoopsPass()); + if (failed(pm.run(module))) + llvm_unreachable("Unexpected failure in linalg to loops pass."); + + // Programmatic controlled lowering of vector.contract only. + OwningRewritePatternList vectorContractLoweringPatterns; + populateVectorContractLoweringPatterns(vectorContractLoweringPatterns, + context, vectorTransformsOptions); + applyPatternsAndFoldGreedily(module, vectorContractLoweringPatterns); + + // Programmatic controlled lowering of vector.transfer only. + OwningRewritePatternList vectorToLoopsPatterns; + populateVectorToSCFConversionPatterns(vectorToLoopsPatterns, context, + VectorTransferToSCFOptions()); + applyPatternsAndFoldGreedily(module, vectorToLoopsPatterns); + + // Ensure we drop the marker in the end. 
+ module.walk([](linalg::LinalgOp op) { + op.removeAttr(linalg::LinalgTransforms::kLinalgTransformMarker); + }); +} + +namespace mlir { +void registerTestConvVectorization() { + PassRegistration testTransformPatternsPass( + "test-conv-vectorization", "Test vectorization of convolutions"); +} +} // namespace mlir diff --git a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp index 4fc880a24277b..edcc66c9b6a61 100644 --- a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp +++ b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp @@ -449,6 +449,7 @@ static void applyContractionToVectorPatterns(FuncOp funcOp) { patterns.insert, LinalgVectorizationPattern, LinalgVectorizationPattern, + LinalgVectorizationPattern, LinalgVectorizationPattern, LinalgVectorizationPattern>(funcOp.getContext()); applyPatternsAndFoldGreedily(funcOp, patterns); diff --git a/mlir/test/lib/Transforms/TestLoopUnrolling.cpp b/mlir/test/lib/Transforms/TestLoopUnrolling.cpp index 712fddb97028e..396f08b2cba32 100644 --- a/mlir/test/lib/Transforms/TestLoopUnrolling.cpp +++ b/mlir/test/lib/Transforms/TestLoopUnrolling.cpp @@ -55,6 +55,9 @@ class TestLoopUnrollingPass Option unrollFactor{*this, "unroll-factor", llvm::cl::desc("Loop unroll factor."), llvm::cl::init(1)}; + Option unrollUpToFactor{*this, "unroll-up-to-factor", + llvm::cl::desc("Loop unroll up to factor."), + llvm::cl::init(false)}; Option loopDepth{*this, "loop-depth", llvm::cl::desc("Loop depth."), llvm::cl::init(0)}; }; diff --git a/mlir/test/mlir-linalg-ods-gen/test-linalg-ods-gen.tc b/mlir/test/mlir-linalg-ods-gen/test-linalg-ods-gen.tc index d796d1917c035..aad983eb85d28 100644 --- a/mlir/test/mlir-linalg-ods-gen/test-linalg-ods-gen.tc +++ b/mlir/test/mlir-linalg-ods-gen/test-linalg-ods-gen.tc @@ -4,16 +4,15 @@ // ODS-LABEL: def Test1Op : LinalgNamedStructured_Op<"test1", [ // ODS-NEXT: NInputs<2> // ODS-NEXT: NOutputs<1> -// ODS-NEXT: NamedStructuredOpTraits // ODS-NEXT: SingleBlockImplicitTerminator<"YieldOp"> // -// IMPL-LABEL: SmallVector Test1Op::referenceIterators +// IMPL-LABEL: ArrayAttr Test1Op::iterator_types() { // IMPL: { {{.*}}Parallel{{.*}}, {{.*}}Reduction{{.*}} } // -// IMPL: SmallVector Test1Op::referenceIndexingMaps +// IMPL: ArrayAttr Test1Op::indexing_maps() { // IMPL: AffineMap::get(2, 0, {d0, d1}, context), // IMPL-NEXT: AffineMap::get(2, 0, {d1}, context), -// IMPL-NEXT: AffineMap::get(2, 0, {d0}, context) }; +// IMPL-NEXT: AffineMap::get(2, 0, {d0}, context) }); // // IMPL: void Test1Op::regionBuilder(Block &block) { // IMPL: Value [[a:.*]](args[0]), [[b:.*]](args[1]), [[c:.*]](args[2]); @@ -29,16 +28,15 @@ def test1(A: f32(M, K), B: f32(K)) -> (C: f32(M)) { // ODS-LABEL: def Test2Op : LinalgNamedStructured_Op<"test2", [ // ODS-NEXT: NInputs<2> // ODS-NEXT: NOutputs<1> -// ODS-NEXT: NamedStructuredOpTraits // ODS-NEXT: SingleBlockImplicitTerminator<"YieldOp"> // -// IMPL-LABEL: SmallVector Test2Op::referenceIterators +// IMPL-LABEL: ArrayAttr Test2Op::iterator_types() { // IMPL: { {{.*}}Parallel{{.*}}, {{.*}}Parallel{{.*}}, {{.*}}Reduction{{.*}} } // -// IMPL: SmallVector Test2Op::referenceIndexingMaps +// IMPL: ArrayAttr Test2Op::indexing_maps() { // IMPL: AffineMap::get(3, 0, {d0, d2}, context), // IMPL-NEXT: AffineMap::get(3, 0, {d2, d1}, context), -// IMPL-NEXT: AffineMap::get(3, 0, {d0, d1}, context) }; +// IMPL-NEXT: AffineMap::get(3, 0, {d0, d1}, context) }); // // IMPL: Test2Op::regionBuilder(Block &block) { // IMPL: Value [[a:.*]](args[0]), [[b:.*]](args[1]), [[c:.*]](args[2]); 
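The IMPL lines above only spot-check fragments of the generated method. Spelled out, the generated body for Test2Op presumably looks roughly like this sketch, reconstructed from the checked fragments rather than copied from the generator's output; getParallelIteratorTypeName and getReductionIteratorTypeName are the MLIR helpers these strings come from:

// Sketch of the generated implementation for Test2Op (reconstructed).
ArrayAttr Test2Op::iterator_types() {
  // Two parallel dims (d0 = M, d1 = N) and one reduction dim (d2 = K),
  // matching the { Parallel, Parallel, Reduction } fragment checked above.
  return Builder(getContext())
      .getStrArrayAttr({getParallelIteratorTypeName(),
                        getParallelIteratorTypeName(),
                        getReductionIteratorTypeName()});
}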
@@ -54,16 +52,15 @@ def test2(A: f32(M, K), B: f32(K, N)) -> (C: f32(M, N)) { // ODS-LABEL: def Test3Op : LinalgNamedStructured_Op<"test3", [ // ODS-NEXT: NInputs<2> // ODS-NEXT: NOutputs<1> -// ODS-NEXT: NamedStructuredOpTraits // ODS-NEXT: SingleBlockImplicitTerminator<"YieldOp"> // -// IMPL-LABEL: SmallVector Test3Op::referenceIterators +// IMPL-LABEL: ArrayAttr Test3Op::iterator_types() { // IMPL: { {{.*}}Parallel{{.*}}, {{.*}}Parallel{{.*}}, {{.*}}Reduction{{.*}} } // -// IMPL: SmallVector Test3Op::referenceIndexingMaps +// IMPL: ArrayAttr Test3Op::indexing_maps() { // IMPL: AffineMap::get(4, 0, {d0, d1, d3}, context), // IMPL-NEXT: AffineMap::get(4, 0, {d3, d2}, context), -// IMPL-NEXT: AffineMap::get(4, 0, {d0, d1, d2}, context) }; +// IMPL-NEXT: AffineMap::get(4, 0, {d0, d1, d2}, context) }); // // IMPL: Test3Op::regionBuilder(Block &block) { // IMPL: Value [[a:.*]](args[0]), [[b:.*]](args[1]), [[c:.*]](args[2]); diff --git a/mlir/test/mlir-rocm-runner/vecadd.mlir b/mlir/test/mlir-rocm-runner/vecadd.mlir index df5c073f9b811..9063974d51242 100644 --- a/mlir/test/mlir-rocm-runner/vecadd.mlir +++ b/mlir/test/mlir-rocm-runner/vecadd.mlir @@ -17,12 +17,20 @@ func @vecadd(%arg0 : memref, %arg1 : memref, %arg2 : memref // CHECK: [2.46, 2.46, 2.46, 2.46, 2.46] func @main() { + %c0 = constant 0 : index + %c1 = constant 1 : index + %c5 = constant 5 : index + %cf1dot23 = constant 1.23 : f32 %0 = alloc() : memref<5xf32> %1 = alloc() : memref<5xf32> %2 = alloc() : memref<5xf32> %3 = memref_cast %0 : memref<5xf32> to memref %4 = memref_cast %1 : memref<5xf32> to memref %5 = memref_cast %2 : memref<5xf32> to memref + scf.for %i = %c0 to %c5 step %c1 { + store %cf1dot23, %3[%i] : memref + store %cf1dot23, %4[%i] : memref + } %6 = memref_cast %3 : memref to memref<*xf32> %7 = memref_cast %4 : memref to memref<*xf32> %8 = memref_cast %5 : memref to memref<*xf32> diff --git a/mlir/test/mlir-rocm-runner/vector-transferops.mlir b/mlir/test/mlir-rocm-runner/vector-transferops.mlir index 873897011464b..3d4424cc4281b 100644 --- a/mlir/test/mlir-rocm-runner/vector-transferops.mlir +++ b/mlir/test/mlir-rocm-runner/vector-transferops.mlir @@ -44,7 +44,11 @@ func @vectransferx4(%arg0 : memref, %arg1 : memref) { } func @main() { + %c0 = constant 0 : index + %c1 = constant 1 : index + %c4 = constant 4 : index %cf1 = constant 1.0 : f32 + %cf1dot23 = constant 1.23 : f32 %arg0 = alloc() : memref<4xf32> %arg1 = alloc() : memref<4xf32> @@ -52,6 +56,11 @@ func @main() { %22 = memref_cast %arg0 : memref<4xf32> to memref %23 = memref_cast %arg1 : memref<4xf32> to memref + scf.for %i = %c0 to %c4 step %c1 { + store %cf1dot23, %22[%i] : memref + store %cf1dot23, %23[%i] : memref + } + %cast0 = memref_cast %22 : memref to memref<*xf32> %cast1 = memref_cast %23 : memref to memref<*xf32> diff --git a/mlir/test/mlir-tblgen/op-attribute.td b/mlir/test/mlir-tblgen/op-attribute.td index edb387cfa2d49..171b5f5757782 100644 --- a/mlir/test/mlir-tblgen/op-attribute.td +++ b/mlir/test/mlir-tblgen/op-attribute.td @@ -107,7 +107,7 @@ def BOp : NS_Op<"b_op", []> { StrAttr:$str_attr, ElementsAttr:$elements_attr, FlatSymbolRefAttr:$function_attr, - SomeTypeAttr:$type_attr, + SomeTypeAttr:$some_type_attr, ArrayAttr:$array_attr, TypedArrayAttrBase:$some_attr_array, TypeAttr:$type_attr @@ -128,7 +128,7 @@ def BOp : NS_Op<"b_op", []> { // DEF: if (!((tblgen_str_attr.isa<::mlir::StringAttr>()))) // DEF: if (!((tblgen_elements_attr.isa<::mlir::ElementsAttr>()))) // DEF: if (!((tblgen_function_attr.isa<::mlir::FlatSymbolRefAttr>()))) -// DEF: 
if (!(((tblgen_type_attr.isa<::mlir::TypeAttr>())) && ((tblgen_type_attr.cast<::mlir::TypeAttr>().getValue().isa())))) +// DEF: if (!(((tblgen_some_type_attr.isa<::mlir::TypeAttr>())) && ((tblgen_some_type_attr.cast<::mlir::TypeAttr>().getValue().isa())))) // DEF: if (!((tblgen_array_attr.isa<::mlir::ArrayAttr>()))) // DEF: if (!(((tblgen_some_attr_array.isa<::mlir::ArrayAttr>())) && (::llvm::all_of(tblgen_some_attr_array.cast<::mlir::ArrayAttr>(), [](::mlir::Attribute attr) { return (some-condition); })))) // DEF: if (!(((tblgen_type_attr.isa<::mlir::TypeAttr>())) && ((tblgen_type_attr.cast<::mlir::TypeAttr>().getValue().isa<::mlir::Type>())))) @@ -145,7 +145,7 @@ def BOp : NS_Op<"b_op", []> { // DEF: ::llvm::StringRef BOp::str_attr() // DEF: ::mlir::ElementsAttr BOp::elements_attr() // DEF: ::llvm::StringRef BOp::function_attr() -// DEF: SomeType BOp::type_attr() +// DEF: SomeType BOp::some_type_attr() // DEF: ::mlir::ArrayAttr BOp::array_attr() // DEF: ::mlir::ArrayAttr BOp::some_attr_array() // DEF: ::mlir::Type BOp::type_attr() @@ -275,3 +275,19 @@ def SomeTypedArrayAttr : TypedArrayAttrBase; // RECORD-LABEL: def SomeTypedArrayAttr // RECORD: Attr elementAttr = SomeAttr; + +def Test_Dialect_2 : Dialect { + let name = "dialect_2"; +} +def MyStruct : StructAttr<"MyStruct", Test_Dialect_2, +[StructFieldAttr<"potatoes", I64ElementsAttr>]> { + let description = "A structure describing a number of potatoes."; +} + +def StructAttrOp : NS_Op<"struct_attr_op", []> { + let arguments = (ins + MyStruct:$potatoes + ); +} + +// DECL: dialect_2::MyStruct potatoes(); diff --git a/mlir/test/mlir-tblgen/op-decl.td b/mlir/test/mlir-tblgen/op-decl.td index d1b11556be308..8390dea18ae9e 100644 --- a/mlir/test/mlir-tblgen/op-decl.td +++ b/mlir/test/mlir-tblgen/op-decl.td @@ -61,8 +61,8 @@ def NS_AOp : NS_Op<"a_op", [IsolatedFromAbove, IsolatedFromAbove]> { // CHECK: ::mlir::ValueRange odsOperands; // CHECK: }; -// CHECK: class AOp : public ::mlir::Op::Impl, OpTrait::AtLeastNResults<1>::Impl, OpTrait::ZeroSuccessor, OpTrait::AtLeastNOperands<1>::Impl, OpTrait::IsIsolatedFromAbove -// CHECK-NOT: OpTrait::IsIsolatedFromAbove +// CHECK: class AOp : public ::mlir::Op::Impl, ::mlir::OpTrait::AtLeastNResults<1>::Impl, ::mlir::OpTrait::ZeroSuccessor, ::mlir::OpTrait::AtLeastNOperands<1>::Impl, ::mlir::OpTrait::IsIsolatedFromAbove +// CHECK-NOT: ::mlir::OpTrait::IsIsolatedFromAbove // CHECK: public: // CHECK: using Op::Op; // CHECK: using Adaptor = AOpAdaptor; diff --git a/mlir/test/mlir-tblgen/op-result.td b/mlir/test/mlir-tblgen/op-result.td index bdb0765ab541c..68492202b4a60 100644 --- a/mlir/test/mlir-tblgen/op-result.td +++ b/mlir/test/mlir-tblgen/op-result.td @@ -110,7 +110,7 @@ def OpK : NS_Op<"only_input_is_variadic_with_same_value_type_op", [SameOperandsA let results = (outs AnyTensor:$result); } -// CHECK-LABEL: OpK::build(::mlir::OpBuilder &odsBuilder, ::mlir::OperationState &odsState, ::mlir::ValueRange operands, ::llvm::ArrayRef<::mlir::NamedAttribute> attributes ) +// CHECK-LABEL: OpK::build(::mlir::OpBuilder &odsBuilder, ::mlir::OperationState &odsState, ::mlir::ValueRange operands, ::llvm::ArrayRef<::mlir::NamedAttribute> attributes) // CHECK: odsState.addTypes({operands[0].getType()}); // Test with inferred shapes and interleaved with operands/attributes. 
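(Editor's note on the mlir-tblgen test updates above: the generated declarations they check are now emitted with fully qualified names. Below is a hedged, hand-written sketch of why that qualification matters; the namespace mydialect and the op MyOp are invented for illustration and are not part of this patch.)

#include "mlir/IR/OpDefinition.h"

namespace mydialect {
// tblgen may print a declaration like this into any user namespace. With an
// unqualified "OpTrait::OneOperand" in the base-class list, name lookup here
// would only succeed if a using-directive for namespace mlir happened to be
// in scope; the leading "::mlir::" keeps the declaration self-contained.
class MyOp : public ::mlir::Op<MyOp, ::mlir::OpTrait::OneOperand> {
public:
  using Op::Op;
  static ::llvm::StringRef getOperationName() { return "mydialect.my_op"; }
};
} // namespace mydialect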
diff --git a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-gen.cpp b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-gen.cpp index 92efef67e8f4a..59d655684f48c 100644 --- a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-gen.cpp +++ b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-gen.cpp @@ -974,19 +974,19 @@ class TCParser { /// Parse and print the information for a TC def. /// When `gen-ods-decl` is used, this prints the ODS declaration for the TC. /// When `gen-impl` is used, this prints the C++ implementation for the extra - /// methods defined in ODS (referenceIterators, referenceIndexingMaps and - /// regionBuilder). + /// methods defined in ODS (`iterator_types`, `indexing_maps` and + /// `regionBuilder`). LogicalResult parseAndEmitODSDef(llvm::raw_ostream &os); /// Print the ODS class that defines a new `cppOpName` for a `linalgOpName`. void printODS(llvm::raw_ostream &os, StringRef cppOpName, StringRef linalgOpName); - /// Print the C++ StructuredOpsInterface impl of `referenceIterators`. + /// Print the C++ StructuredOpsInterface impl of `iterator_types`. void printReferenceIterators(llvm::raw_ostream &os, StringRef cppOpName, ComprehensionParsingState &state); - /// Print the C++ StructuredOpsInterface impl of `referenceIndexingMaps`. + /// Print the C++ StructuredOpsInterface impl of `indexing_maps`. void printReferenceIndexingMaps(llvm::raw_ostream &os, StringRef cppOpName, ComprehensionParsingState &state); @@ -1446,7 +1446,6 @@ void TCParser::printODS(llvm::raw_ostream &os, StringRef cppOpName, const char *header = R"FMT( def {0} : LinalgNamedStructured_Op<"{1}", [ NInputs<{2}>, NOutputs<{3}>, - NamedStructuredOpTraits, SingleBlockImplicitTerminator<"YieldOp">]> { let arguments = (ins Variadic:$views); let results = (outs Variadic:$output_tensors); @@ -1465,16 +1464,9 @@ void TCParser::printODS(llvm::raw_ostream &os, StringRef cppOpName, return ::parseNamedStructuredOp<{0}>(parser, result); }]; let extraClassDeclaration = [{{ - llvm::Optional> referenceIterators(); - static SmallVector referenceIterators( - TypeRange inputTypes, TypeRange outputTypes); - - llvm::Optional> referenceIndexingMaps(); - static SmallVector referenceIndexingMaps( - TypeRange inputTypes, TypeRange outputTypes); - + ArrayAttr iterator_types(); + ArrayAttr indexing_maps(); static void regionBuilder(Block &block); - std::string getLibraryCallName() {{ return generateLibraryCallName(getOperation()); } @@ -1492,20 +1484,14 @@ void TCParser::printODS(llvm::raw_ostream &os, StringRef cppOpName, os << llvm::formatv(header, cppOpName, linalgOpName, nInputs, nOutputs); } -/// Print the C++ StructuredOpsInterface impl of `referenceIterators`. +/// Print the C++ StructuredOpsInterface impl of `iterator_types`. void TCParser::printReferenceIterators(llvm::raw_ostream &os, StringRef cppOpName, ComprehensionParsingState &state) { const char *referenceReferenceIteratorsFmt = R"FMT( - // This is temporary until we transition out of manually specified ops - // that should be auto-generated with linalg-ods-gen. 
- llvm::Optional> {0}::referenceIterators() {{ - llvm_unreachable("Unexpected missing `iterator_types` attribute."); - } - SmallVector {0}::referenceIterators( - TypeRange inputTypes, TypeRange outputTypes) { - return SmallVector{{ {1} }; + ArrayAttr {0}::iterator_types() { + return Builder(getContext()).getStrArrayAttr(SmallVector{{ {1} }); })FMT"; std::string iteratorsStr; @@ -1542,16 +1528,11 @@ void TCParser::printReferenceIndexingMaps(llvm::raw_ostream &os, R"FMT( // This is temporary until we transition out of manually specified ops that // should be auto-generated with linalg-ods-gen. - llvm::Optional> {0}::referenceIndexingMaps() {{ - llvm_unreachable("Unexpected missing `indexing_maps` attribute."); - } - SmallVector {0}::referenceIndexingMaps( - TypeRange inputTypes, TypeRange outputTypes) { - assert(!inputTypes.empty() && "At least one input expected"); - MLIRContext *context = (*inputTypes.begin()).getContext(); + ArrayAttr {0}::indexing_maps() { + MLIRContext *context = getContext(); AffineExpr {1}; bindDims(context, {1}); - return SmallVector{{ {2} }; + return Builder(context).getAffineMapArrayAttr({ {2} }); })FMT"; // 2. Print a comma-separated list of identifiers for the AffineExpr in diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index ad76abed647e7..93934d40fe591 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -38,13 +38,16 @@ void registerPatternsTestPass(); void registerPrintOpAvailabilityPass(); void registerSideEffectTestPasses(); void registerSimpleParametricTilingPass(); +void registerSliceAnalysisTestPass(); void registerSymbolTestPasses(); void registerTestAffineDataCopyPass(); +void registerTestAffineLoopParametricTilingPass(); void registerTestAffineLoopUnswitchingPass(); void registerTestAllReduceLoweringPass(); void registerTestBufferPlacementPreparationPass(); void registerTestCallGraphPass(); void registerTestConstantFold(); +void registerTestConvVectorization(); void registerTestConvertGPUKernelToCubinPass(); void registerTestConvertGPUKernelToHsacoPass(); void registerTestDominancePass(); @@ -66,6 +69,8 @@ void registerTestMemRefDependenceCheck(); void registerTestMemRefStrideCalculation(); void registerTestOpaqueLoc(); void registerTestPreparationPassWithAllowedMemrefResults(); +void registerTestPrintDefUsePass(); +void registerTestPrintNestingPass(); void registerTestRecursiveTypesPass(); void registerTestReducer(); void registerTestSpirvEntryPointABIPass(); @@ -85,12 +90,14 @@ void registerTestPasses() { registerPrintOpAvailabilityPass(); registerSideEffectTestPasses(); registerSimpleParametricTilingPass(); + registerSliceAnalysisTestPass(); registerSymbolTestPasses(); registerTestAffineDataCopyPass(); registerTestAllReduceLoweringPass(); registerTestAffineLoopUnswitchingPass(); registerTestLoopPermutationPass(); registerTestCallGraphPass(); + registerTestConvVectorization(); registerTestConstantFold(); #if MLIR_CUDA_CONVERSIONS_ENABLED registerTestConvertGPUKernelToCubinPass(); @@ -98,6 +105,7 @@ void registerTestPasses() { #if MLIR_ROCM_CONVERSIONS_ENABLED registerTestConvertGPUKernelToHsacoPass(); #endif + registerTestAffineLoopParametricTilingPass(); registerTestBufferPlacementPreparationPass(); registerTestDominancePass(); registerTestFunc(); @@ -115,6 +123,8 @@ void registerTestPasses() { registerTestMemRefStrideCalculation(); registerTestOpaqueLoc(); registerTestPreparationPassWithAllowedMemrefResults(); + registerTestPrintDefUsePass(); + registerTestPrintNestingPass(); 
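// Editor's note: each register*() call above is assumed to follow the usual
// test-pass registration pattern in test/lib; a hypothetical implementation
// (names illustrative, not taken from this patch) would look roughly like:
//
//   void registerTestPrintNestingPass() {
//     ::mlir::PassRegistration<TestPrintNestingPass>(
//         "test-print-nesting", "Print the nesting structure of operations");
//   }
//
// so mlir-opt.cpp only needs the forward declarations plus these calls.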
registerTestRecursiveTypesPass(); registerTestReducer(); registerTestGpuParallelLoopMappingPass(); diff --git a/mlir/tools/mlir-rocm-runner/CMakeLists.txt b/mlir/tools/mlir-rocm-runner/CMakeLists.txt index 9b07d00d80961..2c0791d7a5c1d 100644 --- a/mlir/tools/mlir-rocm-runner/CMakeLists.txt +++ b/mlir/tools/mlir-rocm-runner/CMakeLists.txt @@ -38,7 +38,7 @@ if(MLIR_ROCM_RUNNER_ENABLED) add_definitions(-D__ROCM_PATH__="${ROCM_PATH}") # Locate HIP runtime library. - find_library(ROCM_RUNTIME_LIBRARY hip_hcc + find_library(ROCM_RUNTIME_LIBRARY amdhip64 PATHS "${HIP_PATH}/lib") if (NOT ROCM_RUNTIME_LIBRARY) message(SEND_ERROR "Could not locate ROCm HIP runtime library") diff --git a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp index 4689926be87d5..d0c515ba1f03c 100644 --- a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp +++ b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp @@ -16,6 +16,7 @@ #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h" +#include "mlir/Conversion/SCFToStandard/SCFToStandard.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" #include "mlir/Dialect/GPU/GPUDialect.h" @@ -302,6 +303,7 @@ static LogicalResult runMLIRPasses(ModuleOp m) { configTargetFeatures(); const char gpuBinaryAnnotation[] = "rocdl.hsaco"; + pm.addPass(createLowerToCFGPass()); pm.addPass(createGpuKernelOutliningPass()); auto &kernelPm = pm.nest(); kernelPm.addPass(createStripDebugInfoPass()); diff --git a/mlir/tools/mlir-tblgen/DialectGen.cpp b/mlir/tools/mlir-tblgen/DialectGen.cpp index 3a19379da8a3a..4a9ec48b777e2 100644 --- a/mlir/tools/mlir-tblgen/DialectGen.cpp +++ b/mlir/tools/mlir-tblgen/DialectGen.cpp @@ -153,6 +153,15 @@ static void emitDialectDecl(Dialect &dialect, dialectsOs << llvm::formatv(dialectRegistrationTemplate, dependentDialect); } + + // Emit all nested namespaces. + StringRef cppNamespace = dialect.getCppNamespace(); + llvm::SmallVector namespaces; + llvm::SplitString(cppNamespace, namespaces, "::"); + + for (auto ns : namespaces) + os << "namespace " << ns << " {\n"; + // Emit the start of the decl. std::string cppName = dialect.getCppClassName(); os << llvm::formatv(dialectDeclBeginStr, cppName, dialect.getName(), @@ -179,6 +188,10 @@ static void emitDialectDecl(Dialect &dialect, // End the dialect decl. os << "};\n"; + + // Close all nested namespaces in reverse order. + for (auto ns : llvm::reverse(namespaces)) + os << "} // namespace " << ns << "\n"; } static bool emitDialectDecls(const llvm::RecordKeeper &recordKeeper, diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp index 0b3ad38b035ff..ecadd20cd9824 100644 --- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp @@ -232,10 +232,6 @@ class OpEmitter { // operand's type as all results' types. void genUseOperandAsResultTypeCollectiveParamBuilder(); - // Returns true if the inferred collective param build method should be - // generated. - bool shouldGenerateInferredTypeCollectiveParamBuilder(); - // Generates the build() method that takes aggregate operands/attributes // parameters. This build() method uses inferred types as result types. // Requires: The type needs to be inferable via InferTypeOpInterface. @@ -268,7 +264,7 @@ class OpEmitter { // `resultTypeNames` with the names for parameters for specifying result // types. 
The given `typeParamKind` and `attrParamKind` control how result // types and attributes are placed in the parameter list. - void buildParamList(std::string &paramList, + void buildParamList(llvm::SmallVectorImpl<OpMethodParameter> &paramList, SmallVectorImpl<std::string> &resultTypeNames, TypeParamKind typeParamKind, AttrParamKind attrParamKind = AttrParamKind::WrappedAttr); @@ -494,17 +490,29 @@ void OpEmitter::genAttrGetters() { FmtContext fctx; fctx.withBuilder("::mlir::Builder(this->getContext())"); + Dialect opDialect = op.getDialect(); // Emit the derived attribute body. auto emitDerivedAttr = [&](StringRef name, Attribute attr) { - auto &method = opClass.newMethod(attr.getReturnType(), name); - auto &body = method.body(); + auto *method = opClass.addMethodAndPrune(attr.getReturnType(), name); + if (!method) + return; + auto &body = method->body(); body << " " << attr.getDerivedCodeBody() << "\n"; }; // Emit with return type specified. auto emitAttrWithReturnType = [&](StringRef name, Attribute attr) { - auto &method = opClass.newMethod(attr.getReturnType(), name); - auto &body = method.body(); + Dialect attrDialect = attr.getDialect(); + // Does the current operation have a different namespace than the attribute? + bool differentNamespace = + attrDialect && opDialect && attrDialect != opDialect; + std::string returnType = differentNamespace + ? (llvm::Twine(attrDialect.getCppNamespace()) + + "::" + attr.getReturnType()) .str() : attr.getReturnType().str(); + auto *method = opClass.addMethodAndPrune(returnType, name); + auto &body = method->body(); body << " auto attr = " << name << "Attr();\n"; if (attr.hasDefaultValue()) { // Returns the default value if not set. @@ -526,9 +534,11 @@ void OpEmitter::genAttrGetters() { // referring to the attributes via accessors instead of having to use // the string interface for better compile time verification. auto emitAttrWithStorageType = [&](StringRef name, Attribute attr) { - auto &method = - opClass.newMethod(attr.getStorageType(), (name + "Attr").str()); - auto &body = method.body(); + auto *method = + opClass.addMethodAndPrune(attr.getStorageType(), (name + "Attr").str()); + if (!method) + return; + auto &body = method->body(); body << " return this->getAttr(\"" << name << "\")."; if (attr.isOptional() || attr.hasDefaultValue()) body << "dyn_cast_or_null<"; @@ -558,19 +568,19 @@ void OpEmitter::genAttrGetters() { // attribute. This enables, for example, avoiding adding an attribute that // overlaps with a derived attribute. { - auto &method = - opClass.newMethod("bool", "isDerivedAttribute", - "::llvm::StringRef name", OpMethod::MP_Static); - auto &body = method.body(); + auto *method = opClass.addMethodAndPrune("bool", "isDerivedAttribute", + OpMethod::MP_Static, + "::llvm::StringRef", "name"); + auto &body = method->body(); for (auto namedAttr : derivedAttrs) body << " if (name == \"" << namedAttr.name << "\") return true;\n"; body << " return false;"; } // Generate method to materialize derived attributes as a DictionaryAttr. { - OpMethod &method = opClass.newMethod("::mlir::DictionaryAttr", - "materializeDerivedAttributes"); - auto &body = method.body(); + auto *method = opClass.addMethodAndPrune("::mlir::DictionaryAttr", + "materializeDerivedAttributes"); + auto &body = method->body(); auto nonMaterializable = make_filter_range(derivedAttrs, [](const NamedAttribute &namedAttr) { @@ -618,9 +628,11 @@ void OpEmitter::genAttrSetters() { // to the attributes via setters instead of having to use the string interface // for better compile time verification.
auto emitAttrWithStorageType = [&](StringRef name, Attribute attr) { - auto &method = opClass.newMethod("void", (name + "Attr").str(), - (attr.getStorageType() + " attr").str()); - auto &body = method.body(); + auto *method = opClass.addMethodAndPrune("void", (name + "Attr").str(), + attr.getStorageType(), "attr"); + if (!method) + return; + auto &body = method->body(); body << " this->getOperation()->setAttr(\"" << name << "\", attr);"; }; @@ -640,13 +652,15 @@ generateValueRangeStartAndEnd(Class &opClass, StringRef methodName, int numVariadic, int numNonVariadic, StringRef rangeSizeCall, bool hasAttrSegmentSize, StringRef sizeAttrInit, RangeT &&odsValues) { - auto &method = opClass.newMethod("std::pair", methodName, - "unsigned index"); - + auto *method = opClass.addMethodAndPrune("std::pair", + methodName, "unsigned", "index"); + if (!method) + return; + auto &body = method->body(); if (numVariadic == 0) { - method.body() << " return {index, 1};\n"; + body << " return {index, 1};\n"; } else if (hasAttrSegmentSize) { - method.body() << sizeAttrInit << attrSizedSegmentValueRangeCalcCode; + body << sizeAttrInit << attrSizedSegmentValueRangeCalcCode; } else { // Because the op can have arbitrarily interleaved variadic and non-variadic // operands, we need to embed a list in the "sink" getter method for @@ -656,9 +670,8 @@ generateValueRangeStartAndEnd(Class &opClass, StringRef methodName, for (auto &it : odsValues) isVariadic.push_back(it.isVariableLength() ? "true" : "false"); std::string isVariadicList = llvm::join(isVariadic, ", "); - method.body() << formatv(sameVariadicSizeValueRangeCalcCode, isVariadicList, - numNonVariadic, numVariadic, rangeSizeCall, - "operand"); + body << formatv(sameVariadicSizeValueRangeCalcCode, isVariadicList, + numNonVariadic, numVariadic, rangeSizeCall, "operand"); } } @@ -684,9 +697,9 @@ static void generateNamedOperandGetters(const Operator &op, Class &opClass, const int numNormalOperands = numOperands - numVariadicOperands; const auto *sameVariadicSize = - op.getTrait("OpTrait::SameVariadicOperandSize"); + op.getTrait("::mlir::OpTrait::SameVariadicOperandSize"); const auto *attrSizedOperands = - op.getTrait("OpTrait::AttrSizedOperandSegments"); + op.getTrait("::mlir::OpTrait::AttrSizedOperandSegments"); if (numVariadicOperands > 1 && !sameVariadicSize && !attrSizedOperands) { PrintFatalError(op.getLoc(), "op has multiple variadic operands but no " @@ -711,9 +724,11 @@ static void generateNamedOperandGetters(const Operator &op, Class &opClass, rangeSizeCall, attrSizedOperands, sizeAttrInit, const_cast(op).getOperands()); - auto &m = opClass.newMethod(rangeType, "getODSOperands", "unsigned index"); - m.body() << formatv(valueRangeReturnCode, rangeBeginCall, - "getODSOperandIndexAndLength(index)"); + auto *m = opClass.addMethodAndPrune(rangeType, "getODSOperands", "unsigned", + "index"); + auto &body = m->body(); + body << formatv(valueRangeReturnCode, rangeBeginCall, + "getODSOperandIndexAndLength(index)"); // Then we emit nicer named getter methods by redirecting to the "sink" getter // method. @@ -723,15 +738,15 @@ static void generateNamedOperandGetters(const Operator &op, Class &opClass, continue; if (operand.isOptional()) { - auto &m = opClass.newMethod("::mlir::Value", operand.name); - m.body() << " auto operands = getODSOperands(" << i << ");\n" - << " return operands.empty() ? 
Value() : *operands.begin();"; + m = opClass.addMethodAndPrune("::mlir::Value", operand.name); + m->body() << " auto operands = getODSOperands(" << i << ");\n" + << " return operands.empty() ? Value() : *operands.begin();"; } else if (operand.isVariadic()) { - auto &m = opClass.newMethod(rangeType, operand.name); - m.body() << " return getODSOperands(" << i << ");"; + m = opClass.addMethodAndPrune(rangeType, operand.name); + m->body() << " return getODSOperands(" << i << ");"; } else { - auto &m = opClass.newMethod("::mlir::Value", operand.name); - m.body() << " return *getODSOperands(" << i << ").begin();"; + m = opClass.addMethodAndPrune("::mlir::Value", operand.name); + m->body() << " return *getODSOperands(" << i << ").begin();"; } } } @@ -748,14 +763,15 @@ void OpEmitter::genNamedOperandGetters() { } void OpEmitter::genNamedOperandSetters() { - auto *attrSizedOperands = op.getTrait("OpTrait::AttrSizedOperandSegments"); + auto *attrSizedOperands = + op.getTrait("::mlir::OpTrait::AttrSizedOperandSegments"); for (int i = 0, e = op.getNumOperands(); i != e; ++i) { const auto &operand = op.getOperand(i); if (operand.name.empty()) continue; - auto &m = opClass.newMethod("::mlir::MutableOperandRange", - (operand.name + "Mutable").str()); - auto &body = m.body(); + auto *m = opClass.addMethodAndPrune("::mlir::MutableOperandRange", + (operand.name + "Mutable").str()); + auto &body = m->body(); body << " auto range = getODSOperandIndexAndLength(" << i << ");\n" << " return ::mlir::MutableOperandRange(getOperation(), " "range.first, range.second"; @@ -775,9 +791,10 @@ void OpEmitter::genNamedResultGetters() { // If we have more than one variadic results, we need more complicated logic // to calculate the value range for each result. - const auto *sameVariadicSize = op.getTrait("OpTrait::SameVariadicResultSize"); + const auto *sameVariadicSize = + op.getTrait("::mlir::OpTrait::SameVariadicResultSize"); const auto *attrSizedResults = - op.getTrait("OpTrait::AttrSizedResultSegments"); + op.getTrait("::mlir::OpTrait::AttrSizedResultSegments"); if (numVariadicResults > 1 && !sameVariadicSize && !attrSizedResults) { PrintFatalError(op.getLoc(), "op has multiple variadic results but no " @@ -800,10 +817,11 @@ void OpEmitter::genNamedResultGetters() { numNormalResults, "getOperation()->getNumResults()", attrSizedResults, formatv(opSegmentSizeAttrInitCode, "result_segment_sizes").str(), op.getResults()); - auto &m = opClass.newMethod("::mlir::Operation::result_range", - "getODSResults", "unsigned index"); - m.body() << formatv(valueRangeReturnCode, "getOperation()->result_begin()", - "getODSResultIndexAndLength(index)"); + + auto *m = opClass.addMethodAndPrune("::mlir::Operation::result_range", + "getODSResults", "unsigned", "index"); + m->body() << formatv(valueRangeReturnCode, "getOperation()->result_begin()", + "getODSResultIndexAndLength(index)"); for (int i = 0; i != numResults; ++i) { const auto &result = op.getResult(i); @@ -811,17 +829,17 @@ void OpEmitter::genNamedResultGetters() { continue; if (result.isOptional()) { - auto &m = opClass.newMethod("::mlir::Value", result.name); - m.body() + m = opClass.addMethodAndPrune("::mlir::Value", result.name); + m->body() << " auto results = getODSResults(" << i << ");\n" << " return results.empty() ? 
::mlir::Value() : *results.begin();"; } else if (result.isVariadic()) { - auto &m = - opClass.newMethod("::mlir::Operation::result_range", result.name); - m.body() << " return getODSResults(" << i << ");"; + m = opClass.addMethodAndPrune("::mlir::Operation::result_range", + result.name); + m->body() << " return getODSResults(" << i << ");"; } else { - auto &m = opClass.newMethod("::mlir::Value", result.name); - m.body() << " return *getODSResults(" << i << ").begin();"; + m = opClass.addMethodAndPrune("::mlir::Value", result.name); + m->body() << " return *getODSResults(" << i << ").begin();"; } } } @@ -835,15 +853,15 @@ void OpEmitter::genNamedRegionGetters() { // Generate the accessors for a variadic region. if (region.isVariadic()) { - auto &m = - opClass.newMethod("::mlir::MutableArrayRef<::mlir::Region>", region.name); - m.body() << formatv( + auto *m = opClass.addMethodAndPrune("::mlir::MutableArrayRef<::mlir::Region>", + region.name); + m->body() << formatv( " return this->getOperation()->getRegions().drop_front({0});", i); continue; } - auto &m = opClass.newMethod("::mlir::Region &", region.name); - m.body() << formatv(" return this->getOperation()->getRegion({0});", i); + auto *m = opClass.addMethodAndPrune("::mlir::Region &", region.name); + m->body() << formatv(" return this->getOperation()->getRegion({0});", i); } } @@ -856,16 +874,18 @@ void OpEmitter::genNamedSuccessorGetters() { // Generate the accessors for a variadic successor list. if (successor.isVariadic()) { - auto &m = opClass.newMethod("::mlir::SuccessorRange", successor.name); - m.body() << formatv( + auto *m = + opClass.addMethodAndPrune("::mlir::SuccessorRange", successor.name); + m->body() << formatv( " return {std::next(this->getOperation()->successor_begin(), {0}), " "this->getOperation()->successor_end()};", i); continue; } - auto &m = opClass.newMethod("::mlir::Block *", successor.name); - m.body() << formatv(" return this->getOperation()->getSuccessor({0});", i); + auto *m = opClass.addMethodAndPrune("::mlir::Block *", successor.name); + m->body() << formatv(" return this->getOperation()->getSuccessor({0});", + i); } } @@ -905,14 +925,16 @@ void OpEmitter::genSeparateArgParamBuilder() { // inferring result type. auto emit = [&](AttrParamKind attrType, TypeParamKind paramKind, bool inferType) { - std::string paramList; + llvm::SmallVector<OpMethodParameter, 4> paramList; llvm::SmallVector<std::string, 4> resultNames; buildParamList(paramList, resultNames, paramKind, attrType); - auto &m = - opClass.newMethod("void", "build", paramList, OpMethod::MP_Static); - auto &body = m.body(); - + auto *m = opClass.addMethodAndPrune("void", "build", OpMethod::MP_Static, + std::move(paramList)); + // If the builder is redundant, skip generating the method. + if (!m) + return; + auto &body = m->body(); genCodeForAddingArgAndRegionForBuilder( body, /*isRawValueAttr=*/attrType == AttrParamKind::UnwrappedValue); @@ -967,54 +989,13 @@ void OpEmitter::genSeparateArgParamBuilder() { llvm_unreachable("unhandled TypeParamKind"); }; - // A separate arg param builder method will have a signature which is - // ambiguous with the collective params build method (generated in - // `genCollectiveParamBuilder` function below) if it has a single - // `ArrayRef` parameter for result types and a single `ArrayRef` - // parameter for the operands, no parameters after that, and the collective - // params build method has `attributes` as its last parameter (with - // a default value). This will happen when all of the following are true: - // 1.
[`attributes` as last parameter in collective params build method]: - // getNumVariadicRegions must be 0 (otherwise the collective params build - // method ends with a `numRegions` param, and we don't specify default - // value for attributes). - // 2. [single `ArrayRef` parameter for operands, and no parameters - // after that]: numArgs() must be 1 (if not, each arg gets a separate param - // in the build methods generated here) and the single arg must be a - // non-attribute variadic argument. - // 3. [single `ArrayRef` parameter for result types]: - // 3a. paramKind should be Collective, or - // 3b. paramKind should be Separate and there should be a single variadic - // result - // - // In that case, skip generating such ambiguous build methods here. + // Some of the build methods generated here may be ambiguous, but TableGen's + // ambiguous function detection will elide those ones. for (auto attrType : attrBuilderType) { - // Case 3b above. - if (!(op.hasNoVariadicRegions() && op.hasSingleVariadicArg() && - op.hasSingleVariadicResult())) - emit(attrType, TypeParamKind::Separate, /*inferType=*/false); - if (canInferType(op)) { - // When inferType = true, the generated build method does not have - // result types. If the op has a single variadic arg, then this build - // method will be ambiguous with the collective inferred build method - // generated in `genInferredTypeCollectiveParamBuilder`. If we are going - // to generate that collective inferred method, suppress generating the - // ambiguous build method here. - bool buildMethodAmbiguous = - op.hasSingleVariadicArg() && - shouldGenerateInferredTypeCollectiveParamBuilder(); - if (!buildMethodAmbiguous) - emit(attrType, TypeParamKind::None, /*inferType=*/true); - } - // The separate arg + collective param kind method will be: - // (a) Same as the separate arg + separate param kind method if there is - // only one variadic result. - // (b) Ambiguous with the collective params method under conditions in (3a) - // above. - // In either case, skip generating such build method. - if (!op.hasSingleVariadicResult() && - !(op.hasNoVariadicRegions() && op.hasSingleVariadicArg())) - emit(attrType, TypeParamKind::Collective, /*inferType=*/false); + emit(attrType, TypeParamKind::Separate, /*inferType=*/false); + if (canInferType(op)) + emit(attrType, TypeParamKind::None, /*inferType=*/true); + emit(attrType, TypeParamKind::Collective, /*inferType=*/false); } } @@ -1022,19 +1003,23 @@ void OpEmitter::genUseOperandAsResultTypeCollectiveParamBuilder() { int numResults = op.getNumResults(); // Signature - std::string params = - std::string("::mlir::OpBuilder &odsBuilder, ::mlir::OperationState &") + - builderOpState + - ", ::mlir::ValueRange operands, ::llvm::ArrayRef<::mlir::NamedAttribute> " - "attributes"; - if (op.getNumVariadicRegions()) { - params += ", unsigned numRegions"; - } else { - // Provide default value for `attributes` since it's the last parameter - params += " = {}"; - } - auto &m = opClass.newMethod("void", "build", params, OpMethod::MP_Static); - auto &body = m.body(); + llvm::SmallVector<OpMethodParameter, 4> paramList; + paramList.emplace_back("::mlir::OpBuilder &", "odsBuilder"); + paramList.emplace_back("::mlir::OperationState &", builderOpState); + paramList.emplace_back("::mlir::ValueRange", "operands"); + // Provide default value for `attributes` when it's the last parameter + StringRef attributesDefaultValue = op.getNumVariadicRegions() ?
"" : "{}"; + paramList.emplace_back("::llvm::ArrayRef<::mlir::NamedAttribute>", + "attributes", attributesDefaultValue); + if (op.getNumVariadicRegions()) + paramList.emplace_back("unsigned", "numRegions"); + + auto *m = opClass.addMethodAndPrune("void", "build", OpMethod::MP_Static, + std::move(paramList)); + // If the builder is redundant, skip generating the method + if (!m) + return; + auto &body = m->body(); // Operands body << " " << builderOpState << ".addOperands(operands);\n"; @@ -1056,19 +1041,20 @@ void OpEmitter::genUseOperandAsResultTypeCollectiveParamBuilder() { << llvm::join(resultTypes, ", ") << "});\n\n"; } -bool OpEmitter::shouldGenerateInferredTypeCollectiveParamBuilder() { - return canInferType(op) && op.getNumSuccessors() == 0; -} - void OpEmitter::genInferredTypeCollectiveParamBuilder() { // TODO: Expand to support regions. - std::string params = - std::string("::mlir::OpBuilder &odsBuilder, ::mlir::OperationState &") + - builderOpState + - ", ::mlir::ValueRange operands, ::llvm::ArrayRef<::mlir::NamedAttribute> " - "attributes = {}"; - auto &m = opClass.newMethod("void", "build", params, OpMethod::MP_Static); - auto &body = m.body(); + SmallVector paramList; + paramList.emplace_back("::mlir::OpBuilder &", "odsBuilder"); + paramList.emplace_back("::mlir::OperationState &", builderOpState); + paramList.emplace_back("::mlir::ValueRange", "operands"); + paramList.emplace_back("::llvm::ArrayRef<::mlir::NamedAttribute>", + "attributes", "{}"); + auto *m = opClass.addMethodAndPrune("void", "build", OpMethod::MP_Static, + std::move(paramList)); + // If the builder is redundant, skip generating the method + if (!m) + return; + auto &body = m->body(); int numResults = op.getNumResults(); int numVariadicResults = op.getNumVariableLengthResults(); @@ -1116,12 +1102,17 @@ void OpEmitter::genInferredTypeCollectiveParamBuilder() { } void OpEmitter::genUseOperandAsResultTypeSeparateParamBuilder() { - std::string paramList; + llvm::SmallVector paramList; llvm::SmallVector resultNames; buildParamList(paramList, resultNames, TypeParamKind::None); - auto &m = opClass.newMethod("void", "build", paramList, OpMethod::MP_Static); - genCodeForAddingArgAndRegionForBuilder(m.body()); + auto *m = opClass.addMethodAndPrune("void", "build", OpMethod::MP_Static, + std::move(paramList)); + // If the builder is redundant, skip generating the method + if (!m) + return; + auto &body = m->body(); + genCodeForAddingArgAndRegionForBuilder(body); auto numResults = op.getNumResults(); if (numResults == 0) @@ -1131,20 +1122,26 @@ void OpEmitter::genUseOperandAsResultTypeSeparateParamBuilder() { const char *index = op.getOperand(0).isVariadic() ? 
".front()" : ""; std::string resultType = formatv("{0}{1}.getType()", getArgumentName(op, 0), index).str(); - m.body() << " " << builderOpState << ".addTypes({" << resultType; + body << " " << builderOpState << ".addTypes({" << resultType; for (int i = 1; i != numResults; ++i) - m.body() << ", " << resultType; - m.body() << "});\n\n"; + body << ", " << resultType; + body << "});\n\n"; } void OpEmitter::genUseAttrAsResultTypeBuilder() { - std::string params = - std::string("::mlir::OpBuilder &odsBuilder, ::mlir::OperationState &") + - builderOpState + - ", ::mlir::ValueRange operands, ::llvm::ArrayRef<::mlir::NamedAttribute> " - "attributes"; - auto &m = opClass.newMethod("void", "build", params, OpMethod::MP_Static); - auto &body = m.body(); + SmallVector paramList; + paramList.emplace_back("::mlir::OpBuilder &", "odsBuilder"); + paramList.emplace_back("::mlir::OperationState &", builderOpState); + paramList.emplace_back("::mlir::ValueRange", "operands"); + paramList.emplace_back("::llvm::ArrayRef<::mlir::NamedAttribute>", + "attributes", "{}"); + auto *m = opClass.addMethodAndPrune("void", "build", OpMethod::MP_Static, + std::move(paramList)); + // If the builder is redundant, skip generating the method + if (!m) + return; + + auto &body = m->body(); // Push all result types to the operation state std::string resultType; @@ -1184,11 +1181,12 @@ void OpEmitter::genBuilder() { StringRef body = builderDef->getValueAsString("body"); bool hasBody = !body.empty(); - auto &method = - opClass.newMethod("void", "build", params, OpMethod::MP_Static, - /*declOnly=*/!hasBody); + OpMethod::Property properties = + hasBody ? OpMethod::MP_Static : OpMethod::MP_StaticDeclaration; + auto *method = + opClass.addMethodAndPrune("void", "build", properties, params); if (hasBody) - method.body() << body; + method->body() << body; } } if (op.skipDefaultBuilders()) { @@ -1213,24 +1211,11 @@ void OpEmitter::genBuilder() { // use the first operand or attribute's type as all result types // to facilitate different call patterns. if (op.getNumVariableLengthResults() == 0) { - if (op.getTrait("OpTrait::SameOperandsAndResultType")) { - // If the operation has a single variadic input, then the build method - // generated by `genUseOperandAsResultTypeSeparateParamBuilder` will be - // ambiguous with the one generated by - // `genUseOperandAsResultTypeCollectiveParamBuilder` (they both will have - // a single `ValueRange` argument for operands, and the collective one - // will have a `ArrayRef` argument initialized to empty). - // Suppress such ambiguous build method. - if (!op.hasSingleVariadicArg()) - genUseOperandAsResultTypeSeparateParamBuilder(); - - // The build method generated by the inferred type collective param - // builder and one generated here have the same arguments and hence - // generating both will be ambiguous. Enable just one of them. 
- if (!shouldGenerateInferredTypeCollectiveParamBuilder()) - genUseOperandAsResultTypeCollectiveParamBuilder(); + if (op.getTrait("::mlir::OpTrait::SameOperandsAndResultType")) { + genUseOperandAsResultTypeSeparateParamBuilder(); + genUseOperandAsResultTypeCollectiveParamBuilder(); } - if (op.getTrait("OpTrait::FirstAttrDerivedResultType")) + if (op.getTrait("::mlir::OpTrait::FirstAttrDerivedResultType")) genUseAttrAsResultTypeBuilder(); } } @@ -1243,21 +1228,25 @@ void OpEmitter::genCollectiveParamBuilder() { int numOperands = op.getNumOperands(); int numVariadicOperands = op.getNumVariableLengthOperands(); int numNonVariadicOperands = numOperands - numVariadicOperands; - // Signature - std::string params = - std::string("::mlir::OpBuilder &, ::mlir::OperationState &") + - builderOpState + - ", ::llvm::ArrayRef<::mlir::Type> resultTypes, ::mlir::ValueRange " - "operands, " - "::llvm::ArrayRef<::mlir::NamedAttribute> attributes"; - if (op.getNumVariadicRegions()) { - params += ", unsigned numRegions"; - } else { - // Provide default value for `attributes` since it's the last parameter - params += " = {}"; - } - auto &m = opClass.newMethod("void", "build", params, OpMethod::MP_Static); - auto &body = m.body(); + + SmallVector<OpMethodParameter, 4> paramList; + paramList.emplace_back("::mlir::OpBuilder &", ""); + paramList.emplace_back("::mlir::OperationState &", builderOpState); + paramList.emplace_back("::llvm::ArrayRef<::mlir::Type>", "resultTypes"); + paramList.emplace_back("::mlir::ValueRange", "operands"); + // Provide default value for `attributes` when it's the last parameter + StringRef attributesDefaultValue = op.getNumVariadicRegions() ? "" : "{}"; + paramList.emplace_back("::llvm::ArrayRef<::mlir::NamedAttribute>", + "attributes", attributesDefaultValue); + if (op.getNumVariadicRegions()) + paramList.emplace_back("unsigned", "numRegions"); + + auto *m = opClass.addMethodAndPrune("void", "build", OpMethod::MP_Static, + std::move(paramList)); + // If the builder is redundant, skip generating the method + if (!m) + return; + auto &body = m->body(); // Operands if (numVariadicOperands == 0 || numNonVariadicOperands != 0) @@ -1287,11 +1276,11 @@ void OpEmitter::genCollectiveParamBuilder() { // Generate builder that infers type too. // TODO: Expand to handle regions and successors. - if (shouldGenerateInferredTypeCollectiveParamBuilder()) + if (canInferType(op) && op.getNumSuccessors() == 0) genInferredTypeCollectiveParamBuilder(); } -void OpEmitter::buildParamList(std::string &paramList, +void OpEmitter::buildParamList(SmallVectorImpl<OpMethodParameter> &paramList, SmallVectorImpl<std::string> &resultTypeNames, TypeParamKind typeParamKind, AttrParamKind attrParamKind) { @@ -1299,8 +1288,8 @@ void OpEmitter::buildParamList(std::string &paramList, auto numResults = op.getNumResults(); resultTypeNames.reserve(numResults); - paramList = "::mlir::OpBuilder &odsBuilder, ::mlir::OperationState &"; - paramList.append(builderOpState); + paramList.emplace_back("::mlir::OpBuilder &", "odsBuilder"); + paramList.emplace_back("::mlir::OperationState &", builderOpState); switch (typeParamKind) { case TypeParamKind::None: @@ -1313,19 +1302,18 @@ void OpEmitter::buildParamList(std::string &paramList, if (resultName.empty()) resultName = std::string(formatv("resultType{0}", i)); + StringRef type = result.isVariadic() ?
"::llvm::ArrayRef<::mlir::Type>" + : "::mlir::Type"; + OpMethodParameter::Property properties = OpMethodParameter::PP_None; if (result.isOptional()) - paramList.append(", /*optional*/::mlir::Type "); - else if (result.isVariadic()) - paramList.append(", ::llvm::ArrayRef<::mlir::Type> "); - else - paramList.append(", ::mlir::Type "); - paramList.append(resultName); + properties = OpMethodParameter::PP_Optional; + paramList.emplace_back(type, resultName, properties); resultTypeNames.emplace_back(std::move(resultName)); } } break; case TypeParamKind::Collective: { - paramList.append(", ::llvm::ArrayRef<::mlir::Type> resultTypes"); + paramList.emplace_back("::llvm::ArrayRef<::mlir::Type>", "resultTypes"); resultTypeNames.push_back("resultTypes"); } break; } @@ -1364,64 +1352,64 @@ void OpEmitter::buildParamList(std::string ¶mList, auto argument = op.getArg(i); if (argument.is()) { const auto &operand = op.getOperand(numOperands); + StringRef type = + operand.isVariadic() ? "::mlir::ValueRange" : "::mlir::Value"; + OpMethodParameter::Property properties = OpMethodParameter::PP_None; if (operand.isOptional()) - paramList.append(", /*optional*/::mlir::Value "); - else if (operand.isVariadic()) - paramList.append(", ::mlir::ValueRange "); - else - paramList.append(", ::mlir::Value "); - paramList.append(getArgumentName(op, numOperands)); + properties = OpMethodParameter::PP_Optional; + + paramList.emplace_back(type, getArgumentName(op, numOperands), + properties); ++numOperands; } else { const auto &namedAttr = op.getAttribute(numAttrs); const auto &attr = namedAttr.attr; - paramList.append(", "); + OpMethodParameter::Property properties = OpMethodParameter::PP_None; if (attr.isOptional()) - paramList.append("/*optional*/"); + properties = OpMethodParameter::PP_Optional; + StringRef type; switch (attrParamKind) { case AttrParamKind::WrappedAttr: - paramList.append(std::string(attr.getStorageType())); + type = attr.getStorageType(); break; case AttrParamKind::UnwrappedValue: - if (canUseUnwrappedRawValue(attr)) { - paramList.append(std::string(attr.getReturnType())); - } else { - paramList.append(std::string(attr.getStorageType())); - } + if (canUseUnwrappedRawValue(attr)) + type = attr.getReturnType(); + else + type = attr.getStorageType(); break; } - paramList.append(" "); - paramList.append(std::string(namedAttr.name)); + std::string defaultValue; // Attach default value if requested and possible. if (attrParamKind == AttrParamKind::UnwrappedValue && i >= defaultValuedAttrStartIndex) { bool isString = attr.getReturnType() == "::llvm::StringRef"; - paramList.append(" = "); if (isString) - paramList.append("\""); - paramList.append(std::string(attr.getDefaultValue())); + defaultValue.append("\""); + defaultValue += attr.getDefaultValue(); if (isString) - paramList.append("\""); + defaultValue.append("\""); } + paramList.emplace_back(type, namedAttr.name, defaultValue, properties); ++numAttrs; } } /// Insert parameters for each successor. for (const NamedSuccessor &succ : op.getSuccessors()) { - paramList += (succ.isVariadic() ? ", ::llvm::ArrayRef<::mlir::Block *> " - : ", ::mlir::Block *"); - paramList += succ.name; + StringRef type = succ.isVariadic() ? "::llvm::ArrayRef<::mlir::Block *>" + : "::mlir::Block *"; + paramList.emplace_back(type, succ.name); } /// Insert parameters for variadic regions. 
- for (const NamedRegion ®ion : op.getRegions()) { + for (const NamedRegion ®ion : op.getRegions()) if (region.isVariadic()) - paramList += llvm::formatv(", unsigned {0}Count", region.name).str(); - } + paramList.emplace_back("unsigned", + llvm::formatv("{0}Count", region.name).str()); } void OpEmitter::genCodeForAddingArgAndRegionForBuilder(OpMethodBody &body, @@ -1435,7 +1423,7 @@ void OpEmitter::genCodeForAddingArgAndRegionForBuilder(OpMethodBody &body, } // If the operation has the operand segment size attribute, add it here. - if (op.getTrait("OpTrait::AttrSizedOperandSegments")) { + if (op.getTrait("::mlir::OpTrait::AttrSizedOperandSegments")) { body << " " << builderOpState << ".addAttribute(\"operand_segment_sizes\", " "odsBuilder.getI32VectorAttr({"; @@ -1508,10 +1496,12 @@ void OpEmitter::genCanonicalizerDecls() { if (!def.getValueAsBit("hasCanonicalizer")) return; - const char *const params = - "::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context"; - opClass.newMethod("void", "getCanonicalizationPatterns", params, - OpMethod::MP_Static, /*declOnly=*/true); + SmallVector paramList; + paramList.emplace_back("::mlir::OwningRewritePatternList &", "results"); + paramList.emplace_back("::mlir::MLIRContext *", "context"); + opClass.addMethodAndPrune("void", "getCanonicalizationPatterns", + OpMethod::MP_StaticDeclaration, + std::move(paramList)); } void OpEmitter::genFolderDecls() { @@ -1520,17 +1510,16 @@ void OpEmitter::genFolderDecls() { if (def.getValueAsBit("hasFolder")) { if (hasSingleResult) { - const char *const params = "::llvm::ArrayRef<::mlir::Attribute> operands"; - opClass.newMethod("::mlir::OpFoldResult", "fold", params, - OpMethod::MP_None, - /*declOnly=*/true); + opClass.addMethodAndPrune( + "::mlir::OpFoldResult", "fold", OpMethod::MP_Declaration, + "::llvm::ArrayRef<::mlir::Attribute>", "operands"); } else { - const char *const params = - "::llvm::ArrayRef<::mlir::Attribute> operands, " - "::llvm::SmallVectorImpl<::mlir::OpFoldResult> &results"; - opClass.newMethod("::mlir::LogicalResult", "fold", params, - OpMethod::MP_None, - /*declOnly=*/true); + SmallVector paramList; + paramList.emplace_back("::llvm::ArrayRef<::mlir::Attribute>", "operands"); + paramList.emplace_back("::llvm::SmallVectorImpl<::mlir::OpFoldResult> &", + "results"); + opClass.addMethodAndPrune("::mlir::LogicalResult", "fold", + OpMethod::MP_Declaration, std::move(paramList)); } } } @@ -1554,16 +1543,14 @@ void OpEmitter::genOpInterfaceMethod(const tblgen::InterfaceOpTrait *opTrait) { !alwaysDeclaredMethods.count(method.getName())) continue; - std::string args; - llvm::raw_string_ostream os(args); - interleaveComma(method.getArguments(), os, - [&](const InterfaceMethod::Argument &arg) { - os << arg.type << " " << arg.name; - }); - opClass.newMethod(method.getReturnType(), method.getName(), os.str(), - method.isStatic() ? OpMethod::MP_Static - : OpMethod::MP_None, - /*declOnly=*/true); + SmallVector paramList; + for (const InterfaceMethod::Argument &arg : method.getArguments()) + paramList.emplace_back(arg.type, arg.name); + + auto properties = method.isStatic() ? 
OpMethod::MP_StaticDeclaration + : OpMethod::MP_Declaration; + opClass.addMethodAndPrune(method.getReturnType(), method.getName(), + properties, std::move(paramList)); } } @@ -1622,15 +1609,14 @@ void OpEmitter::genSideEffectInterfaceMethods() { resolveDecorators(op.getResultDecorators(i), i, EffectKind::Result); for (auto &it : interfaceEffects) { - auto effectsParam = - llvm::formatv("::mlir::SmallVectorImpl<::mlir::SideEffects::" - "EffectInstance<{0}>> &effects", - it.first()) - .str(); - // Generate the 'getEffects' method. - auto &getEffects = opClass.newMethod("void", "getEffects", effectsParam); - auto &body = getEffects.body(); + std::string type = llvm::formatv("::mlir::SmallVectorImpl<::mlir::" + "SideEffects::EffectInstance<{0}>> &", + it.first()) + .str(); + auto *getEffects = + opClass.addMethodAndPrune("void", "getEffects", type, "effects"); + auto &body = getEffects->body(); // Add effect instances for each of the locations marked on the operation. for (auto &location : it.second) { @@ -1655,21 +1641,24 @@ void OpEmitter::genTypeInterfaceMethods() { if (!op.allResultTypesKnown()) return; - auto &method = opClass.newMethod( - "::mlir::LogicalResult", "inferReturnTypes", - "::mlir::MLIRContext* context, " - "::llvm::Optional<::mlir::Location> location, " - "::mlir::ValueRange operands, ::mlir::DictionaryAttr attributes, " - "::mlir::RegionRange regions, " - "::llvm::SmallVectorImpl<::mlir::Type>& inferredReturnTypes", - OpMethod::MP_Static, - /*declOnly=*/false); - auto &os = method.body(); - os << " inferredReturnTypes.resize(" << op.getNumResults() << ");\n"; + SmallVector paramList; + paramList.emplace_back("::mlir::MLIRContext *", "context"); + paramList.emplace_back("::llvm::Optional<::mlir::Location>", "location"); + paramList.emplace_back("::mlir::ValueRange", "operands"); + paramList.emplace_back("::mlir::DictionaryAttr", "attributes"); + paramList.emplace_back("::mlir::RegionRange", "regions"); + paramList.emplace_back("::llvm::SmallVectorImpl<::mlir::Type>&", + "inferredReturnTypes"); + auto *method = + opClass.addMethodAndPrune("::mlir::LogicalResult", "inferReturnTypes", + OpMethod::MP_Static, std::move(paramList)); + + auto &body = method->body(); + body << " inferredReturnTypes.resize(" << op.getNumResults() << ");\n"; FmtContext fctx; fctx.withBuilder("odsBuilder"); - os << " ::mlir::Builder odsBuilder(context);\n"; + body << " ::mlir::Builder odsBuilder(context);\n"; auto emitType = [&](const tblgen::Operator::ArgOrType &type) -> OpMethodBody & { @@ -1678,24 +1667,24 @@ void OpEmitter::genTypeInterfaceMethods() { assert(!op.getArg(argIndex).is()); auto arg = op.getArgToOperandOrAttribute(argIndex); if (arg.kind() == Operator::OperandOrAttribute::Kind::Operand) - return os << "operands[" << arg.operandOrAttributeIndex() + return body << "operands[" << arg.operandOrAttributeIndex() + << "].getType()"; + return body << "attributes[" << arg.operandOrAttributeIndex() << "].getType()"; - return os << "attributes[" << arg.operandOrAttributeIndex() - << "].getType()"; } else { - return os << tgfmt(*type.getType().getBuilderCall(), &fctx); + return body << tgfmt(*type.getType().getBuilderCall(), &fctx); } }; for (int i = 0, e = op.getNumResults(); i != e; ++i) { - os << " inferredReturnTypes[" << i << "] = "; + body << " inferredReturnTypes[" << i << "] = "; auto types = op.getSameTypeAsResult(i); emitType(types[0]) << ";\n"; if (types.size() == 1) continue; // TODO: We could verify equality here, but skipping that for verification. 
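// Editor's note: for a single-result op whose result type mirrors its first
// operand, the code emitted by this routine amounts to roughly the following
// (an illustrative shape assembled from the parameter list above, not
// verbatim tblgen output):
//
//   static ::mlir::LogicalResult inferReturnTypes(
//       ::mlir::MLIRContext *context,
//       ::llvm::Optional<::mlir::Location> location,
//       ::mlir::ValueRange operands, ::mlir::DictionaryAttr attributes,
//       ::mlir::RegionRange regions,
//       ::llvm::SmallVectorImpl<::mlir::Type> &inferredReturnTypes) {
//     inferredReturnTypes.resize(1);
//     inferredReturnTypes[0] = operands[0].getType();
//     return ::mlir::success();
//   }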
} - os << " return success();"; + body << " return ::mlir::success();"; } void OpEmitter::genParser() { @@ -1703,14 +1692,17 @@ void OpEmitter::genParser() { hasStringAttribute(def, "assemblyFormat")) return; - auto &method = opClass.newMethod( - "::mlir::ParseResult", "parse", - "::mlir::OpAsmParser &parser, ::mlir::OperationState &result", - OpMethod::MP_Static); + SmallVector paramList; + paramList.emplace_back("::mlir::OpAsmParser &", "parser"); + paramList.emplace_back("::mlir::OperationState &", "result"); + auto *method = + opClass.addMethodAndPrune("::mlir::ParseResult", "parse", + OpMethod::MP_Static, std::move(paramList)); + FmtContext fctx; fctx.addSubst("cppClass", opClass.getClassName()); auto parser = def.getValueAsString("parser").ltrim().rtrim(" \t\v\f\r"); - method.body() << " " << tgfmt(parser, &fctx); + method->body() << " " << tgfmt(parser, &fctx); } void OpEmitter::genPrinter() { @@ -1722,20 +1714,20 @@ void OpEmitter::genPrinter() { if (!codeInit) return; - auto &method = opClass.newMethod("void", "print", "::mlir::OpAsmPrinter &p"); + auto *method = + opClass.addMethodAndPrune("void", "print", "::mlir::OpAsmPrinter &", "p"); FmtContext fctx; fctx.addSubst("cppClass", opClass.getClassName()); auto printer = codeInit->getValue().ltrim().rtrim(" \t\v\f\r"); - method.body() << " " << tgfmt(printer, &fctx); + method->body() << " " << tgfmt(printer, &fctx); } void OpEmitter::genVerifier() { - auto &method = - opClass.newMethod("::mlir::LogicalResult", "verify", /*params=*/""); - auto &body = method.body(); + auto *method = opClass.addMethodAndPrune("::mlir::LogicalResult", "verify"); + auto &body = method->body(); body << " if (failed(" << op.getAdaptorName() << "(*this).verify(this->getLoc()))) " - << "return failure();\n"; + << "return ::mlir::failure();\n"; auto *valueInit = def.getValueInit("verifier"); CodeInit *codeInit = dyn_cast(valueInit); @@ -1904,21 +1896,21 @@ static void addSizeCountTrait(OpClass &opClass, StringRef traitKind, int numTotal, int numVariadic) { if (numVariadic != 0) { if (numTotal == numVariadic) - opClass.addTrait("OpTrait::Variadic" + traitKind + "s"); + opClass.addTrait("::mlir::OpTrait::Variadic" + traitKind + "s"); else - opClass.addTrait("OpTrait::AtLeastN" + traitKind + "s<" + + opClass.addTrait("::mlir::OpTrait::AtLeastN" + traitKind + "s<" + Twine(numTotal - numVariadic) + ">::Impl"); return; } switch (numTotal) { case 0: - opClass.addTrait("OpTrait::Zero" + traitKind); + opClass.addTrait("::mlir::OpTrait::Zero" + traitKind); break; case 1: - opClass.addTrait("OpTrait::One" + traitKind); + opClass.addTrait("::mlir::OpTrait::One" + traitKind); break; default: - opClass.addTrait("OpTrait::N" + traitKind + "s<" + Twine(numTotal) + + opClass.addTrait("::mlir::OpTrait::N" + traitKind + "s<" + Twine(numTotal) + ">::Impl"); break; } @@ -1947,20 +1939,21 @@ void OpEmitter::genTraits() { // Add operand size trait. 
if (numVariadicOperands != 0) { if (numOperands == numVariadicOperands) - opClass.addTrait("OpTrait::VariadicOperands"); + opClass.addTrait("::mlir::OpTrait::VariadicOperands"); else - opClass.addTrait("OpTrait::AtLeastNOperands<" + + opClass.addTrait("::mlir::OpTrait::AtLeastNOperands<" + Twine(numOperands - numVariadicOperands) + ">::Impl"); } else { switch (numOperands) { case 0: - opClass.addTrait("OpTrait::ZeroOperands"); + opClass.addTrait("::mlir::OpTrait::ZeroOperands"); break; case 1: - opClass.addTrait("OpTrait::OneOperand"); + opClass.addTrait("::mlir::OpTrait::OneOperand"); break; default: - opClass.addTrait("OpTrait::NOperands<" + Twine(numOperands) + ">::Impl"); + opClass.addTrait("::mlir::OpTrait::NOperands<" + Twine(numOperands) + + ">::Impl"); break; } } @@ -1975,9 +1968,9 @@ void OpEmitter::genTraits() { } void OpEmitter::genOpNameGetter() { - auto &method = opClass.newMethod("::llvm::StringRef", "getOperationName", - /*params=*/"", OpMethod::MP_Static); - method.body() << " return \"" << op.getOperationName() << "\";\n"; + auto *method = opClass.addMethodAndPrune( + "::llvm::StringRef", "getOperationName", OpMethod::MP_Static); + method->body() << " return \"" << op.getOperationName() << "\";\n"; } void OpEmitter::genOpAsmInterface() { @@ -2001,9 +1994,9 @@ void OpEmitter::genOpAsmInterface() { opClass.addTrait("::mlir::OpAsmOpInterface::Trait"); // Generate the right accessor for the number of results. - auto &method = opClass.newMethod("void", "getAsmResultNames", - "OpAsmSetValueNameFn setNameFn"); - auto &body = method.body(); + auto *method = opClass.addMethodAndPrune("void", "getAsmResultNames", + "OpAsmSetValueNameFn", "setNameFn"); + auto &body = method->body(); for (int i = 0; i != numResults; ++i) { body << " auto resultGroup" << i << " = getODSResults(" << i << ");\n" << " if (!llvm::empty(resultGroup" << i << "))\n" @@ -2042,24 +2035,25 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(const Operator &op) adaptor.newField("::mlir::ValueRange", "odsOperands"); adaptor.newField("::mlir::DictionaryAttr", "odsAttrs"); const auto *attrSizedOperands = - op.getTrait("OpTrait::AttrSizedOperandSegments"); + op.getTrait("::mlir::OpTrait::AttrSizedOperandSegments"); { - auto &constructor = adaptor.newConstructor( - attrSizedOperands - ? "::mlir::ValueRange values, ::mlir::DictionaryAttr attrs" - : "::mlir::ValueRange values, ::mlir::DictionaryAttr attrs = " - "nullptr"); - constructor.addMemberInitializer("odsOperands", "values"); - constructor.addMemberInitializer("odsAttrs", "attrs"); + SmallVector paramList; + paramList.emplace_back("::mlir::ValueRange", "values"); + paramList.emplace_back("::mlir::DictionaryAttr", "attrs", + attrSizedOperands ? 
"" : "nullptr"); + auto *constructor = adaptor.addConstructorAndPrune(std::move(paramList)); + + constructor->addMemberInitializer("odsOperands", "values"); + constructor->addMemberInitializer("odsAttrs", "attrs"); } { - auto &constructor = adaptor.newConstructor( - llvm::formatv("{0}& op", op.getCppClassName()).str()); - constructor.addMemberInitializer("odsOperands", - "op.getOperation()->getOperands()"); - constructor.addMemberInitializer("odsAttrs", - "op.getOperation()->getAttrDictionary()"); + auto *constructor = adaptor.addConstructorAndPrune( + llvm::formatv("{0}&", op.getCppClassName()).str(), "op"); + constructor->addMemberInitializer("odsOperands", + "op.getOperation()->getOperands()"); + constructor->addMemberInitializer("odsAttrs", + "op.getOperation()->getAttrDictionary()"); } std::string sizeAttrInit = @@ -2074,7 +2068,7 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(const Operator &op) fctx.withBuilder("::mlir::Builder(odsAttrs.getContext())"); auto emitAttr = [&](StringRef name, Attribute attr) { - auto &body = adaptor.newMethod(attr.getStorageType(), name).body(); + auto &body = adaptor.addMethodAndPrune(attr.getStorageType(), name)->body(); body << " assert(odsAttrs && \"no attributes when constructing adapter\");" << "\n " << attr.getStorageType() << " attr = " << "odsAttrs.get(\"" << name << "\")."; @@ -2107,9 +2101,9 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(const Operator &op) } void OpOperandAdaptorEmitter::addVerification() { - auto &method = adaptor.newMethod("::mlir::LogicalResult", "verify", - /*params=*/"::mlir::Location loc"); - auto &body = method.body(); + auto *method = adaptor.addMethodAndPrune("::mlir::LogicalResult", "verify", + "::mlir::Location", "loc"); + auto &body = method->body(); const char *checkAttrSizedValueSegmentsCode = R"( { @@ -2125,11 +2119,11 @@ void OpOperandAdaptorEmitter::addVerification() { // getODSOperands()/getODSResults() in the rest of the verifier. 
for (auto &trait : op.getTraits()) { if (auto *t = dyn_cast(&trait)) { - if (t->getTrait() == "OpTrait::AttrSizedOperandSegments") { + if (t->getTrait() == "::mlir::OpTrait::AttrSizedOperandSegments") { body << formatv(checkAttrSizedValueSegmentsCode, "operand_segment_sizes", op.getNumOperands(), "operand"); - } else if (t->getTrait() == "OpTrait::AttrSizedResultSegments") { + } else if (t->getTrait() == "::mlir::OpTrait::AttrSizedResultSegments") { body << formatv(checkAttrSizedValueSegmentsCode, "result_segment_sizes", op.getNumResults(), "result"); } @@ -2144,7 +2138,7 @@ void OpOperandAdaptorEmitter::addVerification() { "' op \"", /*emitVerificationRequiringOp*/ false, verifyCtx, body); - body << " return success();"; + body << " return ::mlir::success();"; } void OpOperandAdaptorEmitter::emitDecl(const Operator &op, raw_ostream &os) { @@ -2165,6 +2159,7 @@ static void emitOpClasses(const std::vector &defs, raw_ostream &os, os << "#undef GET_OP_FWD_DEFINES\n"; for (auto *def : defs) { Operator op(*def); + Operator::NamespaceEmitter emitter(os, op); os << "class " << op.getCppClassName() << ";\n"; } os << "#endif\n\n"; @@ -2173,6 +2168,7 @@ static void emitOpClasses(const std::vector &defs, raw_ostream &os, IfDefScope scope("GET_OP_CLASSES", os); for (auto *def : defs) { Operator op(*def); + Operator::NamespaceEmitter emitter(os, op); if (emitDecl) { os << formatv(opCommentHeader, op.getQualCppClassName(), "declarations"); OpOperandAdaptorEmitter::emitDecl(op, os); diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp index 1542e9c55e41c..01877855802d4 100644 --- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp +++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp @@ -439,14 +439,14 @@ static bool shouldFormatSymbolNameAttr(const NamedAttribute *attr) { /// {1}: The type for the attribute. const char *const attrParserCode = R"( if (parser.parseAttribute({0}Attr{1}, "{0}", result.attributes)) - return failure(); + return ::mlir::failure(); )"; const char *const optionalAttrParserCode = R"( { ::mlir::OptionalParseResult parseResult = parser.parseOptionalAttribute({0}Attr{1}, "{0}", result.attributes); if (parseResult.hasValue() && failed(*parseResult)) - return failure(); + return ::mlir::failure(); } )"; @@ -455,7 +455,7 @@ const char *const optionalAttrParserCode = R"( /// {0}: The name of the attribute. 
const char *const symbolNameAttrParserCode = R"( if (parser.parseSymbolName({0}Attr, "{0}", result.attributes)) - return failure(); + return ::mlir::failure(); )"; const char *const optionalSymbolNameAttrParserCode = R"( // Parsing an optional symbol name doesn't fail, so no need to check the @@ -476,7 +476,7 @@ const char *const enumAttrParserCode = R"( auto loc = parser.getCurrentLocation(); if (parser.parseAttribute(attrVal, parser.getBuilder().getNoneType(), "{0}", attrStorage)) - return failure(); + return ::mlir::failure(); auto attrOptional = {1}::{2}(attrVal.getValue()); if (!attrOptional) @@ -498,7 +498,7 @@ const char *const optionalEnumAttrParserCode = R"( "{0}", attrStorage); if (parseResult.hasValue()) { if (failed(*parseResult)) - return failure(); + return ::mlir::failure(); auto attrOptional = {1}::{2}(attrVal.getValue()); if (!attrOptional) @@ -517,7 +517,7 @@ const char *const optionalEnumAttrParserCode = R"( const char *const variadicOperandParserCode = R"( {0}OperandsLoc = parser.getCurrentLocation(); if (parser.parseOperandList({0}Operands)) - return failure(); + return ::mlir::failure(); )"; const char *const optionalOperandParserCode = R"( { @@ -527,7 +527,7 @@ const char *const optionalOperandParserCode = R"( parser.parseOptionalOperand(operand); if (parseResult.hasValue()) { if (failed(*parseResult)) - return failure(); + return ::mlir::failure(); {0}Operands.push_back(operand); } } @@ -535,7 +535,7 @@ const char *const optionalOperandParserCode = R"( const char *const operandParserCode = R"( {0}OperandsLoc = parser.getCurrentLocation(); if (parser.parseOperand({0}RawOperands[0])) - return failure(); + return ::mlir::failure(); )"; /// The code snippet used to generate a parser call for a type list. @@ -543,7 +543,7 @@ const char *const operandParserCode = R"( /// {0}: The name for the type list. const char *const variadicTypeParserCode = R"( if (parser.parseTypeList({0}Types)) - return failure(); + return ::mlir::failure(); )"; const char *const optionalTypeParserCode = R"( { @@ -552,14 +552,14 @@ const char *const optionalTypeParserCode = R"( parser.parseOptionalType(optionalType); if (parseResult.hasValue()) { if (failed(*parseResult)) - return failure(); + return ::mlir::failure(); {0}Types.push_back(optionalType); } } )"; const char *const typeParserCode = R"( if (parser.parseType({0}RawTypes[0])) - return failure(); + return ::mlir::failure(); )"; /// The code snippet used to generate a parser call for a functional type. @@ -569,7 +569,7 @@ const char *const typeParserCode = R"( const char *const functionalTypeParserCode = R"( ::mlir::FunctionType {0}__{1}_functionType; if (parser.parseType({0}__{1}_functionType)) - return failure(); + return ::mlir::failure(); {0}Types = {0}__{1}_functionType.getInputs(); {1}Types = {0}__{1}_functionType.getResults(); )"; @@ -583,14 +583,14 @@ const char *regionListParserCode = R"( auto firstRegionResult = parser.parseOptionalRegion(region); if (firstRegionResult.hasValue()) { if (failed(*firstRegionResult)) - return failure(); + return ::mlir::failure(); {0}Regions.emplace_back(std::move(region)); // Parse any trailing regions. while (succeeded(parser.parseOptionalComma())) { region = std::make_unique<::mlir::Region>(); if (parser.parseRegion(*region)) - return failure(); + return ::mlir::failure(); {0}Regions.emplace_back(std::move(region)); } } @@ -610,7 +610,7 @@ const char *regionListEnsureTerminatorParserCode = R"( /// {0}: The name of the region. 
const char *optionalRegionParserCode = R"( if (parser.parseOptionalRegion(*{0}Region)) - return failure(); + return ::mlir::failure(); )"; /// The code snippet used to generate a parser call for a region. @@ -618,7 +618,7 @@ const char *optionalRegionParserCode = R"( /// {0}: The name of the region. const char *regionParserCode = R"( if (parser.parseRegion(*{0}Region)) - return failure(); + return ::mlir::failure(); )"; /// The code snippet used to ensure a region has a terminator. @@ -637,13 +637,13 @@ const char *successorListParserCode = R"( auto firstSucc = parser.parseOptionalSuccessor(succ); if (firstSucc.hasValue()) { if (failed(*firstSucc)) - return failure(); + return ::mlir::failure(); {0}Successors.emplace_back(succ); // Parse any trailing successors. while (succeeded(parser.parseOptionalComma())) { if (parser.parseSuccessor(succ)) - return failure(); + return ::mlir::failure(); {0}Successors.emplace_back(succ); } } @@ -655,7 +655,7 @@ const char *successorListParserCode = R"( /// {0}: The name of the successor. const char *successorParserCode = R"( if (parser.parseSuccessor({0}Successor)) - return failure(); + return ::mlir::failure(); )"; namespace { @@ -889,7 +889,7 @@ static void genCustomDirectiveParser(CustomDirective *dir, OpMethodBody &body) { genCustomParameterParser(param, body); body << "))\n" - << " return failure();\n"; + << " return ::mlir::failure();\n"; // After parsing, add handling for any of the optional constructs. for (Element ¶m : dir->getArguments()) { @@ -922,11 +922,14 @@ static void genCustomDirectiveParser(CustomDirective *dir, OpMethodBody &body) { } void OperationFormat::genParser(Operator &op, OpClass &opClass) { - auto &method = opClass.newMethod( - "::mlir::ParseResult", "parse", - "::mlir::OpAsmParser &parser, ::mlir::OperationState &result", - OpMethod::MP_Static); - auto &body = method.body(); + llvm::SmallVector paramList; + paramList.emplace_back("::mlir::OpAsmParser &", "parser"); + paramList.emplace_back("::mlir::OperationState &", "result"); + + auto *method = + opClass.addMethodAndPrune("::mlir::ParseResult", "parse", + OpMethod::MP_Static, std::move(paramList)); + auto &body = method->body(); // Generate variables to store the operands and type within the format. This // allows for referencing these variables in the presence of optional @@ -949,7 +952,7 @@ void OperationFormat::genParser(Operator &op, OpClass &opClass) { genParserSuccessorResolution(op, body); genParserVariadicSegmentResolution(op, body); - body << " return success();\n"; + body << " return ::mlir::success();\n"; } void OperationFormat::genElementParser(Element *element, OpMethodBody &body, @@ -1007,7 +1010,7 @@ void OperationFormat::genElementParser(Element *element, OpMethodBody &body, } else if (LiteralElement *literal = dyn_cast(element)) { body << " if (parser.parse"; genLiteralParser(literal->getLiteral(), body); - body << ")\n return failure();\n"; + body << ")\n return ::mlir::failure();\n"; /// Arguments. } else if (auto *attr = dyn_cast(element)) { @@ -1081,14 +1084,14 @@ void OperationFormat::genElementParser(Element *element, OpMethodBody &body, body << " if (parser.parseOptionalAttrDict" << (attrDict->isWithKeyword() ? 
"WithKeyword" : "") << "(result.attributes))\n" - << " return failure();\n"; + << " return ::mlir::failure();\n"; } else if (auto *customDir = dyn_cast(element)) { genCustomDirectiveParser(customDir, body); } else if (isa(element)) { body << " ::llvm::SMLoc allOperandLoc = parser.getCurrentLocation();\n" << " if (parser.parseOperandList(allOperands))\n" - << " return failure();\n"; + << " return ::mlir::failure();\n"; } else if (isa(element)) { body << llvm::formatv(regionListParserCode, "full"); @@ -1197,7 +1200,7 @@ void OperationFormat::genParserTypeResolution(Operator &op, if (allOperands) { body << " if (parser.resolveOperands(allOperands, allOperandTypes, " "allOperandLoc, result.operands))\n" - " return failure();\n"; + " return ::mlir::failure();\n"; return; } @@ -1214,7 +1217,7 @@ void OperationFormat::genParserTypeResolution(Operator &op, body << op.operand_begin()->name << "Operands"; } body << ", allOperandTypes, parser.getNameLoc(), result.operands))\n" - << " return failure();\n"; + << " return ::mlir::failure();\n"; return; } // Handle the case where all of the operands were grouped together. @@ -1238,7 +1241,7 @@ void OperationFormat::genParserTypeResolution(Operator &op, } body << ", allOperandLoc, result.operands))\n" - << " return failure();\n"; + << " return ::mlir::failure();\n"; return; } @@ -1270,7 +1273,7 @@ void OperationFormat::genParserTypeResolution(Operator &op, // overload. if (verifyOperandAndTypeSize) body << ", " << operand.name << "OperandsLoc"; - body << ", result.operands))\n return failure();\n"; + body << ", result.operands))\n return ::mlir::failure();\n"; } } @@ -1314,7 +1317,8 @@ void OperationFormat::genParserSuccessorResolution(Operator &op, void OperationFormat::genParserVariadicSegmentResolution(Operator &op, OpMethodBody &body) { - if (!allOperands && op.getTrait("OpTrait::AttrSizedOperandSegments")) { + if (!allOperands && + op.getTrait("::mlir::OpTrait::AttrSizedOperandSegments")) { body << " result.addAttribute(\"operand_segment_sizes\", " << "parser.getBuilder().getI32VectorAttr({"; auto interleaveFn = [&](const NamedTypeConstraint &operand) { @@ -1328,7 +1332,8 @@ void OperationFormat::genParserVariadicSegmentResolution(Operator &op, body << "}));\n"; } - if (!allResultTypes && op.getTrait("OpTrait::AttrSizedResultSegments")) { + if (!allResultTypes && + op.getTrait("::mlir::OpTrait::AttrSizedResultSegments")) { body << " result.addAttribute(\"result_segment_sizes\", " << "parser.getBuilder().getI32VectorAttr({"; auto interleaveFn = [&](const NamedTypeConstraint &result) { @@ -1369,9 +1374,11 @@ static void genAttrDictPrinter(OperationFormat &fmt, Operator &op, body << " p.printOptionalAttrDict" << (withKeyword ? "WithKeyword" : "") << "(getAttrs(), /*elidedAttrs=*/{"; // Elide the variadic segment size attributes if necessary. 
- if (!fmt.allOperands && op.getTrait("OpTrait::AttrSizedOperandSegments")) + if (!fmt.allOperands && + op.getTrait("::mlir::OpTrait::AttrSizedOperandSegments")) body << "\"operand_segment_sizes\", "; - if (!fmt.allResultTypes && op.getTrait("OpTrait::AttrSizedResultSegments")) + if (!fmt.allResultTypes && + op.getTrait("::mlir::OpTrait::AttrSizedResultSegments")) body << "\"result_segment_sizes\", "; llvm::interleaveComma( fmt.usedAttributes, body, @@ -1607,8 +1614,9 @@ void OperationFormat::genElementPrinter(Element *element, OpMethodBody &body, } void OperationFormat::genPrinter(Operator &op, OpClass &opClass) { - auto &method = opClass.newMethod("void", "print", "OpAsmPrinter &p"); - auto &body = method.body(); + auto *method = + opClass.addMethodAndPrune("void", "print", "::mlir::OpAsmPrinter &p"); + auto &body = method->body(); // Emit the operation name, trimming the prefix if this is the standard // dialect. @@ -2004,16 +2012,16 @@ class FormatParser { if (curToken.getKind() != kind) return emitError(curToken.getLoc(), msg); consumeToken(); - return success(); + return ::mlir::success(); } LogicalResult emitError(llvm::SMLoc loc, const Twine &msg) { lexer.emitError(loc, msg); - return failure(); + return ::mlir::failure(); } LogicalResult emitErrorAndNote(llvm::SMLoc loc, const Twine &msg, const Twine ¬e) { lexer.emitErrorAndNote(loc, msg, note); - return failure(); + return ::mlir::failure(); } //===--------------------------------------------------------------------===// @@ -2045,7 +2053,7 @@ LogicalResult FormatParser::parse() { while (curToken.getKind() != Token::eof) { std::unique_ptr element; if (failed(parseElement(element, /*isTopLevel=*/true))) - return failure(); + return ::mlir::failure(); fmt.elements.push_back(std::move(element)); } @@ -2075,11 +2083,11 @@ LogicalResult FormatParser::parse() { failed(verifyResults(loc, variableTyResolver)) || failed(verifyOperands(loc, variableTyResolver)) || failed(verifyRegions(loc)) || failed(verifySuccessors(loc))) - return failure(); + return ::mlir::failure(); // Collect the set of used attributes in the format. fmt.usedAttributes = seenAttrs.takeVector(); - return success(); + return ::mlir::success(); } LogicalResult FormatParser::verifyAttributes(llvm::SMLoc loc) { @@ -2093,8 +2101,8 @@ LogicalResult FormatParser::verifyAttributes(llvm::SMLoc loc) { iteratorStack.emplace_back(fmt.elements.begin(), fmt.elements.end()); while (!iteratorStack.empty()) if (failed(verifyAttributes(loc, iteratorStack))) - return failure(); - return success(); + return ::mlir::failure(); + return ::mlir::success(); } /// Verify the attribute elements at the back of the given stack of iterators. 
LogicalResult FormatParser::verifyAttributes( @@ -2109,7 +2117,7 @@ LogicalResult FormatParser::verifyAttributes( if (auto *optional = dyn_cast(element)) { auto elements = optional->getElements(); iteratorStack.emplace_back(elements.begin(), elements.end()); - return success(); + return ::mlir::success(); } // We are checking for an attribute element followed by a `:`, so there is @@ -2145,7 +2153,7 @@ LogicalResult FormatParser::verifyAttributes( } } iteratorStack.pop_back(); - return success(); + return ::mlir::success(); } LogicalResult FormatParser::verifyOperands( @@ -2193,13 +2201,13 @@ LogicalResult FormatParser::verifyOperands( auto it = buildableTypes.insert({*builder, buildableTypes.size()}); fmt.operandTypes[i].setBuilderIdx(it.first->second); } - return success(); + return ::mlir::success(); } LogicalResult FormatParser::verifyRegions(llvm::SMLoc loc) { // Check that all of the regions are within the format. if (hasAllRegions) - return success(); + return ::mlir::success(); for (unsigned i = 0, e = op.getNumRegions(); i != e; ++i) { const NamedRegion ®ion = op.getRegion(i); @@ -2211,7 +2219,7 @@ LogicalResult FormatParser::verifyRegions(llvm::SMLoc loc) { "' directive to the custom assembly format"); } } - return success(); + return ::mlir::success(); } LogicalResult FormatParser::verifyResults( @@ -2219,7 +2227,7 @@ LogicalResult FormatParser::verifyResults( llvm::StringMap &variableTyResolver) { // If we format all of the types together, there is nothing to check. if (fmt.allResultTypes) - return success(); + return ::mlir::success(); // Check that all of the result types can be inferred. auto &buildableTypes = fmt.buildableTypes; @@ -2252,13 +2260,13 @@ LogicalResult FormatParser::verifyResults( auto it = buildableTypes.insert({*builder, buildableTypes.size()}); fmt.resultTypes[i].setBuilderIdx(it.first->second); } - return success(); + return ::mlir::success(); } LogicalResult FormatParser::verifySuccessors(llvm::SMLoc loc) { // Check that all of the successors are within the format. if (hasAllSuccessors) - return success(); + return ::mlir::success(); for (unsigned i = 0, e = op.getNumSuccessors(); i != e; ++i) { const NamedSuccessor &successor = op.getSuccessor(i); @@ -2270,7 +2278,7 @@ LogicalResult FormatParser::verifySuccessors(llvm::SMLoc loc) { "' directive to the custom assembly format"); } } - return success(); + return ::mlir::success(); } void FormatParser::handleAllTypesMatchConstraint( @@ -2368,7 +2376,7 @@ LogicalResult FormatParser::parseVariable(std::unique_ptr &element, if (isTopLevel && !seenAttrs.insert(attr)) return emitError(loc, "attribute '" + name + "' is already bound"); element = std::make_unique(attr); - return success(); + return ::mlir::success(); } /// Operands if (const NamedTypeConstraint *operand = findArg(op.getOperands(), name)) { @@ -2377,7 +2385,7 @@ LogicalResult FormatParser::parseVariable(std::unique_ptr &element, return emitError(loc, "operand '" + name + "' is already bound"); } element = std::make_unique(operand); - return success(); + return ::mlir::success(); } /// Regions if (const NamedRegion *region = findArg(op.getRegions(), name)) { @@ -2386,14 +2394,14 @@ LogicalResult FormatParser::parseVariable(std::unique_ptr &element, if (hasAllRegions || !seenRegions.insert(region).second) return emitError(loc, "region '" + name + "' is already bound"); element = std::make_unique(region); - return success(); + return ::mlir::success(); } /// Results. 
if (const auto *result = findArg(op.getResults(), name)) { if (isTopLevel) return emitError(loc, "results can not be used at the top level"); element = std::make_unique(result); - return success(); + return ::mlir::success(); } /// Successors. if (const auto *successor = findArg(op.getSuccessors(), name)) { @@ -2402,7 +2410,7 @@ LogicalResult FormatParser::parseVariable(std::unique_ptr &element, if (hasAllSuccessors || !seenSuccessors.insert(successor).second) return emitError(loc, "successor '" + name + "' is already bound"); element = std::make_unique(successor); - return success(); + return ::mlir::success(); } return emitError(loc, "expected variable to refer to an argument, region, " "result, or successor"); @@ -2450,7 +2458,7 @@ LogicalResult FormatParser::parseLiteral(std::unique_ptr &element) { return emitError(literalTok.getLoc(), "expected valid literal"); element = std::make_unique(value); - return success(); + return ::mlir::success(); } LogicalResult FormatParser::parseOptional(std::unique_ptr &element, @@ -2467,11 +2475,11 @@ LogicalResult FormatParser::parseOptional(std::unique_ptr &element, Optional anchorIdx; do { if (failed(parseOptionalChildElement(elements, seenVariables, anchorIdx))) - return failure(); + return ::mlir::failure(); } while (curToken.getKind() != Token::r_paren); consumeToken(); if (failed(parseToken(Token::question, "expected '?' after optional group"))) - return failure(); + return ::mlir::failure(); // The optional group is required to have an anchor. if (!anchorIdx) @@ -2494,22 +2502,22 @@ LogicalResult FormatParser::parseOptional(std::unique_ptr &element, if (!seenVariables.count(var)) return emitError(curLoc, "type directive can only refer to variables " "within the optional group"); - return success(); + return ::mlir::success(); }; for (auto &ele : elements) { if (auto *typeEle = dyn_cast(ele.get())) { if (failed(checkTypeOperand(typeEle->getOperand()))) - return failure(); + return ::mlir::failure(); } else if (auto *typeEle = dyn_cast(ele.get())) { if (failed(checkTypeOperand(typeEle->getInputs())) || failed(checkTypeOperand(typeEle->getResults()))) - return failure(); + return ::mlir::failure(); } } optionalVariables.insert(seenVariables.begin(), seenVariables.end()); element = std::make_unique(std::move(elements), *anchorIdx); - return success(); + return ::mlir::success(); } LogicalResult FormatParser::parseOptionalChildElement( @@ -2519,7 +2527,7 @@ LogicalResult FormatParser::parseOptionalChildElement( llvm::SMLoc childLoc = curToken.getLoc(); childElements.push_back({}); if (failed(parseElement(childElements.back(), /*isTopLevel=*/true))) - return failure(); + return ::mlir::failure(); // Check to see if this element is the anchor of the optional group. bool isAnchor = curToken.getKind() == Token::caret; @@ -2538,7 +2546,7 @@ LogicalResult FormatParser::parseOptionalChildElement( if (isAnchor && !attrEle->getVar()->attr.isOptional()) return emitError(childLoc, "only optional attributes can be used to " "anchor an optional group"); - return success(); + return ::mlir::success(); }) // Only optional-like(i.e. variadic) operands can be within an optional // group. 
@@ -2547,12 +2555,12 @@ LogicalResult FormatParser::parseOptionalChildElement( return emitError(childLoc, "only variable length operands can be " "used within an optional group"); seenVariables.insert(ele->getVar()); - return success(); + return ::mlir::success(); }) .Case([&](RegionVariable *) { // TODO: When ODS has proper support for marking "optional" regions, add // a check here. - return success(); + return ::mlir::success(); }) // Literals, custom directives, and type directives may be used, // but they can't anchor the group. @@ -2561,7 +2569,7 @@ LogicalResult FormatParser::parseOptionalChildElement( if (isAnchor) return emitError(childLoc, "only variables can be used to anchor " "an optional group"); - return success(); + return ::mlir::success(); }) .Default([&](Element *) { return emitError(childLoc, "only literals, types, and variables can be " @@ -2581,7 +2589,7 @@ FormatParser::parseAttrDictDirective(std::unique_ptr<Element> &element, hasAttrDict = true; element = std::make_unique<AttrDictDirective>(withKeyword); - return success(); + return ::mlir::success(); } LogicalResult @@ -2592,7 +2600,7 @@ FormatParser::parseCustomDirective(std::unique_ptr<Element> &element, // Parse the custom directive name. if (failed( parseToken(Token::less, "expected '<' before custom directive name"))) - return failure(); + return ::mlir::failure(); Token nameTok = curToken; if (failed(parseToken(Token::identifier, @@ -2601,13 +2609,13 @@ FormatParser::parseCustomDirective(std::unique_ptr<Element> &element, "expected '>' after custom directive name")) || failed(parseToken(Token::l_paren, "expected '(' before custom directive parameters"))) - return failure(); + return ::mlir::failure(); // Parse the child elements for this custom directive. std::vector<std::unique_ptr<Element>> elements; do { if (failed(parseCustomDirectiveParameter(elements))) - return failure(); + return ::mlir::failure(); if (curToken.getKind() != Token::comma) break; consumeToken(); @@ -2615,7 +2623,7 @@ FormatParser::parseCustomDirective(std::unique_ptr<Element> &element, if (failed(parseToken(Token::r_paren, "expected ')' after custom directive parameters"))) - return failure(); + return ::mlir::failure(); // After parsing all of the elements, ensure that all type directives refer // only to variables. @@ -2630,7 +2638,7 @@ FormatParser::parseCustomDirective(std::unique_ptr<Element> &element, element = std::make_unique<CustomDirective>(nameTok.getSpelling(), std::move(elements)); - return success(); + return ::mlir::success(); } LogicalResult FormatParser::parseCustomDirectiveParameter( @@ -2638,7 +2646,7 @@ LogicalResult FormatParser::parseCustomDirectiveParameter( llvm::SMLoc childLoc = curToken.getLoc(); parameters.push_back({}); if (failed(parseElement(parameters.back(), /*isTopLevel=*/true))) - return failure(); + return ::mlir::failure(); // Verify that the element can be placed within a custom directive.
if (!isa &element, failed(parseToken(Token::comma, "expected ',' after inputs argument")) || failed(parseTypeDirectiveOperand(results)) || failed(parseToken(Token::r_paren, "expected ')' after argument list"))) - return failure(); + return ::mlir::failure(); element = std::make_unique(std::move(inputs), std::move(results)); - return success(); + return ::mlir::success(); } LogicalResult @@ -2679,7 +2687,7 @@ FormatParser::parseOperandsDirective(std::unique_ptr &element, fmt.allOperands = true; } element = std::make_unique(); - return success(); + return ::mlir::success(); } LogicalResult @@ -2691,7 +2699,7 @@ FormatParser::parseRegionsDirective(std::unique_ptr &element, return emitError(loc, "'regions' directive creates overlap in format"); hasAllRegions = true; element = std::make_unique(); - return success(); + return ::mlir::success(); } LogicalResult @@ -2701,7 +2709,7 @@ FormatParser::parseResultsDirective(std::unique_ptr &element, return emitError(loc, "'results' directive can not be used as a " "top-level directive"); element = std::make_unique(); - return success(); + return ::mlir::success(); } LogicalResult @@ -2714,7 +2722,7 @@ FormatParser::parseSuccessorsDirective(std::unique_ptr &element, return emitError(loc, "'successors' directive creates overlap in format"); hasAllSuccessors = true; element = std::make_unique(); - return success(); + return ::mlir::success(); } LogicalResult @@ -2728,16 +2736,16 @@ FormatParser::parseTypeDirective(std::unique_ptr &element, Token tok, if (failed(parseToken(Token::l_paren, "expected '(' before argument list")) || failed(parseTypeDirectiveOperand(operand)) || failed(parseToken(Token::r_paren, "expected ')' after argument list"))) - return failure(); + return ::mlir::failure(); element = std::make_unique(std::move(operand)); - return success(); + return ::mlir::success(); } LogicalResult FormatParser::parseTypeDirectiveOperand(std::unique_ptr &element) { llvm::SMLoc loc = curToken.getLoc(); if (failed(parseElement(element, /*isTopLevel=*/false))) - return failure(); + return ::mlir::failure(); if (isa(element.get())) return emitError( loc, "'type' directive operand expects variable or directive operand"); @@ -2765,7 +2773,7 @@ FormatParser::parseTypeDirectiveOperand(std::unique_ptr &element) { } else { return emitError(loc, "invalid argument to 'type' directive"); } - return success(); + return ::mlir::success(); } //===----------------------------------------------------------------------===// diff --git a/mlir/tools/mlir-tblgen/RewriterGen.cpp b/mlir/tools/mlir-tblgen/RewriterGen.cpp index 9884d1ccb077d..9b2f35f566246 100644 --- a/mlir/tools/mlir-tblgen/RewriterGen.cpp +++ b/mlir/tools/mlir-tblgen/RewriterGen.cpp @@ -887,8 +887,9 @@ std::string PatternEmitter::handleOpCreation(DagNode tree, int resultIndex, // special cases listed below, DRR needs to supply types for all results // when building an op. 
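The trait checks right below this comment also show why the series spells out ::mlir::OpTrait::... everywhere: together with the new NamespaceEmitter, generated code may now be emitted inside the dialect's own C++ namespace, where an unqualified OpTrait:: could be captured by an unrelated local name. A self-contained sketch of that failure mode (every name here is invented for illustration):

    namespace mlir {
    namespace OpTrait {
    struct SameOperandsAndResultType {}; // stands in for the real MLIR trait
    } // namespace OpTrait
    } // namespace mlir

    namespace mydialect {
    namespace OpTrait {
    struct SameOperandsAndResultType {}; // unrelated dialect-local type
    } // namespace OpTrait

    // Inside code generated into `mydialect`, the unqualified name binds to
    // the local type; the leading `::` keeps it anchored to the MLIR trait.
    using Unqualified = OpTrait::SameOperandsAndResultType;       // mydialect::...
    using Qualified = ::mlir::OpTrait::SameOperandsAndResultType; // mlir::...
    } // namespace mydialect
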
bool isSameOperandsAndResultType = - resultOp.getTrait("OpTrait::SameOperandsAndResultType"); - bool useFirstAttr = resultOp.getTrait("OpTrait::FirstAttrDerivedResultType"); + resultOp.getTrait("::mlir::OpTrait::SameOperandsAndResultType"); + bool useFirstAttr = + resultOp.getTrait("::mlir::OpTrait::FirstAttrDerivedResultType"); if (isSameOperandsAndResultType || useFirstAttr) { // We know how to deduce the result type for ops with these traits and we've diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake index 95254e7a9e128..05742bd4fbf7a 100644 --- a/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake +++ b/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake @@ -137,17 +137,8 @@ find_library ( # There is a libcuda.so in lib64/stubs that can be used for linking. if (NOT LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES AND CUDA_FOUND) - # Since CMake 3.3 FindCUDA.cmake defaults to using static libraries. In this - # case CUDA_LIBRARIES contains additional linker arguments which breaks - # get_filename_component below. Fortunately, since that change the module - # exports CUDA_cudart_static_LIBRARY which points to a single file in the - # right directory. - set(cuda_library ${CUDA_LIBRARIES}) - if (DEFINED CUDA_cudart_static_LIBRARY) - set(cuda_library ${CUDA_cudart_static_LIBRARY}) - endif() - get_filename_component(CUDA_LIBDIR ${cuda_library} DIRECTORY) - find_library ( + get_filename_component(CUDA_LIBDIR "${CUDA_cudart_static_LIBRARY}" DIRECTORY) + find_library( LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES NAMES cuda diff --git a/openmp/libomptarget/include/Debug.h b/openmp/libomptarget/include/Debug.h index b7092dd61a3d8..4f42794e1bcad 100644 --- a/openmp/libomptarget/include/Debug.h +++ b/openmp/libomptarget/include/Debug.h @@ -70,23 +70,26 @@ static inline int getDebugLevel() { #define GETNAME2(name) #name #define GETNAME(name) GETNAME2(name) -// Messaging interface +/// Print a generic message string from libomptarget or a plugin RTL #define MESSAGE0(_str) \ do { \ fprintf(stderr, GETNAME(TARGET_NAME) " message: %s\n", _str); \ } while (0) +/// Print a printf formatting string message from libomptarget or a plugin RTL #define MESSAGE(_str, ...) \ do { \ fprintf(stderr, GETNAME(TARGET_NAME) " message: " _str "\n", __VA_ARGS__); \ } while (0) +/// Print fatal error message with an error string and error identifier #define FATAL_MESSAGE0(_num, _str) \ do { \ fprintf(stderr, GETNAME(TARGET_NAME) " fatal error %d: %s\n", _num, _str); \ abort(); \ } while (0) +/// Print fatal error message with a printf string and error identifier #define FATAL_MESSAGE(_num, _str, ...) \ do { \ fprintf(stderr, GETNAME(TARGET_NAME) " fatal error %d:" _str "\n", _num, \ @@ -94,12 +97,20 @@ static inline int getDebugLevel() { abort(); \ } while (0) +/// Print a generic error string from libomptarget or a plugin RTL #define FAILURE_MESSAGE(...) \ do { \ fprintf(stderr, GETNAME(TARGET_NAME) " error: "); \ fprintf(stderr, __VA_ARGS__); \ } while (0) +/// Print a generic information string used if LIBOMPTARGET_INFO=1 +#define INFO_MESSAGE(_num, ...) \ + do { \ + fprintf(stderr, GETNAME(TARGET_NAME) " device %d info: ", _num); \ + fprintf(stderr, __VA_ARGS__); \ + } while (0) + // Debugging messages #ifdef OMPTARGET_DEBUG #include @@ -110,6 +121,7 @@ static inline int getDebugLevel() { fprintf(stderr, __VA_ARGS__); \ } +/// Emit a message for debugging #define DP(...) 
\ do { \ if (getDebugLevel() > 0) { \ @@ -117,6 +129,7 @@ static inline int getDebugLevel() { } \ } while (false) +/// Emit a message for debugging or failure if debugging is disabled #define REPORT(...) \ do { \ if (getDebugLevel() > 0) { \ @@ -133,4 +146,14 @@ static inline int getDebugLevel() { #define REPORT(...) FAILURE_MESSAGE(__VA_ARGS__); #endif // OMPTARGET_DEBUG +/// Emit a message giving the user extra information about the runtime if the info level is enabled +#define INFO(_id, ...) \ + do { \ + if (getDebugLevel() > 0) { \ + DEBUGP(DEBUG_PREFIX, __VA_ARGS__); \ + } else if (getInfoLevel() > 0) { \ + INFO_MESSAGE(_id, __VA_ARGS__); \ + } \ + } while (false) + #endif // _OMPTARGET_DEBUG_H diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp index 2675f83ae28f2..1a0bffb9557c3 100644 --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -29,7 +29,7 @@ #ifdef OMPTARGET_DEBUG #define CUDA_ERR_STRING(err) \ do { \ - if (getDebugLevel() > 0) { \ + if (getDebugLevel() > 0) { \ const char *errStr; \ cuGetErrorString(err, &errStr); \ DP("CUDA error is: %s\n", errStr); \ @@ -277,14 +277,15 @@ class DeviceRTLTy { E.Entries.push_back(entry); } - // Return true if the entry is associated with device - bool findOffloadEntry(const int DeviceId, const void *Addr) const { + // Return a pointer to the entry associated with the pointer + const __tgt_offload_entry *getOffloadEntry(const int DeviceId, + const void *Addr) const { for (const __tgt_offload_entry &Itr : DeviceData[DeviceId].FuncGblEntries.back().Entries) if (Itr.addr == Addr) - return true; + return &Itr; - return false; + return nullptr; } // Return the pointer to the target entries table @@ -492,9 +493,11 @@ class DeviceRTLTy { DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; } - DP("Max number of CUDA blocks %d, threads %d & warp size %d\n", - DeviceData[DeviceId].BlocksPerGrid, DeviceData[DeviceId].ThreadsPerBlock, - DeviceData[DeviceId].WarpSize); + INFO(DeviceId, + "Device supports up to %d CUDA blocks and %d threads with a " + "warp size of %d\n", + DeviceData[DeviceId].BlocksPerGrid, + DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize); // Set default number of teams if (EnvNumTeams > 0) { @@ -926,9 +929,14 @@ class DeviceRTLTy { CudaBlocksPerGrid = TeamNum; } - // Run on the device. - DP("Launch kernel with %d blocks and %d threads\n", CudaBlocksPerGrid, - CudaThreadsPerBlock); + INFO(DeviceId, + "Launching kernel %s with %d blocks and %d threads in %s " + "mode\n", + (getOffloadEntry(DeviceId, TgtEntryPtr)) + ? getOffloadEntry(DeviceId, TgtEntryPtr)->name + : "(null)", + CudaBlocksPerGrid, CudaThreadsPerBlock, + (KernelInfo->ExecutionMode == SPMD) ? "SPMD" : "Generic"); CUstream Stream = getStream(DeviceId, AsyncInfo); Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp index fdf625cb71f66..79feebe6f32ba 100644 --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -17,6 +17,7 @@ #include #include +#include #include /// Map between Device ID (i.e. openmp device id) and its DeviceTy.
@@ -50,7 +51,12 @@ DeviceTy::DeviceTy(RTLInfoTy *RTL) ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx(), MemoryManager(nullptr) {} -DeviceTy::~DeviceTy() = default; +DeviceTy::~DeviceTy() { + if (DeviceID == -1 || getInfoLevel() < 1) + return; + + dumpTargetPointerMappings(*this); +} int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) { DataMapMtx.lock(); @@ -214,11 +220,13 @@ void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, HT.incRefCount(); uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); - DP("Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", " "Size=%" PRId64 ",%s RefCount=%s\n", (IsImplicit ? " (implicit)" : ""), DPxPTR(HstPtrBegin), DPxPTR(tp), Size, (UpdateRefCount ? " updated" : ""), HT.isRefCountInf() ? "INF" : std::to_string(HT.getRefCount()).c_str()); + INFO(DeviceID, + "Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD + ", " + "Size=%" PRId64 ",%s RefCount=%s\n", + (IsImplicit ? " (implicit)" : ""), DPxPTR(HstPtrBegin), DPxPTR(tp), + Size, (UpdateRefCount ? " updated" : ""), + HT.isRefCountInf() ? "INF" : std::to_string(HT.getRefCount()).c_str()); rc = (void *)tp; } else if ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && !IsImplicit) { // Explicit extension of mapped data - not allowed. diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index d22e5978c20af..76a9e766ec76e 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -16,6 +16,7 @@ #include "rtl.h" #include +#include #include #include @@ -25,7 +26,6 @@ std::mutex TargetOffloadMtx; //////////////////////////////////////////////////////////////////////////////// /// manage the success or failure of a target construct - static void HandleDefaultTargetOffload() { TargetOffloadMtx.lock(); if (TargetOffloadPolicy == tgt_default) { @@ -60,8 +60,13 @@ static void HandleTargetOutcome(bool success) { break; case tgt_mandatory: if (!success) { - if (getInfoLevel() > 0) - MESSAGE0("LIBOMPTARGET_INFO is not supported yet"); + if (getInfoLevel() > 1) + for (const auto &Device : Devices) + dumpTargetPointerMappings(Device); + else + FAILURE_MESSAGE("run with env LIBOMPTARGET_INFO>1 to dump host-target " "pointer maps\n"); + FATAL_MESSAGE0(1, "failure of target construct while offloading is mandatory"); } break; diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h index f01714808dd4e..17ca81e353f1a 100644 --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -96,4 +96,20 @@ int __kmpc_get_target_offload(void) __attribute__((weak)); #define TARGET_NAME Libomptarget #define DEBUG_PREFIX GETNAME(TARGET_NAME) +//////////////////////////////////////////////////////////////////////////////// +/// dump a table of all the host-target pointer pairs on failure +static inline void dumpTargetPointerMappings(const DeviceTy &Device) { + if (Device.HostDataToTargetMap.empty()) + return; + + fprintf(stderr, "Device %d Host-Device Pointer Mappings:\n", Device.DeviceID); + fprintf(stderr, "%-18s %-18s %s\n", "Host Ptr", "Target Ptr", "Size (B)"); + for (const auto &HostTargetMap : Device.HostDataToTargetMap) { + fprintf(stderr, DPxMOD " " DPxMOD " %lu\n", + DPxPTR(HostTargetMap.HstPtrBegin), + DPxPTR(HostTargetMap.TgtPtrBegin), + HostTargetMap.HstPtrEnd - HostTargetMap.HstPtrBegin); + } +} + #endif diff --git a/openmp/libomptarget/test/offloading/info.c
b/openmp/libomptarget/test/offloading/info.c new file mode 100644 index 0000000000000..e0d3f1a0e94c1 --- /dev/null +++ b/openmp/libomptarget/test/offloading/info.c @@ -0,0 +1,15 @@ +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda && env LIBOMPTARGET_INFO=1 %libomptarget-run-nvptx64-nvidia-cuda 2>&1 | %fcheck-nvptx64-nvidia-cuda -allow-empty -check-prefix=INFO + +#include +#include + +int main() { + int ptr = 1; + +// INFO: CUDA device {{[0-9]+}} info: Device supports up to {{[0-9]+}} CUDA blocks and {{[0-9]+}} threads with a warp size of {{[0-9]+}} +// INFO: CUDA device {{[0-9]+}} info: Launching kernel {{.*}} with {{[0-9]+}} blocks and {{[0-9]+}} threads in Generic mode +#pragma omp target map(tofrom:ptr) + {ptr = 1;} + + return 0; +} diff --git a/polly/cmake/CMakeLists.txt b/polly/cmake/CMakeLists.txt index fd8028a8937af..7cc129ba2e906 100644 --- a/polly/cmake/CMakeLists.txt +++ b/polly/cmake/CMakeLists.txt @@ -10,7 +10,7 @@ else() endif() set(POLLY_CONFIG_EXPORTED_TARGETS Polly ${ISL_TARGET}) -if (NOT MSVC AND LLVM_ENABLE_PIC) +if (NOT WIN32 AND LLVM_ENABLE_PIC) # LLVMPolly is a dummy target on Win or if PIC code is disabled. list(APPEND POLLY_CONFIG_EXPORTED_TARGETS LLVMPolly) endif() diff --git a/polly/lib/CMakeLists.txt b/polly/lib/CMakeLists.txt index 113ae5f2eb577..b20358e4b3d67 100644 --- a/polly/lib/CMakeLists.txt +++ b/polly/lib/CMakeLists.txt @@ -137,7 +137,7 @@ endif () # Create a loadable module Polly.so that can be loaded using # LLVM's/clang's "-load" option. -if (MSVC OR NOT LLVM_ENABLE_PIC) +if (WIN32 OR NOT LLVM_ENABLE_PIC) # Add dummy target, either because loadable modules are not supported # as on Windows or because PIC code has been disabled add_custom_target(LLVMPolly) diff --git a/pstl/include/pstl/internal/parallel_backend_tbb.h b/pstl/include/pstl/internal/parallel_backend_tbb.h index a9ea0c7456fb4..f1836aace0ae5 100644 --- a/pstl/include/pstl/internal/parallel_backend_tbb.h +++ b/pstl/include/pstl/internal/parallel_backend_tbb.h @@ -25,6 +25,7 @@ #include #include #include +#include #if TBB_INTERFACE_VERSION < 10000 # error Intel(R) Threading Building Blocks 2018 is required; older versions are not supported. 
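The classic tbb::task scheduler API disappears in oneTBB, which is why this file now branches on TBB_INTERFACE_VERSION (values <= 12000 meaning classic TBB); the first use of the gate appears just below in __cancel_execution. A standalone sketch of the same pattern, extracted as a free function and assuming only that <tbb/task.h> is on the include path:

    #include <tbb/task.h> // also brings in TBB_INTERFACE_VERSION

    // Same version gate as the __cancel_execution hunk below; a sketch, not
    // part of the patch itself.
    inline void cancel_enclosing_task_group()
    {
    #if TBB_INTERFACE_VERSION <= 12000
        // Classic TBB: reach the group context through the running task.
        tbb::task::self().group()->cancel_group_execution();
    #else
        // oneTBB: tbb::task::self() is gone; query the current context.
        tbb::task::current_context()->cancel_group_execution();
    #endif
    }
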
@@ -71,7 +72,11 @@ class __buffer inline void __cancel_execution() { +#if TBB_INTERFACE_VERSION <= 12000 tbb::task::self().group()->cancel_group_execution(); +#else + tbb::task::current_context()->cancel_group_execution(); +#endif } //------------------------------------------------------------------------ @@ -413,17 +418,308 @@ __parallel_transform_scan(_ExecutionPolicy&&, _Index __n, _Up __u, _Tp __init, _ //------------------------------------------------------------------------ #define _PSTL_MERGE_CUT_OFF 2000 +template +class __func_task; +template +class __root_task; + +#if TBB_INTERFACE_VERSION <= 12000 +class __task : public tbb::task +{ + public: + template + __task* + make_continuation(_Fn&& __f) + { + return new (allocate_continuation()) __func_task::type>(std::forward<_Fn>(__f)); + } + + template + __task* + make_child_of(__task* parent, _Fn&& __f) + { + return new (parent->allocate_child()) __func_task::type>(std::forward<_Fn>(__f)); + } + + template + __task* + make_additional_child_of(tbb::task* parent, _Fn&& __f) + { + return new (tbb::task::allocate_additional_child_of(*parent)) + __func_task::type>(std::forward<_Fn>(__f)); + } + + inline void + recycle_as_continuation() + { + tbb::task::recycle_as_continuation(); + } + + inline void + recycle_as_child_of(__task* parent) + { + tbb::task::recycle_as_child_of(*parent); + } + + inline void + spawn(__task* __t) + { + tbb::task::spawn(*__t); + } + + template + static inline void + spawn_root_and_wait(__root_task<_Fn>& __root) + { + tbb::task::spawn_root_and_wait(*__root._M_task); + } +}; + +template +class __func_task : public __task +{ + _Func _M_func; + + tbb::task* + execute() + { + return _M_func(this); + }; + + public: + template + __func_task(_Fn&& __f) : _M_func{std::forward<_Fn>(__f)} + { + } + + _Func& + body() + { + return _M_func; + } +}; + +template +class __root_task +{ + tbb::task* _M_task; + + public: + template + __root_task(Args&&... 
args) + : _M_task{new (tbb::task::allocate_root()) __func_task<_Func>{_Func(std::forward(args)...)}} + { + } + + friend class __task; + friend class __func_task<_Func>; +}; + +#else // TBB_INTERFACE_VERSION <= 12000 +class __task : public tbb::detail::d1::task +{ + protected: + tbb::detail::d1::small_object_allocator _M_allocator{}; + tbb::detail::d1::execution_data* _M_execute_data{}; + __task* _M_parent{}; + std::atomic _M_refcount{}; + bool _M_recycle{}; + + template + __task* + allocate_func_task(_Fn&& __f) + { + assert(_M_execute_data != nullptr); + tbb::detail::d1::small_object_allocator __alloc{}; + auto __t = + __alloc.new_object<__func_task::type>>(*_M_execute_data, std::forward<_Fn>(__f)); + __t->_M_allocator = __alloc; + return __t; + } + + public: + __task* + parent() + { + return _M_parent; + } + + void + set_ref_count(int __n) + { + _M_refcount.store(__n, std::memory_order_release); + } + + template + __task* + make_continuation(_Fn&& __f) + { + auto __t = allocate_func_task(std::forward<_Fn&&>(__f)); + __t->_M_parent = _M_parent; + _M_parent = nullptr; + return __t; + } + + template + __task* + make_child_of(__task* __parent, _Fn&& __f) + { + auto __t = allocate_func_task(std::forward<_Fn&&>(__f)); + __t->_M_parent = __parent; + return __t; + } + + template + __task* + make_additional_child_of(__task* __parent, _Fn&& __f) + { + auto __t = make_child_of(__parent, std::forward<_Fn>(__f)); + assert(__parent->_M_refcount.load(std::memory_order_relaxed) > 0); + ++__parent->_M_refcount; + return __t; + } + + inline void + recycle_as_continuation() + { + _M_recycle = true; + } + + inline void + recycle_as_child_of(__task* parent) + { + _M_recycle = true; + _M_parent = parent; + } + + inline void + spawn(__task* __t) + { + assert(_M_execute_data != nullptr); + tbb::detail::d1::spawn(*__t, *_M_execute_data->context); + } + + template + static inline void + spawn_root_and_wait(__root_task<_Fn>& __root) + { + tbb::detail::d1::execute_and_wait(*__root._M_func_task, __root._M_context, __root._M_wait_object, + __root._M_context); + } + + template + friend class __func_task; +}; + +template +class __func_task : public __task +{ + _Func _M_func; + + __task* + execute(tbb::detail::d1::execution_data& __ed) override + { + _M_execute_data = &__ed; + _M_recycle = false; + __task* __next = _M_func(this); + return finalize(__next); + }; + + __task* + cancel(tbb::detail::d1::execution_data& __ed) override + { + return finalize(nullptr); + } + + __task* + finalize(__task* __next) + { + bool __recycle = _M_recycle; + _M_recycle = false; + + if (__recycle) + { + return __next; + } + + auto __parent = _M_parent; + auto __alloc = _M_allocator; + auto __ed = _M_execute_data; + + this->~__func_task(); + + assert(__parent != nullptr); + assert(__parent->_M_refcount.load(std::memory_order_relaxed) > 0); + if (--__parent->_M_refcount == 0) + { + assert(__next == nullptr); + __alloc.deallocate(this, *__ed); + return __parent; + } + + return __next; + } + + friend class __root_task<_Func>; + + public: + template + __func_task(_Fn&& __f) : _M_func(std::forward<_Fn>(__f)) + { + } + + _Func& + body() + { + return _M_func; + } +}; + +template +class __root_task : public __task +{ + __task* + execute(tbb::detail::d1::execution_data& __ed) override + { + _M_wait_object.release(); + return nullptr; + }; + + __task* + cancel(tbb::detail::d1::execution_data& __ed) override + { + _M_wait_object.release(); + return nullptr; + } + + __func_task<_Func>* _M_func_task{}; + tbb::detail::d1::wait_context _M_wait_object{0}; + 
tbb::task_group_context _M_context{}; + + public: + template + __root_task(Args&&... args) : _M_wait_object{1} + { + tbb::detail::d1::small_object_allocator __alloc{}; + _M_func_task = __alloc.new_object<__func_task<_Func>>(_Func(std::forward(args)...)); + _M_func_task->_M_allocator = __alloc; + _M_func_task->_M_parent = this; + _M_refcount.store(1, std::memory_order_relaxed); + } + + friend class __task; +}; +#endif // TBB_INTERFACE_VERSION <= 12000 + template -class __merge_task : public tbb::task +class __merge_func { typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1; typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2; typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType; typedef typename std::iterator_traits<_RandomAccessIterator1>::value_type _ValueType; - /*override*/ tbb::task* - execute(); _RandomAccessIterator1 _M_x_beg; _RandomAccessIterator2 _M_z_beg; @@ -529,7 +825,7 @@ class __merge_task : public tbb::task }; public: - __merge_task(_SizeType __xs, _SizeType __xe, _SizeType __ys, _SizeType __ye, _SizeType __zs, _Compare __comp, + __merge_func(_SizeType __xs, _SizeType __xe, _SizeType __ys, _SizeType __ye, _SizeType __zs, _Compare __comp, _Cleanup, _LeafMerge __leaf_merge, _SizeType __nsort, _RandomAccessIterator1 __x_beg, _RandomAccessIterator2 __z_beg, bool __x_orig, bool __y_orig, bool __root) : _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs), _M_x_beg(__x_beg), _M_z_beg(__z_beg), @@ -554,12 +850,14 @@ class __merge_task : public tbb::task _y_orig = __on_off; } + __task* + operator()(__task* __self); + private: - __merge_task* - parent_merge() const + __merge_func* + parent_merge(__task* __self) const { - tbb::task* p = (_root ? nullptr : parent()); - return static_cast<__merge_task*>(p); + return _root ? nullptr : &static_cast<__func_task<__merge_func>*>(__self->parent())->body(); } bool x_less_y() @@ -615,8 +913,8 @@ class __merge_task : public tbb::task _y_orig = !_y_orig; } - tbb::task* - merge_ranges() + __task* + merge_ranges(__task* __self) { assert(_x_orig == _y_orig); //two merged subrange must be lie into the same buffer @@ -626,7 +924,7 @@ class __merge_task : public tbb::task // need to merge {x} and {y} if (__n > __merge_cut_off) - return split_merging(); + return split_merging(__self); //merge to buffer if (_x_orig) @@ -634,7 +932,7 @@ class __merge_task : public tbb::task _M_leaf_merge(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_z_beg + _M_zs, _M_comp, __move_value_construct(), __move_value_construct(), __move_range_construct(), __move_range_construct()); - assert(parent_merge()); //not root merging task + assert(parent_merge(__self)); //not root merging task } //merge to "origin" else @@ -656,13 +954,13 @@ class __merge_task : public tbb::task return nullptr; } - tbb::task* - process_ranges() + __task* + process_ranges(__task* __self) { assert(_x_orig == _y_orig); assert(!_split); - auto p = parent_merge(); + auto p = parent_merge(__self); if (!p) { //root merging task @@ -685,7 +983,7 @@ class __merge_task : public tbb::task move_y_range(); //parallel moving } // need to merge {x} and {y}. 
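From here down, the former tbb::task subclasses are rewritten as plain functors: a task body is any callable that takes the wrapping __task* and returns the next task to execute (nullptr when done), so the same merge/sort logic runs on both scheduler implementations. A toy body following that protocol, with invented names, mirroring the spawn/recycle dance of split_merging() in the hunks below:

    // Illustrative only: a recursive splitter using the same __task hooks the
    // patch relies on (parent(), make_additional_child_of(), spawn(),
    // recycle_as_continuation()).
    struct toy_split_func
    {
        int _M_depth;

        template <typename _TaskT>
        _TaskT*
        operator()(_TaskT* __self)
        {
            if (_M_depth <= 0)
                return nullptr; // leaf task: nothing more to schedule

            // Hand the right half to a sibling task, keep the left half here.
            auto* __right = __self->make_additional_child_of(
                __self->parent(), toy_split_func{_M_depth - 1});
            __self->spawn(__right);
            __self->recycle_as_continuation();
            --_M_depth;
            return __self; // rerun this recycled task for the left half
        }
    };
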
- return merge_ranges(); + return merge_ranges(__self); } //else: not root merging task (parent_merge() == NULL) //optimization, just for sort algorithm, //{x} <= {y} @@ -699,12 +997,12 @@ class __merge_task : public tbb::task const auto id_range = _M_zs; p->set_odd(id_range, !_x_orig); - return merge_ranges(); + return merge_ranges(__self); } //splitting as merge task into 2 of the same level - tbb::task* - split_merging() + __task* + split_merging(__task* __self) { assert(_x_orig == _y_orig); const auto __nx = (_M_xe - _M_xs); @@ -732,43 +1030,42 @@ class __merge_task : public tbb::task } auto __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys)); + __merge_func __right_func(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _Cleanup(), _M_leaf_merge, _M_nsort, + _M_x_beg, _M_z_beg, _x_orig, _y_orig, _root); + __right_func._split = true; + auto __merge_task = __self->make_additional_child_of(__self->parent(), std::move(__right_func)); + __self->spawn(__merge_task); + __self->recycle_as_continuation(); - __merge_task* __right = new (tbb::task::allocate_additional_child_of(*parent())) - __merge_task(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _Cleanup(), _M_leaf_merge, _M_nsort, _M_x_beg, - _M_z_beg, _x_orig, _y_orig, _root); - - __right->_split = true; - - tbb::task::spawn(*__right); - tbb::task::recycle_as_continuation(); _M_xe = __xm; _M_ye = __ym; _split = true; - return this; + return __self; } }; template -tbb::task* -__merge_task<_RandomAccessIterator1, _RandomAccessIterator2, __M_Compare, _Cleanup, _LeafMerge>::execute() +__task* +__merge_func<_RandomAccessIterator1, _RandomAccessIterator2, __M_Compare, _Cleanup, _LeafMerge>:: +operator()(__task* __self) { //a. split merge task into 2 of the same level; the special logic, //without processing(process_ranges) adjacent sub-ranges x and y if (_split) - return merge_ranges(); + return merge_ranges(__self); //b. General merging of adjacent sub-ranges x and y (with optimization in case of {x} <= {y} ) //1. x and y are in the even buffer //2. x and y are in the odd buffer if (_x_orig == _y_orig) - return process_ranges(); + return process_ranges(__self); //3. x is in even buffer, y is in the odd buffer //4. 
x is in odd buffer, y is in the even buffer - if (!parent_merge()) + if (!parent_merge(__self)) { //root merge task if (_x_orig) move_x_range(); @@ -788,11 +1085,11 @@ __merge_task<_RandomAccessIterator1, _RandomAccessIterator2, __M_Compare, _Clean move_y_range(); } - return process_ranges(); + return process_ranges(__self); } template -class __stable_sort_task : public tbb::task +class __stable_sort_func { public: typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1; @@ -800,8 +1097,6 @@ class __stable_sort_task : public tbb::task typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType; private: - /*override*/ tbb::task* - execute(); _RandomAccessIterator1 _M_xs, _M_xe, _M_x_beg; _RandomAccessIterator2 _M_zs, _M_z_beg; _Compare _M_comp; @@ -810,22 +1105,25 @@ class __stable_sort_task : public tbb::task _SizeType _M_nsort; //zero or number of elements to be sorted for partial_sort alforithm public: - __stable_sort_task(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __zs, + __stable_sort_func(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __zs, bool __root, _Compare __comp, _LeafSort __leaf_sort, _SizeType __nsort, _RandomAccessIterator1 __x_beg, _RandomAccessIterator2 __z_beg) : _M_xs(__xs), _M_xe(__xe), _M_x_beg(__x_beg), _M_zs(__zs), _M_z_beg(__z_beg), _M_comp(__comp), _M_leaf_sort(__leaf_sort), _M_root(__root), _M_nsort(__nsort) { } + + __task* + operator()(__task* __self); }; #define _PSTL_STABLE_SORT_CUT_OFF 500 template -tbb::task* -__stable_sort_task<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, _LeafSort>::execute() +__task* +__stable_sort_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, _LeafSort>::operator()(__task* __self) { - typedef __merge_task<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, __utils::__serial_destroy, + typedef __merge_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, __utils::__serial_destroy, __utils::__serial_move_merge> _MergeTaskType; @@ -835,34 +1133,27 @@ __stable_sort_task<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, _Le if (__n <= __sort_cut_off) { _M_leaf_sort(_M_xs, _M_xe, _M_comp); - assert(!_M_root); - - tbb::task* p = parent(); - const auto id_range = _M_xs - _M_x_beg; - return nullptr; } const _RandomAccessIterator1 __xm = _M_xs + __n / 2; const _RandomAccessIterator2 __zm = _M_zs + (__xm - _M_xs); const _RandomAccessIterator2 __ze = _M_zs + __n; - _MergeTaskType* __m = new (allocate_continuation()) _MergeTaskType( - _M_xs - _M_x_beg, __xm - _M_x_beg, __xm - _M_x_beg, _M_xe - _M_x_beg, _M_zs - _M_z_beg, _M_comp, - __utils::__serial_destroy(), __utils::__serial_move_merge(__nmerge), _M_nsort, _M_x_beg, _M_z_beg, - /*x_orig*/ true, /*y_orig*/ true, /*root*/ _M_root); - + _MergeTaskType __m(_MergeTaskType(_M_xs - _M_x_beg, __xm - _M_x_beg, __xm - _M_x_beg, _M_xe - _M_x_beg, + _M_zs - _M_z_beg, _M_comp, __utils::__serial_destroy(), + __utils::__serial_move_merge(__nmerge), _M_nsort, _M_x_beg, _M_z_beg, + /*x_orig*/ true, /*y_orig*/ true, /*root*/ _M_root)); + auto __parent = __self->make_continuation(std::move(__m)); + __parent->set_ref_count(2); + auto __right = __self->make_child_of( + __parent, __stable_sort_func(__xm, _M_xe, __zm, false, _M_comp, _M_leaf_sort, _M_nsort, _M_x_beg, _M_z_beg)); + __self->spawn(__right); + __self->recycle_as_child_of(__parent); _M_root = false; - - __m->set_ref_count(2); - auto __right = new (__m->allocate_child()) 
- __stable_sort_task(__xm, _M_xe, __zm, _M_root, _M_comp, _M_leaf_sort, _M_nsort, _M_x_beg, _M_z_beg); - - spawn(*__right); - recycle_as_child_of(*__m); _M_xe = __xm; - return this; + return __self; } template @@ -882,11 +1173,9 @@ __parallel_stable_sort(_ExecutionPolicy&&, _RandomAccessIterator __xs, _RandomAc if (__n > __sort_cut_off) { __buffer<_ValueType> __buf(__n); - tbb::task* root = new (tbb::task::allocate_root()) - __stable_sort_task<_RandomAccessIterator, _ValueType*, _Compare, _LeafSort>( - __xs, __xe, __buf.get(), true, __comp, __leaf_sort, __nsort, __xs, __buf.get()); - tbb::task::spawn_root_and_wait(*root); - + __root_task<__stable_sort_func<_RandomAccessIterator, _ValueType*, _Compare, _LeafSort>> __root{ + __xs, __xe, __buf.get(), true, __comp, __leaf_sort, __nsort, __xs, __buf.get()}; + __task::spawn_root_and_wait(__root); return; } //serial sort @@ -899,10 +1188,8 @@ __parallel_stable_sort(_ExecutionPolicy&&, _RandomAccessIterator __xs, _RandomAc //------------------------------------------------------------------------ template -class __merge_task_static : public tbb::task +class __merge_func_static { - /*override*/ tbb::task* - execute(); _RandomAccessIterator1 _M_xs, _M_xe; _RandomAccessIterator2 _M_ys, _M_ye; _RandomAccessIterator3 _M_zs; @@ -910,20 +1197,23 @@ class __merge_task_static : public tbb::task _LeafMerge _M_leaf_merge; public: - __merge_task_static(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __ys, + __merge_func_static(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __ys, _RandomAccessIterator2 __ye, _RandomAccessIterator3 __zs, _Compare __comp, _LeafMerge __leaf_merge) : _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs), _M_comp(__comp), _M_leaf_merge(__leaf_merge) { } + + __task* + operator()(__task* __self); }; //TODO: consider usage of parallel_for with a custom blocked_range template -tbb::task* -__merge_task_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3, __M_Compare, - _LeafMerge>::execute() +__task* +__merge_func_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3, __M_Compare, _LeafMerge>:: +operator()(__task* __self) { typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1; typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2; @@ -949,14 +1239,14 @@ __merge_task_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAcces __ym = std::lower_bound(_M_ys, _M_ye, *__xm, _M_comp); } const _RandomAccessIterator3 __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys)); - tbb::task* __right = new (tbb::task::allocate_additional_child_of(*parent())) - __merge_task_static(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _M_leaf_merge); - tbb::task::spawn(*__right); - tbb::task::recycle_as_continuation(); + auto __right = __self->make_additional_child_of( + __self->parent(), __merge_func_static(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _M_leaf_merge)); + __self->spawn(__right); + __self->recycle_as_continuation(); _M_xe = __xm; _M_ye = __ym; - return this; + return __self; } template _TaskType; - tbb::task::spawn_root_and_wait(*new (tbb::task::allocate_root()) - _TaskType(__xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge)); + __root_task<_TaskType> __root{__xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge}; + __task::spawn_root_and_wait(__root); }); } } diff --git a/sycl/test/sub_group/generic-shuffle.cpp 
b/sycl/test/sub_group/generic-shuffle.cpp index e6825750925fc..60dc07c0b8e4c 100644 --- a/sycl/test/sub_group/generic-shuffle.cpp +++ b/sycl/test/sub_group/generic-shuffle.cpp @@ -1,6 +1,7 @@ // UNSUPPORTED: cuda // CUDA compilation and runtime do not yet support sub-groups. // +// XFAIL: linux && gpu // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: env SYCL_DEVICE_TYPE=HOST %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out diff --git a/sycl/test/sub_group/shuffle.cpp b/sycl/test/sub_group/shuffle.cpp index 5207716148ef6..c55b63d6f3fad 100644 --- a/sycl/test/sub_group/shuffle.cpp +++ b/sycl/test/sub_group/shuffle.cpp @@ -1,6 +1,7 @@ // UNSUPPORTED: cuda // CUDA compilation and runtime do not yet support sub-groups. // +// XFAIL: linux && gpu // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: env SYCL_DEVICE_TYPE=HOST %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out diff --git a/sycl/test/sub_group/shuffle_fp16.cpp b/sycl/test/sub_group/shuffle_fp16.cpp index 62f07fc612de8..5bf485a307006 100644 --- a/sycl/test/sub_group/shuffle_fp16.cpp +++ b/sycl/test/sub_group/shuffle_fp16.cpp @@ -1,6 +1,7 @@ // UNSUPPORTED: cuda // CUDA compilation and runtime do not yet support sub-groups. // +// XFAIL: linux && gpu // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // diff --git a/sycl/test/sub_group/shuffle_fp64.cpp b/sycl/test/sub_group/shuffle_fp64.cpp index 3b1ed56907601..890a806677ae9 100644 --- a/sycl/test/sub_group/shuffle_fp64.cpp +++ b/sycl/test/sub_group/shuffle_fp64.cpp @@ -1,6 +1,7 @@ // UNSUPPORTED: cuda // CUDA compilation and runtime do not yet support sub-groups. // +// XFAIL: linux && gpu // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: env SYCL_DEVICE_TYPE=HOST %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out
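Finally, the four sycl/test files above all gain the same lit annotation. UNSUPPORTED skips a test outright where the feature expression matches; XFAIL still runs the test but inverts the verdict, so a failure is reported as XFAIL (expected, and the bot stays green) while an unexpected pass is flagged as XPASS. A minimal test header using the same directives, with the RUN lines shortened for illustration:

    // UNSUPPORTED: cuda
    //   Not run at all on configurations advertising the `cuda` feature.
    // XFAIL: linux && gpu
    //   Runs on Linux GPU configs, but a failing result is recorded as XFAIL
    //   rather than FAIL until the underlying regression is resolved.
    // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
    // RUN: %CPU_RUN_PLACEHOLDER %t.out
    int main() { return 0; }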